aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/9p.h375
-rw-r--r--fs/9p/Makefile6
-rw-r--r--fs/9p/conv.c845
-rw-r--r--fs/9p/conv.h50
-rw-r--r--fs/9p/debug.h77
-rw-r--r--fs/9p/error.c93
-rw-r--r--fs/9p/error.h177
-rw-r--r--fs/9p/fcall.c427
-rw-r--r--fs/9p/fcprint.c345
-rw-r--r--fs/9p/fid.c168
-rw-r--r--fs/9p/fid.h43
-rw-r--r--fs/9p/mux.c1033
-rw-r--r--fs/9p/mux.h55
-rw-r--r--fs/9p/trans_fd.c308
-rw-r--r--fs/9p/transport.h45
-rw-r--r--fs/9p/v9fs.c295
-rw-r--r--fs/9p/v9fs.h32
-rw-r--r--fs/9p/v9fs_vfs.h6
-rw-r--r--fs/9p/vfs_addr.c57
-rw-r--r--fs/9p/vfs_dentry.c37
-rw-r--r--fs/9p/vfs_dir.c155
-rw-r--r--fs/9p/vfs_file.c166
-rw-r--r--fs/9p/vfs_inode.c754
-rw-r--r--fs/9p/vfs_super.c93
-rw-r--r--fs/Kconfig15
-rw-r--r--fs/adfs/file.c2
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/afs/Makefile1
-rw-r--r--fs/afs/afs.h8
-rw-r--r--fs/afs/afs_fs.h3
-rw-r--r--fs/afs/callback.c3
-rw-r--r--fs/afs/dir.c1
-rw-r--r--fs/afs/file.c4
-rw-r--r--fs/afs/flock.c558
-rw-r--r--fs/afs/fsclient.c155
-rw-r--r--fs/afs/internal.h30
-rw-r--r--fs/afs/main.c1
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/proc.c81
-rw-r--r--fs/afs/super.c3
-rw-r--r--fs/afs/vnode.c132
-rw-r--r--fs/anon_inodes.c11
-rw-r--r--fs/attr.c4
-rw-r--r--fs/bad_inode.c7
-rw-r--r--fs/bfs/file.c2
-rw-r--r--fs/binfmt_elf.c116
-rw-r--r--fs/bio.c2
-rw-r--r--fs/block_dev.c64
-rw-r--r--fs/buffer.c63
-rw-r--r--fs/cifs/cifsfs.c9
-rw-r--r--fs/cifs/connect.c1
-rw-r--r--fs/cifs/export.c2
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/coda/file.c11
-rw-r--r--fs/compat_ioctl.c4
-rw-r--r--fs/configfs/configfs_internal.h7
-rw-r--r--fs/configfs/dir.c289
-rw-r--r--fs/configfs/file.c28
-rw-r--r--fs/configfs/item.c29
-rw-r--r--fs/dcache.c7
-rw-r--r--fs/debugfs/inode.c63
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/Kconfig2
-rw-r--r--fs/dlm/Makefile1
-rw-r--r--fs/dlm/config.c39
-rw-r--r--fs/dlm/config.h1
-rw-r--r--fs/dlm/debug_fs.c186
-rw-r--r--fs/dlm/dlm_internal.h17
-rw-r--r--fs/dlm/lock.c470
-rw-r--r--fs/dlm/lock.h13
-rw-r--r--fs/dlm/lockspace.c86
-rw-r--r--fs/dlm/lowcomms.c23
-rw-r--r--fs/dlm/main.c11
-rw-r--r--fs/dlm/member.c11
-rw-r--r--fs/dlm/netlink.c153
-rw-r--r--fs/dlm/rcom.c13
-rw-r--r--fs/dlm/recoverd.c4
-rw-r--r--fs/dlm/user.c129
-rw-r--r--fs/dquot.c7
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/file.c15
-rw-r--r--fs/ecryptfs/inode.c70
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/ecryptfs/mmap.c55
-rw-r--r--fs/efs/namei.c32
-rw-r--r--fs/efs/super.c2
-rw-r--r--fs/exportfs/expfs.c439
-rw-r--r--fs/ext2/acl.c2
-rw-r--r--fs/ext2/file.c8
-rw-r--r--fs/ext2/ioctl.c4
-rw-r--r--fs/ext2/super.c30
-rw-r--r--fs/ext3/acl.c2
-rw-r--r--fs/ext3/file.c1
-rw-r--r--fs/ext3/inode.c6
-rw-r--r--fs/ext3/ioctl.c6
-rw-r--r--fs/ext3/namei.c10
-rw-r--r--fs/ext3/super.c52
-rw-r--r--fs/ext4/acl.c2
-rw-r--r--fs/ext4/balloc.c6
-rw-r--r--fs/ext4/extents.c682
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/ialloc.c8
-rw-r--r--fs/ext4/inode.c122
-rw-r--r--fs/ext4/ioctl.c15
-rw-r--r--fs/ext4/namei.c86
-rw-r--r--fs/ext4/super.c99
-rw-r--r--fs/ext4/xattr.c276
-rw-r--r--fs/ext4/xattr.h17
-rw-r--r--fs/fat/dir.c31
-rw-r--r--fs/fat/fatent.c7
-rw-r--r--fs/fat/file.c2
-rw-r--r--fs/fat/inode.c4
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/freevxfs/vxfs_dir.h2
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/generic_acl.c2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/bmap.c23
-rw-r--r--fs/gfs2/daemon.c11
-rw-r--r--fs/gfs2/dir.c69
-rw-r--r--fs/gfs2/dir.h9
-rw-r--r--fs/gfs2/eaops.c1
-rw-r--r--fs/gfs2/eattr.c14
-rw-r--r--fs/gfs2/glock.c123
-rw-r--r--fs/gfs2/glock.h1
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/incore.h81
-rw-r--r--fs/gfs2/inode.c288
-rw-r--r--fs/gfs2/inode.h30
-rw-r--r--fs/gfs2/locking/dlm/lock.c11
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h2
-rw-r--r--fs/gfs2/locking/dlm/mount.c2
-rw-r--r--fs/gfs2/locking/dlm/plock.c8
-rw-r--r--fs/gfs2/locking/dlm/thread.c11
-rw-r--r--fs/gfs2/log.c129
-rw-r--r--fs/gfs2/lops.c49
-rw-r--r--fs/gfs2/lops.h23
-rw-r--r--fs/gfs2/meta_io.c8
-rw-r--r--fs/gfs2/meta_io.h2
-rw-r--r--fs/gfs2/mount.c25
-rw-r--r--fs/gfs2/ondisk.c251
-rw-r--r--fs/gfs2/ops_address.c69
-rw-r--r--fs/gfs2/ops_address.h2
-rw-r--r--fs/gfs2/ops_dentry.c24
-rw-r--r--fs/gfs2/ops_export.c66
-rw-r--r--fs/gfs2/ops_export.h22
-rw-r--r--fs/gfs2/ops_file.c5
-rw-r--r--fs/gfs2/ops_fstype.c33
-rw-r--r--fs/gfs2/ops_fstype.h1
-rw-r--r--fs/gfs2/ops_inode.c30
-rw-r--r--fs/gfs2/ops_super.c8
-rw-r--r--fs/gfs2/ops_vm.c2
-rw-r--r--fs/gfs2/quota.c57
-rw-r--r--fs/gfs2/recovery.c22
-rw-r--r--fs/gfs2/rgrp.c377
-rw-r--r--fs/gfs2/rgrp.h1
-rw-r--r--fs/gfs2/super.c79
-rw-r--r--fs/gfs2/super.h2
-rw-r--r--fs/gfs2/util.c6
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/btree.c4
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h4
-rw-r--r--fs/hfsplus/inode.c7
-rw-r--r--fs/hfsplus/ioctl.c2
-rw-r--r--fs/hfsplus/super.c4
-rw-r--r--fs/hfsplus/unicode.c230
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/file.c2
-rw-r--r--fs/hugetlbfs/inode.c96
-rw-r--r--fs/inode.c17
-rw-r--r--fs/ioctl.c22
-rw-r--r--fs/isofs/dir.c87
-rw-r--r--fs/isofs/inode.c417
-rw-r--r--fs/isofs/isofs.h1
-rw-r--r--fs/isofs/joliet.c10
-rw-r--r--fs/isofs/namei.c26
-rw-r--r--fs/jbd/commit.c3
-rw-r--r--fs/jbd/revoke.c5
-rw-r--r--fs/jbd2/commit.c3
-rw-r--r--fs/jbd2/journal.c79
-rw-r--r--fs/jbd2/recovery.c2
-rw-r--r--fs/jbd2/revoke.c5
-rw-r--r--fs/jffs2/acl.c2
-rw-r--r--fs/jffs2/background.c1
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/readinode.c23
-rw-r--r--fs/jfs/endian24.h2
-rw-r--r--fs/jfs/file.c1
-rw-r--r--fs/jfs/ioctl.c2
-rw-r--r--fs/jfs/jfs_debug.c28
-rw-r--r--fs/jfs/jfs_debug.h2
-rw-r--r--fs/jfs/jfs_dinode.h42
-rw-r--r--fs/jfs/jfs_dmap.c419
-rw-r--r--fs/jfs/jfs_dmap.h118
-rw-r--r--fs/jfs/jfs_dtree.c105
-rw-r--r--fs/jfs/jfs_dtree.h2
-rw-r--r--fs/jfs/jfs_extent.c102
-rw-r--r--fs/jfs/jfs_filsys.h13
-rw-r--r--fs/jfs/jfs_imap.c296
-rw-r--r--fs/jfs/jfs_imap.h98
-rw-r--r--fs/jfs/jfs_incore.h4
-rw-r--r--fs/jfs/jfs_inode.h1
-rw-r--r--fs/jfs/jfs_logmgr.c90
-rw-r--r--fs/jfs/jfs_logmgr.h26
-rw-r--r--fs/jfs/jfs_metapage.c3
-rw-r--r--fs/jfs/jfs_mount.c6
-rw-r--r--fs/jfs/jfs_txnmgr.c302
-rw-r--r--fs/jfs/jfs_txnmgr.h2
-rw-r--r--fs/jfs/jfs_types.h20
-rw-r--r--fs/jfs/jfs_umount.c2
-rw-r--r--fs/jfs/jfs_xtree.c428
-rw-r--r--fs/jfs/jfs_xtree.h48
-rw-r--r--fs/jfs/namei.c58
-rw-r--r--fs/jfs/resize.c48
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/jfs/xattr.c11
-rw-r--r--fs/lockd/host.c39
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svc.c35
-rw-r--r--fs/mbcache.c9
-rw-r--r--fs/minix/file.c2
-rw-r--r--fs/namei.c2
-rw-r--r--fs/namespace.c23
-rw-r--r--fs/ncpfs/file.c2
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/callback.c2
-rw-r--r--fs/nfs/client.c82
-rw-r--r--fs/nfs/delegation.c186
-rw-r--r--fs/nfs/delegation.h26
-rw-r--r--fs/nfs/dir.c16
-rw-r--r--fs/nfs/direct.c34
-rw-r--r--fs/nfs/file.c15
-rw-r--r--fs/nfs/inode.c73
-rw-r--r--fs/nfs/internal.h4
-rw-r--r--fs/nfs/mount_clnt.c169
-rw-r--r--fs/nfs/nfs2xdr.c6
-rw-r--r--fs/nfs/nfs3proc.c4
-rw-r--r--fs/nfs/nfs3xdr.c8
-rw-r--r--fs/nfs/nfs4_fs.h40
-rw-r--r--fs/nfs/nfs4proc.c760
-rw-r--r--fs/nfs/nfs4state.c310
-rw-r--r--fs/nfs/nfs4xdr.c126
-rw-r--r--fs/nfs/nfsroot.c5
-rw-r--r--fs/nfs/pagelist.c60
-rw-r--r--fs/nfs/read.c40
-rw-r--r--fs/nfs/super.c1199
-rw-r--r--fs/nfs/write.c149
-rw-r--r--fs/nfsd/auth.c18
-rw-r--r--fs/nfsd/export.c289
-rw-r--r--fs/nfsd/lockd.c1
-rw-r--r--fs/nfsd/nfs4acl.c12
-rw-r--r--fs/nfsd/nfs4callback.c20
-rw-r--r--fs/nfsd/nfs4idmap.c13
-rw-r--r--fs/nfsd/nfs4proc.c35
-rw-r--r--fs/nfsd/nfs4state.c47
-rw-r--r--fs/nfsd/nfs4xdr.c101
-rw-r--r--fs/nfsd/nfsctl.c3
-rw-r--r--fs/nfsd/nfsfh.c51
-rw-r--r--fs/nfsd/nfsproc.c3
-rw-r--r--fs/nfsd/nfssvc.c12
-rw-r--r--fs/nfsd/vfs.c156
-rw-r--r--fs/nls/Makefile2
-rw-r--r--fs/ntfs/file.c2
-rw-r--r--fs/ntfs/namei.c1
-rw-r--r--fs/ocfs2/alloc.c2676
-rw-r--r--fs/ocfs2/alloc.h43
-rw-r--r--fs/ocfs2/aops.c1015
-rw-r--r--fs/ocfs2/aops.h61
-rw-r--r--fs/ocfs2/cluster/heartbeat.c96
-rw-r--r--fs/ocfs2/cluster/heartbeat.h6
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/nodemanager.c42
-rw-r--r--fs/ocfs2/cluster/nodemanager.h5
-rw-r--r--fs/ocfs2/cluster/tcp.c21
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c8
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c40
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c79
-rw-r--r--fs/ocfs2/dlmglue.c6
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/export.h2
-rw-r--r--fs/ocfs2/extent_map.c41
-rw-r--r--fs/ocfs2/file.c719
-rw-r--r--fs/ocfs2/file.h10
-rw-r--r--fs/ocfs2/heartbeat.c12
-rw-r--r--fs/ocfs2/ioctl.c17
-rw-r--r--fs/ocfs2/journal.c6
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/mmap.c167
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h14
-rw-r--r--fs/ocfs2/ocfs2_fs.h33
-rw-r--r--fs/ocfs2/slot_map.c12
-rw-r--r--fs/ocfs2/suballoc.c46
-rw-r--r--fs/ocfs2/suballoc.h17
-rw-r--r--fs/ocfs2/super.c27
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--fs/open.c73
-rw-r--r--fs/partitions/acorn.c9
-rw-r--r--fs/partitions/check.c1
-rw-r--r--fs/partitions/ibm.c167
-rw-r--r--fs/partitions/ldm.c137
-rw-r--r--fs/partitions/ldm.h2
-rw-r--r--fs/pipe.c70
-rw-r--r--fs/proc/array.c123
-rw-r--r--fs/proc/base.c156
-rw-r--r--fs/proc/generic.c52
-rw-r--r--fs/proc/inode.c254
-rw-r--r--fs/proc/proc_misc.c7
-rw-r--r--fs/proc/proc_tty.c15
-rw-r--r--fs/qnx4/file.c2
-rw-r--r--fs/quota.c118
-rw-r--r--fs/ramfs/file-mmu.c2
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c20
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/inode.c1
-rw-r--r--fs/reiserfs/ioctl.c5
-rw-r--r--fs/reiserfs/super.c1
-rw-r--r--fs/reiserfs/xattr_acl.c2
-rw-r--r--fs/seq_file.c52
-rw-r--r--fs/signalfd.c3
-rw-r--r--fs/smbfs/file.c9
-rw-r--r--fs/splice.c452
-rw-r--r--fs/super.c1
-rw-r--r--fs/sync.c8
-rw-r--r--fs/sysfs/bin.c195
-rw-r--r--fs/sysfs/dir.c1297
-rw-r--r--fs/sysfs/file.c379
-rw-r--r--fs/sysfs/group.c55
-rw-r--r--fs/sysfs/inode.c221
-rw-r--r--fs/sysfs/mount.c36
-rw-r--r--fs/sysfs/symlink.c150
-rw-r--r--fs/sysfs/sysfs.h169
-rw-r--r--fs/sysv/file.c2
-rw-r--r--fs/udf/crc.c4
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/ialloc.c9
-rw-r--r--fs/udf/inode.c51
-rw-r--r--fs/udf/super.c2
-rw-r--r--fs/ufs/file.c2
-rw-r--r--fs/ufs/super.c5
-rw-r--r--fs/utimes.c13
-rw-r--r--fs/xattr.c3
-rw-r--r--fs/xfs/Makefile-linux-2.62
-rw-r--r--fs/xfs/linux-2.6/kmem.h19
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c43
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c67
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c37
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c321
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c44
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_vfs.h15
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h11
-rw-r--r--fs/xfs/quota/xfs_qm.c13
-rw-r--r--fs/xfs/xfs.h1
-rw-r--r--fs/xfs/xfs_ag.h9
-rw-r--r--fs/xfs/xfs_alloc.c101
-rw-r--r--fs/xfs/xfs_alloc.h6
-rw-r--r--fs/xfs/xfs_alloc_btree.c20
-rw-r--r--fs/xfs/xfs_bit.c91
-rw-r--r--fs/xfs/xfs_bit.h4
-rw-r--r--fs/xfs/xfs_bmap.c369
-rw-r--r--fs/xfs/xfs_bmap.h6
-rw-r--r--fs/xfs/xfs_bmap_btree.c88
-rw-r--r--fs/xfs/xfs_btree.h32
-rw-r--r--fs/xfs/xfs_buf_item.c4
-rw-r--r--fs/xfs/xfs_clnt.h2
-rw-r--r--fs/xfs/xfs_dinode.h4
-rw-r--r--fs/xfs/xfs_dir2.c12
-rw-r--r--fs/xfs/xfs_dir2_block.c98
-rw-r--r--fs/xfs/xfs_dir2_block.h2
-rw-r--r--fs/xfs/xfs_dir2_data.c54
-rw-r--r--fs/xfs/xfs_dir2_data.h12
-rw-r--r--fs/xfs/xfs_dir2_leaf.c106
-rw-r--r--fs/xfs/xfs_dir2_leaf.h29
-rw-r--r--fs/xfs/xfs_dir2_node.c66
-rw-r--r--fs/xfs/xfs_dir2_node.h4
-rw-r--r--fs/xfs/xfs_dir2_sf.c204
-rw-r--r--fs/xfs/xfs_dir2_sf.h20
-rw-r--r--fs/xfs/xfs_filestream.c771
-rw-r--r--fs/xfs/xfs_filestream.h136
-rw-r--r--fs/xfs/xfs_fs.h2
-rw-r--r--fs/xfs/xfs_fsops.c17
-rw-r--r--fs/xfs/xfs_ialloc.c28
-rw-r--r--fs/xfs/xfs_ialloc.h10
-rw-r--r--fs/xfs/xfs_inode.c39
-rw-r--r--fs/xfs/xfs_inode.h16
-rw-r--r--fs/xfs/xfs_iomap.c41
-rw-r--r--fs/xfs/xfs_itable.c42
-rw-r--r--fs/xfs/xfs_itable.h20
-rw-r--r--fs/xfs/xfs_log.c41
-rw-r--r--fs/xfs/xfs_log_recover.c8
-rw-r--r--fs/xfs/xfs_mount.c237
-rw-r--r--fs/xfs/xfs_mount.h15
-rw-r--r--fs/xfs/xfs_mru_cache.c608
-rw-r--r--fs/xfs/xfs_mru_cache.h57
-rw-r--r--fs/xfs/xfs_rtalloc.c4
-rw-r--r--fs/xfs/xfs_rw.h36
-rw-r--r--fs/xfs/xfs_sb.h16
-rw-r--r--fs/xfs/xfs_trans.c125
-rw-r--r--fs/xfs/xfs_trans.h3
-rw-r--r--fs/xfs/xfs_vfsops.c159
-rw-r--r--fs/xfs/xfs_vnodeops.c122
416 files changed, 20865 insertions, 13374 deletions
diff --git a/fs/9p/9p.h b/fs/9p/9p.h
deleted file mode 100644
index 94e2f92ab2..0000000000
--- a/fs/9p/9p.h
+++ /dev/null
@@ -1,375 +0,0 @@
1/*
2 * linux/fs/9p/9p.h
3 *
4 * 9P protocol definitions.
5 *
6 * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
8 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to:
21 * Free Software Foundation
22 * 51 Franklin Street, Fifth Floor
23 * Boston, MA 02111-1301 USA
24 *
25 */
26
27/* Message Types */
28enum {
29 TVERSION = 100,
30 RVERSION,
31 TAUTH = 102,
32 RAUTH,
33 TATTACH = 104,
34 RATTACH,
35 TERROR = 106,
36 RERROR,
37 TFLUSH = 108,
38 RFLUSH,
39 TWALK = 110,
40 RWALK,
41 TOPEN = 112,
42 ROPEN,
43 TCREATE = 114,
44 RCREATE,
45 TREAD = 116,
46 RREAD,
47 TWRITE = 118,
48 RWRITE,
49 TCLUNK = 120,
50 RCLUNK,
51 TREMOVE = 122,
52 RREMOVE,
53 TSTAT = 124,
54 RSTAT,
55 TWSTAT = 126,
56 RWSTAT,
57};
58
59/* modes */
60enum {
61 V9FS_OREAD = 0x00,
62 V9FS_OWRITE = 0x01,
63 V9FS_ORDWR = 0x02,
64 V9FS_OEXEC = 0x03,
65 V9FS_OEXCL = 0x04,
66 V9FS_OTRUNC = 0x10,
67 V9FS_OREXEC = 0x20,
68 V9FS_ORCLOSE = 0x40,
69 V9FS_OAPPEND = 0x80,
70};
71
72/* permissions */
73enum {
74 V9FS_DMDIR = 0x80000000,
75 V9FS_DMAPPEND = 0x40000000,
76 V9FS_DMEXCL = 0x20000000,
77 V9FS_DMMOUNT = 0x10000000,
78 V9FS_DMAUTH = 0x08000000,
79 V9FS_DMTMP = 0x04000000,
80 V9FS_DMSYMLINK = 0x02000000,
81 V9FS_DMLINK = 0x01000000,
82 /* 9P2000.u extensions */
83 V9FS_DMDEVICE = 0x00800000,
84 V9FS_DMNAMEDPIPE = 0x00200000,
85 V9FS_DMSOCKET = 0x00100000,
86 V9FS_DMSETUID = 0x00080000,
87 V9FS_DMSETGID = 0x00040000,
88};
89
90/* qid.types */
91enum {
92 V9FS_QTDIR = 0x80,
93 V9FS_QTAPPEND = 0x40,
94 V9FS_QTEXCL = 0x20,
95 V9FS_QTMOUNT = 0x10,
96 V9FS_QTAUTH = 0x08,
97 V9FS_QTTMP = 0x04,
98 V9FS_QTSYMLINK = 0x02,
99 V9FS_QTLINK = 0x01,
100 V9FS_QTFILE = 0x00,
101};
102
103#define V9FS_NOTAG (u16)(~0)
104#define V9FS_NOFID (u32)(~0)
105#define V9FS_MAXWELEM 16
106
107/* ample room for Twrite/Rread header (iounit) */
108#define V9FS_IOHDRSZ 24
109
110struct v9fs_str {
111 u16 len;
112 char *str;
113};
114
115/* qids are the unique ID for a file (like an inode */
116struct v9fs_qid {
117 u8 type;
118 u32 version;
119 u64 path;
120};
121
122/* Plan 9 file metadata (stat) structure */
123struct v9fs_stat {
124 u16 size;
125 u16 type;
126 u32 dev;
127 struct v9fs_qid qid;
128 u32 mode;
129 u32 atime;
130 u32 mtime;
131 u64 length;
132 struct v9fs_str name;
133 struct v9fs_str uid;
134 struct v9fs_str gid;
135 struct v9fs_str muid;
136 struct v9fs_str extension; /* 9p2000.u extensions */
137 u32 n_uid; /* 9p2000.u extensions */
138 u32 n_gid; /* 9p2000.u extensions */
139 u32 n_muid; /* 9p2000.u extensions */
140};
141
142/* file metadata (stat) structure used to create Twstat message
143 The is similar to v9fs_stat, but the strings don't point to
144 the same memory block and should be freed separately
145*/
146struct v9fs_wstat {
147 u16 size;
148 u16 type;
149 u32 dev;
150 struct v9fs_qid qid;
151 u32 mode;
152 u32 atime;
153 u32 mtime;
154 u64 length;
155 char *name;
156 char *uid;
157 char *gid;
158 char *muid;
159 char *extension; /* 9p2000.u extensions */
160 u32 n_uid; /* 9p2000.u extensions */
161 u32 n_gid; /* 9p2000.u extensions */
162 u32 n_muid; /* 9p2000.u extensions */
163};
164
165/* Structures for Protocol Operations */
166
167struct Tversion {
168 u32 msize;
169 struct v9fs_str version;
170};
171
172struct Rversion {
173 u32 msize;
174 struct v9fs_str version;
175};
176
177struct Tauth {
178 u32 afid;
179 struct v9fs_str uname;
180 struct v9fs_str aname;
181};
182
183struct Rauth {
184 struct v9fs_qid qid;
185};
186
187struct Rerror {
188 struct v9fs_str error;
189 u32 errno; /* 9p2000.u extension */
190};
191
192struct Tflush {
193 u16 oldtag;
194};
195
196struct Rflush {
197};
198
199struct Tattach {
200 u32 fid;
201 u32 afid;
202 struct v9fs_str uname;
203 struct v9fs_str aname;
204};
205
206struct Rattach {
207 struct v9fs_qid qid;
208};
209
210struct Twalk {
211 u32 fid;
212 u32 newfid;
213 u16 nwname;
214 struct v9fs_str wnames[16];
215};
216
217struct Rwalk {
218 u16 nwqid;
219 struct v9fs_qid wqids[16];
220};
221
222struct Topen {
223 u32 fid;
224 u8 mode;
225};
226
227struct Ropen {
228 struct v9fs_qid qid;
229 u32 iounit;
230};
231
232struct Tcreate {
233 u32 fid;
234 struct v9fs_str name;
235 u32 perm;
236 u8 mode;
237 struct v9fs_str extension;
238};
239
240struct Rcreate {
241 struct v9fs_qid qid;
242 u32 iounit;
243};
244
245struct Tread {
246 u32 fid;
247 u64 offset;
248 u32 count;
249};
250
251struct Rread {
252 u32 count;
253 u8 *data;
254};
255
256struct Twrite {
257 u32 fid;
258 u64 offset;
259 u32 count;
260 u8 *data;
261};
262
263struct Rwrite {
264 u32 count;
265};
266
267struct Tclunk {
268 u32 fid;
269};
270
271struct Rclunk {
272};
273
274struct Tremove {
275 u32 fid;
276};
277
278struct Rremove {
279};
280
281struct Tstat {
282 u32 fid;
283};
284
285struct Rstat {
286 struct v9fs_stat stat;
287};
288
289struct Twstat {
290 u32 fid;
291 struct v9fs_stat stat;
292};
293
294struct Rwstat {
295};
296
297/*
298 * fcall is the primary packet structure
299 *
300 */
301
302struct v9fs_fcall {
303 u32 size;
304 u8 id;
305 u16 tag;
306 void *sdata;
307
308 union {
309 struct Tversion tversion;
310 struct Rversion rversion;
311 struct Tauth tauth;
312 struct Rauth rauth;
313 struct Rerror rerror;
314 struct Tflush tflush;
315 struct Rflush rflush;
316 struct Tattach tattach;
317 struct Rattach rattach;
318 struct Twalk twalk;
319 struct Rwalk rwalk;
320 struct Topen topen;
321 struct Ropen ropen;
322 struct Tcreate tcreate;
323 struct Rcreate rcreate;
324 struct Tread tread;
325 struct Rread rread;
326 struct Twrite twrite;
327 struct Rwrite rwrite;
328 struct Tclunk tclunk;
329 struct Rclunk rclunk;
330 struct Tremove tremove;
331 struct Rremove rremove;
332 struct Tstat tstat;
333 struct Rstat rstat;
334 struct Twstat twstat;
335 struct Rwstat rwstat;
336 } params;
337};
338
339#define PRINT_FCALL_ERROR(s, fcall) dprintk(DEBUG_ERROR, "%s: %.*s\n", s, \
340 fcall?fcall->params.rerror.error.len:0, \
341 fcall?fcall->params.rerror.error.str:"");
342
343int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
344 char *version, struct v9fs_fcall **rcall);
345
346int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
347 u32 fid, u32 afid, struct v9fs_fcall **rcall);
348
349int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid);
350
351int v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid,
352 struct v9fs_fcall **rcall);
353
354int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
355 struct v9fs_wstat *wstat, struct v9fs_fcall **rcall);
356
357int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
358 char *name, struct v9fs_fcall **rcall);
359
360int v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
361 struct v9fs_fcall **rcall);
362
363int v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
364 struct v9fs_fcall **rcall);
365
366int v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name,
367 u32 perm, u8 mode, char *extension, struct v9fs_fcall **rcall);
368
369int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid,
370 u64 offset, u32 count, struct v9fs_fcall **rcall);
371
372int v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
373 u32 count, const char __user * data,
374 struct v9fs_fcall **rcall);
375int v9fs_printfcall(char *, int, struct v9fs_fcall *, int);
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 87897f84df..bc7f0d1551 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -1,18 +1,12 @@
1obj-$(CONFIG_9P_FS) := 9p.o 1obj-$(CONFIG_9P_FS) := 9p.o
2 2
39p-objs := \ 39p-objs := \
4 trans_fd.o \
5 mux.o \
6 fcall.o \
7 conv.o \
8 vfs_super.o \ 4 vfs_super.o \
9 vfs_inode.o \ 5 vfs_inode.o \
10 vfs_addr.o \ 6 vfs_addr.o \
11 vfs_file.o \ 7 vfs_file.o \
12 vfs_dir.o \ 8 vfs_dir.o \
13 vfs_dentry.o \ 9 vfs_dentry.o \
14 error.o \
15 v9fs.o \ 10 v9fs.o \
16 fid.o \ 11 fid.o \
17 fcprint.o
18 12
diff --git a/fs/9p/conv.c b/fs/9p/conv.c
deleted file mode 100644
index a3ed571eee..0000000000
--- a/fs/9p/conv.c
+++ /dev/null
@@ -1,845 +0,0 @@
1/*
2 * linux/fs/9p/conv.c
3 *
4 * 9P protocol conversion functions
5 *
6 * Copyright (C) 2004, 2005 by Latchesar Ionkov <lucho@ionkov.net>
7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
8 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to:
21 * Free Software Foundation
22 * 51 Franklin Street, Fifth Floor
23 * Boston, MA 02111-1301 USA
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/errno.h>
29#include <linux/fs.h>
30#include <linux/sched.h>
31#include <linux/idr.h>
32#include <asm/uaccess.h>
33#include "debug.h"
34#include "v9fs.h"
35#include "9p.h"
36#include "conv.h"
37
38/*
39 * Buffer to help with string parsing
40 */
41struct cbuf {
42 unsigned char *sp;
43 unsigned char *p;
44 unsigned char *ep;
45};
46
47static inline void buf_init(struct cbuf *buf, void *data, int datalen)
48{
49 buf->sp = buf->p = data;
50 buf->ep = data + datalen;
51}
52
53static inline int buf_check_overflow(struct cbuf *buf)
54{
55 return buf->p > buf->ep;
56}
57
58static int buf_check_size(struct cbuf *buf, int len)
59{
60 if (buf->p + len > buf->ep) {
61 if (buf->p < buf->ep) {
62 eprintk(KERN_ERR, "buffer overflow: want %d has %d\n",
63 len, (int)(buf->ep - buf->p));
64 dump_stack();
65 buf->p = buf->ep + 1;
66 }
67
68 return 0;
69 }
70
71 return 1;
72}
73
74static void *buf_alloc(struct cbuf *buf, int len)
75{
76 void *ret = NULL;
77
78 if (buf_check_size(buf, len)) {
79 ret = buf->p;
80 buf->p += len;
81 }
82
83 return ret;
84}
85
86static void buf_put_int8(struct cbuf *buf, u8 val)
87{
88 if (buf_check_size(buf, 1)) {
89 buf->p[0] = val;
90 buf->p++;
91 }
92}
93
94static void buf_put_int16(struct cbuf *buf, u16 val)
95{
96 if (buf_check_size(buf, 2)) {
97 *(__le16 *) buf->p = cpu_to_le16(val);
98 buf->p += 2;
99 }
100}
101
102static void buf_put_int32(struct cbuf *buf, u32 val)
103{
104 if (buf_check_size(buf, 4)) {
105 *(__le32 *)buf->p = cpu_to_le32(val);
106 buf->p += 4;
107 }
108}
109
110static void buf_put_int64(struct cbuf *buf, u64 val)
111{
112 if (buf_check_size(buf, 8)) {
113 *(__le64 *)buf->p = cpu_to_le64(val);
114 buf->p += 8;
115 }
116}
117
118static char *buf_put_stringn(struct cbuf *buf, const char *s, u16 slen)
119{
120 char *ret;
121
122 ret = NULL;
123 if (buf_check_size(buf, slen + 2)) {
124 buf_put_int16(buf, slen);
125 ret = buf->p;
126 memcpy(buf->p, s, slen);
127 buf->p += slen;
128 }
129
130 return ret;
131}
132
133static inline void buf_put_string(struct cbuf *buf, const char *s)
134{
135 buf_put_stringn(buf, s, strlen(s));
136}
137
138static u8 buf_get_int8(struct cbuf *buf)
139{
140 u8 ret = 0;
141
142 if (buf_check_size(buf, 1)) {
143 ret = buf->p[0];
144 buf->p++;
145 }
146
147 return ret;
148}
149
150static u16 buf_get_int16(struct cbuf *buf)
151{
152 u16 ret = 0;
153
154 if (buf_check_size(buf, 2)) {
155 ret = le16_to_cpu(*(__le16 *)buf->p);
156 buf->p += 2;
157 }
158
159 return ret;
160}
161
162static u32 buf_get_int32(struct cbuf *buf)
163{
164 u32 ret = 0;
165
166 if (buf_check_size(buf, 4)) {
167 ret = le32_to_cpu(*(__le32 *)buf->p);
168 buf->p += 4;
169 }
170
171 return ret;
172}
173
174static u64 buf_get_int64(struct cbuf *buf)
175{
176 u64 ret = 0;
177
178 if (buf_check_size(buf, 8)) {
179 ret = le64_to_cpu(*(__le64 *)buf->p);
180 buf->p += 8;
181 }
182
183 return ret;
184}
185
186static void buf_get_str(struct cbuf *buf, struct v9fs_str *vstr)
187{
188 vstr->len = buf_get_int16(buf);
189 if (!buf_check_overflow(buf) && buf_check_size(buf, vstr->len)) {
190 vstr->str = buf->p;
191 buf->p += vstr->len;
192 } else {
193 vstr->len = 0;
194 vstr->str = NULL;
195 }
196}
197
198static void buf_get_qid(struct cbuf *bufp, struct v9fs_qid *qid)
199{
200 qid->type = buf_get_int8(bufp);
201 qid->version = buf_get_int32(bufp);
202 qid->path = buf_get_int64(bufp);
203}
204
205/**
206 * v9fs_size_wstat - calculate the size of a variable length stat struct
207 * @stat: metadata (stat) structure
208 * @extended: non-zero if 9P2000.u
209 *
210 */
211
212static int v9fs_size_wstat(struct v9fs_wstat *wstat, int extended)
213{
214 int size = 0;
215
216 if (wstat == NULL) {
217 eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n");
218 return 0;
219 }
220
221 size = /* 2 + *//* size[2] */
222 2 + /* type[2] */
223 4 + /* dev[4] */
224 1 + /* qid.type[1] */
225 4 + /* qid.vers[4] */
226 8 + /* qid.path[8] */
227 4 + /* mode[4] */
228 4 + /* atime[4] */
229 4 + /* mtime[4] */
230 8 + /* length[8] */
231 8; /* minimum sum of string lengths */
232
233 if (wstat->name)
234 size += strlen(wstat->name);
235 if (wstat->uid)
236 size += strlen(wstat->uid);
237 if (wstat->gid)
238 size += strlen(wstat->gid);
239 if (wstat->muid)
240 size += strlen(wstat->muid);
241
242 if (extended) {
243 size += 4 + /* n_uid[4] */
244 4 + /* n_gid[4] */
245 4 + /* n_muid[4] */
246 2; /* string length of extension[4] */
247 if (wstat->extension)
248 size += strlen(wstat->extension);
249 }
250
251 return size;
252}
253
254/**
255 * buf_get_stat - safely decode a recieved metadata (stat) structure
256 * @bufp: buffer to deserialize
257 * @stat: metadata (stat) structure
258 * @extended: non-zero if 9P2000.u
259 *
260 */
261
262static void
263buf_get_stat(struct cbuf *bufp, struct v9fs_stat *stat, int extended)
264{
265 stat->size = buf_get_int16(bufp);
266 stat->type = buf_get_int16(bufp);
267 stat->dev = buf_get_int32(bufp);
268 stat->qid.type = buf_get_int8(bufp);
269 stat->qid.version = buf_get_int32(bufp);
270 stat->qid.path = buf_get_int64(bufp);
271 stat->mode = buf_get_int32(bufp);
272 stat->atime = buf_get_int32(bufp);
273 stat->mtime = buf_get_int32(bufp);
274 stat->length = buf_get_int64(bufp);
275 buf_get_str(bufp, &stat->name);
276 buf_get_str(bufp, &stat->uid);
277 buf_get_str(bufp, &stat->gid);
278 buf_get_str(bufp, &stat->muid);
279
280 if (extended) {
281 buf_get_str(bufp, &stat->extension);
282 stat->n_uid = buf_get_int32(bufp);
283 stat->n_gid = buf_get_int32(bufp);
284 stat->n_muid = buf_get_int32(bufp);
285 }
286}
287
288/**
289 * v9fs_deserialize_stat - decode a received metadata structure
290 * @buf: buffer to deserialize
291 * @buflen: length of received buffer
292 * @stat: metadata structure to decode into
293 * @extended: non-zero if 9P2000.u
294 *
295 * Note: stat will point to the buf region.
296 */
297
298int
299v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat,
300 int extended)
301{
302 struct cbuf buffer;
303 struct cbuf *bufp = &buffer;
304 unsigned char *p;
305
306 buf_init(bufp, buf, buflen);
307 p = bufp->p;
308 buf_get_stat(bufp, stat, extended);
309
310 if (buf_check_overflow(bufp))
311 return 0;
312 else
313 return bufp->p - p;
314}
315
316/**
317 * deserialize_fcall - unmarshal a response
318 * @buf: recieved buffer
319 * @buflen: length of received buffer
320 * @rcall: fcall structure to populate
321 * @rcalllen: length of fcall structure to populate
322 * @extended: non-zero if 9P2000.u
323 *
324 */
325
326int
327v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall,
328 int extended)
329{
330
331 struct cbuf buffer;
332 struct cbuf *bufp = &buffer;
333 int i = 0;
334
335 buf_init(bufp, buf, buflen);
336
337 rcall->size = buf_get_int32(bufp);
338 rcall->id = buf_get_int8(bufp);
339 rcall->tag = buf_get_int16(bufp);
340
341 dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id,
342 rcall->tag);
343
344 switch (rcall->id) {
345 default:
346 eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id);
347 return -EPROTO;
348 case RVERSION:
349 rcall->params.rversion.msize = buf_get_int32(bufp);
350 buf_get_str(bufp, &rcall->params.rversion.version);
351 break;
352 case RFLUSH:
353 break;
354 case RATTACH:
355 rcall->params.rattach.qid.type = buf_get_int8(bufp);
356 rcall->params.rattach.qid.version = buf_get_int32(bufp);
357 rcall->params.rattach.qid.path = buf_get_int64(bufp);
358 break;
359 case RWALK:
360 rcall->params.rwalk.nwqid = buf_get_int16(bufp);
361 if (rcall->params.rwalk.nwqid > V9FS_MAXWELEM) {
362 eprintk(KERN_ERR, "Rwalk with more than %d qids: %d\n",
363 V9FS_MAXWELEM, rcall->params.rwalk.nwqid);
364 return -EPROTO;
365 }
366
367 for (i = 0; i < rcall->params.rwalk.nwqid; i++)
368 buf_get_qid(bufp, &rcall->params.rwalk.wqids[i]);
369 break;
370 case ROPEN:
371 buf_get_qid(bufp, &rcall->params.ropen.qid);
372 rcall->params.ropen.iounit = buf_get_int32(bufp);
373 break;
374 case RCREATE:
375 buf_get_qid(bufp, &rcall->params.rcreate.qid);
376 rcall->params.rcreate.iounit = buf_get_int32(bufp);
377 break;
378 case RREAD:
379 rcall->params.rread.count = buf_get_int32(bufp);
380 rcall->params.rread.data = bufp->p;
381 buf_check_size(bufp, rcall->params.rread.count);
382 break;
383 case RWRITE:
384 rcall->params.rwrite.count = buf_get_int32(bufp);
385 break;
386 case RCLUNK:
387 break;
388 case RREMOVE:
389 break;
390 case RSTAT:
391 buf_get_int16(bufp);
392 buf_get_stat(bufp, &rcall->params.rstat.stat, extended);
393 break;
394 case RWSTAT:
395 break;
396 case RERROR:
397 buf_get_str(bufp, &rcall->params.rerror.error);
398 if (extended)
399 rcall->params.rerror.errno = buf_get_int16(bufp);
400 break;
401 }
402
403 if (buf_check_overflow(bufp)) {
404 dprintk(DEBUG_ERROR, "buffer overflow\n");
405 return -EIO;
406 }
407
408 return bufp->p - bufp->sp;
409}
410
411static inline void v9fs_put_int8(struct cbuf *bufp, u8 val, u8 * p)
412{
413 *p = val;
414 buf_put_int8(bufp, val);
415}
416
417static inline void v9fs_put_int16(struct cbuf *bufp, u16 val, u16 * p)
418{
419 *p = val;
420 buf_put_int16(bufp, val);
421}
422
423static inline void v9fs_put_int32(struct cbuf *bufp, u32 val, u32 * p)
424{
425 *p = val;
426 buf_put_int32(bufp, val);
427}
428
429static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p)
430{
431 *p = val;
432 buf_put_int64(bufp, val);
433}
434
435static void
436v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str)
437{
438 int len;
439 char *s;
440
441 if (data)
442 len = strlen(data);
443 else
444 len = 0;
445
446 s = buf_put_stringn(bufp, data, len);
447 if (str) {
448 str->len = len;
449 str->str = s;
450 }
451}
452
453static int
454v9fs_put_user_data(struct cbuf *bufp, const char __user * data, int count,
455 unsigned char **pdata)
456{
457 *pdata = buf_alloc(bufp, count);
458 return copy_from_user(*pdata, data, count);
459}
460
/*
 * Serialize a wstat structure into the transmit buffer in 9P wire
 * order, mirroring each field into *stat (the in-buffer view kept in
 * the fcall).  statsz is the precomputed size of the stat body (from
 * v9fs_size_wstat()) and is written as the leading size[2] field.
 * The trailing extension/n_uid/n_gid/n_muid fields exist on the wire
 * only in the extended (9P2000.u) dialect.
 */
static void
v9fs_put_wstat(struct cbuf *bufp, struct v9fs_wstat *wstat,
	       struct v9fs_stat *stat, int statsz, int extended)
{
	v9fs_put_int16(bufp, statsz, &stat->size);
	v9fs_put_int16(bufp, wstat->type, &stat->type);
	v9fs_put_int32(bufp, wstat->dev, &stat->dev);
	v9fs_put_int8(bufp, wstat->qid.type, &stat->qid.type);
	v9fs_put_int32(bufp, wstat->qid.version, &stat->qid.version);
	v9fs_put_int64(bufp, wstat->qid.path, &stat->qid.path);
	v9fs_put_int32(bufp, wstat->mode, &stat->mode);
	v9fs_put_int32(bufp, wstat->atime, &stat->atime);
	v9fs_put_int32(bufp, wstat->mtime, &stat->mtime);
	v9fs_put_int64(bufp, wstat->length, &stat->length);

	v9fs_put_str(bufp, wstat->name, &stat->name);
	v9fs_put_str(bufp, wstat->uid, &stat->uid);
	v9fs_put_str(bufp, wstat->gid, &stat->gid);
	v9fs_put_str(bufp, wstat->muid, &stat->muid);

	if (extended) {
		v9fs_put_str(bufp, wstat->extension, &stat->extension);
		v9fs_put_int32(bufp, wstat->n_uid, &stat->n_uid);
		v9fs_put_int32(bufp, wstat->n_gid, &stat->n_gid);
		v9fs_put_int32(bufp, wstat->n_muid, &stat->n_muid);
	}
}
488
/*
 * Allocate and initialize a transmit fcall.  The v9fs_fcall header
 * and its wire image are one kmalloc block: fc->sdata points just
 * past the struct at the serialized packet.  The common size[4]
 * id[1] tag[2] prefix is written immediately; the tag defaults to
 * V9FS_NOTAG and is patched in later by v9fs_set_tag().
 *
 * Returns the new fcall or ERR_PTR(-ENOMEM).
 */
static struct v9fs_fcall *
v9fs_create_common(struct cbuf *bufp, u32 size, u8 id)
{
	struct v9fs_fcall *fc;

	size += 4 + 1 + 2;	/* size[4] id[1] tag[2] */
	fc = kmalloc(sizeof(struct v9fs_fcall) + size, GFP_KERNEL);
	if (!fc)
		return ERR_PTR(-ENOMEM);

	/* wire buffer lives immediately after the header struct */
	fc->sdata = (char *)fc + sizeof(*fc);

	buf_init(bufp, (char *)fc->sdata, size);
	v9fs_put_int32(bufp, size, &fc->size);
	v9fs_put_int8(bufp, id, &fc->id);
	v9fs_put_int16(bufp, V9FS_NOTAG, &fc->tag);

	return fc;
}
508
/*
 * Assign the final mux tag to an fcall, updating both the in-memory
 * field and the already-serialized wire image.  On the wire the tag
 * sits at byte offset 5, right after size[4] id[1].
 */
void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag)
{
	fc->tag = tag;
	*(__le16 *) (fc->sdata + 5) = cpu_to_le16(tag);
}
514
515struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version)
516{
517 int size;
518 struct v9fs_fcall *fc;
519 struct cbuf buffer;
520 struct cbuf *bufp = &buffer;
521
522 size = 4 + 2 + strlen(version); /* msize[4] version[s] */
523 fc = v9fs_create_common(bufp, size, TVERSION);
524 if (IS_ERR(fc))
525 goto error;
526
527 v9fs_put_int32(bufp, msize, &fc->params.tversion.msize);
528 v9fs_put_str(bufp, version, &fc->params.tversion.version);
529
530 if (buf_check_overflow(bufp)) {
531 kfree(fc);
532 fc = ERR_PTR(-ENOMEM);
533 }
534 error:
535 return fc;
536}
537
#if 0
/*
 * Build a TAUTH request: afid[4] uname[s] aname[s].
 * Compiled out: nothing in this file's callers issues TAUTH yet.
 * Returns the new fcall or an ERR_PTR on failure.
 */
struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname)
{
	int size;
	struct v9fs_fcall *fc;
	struct cbuf buffer;
	struct cbuf *bufp = &buffer;

	size = 4 + 2 + strlen(uname) + 2 + strlen(aname);	/* afid[4] uname[s] aname[s] */
	fc = v9fs_create_common(bufp, size, TAUTH);
	if (IS_ERR(fc))
		goto error;

	v9fs_put_int32(bufp, afid, &fc->params.tauth.afid);
	v9fs_put_str(bufp, uname, &fc->params.tauth.uname);
	v9fs_put_str(bufp, aname, &fc->params.tauth.aname);

	if (buf_check_overflow(bufp)) {
		kfree(fc);
		fc = ERR_PTR(-ENOMEM);
	}
      error:
	return fc;
}
#endif /* 0 */
563
564struct v9fs_fcall *
565v9fs_create_tattach(u32 fid, u32 afid, char *uname, char *aname)
566{
567 int size;
568 struct v9fs_fcall *fc;
569 struct cbuf buffer;
570 struct cbuf *bufp = &buffer;
571
572 size = 4 + 4 + 2 + strlen(uname) + 2 + strlen(aname); /* fid[4] afid[4] uname[s] aname[s] */
573 fc = v9fs_create_common(bufp, size, TATTACH);
574 if (IS_ERR(fc))
575 goto error;
576
577 v9fs_put_int32(bufp, fid, &fc->params.tattach.fid);
578 v9fs_put_int32(bufp, afid, &fc->params.tattach.afid);
579 v9fs_put_str(bufp, uname, &fc->params.tattach.uname);
580 v9fs_put_str(bufp, aname, &fc->params.tattach.aname);
581
582 error:
583 return fc;
584}
585
586struct v9fs_fcall *v9fs_create_tflush(u16 oldtag)
587{
588 int size;
589 struct v9fs_fcall *fc;
590 struct cbuf buffer;
591 struct cbuf *bufp = &buffer;
592
593 size = 2; /* oldtag[2] */
594 fc = v9fs_create_common(bufp, size, TFLUSH);
595 if (IS_ERR(fc))
596 goto error;
597
598 v9fs_put_int16(bufp, oldtag, &fc->params.tflush.oldtag);
599
600 if (buf_check_overflow(bufp)) {
601 kfree(fc);
602 fc = ERR_PTR(-ENOMEM);
603 }
604 error:
605 return fc;
606}
607
608struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname,
609 char **wnames)
610{
611 int i, size;
612 struct v9fs_fcall *fc;
613 struct cbuf buffer;
614 struct cbuf *bufp = &buffer;
615
616 if (nwname > V9FS_MAXWELEM) {
617 dprintk(DEBUG_ERROR, "nwname > %d\n", V9FS_MAXWELEM);
618 return NULL;
619 }
620
621 size = 4 + 4 + 2; /* fid[4] newfid[4] nwname[2] ... */
622 for (i = 0; i < nwname; i++) {
623 size += 2 + strlen(wnames[i]); /* wname[s] */
624 }
625
626 fc = v9fs_create_common(bufp, size, TWALK);
627 if (IS_ERR(fc))
628 goto error;
629
630 v9fs_put_int32(bufp, fid, &fc->params.twalk.fid);
631 v9fs_put_int32(bufp, newfid, &fc->params.twalk.newfid);
632 v9fs_put_int16(bufp, nwname, &fc->params.twalk.nwname);
633 for (i = 0; i < nwname; i++) {
634 v9fs_put_str(bufp, wnames[i], &fc->params.twalk.wnames[i]);
635 }
636
637 if (buf_check_overflow(bufp)) {
638 kfree(fc);
639 fc = ERR_PTR(-ENOMEM);
640 }
641 error:
642 return fc;
643}
644
645struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode)
646{
647 int size;
648 struct v9fs_fcall *fc;
649 struct cbuf buffer;
650 struct cbuf *bufp = &buffer;
651
652 size = 4 + 1; /* fid[4] mode[1] */
653 fc = v9fs_create_common(bufp, size, TOPEN);
654 if (IS_ERR(fc))
655 goto error;
656
657 v9fs_put_int32(bufp, fid, &fc->params.topen.fid);
658 v9fs_put_int8(bufp, mode, &fc->params.topen.mode);
659
660 if (buf_check_overflow(bufp)) {
661 kfree(fc);
662 fc = ERR_PTR(-ENOMEM);
663 }
664 error:
665 return fc;
666}
667
668struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode,
669 char *extension, int extended)
670{
671 int size;
672 struct v9fs_fcall *fc;
673 struct cbuf buffer;
674 struct cbuf *bufp = &buffer;
675
676 size = 4 + 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */
677 if (extended) {
678 size += 2 + /* extension[s] */
679 (extension == NULL ? 0 : strlen(extension));
680 }
681
682 fc = v9fs_create_common(bufp, size, TCREATE);
683 if (IS_ERR(fc))
684 goto error;
685
686 v9fs_put_int32(bufp, fid, &fc->params.tcreate.fid);
687 v9fs_put_str(bufp, name, &fc->params.tcreate.name);
688 v9fs_put_int32(bufp, perm, &fc->params.tcreate.perm);
689 v9fs_put_int8(bufp, mode, &fc->params.tcreate.mode);
690 if (extended)
691 v9fs_put_str(bufp, extension, &fc->params.tcreate.extension);
692
693 if (buf_check_overflow(bufp)) {
694 kfree(fc);
695 fc = ERR_PTR(-ENOMEM);
696 }
697 error:
698 return fc;
699}
700
701struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count)
702{
703 int size;
704 struct v9fs_fcall *fc;
705 struct cbuf buffer;
706 struct cbuf *bufp = &buffer;
707
708 size = 4 + 8 + 4; /* fid[4] offset[8] count[4] */
709 fc = v9fs_create_common(bufp, size, TREAD);
710 if (IS_ERR(fc))
711 goto error;
712
713 v9fs_put_int32(bufp, fid, &fc->params.tread.fid);
714 v9fs_put_int64(bufp, offset, &fc->params.tread.offset);
715 v9fs_put_int32(bufp, count, &fc->params.tread.count);
716
717 if (buf_check_overflow(bufp)) {
718 kfree(fc);
719 fc = ERR_PTR(-ENOMEM);
720 }
721 error:
722 return fc;
723}
724
725struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count,
726 const char __user * data)
727{
728 int size, err;
729 struct v9fs_fcall *fc;
730 struct cbuf buffer;
731 struct cbuf *bufp = &buffer;
732
733 size = 4 + 8 + 4 + count; /* fid[4] offset[8] count[4] data[count] */
734 fc = v9fs_create_common(bufp, size, TWRITE);
735 if (IS_ERR(fc))
736 goto error;
737
738 v9fs_put_int32(bufp, fid, &fc->params.twrite.fid);
739 v9fs_put_int64(bufp, offset, &fc->params.twrite.offset);
740 v9fs_put_int32(bufp, count, &fc->params.twrite.count);
741 err = v9fs_put_user_data(bufp, data, count, &fc->params.twrite.data);
742 if (err) {
743 kfree(fc);
744 fc = ERR_PTR(err);
745 }
746
747 if (buf_check_overflow(bufp)) {
748 kfree(fc);
749 fc = ERR_PTR(-ENOMEM);
750 }
751 error:
752 return fc;
753}
754
755struct v9fs_fcall *v9fs_create_tclunk(u32 fid)
756{
757 int size;
758 struct v9fs_fcall *fc;
759 struct cbuf buffer;
760 struct cbuf *bufp = &buffer;
761
762 size = 4; /* fid[4] */
763 fc = v9fs_create_common(bufp, size, TCLUNK);
764 if (IS_ERR(fc))
765 goto error;
766
767 v9fs_put_int32(bufp, fid, &fc->params.tclunk.fid);
768
769 if (buf_check_overflow(bufp)) {
770 kfree(fc);
771 fc = ERR_PTR(-ENOMEM);
772 }
773 error:
774 return fc;
775}
776
777struct v9fs_fcall *v9fs_create_tremove(u32 fid)
778{
779 int size;
780 struct v9fs_fcall *fc;
781 struct cbuf buffer;
782 struct cbuf *bufp = &buffer;
783
784 size = 4; /* fid[4] */
785 fc = v9fs_create_common(bufp, size, TREMOVE);
786 if (IS_ERR(fc))
787 goto error;
788
789 v9fs_put_int32(bufp, fid, &fc->params.tremove.fid);
790
791 if (buf_check_overflow(bufp)) {
792 kfree(fc);
793 fc = ERR_PTR(-ENOMEM);
794 }
795 error:
796 return fc;
797}
798
799struct v9fs_fcall *v9fs_create_tstat(u32 fid)
800{
801 int size;
802 struct v9fs_fcall *fc;
803 struct cbuf buffer;
804 struct cbuf *bufp = &buffer;
805
806 size = 4; /* fid[4] */
807 fc = v9fs_create_common(bufp, size, TSTAT);
808 if (IS_ERR(fc))
809 goto error;
810
811 v9fs_put_int32(bufp, fid, &fc->params.tstat.fid);
812
813 if (buf_check_overflow(bufp)) {
814 kfree(fc);
815 fc = ERR_PTR(-ENOMEM);
816 }
817 error:
818 return fc;
819}
820
/*
 * Build a TWSTAT request: fid[4] stat[n].
 *
 * The stat payload is double size-prefixed on the wire: the outer
 * size[2] written here is statsz + 2 because it also covers the stat
 * structure's own leading size[2] (written inside v9fs_put_wstat),
 * hence the 4 + 2 + 2 + statsz total.
 *
 * Returns the new fcall or an ERR_PTR on failure.
 */
struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat,
				      int extended)
{
	int size, statsz;
	struct v9fs_fcall *fc;
	struct cbuf buffer;
	struct cbuf *bufp = &buffer;

	statsz = v9fs_size_wstat(wstat, extended);
	size = 4 + 2 + 2 + statsz;	/* fid[4] stat[n] */
	fc = v9fs_create_common(bufp, size, TWSTAT);
	if (IS_ERR(fc))
		goto error;

	v9fs_put_int32(bufp, fid, &fc->params.twstat.fid);
	buf_put_int16(bufp, statsz + 2);	/* outer prefix includes inner size[2] */
	v9fs_put_wstat(bufp, wstat, &fc->params.twstat.stat, statsz, extended);

	if (buf_check_overflow(bufp)) {
		kfree(fc);
		fc = ERR_PTR(-ENOMEM);
	}
      error:
	return fc;
}
diff --git a/fs/9p/conv.h b/fs/9p/conv.h
deleted file mode 100644
index dd5b6b1b61..0000000000
--- a/fs/9p/conv.h
+++ /dev/null
@@ -1,50 +0,0 @@
1/*
2 * linux/fs/9p/conv.h
3 *
4 * 9P protocol conversion definitions.
5 *
6 * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
8 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to:
21 * Free Software Foundation
22 * 51 Franklin Street, Fifth Floor
23 * Boston, MA 02111-1301 USA
24 *
25 */
26
27int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat,
28 int extended);
29int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall,
30 int extended);
31
32void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag);
33
34struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version);
35struct v9fs_fcall *v9fs_create_tattach(u32 fid, u32 afid, char *uname,
36 char *aname);
37struct v9fs_fcall *v9fs_create_tflush(u16 oldtag);
38struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname,
39 char **wnames);
40struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode);
41struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode,
42 char *extension, int extended);
43struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count);
44struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count,
45 const char __user *data);
46struct v9fs_fcall *v9fs_create_tclunk(u32 fid);
47struct v9fs_fcall *v9fs_create_tremove(u32 fid);
48struct v9fs_fcall *v9fs_create_tstat(u32 fid);
49struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat,
50 int extended);
diff --git a/fs/9p/debug.h b/fs/9p/debug.h
deleted file mode 100644
index 4228c0bb3c..0000000000
--- a/fs/9p/debug.h
+++ /dev/null
@@ -1,77 +0,0 @@
1/*
2 * linux/fs/9p/debug.h - V9FS Debug Definitions
3 *
4 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
5 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2
9 * as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to:
18 * Free Software Foundation
19 * 51 Franklin Street, Fifth Floor
20 * Boston, MA 02111-1301 USA
21 *
22 */
23
24#define DEBUG_ERROR (1<<0)
25#define DEBUG_CURRENT (1<<1)
26#define DEBUG_9P (1<<2)
27#define DEBUG_VFS (1<<3)
28#define DEBUG_CONV (1<<4)
29#define DEBUG_MUX (1<<5)
30#define DEBUG_TRANS (1<<6)
31#define DEBUG_SLABS (1<<7)
32#define DEBUG_FCALL (1<<8)
33
34#define DEBUG_DUMP_PKT 0
35
36extern int v9fs_debug_level;
37
38#define dprintk(level, format, arg...) \
39do { \
40 if((v9fs_debug_level & level)==level) \
41 printk(KERN_NOTICE "-- %s (%d): " \
42 format , __FUNCTION__, current->pid , ## arg); \
43} while(0)
44
45#define eprintk(level, format, arg...) \
46do { \
47 printk(level "v9fs: %s (%d): " \
48 format , __FUNCTION__, current->pid , ## arg); \
49} while(0)
50
51#if DEBUG_DUMP_PKT
52static inline void dump_data(const unsigned char *data, unsigned int datalen)
53{
54 int i, n;
55 char buf[5*8];
56
57 n = 0;
58 i = 0;
59 while (i < datalen) {
60 n += snprintf(buf+n, sizeof(buf)-n, "%02x", data[i++]);
61 if (i%4 == 0)
62 n += snprintf(buf+n, sizeof(buf)-n, " ");
63
64 if (i%16 == 0) {
65 dprintk(DEBUG_ERROR, "%s\n", buf);
66 n = 0;
67 }
68 }
69
70 dprintk(DEBUG_ERROR, "%s\n", buf);
71}
72#else /* DEBUG_DUMP_PKT */
73static inline void dump_data(const unsigned char *data, unsigned int datalen)
74{
75
76}
77#endif /* DEBUG_DUMP_PKT */
diff --git a/fs/9p/error.c b/fs/9p/error.c
deleted file mode 100644
index 0d7fa4e088..0000000000
--- a/fs/9p/error.c
+++ /dev/null
@@ -1,93 +0,0 @@
1/*
2 * linux/fs/9p/error.c
3 *
4 * Error string handling
5 *
6 * Plan 9 uses error strings, Unix uses error numbers. These functions
7 * try to help manage that and provide for dynamically adding error
8 * mappings.
9 *
10 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
11 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to:
24 * Free Software Foundation
25 * 51 Franklin Street, Fifth Floor
26 * Boston, MA 02111-1301 USA
27 *
28 */
29
30#include <linux/module.h>
31
32#include <linux/list.h>
33#include <linux/jhash.h>
34
35#include "debug.h"
36#include "error.h"
37
38/**
39 * v9fs_error_init - preload
40 * @errstr: error string
41 *
42 */
43
44int v9fs_error_init(void)
45{
46 struct errormap *c;
47 int bucket;
48
49 /* initialize hash table */
50 for (bucket = 0; bucket < ERRHASHSZ; bucket++)
51 INIT_HLIST_HEAD(&hash_errmap[bucket]);
52
53 /* load initial error map into hash table */
54 for (c = errmap; c->name != NULL; c++) {
55 c->namelen = strlen(c->name);
56 bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ;
57 INIT_HLIST_NODE(&c->list);
58 hlist_add_head(&c->list, &hash_errmap[bucket]);
59 }
60
61 return 1;
62}
63
64/**
65 * errstr2errno - convert error string to error number
66 * @errstr: error string
67 *
68 */
69
70int v9fs_errstr2errno(char *errstr, int len)
71{
72 int errno = 0;
73 struct hlist_node *p = NULL;
74 struct errormap *c = NULL;
75 int bucket = jhash(errstr, len, 0) % ERRHASHSZ;
76
77 hlist_for_each_entry(c, p, &hash_errmap[bucket], list) {
78 if (c->namelen==len && !memcmp(c->name, errstr, len)) {
79 errno = c->val;
80 break;
81 }
82 }
83
84 if (errno == 0) {
85 /* TODO: if error isn't found, add it dynamically */
86 errstr[len] = 0;
87 printk(KERN_ERR "%s: errstr :%s: not found\n", __FUNCTION__,
88 errstr);
89 errno = 1;
90 }
91
92 return -errno;
93}
diff --git a/fs/9p/error.h b/fs/9p/error.h
deleted file mode 100644
index 5f3ca522b3..0000000000
--- a/fs/9p/error.h
+++ /dev/null
@@ -1,177 +0,0 @@
1/*
2 * linux/fs/9p/error.h
3 *
4 * Huge Nasty Error Table
5 *
6 * Plan 9 uses error strings, Unix uses error numbers. This table tries to
7 * match UNIX strings and Plan 9 strings to unix error numbers. It is used
8 * to preload the dynamic error table which can also track user-specific error
9 * strings.
10 *
11 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
12 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License version 2
16 * as published by the Free Software Foundation.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to:
25 * Free Software Foundation
26 * 51 Franklin Street, Fifth Floor
27 * Boston, MA 02111-1301 USA
28 *
29 */
30
31#include <linux/errno.h>
32#include <asm/errno.h>
33
34struct errormap {
35 char *name;
36 int val;
37
38 int namelen;
39 struct hlist_node list;
40};
41
42#define ERRHASHSZ 32
43static struct hlist_head hash_errmap[ERRHASHSZ];
44
45/* FixMe - reduce to a reasonable size */
46static struct errormap errmap[] = {
47 {"Operation not permitted", EPERM},
48 {"wstat prohibited", EPERM},
49 {"No such file or directory", ENOENT},
50 {"directory entry not found", ENOENT},
51 {"file not found", ENOENT},
52 {"Interrupted system call", EINTR},
53 {"Input/output error", EIO},
54 {"No such device or address", ENXIO},
55 {"Argument list too long", E2BIG},
56 {"Bad file descriptor", EBADF},
57 {"Resource temporarily unavailable", EAGAIN},
58 {"Cannot allocate memory", ENOMEM},
59 {"Permission denied", EACCES},
60 {"Bad address", EFAULT},
61 {"Block device required", ENOTBLK},
62 {"Device or resource busy", EBUSY},
63 {"File exists", EEXIST},
64 {"Invalid cross-device link", EXDEV},
65 {"No such device", ENODEV},
66 {"Not a directory", ENOTDIR},
67 {"Is a directory", EISDIR},
68 {"Invalid argument", EINVAL},
69 {"Too many open files in system", ENFILE},
70 {"Too many open files", EMFILE},
71 {"Text file busy", ETXTBSY},
72 {"File too large", EFBIG},
73 {"No space left on device", ENOSPC},
74 {"Illegal seek", ESPIPE},
75 {"Read-only file system", EROFS},
76 {"Too many links", EMLINK},
77 {"Broken pipe", EPIPE},
78 {"Numerical argument out of domain", EDOM},
79 {"Numerical result out of range", ERANGE},
80 {"Resource deadlock avoided", EDEADLK},
81 {"File name too long", ENAMETOOLONG},
82 {"No locks available", ENOLCK},
83 {"Function not implemented", ENOSYS},
84 {"Directory not empty", ENOTEMPTY},
85 {"Too many levels of symbolic links", ELOOP},
86 {"No message of desired type", ENOMSG},
87 {"Identifier removed", EIDRM},
88 {"No data available", ENODATA},
89 {"Machine is not on the network", ENONET},
90 {"Package not installed", ENOPKG},
91 {"Object is remote", EREMOTE},
92 {"Link has been severed", ENOLINK},
93 {"Communication error on send", ECOMM},
94 {"Protocol error", EPROTO},
95 {"Bad message", EBADMSG},
96 {"File descriptor in bad state", EBADFD},
97 {"Streams pipe error", ESTRPIPE},
98 {"Too many users", EUSERS},
99 {"Socket operation on non-socket", ENOTSOCK},
100 {"Message too long", EMSGSIZE},
101 {"Protocol not available", ENOPROTOOPT},
102 {"Protocol not supported", EPROTONOSUPPORT},
103 {"Socket type not supported", ESOCKTNOSUPPORT},
104 {"Operation not supported", EOPNOTSUPP},
105 {"Protocol family not supported", EPFNOSUPPORT},
106 {"Network is down", ENETDOWN},
107 {"Network is unreachable", ENETUNREACH},
108 {"Network dropped connection on reset", ENETRESET},
109 {"Software caused connection abort", ECONNABORTED},
110 {"Connection reset by peer", ECONNRESET},
111 {"No buffer space available", ENOBUFS},
112 {"Transport endpoint is already connected", EISCONN},
113 {"Transport endpoint is not connected", ENOTCONN},
114 {"Cannot send after transport endpoint shutdown", ESHUTDOWN},
115 {"Connection timed out", ETIMEDOUT},
116 {"Connection refused", ECONNREFUSED},
117 {"Host is down", EHOSTDOWN},
118 {"No route to host", EHOSTUNREACH},
119 {"Operation already in progress", EALREADY},
120 {"Operation now in progress", EINPROGRESS},
121 {"Is a named type file", EISNAM},
122 {"Remote I/O error", EREMOTEIO},
123 {"Disk quota exceeded", EDQUOT},
124/* errors from fossil, vacfs, and u9fs */
125 {"fid unknown or out of range", EBADF},
126 {"permission denied", EACCES},
127 {"file does not exist", ENOENT},
128 {"authentication failed", ECONNREFUSED},
129 {"bad offset in directory read", ESPIPE},
130 {"bad use of fid", EBADF},
131 {"wstat can't convert between files and directories", EPERM},
132 {"directory is not empty", ENOTEMPTY},
133 {"file exists", EEXIST},
134 {"file already exists", EEXIST},
135 {"file or directory already exists", EEXIST},
136 {"fid already in use", EBADF},
137 {"file in use", ETXTBSY},
138 {"i/o error", EIO},
139 {"file already open for I/O", ETXTBSY},
140 {"illegal mode", EINVAL},
141 {"illegal name", ENAMETOOLONG},
142 {"not a directory", ENOTDIR},
143 {"not a member of proposed group", EPERM},
144 {"not owner", EACCES},
145 {"only owner can change group in wstat", EACCES},
146 {"read only file system", EROFS},
147 {"no access to special file", EPERM},
148 {"i/o count too large", EIO},
149 {"unknown group", EINVAL},
150 {"unknown user", EINVAL},
151 {"bogus wstat buffer", EPROTO},
152 {"exclusive use file already open", EAGAIN},
153 {"corrupted directory entry", EIO},
154 {"corrupted file entry", EIO},
155 {"corrupted block label", EIO},
156 {"corrupted meta data", EIO},
157 {"illegal offset", EINVAL},
158 {"illegal path element", ENOENT},
159 {"root of file system is corrupted", EIO},
160 {"corrupted super block", EIO},
161 {"protocol botch", EPROTO},
162 {"file system is full", ENOSPC},
163 {"file is in use", EAGAIN},
164 {"directory entry is not allocated", ENOENT},
165 {"file is read only", EROFS},
166 {"file has been removed", EIDRM},
167 {"only support truncation to zero length", EPERM},
168 {"cannot remove root", EPERM},
169 {"file too big", EFBIG},
170 {"venti i/o error", EIO},
171 /* these are not errors */
172 {"u9fs rhostsauth: no authentication required", 0},
173 {"u9fs authnone: no authentication required", 0},
174 {NULL, -1}
175};
176
177extern int v9fs_error_init(void);
diff --git a/fs/9p/fcall.c b/fs/9p/fcall.c
deleted file mode 100644
index dc336a6759..0000000000
--- a/fs/9p/fcall.c
+++ /dev/null
@@ -1,427 +0,0 @@
1/*
2 * linux/fs/9p/fcall.c
3 *
4 * This file contains functions to perform synchronous 9P calls
5 *
6 * Copyright (C) 2004 by Latchesar Ionkov <lucho@ionkov.net>
7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
8 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to:
21 * Free Software Foundation
22 * 51 Franklin Street, Fifth Floor
23 * Boston, MA 02111-1301 USA
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/errno.h>
29#include <linux/fs.h>
30#include <linux/sched.h>
31#include <linux/idr.h>
32
33#include "debug.h"
34#include "v9fs.h"
35#include "9p.h"
36#include "conv.h"
37#include "mux.h"
38
39/**
40 * v9fs_t_version - negotiate protocol parameters with sever
41 * @v9ses: 9P2000 session information
42 * @msize: requested max size packet
43 * @version: requested version.extension string
44 * @fcall: pointer to response fcall pointer
45 *
46 */
47
48int
49v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
50 char *version, struct v9fs_fcall **rcp)
51{
52 int ret;
53 struct v9fs_fcall *tc;
54
55 dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version);
56 tc = v9fs_create_tversion(msize, version);
57
58 if (!IS_ERR(tc)) {
59 ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
60 kfree(tc);
61 } else
62 ret = PTR_ERR(tc);
63
64 return ret;
65}
66
67/**
68 * v9fs_t_attach - mount the server
69 * @v9ses: 9P2000 session information
70 * @uname: user name doing the attach
71 * @aname: remote name being attached to
72 * @fid: mount fid to attatch to root node
73 * @afid: authentication fid (in this case result key)
74 * @fcall: pointer to response fcall pointer
75 *
76 */
77
78int
79v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
80 u32 fid, u32 afid, struct v9fs_fcall **rcp)
81{
82 int ret;
83 struct v9fs_fcall* tc;
84
85 dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname,
86 aname, fid, afid);
87
88 tc = v9fs_create_tattach(fid, afid, uname, aname);
89 if (!IS_ERR(tc)) {
90 ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
91 kfree(tc);
92 } else
93 ret = PTR_ERR(tc);
94
95 return ret;
96}
97
98static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc,
99 struct v9fs_fcall *rc, int err)
100{
101 int fid, id;
102 struct v9fs_session_info *v9ses;
103
104 id = 0;
105 fid = tc->params.tclunk.fid;
106 if (rc)
107 id = rc->id;
108
109 kfree(tc);
110 kfree(rc);
111 if (id == RCLUNK) {
112 v9ses = a;
113 v9fs_put_idpool(fid, &v9ses->fidpool);
114 }
115}
116
117/**
118 * v9fs_t_clunk - release a fid (finish a transaction)
119 * @v9ses: 9P2000 session information
120 * @fid: fid to release
121 * @fcall: pointer to response fcall pointer
122 *
123 */
124
125int
126v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid)
127{
128 int ret;
129 struct v9fs_fcall *tc, *rc;
130
131 dprintk(DEBUG_9P, "fid %d\n", fid);
132
133 rc = NULL;
134 tc = v9fs_create_tclunk(fid);
135 if (!IS_ERR(tc))
136 ret = v9fs_mux_rpc(v9ses->mux, tc, &rc);
137 else
138 ret = PTR_ERR(tc);
139
140 if (ret)
141 dprintk(DEBUG_ERROR, "failed fid %d err %d\n", fid, ret);
142
143 v9fs_t_clunk_cb(v9ses, tc, rc, ret);
144 return ret;
145}
146
147#if 0
148/**
149 * v9fs_v9fs_t_flush - flush a pending transaction
150 * @v9ses: 9P2000 session information
151 * @tag: tag to release
152 *
153 */
154int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag)
155{
156 int ret;
157 struct v9fs_fcall *tc;
158
159 dprintk(DEBUG_9P, "oldtag %d\n", oldtag);
160
161 tc = v9fs_create_tflush(oldtag);
162 if (!IS_ERR(tc)) {
163 ret = v9fs_mux_rpc(v9ses->mux, tc, NULL);
164 kfree(tc);
165 } else
166 ret = PTR_ERR(tc);
167
168 return ret;
169}
170#endif
171
172/**
173 * v9fs_t_stat - read a file's meta-data
174 * @v9ses: 9P2000 session information
175 * @fid: fid pointing to file or directory to get info about
176 * @fcall: pointer to response fcall
177 *
178 */
179
180int
181v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **rcp)
182{
183 int ret;
184 struct v9fs_fcall *tc;
185
186 dprintk(DEBUG_9P, "fid %d\n", fid);
187
188 ret = -ENOMEM;
189 tc = v9fs_create_tstat(fid);
190 if (!IS_ERR(tc)) {
191 ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
192 kfree(tc);
193 } else
194 ret = PTR_ERR(tc);
195
196 return ret;
197}
198
199/**
200 * v9fs_t_wstat - write a file's meta-data
201 * @v9ses: 9P2000 session information
202 * @fid: fid pointing to file or directory to write info about
203 * @stat: metadata
204 * @fcall: pointer to response fcall
205 *
206 */
207
208int
209v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
210 struct v9fs_wstat *wstat, struct v9fs_fcall **rcp)
211{
212 int ret;
213 struct v9fs_fcall *tc;
214
215 dprintk(DEBUG_9P, "fid %d\n", fid);
216
217 tc = v9fs_create_twstat(fid, wstat, v9ses->extended);
218 if (!IS_ERR(tc)) {
219 ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
220 kfree(tc);
221 } else
222 ret = PTR_ERR(tc);
223
224 return ret;
225}
226
227/**
228 * v9fs_t_walk - walk a fid to a new file or directory
229 * @v9ses: 9P2000 session information
230 * @fid: fid to walk
231 * @newfid: new fid (for clone operations)
232 * @name: path to walk fid to
233 * @fcall: pointer to response fcall
234 *
235 */
236
237/* TODO: support multiple walk */
238
239int
240v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
241 char *name, struct v9fs_fcall **rcp)
242{
243 int ret;
244 struct v9fs_fcall *tc;
245 int nwname;
246
247 dprintk(DEBUG_9P, "fid %d newfid %d wname '%s'\n", fid, newfid, name);
248
249 if (name)
250 nwname = 1;
251 else
252 nwname = 0;
253
254 tc = v9fs_create_twalk(fid, newfid, nwname, &name);
255 if (!IS_ERR(tc)) {
256 ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
257 kfree(tc);
258 } else
259 ret = PTR_ERR(tc);
260
261 return ret;
262}
263
264/**
265 * v9fs_t_open - open a file
266 *
267 * @v9ses - 9P2000 session information
268 * @fid - fid to open
269 * @mode - mode to open file (R, RW, etc)
270 * @fcall - pointer to response fcall
271 *
272 */
273
274int
275v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
276 struct v9fs_fcall **rcp)
277{
278 int ret;
279 struct v9fs_fcall *tc;
280
281 dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode);
282
283 tc = v9fs_create_topen(fid, mode);
284 if (!IS_ERR(tc)) {
285 ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
286 kfree(tc);
287 } else
288 ret = PTR_ERR(tc);
289
290 return ret;
291}
292
293/**
294 * v9fs_t_remove - remove a file or directory
295 * @v9ses: 9P2000 session information
296 * @fid: fid to remove
297 * @fcall: pointer to response fcall
298 *
299 */
300
301int
302v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
303 struct v9fs_fcall **rcp)
304{
305 int ret;
306 struct v9fs_fcall *tc;
307
308 dprintk(DEBUG_9P, "fid %d\n", fid);
309
310 tc = v9fs_create_tremove(fid);
311 if (!IS_ERR(tc)) {
312 ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
313 kfree(tc);
314 } else
315 ret = PTR_ERR(tc);
316
317 return ret;
318}
319
320/**
321 * v9fs_t_create - create a file or directory
322 * @v9ses: 9P2000 session information
323 * @fid: fid to create
324 * @name: name of the file or directory to create
325 * @perm: permissions to create with
326 * @mode: mode to open file (R, RW, etc)
327 * @fcall: pointer to response fcall
328 *
329 */
330
331int
332v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, u32 perm,
333 u8 mode, char *extension, struct v9fs_fcall **rcp)
334{
335 int ret;
336 struct v9fs_fcall *tc;
337
338 dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n",
339 fid, name, perm, mode);
340
341 tc = v9fs_create_tcreate(fid, name, perm, mode, extension,
342 v9ses->extended);
343
344 if (!IS_ERR(tc)) {
345 ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
346 kfree(tc);
347 } else
348 ret = PTR_ERR(tc);
349
350 return ret;
351}
352
353/**
354 * v9fs_t_read - read data
355 * @v9ses: 9P2000 session information
356 * @fid: fid to read from
357 * @offset: offset to start read at
358 * @count: how many bytes to read
359 * @fcall: pointer to response fcall (with data)
360 *
361 */
362
363int
364v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
365 u32 count, struct v9fs_fcall **rcp)
366{
367 int ret;
368 struct v9fs_fcall *tc, *rc;
369
370 dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid,
371 (long long unsigned) offset, count);
372
373 tc = v9fs_create_tread(fid, offset, count);
374 if (!IS_ERR(tc)) {
375 ret = v9fs_mux_rpc(v9ses->mux, tc, &rc);
376 if (!ret)
377 ret = rc->params.rread.count;
378 if (rcp)
379 *rcp = rc;
380 else
381 kfree(rc);
382
383 kfree(tc);
384 } else
385 ret = PTR_ERR(tc);
386
387 return ret;
388}
389
390/**
391 * v9fs_t_write - write data
392 * @v9ses: 9P2000 session information
393 * @fid: fid to write to
394 * @offset: offset to start write at
395 * @count: how many bytes to write
396 * @fcall: pointer to response fcall
397 *
398 */
399
400int
401v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count,
402 const char __user *data, struct v9fs_fcall **rcp)
403{
404 int ret;
405 struct v9fs_fcall *tc, *rc;
406
407 dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid,
408 (long long unsigned) offset, count);
409
410 tc = v9fs_create_twrite(fid, offset, count, data);
411 if (!IS_ERR(tc)) {
412 ret = v9fs_mux_rpc(v9ses->mux, tc, &rc);
413
414 if (!ret)
415 ret = rc->params.rwrite.count;
416 if (rcp)
417 *rcp = rc;
418 else
419 kfree(rc);
420
421 kfree(tc);
422 } else
423 ret = PTR_ERR(tc);
424
425 return ret;
426}
427
diff --git a/fs/9p/fcprint.c b/fs/9p/fcprint.c
deleted file mode 100644
index 34b96114a2..0000000000
--- a/fs/9p/fcprint.c
+++ /dev/null
@@ -1,345 +0,0 @@
1/*
2 * linux/fs/9p/fcprint.c
3 *
4 * Print 9P call.
5 *
6 * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to:
19 * Free Software Foundation
20 * 51 Franklin Street, Fifth Floor
21 * Boston, MA 02111-1301 USA
22 *
23 */
24#include <linux/module.h>
25#include <linux/errno.h>
26#include <linux/fs.h>
27#include <linux/idr.h>
28
29#include "debug.h"
30#include "v9fs.h"
31#include "9p.h"
32#include "mux.h"
33
34static int
35v9fs_printqid(char *buf, int buflen, struct v9fs_qid *q)
36{
37 int n;
38 char b[10];
39
40 n = 0;
41 if (q->type & V9FS_QTDIR)
42 b[n++] = 'd';
43 if (q->type & V9FS_QTAPPEND)
44 b[n++] = 'a';
45 if (q->type & V9FS_QTAUTH)
46 b[n++] = 'A';
47 if (q->type & V9FS_QTEXCL)
48 b[n++] = 'l';
49 if (q->type & V9FS_QTTMP)
50 b[n++] = 't';
51 if (q->type & V9FS_QTSYMLINK)
52 b[n++] = 'L';
53 b[n] = '\0';
54
55 return scnprintf(buf, buflen, "(%.16llx %x %s)", (long long int) q->path,
56 q->version, b);
57}
58
59static int
60v9fs_printperm(char *buf, int buflen, int perm)
61{
62 int n;
63 char b[15];
64
65 n = 0;
66 if (perm & V9FS_DMDIR)
67 b[n++] = 'd';
68 if (perm & V9FS_DMAPPEND)
69 b[n++] = 'a';
70 if (perm & V9FS_DMAUTH)
71 b[n++] = 'A';
72 if (perm & V9FS_DMEXCL)
73 b[n++] = 'l';
74 if (perm & V9FS_DMTMP)
75 b[n++] = 't';
76 if (perm & V9FS_DMDEVICE)
77 b[n++] = 'D';
78 if (perm & V9FS_DMSOCKET)
79 b[n++] = 'S';
80 if (perm & V9FS_DMNAMEDPIPE)
81 b[n++] = 'P';
82 if (perm & V9FS_DMSYMLINK)
83 b[n++] = 'L';
84 b[n] = '\0';
85
86 return scnprintf(buf, buflen, "%s%03o", b, perm&077);
87}
88
89static int
90v9fs_printstat(char *buf, int buflen, struct v9fs_stat *st, int extended)
91{
92 int n;
93
94 n = scnprintf(buf, buflen, "'%.*s' '%.*s'", st->name.len,
95 st->name.str, st->uid.len, st->uid.str);
96 if (extended)
97 n += scnprintf(buf+n, buflen-n, "(%d)", st->n_uid);
98
99 n += scnprintf(buf+n, buflen-n, " '%.*s'", st->gid.len, st->gid.str);
100 if (extended)
101 n += scnprintf(buf+n, buflen-n, "(%d)", st->n_gid);
102
103 n += scnprintf(buf+n, buflen-n, " '%.*s'", st->muid.len, st->muid.str);
104 if (extended)
105 n += scnprintf(buf+n, buflen-n, "(%d)", st->n_muid);
106
107 n += scnprintf(buf+n, buflen-n, " q ");
108 n += v9fs_printqid(buf+n, buflen-n, &st->qid);
109 n += scnprintf(buf+n, buflen-n, " m ");
110 n += v9fs_printperm(buf+n, buflen-n, st->mode);
111 n += scnprintf(buf+n, buflen-n, " at %d mt %d l %lld",
112 st->atime, st->mtime, (long long int) st->length);
113
114 if (extended)
115 n += scnprintf(buf+n, buflen-n, " ext '%.*s'",
116 st->extension.len, st->extension.str);
117
118 return n;
119}
120
121static int
122v9fs_dumpdata(char *buf, int buflen, u8 *data, int datalen)
123{
124 int i, n;
125
126 i = n = 0;
127 while (i < datalen) {
128 n += scnprintf(buf + n, buflen - n, "%02x", data[i]);
129 if (i%4 == 3)
130 n += scnprintf(buf + n, buflen - n, " ");
131 if (i%32 == 31)
132 n += scnprintf(buf + n, buflen - n, "\n");
133
134 i++;
135 }
136 n += scnprintf(buf + n, buflen - n, "\n");
137
138 return n;
139}
140
141static int
142v9fs_printdata(char *buf, int buflen, u8 *data, int datalen)
143{
144 return v9fs_dumpdata(buf, buflen, data, datalen<16?datalen:16);
145}
146
147int
148v9fs_printfcall(char *buf, int buflen, struct v9fs_fcall *fc, int extended)
149{
150 int i, ret, type, tag;
151
152 if (!fc)
153 return scnprintf(buf, buflen, "<NULL>");
154
155 type = fc->id;
156 tag = fc->tag;
157
158 ret = 0;
159 switch (type) {
160 case TVERSION:
161 ret += scnprintf(buf+ret, buflen-ret,
162 "Tversion tag %u msize %u version '%.*s'", tag,
163 fc->params.tversion.msize, fc->params.tversion.version.len,
164 fc->params.tversion.version.str);
165 break;
166
167 case RVERSION:
168 ret += scnprintf(buf+ret, buflen-ret,
169 "Rversion tag %u msize %u version '%.*s'", tag,
170 fc->params.rversion.msize, fc->params.rversion.version.len,
171 fc->params.rversion.version.str);
172 break;
173
174 case TAUTH:
175 ret += scnprintf(buf+ret, buflen-ret,
176 "Tauth tag %u afid %d uname '%.*s' aname '%.*s'", tag,
177 fc->params.tauth.afid, fc->params.tauth.uname.len,
178 fc->params.tauth.uname.str, fc->params.tauth.aname.len,
179 fc->params.tauth.aname.str);
180 break;
181
182 case RAUTH:
183 ret += scnprintf(buf+ret, buflen-ret, "Rauth tag %u qid ", tag);
184 v9fs_printqid(buf+ret, buflen-ret, &fc->params.rauth.qid);
185 break;
186
187 case TATTACH:
188 ret += scnprintf(buf+ret, buflen-ret,
189 "Tattach tag %u fid %d afid %d uname '%.*s' aname '%.*s'",
190 tag, fc->params.tattach.fid, fc->params.tattach.afid,
191 fc->params.tattach.uname.len, fc->params.tattach.uname.str,
192 fc->params.tattach.aname.len, fc->params.tattach.aname.str);
193 break;
194
195 case RATTACH:
196 ret += scnprintf(buf+ret, buflen-ret, "Rattach tag %u qid ", tag);
197 v9fs_printqid(buf+ret, buflen-ret, &fc->params.rattach.qid);
198 break;
199
200 case RERROR:
201 ret += scnprintf(buf+ret, buflen-ret, "Rerror tag %u ename '%.*s'",
202 tag, fc->params.rerror.error.len,
203 fc->params.rerror.error.str);
204 if (extended)
205 ret += scnprintf(buf+ret, buflen-ret, " ecode %d\n",
206 fc->params.rerror.errno);
207 break;
208
209 case TFLUSH:
210 ret += scnprintf(buf+ret, buflen-ret, "Tflush tag %u oldtag %u",
211 tag, fc->params.tflush.oldtag);
212 break;
213
214 case RFLUSH:
215 ret += scnprintf(buf+ret, buflen-ret, "Rflush tag %u", tag);
216 break;
217
218 case TWALK:
219 ret += scnprintf(buf+ret, buflen-ret,
220 "Twalk tag %u fid %d newfid %d nwname %d", tag,
221 fc->params.twalk.fid, fc->params.twalk.newfid,
222 fc->params.twalk.nwname);
223 for(i = 0; i < fc->params.twalk.nwname; i++)
224 ret += scnprintf(buf+ret, buflen-ret," '%.*s'",
225 fc->params.twalk.wnames[i].len,
226 fc->params.twalk.wnames[i].str);
227 break;
228
229 case RWALK:
230 ret += scnprintf(buf+ret, buflen-ret, "Rwalk tag %u nwqid %d",
231 tag, fc->params.rwalk.nwqid);
232 for(i = 0; i < fc->params.rwalk.nwqid; i++)
233 ret += v9fs_printqid(buf+ret, buflen-ret,
234 &fc->params.rwalk.wqids[i]);
235 break;
236
237 case TOPEN:
238 ret += scnprintf(buf+ret, buflen-ret,
239 "Topen tag %u fid %d mode %d", tag,
240 fc->params.topen.fid, fc->params.topen.mode);
241 break;
242
243 case ROPEN:
244 ret += scnprintf(buf+ret, buflen-ret, "Ropen tag %u", tag);
245 ret += v9fs_printqid(buf+ret, buflen-ret, &fc->params.ropen.qid);
246 ret += scnprintf(buf+ret, buflen-ret," iounit %d",
247 fc->params.ropen.iounit);
248 break;
249
250 case TCREATE:
251 ret += scnprintf(buf+ret, buflen-ret,
252 "Tcreate tag %u fid %d name '%.*s' perm ", tag,
253 fc->params.tcreate.fid, fc->params.tcreate.name.len,
254 fc->params.tcreate.name.str);
255
256 ret += v9fs_printperm(buf+ret, buflen-ret, fc->params.tcreate.perm);
257 ret += scnprintf(buf+ret, buflen-ret, " mode %d",
258 fc->params.tcreate.mode);
259 break;
260
261 case RCREATE:
262 ret += scnprintf(buf+ret, buflen-ret, "Rcreate tag %u", tag);
263 ret += v9fs_printqid(buf+ret, buflen-ret, &fc->params.rcreate.qid);
264 ret += scnprintf(buf+ret, buflen-ret, " iounit %d",
265 fc->params.rcreate.iounit);
266 break;
267
268 case TREAD:
269 ret += scnprintf(buf+ret, buflen-ret,
270 "Tread tag %u fid %d offset %lld count %u", tag,
271 fc->params.tread.fid,
272 (long long int) fc->params.tread.offset,
273 fc->params.tread.count);
274 break;
275
276 case RREAD:
277 ret += scnprintf(buf+ret, buflen-ret,
278 "Rread tag %u count %u data ", tag,
279 fc->params.rread.count);
280 ret += v9fs_printdata(buf+ret, buflen-ret, fc->params.rread.data,
281 fc->params.rread.count);
282 break;
283
284 case TWRITE:
285 ret += scnprintf(buf+ret, buflen-ret,
286 "Twrite tag %u fid %d offset %lld count %u data ",
287 tag, fc->params.twrite.fid,
288 (long long int) fc->params.twrite.offset,
289 fc->params.twrite.count);
290 ret += v9fs_printdata(buf+ret, buflen-ret, fc->params.twrite.data,
291 fc->params.twrite.count);
292 break;
293
294 case RWRITE:
295 ret += scnprintf(buf+ret, buflen-ret, "Rwrite tag %u count %u",
296 tag, fc->params.rwrite.count);
297 break;
298
299 case TCLUNK:
300 ret += scnprintf(buf+ret, buflen-ret, "Tclunk tag %u fid %d",
301 tag, fc->params.tclunk.fid);
302 break;
303
304 case RCLUNK:
305 ret += scnprintf(buf+ret, buflen-ret, "Rclunk tag %u", tag);
306 break;
307
308 case TREMOVE:
309 ret += scnprintf(buf+ret, buflen-ret, "Tremove tag %u fid %d",
310 tag, fc->params.tremove.fid);
311 break;
312
313 case RREMOVE:
314 ret += scnprintf(buf+ret, buflen-ret, "Rremove tag %u", tag);
315 break;
316
317 case TSTAT:
318 ret += scnprintf(buf+ret, buflen-ret, "Tstat tag %u fid %d",
319 tag, fc->params.tstat.fid);
320 break;
321
322 case RSTAT:
323 ret += scnprintf(buf+ret, buflen-ret, "Rstat tag %u ", tag);
324 ret += v9fs_printstat(buf+ret, buflen-ret, &fc->params.rstat.stat,
325 extended);
326 break;
327
328 case TWSTAT:
329 ret += scnprintf(buf+ret, buflen-ret, "Twstat tag %u fid %d ",
330 tag, fc->params.twstat.fid);
331 ret += v9fs_printstat(buf+ret, buflen-ret, &fc->params.twstat.stat,
332 extended);
333 break;
334
335 case RWSTAT:
336 ret += scnprintf(buf+ret, buflen-ret, "Rwstat tag %u", tag);
337 break;
338
339 default:
340 ret += scnprintf(buf+ret, buflen-ret, "unknown type %d", type);
341 break;
342 }
343
344 return ret;
345}
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 90419715c7..08fa320b7e 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -26,10 +26,10 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/idr.h> 27#include <linux/idr.h>
28#include <asm/semaphore.h> 28#include <asm/semaphore.h>
29#include <net/9p/9p.h>
30#include <net/9p/client.h>
29 31
30#include "debug.h"
31#include "v9fs.h" 32#include "v9fs.h"
32#include "9p.h"
33#include "v9fs_vfs.h" 33#include "v9fs_vfs.h"
34#include "fid.h" 34#include "fid.h"
35 35
@@ -40,67 +40,29 @@
40 * 40 *
41 */ 41 */
42 42
43int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) 43int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
44{ 44{
45 struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; 45 struct v9fs_dentry *dent;
46 dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid,
47 dentry->d_iname, dentry);
48 if (dentry->d_fsdata == NULL) {
49 dentry->d_fsdata =
50 kmalloc(sizeof(struct list_head), GFP_KERNEL);
51 if (dentry->d_fsdata == NULL) {
52 dprintk(DEBUG_ERROR, "Out of memory\n");
53 return -ENOMEM;
54 }
55 fid_list = (struct list_head *)dentry->d_fsdata;
56 INIT_LIST_HEAD(fid_list); /* Initialize list head */
57 }
58 46
59 fid->uid = current->uid; 47 P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n",
60 list_add(&fid->list, fid_list); 48 fid->fid, dentry->d_iname);
61 return 0;
62}
63 49
64/** 50 dent = dentry->d_fsdata;
65 * v9fs_fid_create - allocate a FID structure 51 if (!dent) {
66 * @dentry - dentry to link newly created fid to 52 dent = kmalloc(sizeof(struct v9fs_dentry), GFP_KERNEL);
67 * 53 if (!dent)
68 */ 54 return -ENOMEM;
69
70struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid)
71{
72 struct v9fs_fid *new;
73 55
74 dprintk(DEBUG_9P, "fid create fid %d\n", fid); 56 spin_lock_init(&dent->lock);
75 new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); 57 INIT_LIST_HEAD(&dent->fidlist);
76 if (new == NULL) { 58 dentry->d_fsdata = dent;
77 dprintk(DEBUG_ERROR, "Out of Memory\n");
78 return ERR_PTR(-ENOMEM);
79 } 59 }
80 60
81 new->fid = fid; 61 spin_lock(&dent->lock);
82 new->v9ses = v9ses; 62 list_add(&fid->dlist, &dent->fidlist);
83 new->fidopen = 0; 63 spin_unlock(&dent->lock);
84 new->fidclunked = 0;
85 new->iounit = 0;
86 new->rdir_pos = 0;
87 new->rdir_fcall = NULL;
88 init_MUTEX(&new->lock);
89 INIT_LIST_HEAD(&new->list);
90
91 return new;
92}
93
94/**
95 * v9fs_fid_destroy - deallocate a FID structure
96 * @fid: fid to destroy
97 *
98 */
99 64
100void v9fs_fid_destroy(struct v9fs_fid *fid) 65 return 0;
101{
102 list_del(&fid->list);
103 kfree(fid);
104} 66}
105 67
106/** 68/**
@@ -114,30 +76,42 @@ void v9fs_fid_destroy(struct v9fs_fid *fid)
114 * 76 *
115 */ 77 */
116 78
117struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) 79struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
118{ 80{
119 struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; 81 struct v9fs_dentry *dent;
120 struct v9fs_fid *return_fid = NULL; 82 struct p9_fid *fid;
121 83
122 dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry); 84 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
123 85 dent = dentry->d_fsdata;
124 if (fid_list) 86 if (dent)
125 return_fid = list_entry(fid_list->next, struct v9fs_fid, list); 87 fid = list_entry(dent->fidlist.next, struct p9_fid, dlist);
88 else
89 fid = ERR_PTR(-EBADF);
90
91 P9_DPRINTK(P9_DEBUG_VFS, " fid: %p\n", fid);
92 return fid;
93}
126 94
127 if (!return_fid) { 95struct p9_fid *v9fs_fid_lookup_remove(struct dentry *dentry)
128 dprintk(DEBUG_ERROR, "Couldn't find a fid in dentry\n"); 96{
129 return_fid = ERR_PTR(-EBADF); 97 struct p9_fid *fid;
98 struct v9fs_dentry *dent;
99
100 dent = dentry->d_fsdata;
101 fid = v9fs_fid_lookup(dentry);
102 if (!IS_ERR(fid)) {
103 spin_lock(&dent->lock);
104 list_del(&fid->dlist);
105 spin_unlock(&dent->lock);
130 } 106 }
131 107
132 if(down_interruptible(&return_fid->lock)) 108 return fid;
133 return ERR_PTR(-EINTR);
134
135 return return_fid;
136} 109}
137 110
111
138/** 112/**
139 * v9fs_fid_clone - lookup the fid for a dentry, clone a private copy and 113 * v9fs_fid_clone - lookup the fid for a dentry, clone a private copy and
140 * release it 114 * release it
141 * @dentry: dentry to look for fid in 115 * @dentry: dentry to look for fid in
142 * 116 *
143 * find a fid in the dentry and then clone to a new private fid 117 * find a fid in the dentry and then clone to a new private fid
@@ -146,49 +120,15 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
146 * 120 *
147 */ 121 */
148 122
149struct v9fs_fid *v9fs_fid_clone(struct dentry *dentry) 123struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
150{ 124{
151 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); 125 struct p9_fid *ofid, *fid;
152 struct v9fs_fid *base_fid, *new_fid = ERR_PTR(-EBADF);
153 struct v9fs_fcall *fcall = NULL;
154 int fid, err;
155
156 base_fid = v9fs_fid_lookup(dentry);
157
158 if(IS_ERR(base_fid))
159 return base_fid;
160
161 if(base_fid) { /* clone fid */
162 fid = v9fs_get_idpool(&v9ses->fidpool);
163 if (fid < 0) {
164 eprintk(KERN_WARNING, "newfid fails!\n");
165 new_fid = ERR_PTR(-ENOSPC);
166 goto Release_Fid;
167 }
168
169 err = v9fs_t_walk(v9ses, base_fid->fid, fid, NULL, &fcall);
170 if (err < 0) {
171 dprintk(DEBUG_ERROR, "clone walk didn't work\n");
172 v9fs_put_idpool(fid, &v9ses->fidpool);
173 new_fid = ERR_PTR(err);
174 goto Free_Fcall;
175 }
176 new_fid = v9fs_fid_create(v9ses, fid);
177 if (new_fid == NULL) {
178 dprintk(DEBUG_ERROR, "out of memory\n");
179 new_fid = ERR_PTR(-ENOMEM);
180 }
181Free_Fcall:
182 kfree(fcall);
183 }
184 126
185Release_Fid: 127 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
186 up(&base_fid->lock); 128 ofid = v9fs_fid_lookup(dentry);
187 return new_fid; 129 if (IS_ERR(ofid))
188} 130 return ofid;
189 131
190void v9fs_fid_clunk(struct v9fs_session_info *v9ses, struct v9fs_fid *fid) 132 fid = p9_client_walk(ofid, 0, NULL, 1);
191{ 133 return fid;
192 v9fs_t_clunk(v9ses, fid->fid);
193 v9fs_fid_destroy(fid);
194} 134}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 48fc170c26..47a0ba7428 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -22,41 +22,12 @@
22 22
23#include <linux/list.h> 23#include <linux/list.h>
24 24
25#define FID_OP 0 25struct v9fs_dentry {
26#define FID_WALK 1 26 spinlock_t lock; /* protect fidlist */
27#define FID_CREATE 2 27 struct list_head fidlist;
28
29struct v9fs_fid {
30 struct list_head list; /* list of fids associated with a dentry */
31 struct list_head active; /* XXX - debug */
32
33 struct semaphore lock;
34
35 u32 fid;
36 unsigned char fidopen; /* set when fid is opened */
37 unsigned char fidclunked; /* set when fid has already been clunked */
38
39 struct v9fs_qid qid;
40 u32 iounit;
41
42 /* readdir stuff */
43 int rdir_fpos;
44 loff_t rdir_pos;
45 struct v9fs_fcall *rdir_fcall;
46
47 /* management stuff */
48 uid_t uid; /* user associated with this fid */
49
50 /* private data */
51 struct file *filp; /* backpointer to File struct for open files */
52 struct v9fs_session_info *v9ses; /* session info for this FID */
53}; 28};
54 29
55struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry); 30struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
56struct v9fs_fid *v9fs_fid_get_created(struct dentry *); 31struct p9_fid *v9fs_fid_lookup_remove(struct dentry *dentry);
57void v9fs_fid_destroy(struct v9fs_fid *fid); 32struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
58struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *, int fid); 33int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
59int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry);
60struct v9fs_fid *v9fs_fid_clone(struct dentry *dentry);
61void v9fs_fid_clunk(struct v9fs_session_info *v9ses, struct v9fs_fid *fid);
62
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
deleted file mode 100644
index c783874a9c..0000000000
--- a/fs/9p/mux.c
+++ /dev/null
@@ -1,1033 +0,0 @@
1/*
2 * linux/fs/9p/mux.c
3 *
4 * Protocol Multiplexer
5 *
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to:
20 * Free Software Foundation
21 * 51 Franklin Street, Fifth Floor
22 * Boston, MA 02111-1301 USA
23 *
24 */
25
26#include <linux/module.h>
27#include <linux/errno.h>
28#include <linux/fs.h>
29#include <linux/poll.h>
30#include <linux/kthread.h>
31#include <linux/idr.h>
32#include <linux/mutex.h>
33
34#include "debug.h"
35#include "v9fs.h"
36#include "9p.h"
37#include "conv.h"
38#include "transport.h"
39#include "mux.h"
40
41#define ERREQFLUSH 1
42#define SCHED_TIMEOUT 10
43#define MAXPOLLWADDR 2
44
45enum {
46 Rworksched = 1, /* read work scheduled or running */
47 Rpending = 2, /* can read */
48 Wworksched = 4, /* write work scheduled or running */
49 Wpending = 8, /* can write */
50};
51
52enum {
53 None,
54 Flushing,
55 Flushed,
56};
57
58struct v9fs_mux_poll_task;
59
60struct v9fs_req {
61 spinlock_t lock;
62 int tag;
63 struct v9fs_fcall *tcall;
64 struct v9fs_fcall *rcall;
65 int err;
66 v9fs_mux_req_callback cb;
67 void *cba;
68 int flush;
69 struct list_head req_list;
70};
71
72struct v9fs_mux_data {
73 spinlock_t lock;
74 struct list_head mux_list;
75 struct v9fs_mux_poll_task *poll_task;
76 int msize;
77 unsigned char *extended;
78 struct v9fs_transport *trans;
79 struct v9fs_idpool tagpool;
80 int err;
81 wait_queue_head_t equeue;
82 struct list_head req_list;
83 struct list_head unsent_req_list;
84 struct v9fs_fcall *rcall;
85 int rpos;
86 char *rbuf;
87 int wpos;
88 int wsize;
89 char *wbuf;
90 wait_queue_t poll_wait[MAXPOLLWADDR];
91 wait_queue_head_t *poll_waddr[MAXPOLLWADDR];
92 poll_table pt;
93 struct work_struct rq;
94 struct work_struct wq;
95 unsigned long wsched;
96};
97
98struct v9fs_mux_poll_task {
99 struct task_struct *task;
100 struct list_head mux_list;
101 int muxnum;
102};
103
104struct v9fs_mux_rpc {
105 struct v9fs_mux_data *m;
106 int err;
107 struct v9fs_fcall *tcall;
108 struct v9fs_fcall *rcall;
109 wait_queue_head_t wqueue;
110};
111
112static int v9fs_poll_proc(void *);
113static void v9fs_read_work(struct work_struct *work);
114static void v9fs_write_work(struct work_struct *work);
115static void v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address,
116 poll_table * p);
117static u16 v9fs_mux_get_tag(struct v9fs_mux_data *);
118static void v9fs_mux_put_tag(struct v9fs_mux_data *, u16);
119
120static DEFINE_MUTEX(v9fs_mux_task_lock);
121static struct workqueue_struct *v9fs_mux_wq;
122
123static int v9fs_mux_num;
124static int v9fs_mux_poll_task_num;
125static struct v9fs_mux_poll_task v9fs_mux_poll_tasks[100];
126
127int v9fs_mux_global_init(void)
128{
129 int i;
130
131 for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++)
132 v9fs_mux_poll_tasks[i].task = NULL;
133
134 v9fs_mux_wq = create_workqueue("v9fs");
135 if (!v9fs_mux_wq) {
136 printk(KERN_WARNING "v9fs: mux: creating workqueue failed\n");
137 return -ENOMEM;
138 }
139
140 return 0;
141}
142
143void v9fs_mux_global_exit(void)
144{
145 destroy_workqueue(v9fs_mux_wq);
146}
147
148/**
149 * v9fs_mux_calc_poll_procs - calculates the number of polling procs
150 * based on the number of mounted v9fs filesystems.
151 *
152 * The current implementation returns sqrt of the number of mounts.
153 */
154static int v9fs_mux_calc_poll_procs(int muxnum)
155{
156 int n;
157
158 if (v9fs_mux_poll_task_num)
159 n = muxnum / v9fs_mux_poll_task_num +
160 (muxnum % v9fs_mux_poll_task_num ? 1 : 0);
161 else
162 n = 1;
163
164 if (n > ARRAY_SIZE(v9fs_mux_poll_tasks))
165 n = ARRAY_SIZE(v9fs_mux_poll_tasks);
166
167 return n;
168}
169
170static int v9fs_mux_poll_start(struct v9fs_mux_data *m)
171{
172 int i, n;
173 struct v9fs_mux_poll_task *vpt, *vptlast;
174 struct task_struct *pproc;
175
176 dprintk(DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, v9fs_mux_num,
177 v9fs_mux_poll_task_num);
178 mutex_lock(&v9fs_mux_task_lock);
179
180 n = v9fs_mux_calc_poll_procs(v9fs_mux_num + 1);
181 if (n > v9fs_mux_poll_task_num) {
182 for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) {
183 if (v9fs_mux_poll_tasks[i].task == NULL) {
184 vpt = &v9fs_mux_poll_tasks[i];
185 dprintk(DEBUG_MUX, "create proc %p\n", vpt);
186 pproc = kthread_create(v9fs_poll_proc, vpt,
187 "v9fs-poll");
188
189 if (!IS_ERR(pproc)) {
190 vpt->task = pproc;
191 INIT_LIST_HEAD(&vpt->mux_list);
192 vpt->muxnum = 0;
193 v9fs_mux_poll_task_num++;
194 wake_up_process(vpt->task);
195 }
196 break;
197 }
198 }
199
200 if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks))
201 dprintk(DEBUG_ERROR, "warning: no free poll slots\n");
202 }
203
204 n = (v9fs_mux_num + 1) / v9fs_mux_poll_task_num +
205 ((v9fs_mux_num + 1) % v9fs_mux_poll_task_num ? 1 : 0);
206
207 vptlast = NULL;
208 for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) {
209 vpt = &v9fs_mux_poll_tasks[i];
210 if (vpt->task != NULL) {
211 vptlast = vpt;
212 if (vpt->muxnum < n) {
213 dprintk(DEBUG_MUX, "put in proc %d\n", i);
214 list_add(&m->mux_list, &vpt->mux_list);
215 vpt->muxnum++;
216 m->poll_task = vpt;
217 memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
218 init_poll_funcptr(&m->pt, v9fs_pollwait);
219 break;
220 }
221 }
222 }
223
224 if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) {
225 if (vptlast == NULL)
226 return -ENOMEM;
227
228 dprintk(DEBUG_MUX, "put in proc %d\n", i);
229 list_add(&m->mux_list, &vptlast->mux_list);
230 vptlast->muxnum++;
231 m->poll_task = vptlast;
232 memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
233 init_poll_funcptr(&m->pt, v9fs_pollwait);
234 }
235
236 v9fs_mux_num++;
237 mutex_unlock(&v9fs_mux_task_lock);
238
239 return 0;
240}
241
242static void v9fs_mux_poll_stop(struct v9fs_mux_data *m)
243{
244 int i;
245 struct v9fs_mux_poll_task *vpt;
246
247 mutex_lock(&v9fs_mux_task_lock);
248 vpt = m->poll_task;
249 list_del(&m->mux_list);
250 for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
251 if (m->poll_waddr[i] != NULL) {
252 remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]);
253 m->poll_waddr[i] = NULL;
254 }
255 }
256 vpt->muxnum--;
257 if (!vpt->muxnum) {
258 dprintk(DEBUG_MUX, "destroy proc %p\n", vpt);
259 kthread_stop(vpt->task);
260 vpt->task = NULL;
261 v9fs_mux_poll_task_num--;
262 }
263 v9fs_mux_num--;
264 mutex_unlock(&v9fs_mux_task_lock);
265}
266
267/**
268 * v9fs_mux_init - allocate and initialize the per-session mux data
269 * Creates the polling task if this is the first session.
270 *
271 * @trans - transport structure
272 * @msize - maximum message size
273 * @extended - pointer to the extended flag
274 */
275struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize,
276 unsigned char *extended)
277{
278 int i, n;
279 struct v9fs_mux_data *m, *mtmp;
280
281 dprintk(DEBUG_MUX, "transport %p msize %d\n", trans, msize);
282 m = kmalloc(sizeof(struct v9fs_mux_data), GFP_KERNEL);
283 if (!m)
284 return ERR_PTR(-ENOMEM);
285
286 spin_lock_init(&m->lock);
287 INIT_LIST_HEAD(&m->mux_list);
288 m->msize = msize;
289 m->extended = extended;
290 m->trans = trans;
291 idr_init(&m->tagpool.pool);
292 init_MUTEX(&m->tagpool.lock);
293 m->err = 0;
294 init_waitqueue_head(&m->equeue);
295 INIT_LIST_HEAD(&m->req_list);
296 INIT_LIST_HEAD(&m->unsent_req_list);
297 m->rcall = NULL;
298 m->rpos = 0;
299 m->rbuf = NULL;
300 m->wpos = m->wsize = 0;
301 m->wbuf = NULL;
302 INIT_WORK(&m->rq, v9fs_read_work);
303 INIT_WORK(&m->wq, v9fs_write_work);
304 m->wsched = 0;
305 memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
306 m->poll_task = NULL;
307 n = v9fs_mux_poll_start(m);
308 if (n)
309 return ERR_PTR(n);
310
311 n = trans->poll(trans, &m->pt);
312 if (n & POLLIN) {
313 dprintk(DEBUG_MUX, "mux %p can read\n", m);
314 set_bit(Rpending, &m->wsched);
315 }
316
317 if (n & POLLOUT) {
318 dprintk(DEBUG_MUX, "mux %p can write\n", m);
319 set_bit(Wpending, &m->wsched);
320 }
321
322 for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
323 if (IS_ERR(m->poll_waddr[i])) {
324 v9fs_mux_poll_stop(m);
325 mtmp = (void *)m->poll_waddr; /* the error code */
326 kfree(m);
327 m = mtmp;
328 break;
329 }
330 }
331
332 return m;
333}
334
335/**
336 * v9fs_mux_destroy - cancels all pending requests and frees mux resources
337 */
338void v9fs_mux_destroy(struct v9fs_mux_data *m)
339{
340 dprintk(DEBUG_MUX, "mux %p prev %p next %p\n", m,
341 m->mux_list.prev, m->mux_list.next);
342 v9fs_mux_cancel(m, -ECONNRESET);
343
344 if (!list_empty(&m->req_list)) {
345 /* wait until all processes waiting on this session exit */
346 dprintk(DEBUG_MUX, "mux %p waiting for empty request queue\n",
347 m);
348 wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000);
349 dprintk(DEBUG_MUX, "mux %p request queue empty: %d\n", m,
350 list_empty(&m->req_list));
351 }
352
353 v9fs_mux_poll_stop(m);
354 m->trans = NULL;
355
356 kfree(m);
357}
358
359/**
360 * v9fs_pollwait - called by files poll operation to add v9fs-poll task
361 * to files wait queue
362 */
363static void
364v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address,
365 poll_table * p)
366{
367 int i;
368 struct v9fs_mux_data *m;
369
370 m = container_of(p, struct v9fs_mux_data, pt);
371 for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++)
372 if (m->poll_waddr[i] == NULL)
373 break;
374
375 if (i >= ARRAY_SIZE(m->poll_waddr)) {
376 dprintk(DEBUG_ERROR, "not enough wait_address slots\n");
377 return;
378 }
379
380 m->poll_waddr[i] = wait_address;
381
382 if (!wait_address) {
383 dprintk(DEBUG_ERROR, "no wait_address\n");
384 m->poll_waddr[i] = ERR_PTR(-EIO);
385 return;
386 }
387
388 init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task);
389 add_wait_queue(wait_address, &m->poll_wait[i]);
390}
391
392/**
393 * v9fs_poll_mux - polls a mux and schedules read or write works if necessary
394 */
395static void v9fs_poll_mux(struct v9fs_mux_data *m)
396{
397 int n;
398
399 if (m->err < 0)
400 return;
401
402 n = m->trans->poll(m->trans, NULL);
403 if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) {
404 dprintk(DEBUG_MUX, "error mux %p err %d\n", m, n);
405 if (n >= 0)
406 n = -ECONNRESET;
407 v9fs_mux_cancel(m, n);
408 }
409
410 if (n & POLLIN) {
411 set_bit(Rpending, &m->wsched);
412 dprintk(DEBUG_MUX, "mux %p can read\n", m);
413 if (!test_and_set_bit(Rworksched, &m->wsched)) {
414 dprintk(DEBUG_MUX, "schedule read work mux %p\n", m);
415 queue_work(v9fs_mux_wq, &m->rq);
416 }
417 }
418
419 if (n & POLLOUT) {
420 set_bit(Wpending, &m->wsched);
421 dprintk(DEBUG_MUX, "mux %p can write\n", m);
422 if ((m->wsize || !list_empty(&m->unsent_req_list))
423 && !test_and_set_bit(Wworksched, &m->wsched)) {
424 dprintk(DEBUG_MUX, "schedule write work mux %p\n", m);
425 queue_work(v9fs_mux_wq, &m->wq);
426 }
427 }
428}
429
430/**
431 * v9fs_poll_proc - polls all v9fs transports for new events and queues
432 * the appropriate work to the work queue
433 */
434static int v9fs_poll_proc(void *a)
435{
436 struct v9fs_mux_data *m, *mtmp;
437 struct v9fs_mux_poll_task *vpt;
438
439 vpt = a;
440 dprintk(DEBUG_MUX, "start %p %p\n", current, vpt);
441 while (!kthread_should_stop()) {
442 set_current_state(TASK_INTERRUPTIBLE);
443
444 list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) {
445 v9fs_poll_mux(m);
446 }
447
448 dprintk(DEBUG_MUX, "sleeping...\n");
449 schedule_timeout(SCHED_TIMEOUT * HZ);
450 }
451
452 __set_current_state(TASK_RUNNING);
453 dprintk(DEBUG_MUX, "finish\n");
454 return 0;
455}
456
457/**
458 * v9fs_write_work - called when a transport can send some data
459 */
460static void v9fs_write_work(struct work_struct *work)
461{
462 int n, err;
463 struct v9fs_mux_data *m;
464 struct v9fs_req *req;
465
466 m = container_of(work, struct v9fs_mux_data, wq);
467
468 if (m->err < 0) {
469 clear_bit(Wworksched, &m->wsched);
470 return;
471 }
472
473 if (!m->wsize) {
474 if (list_empty(&m->unsent_req_list)) {
475 clear_bit(Wworksched, &m->wsched);
476 return;
477 }
478
479 spin_lock(&m->lock);
480again:
481 req = list_entry(m->unsent_req_list.next, struct v9fs_req,
482 req_list);
483 list_move_tail(&req->req_list, &m->req_list);
484 if (req->err == ERREQFLUSH)
485 goto again;
486
487 m->wbuf = req->tcall->sdata;
488 m->wsize = req->tcall->size;
489 m->wpos = 0;
490 dump_data(m->wbuf, m->wsize);
491 spin_unlock(&m->lock);
492 }
493
494 dprintk(DEBUG_MUX, "mux %p pos %d size %d\n", m, m->wpos, m->wsize);
495 clear_bit(Wpending, &m->wsched);
496 err = m->trans->write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos);
497 dprintk(DEBUG_MUX, "mux %p sent %d bytes\n", m, err);
498 if (err == -EAGAIN) {
499 clear_bit(Wworksched, &m->wsched);
500 return;
501 }
502
503 if (err <= 0)
504 goto error;
505
506 m->wpos += err;
507 if (m->wpos == m->wsize)
508 m->wpos = m->wsize = 0;
509
510 if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) {
511 if (test_and_clear_bit(Wpending, &m->wsched))
512 n = POLLOUT;
513 else
514 n = m->trans->poll(m->trans, NULL);
515
516 if (n & POLLOUT) {
517 dprintk(DEBUG_MUX, "schedule write work mux %p\n", m);
518 queue_work(v9fs_mux_wq, &m->wq);
519 } else
520 clear_bit(Wworksched, &m->wsched);
521 } else
522 clear_bit(Wworksched, &m->wsched);
523
524 return;
525
526 error:
527 v9fs_mux_cancel(m, err);
528 clear_bit(Wworksched, &m->wsched);
529}
530
531static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req)
532{
	/*
	 * Post-process a received reply: convert an Rerror response into a
	 * negative errno in req->err, and verify that the reply type matches
	 * the request (in 9P a reply id is always the request id + 1).
	 */
533	int ecode;
534	struct v9fs_str *ename;
535
536	if (!req->err && req->rcall->id == RERROR) {
537		ecode = req->rcall->params.rerror.errno;
538		ename = &req->rcall->params.rerror.error;
539
540		dprintk(DEBUG_MUX, "Rerror %.*s\n", ename->len, ename->str);
541
		/* 9P2000.u (extended) carries a numeric errno; use it directly */
542		if (*m->extended)
543			req->err = -ecode;
544
		/*
		 * Legacy 9P2000 (or ecode == 0): fall back to mapping the
		 * error string; if even that fails, report a server fault.
		 */
545		if (!req->err) {
546			req->err = v9fs_errstr2errno(ename->str, ename->len);
547
548			if (!req->err) {	/* string match failed */
549				PRINT_FCALL_ERROR("unknown error", req->rcall);
550			}
551
552			if (!req->err)
553				req->err = -ESERVERFAULT;
554		}
555	} else if (req->tcall && req->rcall->id != req->tcall->id + 1) {
		/* mismatched reply type for this tag: protocol error */
556		dprintk(DEBUG_ERROR, "fcall mismatch: expected %d, got %d\n",
557			req->tcall->id + 1, req->rcall->id);
558		if (!req->err)
559			req->err = -EIO;
560	}
561}
562
563/**
564 * v9fs_read_work - called when there is some data to be read from a transport
565 *
566 * Workqueue handler: reads whatever the transport has available into the
567 * per-mux receive buffer, deserializes every complete 9P message found,
568 * matches each reply to its pending request by tag and dispatches the
569 * request's callback. Reschedules itself while data remains pending.
570 */
566static void v9fs_read_work(struct work_struct *work)
567{
568	int n, err;
569	struct v9fs_mux_data *m;
570	struct v9fs_req *req, *rptr, *rreq;
571	struct v9fs_fcall *rcall;
572	char *rbuf;
573
574	m = container_of(work, struct v9fs_mux_data, rq);
575
576	if (m->err < 0)
577		return;
578
579	rcall = NULL;
580	dprintk(DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos);
581
	/* lazily allocate the receive fcall + data buffer in one chunk */
582	if (!m->rcall) {
583		m->rcall =
584		    kmalloc(sizeof(struct v9fs_fcall) + m->msize, GFP_KERNEL);
585		if (!m->rcall) {
586			err = -ENOMEM;
587			goto error;
588		}
589
590		m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall);
591		m->rpos = 0;
592	}
593
594	clear_bit(Rpending, &m->wsched);
595	err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos);
596	dprintk(DEBUG_MUX, "mux %p got %d bytes\n", m, err);
597	if (err == -EAGAIN) {
598		clear_bit(Rworksched, &m->wsched);
599		return;
600	}
601
602	if (err <= 0)
603		goto error;
604
	/* process every complete message in the buffer (size prefix is 4 bytes, LE) */
605	m->rpos += err;
606	while (m->rpos > 4) {
607		n = le32_to_cpu(*(__le32 *) m->rbuf);
608		if (n >= m->msize) {
609			dprintk(DEBUG_ERROR,
610				"requested packet size too big: %d\n", n);
611			err = -EIO;
612			goto error;
613		}
614
		/* message not fully received yet: wait for more data */
615		if (m->rpos < n)
616			break;
617
618		dump_data(m->rbuf, n);
619		err =
620		    v9fs_deserialize_fcall(m->rbuf, n, m->rcall, *m->extended);
621		if (err < 0) {
622			goto error;
623		}
624
625		if ((v9fs_debug_level&DEBUG_FCALL) == DEBUG_FCALL) {
626			char buf[150];
627
628			v9fs_printfcall(buf, sizeof(buf), m->rcall,
629					*m->extended);
630			printk(KERN_NOTICE ">>> %p %s\n", m, buf);
631		}
632
		/*
		 * Detach the just-parsed fcall; if extra bytes follow it,
		 * allocate a fresh buffer and slide the remainder down.
		 */
633		rcall = m->rcall;
634		rbuf = m->rbuf;
635		if (m->rpos > n) {
636			m->rcall = kmalloc(sizeof(struct v9fs_fcall) + m->msize,
637					   GFP_KERNEL);
638			if (!m->rcall) {
639				err = -ENOMEM;
640				goto error;
641			}
642
643			m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall);
644			memmove(m->rbuf, rbuf + n, m->rpos - n);
645			m->rpos -= n;
646		} else {
647			m->rcall = NULL;
648			m->rbuf = NULL;
649			m->rpos = 0;
650		}
651
652		dprintk(DEBUG_MUX, "mux %p fcall id %d tag %d\n", m, rcall->id,
653			rcall->tag);
654
		/* match reply tag to a pending request; keep it listed if a flush races */
655		req = NULL;
656		spin_lock(&m->lock);
657		list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
658			if (rreq->tag == rcall->tag) {
659				req = rreq;
660				if (req->flush != Flushing)
661					list_del(&req->req_list);
662				break;
663			}
664		}
665		spin_unlock(&m->lock);
666
667		if (req) {
668			req->rcall = rcall;
669			process_request(m, req);
670
			/* when Flushing, the flush callback owns completion */
671			if (req->flush != Flushing) {
672				if (req->cb)
673					(*req->cb) (req, req->cba);
674				else
675					kfree(req->rcall);
676
677				wake_up(&m->equeue);
678			}
679		} else {
			/* a late RFLUSH reply is expected; anything else is noise */
680			if (err >= 0 && rcall->id != RFLUSH)
681				dprintk(DEBUG_ERROR,
682					"unexpected response mux %p id %d tag %d\n",
683					m, rcall->id, rcall->tag);
684			kfree(rcall);
685		}
686	}
687
	/* requests still pending: poll and reschedule if more data is ready */
688	if (!list_empty(&m->req_list)) {
689		if (test_and_clear_bit(Rpending, &m->wsched))
690			n = POLLIN;
691		else
692			n = m->trans->poll(m->trans, NULL);
693
694		if (n & POLLIN) {
695			dprintk(DEBUG_MUX, "schedule read work mux %p\n", m);
696			queue_work(v9fs_mux_wq, &m->rq);
697		} else
698			clear_bit(Rworksched, &m->wsched);
699	} else
700		clear_bit(Rworksched, &m->wsched);
701
702	return;
703
704      error:
	/* fail every outstanding request on this mux with err */
705	v9fs_mux_cancel(m, err);
706	clear_bit(Rworksched, &m->wsched);
707}
708
709/**
710 * v9fs_send_request - send 9P request
711 * The function can sleep until the request is scheduled for sending.
712 * The function can be interrupted. Return from the function is not
713 * a guarantee that the request is sent successfully. Can return errors
714 * that can be retrieved by PTR_ERR macros.
715 *
716 * @m: mux data
717 * @tc: request to be sent
718 * @cb: callback function to call when response is received
719 * @cba: parameter to pass to the callback function
720 */
721static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m,
722 struct v9fs_fcall *tc,
723 v9fs_mux_req_callback cb, void *cba)
724{
725 int n;
726 struct v9fs_req *req;
727
728 dprintk(DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current,
729 tc, tc->id);
730 if (m->err < 0)
731 return ERR_PTR(m->err);
732
733 req = kmalloc(sizeof(struct v9fs_req), GFP_KERNEL);
734 if (!req)
735 return ERR_PTR(-ENOMEM);
736
737 if (tc->id == TVERSION)
738 n = V9FS_NOTAG;
739 else
740 n = v9fs_mux_get_tag(m);
741
742 if (n < 0)
743 return ERR_PTR(-ENOMEM);
744
745 v9fs_set_tag(tc, n);
746 if ((v9fs_debug_level&DEBUG_FCALL) == DEBUG_FCALL) {
747 char buf[150];
748
749 v9fs_printfcall(buf, sizeof(buf), tc, *m->extended);
750 printk(KERN_NOTICE "<<< %p %s\n", m, buf);
751 }
752
753 spin_lock_init(&req->lock);
754 req->tag = n;
755 req->tcall = tc;
756 req->rcall = NULL;
757 req->err = 0;
758 req->cb = cb;
759 req->cba = cba;
760 req->flush = None;
761
762 spin_lock(&m->lock);
763 list_add_tail(&req->req_list, &m->unsent_req_list);
764 spin_unlock(&m->lock);
765
766 if (test_and_clear_bit(Wpending, &m->wsched))
767 n = POLLOUT;
768 else
769 n = m->trans->poll(m->trans, NULL);
770
771 if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
772 queue_work(v9fs_mux_wq, &m->wq);
773
774 return req;
775}
776
777static void v9fs_mux_free_request(struct v9fs_mux_data *m, struct v9fs_req *req)
778{
779 v9fs_mux_put_tag(m, req->tag);
780 kfree(req);
781}
782
783static void v9fs_mux_flush_cb(struct v9fs_req *freq, void *a)
784{
	/*
	 * Completion callback for a Tflush request (freq). Looks up the
	 * original request by oldtag, marks it Flushed, runs its callback
	 * (or frees its reply), then releases the flush request itself.
	 * @a is the mux (passed as cba by v9fs_mux_flush_request).
	 */
785	v9fs_mux_req_callback cb;
786	int tag;
787	struct v9fs_mux_data *m;
788	struct v9fs_req *req, *rreq, *rptr;
789
790	m = a;
791	dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m,
792		freq->tcall, freq->rcall, freq->err,
793		freq->tcall->params.tflush.oldtag);
794
	/* find and unlink the flushed request under the mux lock */
795	spin_lock(&m->lock);
796	cb = NULL;	/* NOTE(review): cb is set but never used */
797	tag = freq->tcall->params.tflush.oldtag;
798	req = NULL;
799	list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
800		if (rreq->tag == tag) {
801			req = rreq;
802			list_del(&req->req_list);
803			break;
804		}
805	}
806	spin_unlock(&m->lock);
807
808	if (req) {
		/* per-request lock serializes flush-state vs. flush_request */
809		spin_lock(&req->lock);
810		req->flush = Flushed;
811		spin_unlock(&req->lock);
812
813		if (req->cb)
814			(*req->cb) (req, req->cba);
815		else
816			kfree(req->rcall);
817
818		wake_up(&m->equeue);
819	}
820
	/* the Tflush fcall pair and its request struct are ours to free */
821	kfree(freq->tcall);
822	kfree(freq->rcall);
823	v9fs_mux_free_request(m, freq);
824}
825
826static int
827v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req)
828{
	/*
	 * Attempt to cancel an in-flight request. Returns 0 if the request
	 * is already complete or was never sent (no Tflush needed), 1 if a
	 * Tflush was issued and the caller must wait for its completion.
	 */
829	struct v9fs_fcall *fc;
830	struct v9fs_req *rreq, *rptr;
831
832	dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag);
833
834	/* if a response was received for a request, do nothing */
835	spin_lock(&req->lock);
836	if (req->rcall || req->err) {
837		spin_unlock(&req->lock);
838		dprintk(DEBUG_MUX, "mux %p req %p response already received\n", m, req);
839		return 0;
840	}
841
	/* mark Flushing before touching the lists so read_work leaves it alone */
842	req->flush = Flushing;
843	spin_unlock(&req->lock);
844
845	spin_lock(&m->lock);
846	/* if the request is not sent yet, just remove it from the list */
847	list_for_each_entry_safe(rreq, rptr, &m->unsent_req_list, req_list) {
848		if (rreq->tag == req->tag) {
849			dprintk(DEBUG_MUX, "mux %p req %p request is not sent yet\n", m, req);
850			list_del(&rreq->req_list);
851			req->flush = Flushed;
852			spin_unlock(&m->lock);
853			if (req->cb)
854				(*req->cb) (req, req->cba);
855			return 0;
856		}
857	}
858	spin_unlock(&m->lock);
859
	/* request already on the wire: send Tflush; clear the pending signal
	 * so v9fs_send_request isn't interrupted before queueing it */
860	clear_thread_flag(TIF_SIGPENDING);
861	fc = v9fs_create_tflush(req->tag);
862	v9fs_send_request(m, fc, v9fs_mux_flush_cb, m);
863	return 1;
864}
865
866static void
867v9fs_mux_rpc_cb(struct v9fs_req *req, void *a)
868{
869 struct v9fs_mux_rpc *r;
870
871 dprintk(DEBUG_MUX, "req %p r %p\n", req, a);
872 r = a;
873 r->rcall = req->rcall;
874 r->err = req->err;
875
876 if (req->flush!=None && !req->err)
877 r->err = -ERESTARTSYS;
878
879 wake_up(&r->wqueue);
880}
881
882/**
883 * v9fs_mux_rpc - sends 9P request and waits until a response is available.
884 *	The function can be interrupted.
885 * @m: mux data
886 * @tc: request to be sent
887 * @rc: pointer where a pointer to the response is stored
888 *
889 * Returns 0 on success or a negative errno. On success and non-NULL @rc,
890 * *rc holds the reply fcall and the caller owns (must kfree) it.
891 */
889int
890v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc,
891	     struct v9fs_fcall **rc)
892{
893	int err, sigpending;
894	unsigned long flags;
895	struct v9fs_req *req;
896	struct v9fs_mux_rpc r;
897
898	r.err = 0;
899	r.tcall = tc;
900	r.rcall = NULL;
901	r.m = m;
902	init_waitqueue_head(&r.wqueue);
903
904	if (rc)
905		*rc = NULL;
906
	/* temporarily hide a pending signal so the send path isn't aborted;
	 * restored via recalc_sigpending() below */
907	sigpending = 0;
908	if (signal_pending(current)) {
909		sigpending = 1;
910		clear_thread_flag(TIF_SIGPENDING);
911	}
912
913	req = v9fs_send_request(m, tc, v9fs_mux_rpc_cb, &r);
914	if (IS_ERR(req)) {
915		err = PTR_ERR(req);
916		dprintk(DEBUG_MUX, "error %d\n", err);
917		return err;
918	}
919
920	err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0);
921	if (r.err < 0)
922		err = r.err;
923
	/* interrupted while the connection is still healthy: try to flush */
924	if (err == -ERESTARTSYS && m->trans->status == Connected && m->err == 0) {
925		if (v9fs_mux_flush_request(m, req)) {
926			/* wait until we get response of the flush message */
927			do {
928				clear_thread_flag(TIF_SIGPENDING);
929				err = wait_event_interruptible(r.wqueue,
930					r.rcall || r.err);
931			} while (!r.rcall && !r.err && err==-ERESTARTSYS &&
932				m->trans->status==Connected && !m->err);
933
934			err = -ERESTARTSYS;
935		}
936		sigpending = 1;
937	}
938
	/* re-arm the signal we suppressed earlier */
939	if (sigpending) {
940		spin_lock_irqsave(&current->sighand->siglock, flags);
941		recalc_sigpending();
942		spin_unlock_irqrestore(&current->sighand->siglock, flags);
943	}
944
945	if (rc)
946		*rc = r.rcall;
947	else
948		kfree(r.rcall);
949
950	v9fs_mux_free_request(m, req);
	/* normalize any stray positive value to an error */
951	if (err > 0)
952		err = -EIO;
953
954	return err;
955}
956
#if 0
/**
 * v9fs_mux_rpcnb - sends 9P request without waiting for response.
 * @m: mux data
 * @tc: request to be sent
 * @cb: callback function to be called when response arrives
 * @a: value to pass to the callback function
 *
 * Currently compiled out (#if 0). Returns 0 on success or a negative
 * errno if the request could not be queued.
 */
int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc,
		   v9fs_mux_req_callback cb, void *a)
{
	int err;
	struct v9fs_req *req;

	req = v9fs_send_request(m, tc, cb, a);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		dprintk(DEBUG_MUX, "error %d\n", err);
		/* fix: return the already-extracted err rather than
		 * redundantly re-calling PTR_ERR(req) */
		return err;
	}

	dprintk(DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag);
	return 0;
}
#endif				/* 0 */
982
983/**
984 * v9fs_mux_cancel - cancel all pending requests with error
985 * @m: mux data
986 * @err: error code
987 *
988 * Fails every sent and unsent request on the mux with @err (unless the
989 * request already carries an error) and wakes any waiters.
990 */
988void v9fs_mux_cancel(struct v9fs_mux_data *m, int err)
989{
990	struct v9fs_req *req, *rtmp;
991	LIST_HEAD(cancel_list);
992
993	dprintk(DEBUG_ERROR, "mux %p err %d\n", m, err);
	/* publish the mux-wide error before draining so new senders bail out */
994	m->err = err;
995	spin_lock(&m->lock);
	/* move everything onto a private list so callbacks run unlocked */
996	list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
997		list_move(&req->req_list, &cancel_list);
998	}
999	list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
1000		list_move(&req->req_list, &cancel_list);
1001	}
1002	spin_unlock(&m->lock);
1003
1004	list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
1005		list_del(&req->req_list);
1006		if (!req->err)
1007			req->err = err;
1008
1009		if (req->cb)
1010			(*req->cb) (req, req->cba);
1011		else
1012			kfree(req->rcall);
1013	}
1014
1015	wake_up(&m->equeue);
1016}
1017
1018static u16 v9fs_mux_get_tag(struct v9fs_mux_data *m)
1019{
1020 int tag;
1021
1022 tag = v9fs_get_idpool(&m->tagpool);
1023 if (tag < 0)
1024 return V9FS_NOTAG;
1025 else
1026 return (u16) tag;
1027}
1028
1029static void v9fs_mux_put_tag(struct v9fs_mux_data *m, u16 tag)
1030{
1031 if (tag != V9FS_NOTAG && v9fs_check_idpool(tag, &m->tagpool))
1032 v9fs_put_idpool(tag, &m->tagpool);
1033}
diff --git a/fs/9p/mux.h b/fs/9p/mux.h
deleted file mode 100644
index fb10c50186..0000000000
--- a/fs/9p/mux.h
+++ /dev/null
@@ -1,55 +0,0 @@
1/*
2 * linux/fs/9p/mux.h
3 *
4 * Multiplexer Definitions
5 *
6 * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to:
20 * Free Software Foundation
21 * 51 Franklin Street, Fifth Floor
22 * Boston, MA 02111-1301 USA
23 *
24 */
25
26struct v9fs_mux_data;
27struct v9fs_req;
28
29/**
30 * v9fs_mux_req_callback - callback function that is called when the
31 * response of a request is received. The callback is called from
32 * a workqueue and shouldn't block.
33 *
34 * @a - the pointer that was specified when the request was sent, to be
35 * passed to the callback
36 * @tc - request call
37 * @rc - response call
38 * @err - error code (non-zero if an error occurred)
39 */
40typedef void (*v9fs_mux_req_callback)(struct v9fs_req *req, void *a);
41
42int v9fs_mux_global_init(void);
43void v9fs_mux_global_exit(void);
44
45struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize,
46 unsigned char *extended);
47void v9fs_mux_destroy(struct v9fs_mux_data *);
48
49int v9fs_mux_send(struct v9fs_mux_data *m, struct v9fs_fcall *tc);
50struct v9fs_fcall *v9fs_mux_recv(struct v9fs_mux_data *m);
51int v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, struct v9fs_fcall **rc);
52
53void v9fs_mux_flush(struct v9fs_mux_data *m, int sendflush);
54void v9fs_mux_cancel(struct v9fs_mux_data *m, int err);
55int v9fs_errstr2errno(char *errstr, int len);
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
deleted file mode 100644
index 34d43355be..0000000000
--- a/fs/9p/trans_fd.c
+++ /dev/null
@@ -1,308 +0,0 @@
1/*
2 * linux/fs/9p/trans_fd.c
3 *
4 * Fd transport layer. Includes deprecated socket layer.
5 *
6 * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
7 * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
8 * Copyright (C) 2004-2005 by Eric Van Hensbergen <ericvh@gmail.com>
9 * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2
13 * as published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to:
22 * Free Software Foundation
23 * 51 Franklin Street, Fifth Floor
24 * Boston, MA 02111-1301 USA
25 *
26 */
27
28#include <linux/in.h>
29#include <linux/module.h>
30#include <linux/net.h>
31#include <linux/ipv6.h>
32#include <linux/errno.h>
33#include <linux/kernel.h>
34#include <linux/un.h>
35#include <asm/uaccess.h>
36#include <linux/inet.h>
37#include <linux/idr.h>
38#include <linux/file.h>
39
40#include "debug.h"
41#include "v9fs.h"
42#include "transport.h"
43
44#define V9FS_PORT 564
45
46struct v9fs_trans_fd {
47 struct file *rd;
48 struct file *wr;
49};
50
51/**
52 * v9fs_fd_read- read from a fd
53 * @v9ses: session information
54 * @v: buffer to receive data into
55 * @len: size of receive buffer
56 *
57 */
58static int v9fs_fd_read(struct v9fs_transport *trans, void *v, int len)
59{
60 int ret;
61 struct v9fs_trans_fd *ts;
62
63 if (!trans || trans->status == Disconnected || !(ts = trans->priv))
64 return -EREMOTEIO;
65
66 if (!(ts->rd->f_flags & O_NONBLOCK))
67 dprintk(DEBUG_ERROR, "blocking read ...\n");
68
69 ret = kernel_read(ts->rd, ts->rd->f_pos, v, len);
70 if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
71 trans->status = Disconnected;
72 return ret;
73}
74
75/**
76 * v9fs_fd_write - write to a socket
77 * @v9ses: session information
78 * @v: buffer to send data from
79 * @len: size of send buffer
80 *
81 */
82static int v9fs_fd_write(struct v9fs_transport *trans, void *v, int len)
83{
84 int ret;
85 mm_segment_t oldfs;
86 struct v9fs_trans_fd *ts;
87
88 if (!trans || trans->status == Disconnected || !(ts = trans->priv))
89 return -EREMOTEIO;
90
91 if (!(ts->wr->f_flags & O_NONBLOCK))
92 dprintk(DEBUG_ERROR, "blocking write ...\n");
93
94 oldfs = get_fs();
95 set_fs(get_ds());
96 /* The cast to a user pointer is valid due to the set_fs() */
97 ret = vfs_write(ts->wr, (void __user *)v, len, &ts->wr->f_pos);
98 set_fs(oldfs);
99
100 if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
101 trans->status = Disconnected;
102 return ret;
103}
104
105static unsigned int
106v9fs_fd_poll(struct v9fs_transport *trans, struct poll_table_struct *pt)
107{
108 int ret, n;
109 struct v9fs_trans_fd *ts;
110 mm_segment_t oldfs;
111
112 if (!trans || trans->status != Connected || !(ts = trans->priv))
113 return -EREMOTEIO;
114
115 if (!ts->rd->f_op || !ts->rd->f_op->poll)
116 return -EIO;
117
118 if (!ts->wr->f_op || !ts->wr->f_op->poll)
119 return -EIO;
120
121 oldfs = get_fs();
122 set_fs(get_ds());
123
124 ret = ts->rd->f_op->poll(ts->rd, pt);
125 if (ret < 0)
126 goto end;
127
128 if (ts->rd != ts->wr) {
129 n = ts->wr->f_op->poll(ts->wr, pt);
130 if (n < 0) {
131 ret = n;
132 goto end;
133 }
134 ret = (ret & ~POLLOUT) | (n & ~POLLIN);
135 }
136
137 end:
138 set_fs(oldfs);
139 return ret;
140}
141
142static int v9fs_fd_open(struct v9fs_session_info *v9ses, int rfd, int wfd)
143{
144 struct v9fs_transport *trans = v9ses->transport;
145 struct v9fs_trans_fd *ts = kmalloc(sizeof(struct v9fs_trans_fd),
146 GFP_KERNEL);
147 if (!ts)
148 return -ENOMEM;
149
150 ts->rd = fget(rfd);
151 ts->wr = fget(wfd);
152 if (!ts->rd || !ts->wr) {
153 if (ts->rd)
154 fput(ts->rd);
155 if (ts->wr)
156 fput(ts->wr);
157 kfree(ts);
158 return -EIO;
159 }
160
161 trans->priv = ts;
162 trans->status = Connected;
163
164 return 0;
165}
166
167static int v9fs_fd_init(struct v9fs_session_info *v9ses, const char *addr,
168 char *data)
169{
170 if (v9ses->rfdno == ~0 || v9ses->wfdno == ~0) {
171 printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n");
172 return -ENOPROTOOPT;
173 }
174
175 return v9fs_fd_open(v9ses, v9ses->rfdno, v9ses->wfdno);
176}
177
178static int v9fs_socket_open(struct v9fs_session_info *v9ses,
179 struct socket *csocket)
180{
181 int fd, ret;
182
183 csocket->sk->sk_allocation = GFP_NOIO;
184 if ((fd = sock_map_fd(csocket)) < 0) {
185 eprintk(KERN_ERR, "v9fs_socket_open: failed to map fd\n");
186 ret = fd;
187 release_csocket:
188 sock_release(csocket);
189 return ret;
190 }
191
192 if ((ret = v9fs_fd_open(v9ses, fd, fd)) < 0) {
193 sockfd_put(csocket);
194 eprintk(KERN_ERR, "v9fs_socket_open: failed to open fd\n");
195 goto release_csocket;
196 }
197
198 ((struct v9fs_trans_fd *)v9ses->transport->priv)->rd->f_flags |=
199 O_NONBLOCK;
200 return 0;
201}
202
203static int v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr,
204 char *data)
205{
206 int ret;
207 struct socket *csocket = NULL;
208 struct sockaddr_in sin_server;
209
210 sin_server.sin_family = AF_INET;
211 sin_server.sin_addr.s_addr = in_aton(addr);
212 sin_server.sin_port = htons(v9ses->port);
213 sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &csocket);
214
215 if (!csocket) {
216 eprintk(KERN_ERR, "v9fs_trans_tcp: problem creating socket\n");
217 return -1;
218 }
219
220 ret = csocket->ops->connect(csocket,
221 (struct sockaddr *)&sin_server,
222 sizeof(struct sockaddr_in), 0);
223 if (ret < 0) {
224 eprintk(KERN_ERR,
225 "v9fs_trans_tcp: problem connecting socket to %s\n",
226 addr);
227 return ret;
228 }
229
230 return v9fs_socket_open(v9ses, csocket);
231}
232
233static int
234v9fs_unix_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
235{
236 int ret;
237 struct socket *csocket;
238 struct sockaddr_un sun_server;
239
240 if (strlen(addr) > UNIX_PATH_MAX) {
241 eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n",
242 addr);
243 return -ENAMETOOLONG;
244 }
245
246 sun_server.sun_family = PF_UNIX;
247 strcpy(sun_server.sun_path, addr);
248 sock_create_kern(PF_UNIX, SOCK_STREAM, 0, &csocket);
249 ret = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server,
250 sizeof(struct sockaddr_un) - 1, 0);
251 if (ret < 0) {
252 eprintk(KERN_ERR,
253 "v9fs_trans_unix: problem connecting socket: %s: %d\n",
254 addr, ret);
255 return ret;
256 }
257
258 return v9fs_socket_open(v9ses, csocket);
259}
260
261/**
262 * v9fs_sock_close - shutdown socket
263 * @trans: private socket structure
264 *
265 */
266static void v9fs_fd_close(struct v9fs_transport *trans)
267{
268 struct v9fs_trans_fd *ts;
269
270 if (!trans)
271 return;
272
273 ts = xchg(&trans->priv, NULL);
274
275 if (!ts)
276 return;
277
278 trans->status = Disconnected;
279 if (ts->rd)
280 fput(ts->rd);
281 if (ts->wr)
282 fput(ts->wr);
283 kfree(ts);
284}
285
286struct v9fs_transport v9fs_trans_fd = {
287 .init = v9fs_fd_init,
288 .write = v9fs_fd_write,
289 .read = v9fs_fd_read,
290 .close = v9fs_fd_close,
291 .poll = v9fs_fd_poll,
292};
293
294struct v9fs_transport v9fs_trans_tcp = {
295 .init = v9fs_tcp_init,
296 .write = v9fs_fd_write,
297 .read = v9fs_fd_read,
298 .close = v9fs_fd_close,
299 .poll = v9fs_fd_poll,
300};
301
302struct v9fs_transport v9fs_trans_unix = {
303 .init = v9fs_unix_init,
304 .write = v9fs_fd_write,
305 .read = v9fs_fd_read,
306 .close = v9fs_fd_close,
307 .poll = v9fs_fd_poll,
308};
diff --git a/fs/9p/transport.h b/fs/9p/transport.h
deleted file mode 100644
index b38a4b8a41..0000000000
--- a/fs/9p/transport.h
+++ /dev/null
@@ -1,45 +0,0 @@
1/*
2 * linux/fs/9p/transport.h
3 *
4 * Transport Definition
5 *
6 * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to:
20 * Free Software Foundation
21 * 51 Franklin Street, Fifth Floor
22 * Boston, MA 02111-1301 USA
23 *
24 */
25
26enum v9fs_transport_status {
27 Connected,
28 Disconnected,
29 Hung,
30};
31
32struct v9fs_transport {
33 enum v9fs_transport_status status;
34 void *priv;
35
36 int (*init) (struct v9fs_session_info *, const char *, char *);
37 int (*write) (struct v9fs_transport *, void *, int);
38 int (*read) (struct v9fs_transport *, void *, int);
39 void (*close) (struct v9fs_transport *);
40 unsigned int (*poll)(struct v9fs_transport *, struct poll_table_struct *);
41};
42
43extern struct v9fs_transport v9fs_trans_tcp;
44extern struct v9fs_transport v9fs_trans_unix;
45extern struct v9fs_transport v9fs_trans_fd;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6ad6f192b6..0a7068e30e 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,16 +29,12 @@
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/parser.h> 30#include <linux/parser.h>
31#include <linux/idr.h> 31#include <linux/idr.h>
32 32#include <net/9p/9p.h>
33#include "debug.h" 33#include <net/9p/transport.h>
34#include <net/9p/conn.h>
35#include <net/9p/client.h>
34#include "v9fs.h" 36#include "v9fs.h"
35#include "9p.h"
36#include "v9fs_vfs.h" 37#include "v9fs_vfs.h"
37#include "transport.h"
38#include "mux.h"
39
40/* TODO: sysfs or debugfs interface */
41int v9fs_debug_level = 0; /* feature-rific global debug level */
42 38
43/* 39/*
44 * Option Parsing (code inspired by NFS code) 40 * Option Parsing (code inspired by NFS code)
@@ -47,12 +43,12 @@ int v9fs_debug_level = 0; /* feature-rific global debug level */
47 43
48enum { 44enum {
49 /* Options that take integer arguments */ 45 /* Options that take integer arguments */
50 Opt_port, Opt_msize, Opt_uid, Opt_gid, Opt_afid, Opt_debug, 46 Opt_debug, Opt_port, Opt_msize, Opt_uid, Opt_gid, Opt_afid,
51 Opt_rfdno, Opt_wfdno, 47 Opt_rfdno, Opt_wfdno,
52 /* String options */ 48 /* String options */
53 Opt_uname, Opt_remotename, 49 Opt_uname, Opt_remotename,
54 /* Options that take no arguments */ 50 /* Options that take no arguments */
55 Opt_legacy, Opt_nodevmap, Opt_unix, Opt_tcp, Opt_fd, 51 Opt_legacy, Opt_nodevmap, Opt_unix, Opt_tcp, Opt_fd, Opt_pci,
56 /* Cache options */ 52 /* Cache options */
57 Opt_cache_loose, 53 Opt_cache_loose,
58 /* Error token */ 54 /* Error token */
@@ -60,6 +56,7 @@ enum {
60}; 56};
61 57
62static match_table_t tokens = { 58static match_table_t tokens = {
59 {Opt_debug, "debug=%x"},
63 {Opt_port, "port=%u"}, 60 {Opt_port, "port=%u"},
64 {Opt_msize, "msize=%u"}, 61 {Opt_msize, "msize=%u"},
65 {Opt_uid, "uid=%u"}, 62 {Opt_uid, "uid=%u"},
@@ -67,12 +64,14 @@ static match_table_t tokens = {
67 {Opt_afid, "afid=%u"}, 64 {Opt_afid, "afid=%u"},
68 {Opt_rfdno, "rfdno=%u"}, 65 {Opt_rfdno, "rfdno=%u"},
69 {Opt_wfdno, "wfdno=%u"}, 66 {Opt_wfdno, "wfdno=%u"},
70 {Opt_debug, "debug=%x"},
71 {Opt_uname, "uname=%s"}, 67 {Opt_uname, "uname=%s"},
72 {Opt_remotename, "aname=%s"}, 68 {Opt_remotename, "aname=%s"},
73 {Opt_unix, "proto=unix"}, 69 {Opt_unix, "proto=unix"},
74 {Opt_tcp, "proto=tcp"}, 70 {Opt_tcp, "proto=tcp"},
75 {Opt_fd, "proto=fd"}, 71 {Opt_fd, "proto=fd"},
72#ifdef CONFIG_PCI_9P
73 {Opt_pci, "proto=pci"},
74#endif
76 {Opt_tcp, "tcp"}, 75 {Opt_tcp, "tcp"},
77 {Opt_unix, "unix"}, 76 {Opt_unix, "unix"},
78 {Opt_fd, "fd"}, 77 {Opt_fd, "fd"},
@@ -83,6 +82,8 @@ static match_table_t tokens = {
83 {Opt_err, NULL} 82 {Opt_err, NULL}
84}; 83};
85 84
85extern struct p9_transport *p9pci_trans_create(void);
86
86/* 87/*
87 * Parse option string. 88 * Parse option string.
88 */ 89 */
@@ -122,12 +123,18 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses)
122 token = match_token(p, tokens, args); 123 token = match_token(p, tokens, args);
123 if (token < Opt_uname) { 124 if (token < Opt_uname) {
124 if ((ret = match_int(&args[0], &option)) < 0) { 125 if ((ret = match_int(&args[0], &option)) < 0) {
125 dprintk(DEBUG_ERROR, 126 P9_DPRINTK(P9_DEBUG_ERROR,
126 "integer field, but no integer?\n"); 127 "integer field, but no integer?\n");
127 continue; 128 continue;
128 } 129 }
129 } 130 }
130 switch (token) { 131 switch (token) {
132 case Opt_debug:
133 v9ses->debug = option;
134#ifdef CONFIG_NET_9P_DEBUG
135 p9_debug_level = option;
136#endif
137 break;
131 case Opt_port: 138 case Opt_port:
132 v9ses->port = option; 139 v9ses->port = option;
133 break; 140 break;
@@ -149,15 +156,15 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses)
149 case Opt_wfdno: 156 case Opt_wfdno:
150 v9ses->wfdno = option; 157 v9ses->wfdno = option;
151 break; 158 break;
152 case Opt_debug:
153 v9ses->debug = option;
154 break;
155 case Opt_tcp: 159 case Opt_tcp:
156 v9ses->proto = PROTO_TCP; 160 v9ses->proto = PROTO_TCP;
157 break; 161 break;
158 case Opt_unix: 162 case Opt_unix:
159 v9ses->proto = PROTO_UNIX; 163 v9ses->proto = PROTO_UNIX;
160 break; 164 break;
165 case Opt_pci:
166 v9ses->proto = PROTO_PCI;
167 break;
161 case Opt_fd: 168 case Opt_fd:
162 v9ses->proto = PROTO_FD; 169 v9ses->proto = PROTO_FD;
163 break; 170 break;
@@ -183,82 +190,6 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses)
183} 190}
184 191
185/** 192/**
186 * v9fs_inode2v9ses - safely extract v9fs session info from super block
187 * @inode: inode to extract information from
188 *
189 * Paranoid function to extract v9ses information from superblock,
190 * if anything is missing it will report an error.
191 *
192 */
193
194struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
195{
196 return (inode->i_sb->s_fs_info);
197}
198
199/**
200 * v9fs_get_idpool - allocate numeric id from pool
201 * @p - pool to allocate from
202 *
203 * XXX - This seems to be an awful generic function, should it be in idr.c with
204 * the lock included in struct idr?
205 */
206
207int v9fs_get_idpool(struct v9fs_idpool *p)
208{
209 int i = 0;
210 int error;
211
212retry:
213 if (idr_pre_get(&p->pool, GFP_KERNEL) == 0)
214 return 0;
215
216 if (down_interruptible(&p->lock) == -EINTR) {
217 eprintk(KERN_WARNING, "Interrupted while locking\n");
218 return -1;
219 }
220
221 /* no need to store exactly p, we just need something non-null */
222 error = idr_get_new(&p->pool, p, &i);
223 up(&p->lock);
224
225 if (error == -EAGAIN)
226 goto retry;
227 else if (error)
228 return -1;
229
230 return i;
231}
232
233/**
234 * v9fs_put_idpool - release numeric id from pool
235 * @p - pool to allocate from
236 *
237 * XXX - This seems to be an awful generic function, should it be in idr.c with
238 * the lock included in struct idr?
239 */
240
241void v9fs_put_idpool(int id, struct v9fs_idpool *p)
242{
243 if (down_interruptible(&p->lock) == -EINTR) {
244 eprintk(KERN_WARNING, "Interrupted while locking\n");
245 return;
246 }
247 idr_remove(&p->pool, id);
248 up(&p->lock);
249}
250
251/**
252 * v9fs_check_idpool - check if the specified id is available
253 * @id - id to check
254 * @p - pool
255 */
256int v9fs_check_idpool(int id, struct v9fs_idpool *p)
257{
258 return idr_find(&p->pool, id) != NULL;
259}
260
261/**
262 * v9fs_session_init - initialize session 193 * v9fs_session_init - initialize session
263 * @v9ses: session information structure 194 * @v9ses: session information structure
264 * @dev_name: device being mounted 195 * @dev_name: device being mounted
@@ -266,25 +197,21 @@ int v9fs_check_idpool(int id, struct v9fs_idpool *p)
266 * 197 *
267 */ 198 */
268 199
269int 200struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
270v9fs_session_init(struct v9fs_session_info *v9ses,
271 const char *dev_name, char *data) 201 const char *dev_name, char *data)
272{ 202{
273 struct v9fs_fcall *fcall = NULL;
274 struct v9fs_transport *trans_proto;
275 int n = 0;
276 int newfid = -1;
277 int retval = -EINVAL; 203 int retval = -EINVAL;
278 struct v9fs_str *version; 204 struct p9_transport *trans;
205 struct p9_fid *fid;
279 206
280 v9ses->name = __getname(); 207 v9ses->name = __getname();
281 if (!v9ses->name) 208 if (!v9ses->name)
282 return -ENOMEM; 209 return ERR_PTR(-ENOMEM);
283 210
284 v9ses->remotename = __getname(); 211 v9ses->remotename = __getname();
285 if (!v9ses->remotename) { 212 if (!v9ses->remotename) {
286 __putname(v9ses->name); 213 __putname(v9ses->name);
287 return -ENOMEM; 214 return ERR_PTR(-ENOMEM);
288 } 215 }
289 216
290 strcpy(v9ses->name, V9FS_DEFUSER); 217 strcpy(v9ses->name, V9FS_DEFUSER);
@@ -292,130 +219,60 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
292 219
293 v9fs_parse_options(data, v9ses); 220 v9fs_parse_options(data, v9ses);
294 221
295 /* set global debug level */
296 v9fs_debug_level = v9ses->debug;
297
298 /* id pools that are session-dependent: fids and tags */
299 idr_init(&v9ses->fidpool.pool);
300 init_MUTEX(&v9ses->fidpool.lock);
301
302 switch (v9ses->proto) { 222 switch (v9ses->proto) {
303 case PROTO_TCP: 223 case PROTO_TCP:
304 trans_proto = &v9fs_trans_tcp; 224 trans = p9_trans_create_tcp(dev_name, v9ses->port);
305 break; 225 break;
306 case PROTO_UNIX: 226 case PROTO_UNIX:
307 trans_proto = &v9fs_trans_unix; 227 trans = p9_trans_create_unix(dev_name);
308 *v9ses->remotename = 0; 228 *v9ses->remotename = 0;
309 break; 229 break;
310 case PROTO_FD: 230 case PROTO_FD:
311 trans_proto = &v9fs_trans_fd; 231 trans = p9_trans_create_fd(v9ses->rfdno, v9ses->wfdno);
312 *v9ses->remotename = 0; 232 *v9ses->remotename = 0;
313 break; 233 break;
234#ifdef CONFIG_PCI_9P
235 case PROTO_PCI:
236 trans = p9pci_trans_create();
237 *v9ses->remotename = 0;
238 break;
239#endif
314 default: 240 default:
315 printk(KERN_ERR "v9fs: Bad mount protocol %d\n", v9ses->proto); 241 printk(KERN_ERR "v9fs: Bad mount protocol %d\n", v9ses->proto);
316 retval = -ENOPROTOOPT; 242 retval = -ENOPROTOOPT;
317 goto SessCleanUp; 243 goto error;
318 }; 244 };
319 245
320 v9ses->transport = kmalloc(sizeof(*v9ses->transport), GFP_KERNEL); 246 if (IS_ERR(trans)) {
321 if (!v9ses->transport) { 247 retval = PTR_ERR(trans);
322 retval = -ENOMEM; 248 trans = NULL;
323 goto SessCleanUp; 249 goto error;
324 } 250 }
325 251
326 memmove(v9ses->transport, trans_proto, sizeof(*v9ses->transport)); 252 v9ses->clnt = p9_client_create(trans, v9ses->maxdata + P9_IOHDRSZ,
253 v9ses->extended);
327 254
328 if ((retval = v9ses->transport->init(v9ses, dev_name, data)) < 0) { 255 if (IS_ERR(v9ses->clnt)) {
329 eprintk(KERN_ERR, "problem initializing transport\n"); 256 retval = PTR_ERR(v9ses->clnt);
330 goto SessCleanUp; 257 v9ses->clnt = NULL;
258 P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n");
259 goto error;
331 } 260 }
332 261
333 v9ses->inprogress = 0; 262 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->name,
334 v9ses->shutdown = 0; 263 v9ses->remotename);
335 v9ses->session_hung = 0; 264 if (IS_ERR(fid)) {
336 265 retval = PTR_ERR(fid);
337 v9ses->mux = v9fs_mux_init(v9ses->transport, v9ses->maxdata + V9FS_IOHDRSZ, 266 fid = NULL;
338 &v9ses->extended); 267 P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n");
339 268 goto error;
340 if (IS_ERR(v9ses->mux)) {
341 retval = PTR_ERR(v9ses->mux);
342 v9ses->mux = NULL;
343 dprintk(DEBUG_ERROR, "problem initializing mux\n");
344 goto SessCleanUp;
345 } 269 }
346 270
347 if (v9ses->afid == ~0) { 271 return fid;
348 if (v9ses->extended)
349 retval =
350 v9fs_t_version(v9ses, v9ses->maxdata, "9P2000.u",
351 &fcall);
352 else
353 retval = v9fs_t_version(v9ses, v9ses->maxdata, "9P2000",
354 &fcall);
355
356 if (retval < 0) {
357 dprintk(DEBUG_ERROR, "v9fs_t_version failed\n");
358 goto FreeFcall;
359 }
360
361 version = &fcall->params.rversion.version;
362 if (version->len==8 && !memcmp(version->str, "9P2000.u", 8)) {
363 dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n");
364 v9ses->extended = 1;
365 } else if (version->len==6 && !memcmp(version->str, "9P2000", 6)) {
366 dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n");
367 v9ses->extended = 0;
368 } else {
369 retval = -EREMOTEIO;
370 goto FreeFcall;
371 }
372 272
373 n = fcall->params.rversion.msize; 273error:
374 kfree(fcall);
375
376 if (n < v9ses->maxdata)
377 v9ses->maxdata = n;
378 }
379
380 newfid = v9fs_get_idpool(&v9ses->fidpool);
381 if (newfid < 0) {
382 eprintk(KERN_WARNING, "couldn't allocate FID\n");
383 retval = -ENOMEM;
384 goto SessCleanUp;
385 }
386 /* it is a little bit ugly, but we have to prevent newfid */
387 /* being the same as afid, so if it is, get a new fid */
388 if (v9ses->afid != ~0 && newfid == v9ses->afid) {
389 newfid = v9fs_get_idpool(&v9ses->fidpool);
390 if (newfid < 0) {
391 eprintk(KERN_WARNING, "couldn't allocate FID\n");
392 retval = -ENOMEM;
393 goto SessCleanUp;
394 }
395 }
396
397 if ((retval =
398 v9fs_t_attach(v9ses, v9ses->name, v9ses->remotename, newfid,
399 v9ses->afid, NULL))
400 < 0) {
401 dprintk(DEBUG_ERROR, "cannot attach\n");
402 goto SessCleanUp;
403 }
404
405 if (v9ses->afid != ~0) {
406 dprintk(DEBUG_ERROR, "afid not equal to ~0\n");
407 if (v9fs_t_clunk(v9ses, v9ses->afid))
408 dprintk(DEBUG_ERROR, "clunk failed\n");
409 }
410
411 return newfid;
412
413 FreeFcall:
414 kfree(fcall);
415
416 SessCleanUp:
417 v9fs_session_close(v9ses); 274 v9fs_session_close(v9ses);
418 return retval; 275 return ERR_PTR(retval);
419} 276}
420 277
421/** 278/**
@@ -426,15 +283,9 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
426 283
427void v9fs_session_close(struct v9fs_session_info *v9ses) 284void v9fs_session_close(struct v9fs_session_info *v9ses)
428{ 285{
429 if (v9ses->mux) { 286 if (v9ses->clnt) {
430 v9fs_mux_destroy(v9ses->mux); 287 p9_client_destroy(v9ses->clnt);
431 v9ses->mux = NULL; 288 v9ses->clnt = NULL;
432 }
433
434 if (v9ses->transport) {
435 v9ses->transport->close(v9ses->transport);
436 kfree(v9ses->transport);
437 v9ses->transport = NULL;
438 } 289 }
439 290
440 __putname(v9ses->name); 291 __putname(v9ses->name);
@@ -446,9 +297,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
446 * and cancel all pending requests. 297 * and cancel all pending requests.
447 */ 298 */
448void v9fs_session_cancel(struct v9fs_session_info *v9ses) { 299void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
449 dprintk(DEBUG_ERROR, "cancel session %p\n", v9ses); 300 P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
450 v9ses->transport->status = Disconnected; 301 p9_client_disconnect(v9ses->clnt);
451 v9fs_mux_cancel(v9ses->mux, -EIO);
452} 302}
453 303
454extern int v9fs_error_init(void); 304extern int v9fs_error_init(void);
@@ -460,24 +310,9 @@ extern int v9fs_error_init(void);
460 310
461static int __init init_v9fs(void) 311static int __init init_v9fs(void)
462{ 312{
463 int ret;
464
465 v9fs_error_init();
466
467 printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); 313 printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
468 314
469 ret = v9fs_mux_global_init(); 315 return register_filesystem(&v9fs_fs_type);
470 if (ret) {
471 printk(KERN_WARNING "v9fs: starting mux failed\n");
472 return ret;
473 }
474 ret = register_filesystem(&v9fs_fs_type);
475 if (ret) {
476 printk(KERN_WARNING "v9fs: registering file system failed\n");
477 v9fs_mux_global_exit();
478 }
479
480 return ret;
481} 316}
482 317
483/** 318/**
@@ -487,13 +322,13 @@ static int __init init_v9fs(void)
487 322
488static void __exit exit_v9fs(void) 323static void __exit exit_v9fs(void)
489{ 324{
490 v9fs_mux_global_exit();
491 unregister_filesystem(&v9fs_fs_type); 325 unregister_filesystem(&v9fs_fs_type);
492} 326}
493 327
494module_init(init_v9fs) 328module_init(init_v9fs)
495module_exit(exit_v9fs) 329module_exit(exit_v9fs)
496 330
331MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
497MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); 332MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
498MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>"); 333MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
499MODULE_LICENSE("GPL"); 334MODULE_LICENSE("GPL");
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 820bf5ca35..abc4b1668a 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -22,16 +22,6 @@
22 */ 22 */
23 23
24/* 24/*
25 * Idpool structure provides lock and id management
26 *
27 */
28
29struct v9fs_idpool {
30 struct semaphore lock;
31 struct idr pool;
32};
33
34/*
35 * Session structure provides information for an opened session 25 * Session structure provides information for an opened session
36 * 26 *
37 */ 27 */
@@ -54,15 +44,7 @@ struct v9fs_session_info {
54 unsigned int uid; /* default uid/muid for legacy support */ 44 unsigned int uid; /* default uid/muid for legacy support */
55 unsigned int gid; /* default gid for legacy support */ 45 unsigned int gid; /* default gid for legacy support */
56 46
57 /* book keeping */ 47 struct p9_client *clnt; /* 9p client */
58 struct v9fs_idpool fidpool; /* The FID pool for file descriptors */
59
60 struct v9fs_transport *transport;
61 struct v9fs_mux_data *mux;
62
63 int inprogress; /* session in progress => true */
64 int shutdown; /* session shutting down. no more attaches. */
65 unsigned char session_hung;
66 struct dentry *debugfs_dir; 48 struct dentry *debugfs_dir;
67}; 49};
68 50
@@ -71,6 +53,7 @@ enum {
71 PROTO_TCP, 53 PROTO_TCP,
72 PROTO_UNIX, 54 PROTO_UNIX,
73 PROTO_FD, 55 PROTO_FD,
56 PROTO_PCI,
74}; 57};
75 58
76/* possible values of ->cache */ 59/* possible values of ->cache */
@@ -82,12 +65,9 @@ enum {
82 65
83extern struct dentry *v9fs_debugfs_root; 66extern struct dentry *v9fs_debugfs_root;
84 67
85int v9fs_session_init(struct v9fs_session_info *, const char *, char *); 68struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
86struct v9fs_session_info *v9fs_inode2v9ses(struct inode *); 69 char *);
87void v9fs_session_close(struct v9fs_session_info *v9ses); 70void v9fs_session_close(struct v9fs_session_info *v9ses);
88int v9fs_get_idpool(struct v9fs_idpool *p);
89void v9fs_put_idpool(int id, struct v9fs_idpool *p);
90int v9fs_check_idpool(int id, struct v9fs_idpool *p);
91void v9fs_session_cancel(struct v9fs_session_info *v9ses); 71void v9fs_session_cancel(struct v9fs_session_info *v9ses);
92 72
93#define V9FS_MAGIC 0x01021997 73#define V9FS_MAGIC 0x01021997
@@ -97,3 +77,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses);
97#define V9FS_DEFUSER "nobody" 77#define V9FS_DEFUSER "nobody"
98#define V9FS_DEFANAME "" 78#define V9FS_DEFANAME ""
99 79
80static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
81{
82 return (inode->i_sb->s_fs_info);
83}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 6a82d39dc4..fd01d90cad 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -45,10 +45,10 @@ extern struct dentry_operations v9fs_dentry_operations;
45extern struct dentry_operations v9fs_cached_dentry_operations; 45extern struct dentry_operations v9fs_cached_dentry_operations;
46 46
47struct inode *v9fs_get_inode(struct super_block *sb, int mode); 47struct inode *v9fs_get_inode(struct super_block *sb, int mode);
48ino_t v9fs_qid2ino(struct v9fs_qid *qid); 48ino_t v9fs_qid2ino(struct p9_qid *qid);
49void v9fs_stat2inode(struct v9fs_stat *, struct inode *, struct super_block *); 49void v9fs_stat2inode(struct p9_stat *, struct inode *, struct super_block *);
50int v9fs_dir_release(struct inode *inode, struct file *filp); 50int v9fs_dir_release(struct inode *inode, struct file *filp);
51int v9fs_file_open(struct inode *inode, struct file *file); 51int v9fs_file_open(struct inode *inode, struct file *file);
52void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat); 52void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
53void v9fs_dentry_release(struct dentry *); 53void v9fs_dentry_release(struct dentry *);
54int v9fs_uflags2omode(int uflags); 54int v9fs_uflags2omode(int uflags);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 9ac4ffe9ac..6248f0e727 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,10 +33,10 @@
33#include <linux/pagemap.h> 33#include <linux/pagemap.h>
34#include <linux/idr.h> 34#include <linux/idr.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <net/9p/9p.h>
37#include <net/9p/client.h>
36 38
37#include "debug.h"
38#include "v9fs.h" 39#include "v9fs.h"
39#include "9p.h"
40#include "v9fs_vfs.h" 40#include "v9fs_vfs.h"
41#include "fid.h" 41#include "fid.h"
42 42
@@ -50,55 +50,26 @@
50 50
51static int v9fs_vfs_readpage(struct file *filp, struct page *page) 51static int v9fs_vfs_readpage(struct file *filp, struct page *page)
52{ 52{
53 char *buffer = NULL; 53 int retval;
54 int retval = -EIO; 54 loff_t offset;
55 loff_t offset = page_offset(page); 55 char *buffer;
56 int count = PAGE_CACHE_SIZE; 56 struct p9_fid *fid;
57 struct inode *inode = filp->f_path.dentry->d_inode;
58 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
59 int rsize = v9ses->maxdata - V9FS_IOHDRSZ;
60 struct v9fs_fid *v9f = filp->private_data;
61 struct v9fs_fcall *fcall = NULL;
62 int fid = v9f->fid;
63 int total = 0;
64 int result = 0;
65
66 dprintk(DEBUG_VFS, "\n");
67 57
58 P9_DPRINTK(P9_DEBUG_VFS, "\n");
59 fid = filp->private_data;
68 buffer = kmap(page); 60 buffer = kmap(page);
69 do { 61 offset = page_offset(page);
70 if (count < rsize)
71 rsize = count;
72
73 result = v9fs_t_read(v9ses, fid, offset, rsize, &fcall);
74
75 if (result < 0) {
76 printk(KERN_ERR "v9fs_t_read returned %d\n",
77 result);
78
79 kfree(fcall);
80 goto UnmapAndUnlock;
81 } else
82 offset += result;
83
84 memcpy(buffer, fcall->params.rread.data, result);
85
86 count -= result;
87 buffer += result;
88 total += result;
89
90 kfree(fcall);
91 62
92 if (result < rsize) 63 retval = p9_client_readn(fid, buffer, offset, PAGE_CACHE_SIZE);
93 break; 64 if (retval < 0)
94 } while (count); 65 goto done;
95 66
96 memset(buffer, 0, count); 67 memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval);
97 flush_dcache_page(page); 68 flush_dcache_page(page);
98 SetPageUptodate(page); 69 SetPageUptodate(page);
99 retval = 0; 70 retval = 0;
100 71
101UnmapAndUnlock: 72done:
102 kunmap(page); 73 kunmap(page);
103 unlock_page(page); 74 unlock_page(page);
104 return retval; 75 return retval;
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d93960429c..f9534f18df 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,10 +34,10 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/idr.h> 35#include <linux/idr.h>
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <net/9p/9p.h>
38#include <net/9p/client.h>
37 39
38#include "debug.h"
39#include "v9fs.h" 40#include "v9fs.h"
40#include "9p.h"
41#include "v9fs_vfs.h" 41#include "v9fs_vfs.h"
42#include "fid.h" 42#include "fid.h"
43 43
@@ -52,7 +52,7 @@
52 52
53static int v9fs_dentry_delete(struct dentry *dentry) 53static int v9fs_dentry_delete(struct dentry *dentry)
54{ 54{
55 dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); 55 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
56 56
57 return 1; 57 return 1;
58} 58}
@@ -69,7 +69,7 @@ static int v9fs_dentry_delete(struct dentry *dentry)
69static int v9fs_cached_dentry_delete(struct dentry *dentry) 69static int v9fs_cached_dentry_delete(struct dentry *dentry)
70{ 70{
71 struct inode *inode = dentry->d_inode; 71 struct inode *inode = dentry->d_inode;
72 dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); 72 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
73 73
74 if(!inode) 74 if(!inode)
75 return 1; 75 return 1;
@@ -85,26 +85,19 @@ static int v9fs_cached_dentry_delete(struct dentry *dentry)
85 85
86void v9fs_dentry_release(struct dentry *dentry) 86void v9fs_dentry_release(struct dentry *dentry)
87{ 87{
88 int err; 88 struct v9fs_dentry *dent;
89 89 struct p9_fid *temp, *current_fid;
90 dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); 90
91 91 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
92 if (dentry->d_fsdata != NULL) { 92 dent = dentry->d_fsdata;
93 struct list_head *fid_list = dentry->d_fsdata; 93 if (dent) {
94 struct v9fs_fid *temp = NULL; 94 list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
95 struct v9fs_fid *current_fid = NULL; 95 dlist) {
96 96 p9_client_clunk(current_fid);
97 list_for_each_entry_safe(current_fid, temp, fid_list, list) {
98 err = v9fs_t_clunk(current_fid->v9ses, current_fid->fid);
99
100 if (err < 0)
101 dprintk(DEBUG_ERROR, "clunk failed: %d name %s\n",
102 err, dentry->d_iname);
103
104 v9fs_fid_destroy(current_fid);
105 } 97 }
106 98
107 kfree(dentry->d_fsdata); /* free the list_head */ 99 kfree(dent);
100 dentry->d_fsdata = NULL;
108 } 101 }
109} 102}
110 103
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 1dd86ee90b..0924d4477d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,11 +32,10 @@
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/inet.h> 33#include <linux/inet.h>
34#include <linux/idr.h> 34#include <linux/idr.h>
35#include <net/9p/9p.h>
36#include <net/9p/client.h>
35 37
36#include "debug.h"
37#include "v9fs.h" 38#include "v9fs.h"
38#include "9p.h"
39#include "conv.h"
40#include "v9fs_vfs.h" 39#include "v9fs_vfs.h"
41#include "fid.h" 40#include "fid.h"
42 41
@@ -46,14 +45,14 @@
46 * 45 *
47 */ 46 */
48 47
49static inline int dt_type(struct v9fs_stat *mistat) 48static inline int dt_type(struct p9_stat *mistat)
50{ 49{
51 unsigned long perm = mistat->mode; 50 unsigned long perm = mistat->mode;
52 int rettype = DT_REG; 51 int rettype = DT_REG;
53 52
54 if (perm & V9FS_DMDIR) 53 if (perm & P9_DMDIR)
55 rettype = DT_DIR; 54 rettype = DT_DIR;
56 if (perm & V9FS_DMSYMLINK) 55 if (perm & P9_DMSYMLINK)
57 rettype = DT_LNK; 56 rettype = DT_LNK;
58 57
59 return rettype; 58 return rettype;
@@ -69,106 +68,36 @@ static inline int dt_type(struct v9fs_stat *mistat)
69 68
70static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) 69static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
71{ 70{
72 struct v9fs_fcall *fcall = NULL; 71 int over;
73 struct inode *inode = filp->f_path.dentry->d_inode; 72 struct p9_fid *fid;
74 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); 73 struct v9fs_session_info *v9ses;
75 struct v9fs_fid *file = filp->private_data; 74 struct inode *inode;
76 unsigned int i, n, s; 75 struct p9_stat *st;
77 int fid = -1; 76
78 int ret = 0; 77 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
79 struct v9fs_stat stat; 78 inode = filp->f_path.dentry->d_inode;
80 int over = 0; 79 v9ses = v9fs_inode2v9ses(inode);
81 80 fid = filp->private_data;
82 dprintk(DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); 81 while ((st = p9_client_dirread(fid, filp->f_pos)) != NULL) {
83 82 if (IS_ERR(st))
84 fid = file->fid; 83 return PTR_ERR(st);
85 84
86 if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) { 85 over = filldir(dirent, st->name.str, st->name.len, filp->f_pos,
87 kfree(file->rdir_fcall); 86 v9fs_qid2ino(&st->qid), dt_type(st));
88 file->rdir_fcall = NULL; 87
89 } 88 if (over)
90
91 if (file->rdir_fcall) {
92 n = file->rdir_fcall->params.rread.count;
93 i = file->rdir_fpos;
94 while (i < n) {
95 s = v9fs_deserialize_stat(
96 file->rdir_fcall->params.rread.data + i,
97 n - i, &stat, v9ses->extended);
98
99 if (s == 0) {
100 dprintk(DEBUG_ERROR,
101 "error while deserializing stat\n");
102 ret = -EIO;
103 goto FreeStructs;
104 }
105
106 over = filldir(dirent, stat.name.str, stat.name.len,
107 filp->f_pos, v9fs_qid2ino(&stat.qid),
108 dt_type(&stat));
109
110 if (over) {
111 file->rdir_fpos = i;
112 file->rdir_pos = filp->f_pos;
113 break;
114 }
115
116 i += s;
117 filp->f_pos += s;
118 }
119
120 if (!over) {
121 kfree(file->rdir_fcall);
122 file->rdir_fcall = NULL;
123 }
124 }
125
126 while (!over) {
127 ret = v9fs_t_read(v9ses, fid, filp->f_pos,
128 v9ses->maxdata-V9FS_IOHDRSZ, &fcall);
129 if (ret < 0) {
130 dprintk(DEBUG_ERROR, "error while reading: %d: %p\n",
131 ret, fcall);
132 goto FreeStructs;
133 } else if (ret == 0)
134 break; 89 break;
135 90
136 n = ret; 91 filp->f_pos += st->size;
137 i = 0; 92 kfree(st);
138 while (i < n) { 93 st = NULL;
139 s = v9fs_deserialize_stat(fcall->params.rread.data + i,
140 n - i, &stat, v9ses->extended);
141
142 if (s == 0) {
143 dprintk(DEBUG_ERROR,
144 "error while deserializing stat\n");
145 return -EIO;
146 }
147
148 over = filldir(dirent, stat.name.str, stat.name.len,
149 filp->f_pos, v9fs_qid2ino(&stat.qid),
150 dt_type(&stat));
151
152 if (over) {
153 file->rdir_fcall = fcall;
154 file->rdir_fpos = i;
155 file->rdir_pos = filp->f_pos;
156 fcall = NULL;
157 break;
158 }
159
160 i += s;
161 filp->f_pos += s;
162 }
163
164 kfree(fcall);
165 } 94 }
166 95
167 FreeStructs: 96 kfree(st);
168 kfree(fcall); 97 return 0;
169 return ret;
170} 98}
171 99
100
172/** 101/**
173 * v9fs_dir_release - close a directory 102 * v9fs_dir_release - close a directory
174 * @inode: inode of the directory 103 * @inode: inode of the directory
@@ -178,29 +107,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
178 107
179int v9fs_dir_release(struct inode *inode, struct file *filp) 108int v9fs_dir_release(struct inode *inode, struct file *filp)
180{ 109{
181 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); 110 struct p9_fid *fid;
182 struct v9fs_fid *fid = filp->private_data;
183 int fidnum = -1;
184
185 dprintk(DEBUG_VFS, "inode: %p filp: %p fid: %d\n", inode, filp,
186 fid->fid);
187 fidnum = fid->fid;
188 111
112 fid = filp->private_data;
113 P9_DPRINTK(P9_DEBUG_VFS,
114 "inode: %p filp: %p fid: %d\n", inode, filp, fid->fid);
189 filemap_write_and_wait(inode->i_mapping); 115 filemap_write_and_wait(inode->i_mapping);
190 116 p9_client_clunk(fid);
191 if (fidnum >= 0) {
192 dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen,
193 fid->fid);
194
195 if (v9fs_t_clunk(v9ses, fidnum))
196 dprintk(DEBUG_ERROR, "clunk failed\n");
197
198 kfree(fid->rdir_fcall);
199 kfree(fid);
200
201 filp->private_data = NULL;
202 }
203
204 return 0; 117 return 0;
205} 118}
206 119
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 6e7678e485..2a40c2946d 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -34,10 +34,10 @@
34#include <linux/list.h> 34#include <linux/list.h>
35#include <asm/uaccess.h> 35#include <asm/uaccess.h>
36#include <linux/idr.h> 36#include <linux/idr.h>
37#include <net/9p/9p.h>
38#include <net/9p/client.h>
37 39
38#include "debug.h"
39#include "v9fs.h" 40#include "v9fs.h"
40#include "9p.h"
41#include "v9fs_vfs.h" 41#include "v9fs_vfs.h"
42#include "fid.h" 42#include "fid.h"
43 43
@@ -52,48 +52,40 @@ static const struct file_operations v9fs_cached_file_operations;
52 52
53int v9fs_file_open(struct inode *inode, struct file *file) 53int v9fs_file_open(struct inode *inode, struct file *file)
54{ 54{
55 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
56 struct v9fs_fid *vfid;
57 struct v9fs_fcall *fcall = NULL;
58 int omode;
59 int err; 55 int err;
56 struct v9fs_session_info *v9ses;
57 struct p9_fid *fid;
58 int omode;
60 59
61 dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file); 60 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
62 61 v9ses = v9fs_inode2v9ses(inode);
63 vfid = v9fs_fid_clone(file->f_path.dentry);
64 if (IS_ERR(vfid))
65 return PTR_ERR(vfid);
66
67 omode = v9fs_uflags2omode(file->f_flags); 62 omode = v9fs_uflags2omode(file->f_flags);
68 err = v9fs_t_open(v9ses, vfid->fid, omode, &fcall); 63 fid = file->private_data;
69 if (err < 0) { 64 if (!fid) {
70 PRINT_FCALL_ERROR("open failed", fcall); 65 fid = v9fs_fid_clone(file->f_path.dentry);
71 goto Clunk_Fid; 66 if (IS_ERR(fid))
67 return PTR_ERR(fid);
68
69 err = p9_client_open(fid, omode);
70 if (err < 0) {
71 p9_client_clunk(fid);
72 return err;
73 }
74 if (omode & P9_OTRUNC) {
75 inode->i_size = 0;
76 inode->i_blocks = 0;
77 }
72 } 78 }
73 79
74 file->private_data = vfid; 80 file->private_data = fid;
75 vfid->fidopen = 1; 81 if ((fid->qid.version) && (v9ses->cache)) {
76 vfid->fidclunked = 0; 82 P9_DPRINTK(P9_DEBUG_VFS, "cached");
77 vfid->iounit = fcall->params.ropen.iounit;
78 vfid->rdir_pos = 0;
79 vfid->rdir_fcall = NULL;
80 vfid->filp = file;
81 kfree(fcall);
82
83 if((vfid->qid.version) && (v9ses->cache)) {
84 dprintk(DEBUG_VFS, "cached");
85 /* enable cached file options */ 83 /* enable cached file options */
86 if(file->f_op == &v9fs_file_operations) 84 if(file->f_op == &v9fs_file_operations)
87 file->f_op = &v9fs_cached_file_operations; 85 file->f_op = &v9fs_cached_file_operations;
88 } 86 }
89 87
90 return 0; 88 return 0;
91
92Clunk_Fid:
93 v9fs_fid_clunk(v9ses, vfid);
94 kfree(fcall);
95
96 return err;
97} 89}
98 90
99/** 91/**
@@ -110,7 +102,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
110 int res = 0; 102 int res = 0;
111 struct inode *inode = filp->f_path.dentry->d_inode; 103 struct inode *inode = filp->f_path.dentry->d_inode;
112 104
113 dprintk(DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); 105 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
114 106
115 /* No mandatory locks */ 107 /* No mandatory locks */
116 if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) 108 if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
@@ -136,55 +128,16 @@ static ssize_t
136v9fs_file_read(struct file *filp, char __user * data, size_t count, 128v9fs_file_read(struct file *filp, char __user * data, size_t count,
137 loff_t * offset) 129 loff_t * offset)
138{ 130{
139 struct inode *inode = filp->f_path.dentry->d_inode; 131 int ret;
140 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); 132 struct p9_fid *fid;
141 struct v9fs_fid *v9f = filp->private_data;
142 struct v9fs_fcall *fcall = NULL;
143 int fid = v9f->fid;
144 int rsize = 0;
145 int result = 0;
146 int total = 0;
147 int n;
148
149 dprintk(DEBUG_VFS, "\n");
150
151 rsize = v9ses->maxdata - V9FS_IOHDRSZ;
152 if (v9f->iounit != 0 && rsize > v9f->iounit)
153 rsize = v9f->iounit;
154
155 do {
156 if (count < rsize)
157 rsize = count;
158 133
159 result = v9fs_t_read(v9ses, fid, *offset, rsize, &fcall); 134 P9_DPRINTK(P9_DEBUG_VFS, "\n");
135 fid = filp->private_data;
136 ret = p9_client_uread(fid, data, *offset, count);
137 if (ret > 0)
138 *offset += ret;
160 139
161 if (result < 0) { 140 return ret;
162 printk(KERN_ERR "9P2000: v9fs_t_read returned %d\n",
163 result);
164
165 kfree(fcall);
166 return total;
167 } else
168 *offset += result;
169
170 n = copy_to_user(data, fcall->params.rread.data, result);
171 if (n) {
172 dprintk(DEBUG_ERROR, "Problem copying to user %d\n", n);
173 kfree(fcall);
174 return -EFAULT;
175 }
176
177 count -= result;
178 data += result;
179 total += result;
180
181 kfree(fcall);
182
183 if (result < rsize)
184 break;
185 } while (count);
186
187 return total;
188} 141}
189 142
190/** 143/**
@@ -200,50 +153,25 @@ static ssize_t
200v9fs_file_write(struct file *filp, const char __user * data, 153v9fs_file_write(struct file *filp, const char __user * data,
201 size_t count, loff_t * offset) 154 size_t count, loff_t * offset)
202{ 155{
156 int ret;
157 struct p9_fid *fid;
203 struct inode *inode = filp->f_path.dentry->d_inode; 158 struct inode *inode = filp->f_path.dentry->d_inode;
204 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
205 struct v9fs_fid *v9fid = filp->private_data;
206 struct v9fs_fcall *fcall;
207 int fid = v9fid->fid;
208 int result = -EIO;
209 int rsize = 0;
210 int total = 0;
211
212 dprintk(DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count,
213 (int)*offset);
214 rsize = v9ses->maxdata - V9FS_IOHDRSZ;
215 if (v9fid->iounit != 0 && rsize > v9fid->iounit)
216 rsize = v9fid->iounit;
217
218 do {
219 if (count < rsize)
220 rsize = count;
221 159
222 result = v9fs_t_write(v9ses, fid, *offset, rsize, data, &fcall); 160 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
223 if (result < 0) { 161 (int)count, (int)*offset);
224 PRINT_FCALL_ERROR("error while writing", fcall);
225 kfree(fcall);
226 return result;
227 } else
228 *offset += result;
229 162
230 kfree(fcall); 163 fid = filp->private_data;
231 fcall = NULL; 164 ret = p9_client_uwrite(fid, data, *offset, count);
165 if (ret > 0)
166 *offset += ret;
232 167
233 if (result != rsize) { 168 if (*offset > inode->i_size) {
234 eprintk(KERN_ERR, 169 inode->i_size = *offset;
235 "short write: v9fs_t_write returned %d\n", 170 inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
236 result); 171 }
237 break;
238 }
239
240 count -= result;
241 data += result;
242 total += result;
243 } while (count);
244 172
245 invalidate_inode_pages2(inode->i_mapping); 173 invalidate_inode_pages2(inode->i_mapping);
246 return total; 174 return ret;
247} 175}
248 176
249static const struct file_operations v9fs_cached_file_operations = { 177static const struct file_operations v9fs_cached_file_operations = {
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c76cd8fa3f..e5c45eed58 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,10 +34,10 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/idr.h> 35#include <linux/idr.h>
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <net/9p/9p.h>
38#include <net/9p/client.h>
37 39
38#include "debug.h"
39#include "v9fs.h" 40#include "v9fs.h"
40#include "9p.h"
41#include "v9fs_vfs.h" 41#include "v9fs_vfs.h"
42#include "fid.h" 42#include "fid.h"
43 43
@@ -58,27 +58,27 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
58 int res; 58 int res;
59 res = mode & 0777; 59 res = mode & 0777;
60 if (S_ISDIR(mode)) 60 if (S_ISDIR(mode))
61 res |= V9FS_DMDIR; 61 res |= P9_DMDIR;
62 if (v9ses->extended) { 62 if (v9ses->extended) {
63 if (S_ISLNK(mode)) 63 if (S_ISLNK(mode))
64 res |= V9FS_DMSYMLINK; 64 res |= P9_DMSYMLINK;
65 if (v9ses->nodev == 0) { 65 if (v9ses->nodev == 0) {
66 if (S_ISSOCK(mode)) 66 if (S_ISSOCK(mode))
67 res |= V9FS_DMSOCKET; 67 res |= P9_DMSOCKET;
68 if (S_ISFIFO(mode)) 68 if (S_ISFIFO(mode))
69 res |= V9FS_DMNAMEDPIPE; 69 res |= P9_DMNAMEDPIPE;
70 if (S_ISBLK(mode)) 70 if (S_ISBLK(mode))
71 res |= V9FS_DMDEVICE; 71 res |= P9_DMDEVICE;
72 if (S_ISCHR(mode)) 72 if (S_ISCHR(mode))
73 res |= V9FS_DMDEVICE; 73 res |= P9_DMDEVICE;
74 } 74 }
75 75
76 if ((mode & S_ISUID) == S_ISUID) 76 if ((mode & S_ISUID) == S_ISUID)
77 res |= V9FS_DMSETUID; 77 res |= P9_DMSETUID;
78 if ((mode & S_ISGID) == S_ISGID) 78 if ((mode & S_ISGID) == S_ISGID)
79 res |= V9FS_DMSETGID; 79 res |= P9_DMSETGID;
80 if ((mode & V9FS_DMLINK)) 80 if ((mode & P9_DMLINK))
81 res |= V9FS_DMLINK; 81 res |= P9_DMLINK;
82 } 82 }
83 83
84 return res; 84 return res;
@@ -97,27 +97,27 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
97 97
98 res = mode & 0777; 98 res = mode & 0777;
99 99
100 if ((mode & V9FS_DMDIR) == V9FS_DMDIR) 100 if ((mode & P9_DMDIR) == P9_DMDIR)
101 res |= S_IFDIR; 101 res |= S_IFDIR;
102 else if ((mode & V9FS_DMSYMLINK) && (v9ses->extended)) 102 else if ((mode & P9_DMSYMLINK) && (v9ses->extended))
103 res |= S_IFLNK; 103 res |= S_IFLNK;
104 else if ((mode & V9FS_DMSOCKET) && (v9ses->extended) 104 else if ((mode & P9_DMSOCKET) && (v9ses->extended)
105 && (v9ses->nodev == 0)) 105 && (v9ses->nodev == 0))
106 res |= S_IFSOCK; 106 res |= S_IFSOCK;
107 else if ((mode & V9FS_DMNAMEDPIPE) && (v9ses->extended) 107 else if ((mode & P9_DMNAMEDPIPE) && (v9ses->extended)
108 && (v9ses->nodev == 0)) 108 && (v9ses->nodev == 0))
109 res |= S_IFIFO; 109 res |= S_IFIFO;
110 else if ((mode & V9FS_DMDEVICE) && (v9ses->extended) 110 else if ((mode & P9_DMDEVICE) && (v9ses->extended)
111 && (v9ses->nodev == 0)) 111 && (v9ses->nodev == 0))
112 res |= S_IFBLK; 112 res |= S_IFBLK;
113 else 113 else
114 res |= S_IFREG; 114 res |= S_IFREG;
115 115
116 if (v9ses->extended) { 116 if (v9ses->extended) {
117 if ((mode & V9FS_DMSETUID) == V9FS_DMSETUID) 117 if ((mode & P9_DMSETUID) == P9_DMSETUID)
118 res |= S_ISUID; 118 res |= S_ISUID;
119 119
120 if ((mode & V9FS_DMSETGID) == V9FS_DMSETGID) 120 if ((mode & P9_DMSETGID) == P9_DMSETGID)
121 res |= S_ISGID; 121 res |= S_ISGID;
122 } 122 }
123 123
@@ -132,26 +132,26 @@ int v9fs_uflags2omode(int uflags)
132 switch (uflags&3) { 132 switch (uflags&3) {
133 default: 133 default:
134 case O_RDONLY: 134 case O_RDONLY:
135 ret = V9FS_OREAD; 135 ret = P9_OREAD;
136 break; 136 break;
137 137
138 case O_WRONLY: 138 case O_WRONLY:
139 ret = V9FS_OWRITE; 139 ret = P9_OWRITE;
140 break; 140 break;
141 141
142 case O_RDWR: 142 case O_RDWR:
143 ret = V9FS_ORDWR; 143 ret = P9_ORDWR;
144 break; 144 break;
145 } 145 }
146 146
147 if (uflags & O_EXCL) 147 if (uflags & O_EXCL)
148 ret |= V9FS_OEXCL; 148 ret |= P9_OEXCL;
149 149
150 if (uflags & O_TRUNC) 150 if (uflags & O_TRUNC)
151 ret |= V9FS_OTRUNC; 151 ret |= P9_OTRUNC;
152 152
153 if (uflags & O_APPEND) 153 if (uflags & O_APPEND)
154 ret |= V9FS_OAPPEND; 154 ret |= P9_OAPPEND;
155 155
156 return ret; 156 return ret;
157} 157}
@@ -164,7 +164,7 @@ int v9fs_uflags2omode(int uflags)
164 */ 164 */
165 165
166static void 166static void
167v9fs_blank_wstat(struct v9fs_wstat *wstat) 167v9fs_blank_wstat(struct p9_wstat *wstat)
168{ 168{
169 wstat->type = ~0; 169 wstat->type = ~0;
170 wstat->dev = ~0; 170 wstat->dev = ~0;
@@ -197,7 +197,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
197 struct inode *inode; 197 struct inode *inode;
198 struct v9fs_session_info *v9ses = sb->s_fs_info; 198 struct v9fs_session_info *v9ses = sb->s_fs_info;
199 199
200 dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); 200 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
201 201
202 inode = new_inode(sb); 202 inode = new_inode(sb);
203 if (inode) { 203 if (inode) {
@@ -215,7 +215,8 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
215 case S_IFCHR: 215 case S_IFCHR:
216 case S_IFSOCK: 216 case S_IFSOCK:
217 if(!v9ses->extended) { 217 if(!v9ses->extended) {
218 dprintk(DEBUG_ERROR, "special files without extended mode\n"); 218 P9_DPRINTK(P9_DEBUG_ERROR,
219 "special files without extended mode\n");
219 return ERR_PTR(-EINVAL); 220 return ERR_PTR(-EINVAL);
220 } 221 }
221 init_special_inode(inode, inode->i_mode, 222 init_special_inode(inode, inode->i_mode,
@@ -227,7 +228,8 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
227 break; 228 break;
228 case S_IFLNK: 229 case S_IFLNK:
229 if(!v9ses->extended) { 230 if(!v9ses->extended) {
230 dprintk(DEBUG_ERROR, "extended modes used w/o 9P2000.u\n"); 231 P9_DPRINTK(P9_DEBUG_ERROR,
232 "extended modes used w/o 9P2000.u\n");
231 return ERR_PTR(-EINVAL); 233 return ERR_PTR(-EINVAL);
232 } 234 }
233 inode->i_op = &v9fs_symlink_inode_operations; 235 inode->i_op = &v9fs_symlink_inode_operations;
@@ -241,71 +243,19 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
241 inode->i_fop = &v9fs_dir_operations; 243 inode->i_fop = &v9fs_dir_operations;
242 break; 244 break;
243 default: 245 default:
244 dprintk(DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", 246 P9_DPRINTK(P9_DEBUG_ERROR,
247 "BAD mode 0x%x S_IFMT 0x%x\n",
245 mode, mode & S_IFMT); 248 mode, mode & S_IFMT);
246 return ERR_PTR(-EINVAL); 249 return ERR_PTR(-EINVAL);
247 } 250 }
248 } else { 251 } else {
249 eprintk(KERN_WARNING, "Problem allocating inode\n"); 252 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
250 return ERR_PTR(-ENOMEM); 253 return ERR_PTR(-ENOMEM);
251 } 254 }
252 return inode; 255 return inode;
253} 256}
254 257
255static int 258/*
256v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, u32 perm,
257 u8 mode, char *extension, u32 *fidp, struct v9fs_qid *qid, u32 *iounit)
258{
259 int fid;
260 int err;
261 struct v9fs_fcall *fcall;
262
263 fid = v9fs_get_idpool(&v9ses->fidpool);
264 if (fid < 0) {
265 eprintk(KERN_WARNING, "no free fids available\n");
266 return -ENOSPC;
267 }
268
269 err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall);
270 if (err < 0) {
271 PRINT_FCALL_ERROR("clone error", fcall);
272 if (fcall && fcall->id == RWALK)
273 goto clunk_fid;
274 else
275 goto put_fid;
276 }
277 kfree(fcall);
278
279 err = v9fs_t_create(v9ses, fid, name, perm, mode, extension, &fcall);
280 if (err < 0) {
281 PRINT_FCALL_ERROR("create fails", fcall);
282 goto clunk_fid;
283 }
284
285 if (iounit)
286 *iounit = fcall->params.rcreate.iounit;
287
288 if (qid)
289 *qid = fcall->params.rcreate.qid;
290
291 if (fidp)
292 *fidp = fid;
293
294 kfree(fcall);
295 return 0;
296
297clunk_fid:
298 v9fs_t_clunk(v9ses, fid);
299 fid = V9FS_NOFID;
300
301put_fid:
302 if (fid != V9FS_NOFID)
303 v9fs_put_idpool(fid, &v9ses->fidpool);
304
305 kfree(fcall);
306 return err;
307}
308
309static struct v9fs_fid* 259static struct v9fs_fid*
310v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) 260v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
311{ 261{
@@ -355,23 +305,25 @@ error:
355 kfree(fcall); 305 kfree(fcall);
356 return ERR_PTR(err); 306 return ERR_PTR(err);
357} 307}
308*/
358 309
359static struct inode * 310static struct inode *
360v9fs_inode_from_fid(struct v9fs_session_info *v9ses, u32 fid, 311v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
361 struct super_block *sb) 312 struct super_block *sb)
362{ 313{
363 int err, umode; 314 int err, umode;
364 struct inode *ret; 315 struct inode *ret;
365 struct v9fs_fcall *fcall; 316 struct p9_stat *st;
366 317
367 ret = NULL; 318 ret = NULL;
368 err = v9fs_t_stat(v9ses, fid, &fcall); 319 st = p9_client_stat(fid);
369 if (err) { 320 if (IS_ERR(st)) {
370 PRINT_FCALL_ERROR("stat error", fcall); 321 err = PTR_ERR(st);
322 st = NULL;
371 goto error; 323 goto error;
372 } 324 }
373 325
374 umode = p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode); 326 umode = p9mode2unixmode(v9ses, st->mode);
375 ret = v9fs_get_inode(sb, umode); 327 ret = v9fs_get_inode(sb, umode);
376 if (IS_ERR(ret)) { 328 if (IS_ERR(ret)) {
377 err = PTR_ERR(ret); 329 err = PTR_ERR(ret);
@@ -379,12 +331,13 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, u32 fid,
379 goto error; 331 goto error;
380 } 332 }
381 333
382 v9fs_stat2inode(&fcall->params.rstat.stat, ret, sb); 334 v9fs_stat2inode(st, ret, sb);
383 kfree(fcall); 335 ret->i_ino = v9fs_qid2ino(&st->qid);
336 kfree(st);
384 return ret; 337 return ret;
385 338
386error: 339error:
387 kfree(fcall); 340 kfree(st);
388 if (ret) 341 if (ret)
389 iput(ret); 342 iput(ret);
390 343
@@ -401,43 +354,20 @@ error:
401 354
402static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) 355static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
403{ 356{
404 struct v9fs_fcall *fcall = NULL; 357 struct inode *file_inode;
405 struct super_block *sb = NULL; 358 struct v9fs_session_info *v9ses;
406 struct v9fs_session_info *v9ses = NULL; 359 struct p9_fid *v9fid;
407 struct v9fs_fid *v9fid = NULL;
408 struct inode *file_inode = NULL;
409 int fid = -1;
410 int result = 0;
411 360
412 dprintk(DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 361 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
413 rmdir); 362 rmdir);
414 363
415 file_inode = file->d_inode; 364 file_inode = file->d_inode;
416 sb = file_inode->i_sb;
417 v9ses = v9fs_inode2v9ses(file_inode); 365 v9ses = v9fs_inode2v9ses(file_inode);
418 v9fid = v9fs_fid_clone(file); 366 v9fid = v9fs_fid_clone(file);
419 if(IS_ERR(v9fid)) 367 if(IS_ERR(v9fid))
420 return PTR_ERR(v9fid); 368 return PTR_ERR(v9fid);
421 369
422 fid = v9fid->fid; 370 return p9_client_remove(v9fid);
423 if (fid < 0) {
424 dprintk(DEBUG_ERROR, "inode #%lu, no fid!\n",
425 file_inode->i_ino);
426 return -EBADF;
427 }
428
429 result = v9fs_t_remove(v9ses, fid, &fcall);
430 if (result < 0) {
431 PRINT_FCALL_ERROR("remove fails", fcall);
432 goto Error;
433 }
434
435 v9fs_put_idpool(fid, &v9ses->fidpool);
436 v9fs_fid_destroy(v9fid);
437
438Error:
439 kfree(fcall);
440 return result;
441} 371}
442 372
443static int 373static int
@@ -446,61 +376,59 @@ v9fs_open_created(struct inode *inode, struct file *file)
446 return 0; 376 return 0;
447} 377}
448 378
379
449/** 380/**
450 * v9fs_vfs_create - VFS hook to create files 381 * v9fs_create - Create a file
451 * @inode: directory inode that is being deleted 382 * @dentry: dentry that is being created
452 * @dentry: dentry that is being deleted 383 * @perm: create permissions
453 * @mode: create permissions 384 * @mode: open mode
454 * @nd: path information
455 * 385 *
456 */ 386 */
457 387static struct p9_fid *
458static int 388v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
459v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, 389 struct dentry *dentry, char *extension, u32 perm, u8 mode)
460 struct nameidata *nd)
461{ 390{
462 int err; 391 int err;
463 u32 fid, perm, iounit; 392 char *name;
464 int flags; 393 struct p9_fid *dfid, *ofid, *fid;
465 struct v9fs_session_info *v9ses;
466 struct v9fs_fid *dfid, *vfid, *ffid;
467 struct inode *inode; 394 struct inode *inode;
468 struct v9fs_qid qid;
469 struct file *filp;
470 395
471 inode = NULL; 396 err = 0;
472 vfid = NULL; 397 ofid = NULL;
473 v9ses = v9fs_inode2v9ses(dir); 398 fid = NULL;
399 name = (char *) dentry->d_name.name;
474 dfid = v9fs_fid_clone(dentry->d_parent); 400 dfid = v9fs_fid_clone(dentry->d_parent);
475 if(IS_ERR(dfid)) { 401 if(IS_ERR(dfid)) {
476 err = PTR_ERR(dfid); 402 err = PTR_ERR(dfid);
403 dfid = NULL;
477 goto error; 404 goto error;
478 } 405 }
479 406
480 perm = unixmode2p9mode(v9ses, mode); 407 /* clone a fid to use for creation */
481 if (nd && nd->flags & LOOKUP_OPEN) 408 ofid = p9_client_walk(dfid, 0, NULL, 1);
482 flags = nd->intent.open.flags - 1; 409 if (IS_ERR(ofid)) {
483 else 410 err = PTR_ERR(ofid);
484 flags = O_RDWR; 411 ofid = NULL;
485 412 goto error;
486 err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, 413 }
487 perm, v9fs_uflags2omode(flags), NULL, &fid, &qid, &iounit);
488 414
489 if (err) 415 err = p9_client_fcreate(ofid, name, perm, mode, extension);
490 goto clunk_dfid; 416 if (err < 0)
417 goto error;
491 418
492 vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); 419 /* now walk from the parent so we can get unopened fid */
493 v9fs_fid_clunk(v9ses, dfid); 420 fid = p9_client_walk(dfid, 1, &name, 0);
494 if (IS_ERR(vfid)) { 421 if (IS_ERR(fid)) {
495 err = PTR_ERR(vfid); 422 err = PTR_ERR(fid);
496 vfid = NULL; 423 fid = NULL;
497 goto error; 424 goto error;
498 } 425 } else
426 dfid = NULL;
499 427
500 inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); 428 /* instantiate inode and assign the unopened fid to the dentry */
429 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
501 if (IS_ERR(inode)) { 430 if (IS_ERR(inode)) {
502 err = PTR_ERR(inode); 431 err = PTR_ERR(inode);
503 inode = NULL;
504 goto error; 432 goto error;
505 } 433 }
506 434
@@ -508,35 +436,78 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
508 dentry->d_op = &v9fs_cached_dentry_operations; 436 dentry->d_op = &v9fs_cached_dentry_operations;
509 else 437 else
510 dentry->d_op = &v9fs_dentry_operations; 438 dentry->d_op = &v9fs_dentry_operations;
439
511 d_instantiate(dentry, inode); 440 d_instantiate(dentry, inode);
441 v9fs_fid_add(dentry, fid);
442 return ofid;
512 443
513 if (nd && nd->flags & LOOKUP_OPEN) { 444error:
514 ffid = v9fs_fid_create(v9ses, fid); 445 if (dfid)
515 if (!ffid) 446 p9_client_clunk(dfid);
516 return -ENOMEM; 447
448 if (ofid)
449 p9_client_clunk(ofid);
450
451 if (fid)
452 p9_client_clunk(fid);
453
454 return ERR_PTR(err);
455}
456
457/**
458 * v9fs_vfs_create - VFS hook to create files
459 * @inode: directory inode that is being created
460 * @dentry: dentry that is being deleted
461 * @mode: create permissions
462 * @nd: path information
463 *
464 */
517 465
466static int
467v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
468 struct nameidata *nd)
469{
470 int err;
471 u32 perm;
472 int flags;
473 struct v9fs_session_info *v9ses;
474 struct p9_fid *fid;
475 struct file *filp;
476
477 err = 0;
478 fid = NULL;
479 v9ses = v9fs_inode2v9ses(dir);
480 perm = unixmode2p9mode(v9ses, mode);
481 if (nd && nd->flags & LOOKUP_OPEN)
482 flags = nd->intent.open.flags - 1;
483 else
484 flags = O_RDWR;
485
486 fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
487 v9fs_uflags2omode(flags));
488 if (IS_ERR(fid)) {
489 err = PTR_ERR(fid);
490 fid = NULL;
491 goto error;
492 }
493
494 /* if we are opening a file, assign the open fid to the file */
495 if (nd && nd->flags & LOOKUP_OPEN) {
518 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); 496 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
519 if (IS_ERR(filp)) { 497 if (IS_ERR(filp)) {
520 v9fs_fid_destroy(ffid); 498 err = PTR_ERR(filp);
521 return PTR_ERR(filp); 499 goto error;
522 } 500 }
523 501
524 ffid->rdir_pos = 0; 502 filp->private_data = fid;
525 ffid->rdir_fcall = NULL; 503 } else
526 ffid->fidopen = 1; 504 p9_client_clunk(fid);
527 ffid->iounit = iounit;
528 ffid->filp = filp;
529 filp->private_data = ffid;
530 }
531 505
532 return 0; 506 return 0;
533 507
534clunk_dfid:
535 v9fs_fid_clunk(v9ses, dfid);
536
537error: 508error:
538 if (vfid) 509 if (fid)
539 v9fs_fid_destroy(vfid); 510 p9_client_clunk(fid);
540 511
541 return err; 512 return err;
542} 513}
@@ -552,57 +523,23 @@ error:
552static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 523static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
553{ 524{
554 int err; 525 int err;
555 u32 fid, perm; 526 u32 perm;
556 struct v9fs_session_info *v9ses; 527 struct v9fs_session_info *v9ses;
557 struct v9fs_fid *dfid, *vfid; 528 struct p9_fid *fid;
558 struct inode *inode;
559 529
560 inode = NULL; 530 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
561 vfid = NULL; 531 err = 0;
562 v9ses = v9fs_inode2v9ses(dir); 532 v9ses = v9fs_inode2v9ses(dir);
563 dfid = v9fs_fid_clone(dentry->d_parent);
564 if(IS_ERR(dfid)) {
565 err = PTR_ERR(dfid);
566 goto error;
567 }
568
569 perm = unixmode2p9mode(v9ses, mode | S_IFDIR); 533 perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
570 534 fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_OREAD);
571 err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, 535 if (IS_ERR(fid)) {
572 perm, V9FS_OREAD, NULL, &fid, NULL, NULL); 536 err = PTR_ERR(fid);
573 537 fid = NULL;
574 if (err) {
575 dprintk(DEBUG_ERROR, "create error %d\n", err);
576 goto clean_up_dfid;
577 } 538 }
578 539
579 vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); 540 if (fid)
580 if (IS_ERR(vfid)) { 541 p9_client_clunk(fid);
581 err = PTR_ERR(vfid);
582 vfid = NULL;
583 goto clean_up_dfid;
584 }
585 542
586 v9fs_fid_clunk(v9ses, dfid);
587 inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
588 if (IS_ERR(inode)) {
589 err = PTR_ERR(inode);
590 inode = NULL;
591 v9fs_fid_destroy(vfid);
592 goto error;
593 }
594
595 if(v9ses->cache)
596 dentry->d_op = &v9fs_cached_dentry_operations;
597 else
598 dentry->d_op = &v9fs_dentry_operations;
599 d_instantiate(dentry, inode);
600 return 0;
601
602clean_up_dfid:
603 v9fs_fid_clunk(v9ses, dfid);
604
605error:
606 return err; 543 return err;
607} 544}
608 545
@@ -619,104 +556,54 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
619{ 556{
620 struct super_block *sb; 557 struct super_block *sb;
621 struct v9fs_session_info *v9ses; 558 struct v9fs_session_info *v9ses;
622 struct v9fs_fid *dirfid; 559 struct p9_fid *dfid, *fid;
623 struct v9fs_fid *fid;
624 struct inode *inode; 560 struct inode *inode;
625 struct v9fs_fcall *fcall = NULL; 561 char *name;
626 int dirfidnum = -1;
627 int newfid = -1;
628 int result = 0; 562 int result = 0;
629 563
630 dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", 564 P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
631 dir, dentry->d_name.name, dentry, nameidata); 565 dir, dentry->d_name.name, dentry, nameidata);
632 566
633 sb = dir->i_sb; 567 sb = dir->i_sb;
634 v9ses = v9fs_inode2v9ses(dir); 568 v9ses = v9fs_inode2v9ses(dir);
635 dirfid = v9fs_fid_lookup(dentry->d_parent); 569 dfid = v9fs_fid_lookup(dentry->d_parent);
636 570 if (IS_ERR(dfid))
637 if(IS_ERR(dirfid)) 571 return ERR_PTR(PTR_ERR(dfid));
638 return ERR_PTR(PTR_ERR(dirfid)); 572
639 573 name = (char *) dentry->d_name.name;
640 dirfidnum = dirfid->fid; 574 fid = p9_client_walk(dfid, 1, &name, 1);
641 575 if (IS_ERR(fid)) {
642 newfid = v9fs_get_idpool(&v9ses->fidpool); 576 result = PTR_ERR(fid);
643 if (newfid < 0) {
644 eprintk(KERN_WARNING, "newfid fails!\n");
645 result = -ENOSPC;
646 goto Release_Dirfid;
647 }
648
649 result = v9fs_t_walk(v9ses, dirfidnum, newfid,
650 (char *)dentry->d_name.name, &fcall);
651
652 up(&dirfid->lock);
653
654 if (result < 0) {
655 if (fcall && fcall->id == RWALK)
656 v9fs_t_clunk(v9ses, newfid);
657 else
658 v9fs_put_idpool(newfid, &v9ses->fidpool);
659
660 if (result == -ENOENT) { 577 if (result == -ENOENT) {
661 d_add(dentry, NULL); 578 d_add(dentry, NULL);
662 dprintk(DEBUG_VFS,
663 "Return negative dentry %p count %d\n",
664 dentry, atomic_read(&dentry->d_count));
665 kfree(fcall);
666 return NULL; 579 return NULL;
667 } 580 }
668 dprintk(DEBUG_ERROR, "walk error:%d\n", result);
669 goto FreeFcall;
670 }
671 kfree(fcall);
672
673 result = v9fs_t_stat(v9ses, newfid, &fcall);
674 if (result < 0) {
675 dprintk(DEBUG_ERROR, "stat error\n");
676 goto FreeFcall;
677 }
678
679 inode = v9fs_get_inode(sb, p9mode2unixmode(v9ses,
680 fcall->params.rstat.stat.mode));
681 581
682 if (IS_ERR(inode) && (PTR_ERR(inode) == -ENOSPC)) { 582 return ERR_PTR(result);
683 eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n",
684 PTR_ERR(inode));
685
686 result = -ENOSPC;
687 goto FreeFcall;
688 } 583 }
689 584
690 inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); 585 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
691 586 if (IS_ERR(inode)) {
692 fid = v9fs_fid_create(v9ses, newfid); 587 result = PTR_ERR(inode);
693 if (fid == NULL) { 588 inode = NULL;
694 dprintk(DEBUG_ERROR, "couldn't insert\n"); 589 goto error;
695 result = -ENOMEM;
696 goto FreeFcall;
697 } 590 }
698 591
699 result = v9fs_fid_insert(fid, dentry); 592 result = v9fs_fid_add(dentry, fid);
700 if (result < 0) 593 if (result < 0)
701 goto FreeFcall; 594 goto error;
702 595
703 fid->qid = fcall->params.rstat.stat.qid;
704 v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb);
705 if((fid->qid.version)&&(v9ses->cache)) 596 if((fid->qid.version)&&(v9ses->cache))
706 dentry->d_op = &v9fs_cached_dentry_operations; 597 dentry->d_op = &v9fs_cached_dentry_operations;
707 else 598 else
708 dentry->d_op = &v9fs_dentry_operations; 599 dentry->d_op = &v9fs_dentry_operations;
709 600
710 d_add(dentry, inode); 601 d_add(dentry, inode);
711 kfree(fcall);
712
713 return NULL; 602 return NULL;
714 603
715Release_Dirfid: 604error:
716 up(&dirfid->lock); 605 if (fid)
717 606 p9_client_clunk(fid);
718FreeFcall:
719 kfree(fcall);
720 607
721 return ERR_PTR(result); 608 return ERR_PTR(result);
722} 609}
@@ -758,73 +645,54 @@ static int
758v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 645v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
759 struct inode *new_dir, struct dentry *new_dentry) 646 struct inode *new_dir, struct dentry *new_dentry)
760{ 647{
761 struct inode *old_inode = old_dentry->d_inode; 648 struct inode *old_inode;
762 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(old_inode); 649 struct v9fs_session_info *v9ses;
763 struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry); 650 struct p9_fid *oldfid;
764 struct v9fs_fid *olddirfid; 651 struct p9_fid *olddirfid;
765 struct v9fs_fid *newdirfid; 652 struct p9_fid *newdirfid;
766 struct v9fs_wstat wstat; 653 struct p9_wstat wstat;
767 struct v9fs_fcall *fcall = NULL; 654 int retval;
768 int fid = -1;
769 int olddirfidnum = -1;
770 int newdirfidnum = -1;
771 int retval = 0;
772
773 dprintk(DEBUG_VFS, "\n");
774 655
656 P9_DPRINTK(P9_DEBUG_VFS, "\n");
657 retval = 0;
658 old_inode = old_dentry->d_inode;
659 v9ses = v9fs_inode2v9ses(old_inode);
660 oldfid = v9fs_fid_lookup(old_dentry);
775 if(IS_ERR(oldfid)) 661 if(IS_ERR(oldfid))
776 return PTR_ERR(oldfid); 662 return PTR_ERR(oldfid);
777 663
778 olddirfid = v9fs_fid_clone(old_dentry->d_parent); 664 olddirfid = v9fs_fid_clone(old_dentry->d_parent);
779 if(IS_ERR(olddirfid)) { 665 if(IS_ERR(olddirfid)) {
780 retval = PTR_ERR(olddirfid); 666 retval = PTR_ERR(olddirfid);
781 goto Release_lock; 667 goto done;
782 } 668 }
783 669
784 newdirfid = v9fs_fid_clone(new_dentry->d_parent); 670 newdirfid = v9fs_fid_clone(new_dentry->d_parent);
785 if(IS_ERR(newdirfid)) { 671 if(IS_ERR(newdirfid)) {
786 retval = PTR_ERR(newdirfid); 672 retval = PTR_ERR(newdirfid);
787 goto Clunk_olddir; 673 goto clunk_olddir;
788 } 674 }
789 675
790 /* 9P can only handle file rename in the same directory */ 676 /* 9P can only handle file rename in the same directory */
791 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { 677 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
792 dprintk(DEBUG_ERROR, "old dir and new dir are different\n"); 678 P9_DPRINTK(P9_DEBUG_ERROR,
679 "old dir and new dir are different\n");
793 retval = -EXDEV; 680 retval = -EXDEV;
794 goto Clunk_newdir; 681 goto clunk_newdir;
795 }
796
797 fid = oldfid->fid;
798 olddirfidnum = olddirfid->fid;
799 newdirfidnum = newdirfid->fid;
800
801 if (fid < 0) {
802 dprintk(DEBUG_ERROR, "no fid for old file #%lu\n",
803 old_inode->i_ino);
804 retval = -EBADF;
805 goto Clunk_newdir;
806 } 682 }
807 683
808 v9fs_blank_wstat(&wstat); 684 v9fs_blank_wstat(&wstat);
809 wstat.muid = v9ses->name; 685 wstat.muid = v9ses->name;
810 wstat.name = (char *) new_dentry->d_name.name; 686 wstat.name = (char *) new_dentry->d_name.name;
687 retval = p9_client_wstat(oldfid, &wstat);
811 688
812 retval = v9fs_t_wstat(v9ses, fid, &wstat, &fcall); 689clunk_newdir:
690 p9_client_clunk(olddirfid);
813 691
814 if (retval < 0) 692clunk_olddir:
815 PRINT_FCALL_ERROR("wstat error", fcall); 693 p9_client_clunk(newdirfid);
816
817 kfree(fcall);
818
819Clunk_newdir:
820 v9fs_fid_clunk(v9ses, newdirfid);
821
822Clunk_olddir:
823 v9fs_fid_clunk(v9ses, olddirfid);
824
825Release_lock:
826 up(&oldfid->lock);
827 694
695done:
828 return retval; 696 return retval;
829} 697}
830 698
@@ -840,28 +708,30 @@ static int
840v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 708v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
841 struct kstat *stat) 709 struct kstat *stat)
842{ 710{
843 struct v9fs_fcall *fcall = NULL; 711 int err;
844 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); 712 struct v9fs_session_info *v9ses;
845 struct v9fs_fid *fid = v9fs_fid_clone(dentry); 713 struct p9_fid *fid;
846 int err = -EPERM; 714 struct p9_stat *st;
847 715
848 dprintk(DEBUG_VFS, "dentry: %p\n", dentry); 716 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
849 if(IS_ERR(fid)) 717 err = -EPERM;
718 v9ses = v9fs_inode2v9ses(dentry->d_inode);
719 if (v9ses->cache == CACHE_LOOSE)
720 return simple_getattr(mnt, dentry, stat);
721
722 fid = v9fs_fid_lookup(dentry);
723 if (IS_ERR(fid))
850 return PTR_ERR(fid); 724 return PTR_ERR(fid);
851 725
852 err = v9fs_t_stat(v9ses, fid->fid, &fcall); 726 st = p9_client_stat(fid);
727 if (IS_ERR(st))
728 return PTR_ERR(st);
853 729
854 if (err < 0) 730 v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
855 dprintk(DEBUG_ERROR, "stat error\n");
856 else {
857 v9fs_stat2inode(&fcall->params.rstat.stat, dentry->d_inode,
858 dentry->d_inode->i_sb);
859 generic_fillattr(dentry->d_inode, stat); 731 generic_fillattr(dentry->d_inode, stat);
860 }
861 732
862 kfree(fcall); 733 kfree(st);
863 v9fs_fid_clunk(v9ses, fid); 734 return 0;
864 return err;
865} 735}
866 736
867/** 737/**
@@ -873,13 +743,15 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
873 743
874static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) 744static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
875{ 745{
876 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); 746 int retval;
877 struct v9fs_fid *fid = v9fs_fid_clone(dentry); 747 struct v9fs_session_info *v9ses;
878 struct v9fs_fcall *fcall = NULL; 748 struct p9_fid *fid;
879 struct v9fs_wstat wstat; 749 struct p9_wstat wstat;
880 int res = -EPERM;
881 750
882 dprintk(DEBUG_VFS, "\n"); 751 P9_DPRINTK(P9_DEBUG_VFS, "\n");
752 retval = -EPERM;
753 v9ses = v9fs_inode2v9ses(dentry->d_inode);
754 fid = v9fs_fid_lookup(dentry);
883 if(IS_ERR(fid)) 755 if(IS_ERR(fid))
884 return PTR_ERR(fid); 756 return PTR_ERR(fid);
885 757
@@ -904,17 +776,11 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
904 wstat.n_gid = iattr->ia_gid; 776 wstat.n_gid = iattr->ia_gid;
905 } 777 }
906 778
907 res = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); 779 retval = p9_client_wstat(fid, &wstat);
780 if (retval >= 0)
781 retval = inode_setattr(dentry->d_inode, iattr);
908 782
909 if (res < 0) 783 return retval;
910 PRINT_FCALL_ERROR("wstat error", fcall);
911
912 kfree(fcall);
913 if (res >= 0)
914 res = inode_setattr(dentry->d_inode, iattr);
915
916 v9fs_fid_clunk(v9ses, fid);
917 return res;
918} 784}
919 785
920/** 786/**
@@ -926,7 +792,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
926 */ 792 */
927 793
928void 794void
929v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, 795v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
930 struct super_block *sb) 796 struct super_block *sb)
931{ 797{
932 int n; 798 int n;
@@ -967,8 +833,9 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode,
967 case 'b': 833 case 'b':
968 break; 834 break;
969 default: 835 default:
970 dprintk(DEBUG_ERROR, "Unknown special type %c (%.*s)\n", 836 P9_DPRINTK(P9_DEBUG_ERROR,
971 type, stat->extension.len, stat->extension.str); 837 "Unknown special type %c (%.*s)\n", type,
838 stat->extension.len, stat->extension.str);
972 }; 839 };
973 inode->i_rdev = MKDEV(major, minor); 840 inode->i_rdev = MKDEV(major, minor);
974 } else 841 } else
@@ -976,8 +843,8 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode,
976 843
977 inode->i_size = stat->length; 844 inode->i_size = stat->length;
978 845
979 inode->i_blocks = 846 /* not real number of blocks, but 512 byte ones ... */
980 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 847 inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
981} 848}
982 849
983/** 850/**
@@ -987,7 +854,7 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode,
987 * BUG: potential for inode number collisions? 854 * BUG: potential for inode number collisions?
988 */ 855 */
989 856
990ino_t v9fs_qid2ino(struct v9fs_qid *qid) 857ino_t v9fs_qid2ino(struct p9_qid *qid)
991{ 858{
992 u64 path = qid->path + 2; 859 u64 path = qid->path + 2;
993 ino_t i = 0; 860 ino_t i = 0;
@@ -1010,56 +877,46 @@ ino_t v9fs_qid2ino(struct v9fs_qid *qid)
1010 877
1011static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) 878static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1012{ 879{
1013 int retval = -EPERM; 880 int retval;
1014 881
1015 struct v9fs_fcall *fcall = NULL; 882 struct v9fs_session_info *v9ses;
1016 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); 883 struct p9_fid *fid;
1017 struct v9fs_fid *fid = v9fs_fid_clone(dentry); 884 struct p9_stat *st;
1018 885
886 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
887 retval = -EPERM;
888 v9ses = v9fs_inode2v9ses(dentry->d_inode);
889 fid = v9fs_fid_lookup(dentry);
1019 if(IS_ERR(fid)) 890 if(IS_ERR(fid))
1020 return PTR_ERR(fid); 891 return PTR_ERR(fid);
1021 892
1022 if (!v9ses->extended) { 893 if (!v9ses->extended)
1023 retval = -EBADF; 894 return -EBADF;
1024 dprintk(DEBUG_ERROR, "not extended\n");
1025 goto ClunkFid;
1026 }
1027
1028 dprintk(DEBUG_VFS, " %s\n", dentry->d_name.name);
1029 retval = v9fs_t_stat(v9ses, fid->fid, &fcall);
1030
1031 if (retval < 0) {
1032 dprintk(DEBUG_ERROR, "stat error\n");
1033 goto FreeFcall;
1034 }
1035 895
1036 if (!fcall) { 896 st = p9_client_stat(fid);
1037 retval = -EIO; 897 if (IS_ERR(st))
1038 goto ClunkFid; 898 return PTR_ERR(st);
1039 }
1040 899
1041 if (!(fcall->params.rstat.stat.mode & V9FS_DMSYMLINK)) { 900 if (!(st->mode & P9_DMSYMLINK)) {
1042 retval = -EINVAL; 901 retval = -EINVAL;
1043 goto FreeFcall; 902 goto done;
1044 } 903 }
1045 904
1046 /* copy extension buffer into buffer */ 905 /* copy extension buffer into buffer */
1047 if (fcall->params.rstat.stat.extension.len < buflen) 906 if (st->extension.len < buflen)
1048 buflen = fcall->params.rstat.stat.extension.len + 1; 907 buflen = st->extension.len + 1;
1049 908
1050 memmove(buffer, fcall->params.rstat.stat.extension.str, buflen - 1); 909 memmove(buffer, st->extension.str, buflen - 1);
1051 buffer[buflen-1] = 0; 910 buffer[buflen-1] = 0;
1052 911
1053 dprintk(DEBUG_ERROR, "%s -> %.*s (%s)\n", dentry->d_name.name, fcall->params.rstat.stat.extension.len, 912 P9_DPRINTK(P9_DEBUG_VFS,
1054 fcall->params.rstat.stat.extension.str, buffer); 913 "%s -> %.*s (%s)\n", dentry->d_name.name, st->extension.len,
1055 retval = buflen; 914 st->extension.str, buffer);
1056 915
1057FreeFcall: 916 retval = buflen;
1058 kfree(fcall);
1059
1060ClunkFid:
1061 v9fs_fid_clunk(v9ses, fid);
1062 917
918done:
919 kfree(st);
1063 return retval; 920 return retval;
1064} 921}
1065 922
@@ -1084,14 +941,14 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
1084 if (buflen > PATH_MAX) 941 if (buflen > PATH_MAX)
1085 buflen = PATH_MAX; 942 buflen = PATH_MAX;
1086 943
1087 dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); 944 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
1088 945
1089 retval = v9fs_readlink(dentry, link, buflen); 946 retval = v9fs_readlink(dentry, link, buflen);
1090 947
1091 if (retval > 0) { 948 if (retval > 0) {
1092 if ((ret = copy_to_user(buffer, link, retval)) != 0) { 949 if ((ret = copy_to_user(buffer, link, retval)) != 0) {
1093 dprintk(DEBUG_ERROR, "problem copying to user: %d\n", 950 P9_DPRINTK(P9_DEBUG_ERROR,
1094 ret); 951 "problem copying to user: %d\n", ret);
1095 retval = ret; 952 retval = ret;
1096 } 953 }
1097 } 954 }
@@ -1112,7 +969,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
1112 int len = 0; 969 int len = 0;
1113 char *link = __getname(); 970 char *link = __getname();
1114 971
1115 dprintk(DEBUG_VFS, "%s n", dentry->d_name.name); 972 P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
1116 973
1117 if (!link) 974 if (!link)
1118 link = ERR_PTR(-ENOMEM); 975 link = ERR_PTR(-ENOMEM);
@@ -1141,7 +998,7 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void
1141{ 998{
1142 char *s = nd_get_link(nd); 999 char *s = nd_get_link(nd);
1143 1000
1144 dprintk(DEBUG_VFS, " %s %s\n", dentry->d_name.name, s); 1001 P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, s);
1145 if (!IS_ERR(s)) 1002 if (!IS_ERR(s))
1146 __putname(s); 1003 __putname(s);
1147} 1004}
@@ -1149,66 +1006,24 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void
1149static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, 1006static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1150 int mode, const char *extension) 1007 int mode, const char *extension)
1151{ 1008{
1152 int err; 1009 u32 perm;
1153 u32 fid, perm;
1154 struct v9fs_session_info *v9ses; 1010 struct v9fs_session_info *v9ses;
1155 struct v9fs_fid *dfid, *vfid = NULL; 1011 struct p9_fid *fid;
1156 struct inode *inode = NULL;
1157 1012
1158 v9ses = v9fs_inode2v9ses(dir); 1013 v9ses = v9fs_inode2v9ses(dir);
1159 if (!v9ses->extended) { 1014 if (!v9ses->extended) {
1160 dprintk(DEBUG_ERROR, "not extended\n"); 1015 P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
1161 return -EPERM; 1016 return -EPERM;
1162 } 1017 }
1163 1018
1164 dfid = v9fs_fid_clone(dentry->d_parent);
1165 if(IS_ERR(dfid)) {
1166 err = PTR_ERR(dfid);
1167 goto error;
1168 }
1169
1170 perm = unixmode2p9mode(v9ses, mode); 1019 perm = unixmode2p9mode(v9ses, mode);
1020 fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm,
1021 P9_OREAD);
1022 if (IS_ERR(fid))
1023 return PTR_ERR(fid);
1171 1024
1172 err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, 1025 p9_client_clunk(fid);
1173 perm, V9FS_OREAD, (char *) extension, &fid, NULL, NULL);
1174
1175 if (err)
1176 goto clunk_dfid;
1177
1178 err = v9fs_t_clunk(v9ses, fid);
1179 if (err)
1180 goto clunk_dfid;
1181
1182 vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
1183 if (IS_ERR(vfid)) {
1184 err = PTR_ERR(vfid);
1185 vfid = NULL;
1186 goto clunk_dfid;
1187 }
1188
1189 inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
1190 if (IS_ERR(inode)) {
1191 err = PTR_ERR(inode);
1192 inode = NULL;
1193 goto free_vfid;
1194 }
1195
1196 if(v9ses->cache)
1197 dentry->d_op = &v9fs_cached_dentry_operations;
1198 else
1199 dentry->d_op = &v9fs_dentry_operations;
1200 d_instantiate(dentry, inode);
1201 return 0; 1026 return 0;
1202
1203free_vfid:
1204 v9fs_fid_destroy(vfid);
1205
1206clunk_dfid:
1207 v9fs_fid_clunk(v9ses, dfid);
1208
1209error:
1210 return err;
1211
1212} 1027}
1213 1028
1214/** 1029/**
@@ -1224,8 +1039,8 @@ error:
1224static int 1039static int
1225v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 1040v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1226{ 1041{
1227 dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, 1042 P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino,
1228 symname); 1043 dentry->d_name.name, symname);
1229 1044
1230 return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); 1045 return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname);
1231} 1046}
@@ -1247,11 +1062,11 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1247 struct dentry *dentry) 1062 struct dentry *dentry)
1248{ 1063{
1249 int retval; 1064 int retval;
1250 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); 1065 struct p9_fid *oldfid;
1251 struct v9fs_fid *oldfid;
1252 char *name; 1066 char *name;
1253 1067
1254 dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, 1068 P9_DPRINTK(P9_DEBUG_VFS,
1069 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
1255 old_dentry->d_name.name); 1070 old_dentry->d_name.name);
1256 1071
1257 oldfid = v9fs_fid_clone(old_dentry); 1072 oldfid = v9fs_fid_clone(old_dentry);
@@ -1265,11 +1080,11 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1265 } 1080 }
1266 1081
1267 sprintf(name, "%d\n", oldfid->fid); 1082 sprintf(name, "%d\n", oldfid->fid);
1268 retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name); 1083 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
1269 __putname(name); 1084 __putname(name);
1270 1085
1271clunk_fid: 1086clunk_fid:
1272 v9fs_fid_clunk(v9ses, oldfid); 1087 p9_client_clunk(oldfid);
1273 return retval; 1088 return retval;
1274} 1089}
1275 1090
@@ -1288,7 +1103,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1288 int retval; 1103 int retval;
1289 char *name; 1104 char *name;
1290 1105
1291 dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, 1106 P9_DPRINTK(P9_DEBUG_VFS,
1107 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
1292 dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); 1108 dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
1293 1109
1294 if (!new_valid_dev(rdev)) 1110 if (!new_valid_dev(rdev))
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 7bdf8b3268..ba90437121 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,10 +37,10 @@
37#include <linux/mount.h> 37#include <linux/mount.h>
38#include <linux/idr.h> 38#include <linux/idr.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <net/9p/9p.h>
41#include <net/9p/client.h>
40 42
41#include "debug.h"
42#include "v9fs.h" 43#include "v9fs.h"
43#include "9p.h"
44#include "v9fs_vfs.h" 44#include "v9fs_vfs.h"
45#include "fid.h" 45#include "fid.h"
46 46
@@ -107,41 +107,48 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
107 struct vfsmount *mnt) 107 struct vfsmount *mnt)
108{ 108{
109 struct super_block *sb = NULL; 109 struct super_block *sb = NULL;
110 struct v9fs_fcall *fcall = NULL;
111 struct inode *inode = NULL; 110 struct inode *inode = NULL;
112 struct dentry *root = NULL; 111 struct dentry *root = NULL;
113 struct v9fs_session_info *v9ses = NULL; 112 struct v9fs_session_info *v9ses = NULL;
114 struct v9fs_fid *root_fid = NULL; 113 struct p9_stat *st = NULL;
115 int mode = S_IRWXUGO | S_ISVTX; 114 int mode = S_IRWXUGO | S_ISVTX;
116 uid_t uid = current->fsuid; 115 uid_t uid = current->fsuid;
117 gid_t gid = current->fsgid; 116 gid_t gid = current->fsgid;
118 int stat_result = 0; 117 struct p9_fid *fid;
119 int newfid = 0;
120 int retval = 0; 118 int retval = 0;
121 119
122 dprintk(DEBUG_VFS, " \n"); 120 P9_DPRINTK(P9_DEBUG_VFS, " \n");
123 121
124 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); 122 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
125 if (!v9ses) 123 if (!v9ses)
126 return -ENOMEM; 124 return -ENOMEM;
127 125
128 if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { 126 fid = v9fs_session_init(v9ses, dev_name, data);
129 dprintk(DEBUG_ERROR, "problem initiating session\n"); 127 if (IS_ERR(fid)) {
130 retval = newfid; 128 retval = PTR_ERR(fid);
131 goto out_free_session; 129 fid = NULL;
130 kfree(v9ses);
131 v9ses = NULL;
132 goto error;
133 }
134
135 st = p9_client_stat(fid);
136 if (IS_ERR(st)) {
137 retval = PTR_ERR(st);
138 goto error;
132 } 139 }
133 140
134 sb = sget(fs_type, NULL, v9fs_set_super, v9ses); 141 sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
135 if (IS_ERR(sb)) { 142 if (IS_ERR(sb)) {
136 retval = PTR_ERR(sb); 143 retval = PTR_ERR(sb);
137 goto out_close_session; 144 goto error;
138 } 145 }
139 v9fs_fill_super(sb, v9ses, flags); 146 v9fs_fill_super(sb, v9ses, flags);
140 147
141 inode = v9fs_get_inode(sb, S_IFDIR | mode); 148 inode = v9fs_get_inode(sb, S_IFDIR | mode);
142 if (IS_ERR(inode)) { 149 if (IS_ERR(inode)) {
143 retval = PTR_ERR(inode); 150 retval = PTR_ERR(inode);
144 goto put_back_sb; 151 goto error;
145 } 152 }
146 153
147 inode->i_uid = uid; 154 inode->i_uid = uid;
@@ -150,54 +157,30 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
150 root = d_alloc_root(inode); 157 root = d_alloc_root(inode);
151 if (!root) { 158 if (!root) {
152 retval = -ENOMEM; 159 retval = -ENOMEM;
153 goto put_back_sb; 160 goto error;
154 } 161 }
155 162
156 sb->s_root = root; 163 sb->s_root = root;
164 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
165 v9fs_stat2inode(st, root->d_inode, sb);
166 v9fs_fid_add(root, fid);
157 167
158 stat_result = v9fs_t_stat(v9ses, newfid, &fcall); 168 return simple_set_mnt(mnt, sb);
159 if (stat_result < 0) {
160 dprintk(DEBUG_ERROR, "stat error\n");
161 v9fs_t_clunk(v9ses, newfid);
162 } else {
163 /* Setup the Root Inode */
164 root_fid = v9fs_fid_create(v9ses, newfid);
165 if (root_fid == NULL) {
166 retval = -ENOMEM;
167 goto put_back_sb;
168 }
169
170 retval = v9fs_fid_insert(root_fid, root);
171 if (retval < 0) {
172 kfree(fcall);
173 goto put_back_sb;
174 }
175
176 root_fid->qid = fcall->params.rstat.stat.qid;
177 root->d_inode->i_ino =
178 v9fs_qid2ino(&fcall->params.rstat.stat.qid);
179 v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb);
180 }
181 169
182 kfree(fcall); 170error:
171 if (fid)
172 p9_client_clunk(fid);
183 173
184 if (stat_result < 0) { 174 if (v9ses) {
185 retval = stat_result; 175 v9fs_session_close(v9ses);
186 goto put_back_sb; 176 kfree(v9ses);
187 } 177 }
188 178
189 return simple_set_mnt(mnt, sb); 179 if (sb) {
190 180 up_write(&sb->s_umount);
191out_close_session: 181 deactivate_super(sb);
192 v9fs_session_close(v9ses); 182 }
193out_free_session:
194 kfree(v9ses);
195 return retval;
196 183
197put_back_sb:
198 /* deactivate_super calls v9fs_kill_super which will frees the rest */
199 up_write(&sb->s_umount);
200 deactivate_super(sb);
201 return retval; 184 return retval;
202} 185}
203 186
@@ -211,7 +194,7 @@ static void v9fs_kill_super(struct super_block *s)
211{ 194{
212 struct v9fs_session_info *v9ses = s->s_fs_info; 195 struct v9fs_session_info *v9ses = s->s_fs_info;
213 196
214 dprintk(DEBUG_VFS, " %p\n", s); 197 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
215 198
216 v9fs_dentry_release(s->s_root); /* clunk root */ 199 v9fs_dentry_release(s->s_root); /* clunk root */
217 200
@@ -219,7 +202,7 @@ static void v9fs_kill_super(struct super_block *s)
219 202
220 v9fs_session_close(v9ses); 203 v9fs_session_close(v9ses);
221 kfree(v9ses); 204 kfree(v9ses);
222 dprintk(DEBUG_VFS, "exiting kill_super\n"); 205 P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
223} 206}
224 207
225/** 208/**
@@ -234,7 +217,7 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
234 struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info; 217 struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
235 218
236 if (v9ses->debug != 0) 219 if (v9ses->debug != 0)
237 seq_printf(m, ",debug=%u", v9ses->debug); 220 seq_printf(m, ",debug=%x", v9ses->debug);
238 if (v9ses->port != V9FS_PORT) 221 if (v9ses->port != V9FS_PORT)
239 seq_printf(m, ",port=%u", v9ses->port); 222 seq_printf(m, ",port=%u", v9ses->port);
240 if (v9ses->maxdata != 9000) 223 if (v9ses->maxdata != 9000)
diff --git a/fs/Kconfig b/fs/Kconfig
index 0fa0c1193e..6a649902c5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -251,7 +251,7 @@ config JBD2
251 251
252config JBD2_DEBUG 252config JBD2_DEBUG
253 bool "JBD2 (ext4dev/ext4) debugging support" 253 bool "JBD2 (ext4dev/ext4) debugging support"
254 depends on JBD2 254 depends on JBD2 && DEBUG_FS
255 help 255 help
256 If you are using the ext4dev/ext4 journaled file system (or 256 If you are using the ext4dev/ext4 journaled file system (or
257 potentially any other filesystem/device using JBD2), this option 257 potentially any other filesystem/device using JBD2), this option
@@ -260,10 +260,10 @@ config JBD2_DEBUG
260 By default, the debugging output will be turned off. 260 By default, the debugging output will be turned off.
261 261
262 If you select Y here, then you will be able to turn on debugging 262 If you select Y here, then you will be able to turn on debugging
263 with "echo N > /proc/sys/fs/jbd2-debug", where N is a number between 263 with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
264 1 and 5. The higher the number, the more debugging output is 264 number between 1 and 5. The higher the number, the more debugging
265 generated. To turn debugging off again, do 265 output is generated. To turn debugging off again, do
266 "echo 0 > /proc/sys/fs/jbd2-debug". 266 "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
267 267
268config FS_MBCACHE 268config FS_MBCACHE
269# Meta block cache for Extended Attributes (ext2/ext3/ext4) 269# Meta block cache for Extended Attributes (ext2/ext3/ext4)
@@ -991,7 +991,7 @@ config TMPFS_POSIX_ACL
991 991
992config HUGETLBFS 992config HUGETLBFS
993 bool "HugeTLB file system support" 993 bool "HugeTLB file system support"
994 depends on X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN 994 depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || BROKEN
995 help 995 help
996 hugetlbfs is a filesystem backing for HugeTLB pages, based on 996 hugetlbfs is a filesystem backing for HugeTLB pages, based on
997 ramfs. For architectures that support it, say Y here and read 997 ramfs. For architectures that support it, say Y here and read
@@ -1675,6 +1675,7 @@ config NFSD_V3_ACL
1675config NFSD_V4 1675config NFSD_V4
1676 bool "Provide NFSv4 server support (EXPERIMENTAL)" 1676 bool "Provide NFSv4 server support (EXPERIMENTAL)"
1677 depends on NFSD_V3 && EXPERIMENTAL 1677 depends on NFSD_V3 && EXPERIMENTAL
1678 select RPCSEC_GSS_KRB5
1678 help 1679 help
1679 If you would like to include the NFSv4 server as well as the NFSv2 1680 If you would like to include the NFSv4 server as well as the NFSv2
1680 and NFSv3 servers, say Y here. This feature is experimental, and 1681 and NFSv3 servers, say Y here. This feature is experimental, and
@@ -2048,7 +2049,7 @@ config AFS_DEBUG
2048 2049
2049config 9P_FS 2050config 9P_FS
2050 tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)" 2051 tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
2051 depends on INET && EXPERIMENTAL 2052 depends on INET && NET_9P && EXPERIMENTAL
2052 help 2053 help
2053 If you say Y here, you will get experimental support for 2054 If you say Y here, you will get experimental support for
2054 Plan 9 resource sharing via the 9P2000 protocol. 2055 Plan 9 resource sharing via the 9P2000 protocol.
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index f544a28559..36e381c6a9 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -33,7 +33,7 @@ const struct file_operations adfs_file_operations = {
33 .fsync = file_fsync, 33 .fsync = file_fsync,
34 .write = do_sync_write, 34 .write = do_sync_write,
35 .aio_write = generic_file_aio_write, 35 .aio_write = generic_file_aio_write,
36 .sendfile = generic_file_sendfile, 36 .splice_read = generic_file_splice_read,
37}; 37};
38 38
39const struct inode_operations adfs_file_inode_operations = { 39const struct inode_operations adfs_file_inode_operations = {
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c8796906f5..c314a35f09 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -35,7 +35,7 @@ const struct file_operations affs_file_operations = {
35 .open = affs_file_open, 35 .open = affs_file_open,
36 .release = affs_file_release, 36 .release = affs_file_release,
37 .fsync = file_fsync, 37 .fsync = file_fsync,
38 .sendfile = generic_file_sendfile, 38 .splice_read = generic_file_splice_read,
39}; 39};
40 40
41const struct inode_operations affs_file_inode_operations = { 41const struct inode_operations affs_file_inode_operations = {
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 73ce561f3e..a66671082c 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -8,6 +8,7 @@ kafs-objs := \
8 cmservice.o \ 8 cmservice.o \
9 dir.o \ 9 dir.o \
10 file.o \ 10 file.o \
11 flock.o \
11 fsclient.o \ 12 fsclient.o \
12 inode.o \ 13 inode.o \
13 main.o \ 14 main.o \
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 2452579481..c548aa346f 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -37,6 +37,13 @@ typedef enum {
37 AFS_FTYPE_SYMLINK = 3, 37 AFS_FTYPE_SYMLINK = 3,
38} afs_file_type_t; 38} afs_file_type_t;
39 39
40typedef enum {
41 AFS_LOCK_READ = 0, /* read lock request */
42 AFS_LOCK_WRITE = 1, /* write lock request */
43} afs_lock_type_t;
44
45#define AFS_LOCKWAIT (5 * 60) /* time until a lock times out (seconds) */
46
40/* 47/*
41 * AFS file identifier 48 * AFS file identifier
42 */ 49 */
@@ -120,6 +127,7 @@ struct afs_file_status {
120 struct afs_fid parent; /* parent dir ID for non-dirs only */ 127 struct afs_fid parent; /* parent dir ID for non-dirs only */
121 time_t mtime_client; /* last time client changed data */ 128 time_t mtime_client; /* last time client changed data */
122 time_t mtime_server; /* last time server changed data */ 129 time_t mtime_server; /* last time server changed data */
130 s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */
123}; 131};
124 132
125/* 133/*
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index a18c374ebe..eb647323d8 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -31,6 +31,9 @@ enum AFS_FS_Operations {
31 FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */ 31 FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */
32 FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */ 32 FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */
33 FSGETROOTVOLUME = 151, /* AFS Get root volume name */ 33 FSGETROOTVOLUME = 151, /* AFS Get root volume name */
34 FSSETLOCK = 156, /* AFS Request a file lock */
35 FSEXTENDLOCK = 157, /* AFS Extend a file lock */
36 FSRELEASELOCK = 158, /* AFS Release a file lock */
34 FSLOOKUP = 161, /* AFS lookup file in directory */ 37 FSLOOKUP = 161, /* AFS lookup file in directory */
35 FSFETCHDATA64 = 65537, /* AFS Fetch file data */ 38 FSFETCHDATA64 = 65537, /* AFS Fetch file data */
36 FSSTOREDATA64 = 65538, /* AFS Store file data */ 39 FSSTOREDATA64 = 65538, /* AFS Store file data */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index bacf518c6f..b824394581 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -125,6 +125,9 @@ static void afs_break_callback(struct afs_server *server,
125 spin_unlock(&server->cb_lock); 125 spin_unlock(&server->cb_lock);
126 126
127 queue_work(afs_callback_update_worker, &vnode->cb_broken_work); 127 queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
128 if (list_empty(&vnode->granted_locks) &&
129 !list_empty(&vnode->pending_locks))
130 afs_lock_may_be_available(vnode);
128 spin_unlock(&vnode->lock); 131 spin_unlock(&vnode->lock);
129 } 132 }
130} 133}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 546c59522e..33fe39ad4e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -44,6 +44,7 @@ const struct file_operations afs_dir_file_operations = {
44 .open = afs_dir_open, 44 .open = afs_dir_open,
45 .release = afs_release, 45 .release = afs_release,
46 .readdir = afs_readdir, 46 .readdir = afs_readdir,
47 .lock = afs_lock,
47}; 48};
48 49
49const struct inode_operations afs_dir_inode_operations = { 50const struct inode_operations afs_dir_inode_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 9c0e721d9f..525f7c56e0 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -32,8 +32,10 @@ const struct file_operations afs_file_operations = {
32 .aio_read = generic_file_aio_read, 32 .aio_read = generic_file_aio_read,
33 .aio_write = afs_file_write, 33 .aio_write = afs_file_write,
34 .mmap = generic_file_readonly_mmap, 34 .mmap = generic_file_readonly_mmap,
35 .sendfile = generic_file_sendfile, 35 .splice_read = generic_file_splice_read,
36 .fsync = afs_fsync, 36 .fsync = afs_fsync,
37 .lock = afs_lock,
38 .flock = afs_flock,
37}; 39};
38 40
39const struct inode_operations afs_file_inode_operations = { 41const struct inode_operations afs_file_inode_operations = {
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
new file mode 100644
index 0000000000..8f07f8d1bf
--- /dev/null
+++ b/fs/afs/flock.c
@@ -0,0 +1,558 @@
1/* AFS file locking support
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/smp_lock.h>
13#include "internal.h"
14
15#define AFS_LOCK_GRANTED 0
16#define AFS_LOCK_PENDING 1
17
18static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl);
19static void afs_fl_release_private(struct file_lock *fl);
20
21static struct workqueue_struct *afs_lock_manager;
22
23static struct file_lock_operations afs_lock_ops = {
24 .fl_copy_lock = afs_fl_copy_lock,
25 .fl_release_private = afs_fl_release_private,
26};
27
28/*
29 * initialise the lock manager thread if it isn't already running
30 */
31static int afs_init_lock_manager(void)
32{
33 if (!afs_lock_manager) {
34 afs_lock_manager = create_singlethread_workqueue("kafs_lockd");
35 if (!afs_lock_manager)
36 return -ENOMEM;
37 }
38 return 0;
39}
40
41/*
42 * destroy the lock manager thread if it's running
43 */
44void __exit afs_kill_lock_manager(void)
45{
46 if (afs_lock_manager)
47 destroy_workqueue(afs_lock_manager);
48}
49
50/*
51 * if the callback is broken on this vnode, then the lock may now be available
52 */
53void afs_lock_may_be_available(struct afs_vnode *vnode)
54{
55 _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
56
57 queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
58}
59
60/*
61 * the lock will time out in 5 minutes unless we extend it, so schedule
62 * extension in a bit less than that time
63 */
64static void afs_schedule_lock_extension(struct afs_vnode *vnode)
65{
66 queue_delayed_work(afs_lock_manager, &vnode->lock_work,
67 AFS_LOCKWAIT * HZ / 2);
68}
69
70/*
71 * do work for a lock, including:
72 * - probing for a lock we're waiting on but didn't get immediately
73 * - extending a lock that's close to timing out
74 */
75void afs_lock_work(struct work_struct *work)
76{
77 struct afs_vnode *vnode =
78 container_of(work, struct afs_vnode, lock_work.work);
79 struct file_lock *fl;
80 afs_lock_type_t type;
81 struct key *key;
82 int ret;
83
84 _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
85
86 spin_lock(&vnode->lock);
87
88 if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) {
89 _debug("unlock");
90 spin_unlock(&vnode->lock);
91
92 /* attempt to release the server lock; if it fails, we just
93 * wait 5 minutes and it'll time out anyway */
94 ret = afs_vnode_release_lock(vnode, vnode->unlock_key);
95 if (ret < 0)
96 printk(KERN_WARNING "AFS:"
97 " Failed to release lock on {%x:%x} error %d\n",
98 vnode->fid.vid, vnode->fid.vnode, ret);
99
100 spin_lock(&vnode->lock);
101 key_put(vnode->unlock_key);
102 vnode->unlock_key = NULL;
103 clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags);
104 }
105
106 /* if we've got a lock, then it must be time to extend that lock as AFS
107 * locks time out after 5 minutes */
108 if (!list_empty(&vnode->granted_locks)) {
109 _debug("extend");
110
111 if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
112 BUG();
113 fl = list_entry(vnode->granted_locks.next,
114 struct file_lock, fl_u.afs.link);
115 key = key_get(fl->fl_file->private_data);
116 spin_unlock(&vnode->lock);
117
118 ret = afs_vnode_extend_lock(vnode, key);
119 clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
120 key_put(key);
121 switch (ret) {
122 case 0:
123 afs_schedule_lock_extension(vnode);
124 break;
125 default:
126 /* ummm... we failed to extend the lock - retry
127 * extension shortly */
128 printk(KERN_WARNING "AFS:"
129 " Failed to extend lock on {%x:%x} error %d\n",
130 vnode->fid.vid, vnode->fid.vnode, ret);
131 queue_delayed_work(afs_lock_manager, &vnode->lock_work,
132 HZ * 10);
133 break;
134 }
135 _leave(" [extend]");
136 return;
137 }
138
139 /* if we don't have a granted lock, then we must've been called back by
140 * the server, and so if might be possible to get a lock we're
141 * currently waiting for */
142 if (!list_empty(&vnode->pending_locks)) {
143 _debug("get");
144
145 if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
146 BUG();
147 fl = list_entry(vnode->pending_locks.next,
148 struct file_lock, fl_u.afs.link);
149 key = key_get(fl->fl_file->private_data);
150 type = (fl->fl_type == F_RDLCK) ?
151 AFS_LOCK_READ : AFS_LOCK_WRITE;
152 spin_unlock(&vnode->lock);
153
154 ret = afs_vnode_set_lock(vnode, key, type);
155 clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
156 switch (ret) {
157 case -EWOULDBLOCK:
158 _debug("blocked");
159 break;
160 case 0:
161 _debug("acquired");
162 if (type == AFS_LOCK_READ)
163 set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
164 else
165 set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
166 ret = AFS_LOCK_GRANTED;
167 default:
168 spin_lock(&vnode->lock);
169 /* the pending lock may have been withdrawn due to a
170 * signal */
171 if (list_entry(vnode->pending_locks.next,
172 struct file_lock, fl_u.afs.link) == fl) {
173 fl->fl_u.afs.state = ret;
174 if (ret == AFS_LOCK_GRANTED)
175 list_move_tail(&fl->fl_u.afs.link,
176 &vnode->granted_locks);
177 else
178 list_del_init(&fl->fl_u.afs.link);
179 wake_up(&fl->fl_wait);
180 spin_unlock(&vnode->lock);
181 } else {
182 _debug("withdrawn");
183 clear_bit(AFS_VNODE_READLOCKED, &vnode->flags);
184 clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
185 spin_unlock(&vnode->lock);
186 afs_vnode_release_lock(vnode, key);
187 if (!list_empty(&vnode->pending_locks))
188 afs_lock_may_be_available(vnode);
189 }
190 break;
191 }
192 key_put(key);
193 _leave(" [pend]");
194 return;
195 }
196
197 /* looks like the lock request was withdrawn on a signal */
198 spin_unlock(&vnode->lock);
199 _leave(" [no locks]");
200}
201
202/*
203 * pass responsibility for the unlocking of a vnode on the server to the
204 * manager thread, lest a pending signal in the calling thread interrupt
205 * AF_RXRPC
206 * - the caller must hold the vnode lock
207 */
208static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
209{
210 cancel_delayed_work(&vnode->lock_work);
211 if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) &&
212 !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags))
213 BUG();
214 if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags))
215 BUG();
216 vnode->unlock_key = key_get(key);
217 afs_lock_may_be_available(vnode);
218}
219
220/*
221 * request a lock on a file on the server
222 */
223static int afs_do_setlk(struct file *file, struct file_lock *fl)
224{
225 struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
226 afs_lock_type_t type;
227 struct key *key = file->private_data;
228 int ret;
229
230 _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
231
232 /* only whole-file locks are supported */
233 if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
234 return -EINVAL;
235
236 ret = afs_init_lock_manager();
237 if (ret < 0)
238 return ret;
239
240 fl->fl_ops = &afs_lock_ops;
241 INIT_LIST_HEAD(&fl->fl_u.afs.link);
242 fl->fl_u.afs.state = AFS_LOCK_PENDING;
243
244 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
245
246 lock_kernel();
247
248 /* make sure we've got a callback on this file and that our view of the
249 * data version is up to date */
250 ret = afs_vnode_fetch_status(vnode, NULL, key);
251 if (ret < 0)
252 goto error;
253
254 if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) {
255 ret = -EAGAIN;
256 goto error;
257 }
258
259 spin_lock(&vnode->lock);
260
261 if (list_empty(&vnode->pending_locks)) {
262 /* if there's no-one else with a lock on this vnode, then we
263 * need to ask the server for a lock */
264 if (list_empty(&vnode->granted_locks)) {
265 _debug("not locked");
266 ASSERTCMP(vnode->flags &
267 ((1 << AFS_VNODE_LOCKING) |
268 (1 << AFS_VNODE_READLOCKED) |
269 (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
270 list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
271 set_bit(AFS_VNODE_LOCKING, &vnode->flags);
272 spin_unlock(&vnode->lock);
273
274 ret = afs_vnode_set_lock(vnode, key, type);
275 clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
276 switch (ret) {
277 case 0:
278 goto acquired_server_lock;
279 case -EWOULDBLOCK:
280 spin_lock(&vnode->lock);
281 ASSERT(list_empty(&vnode->granted_locks));
282 ASSERTCMP(vnode->pending_locks.next, ==,
283 &fl->fl_u.afs.link);
284 goto wait;
285 default:
286 spin_lock(&vnode->lock);
287 list_del_init(&fl->fl_u.afs.link);
288 spin_unlock(&vnode->lock);
289 goto error;
290 }
291 }
292
293 /* if we've already got a readlock on the server and no waiting
294 * writelocks, then we might be able to instantly grant another
295 * readlock */
296 if (type == AFS_LOCK_READ &&
297 vnode->flags & (1 << AFS_VNODE_READLOCKED)) {
298 _debug("instant readlock");
299 ASSERTCMP(vnode->flags &
300 ((1 << AFS_VNODE_LOCKING) |
301 (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
302 ASSERT(!list_empty(&vnode->granted_locks));
303 goto sharing_existing_lock;
304 }
305 }
306
307 /* otherwise, we need to wait for a local lock to become available */
308 _debug("wait local");
309 list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
310wait:
311 if (!(fl->fl_flags & FL_SLEEP)) {
312 _debug("noblock");
313 ret = -EAGAIN;
314 goto abort_attempt;
315 }
316 spin_unlock(&vnode->lock);
317
318 /* now we need to sleep and wait for the lock manager thread to get the
319 * lock from the server */
320 _debug("sleep");
321 ret = wait_event_interruptible(fl->fl_wait,
322 fl->fl_u.afs.state <= AFS_LOCK_GRANTED);
323 if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
324 ret = fl->fl_u.afs.state;
325 if (ret < 0)
326 goto error;
327 spin_lock(&vnode->lock);
328 goto given_lock;
329 }
330
331 /* we were interrupted, but someone may still be in the throes of
332 * giving us the lock */
333 _debug("intr");
334 ASSERTCMP(ret, ==, -ERESTARTSYS);
335
336 spin_lock(&vnode->lock);
337 if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
338 ret = fl->fl_u.afs.state;
339 if (ret < 0) {
340 spin_unlock(&vnode->lock);
341 goto error;
342 }
343 goto given_lock;
344 }
345
346abort_attempt:
347 /* we aren't going to get the lock, either because we're unwilling to
348 * wait, or because some signal happened */
349 _debug("abort");
350 if (list_empty(&vnode->granted_locks) &&
351 vnode->pending_locks.next == &fl->fl_u.afs.link) {
352 if (vnode->pending_locks.prev != &fl->fl_u.afs.link) {
353 /* kick the next pending lock into having a go */
354 list_del_init(&fl->fl_u.afs.link);
355 afs_lock_may_be_available(vnode);
356 }
357 } else {
358 list_del_init(&fl->fl_u.afs.link);
359 }
360 spin_unlock(&vnode->lock);
361 goto error;
362
363acquired_server_lock:
364 /* we've acquired a server lock, but it needs to be renewed after 5
365 * mins */
366 spin_lock(&vnode->lock);
367 afs_schedule_lock_extension(vnode);
368 if (type == AFS_LOCK_READ)
369 set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
370 else
371 set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
372sharing_existing_lock:
373 /* the lock has been granted as far as we're concerned... */
374 fl->fl_u.afs.state = AFS_LOCK_GRANTED;
375 list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
376given_lock:
377 /* ... but we do still need to get the VFS's blessing */
378 ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING)));
379 ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) |
380 (1 << AFS_VNODE_WRITELOCKED))) != 0);
381 ret = posix_lock_file(file, fl, NULL);
382 if (ret < 0)
383 goto vfs_rejected_lock;
384 spin_unlock(&vnode->lock);
385
386 /* again, make sure we've got a callback on this file and, again, make
387 * sure that our view of the data version is up to date (we ignore
388 * errors incurred here and deal with the consequences elsewhere) */
389 afs_vnode_fetch_status(vnode, NULL, key);
390
391error:
392 unlock_kernel();
393 _leave(" = %d", ret);
394 return ret;
395
396vfs_rejected_lock:
397 /* the VFS rejected the lock we just obtained, so we have to discard
398 * what we just got */
399 _debug("vfs refused %d", ret);
400 list_del_init(&fl->fl_u.afs.link);
401 if (list_empty(&vnode->granted_locks))
402 afs_defer_unlock(vnode, key);
403 spin_unlock(&vnode->lock);
404 goto abort_attempt;
405}
406
407/*
408 * unlock on a file on the server
409 */
410static int afs_do_unlk(struct file *file, struct file_lock *fl)
411{
412 struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
413 struct key *key = file->private_data;
414 int ret;
415
416 _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
417
418 /* only whole-file unlocks are supported */
419 if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
420 return -EINVAL;
421
422 fl->fl_ops = &afs_lock_ops;
423 INIT_LIST_HEAD(&fl->fl_u.afs.link);
424 fl->fl_u.afs.state = AFS_LOCK_PENDING;
425
426 spin_lock(&vnode->lock);
427 ret = posix_lock_file(file, fl, NULL);
428 if (ret < 0) {
429 spin_unlock(&vnode->lock);
430 _leave(" = %d [vfs]", ret);
431 return ret;
432 }
433
434 /* discard the server lock only if all granted locks are gone */
435 if (list_empty(&vnode->granted_locks))
436 afs_defer_unlock(vnode, key);
437 spin_unlock(&vnode->lock);
438 _leave(" = 0");
439 return 0;
440}
441
442/*
443 * return information about a lock we currently hold, if indeed we hold one
444 */
445static int afs_do_getlk(struct file *file, struct file_lock *fl)
446{
447 struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
448 struct key *key = file->private_data;
449 int ret, lock_count;
450
451 _enter("");
452
453 fl->fl_type = F_UNLCK;
454
455 mutex_lock(&vnode->vfs_inode.i_mutex);
456
457 /* check local lock records first */
458 ret = 0;
459 if (posix_test_lock(file, fl) == 0) {
460 /* no local locks; consult the server */
461 ret = afs_vnode_fetch_status(vnode, NULL, key);
462 if (ret < 0)
463 goto error;
464 lock_count = vnode->status.lock_count;
465 if (lock_count) {
466 if (lock_count > 0)
467 fl->fl_type = F_RDLCK;
468 else
469 fl->fl_type = F_WRLCK;
470 fl->fl_start = 0;
471 fl->fl_end = OFFSET_MAX;
472 }
473 }
474
475error:
476 mutex_unlock(&vnode->vfs_inode.i_mutex);
477 _leave(" = %d [%hd]", ret, fl->fl_type);
478 return ret;
479}
480
481/*
482 * manage POSIX locks on a file
483 */
484int afs_lock(struct file *file, int cmd, struct file_lock *fl)
485{
486 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
487
488 _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
489 vnode->fid.vid, vnode->fid.vnode, cmd,
490 fl->fl_type, fl->fl_flags,
491 (long long) fl->fl_start, (long long) fl->fl_end);
492
493 /* AFS doesn't support mandatory locks */
494 if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
495 fl->fl_type != F_UNLCK)
496 return -ENOLCK;
497
498 if (IS_GETLK(cmd))
499 return afs_do_getlk(file, fl);
500 if (fl->fl_type == F_UNLCK)
501 return afs_do_unlk(file, fl);
502 return afs_do_setlk(file, fl);
503}
504
505/*
506 * manage FLOCK locks on a file
507 */
508int afs_flock(struct file *file, int cmd, struct file_lock *fl)
509{
510 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
511
512 _enter("{%x:%u},%d,{t=%x,fl=%x}",
513 vnode->fid.vid, vnode->fid.vnode, cmd,
514 fl->fl_type, fl->fl_flags);
515
516 /*
517 * No BSD flocks over NFS allowed.
518 * Note: we could try to fake a POSIX lock request here by
519 * using ((u32) filp | 0x80000000) or some such as the pid.
520 * Not sure whether that would be unique, though, or whether
521 * that would break in other places.
522 */
523 if (!(fl->fl_flags & FL_FLOCK))
524 return -ENOLCK;
525
526 /* we're simulating flock() locks using posix locks on the server */
527 fl->fl_owner = (fl_owner_t) file;
528 fl->fl_start = 0;
529 fl->fl_end = OFFSET_MAX;
530
531 if (fl->fl_type == F_UNLCK)
532 return afs_do_unlk(file, fl);
533 return afs_do_setlk(file, fl);
534}
535
536/*
537 * the POSIX lock management core VFS code copies the lock record and adds the
538 * copy into its own list, so we need to add that copy to the vnode's lock
539 * queue in the same place as the original (which will be deleted shortly
540 * after)
541 */
542static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
543{
544 _enter("");
545
546 list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link);
547}
548
549/*
550 * need to remove this lock from the vnode queue when it's removed from the
551 * VFS's list
552 */
553static void afs_fl_release_private(struct file_lock *fl)
554{
555 _enter("");
556
557 list_del_init(&fl->fl_u.afs.link);
558}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 5dff1308b6..023b95b0d9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -67,7 +67,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
67 EXTRACT(status->group); 67 EXTRACT(status->group);
68 bp++; /* sync counter */ 68 bp++; /* sync counter */
69 data_version |= (u64) ntohl(*bp++) << 32; 69 data_version |= (u64) ntohl(*bp++) << 32;
70 bp++; /* lock count */ 70 EXTRACT(status->lock_count);
71 size |= (u64) ntohl(*bp++) << 32; 71 size |= (u64) ntohl(*bp++) << 32;
72 bp++; /* spare 4 */ 72 bp++; /* spare 4 */
73 *_bp = bp; 73 *_bp = bp;
@@ -1748,3 +1748,156 @@ int afs_fs_get_volume_status(struct afs_server *server,
1748 1748
1749 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); 1749 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
1750} 1750}
1751
1752/*
1753 * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock
1754 */
1755static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
1756 struct sk_buff *skb, bool last)
1757{
1758 const __be32 *bp;
1759
1760 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
1761
1762 afs_transfer_reply(call, skb);
1763 if (!last)
1764 return 0;
1765
1766 if (call->reply_size != call->reply_max)
1767 return -EBADMSG;
1768
1769 /* unmarshall the reply once we've received all of it */
1770 bp = call->buffer;
1771 /* xdr_decode_AFSVolSync(&bp, call->replyX); */
1772
1773 _leave(" = 0 [done]");
1774 return 0;
1775}
1776
1777/*
1778 * FS.SetLock operation type
1779 */
1780static const struct afs_call_type afs_RXFSSetLock = {
1781 .name = "FS.SetLock",
1782 .deliver = afs_deliver_fs_xxxx_lock,
1783 .abort_to_error = afs_abort_to_error,
1784 .destructor = afs_flat_call_destructor,
1785};
1786
1787/*
1788 * FS.ExtendLock operation type
1789 */
1790static const struct afs_call_type afs_RXFSExtendLock = {
1791 .name = "FS.ExtendLock",
1792 .deliver = afs_deliver_fs_xxxx_lock,
1793 .abort_to_error = afs_abort_to_error,
1794 .destructor = afs_flat_call_destructor,
1795};
1796
1797/*
1798 * FS.ReleaseLock operation type
1799 */
1800static const struct afs_call_type afs_RXFSReleaseLock = {
1801 .name = "FS.ReleaseLock",
1802 .deliver = afs_deliver_fs_xxxx_lock,
1803 .abort_to_error = afs_abort_to_error,
1804 .destructor = afs_flat_call_destructor,
1805};
1806
1807/*
1808 * get a lock on a file
1809 */
1810int afs_fs_set_lock(struct afs_server *server,
1811 struct key *key,
1812 struct afs_vnode *vnode,
1813 afs_lock_type_t type,
1814 const struct afs_wait_mode *wait_mode)
1815{
1816 struct afs_call *call;
1817 __be32 *bp;
1818
1819 _enter("");
1820
1821 call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4);
1822 if (!call)
1823 return -ENOMEM;
1824
1825 call->key = key;
1826 call->reply = vnode;
1827 call->service_id = FS_SERVICE;
1828 call->port = htons(AFS_FS_PORT);
1829
1830 /* marshall the parameters */
1831 bp = call->request;
1832 *bp++ = htonl(FSSETLOCK);
1833 *bp++ = htonl(vnode->fid.vid);
1834 *bp++ = htonl(vnode->fid.vnode);
1835 *bp++ = htonl(vnode->fid.unique);
1836 *bp++ = htonl(type);
1837
1838 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
1839}
1840
1841/*
1842 * extend a lock on a file
1843 */
1844int afs_fs_extend_lock(struct afs_server *server,
1845 struct key *key,
1846 struct afs_vnode *vnode,
1847 const struct afs_wait_mode *wait_mode)
1848{
1849 struct afs_call *call;
1850 __be32 *bp;
1851
1852 _enter("");
1853
1854 call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4);
1855 if (!call)
1856 return -ENOMEM;
1857
1858 call->key = key;
1859 call->reply = vnode;
1860 call->service_id = FS_SERVICE;
1861 call->port = htons(AFS_FS_PORT);
1862
1863 /* marshall the parameters */
1864 bp = call->request;
1865 *bp++ = htonl(FSEXTENDLOCK);
1866 *bp++ = htonl(vnode->fid.vid);
1867 *bp++ = htonl(vnode->fid.vnode);
1868 *bp++ = htonl(vnode->fid.unique);
1869
1870 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
1871}
1872
1873/*
1874 * release a lock on a file
1875 */
1876int afs_fs_release_lock(struct afs_server *server,
1877 struct key *key,
1878 struct afs_vnode *vnode,
1879 const struct afs_wait_mode *wait_mode)
1880{
1881 struct afs_call *call;
1882 __be32 *bp;
1883
1884 _enter("");
1885
1886 call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4);
1887 if (!call)
1888 return -ENOMEM;
1889
1890 call->key = key;
1891 call->reply = vnode;
1892 call->service_id = FS_SERVICE;
1893 call->port = htons(AFS_FS_PORT);
1894
1895 /* marshall the parameters */
1896 bp = call->request;
1897 *bp++ = htonl(FSRELEASELOCK);
1898 *bp++ = htonl(vnode->fid.vid);
1899 *bp++ = htonl(vnode->fid.vnode);
1900 *bp++ = htonl(vnode->fid.unique);
1901
1902 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
1903}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 2c55dd94a1..6306438f33 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -351,10 +351,18 @@ struct afs_vnode {
351#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ 351#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
352#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ 352#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */
353#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ 353#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */
354#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */
355#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */
356#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */
357#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */
354 358
355 long acl_order; /* ACL check count (callback break count) */ 359 long acl_order; /* ACL check count (callback break count) */
356 360
357 struct list_head writebacks; /* alterations in pagecache that need writing */ 361 struct list_head writebacks; /* alterations in pagecache that need writing */
362 struct list_head pending_locks; /* locks waiting to be granted */
363 struct list_head granted_locks; /* locks granted on this file */
364 struct delayed_work lock_work; /* work to be done in locking */
365 struct key *unlock_key; /* key to be used in unlocking */
358 366
359 /* outstanding callback notification on this file */ 367 /* outstanding callback notification on this file */
360 struct rb_node server_rb; /* link in server->fs_vnodes */ 368 struct rb_node server_rb; /* link in server->fs_vnodes */
@@ -474,6 +482,15 @@ extern int afs_open(struct inode *, struct file *);
474extern int afs_release(struct inode *, struct file *); 482extern int afs_release(struct inode *, struct file *);
475 483
476/* 484/*
485 * flock.c
486 */
487extern void __exit afs_kill_lock_manager(void);
488extern void afs_lock_work(struct work_struct *);
489extern void afs_lock_may_be_available(struct afs_vnode *);
490extern int afs_lock(struct file *, int, struct file_lock *);
491extern int afs_flock(struct file *, int, struct file_lock *);
492
493/*
477 * fsclient.c 494 * fsclient.c
478 */ 495 */
479extern int afs_fs_fetch_file_status(struct afs_server *, struct key *, 496extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
@@ -513,6 +530,15 @@ extern int afs_fs_get_volume_status(struct afs_server *, struct key *,
513 struct afs_vnode *, 530 struct afs_vnode *,
514 struct afs_volume_status *, 531 struct afs_volume_status *,
515 const struct afs_wait_mode *); 532 const struct afs_wait_mode *);
533extern int afs_fs_set_lock(struct afs_server *, struct key *,
534 struct afs_vnode *, afs_lock_type_t,
535 const struct afs_wait_mode *);
536extern int afs_fs_extend_lock(struct afs_server *, struct key *,
537 struct afs_vnode *,
538 const struct afs_wait_mode *);
539extern int afs_fs_release_lock(struct afs_server *, struct key *,
540 struct afs_vnode *,
541 const struct afs_wait_mode *);
516 542
517/* 543/*
518 * inode.c 544 * inode.c
@@ -681,6 +707,10 @@ extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
681extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *); 707extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
682extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *, 708extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *,
683 struct afs_volume_status *); 709 struct afs_volume_status *);
710extern int afs_vnode_set_lock(struct afs_vnode *, struct key *,
711 afs_lock_type_t);
712extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *);
713extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
684 714
685/* 715/*
686 * volume.c 716 * volume.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cd21195bbb..0f60f6b357 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -168,6 +168,7 @@ static void __exit afs_exit(void)
168 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); 168 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
169 169
170 afs_fs_exit(); 170 afs_fs_exit();
171 afs_kill_lock_manager();
171 afs_close_socket(); 172 afs_close_socket();
172 afs_purge_servers(); 173 afs_purge_servers();
173 afs_callback_update_kill(); 174 afs_callback_update_kill();
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index d1a889c407..2d33a5f7d2 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -35,6 +35,7 @@ int afs_abort_to_error(u32 abort_code)
35 case VOVERQUOTA: return -EDQUOT; 35 case VOVERQUOTA: return -EDQUOT;
36 case VBUSY: return -EBUSY; 36 case VBUSY: return -EBUSY;
37 case VMOVED: return -ENXIO; 37 case VMOVED: return -ENXIO;
38 case 0x2f6df0a: return -EWOULDBLOCK;
38 case 0x2f6df0c: return -EACCES; 39 case 0x2f6df0c: return -EACCES;
39 case 0x2f6df0f: return -EBUSY; 40 case 0x2f6df0f: return -EBUSY;
40 case 0x2f6df10: return -EEXIST; 41 case 0x2f6df10: return -EEXIST;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 13df512aea..6edb56683b 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -201,23 +201,9 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
201 */ 201 */
202static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) 202static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
203{ 203{
204 struct list_head *_p;
205 loff_t pos = *_pos;
206
207 /* lock the list against modification */ 204 /* lock the list against modification */
208 down_read(&afs_proc_cells_sem); 205 down_read(&afs_proc_cells_sem);
209 206 return seq_list_start_head(&afs_proc_cells, *_pos);
210 /* allow for the header line */
211 if (!pos)
212 return (void *) 1;
213 pos--;
214
215 /* find the n'th element in the list */
216 list_for_each(_p, &afs_proc_cells)
217 if (!pos--)
218 break;
219
220 return _p != &afs_proc_cells ? _p : NULL;
221} 207}
222 208
223/* 209/*
@@ -225,14 +211,7 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
225 */ 211 */
226static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos) 212static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
227{ 213{
228 struct list_head *_p; 214 return seq_list_next(v, &afs_proc_cells, pos);
229
230 (*pos)++;
231
232 _p = v;
233 _p = v == (void *) 1 ? afs_proc_cells.next : _p->next;
234
235 return _p != &afs_proc_cells ? _p : NULL;
236} 215}
237 216
238/* 217/*
@@ -250,7 +229,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
250{ 229{
251 struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); 230 struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
252 231
253 if (v == (void *) 1) { 232 if (v == &afs_proc_cells) {
254 /* display header on line 1 */ 233 /* display header on line 1 */
255 seq_puts(m, "USE NAME\n"); 234 seq_puts(m, "USE NAME\n");
256 return 0; 235 return 0;
@@ -503,26 +482,13 @@ static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
503 */ 482 */
504static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) 483static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
505{ 484{
506 struct list_head *_p;
507 struct afs_cell *cell = m->private; 485 struct afs_cell *cell = m->private;
508 loff_t pos = *_pos;
509 486
510 _enter("cell=%p pos=%Ld", cell, *_pos); 487 _enter("cell=%p pos=%Ld", cell, *_pos);
511 488
512 /* lock the list against modification */ 489 /* lock the list against modification */
513 down_read(&cell->vl_sem); 490 down_read(&cell->vl_sem);
514 491 return seq_list_start_head(&cell->vl_list, *_pos);
515 /* allow for the header line */
516 if (!pos)
517 return (void *) 1;
518 pos--;
519
520 /* find the n'th element in the list */
521 list_for_each(_p, &cell->vl_list)
522 if (!pos--)
523 break;
524
525 return _p != &cell->vl_list ? _p : NULL;
526} 492}
527 493
528/* 494/*
@@ -531,17 +497,10 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
531static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, 497static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
532 loff_t *_pos) 498 loff_t *_pos)
533{ 499{
534 struct list_head *_p;
535 struct afs_cell *cell = p->private; 500 struct afs_cell *cell = p->private;
536 501
537 _enter("cell=%p pos=%Ld", cell, *_pos); 502 _enter("cell=%p pos=%Ld", cell, *_pos);
538 503 return seq_list_next(v, &cell->vl_list, _pos);
539 (*_pos)++;
540
541 _p = v;
542 _p = (v == (void *) 1) ? cell->vl_list.next : _p->next;
543
544 return (_p != &cell->vl_list) ? _p : NULL;
545} 504}
546 505
547/* 506/*
@@ -569,11 +528,12 @@ const char afs_vlocation_states[][4] = {
569 */ 528 */
570static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) 529static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
571{ 530{
531 struct afs_cell *cell = m->private;
572 struct afs_vlocation *vlocation = 532 struct afs_vlocation *vlocation =
573 list_entry(v, struct afs_vlocation, link); 533 list_entry(v, struct afs_vlocation, link);
574 534
575 /* display header on line 1 */ 535 /* display header on line 1 */
576 if (v == (void *) 1) { 536 if (v == &cell->vl_list) {
577 seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n"); 537 seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n");
578 return 0; 538 return 0;
579 } 539 }
@@ -734,26 +694,13 @@ static int afs_proc_cell_servers_release(struct inode *inode,
734static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) 694static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
735 __acquires(m->private->servers_lock) 695 __acquires(m->private->servers_lock)
736{ 696{
737 struct list_head *_p;
738 struct afs_cell *cell = m->private; 697 struct afs_cell *cell = m->private;
739 loff_t pos = *_pos;
740 698
741 _enter("cell=%p pos=%Ld", cell, *_pos); 699 _enter("cell=%p pos=%Ld", cell, *_pos);
742 700
743 /* lock the list against modification */ 701 /* lock the list against modification */
744 read_lock(&cell->servers_lock); 702 read_lock(&cell->servers_lock);
745 703 return seq_list_start_head(&cell->servers, *_pos);
746 /* allow for the header line */
747 if (!pos)
748 return (void *) 1;
749 pos--;
750
751 /* find the n'th element in the list */
752 list_for_each(_p, &cell->servers)
753 if (!pos--)
754 break;
755
756 return _p != &cell->servers ? _p : NULL;
757} 704}
758 705
759/* 706/*
@@ -762,17 +709,10 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
762static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, 709static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
763 loff_t *_pos) 710 loff_t *_pos)
764{ 711{
765 struct list_head *_p;
766 struct afs_cell *cell = p->private; 712 struct afs_cell *cell = p->private;
767 713
768 _enter("cell=%p pos=%Ld", cell, *_pos); 714 _enter("cell=%p pos=%Ld", cell, *_pos);
769 715 return seq_list_next(v, &cell->servers, _pos);
770 (*_pos)++;
771
772 _p = v;
773 _p = v == (void *) 1 ? cell->servers.next : _p->next;
774
775 return _p != &cell->servers ? _p : NULL;
776} 716}
777 717
778/* 718/*
@@ -791,11 +731,12 @@ static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
791 */ 731 */
792static int afs_proc_cell_servers_show(struct seq_file *m, void *v) 732static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
793{ 733{
734 struct afs_cell *cell = m->private;
794 struct afs_server *server = list_entry(v, struct afs_server, link); 735 struct afs_server *server = list_entry(v, struct afs_server, link);
795 char ipaddr[20]; 736 char ipaddr[20];
796 737
797 /* display header on line 1 */ 738 /* display header on line 1 */
798 if (v == (void *) 1) { 739 if (v == &cell->servers) {
799 seq_puts(m, "USE ADDR STATE\n"); 740 seq_puts(m, "USE ADDR STATE\n");
800 return 0; 741 return 0;
801 } 742 }
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 2e8496ba12..993cdf1cce 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -460,6 +460,9 @@ static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
460 spin_lock_init(&vnode->writeback_lock); 460 spin_lock_init(&vnode->writeback_lock);
461 spin_lock_init(&vnode->lock); 461 spin_lock_init(&vnode->lock);
462 INIT_LIST_HEAD(&vnode->writebacks); 462 INIT_LIST_HEAD(&vnode->writebacks);
463 INIT_LIST_HEAD(&vnode->pending_locks);
464 INIT_LIST_HEAD(&vnode->granted_locks);
465 INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
463 INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work); 466 INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
464} 467}
465 468
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 232c55dc24..2f05c4fc2a 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -561,7 +561,7 @@ no_server:
561/* 561/*
562 * create a hard link 562 * create a hard link
563 */ 563 */
564extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, 564int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
565 struct key *key, const char *name) 565 struct key *key, const char *name)
566{ 566{
567 struct afs_server *server; 567 struct afs_server *server;
@@ -887,11 +887,6 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
887 vnode->fid.unique, 887 vnode->fid.unique,
888 key_serial(key)); 888 key_serial(key));
889 889
890 /* this op will fetch the status */
891 spin_lock(&vnode->lock);
892 vnode->update_cnt++;
893 spin_unlock(&vnode->lock);
894
895 do { 890 do {
896 /* pick a server to query */ 891 /* pick a server to query */
897 server = afs_volume_pick_fileserver(vnode); 892 server = afs_volume_pick_fileserver(vnode);
@@ -905,20 +900,127 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
905 } while (!afs_volume_release_fileserver(vnode, server, ret)); 900 } while (!afs_volume_release_fileserver(vnode, server, ret));
906 901
907 /* adjust the flags */ 902 /* adjust the flags */
908 if (ret == 0) { 903 if (ret == 0)
909 afs_vnode_finalise_status_update(vnode, server); 904 afs_put_server(server);
905
906 _leave(" = %d", ret);
907 return ret;
908
909no_server:
910 return PTR_ERR(server);
911}
912
913/*
914 * get a lock on a file
915 */
916int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key,
917 afs_lock_type_t type)
918{
919 struct afs_server *server;
920 int ret;
921
922 _enter("%s{%x:%u.%u},%x,%u",
923 vnode->volume->vlocation->vldb.name,
924 vnode->fid.vid,
925 vnode->fid.vnode,
926 vnode->fid.unique,
927 key_serial(key), type);
928
929 do {
930 /* pick a server to query */
931 server = afs_volume_pick_fileserver(vnode);
932 if (IS_ERR(server))
933 goto no_server;
934
935 _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
936
937 ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call);
938
939 } while (!afs_volume_release_fileserver(vnode, server, ret));
940
941 /* adjust the flags */
942 if (ret == 0)
943 afs_put_server(server);
944
945 _leave(" = %d", ret);
946 return ret;
947
948no_server:
949 return PTR_ERR(server);
950}
951
952/*
953 * extend a lock on a file
954 */
955int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key)
956{
957 struct afs_server *server;
958 int ret;
959
960 _enter("%s{%x:%u.%u},%x",
961 vnode->volume->vlocation->vldb.name,
962 vnode->fid.vid,
963 vnode->fid.vnode,
964 vnode->fid.unique,
965 key_serial(key));
966
967 do {
968 /* pick a server to query */
969 server = afs_volume_pick_fileserver(vnode);
970 if (IS_ERR(server))
971 goto no_server;
972
973 _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
974
975 ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call);
976
977 } while (!afs_volume_release_fileserver(vnode, server, ret));
978
979 /* adjust the flags */
980 if (ret == 0)
981 afs_put_server(server);
982
983 _leave(" = %d", ret);
984 return ret;
985
986no_server:
987 return PTR_ERR(server);
988}
989
990/*
991 * release a lock on a file
992 */
993int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key)
994{
995 struct afs_server *server;
996 int ret;
997
998 _enter("%s{%x:%u.%u},%x",
999 vnode->volume->vlocation->vldb.name,
1000 vnode->fid.vid,
1001 vnode->fid.vnode,
1002 vnode->fid.unique,
1003 key_serial(key));
1004
1005 do {
1006 /* pick a server to query */
1007 server = afs_volume_pick_fileserver(vnode);
1008 if (IS_ERR(server))
1009 goto no_server;
1010
1011 _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
1012
1013 ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call);
1014
1015 } while (!afs_volume_release_fileserver(vnode, server, ret));
1016
1017 /* adjust the flags */
1018 if (ret == 0)
910 afs_put_server(server); 1019 afs_put_server(server);
911 } else {
912 afs_vnode_status_update_failed(vnode, ret);
913 }
914 1020
915 _leave(" = %d", ret); 1021 _leave(" = %d", ret);
916 return ret; 1022 return ret;
917 1023
918no_server: 1024no_server:
919 spin_lock(&vnode->lock);
920 vnode->update_cnt--;
921 ASSERTCMP(vnode->update_cnt, >=, 0);
922 spin_unlock(&vnode->lock);
923 return PTR_ERR(server); 1025 return PTR_ERR(server);
924} 1026}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 40fe3a3222..b4a75880f6 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -53,7 +53,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
53}; 53};
54 54
55/** 55/**
56 * anon_inode_getfd - creates a new file instance by hooking it up to and 56 * anon_inode_getfd - creates a new file instance by hooking it up to an
57 * anonymous inode, and a dentry that describe the "class" 57 * anonymous inode, and a dentry that describe the "class"
58 * of the file 58 * of the file
59 * 59 *
@@ -66,7 +66,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
66 * 66 *
67 * Creates a new file by hooking it on a single inode. This is useful for files 67 * Creates a new file by hooking it on a single inode. This is useful for files
68 * that do not need to have a full-fledged inode in order to operate correctly. 68 * that do not need to have a full-fledged inode in order to operate correctly.
69 * All the files created with anon_inode_getfd() will share a single inode, by 69 * All the files created with anon_inode_getfd() will share a single inode,
70 * hence saving memory and avoiding code duplication for the file/inode/dentry 70 * hence saving memory and avoiding code duplication for the file/inode/dentry
71 * setup. 71 * setup.
72 */ 72 */
@@ -139,11 +139,12 @@ err_put_filp:
139 put_filp(file); 139 put_filp(file);
140 return error; 140 return error;
141} 141}
142EXPORT_SYMBOL_GPL(anon_inode_getfd);
142 143
143/* 144/*
144 * A single inode exist for all anon_inode files. Contrary to pipes, 145 * A single inode exists for all anon_inode files. Contrary to pipes,
145 * anon_inode inodes has no per-instance data associated, so we can avoid 146 * anon_inode inodes have no associated per-instance data, so we need
146 * the allocation of multiple of them. 147 * only allocate one of them.
147 */ 148 */
148static struct inode *anon_inode_mkinode(void) 149static struct inode *anon_inode_mkinode(void)
149{ 150{
diff --git a/fs/attr.c b/fs/attr.c
index a0a0c7b07b..f8dfc2269d 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -42,7 +42,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr)
42 42
43 /* Make sure a caller can chmod. */ 43 /* Make sure a caller can chmod. */
44 if (ia_valid & ATTR_MODE) { 44 if (ia_valid & ATTR_MODE) {
45 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 45 if (!is_owner_or_cap(inode))
46 goto error; 46 goto error;
47 /* Also check the setgid bit! */ 47 /* Also check the setgid bit! */
48 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 48 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -52,7 +52,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr)
52 52
53 /* Check for setting the inode time. */ 53 /* Check for setting the inode time. */
54 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) { 54 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
55 if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) 55 if (!is_owner_or_cap(inode))
56 goto error; 56 goto error;
57 } 57 }
58fine: 58fine:
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 329ee473ee..521ff7caad 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -114,12 +114,6 @@ static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl)
114 return -EIO; 114 return -EIO;
115} 115}
116 116
117static ssize_t bad_file_sendfile(struct file *in_file, loff_t *ppos,
118 size_t count, read_actor_t actor, void *target)
119{
120 return -EIO;
121}
122
123static ssize_t bad_file_sendpage(struct file *file, struct page *page, 117static ssize_t bad_file_sendpage(struct file *file, struct page *page,
124 int off, size_t len, loff_t *pos, int more) 118 int off, size_t len, loff_t *pos, int more)
125{ 119{
@@ -182,7 +176,6 @@ static const struct file_operations bad_file_ops =
182 .aio_fsync = bad_file_aio_fsync, 176 .aio_fsync = bad_file_aio_fsync,
183 .fasync = bad_file_fasync, 177 .fasync = bad_file_fasync,
184 .lock = bad_file_lock, 178 .lock = bad_file_lock,
185 .sendfile = bad_file_sendfile,
186 .sendpage = bad_file_sendpage, 179 .sendpage = bad_file_sendpage,
187 .get_unmapped_area = bad_file_get_unmapped_area, 180 .get_unmapped_area = bad_file_get_unmapped_area,
188 .check_flags = bad_file_check_flags, 181 .check_flags = bad_file_check_flags,
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index ef4d1fa04e..24310e9ee0 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -24,7 +24,7 @@ const struct file_operations bfs_file_operations = {
24 .write = do_sync_write, 24 .write = do_sync_write,
25 .aio_write = generic_file_aio_write, 25 .aio_write = generic_file_aio_write,
26 .mmap = generic_file_mmap, 26 .mmap = generic_file_mmap,
27 .sendfile = generic_file_sendfile, 27 .splice_read = generic_file_splice_read,
28}; 28};
29 29
30static int bfs_move_block(unsigned long from, unsigned long to, struct super_block *sb) 30static int bfs_move_block(unsigned long from, unsigned long to, struct super_block *sb)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fa8ea33ab0..a27e42bf34 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -45,7 +45,7 @@
45 45
46static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); 46static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
47static int load_elf_library(struct file *); 47static int load_elf_library(struct file *);
48static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int); 48static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long);
49 49
50/* 50/*
51 * If we don't support core dumping, then supply a NULL so we 51 * If we don't support core dumping, then supply a NULL so we
@@ -80,7 +80,7 @@ static struct linux_binfmt elf_format = {
80 .hasvdso = 1 80 .hasvdso = 1
81}; 81};
82 82
83#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) 83#define BAD_ADDR(x) IS_ERR_VALUE(x)
84 84
85static int set_brk(unsigned long start, unsigned long end) 85static int set_brk(unsigned long start, unsigned long end)
86{ 86{
@@ -285,33 +285,70 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
285#ifndef elf_map 285#ifndef elf_map
286 286
287static unsigned long elf_map(struct file *filep, unsigned long addr, 287static unsigned long elf_map(struct file *filep, unsigned long addr,
288 struct elf_phdr *eppnt, int prot, int type) 288 struct elf_phdr *eppnt, int prot, int type,
289 unsigned long total_size)
289{ 290{
290 unsigned long map_addr; 291 unsigned long map_addr;
291 unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr); 292 unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
293 unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
294 addr = ELF_PAGESTART(addr);
295 size = ELF_PAGEALIGN(size);
292 296
293 down_write(&current->mm->mmap_sem);
294 /* mmap() will return -EINVAL if given a zero size, but a 297 /* mmap() will return -EINVAL if given a zero size, but a
295 * segment with zero filesize is perfectly valid */ 298 * segment with zero filesize is perfectly valid */
296 if (eppnt->p_filesz + pageoffset) 299 if (!size)
297 map_addr = do_mmap(filep, ELF_PAGESTART(addr), 300 return addr;
298 eppnt->p_filesz + pageoffset, prot, type, 301
299 eppnt->p_offset - pageoffset); 302 down_write(&current->mm->mmap_sem);
300 else 303 /*
301 map_addr = ELF_PAGESTART(addr); 304 * total_size is the size of the ELF (interpreter) image.
305 * The _first_ mmap needs to know the full size, otherwise
306 * randomization might put this image into an overlapping
307 * position with the ELF binary image. (since size < total_size)
308 * So we first map the 'big' image - and unmap the remainder at
309 * the end. (which unmap is needed for ELF images with holes.)
310 */
311 if (total_size) {
312 total_size = ELF_PAGEALIGN(total_size);
313 map_addr = do_mmap(filep, addr, total_size, prot, type, off);
314 if (!BAD_ADDR(map_addr))
315 do_munmap(current->mm, map_addr+size, total_size-size);
316 } else
317 map_addr = do_mmap(filep, addr, size, prot, type, off);
318
302 up_write(&current->mm->mmap_sem); 319 up_write(&current->mm->mmap_sem);
303 return(map_addr); 320 return(map_addr);
304} 321}
305 322
306#endif /* !elf_map */ 323#endif /* !elf_map */
307 324
325static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
326{
327 int i, first_idx = -1, last_idx = -1;
328
329 for (i = 0; i < nr; i++) {
330 if (cmds[i].p_type == PT_LOAD) {
331 last_idx = i;
332 if (first_idx == -1)
333 first_idx = i;
334 }
335 }
336 if (first_idx == -1)
337 return 0;
338
339 return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
340 ELF_PAGESTART(cmds[first_idx].p_vaddr);
341}
342
343
308/* This is much more generalized than the library routine read function, 344/* This is much more generalized than the library routine read function,
309 so we keep this separate. Technically the library read function 345 so we keep this separate. Technically the library read function
310 is only provided so that we can read a.out libraries that have 346 is only provided so that we can read a.out libraries that have
311 an ELF header */ 347 an ELF header */
312 348
313static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, 349static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
314 struct file *interpreter, unsigned long *interp_load_addr) 350 struct file *interpreter, unsigned long *interp_map_addr,
351 unsigned long no_base)
315{ 352{
316 struct elf_phdr *elf_phdata; 353 struct elf_phdr *elf_phdata;
317 struct elf_phdr *eppnt; 354 struct elf_phdr *eppnt;
@@ -319,6 +356,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
319 int load_addr_set = 0; 356 int load_addr_set = 0;
320 unsigned long last_bss = 0, elf_bss = 0; 357 unsigned long last_bss = 0, elf_bss = 0;
321 unsigned long error = ~0UL; 358 unsigned long error = ~0UL;
359 unsigned long total_size;
322 int retval, i, size; 360 int retval, i, size;
323 361
324 /* First of all, some simple consistency checks */ 362 /* First of all, some simple consistency checks */
@@ -357,6 +395,12 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
357 goto out_close; 395 goto out_close;
358 } 396 }
359 397
398 total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
399 if (!total_size) {
400 error = -EINVAL;
401 goto out_close;
402 }
403
360 eppnt = elf_phdata; 404 eppnt = elf_phdata;
361 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { 405 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
362 if (eppnt->p_type == PT_LOAD) { 406 if (eppnt->p_type == PT_LOAD) {
@@ -374,9 +418,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
374 vaddr = eppnt->p_vaddr; 418 vaddr = eppnt->p_vaddr;
375 if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) 419 if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
376 elf_type |= MAP_FIXED; 420 elf_type |= MAP_FIXED;
421 else if (no_base && interp_elf_ex->e_type == ET_DYN)
422 load_addr = -vaddr;
377 423
378 map_addr = elf_map(interpreter, load_addr + vaddr, 424 map_addr = elf_map(interpreter, load_addr + vaddr,
379 eppnt, elf_prot, elf_type); 425 eppnt, elf_prot, elf_type, total_size);
426 total_size = 0;
427 if (!*interp_map_addr)
428 *interp_map_addr = map_addr;
380 error = map_addr; 429 error = map_addr;
381 if (BAD_ADDR(map_addr)) 430 if (BAD_ADDR(map_addr))
382 goto out_close; 431 goto out_close;
@@ -442,8 +491,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
442 goto out_close; 491 goto out_close;
443 } 492 }
444 493
445 *interp_load_addr = load_addr; 494 error = load_addr;
446 error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
447 495
448out_close: 496out_close:
449 kfree(elf_phdata); 497 kfree(elf_phdata);
@@ -540,7 +588,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
540 int elf_exec_fileno; 588 int elf_exec_fileno;
541 int retval, i; 589 int retval, i;
542 unsigned int size; 590 unsigned int size;
543 unsigned long elf_entry, interp_load_addr = 0; 591 unsigned long elf_entry;
592 unsigned long interp_load_addr = 0;
544 unsigned long start_code, end_code, start_data, end_data; 593 unsigned long start_code, end_code, start_data, end_data;
545 unsigned long reloc_func_desc = 0; 594 unsigned long reloc_func_desc = 0;
546 char passed_fileno[6]; 595 char passed_fileno[6];
@@ -808,9 +857,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
808 current->mm->start_stack = bprm->p; 857 current->mm->start_stack = bprm->p;
809 858
810 /* Now we do a little grungy work by mmaping the ELF image into 859 /* Now we do a little grungy work by mmaping the ELF image into
811 the correct location in memory. At this point, we assume that 860 the correct location in memory. */
812 the image should be loaded at fixed address, not at a variable
813 address. */
814 for(i = 0, elf_ppnt = elf_phdata; 861 for(i = 0, elf_ppnt = elf_phdata;
815 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 862 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
816 int elf_prot = 0, elf_flags; 863 int elf_prot = 0, elf_flags;
@@ -864,11 +911,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
864 * default mmap base, as well as whatever program they 911 * default mmap base, as well as whatever program they
865 * might try to exec. This is because the brk will 912 * might try to exec. This is because the brk will
866 * follow the loader, and is not movable. */ 913 * follow the loader, and is not movable. */
914#ifdef CONFIG_X86
915 load_bias = 0;
916#else
867 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 917 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
918#endif
868 } 919 }
869 920
870 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, 921 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
871 elf_prot, elf_flags); 922 elf_prot, elf_flags,0);
872 if (BAD_ADDR(error)) { 923 if (BAD_ADDR(error)) {
873 send_sig(SIGKILL, current, 0); 924 send_sig(SIGKILL, current, 0);
874 retval = IS_ERR((void *)error) ? 925 retval = IS_ERR((void *)error) ?
@@ -944,13 +995,25 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
944 } 995 }
945 996
946 if (elf_interpreter) { 997 if (elf_interpreter) {
947 if (interpreter_type == INTERPRETER_AOUT) 998 if (interpreter_type == INTERPRETER_AOUT) {
948 elf_entry = load_aout_interp(&loc->interp_ex, 999 elf_entry = load_aout_interp(&loc->interp_ex,
949 interpreter); 1000 interpreter);
950 else 1001 } else {
1002 unsigned long uninitialized_var(interp_map_addr);
1003
951 elf_entry = load_elf_interp(&loc->interp_elf_ex, 1004 elf_entry = load_elf_interp(&loc->interp_elf_ex,
952 interpreter, 1005 interpreter,
953 &interp_load_addr); 1006 &interp_map_addr,
1007 load_bias);
1008 if (!BAD_ADDR(elf_entry)) {
1009 /*
1010 * load_elf_interp() returns relocation
1011 * adjustment
1012 */
1013 interp_load_addr = elf_entry;
1014 elf_entry += loc->interp_elf_ex.e_entry;
1015 }
1016 }
954 if (BAD_ADDR(elf_entry)) { 1017 if (BAD_ADDR(elf_entry)) {
955 force_sig(SIGSEGV, current); 1018 force_sig(SIGSEGV, current);
956 retval = IS_ERR((void *)elf_entry) ? 1019 retval = IS_ERR((void *)elf_entry) ?
@@ -1499,6 +1562,9 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
1499#endif 1562#endif
1500 int thread_status_size = 0; 1563 int thread_status_size = 0;
1501 elf_addr_t *auxv; 1564 elf_addr_t *auxv;
1565#ifdef ELF_CORE_WRITE_EXTRA_NOTES
1566 int extra_notes_size;
1567#endif
1502 1568
1503 /* 1569 /*
1504 * We no longer stop all VM operations. 1570 * We no longer stop all VM operations.
@@ -1628,7 +1694,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
1628 sz += thread_status_size; 1694 sz += thread_status_size;
1629 1695
1630#ifdef ELF_CORE_WRITE_EXTRA_NOTES 1696#ifdef ELF_CORE_WRITE_EXTRA_NOTES
1631 sz += ELF_CORE_EXTRA_NOTES_SIZE; 1697 extra_notes_size = ELF_CORE_EXTRA_NOTES_SIZE;
1698 sz += extra_notes_size;
1632#endif 1699#endif
1633 1700
1634 fill_elf_note_phdr(&phdr, sz, offset); 1701 fill_elf_note_phdr(&phdr, sz, offset);
@@ -1674,6 +1741,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
1674 1741
1675#ifdef ELF_CORE_WRITE_EXTRA_NOTES 1742#ifdef ELF_CORE_WRITE_EXTRA_NOTES
1676 ELF_CORE_WRITE_EXTRA_NOTES; 1743 ELF_CORE_WRITE_EXTRA_NOTES;
1744 foffset += extra_notes_size;
1677#endif 1745#endif
1678 1746
1679 /* write out the thread status notes section */ 1747 /* write out the thread status notes section */
diff --git a/fs/bio.c b/fs/bio.c
index 093345f001..33e46340a7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1223,8 +1223,6 @@ EXPORT_SYMBOL(bio_hw_segments);
1223EXPORT_SYMBOL(bio_add_page); 1223EXPORT_SYMBOL(bio_add_page);
1224EXPORT_SYMBOL(bio_add_pc_page); 1224EXPORT_SYMBOL(bio_add_pc_page);
1225EXPORT_SYMBOL(bio_get_nr_vecs); 1225EXPORT_SYMBOL(bio_get_nr_vecs);
1226EXPORT_SYMBOL(bio_map_user);
1227EXPORT_SYMBOL(bio_unmap_user);
1228EXPORT_SYMBOL(bio_map_kern); 1226EXPORT_SYMBOL(bio_map_kern);
1229EXPORT_SYMBOL(bio_pair_release); 1227EXPORT_SYMBOL(bio_pair_release);
1230EXPORT_SYMBOL(bio_split); 1228EXPORT_SYMBOL(bio_split);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ea1480a16f..3635315e3b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -588,12 +588,10 @@ EXPORT_SYMBOL(bdget);
588 588
589long nr_blockdev_pages(void) 589long nr_blockdev_pages(void)
590{ 590{
591 struct list_head *p; 591 struct block_device *bdev;
592 long ret = 0; 592 long ret = 0;
593 spin_lock(&bdev_lock); 593 spin_lock(&bdev_lock);
594 list_for_each(p, &all_bdevs) { 594 list_for_each_entry(bdev, &all_bdevs, bd_list) {
595 struct block_device *bdev;
596 bdev = list_entry(p, struct block_device, bd_list);
597 ret += bdev->bd_inode->i_mapping->nrpages; 595 ret += bdev->bd_inode->i_mapping->nrpages;
598 } 596 }
599 spin_unlock(&bdev_lock); 597 spin_unlock(&bdev_lock);
@@ -874,7 +872,7 @@ static struct bd_holder *find_bd_holder(struct block_device *bdev,
874 */ 872 */
875static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) 873static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
876{ 874{
877 int ret; 875 int err;
878 876
879 if (!bo) 877 if (!bo)
880 return -EINVAL; 878 return -EINVAL;
@@ -882,15 +880,18 @@ static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
882 if (!bd_holder_grab_dirs(bdev, bo)) 880 if (!bd_holder_grab_dirs(bdev, bo))
883 return -EBUSY; 881 return -EBUSY;
884 882
885 ret = add_symlink(bo->sdir, bo->sdev); 883 err = add_symlink(bo->sdir, bo->sdev);
886 if (ret == 0) { 884 if (err)
887 ret = add_symlink(bo->hdir, bo->hdev); 885 return err;
888 if (ret) 886
889 del_symlink(bo->sdir, bo->sdev); 887 err = add_symlink(bo->hdir, bo->hdev);
888 if (err) {
889 del_symlink(bo->sdir, bo->sdev);
890 return err;
890 } 891 }
891 if (ret == 0) 892
892 list_add_tail(&bo->list, &bdev->bd_holder_list); 893 list_add_tail(&bo->list, &bdev->bd_holder_list);
893 return ret; 894 return 0;
894} 895}
895 896
896/** 897/**
@@ -948,7 +949,7 @@ static struct bd_holder *del_bd_holder(struct block_device *bdev,
948static int bd_claim_by_kobject(struct block_device *bdev, void *holder, 949static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
949 struct kobject *kobj) 950 struct kobject *kobj)
950{ 951{
951 int res; 952 int err;
952 struct bd_holder *bo, *found; 953 struct bd_holder *bo, *found;
953 954
954 if (!kobj) 955 if (!kobj)
@@ -959,21 +960,24 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
959 return -ENOMEM; 960 return -ENOMEM;
960 961
961 mutex_lock(&bdev->bd_mutex); 962 mutex_lock(&bdev->bd_mutex);
962 res = bd_claim(bdev, holder);
963 if (res == 0) {
964 found = find_bd_holder(bdev, bo);
965 if (found == NULL) {
966 res = add_bd_holder(bdev, bo);
967 if (res)
968 bd_release(bdev);
969 }
970 }
971 963
972 if (res || found) 964 err = bd_claim(bdev, holder);
973 free_bd_holder(bo); 965 if (err)
974 mutex_unlock(&bdev->bd_mutex); 966 goto fail;
975 967
976 return res; 968 found = find_bd_holder(bdev, bo);
969 if (found)
970 goto fail;
971
972 err = add_bd_holder(bdev, bo);
973 if (err)
974 bd_release(bdev);
975 else
976 bo = NULL;
977fail:
978 mutex_unlock(&bdev->bd_mutex);
979 free_bd_holder(bo);
980 return err;
977} 981}
978 982
979/** 983/**
@@ -987,15 +991,12 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
987static void bd_release_from_kobject(struct block_device *bdev, 991static void bd_release_from_kobject(struct block_device *bdev,
988 struct kobject *kobj) 992 struct kobject *kobj)
989{ 993{
990 struct bd_holder *bo;
991
992 if (!kobj) 994 if (!kobj)
993 return; 995 return;
994 996
995 mutex_lock(&bdev->bd_mutex); 997 mutex_lock(&bdev->bd_mutex);
996 bd_release(bdev); 998 bd_release(bdev);
997 if ((bo = del_bd_holder(bdev, kobj))) 999 free_bd_holder(del_bd_holder(bdev, kobj));
998 free_bd_holder(bo);
999 mutex_unlock(&bdev->bd_mutex); 1000 mutex_unlock(&bdev->bd_mutex);
1000} 1001}
1001 1002
@@ -1346,7 +1347,6 @@ const struct file_operations def_blk_fops = {
1346#ifdef CONFIG_COMPAT 1347#ifdef CONFIG_COMPAT
1347 .compat_ioctl = compat_blkdev_ioctl, 1348 .compat_ioctl = compat_blkdev_ioctl,
1348#endif 1349#endif
1349 .sendfile = generic_file_sendfile,
1350 .splice_read = generic_file_splice_read, 1350 .splice_read = generic_file_splice_read,
1351 .splice_write = generic_file_splice_write, 1351 .splice_write = generic_file_splice_write,
1352}; 1352};
diff --git a/fs/buffer.c b/fs/buffer.c
index aa68206bd5..0f90067142 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -356,7 +356,7 @@ static void free_more_memory(void)
356 for_each_online_pgdat(pgdat) { 356 for_each_online_pgdat(pgdat) {
357 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; 357 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
358 if (*zones) 358 if (*zones)
359 try_to_free_pages(zones, GFP_NOFS); 359 try_to_free_pages(zones, 0, GFP_NOFS);
360 } 360 }
361} 361}
362 362
@@ -676,6 +676,39 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
676EXPORT_SYMBOL(mark_buffer_dirty_inode); 676EXPORT_SYMBOL(mark_buffer_dirty_inode);
677 677
678/* 678/*
679 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
680 * dirty.
681 *
682 * If warn is true, then emit a warning if the page is not uptodate and has
683 * not been truncated.
684 */
685static int __set_page_dirty(struct page *page,
686 struct address_space *mapping, int warn)
687{
688 if (unlikely(!mapping))
689 return !TestSetPageDirty(page);
690
691 if (TestSetPageDirty(page))
692 return 0;
693
694 write_lock_irq(&mapping->tree_lock);
695 if (page->mapping) { /* Race with truncate? */
696 WARN_ON_ONCE(warn && !PageUptodate(page));
697
698 if (mapping_cap_account_dirty(mapping)) {
699 __inc_zone_page_state(page, NR_FILE_DIRTY);
700 task_io_account_write(PAGE_CACHE_SIZE);
701 }
702 radix_tree_tag_set(&mapping->page_tree,
703 page_index(page), PAGECACHE_TAG_DIRTY);
704 }
705 write_unlock_irq(&mapping->tree_lock);
706 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
707
708 return 1;
709}
710
711/*
679 * Add a page to the dirty page list. 712 * Add a page to the dirty page list.
680 * 713 *
681 * It is a sad fact of life that this function is called from several places 714 * It is a sad fact of life that this function is called from several places
@@ -702,7 +735,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
702 */ 735 */
703int __set_page_dirty_buffers(struct page *page) 736int __set_page_dirty_buffers(struct page *page)
704{ 737{
705 struct address_space * const mapping = page_mapping(page); 738 struct address_space *mapping = page_mapping(page);
706 739
707 if (unlikely(!mapping)) 740 if (unlikely(!mapping))
708 return !TestSetPageDirty(page); 741 return !TestSetPageDirty(page);
@@ -719,21 +752,7 @@ int __set_page_dirty_buffers(struct page *page)
719 } 752 }
720 spin_unlock(&mapping->private_lock); 753 spin_unlock(&mapping->private_lock);
721 754
722 if (TestSetPageDirty(page)) 755 return __set_page_dirty(page, mapping, 1);
723 return 0;
724
725 write_lock_irq(&mapping->tree_lock);
726 if (page->mapping) { /* Race with truncate? */
727 if (mapping_cap_account_dirty(mapping)) {
728 __inc_zone_page_state(page, NR_FILE_DIRTY);
729 task_io_account_write(PAGE_CACHE_SIZE);
730 }
731 radix_tree_tag_set(&mapping->page_tree,
732 page_index(page), PAGECACHE_TAG_DIRTY);
733 }
734 write_unlock_irq(&mapping->tree_lock);
735 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
736 return 1;
737} 756}
738EXPORT_SYMBOL(__set_page_dirty_buffers); 757EXPORT_SYMBOL(__set_page_dirty_buffers);
739 758
@@ -982,7 +1001,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
982 struct buffer_head *bh; 1001 struct buffer_head *bh;
983 1002
984 page = find_or_create_page(inode->i_mapping, index, 1003 page = find_or_create_page(inode->i_mapping, index,
985 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 1004 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
986 if (!page) 1005 if (!page)
987 return NULL; 1006 return NULL;
988 1007
@@ -1026,11 +1045,6 @@ failed:
1026/* 1045/*
1027 * Create buffers for the specified block device block's page. If 1046 * Create buffers for the specified block device block's page. If
1028 * that page was dirty, the buffers are set dirty also. 1047 * that page was dirty, the buffers are set dirty also.
1029 *
1030 * Except that's a bug. Attaching dirty buffers to a dirty
1031 * blockdev's page can result in filesystem corruption, because
1032 * some of those buffers may be aliases of filesystem data.
1033 * grow_dev_page() will go BUG() if this happens.
1034 */ 1048 */
1035static int 1049static int
1036grow_buffers(struct block_device *bdev, sector_t block, int size) 1050grow_buffers(struct block_device *bdev, sector_t block, int size)
@@ -1137,8 +1151,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
1137 */ 1151 */
1138void fastcall mark_buffer_dirty(struct buffer_head *bh) 1152void fastcall mark_buffer_dirty(struct buffer_head *bh)
1139{ 1153{
1154 WARN_ON_ONCE(!buffer_uptodate(bh));
1140 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) 1155 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1141 __set_page_dirty_nobuffers(bh->b_page); 1156 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1142} 1157}
1143 1158
1144/* 1159/*
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 1cebb7e342..1fd0dc85f5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -623,7 +623,7 @@ const struct file_operations cifs_file_ops = {
623 .fsync = cifs_fsync, 623 .fsync = cifs_fsync,
624 .flush = cifs_flush, 624 .flush = cifs_flush,
625 .mmap = cifs_file_mmap, 625 .mmap = cifs_file_mmap,
626 .sendfile = generic_file_sendfile, 626 .splice_read = generic_file_splice_read,
627 .llseek = cifs_llseek, 627 .llseek = cifs_llseek,
628#ifdef CONFIG_CIFS_POSIX 628#ifdef CONFIG_CIFS_POSIX
629 .ioctl = cifs_ioctl, 629 .ioctl = cifs_ioctl,
@@ -644,7 +644,7 @@ const struct file_operations cifs_file_direct_ops = {
644 .lock = cifs_lock, 644 .lock = cifs_lock,
645 .fsync = cifs_fsync, 645 .fsync = cifs_fsync,
646 .flush = cifs_flush, 646 .flush = cifs_flush,
647 .sendfile = generic_file_sendfile, /* BB removeme BB */ 647 .splice_read = generic_file_splice_read,
648#ifdef CONFIG_CIFS_POSIX 648#ifdef CONFIG_CIFS_POSIX
649 .ioctl = cifs_ioctl, 649 .ioctl = cifs_ioctl,
650#endif /* CONFIG_CIFS_POSIX */ 650#endif /* CONFIG_CIFS_POSIX */
@@ -663,7 +663,7 @@ const struct file_operations cifs_file_nobrl_ops = {
663 .fsync = cifs_fsync, 663 .fsync = cifs_fsync,
664 .flush = cifs_flush, 664 .flush = cifs_flush,
665 .mmap = cifs_file_mmap, 665 .mmap = cifs_file_mmap,
666 .sendfile = generic_file_sendfile, 666 .splice_read = generic_file_splice_read,
667 .llseek = cifs_llseek, 667 .llseek = cifs_llseek,
668#ifdef CONFIG_CIFS_POSIX 668#ifdef CONFIG_CIFS_POSIX
669 .ioctl = cifs_ioctl, 669 .ioctl = cifs_ioctl,
@@ -683,7 +683,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
683 .release = cifs_close, 683 .release = cifs_close,
684 .fsync = cifs_fsync, 684 .fsync = cifs_fsync,
685 .flush = cifs_flush, 685 .flush = cifs_flush,
686 .sendfile = generic_file_sendfile, /* BB removeme BB */ 686 .splice_read = generic_file_splice_read,
687#ifdef CONFIG_CIFS_POSIX 687#ifdef CONFIG_CIFS_POSIX
688 .ioctl = cifs_ioctl, 688 .ioctl = cifs_ioctl,
689#endif /* CONFIG_CIFS_POSIX */ 689#endif /* CONFIG_CIFS_POSIX */
@@ -856,6 +856,7 @@ static int cifs_oplock_thread(void *dummyarg)
856 __u16 netfid; 856 __u16 netfid;
857 int rc; 857 int rc;
858 858
859 set_freezable();
859 do { 860 do {
860 if (try_to_freeze()) 861 if (try_to_freeze())
861 continue; 862 continue;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e93da7ad90..4af3588c1a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -364,6 +364,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
364 GFP_KERNEL); 364 GFP_KERNEL);
365 } 365 }
366 366
367 set_freezable();
367 while (!kthread_should_stop()) { 368 while (!kthread_should_stop()) {
368 if (try_to_freeze()) 369 if (try_to_freeze())
369 continue; 370 continue;
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index c21d3d09d0..893fd0aebf 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -44,9 +44,7 @@
44#include "cifsglob.h" 44#include "cifsglob.h"
45#include "cifs_debug.h" 45#include "cifs_debug.h"
46 46
47
48#ifdef CONFIG_CIFS_EXPERIMENTAL 47#ifdef CONFIG_CIFS_EXPERIMENTAL
49
50static struct dentry *cifs_get_parent(struct dentry *dentry) 48static struct dentry *cifs_get_parent(struct dentry *dentry)
51{ 49{
52 /* BB need to add code here eventually to enable export via NFSD */ 50 /* BB need to add code here eventually to enable export via NFSD */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 9ddf5ed621..898a86dde8 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -470,7 +470,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir)
470 470
471 ret = -ENOENT; 471 ret = -ENOENT;
472 if (!IS_DEADDIR(host_inode)) { 472 if (!IS_DEADDIR(host_inode)) {
473 ret = host_file->f_op->readdir(host_file, filldir, dirent); 473 ret = host_file->f_op->readdir(host_file, dirent, filldir);
474 file_accessed(host_file); 474 file_accessed(host_file);
475 } 475 }
476 } 476 }
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 5ef2b609ec..99dbe86681 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -47,8 +47,9 @@ coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *p
47} 47}
48 48
49static ssize_t 49static ssize_t
50coda_file_sendfile(struct file *coda_file, loff_t *ppos, size_t count, 50coda_file_splice_read(struct file *coda_file, loff_t *ppos,
51 read_actor_t actor, void *target) 51 struct pipe_inode_info *pipe, size_t count,
52 unsigned int flags)
52{ 53{
53 struct coda_file_info *cfi; 54 struct coda_file_info *cfi;
54 struct file *host_file; 55 struct file *host_file;
@@ -57,10 +58,10 @@ coda_file_sendfile(struct file *coda_file, loff_t *ppos, size_t count,
57 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 58 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
58 host_file = cfi->cfi_container; 59 host_file = cfi->cfi_container;
59 60
60 if (!host_file->f_op || !host_file->f_op->sendfile) 61 if (!host_file->f_op || !host_file->f_op->splice_read)
61 return -EINVAL; 62 return -EINVAL;
62 63
63 return host_file->f_op->sendfile(host_file, ppos, count, actor, target); 64 return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags);
64} 65}
65 66
66static ssize_t 67static ssize_t
@@ -295,6 +296,6 @@ const struct file_operations coda_file_operations = {
295 .flush = coda_flush, 296 .flush = coda_flush,
296 .release = coda_release, 297 .release = coda_release,
297 .fsync = coda_fsync, 298 .fsync = coda_fsync,
298 .sendfile = coda_file_sendfile, 299 .splice_read = coda_file_splice_read,
299}; 300};
300 301
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6b44cdc96f..e440a7b95d 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -63,6 +63,7 @@
63#include <linux/wireless.h> 63#include <linux/wireless.h>
64#include <linux/atalk.h> 64#include <linux/atalk.h>
65#include <linux/blktrace_api.h> 65#include <linux/blktrace_api.h>
66#include <linux/loop.h>
66 67
67#include <net/bluetooth/bluetooth.h> 68#include <net/bluetooth/bluetooth.h>
68#include <net/bluetooth/hci.h> 69#include <net/bluetooth/hci.h>
@@ -3489,6 +3490,9 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
3489 3490
3490IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) 3491IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
3491IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) 3492IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
3493
3494/* loop */
3495IGNORE_IOCTL(LOOP_CLR_FD)
3492}; 3496};
3493 3497
3494#define IOCTL_HASHSIZE 256 3498#define IOCTL_HASHSIZE 256
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 7b48c034b3..3b0185fdf9 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -29,10 +29,11 @@
29 29
30struct configfs_dirent { 30struct configfs_dirent {
31 atomic_t s_count; 31 atomic_t s_count;
32 int s_dependent_count;
32 struct list_head s_sibling; 33 struct list_head s_sibling;
33 struct list_head s_children; 34 struct list_head s_children;
34 struct list_head s_links; 35 struct list_head s_links;
35 void * s_element; 36 void * s_element;
36 int s_type; 37 int s_type;
37 umode_t s_mode; 38 umode_t s_mode;
38 struct dentry * s_dentry; 39 struct dentry * s_dentry;
@@ -41,8 +42,8 @@ struct configfs_dirent {
41 42
42#define CONFIGFS_ROOT 0x0001 43#define CONFIGFS_ROOT 0x0001
43#define CONFIGFS_DIR 0x0002 44#define CONFIGFS_DIR 0x0002
44#define CONFIGFS_ITEM_ATTR 0x0004 45#define CONFIGFS_ITEM_ATTR 0x0004
45#define CONFIGFS_ITEM_LINK 0x0020 46#define CONFIGFS_ITEM_LINK 0x0020
46#define CONFIGFS_USET_DIR 0x0040 47#define CONFIGFS_USET_DIR 0x0040
47#define CONFIGFS_USET_DEFAULT 0x0080 48#define CONFIGFS_USET_DEFAULT 0x0080
48#define CONFIGFS_USET_DROPPING 0x0100 49#define CONFIGFS_USET_DROPPING 0x0100
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5e6e37e58f..2f436d4f1d 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry)
355 /* Mark that we've taken i_mutex */ 355 /* Mark that we've taken i_mutex */
356 sd->s_type |= CONFIGFS_USET_DROPPING; 356 sd->s_type |= CONFIGFS_USET_DROPPING;
357 357
358 /*
359 * Yup, recursive. If there's a problem, blame
360 * deep nesting of default_groups
361 */
358 ret = configfs_detach_prep(sd->s_dentry); 362 ret = configfs_detach_prep(sd->s_dentry);
359 if (!ret) 363 if (!ret)
360 continue; 364 continue;
@@ -562,7 +566,7 @@ static int populate_groups(struct config_group *group)
562 566
563/* 567/*
564 * All of link_obj/unlink_obj/link_group/unlink_group require that 568 * All of link_obj/unlink_obj/link_group/unlink_group require that
565 * subsys->su_sem is held. 569 * subsys->su_mutex is held.
566 */ 570 */
567 571
568static void unlink_obj(struct config_item *item) 572static void unlink_obj(struct config_item *item)
@@ -714,6 +718,28 @@ static void configfs_detach_group(struct config_item *item)
714} 718}
715 719
716/* 720/*
721 * After the item has been detached from the filesystem view, we are
722 * ready to tear it out of the hierarchy. Notify the client before
723 * we do that so they can perform any cleanup that requires
724 * navigating the hierarchy. A client does not need to provide this
725 * callback. The subsystem semaphore MUST be held by the caller, and
726 * references must be valid for both items. It also assumes the
727 * caller has validated ci_type.
728 */
729static void client_disconnect_notify(struct config_item *parent_item,
730 struct config_item *item)
731{
732 struct config_item_type *type;
733
734 type = parent_item->ci_type;
735 BUG_ON(!type);
736
737 if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
738 type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
739 item);
740}
741
742/*
717 * Drop the initial reference from make_item()/make_group() 743 * Drop the initial reference from make_item()/make_group()
718 * This function assumes that reference is held on item 744 * This function assumes that reference is held on item
719 * and that item holds a valid reference to the parent. Also, it 745 * and that item holds a valid reference to the parent. Also, it
@@ -733,11 +759,244 @@ static void client_drop_item(struct config_item *parent_item,
733 */ 759 */
734 if (type->ct_group_ops && type->ct_group_ops->drop_item) 760 if (type->ct_group_ops && type->ct_group_ops->drop_item)
735 type->ct_group_ops->drop_item(to_config_group(parent_item), 761 type->ct_group_ops->drop_item(to_config_group(parent_item),
736 item); 762 item);
737 else 763 else
738 config_item_put(item); 764 config_item_put(item);
739} 765}
740 766
767#ifdef DEBUG
768static void configfs_dump_one(struct configfs_dirent *sd, int level)
769{
770 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
771
772#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
773 type_print(CONFIGFS_ROOT);
774 type_print(CONFIGFS_DIR);
775 type_print(CONFIGFS_ITEM_ATTR);
776 type_print(CONFIGFS_ITEM_LINK);
777 type_print(CONFIGFS_USET_DIR);
778 type_print(CONFIGFS_USET_DEFAULT);
779 type_print(CONFIGFS_USET_DROPPING);
780#undef type_print
781}
782
783static int configfs_dump(struct configfs_dirent *sd, int level)
784{
785 struct configfs_dirent *child_sd;
786 int ret = 0;
787
788 configfs_dump_one(sd, level);
789
790 if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
791 return 0;
792
793 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
794 ret = configfs_dump(child_sd, level + 2);
795 if (ret)
796 break;
797 }
798
799 return ret;
800}
801#endif
802
803
804/*
805 * configfs_depend_item() and configfs_undepend_item()
806 *
807 * WARNING: Do not call these from a configfs callback!
808 *
809 * This describes these functions and their helpers.
810 *
811 * Allow another kernel system to depend on a config_item. If this
812 * happens, the item cannot go away until the dependant can live without
813 * it. The idea is to give client modules as simple an interface as
814 * possible. When a system asks them to depend on an item, they just
815 * call configfs_depend_item(). If the item is live and the client
816 * driver is in good shape, we'll happily do the work for them.
817 *
818 * Why is the locking complex? Because configfs uses the VFS to handle
819 * all locking, but this function is called outside the normal
820 * VFS->configfs path. So it must take VFS locks to prevent the
821 * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is
822 * why you can't call these functions underneath configfs callbacks.
823 *
824 * Note, btw, that this can be called at *any* time, even when a configfs
825 * subsystem isn't registered, or when configfs is loading or unloading.
826 * Just like configfs_register_subsystem(). So we take the same
827 * precautions. We pin the filesystem. We lock each i_mutex _in_order_
828 * on our way down the tree. If we can find the target item in the
829 * configfs tree, it must be part of the subsystem tree as well, so we
830 * do not need the subsystem semaphore. Holding the i_mutex chain locks
831 * out mkdir() and rmdir(), who might be racing us.
832 */
833
834/*
835 * configfs_depend_prep()
836 *
837 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
838 * attributes. This is similar but not the same to configfs_detach_prep().
839 * Note that configfs_detach_prep() expects the parent to be locked when it
840 * is called, but we lock the parent *inside* configfs_depend_prep(). We
841 * do that so we can unlock it if we find nothing.
842 *
843 * Here we do a depth-first search of the dentry hierarchy looking for
844 * our object. We take i_mutex on each step of the way down. IT IS
845 * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch,
846 * we'll drop the i_mutex.
847 *
848 * If the target is not found, -ENOENT is bubbled up and we have released
849 * all locks. If the target was found, the locks will be cleared by
850 * configfs_depend_rollback().
851 *
852 * This adds a requirement that all config_items be unique!
853 *
854 * This is recursive because the locking traversal is tricky. There isn't
855 * much on the stack, though, so folks that need this function - be careful
856 * about your stack! Patches will be accepted to make it iterative.
857 */
858static int configfs_depend_prep(struct dentry *origin,
859 struct config_item *target)
860{
861 struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
862 int ret = 0;
863
864 BUG_ON(!origin || !sd);
865
866 /* Lock this guy on the way down */
867 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
868 if (sd->s_element == target) /* Boo-yah */
869 goto out;
870
871 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
872 if (child_sd->s_type & CONFIGFS_DIR) {
873 ret = configfs_depend_prep(child_sd->s_dentry,
874 target);
875 if (!ret)
876 goto out; /* Child path boo-yah */
877 }
878 }
879
880 /* We looped all our children and didn't find target */
881 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
882 ret = -ENOENT;
883
884out:
885 return ret;
886}
887
888/*
889 * This is ONLY called if configfs_depend_prep() did its job. So we can
890 * trust the entire path from item back up to origin.
891 *
892 * We walk backwards from item, unlocking each i_mutex. We finish by
893 * unlocking origin.
894 */
895static void configfs_depend_rollback(struct dentry *origin,
896 struct config_item *item)
897{
898 struct dentry *dentry = item->ci_dentry;
899
900 while (dentry != origin) {
901 mutex_unlock(&dentry->d_inode->i_mutex);
902 dentry = dentry->d_parent;
903 }
904
905 mutex_unlock(&origin->d_inode->i_mutex);
906}
907
908int configfs_depend_item(struct configfs_subsystem *subsys,
909 struct config_item *target)
910{
911 int ret;
912 struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
913 struct config_item *s_item = &subsys->su_group.cg_item;
914
915 /*
916 * Pin the configfs filesystem. This means we can safely access
917 * the root of the configfs filesystem.
918 */
919 ret = configfs_pin_fs();
920 if (ret)
921 return ret;
922
923 /*
924 * Next, lock the root directory. We're going to check that the
925 * subsystem is really registered, and so we need to lock out
926 * configfs_[un]register_subsystem().
927 */
928 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
929
930 root_sd = configfs_sb->s_root->d_fsdata;
931
932 list_for_each_entry(p, &root_sd->s_children, s_sibling) {
933 if (p->s_type & CONFIGFS_DIR) {
934 if (p->s_element == s_item) {
935 subsys_sd = p;
936 break;
937 }
938 }
939 }
940
941 if (!subsys_sd) {
942 ret = -ENOENT;
943 goto out_unlock_fs;
944 }
945
946 /* Ok, now we can trust subsys/s_item */
947
948 /* Scan the tree, locking i_mutex recursively, return 0 if found */
949 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
950 if (ret)
951 goto out_unlock_fs;
952
953 /* We hold all i_mutexes from the subsystem down to the target */
954 p = target->ci_dentry->d_fsdata;
955 p->s_dependent_count += 1;
956
957 configfs_depend_rollback(subsys_sd->s_dentry, target);
958
959out_unlock_fs:
960 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
961
962 /*
963 * If we succeeded, the fs is pinned via other methods. If not,
964 * we're done with it anyway. So release_fs() is always right.
965 */
966 configfs_release_fs();
967
968 return ret;
969}
970EXPORT_SYMBOL(configfs_depend_item);
971
972/*
973 * Release the dependent linkage. This is much simpler than
974 * configfs_depend_item() because we know that that the client driver is
975 * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
976 */
977void configfs_undepend_item(struct configfs_subsystem *subsys,
978 struct config_item *target)
979{
980 struct configfs_dirent *sd;
981
982 /*
983 * Since we can trust everything is pinned, we just need i_mutex
984 * on the item.
985 */
986 mutex_lock(&target->ci_dentry->d_inode->i_mutex);
987
988 sd = target->ci_dentry->d_fsdata;
989 BUG_ON(sd->s_dependent_count < 1);
990
991 sd->s_dependent_count -= 1;
992
993 /*
994 * After this unlock, we cannot trust the item to stay alive!
995 * DO NOT REFERENCE item after this unlock.
996 */
997 mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
998}
999EXPORT_SYMBOL(configfs_undepend_item);
741 1000
742static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 1001static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
743{ 1002{
@@ -783,7 +1042,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
783 1042
784 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); 1043 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
785 1044
786 down(&subsys->su_sem); 1045 mutex_lock(&subsys->su_mutex);
787 group = NULL; 1046 group = NULL;
788 item = NULL; 1047 item = NULL;
789 if (type->ct_group_ops->make_group) { 1048 if (type->ct_group_ops->make_group) {
@@ -797,7 +1056,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
797 if (item) 1056 if (item)
798 link_obj(parent_item, item); 1057 link_obj(parent_item, item);
799 } 1058 }
800 up(&subsys->su_sem); 1059 mutex_unlock(&subsys->su_mutex);
801 1060
802 kfree(name); 1061 kfree(name);
803 if (!item) { 1062 if (!item) {
@@ -841,13 +1100,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
841out_unlink: 1100out_unlink:
842 if (ret) { 1101 if (ret) {
843 /* Tear down everything we built up */ 1102 /* Tear down everything we built up */
844 down(&subsys->su_sem); 1103 mutex_lock(&subsys->su_mutex);
1104
1105 client_disconnect_notify(parent_item, item);
845 if (group) 1106 if (group)
846 unlink_group(group); 1107 unlink_group(group);
847 else 1108 else
848 unlink_obj(item); 1109 unlink_obj(item);
849 client_drop_item(parent_item, item); 1110 client_drop_item(parent_item, item);
850 up(&subsys->su_sem); 1111
1112 mutex_unlock(&subsys->su_mutex);
851 1113
852 if (module_got) 1114 if (module_got)
853 module_put(owner); 1115 module_put(owner);
@@ -881,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
881 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1143 if (sd->s_type & CONFIGFS_USET_DEFAULT)
882 return -EPERM; 1144 return -EPERM;
883 1145
1146 /*
1147 * Here's where we check for dependents. We're protected by
1148 * i_mutex.
1149 */
1150 if (sd->s_dependent_count)
1151 return -EBUSY;
1152
884 /* Get a working ref until we have the child */ 1153 /* Get a working ref until we have the child */
885 parent_item = configfs_get_config_item(dentry->d_parent); 1154 parent_item = configfs_get_config_item(dentry->d_parent);
886 subsys = to_config_group(parent_item)->cg_subsys; 1155 subsys = to_config_group(parent_item)->cg_subsys;
@@ -910,17 +1179,19 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
910 if (sd->s_type & CONFIGFS_USET_DIR) { 1179 if (sd->s_type & CONFIGFS_USET_DIR) {
911 configfs_detach_group(item); 1180 configfs_detach_group(item);
912 1181
913 down(&subsys->su_sem); 1182 mutex_lock(&subsys->su_mutex);
1183 client_disconnect_notify(parent_item, item);
914 unlink_group(to_config_group(item)); 1184 unlink_group(to_config_group(item));
915 } else { 1185 } else {
916 configfs_detach_item(item); 1186 configfs_detach_item(item);
917 1187
918 down(&subsys->su_sem); 1188 mutex_lock(&subsys->su_mutex);
1189 client_disconnect_notify(parent_item, item);
919 unlink_obj(item); 1190 unlink_obj(item);
920 } 1191 }
921 1192
922 client_drop_item(parent_item, item); 1193 client_drop_item(parent_item, item);
923 up(&subsys->su_sem); 1194 mutex_unlock(&subsys->su_mutex);
924 1195
925 /* Drop our reference from above */ 1196 /* Drop our reference from above */
926 config_item_put(item); 1197 config_item_put(item);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 3527c7c6de..a3658f9a08 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -27,19 +27,26 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/mutex.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/semaphore.h>
32 32
33#include <linux/configfs.h> 33#include <linux/configfs.h>
34#include "configfs_internal.h" 34#include "configfs_internal.h"
35 35
36/*
37 * A simple attribute can only be 4096 characters. Why 4k? Because the
38 * original code limited it to PAGE_SIZE. That's a bad idea, though,
39 * because an attribute of 16k on ia64 won't work on x86. So we limit to
40 * 4k, our minimum common page size.
41 */
42#define SIMPLE_ATTR_SIZE 4096
36 43
37struct configfs_buffer { 44struct configfs_buffer {
38 size_t count; 45 size_t count;
39 loff_t pos; 46 loff_t pos;
40 char * page; 47 char * page;
41 struct configfs_item_operations * ops; 48 struct configfs_item_operations * ops;
42 struct semaphore sem; 49 struct mutex mutex;
43 int needs_read_fill; 50 int needs_read_fill;
44}; 51};
45 52
@@ -69,7 +76,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
69 76
70 count = ops->show_attribute(item,attr,buffer->page); 77 count = ops->show_attribute(item,attr,buffer->page);
71 buffer->needs_read_fill = 0; 78 buffer->needs_read_fill = 0;
72 BUG_ON(count > (ssize_t)PAGE_SIZE); 79 BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
73 if (count >= 0) 80 if (count >= 0)
74 buffer->count = count; 81 buffer->count = count;
75 else 82 else
@@ -102,7 +109,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
102 struct configfs_buffer * buffer = file->private_data; 109 struct configfs_buffer * buffer = file->private_data;
103 ssize_t retval = 0; 110 ssize_t retval = 0;
104 111
105 down(&buffer->sem); 112 mutex_lock(&buffer->mutex);
106 if (buffer->needs_read_fill) { 113 if (buffer->needs_read_fill) {
107 if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) 114 if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
108 goto out; 115 goto out;
@@ -112,7 +119,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
112 retval = simple_read_from_buffer(buf, count, ppos, buffer->page, 119 retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
113 buffer->count); 120 buffer->count);
114out: 121out:
115 up(&buffer->sem); 122 mutex_unlock(&buffer->mutex);
116 return retval; 123 return retval;
117} 124}
118 125
@@ -137,8 +144,8 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
137 if (!buffer->page) 144 if (!buffer->page)
138 return -ENOMEM; 145 return -ENOMEM;
139 146
140 if (count >= PAGE_SIZE) 147 if (count >= SIMPLE_ATTR_SIZE)
141 count = PAGE_SIZE - 1; 148 count = SIMPLE_ATTR_SIZE - 1;
142 error = copy_from_user(buffer->page,buf,count); 149 error = copy_from_user(buffer->page,buf,count);
143 buffer->needs_read_fill = 1; 150 buffer->needs_read_fill = 1;
144 /* if buf is assumed to contain a string, terminate it by \0, 151 /* if buf is assumed to contain a string, terminate it by \0,
@@ -193,13 +200,13 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
193 struct configfs_buffer * buffer = file->private_data; 200 struct configfs_buffer * buffer = file->private_data;
194 ssize_t len; 201 ssize_t len;
195 202
196 down(&buffer->sem); 203 mutex_lock(&buffer->mutex);
197 len = fill_write_buffer(buffer, buf, count); 204 len = fill_write_buffer(buffer, buf, count);
198 if (len > 0) 205 if (len > 0)
199 len = flush_write_buffer(file->f_path.dentry, buffer, count); 206 len = flush_write_buffer(file->f_path.dentry, buffer, count);
200 if (len > 0) 207 if (len > 0)
201 *ppos += len; 208 *ppos += len;
202 up(&buffer->sem); 209 mutex_unlock(&buffer->mutex);
203 return len; 210 return len;
204} 211}
205 212
@@ -253,7 +260,7 @@ static int check_perm(struct inode * inode, struct file * file)
253 error = -ENOMEM; 260 error = -ENOMEM;
254 goto Enomem; 261 goto Enomem;
255 } 262 }
256 init_MUTEX(&buffer->sem); 263 mutex_init(&buffer->mutex);
257 buffer->needs_read_fill = 1; 264 buffer->needs_read_fill = 1;
258 buffer->ops = ops; 265 buffer->ops = ops;
259 file->private_data = buffer; 266 file->private_data = buffer;
@@ -292,6 +299,7 @@ static int configfs_release(struct inode * inode, struct file * filp)
292 if (buffer) { 299 if (buffer) {
293 if (buffer->page) 300 if (buffer->page)
294 free_page((unsigned long)buffer->page); 301 free_page((unsigned long)buffer->page);
302 mutex_destroy(&buffer->mutex);
295 kfree(buffer); 303 kfree(buffer);
296 } 304 }
297 return 0; 305 return 0;
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 24421209f8..76dc4c3e5d 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -62,7 +62,6 @@ void config_item_init(struct config_item * item)
62 * dynamically allocated string that @item->ci_name points to. 62 * dynamically allocated string that @item->ci_name points to.
63 * Otherwise, use the static @item->ci_namebuf array. 63 * Otherwise, use the static @item->ci_namebuf array.
64 */ 64 */
65
66int config_item_set_name(struct config_item * item, const char * fmt, ...) 65int config_item_set_name(struct config_item * item, const char * fmt, ...)
67{ 66{
68 int error = 0; 67 int error = 0;
@@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item)
139 return item; 138 return item;
140} 139}
141 140
142/** 141static void config_item_cleanup(struct config_item * item)
143 * config_item_cleanup - free config_item resources.
144 * @item: item.
145 */
146
147void config_item_cleanup(struct config_item * item)
148{ 142{
149 struct config_item_type * t = item->ci_type; 143 struct config_item_type * t = item->ci_type;
150 struct config_group * s = item->ci_group; 144 struct config_group * s = item->ci_group;
@@ -179,39 +173,35 @@ void config_item_put(struct config_item * item)
179 kref_put(&item->ci_kref, config_item_release); 173 kref_put(&item->ci_kref, config_item_release);
180} 174}
181 175
182
183/** 176/**
184 * config_group_init - initialize a group for use 177 * config_group_init - initialize a group for use
185 * @k: group 178 * @k: group
186 */ 179 */
187
188void config_group_init(struct config_group *group) 180void config_group_init(struct config_group *group)
189{ 181{
190 config_item_init(&group->cg_item); 182 config_item_init(&group->cg_item);
191 INIT_LIST_HEAD(&group->cg_children); 183 INIT_LIST_HEAD(&group->cg_children);
192} 184}
193 185
194
195/** 186/**
196 * config_group_find_obj - search for item in group. 187 * config_group_find_item - search for item in group.
197 * @group: group we're looking in. 188 * @group: group we're looking in.
198 * @name: item's name. 189 * @name: item's name.
199 * 190 *
200 * Lock group via @group->cg_subsys, and iterate over @group->cg_list, 191 * Iterate over @group->cg_list, looking for a matching config_item.
201 * looking for a matching config_item. If matching item is found 192 * If matching item is found take a reference and return the item.
202 * take a reference and return the item. 193 * Caller must have locked group via @group->cg_subsys->su_mtx.
203 */ 194 */
204 195struct config_item *config_group_find_item(struct config_group *group,
205struct config_item * config_group_find_obj(struct config_group * group, const char * name) 196 const char *name)
206{ 197{
207 struct list_head * entry; 198 struct list_head * entry;
208 struct config_item * ret = NULL; 199 struct config_item * ret = NULL;
209 200
210 /* XXX LOCKING! */
211 list_for_each(entry,&group->cg_children) { 201 list_for_each(entry,&group->cg_children) {
212 struct config_item * item = to_item(entry); 202 struct config_item * item = to_item(entry);
213 if (config_item_name(item) && 203 if (config_item_name(item) &&
214 !strcmp(config_item_name(item), name)) { 204 !strcmp(config_item_name(item), name)) {
215 ret = config_item_get(item); 205 ret = config_item_get(item);
216 break; 206 break;
217 } 207 }
@@ -219,9 +209,8 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch
219 return ret; 209 return ret;
220} 210}
221 211
222
223EXPORT_SYMBOL(config_item_init); 212EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init); 213EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get); 214EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put); 215EXPORT_SYMBOL(config_item_put);
227EXPORT_SYMBOL(config_group_find_obj); 216EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/dcache.c b/fs/dcache.c
index 0e73aa0a0e..cb9d05056b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -883,6 +883,11 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
883 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 883 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
884} 884}
885 885
886static struct shrinker dcache_shrinker = {
887 .shrink = shrink_dcache_memory,
888 .seeks = DEFAULT_SEEKS,
889};
890
886/** 891/**
887 * d_alloc - allocate a dcache entry 892 * d_alloc - allocate a dcache entry
888 * @parent: parent of entry to allocate 893 * @parent: parent of entry to allocate
@@ -2115,7 +2120,7 @@ static void __init dcache_init(unsigned long mempages)
2115 dentry_cache = KMEM_CACHE(dentry, 2120 dentry_cache = KMEM_CACHE(dentry,
2116 SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); 2121 SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
2117 2122
2118 set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); 2123 register_shrinker(&dcache_shrinker);
2119 2124
2120 /* Hash may have been set up in dcache_init_early */ 2125 /* Hash may have been set up in dcache_init_early */
2121 if (!hashdist) 2126 if (!hashdist)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index ec8896b264..1d533a2ec3 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -368,6 +368,69 @@ void debugfs_remove(struct dentry *dentry)
368} 368}
369EXPORT_SYMBOL_GPL(debugfs_remove); 369EXPORT_SYMBOL_GPL(debugfs_remove);
370 370
371/**
372 * debugfs_rename - rename a file/directory in the debugfs filesystem
373 * @old_dir: a pointer to the parent dentry for the renamed object. This
374 * should be a directory dentry.
375 * @old_dentry: dentry of an object to be renamed.
376 * @new_dir: a pointer to the parent dentry where the object should be
377 * moved. This should be a directory dentry.
378 * @new_name: a pointer to a string containing the target name.
379 *
380 * This function renames a file/directory in debugfs. The target must not
381 * exist for rename to succeed.
382 *
383 * This function will return a pointer to old_dentry (which is updated to
384 * reflect renaming) if it succeeds. If an error occurs, %NULL will be
385 * returned.
386 *
387 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
388 * returned.
389 */
390struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
391 struct dentry *new_dir, const char *new_name)
392{
393 int error;
394 struct dentry *dentry = NULL, *trap;
395 const char *old_name;
396
397 trap = lock_rename(new_dir, old_dir);
398 /* Source or destination directories don't exist? */
399 if (!old_dir->d_inode || !new_dir->d_inode)
400 goto exit;
401 /* Source does not exist, cyclic rename, or mountpoint? */
402 if (!old_dentry->d_inode || old_dentry == trap ||
403 d_mountpoint(old_dentry))
404 goto exit;
405 dentry = lookup_one_len(new_name, new_dir, strlen(new_name));
406 /* Lookup failed, cyclic rename or target exists? */
407 if (IS_ERR(dentry) || dentry == trap || dentry->d_inode)
408 goto exit;
409
410 old_name = fsnotify_oldname_init(old_dentry->d_name.name);
411
412 error = simple_rename(old_dir->d_inode, old_dentry, new_dir->d_inode,
413 dentry);
414 if (error) {
415 fsnotify_oldname_free(old_name);
416 goto exit;
417 }
418 d_move(old_dentry, dentry);
419 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
420 old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode),
421 NULL, old_dentry->d_inode);
422 fsnotify_oldname_free(old_name);
423 unlock_rename(new_dir, old_dir);
424 dput(dentry);
425 return old_dentry;
426exit:
427 if (dentry && !IS_ERR(dentry))
428 dput(dentry);
429 unlock_rename(new_dir, old_dir);
430 return NULL;
431}
432EXPORT_SYMBOL_GPL(debugfs_rename);
433
371static decl_subsys(debug, NULL, NULL); 434static decl_subsys(debug, NULL, NULL);
372 435
373static int __init debugfs_init(void) 436static int __init debugfs_init(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8593f3dfd2..52bb2638f7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1106,7 +1106,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1106 spin_lock_irqsave(&dio->bio_lock, flags); 1106 spin_lock_irqsave(&dio->bio_lock, flags);
1107 ret2 = --dio->refcount; 1107 ret2 = --dio->refcount;
1108 spin_unlock_irqrestore(&dio->bio_lock, flags); 1108 spin_unlock_irqrestore(&dio->bio_lock, flags);
1109 BUG_ON(!dio->is_async && ret2 != 0); 1109
1110 if (ret2 == 0) { 1110 if (ret2 == 0) {
1111 ret = dio_complete(dio, offset, ret); 1111 ret = dio_complete(dio, offset, ret);
1112 kfree(dio); 1112 kfree(dio);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 69a94690e4..54bcc00ec8 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,7 +3,7 @@ menu "Distributed Lock Manager"
3 3
4config DLM 4config DLM
5 tristate "Distributed Lock Manager (DLM)" 5 tristate "Distributed Lock Manager (DLM)"
6 depends on IPV6 || IPV6=n 6 depends on SYSFS && (IPV6 || IPV6=n)
7 select CONFIGFS_FS 7 select CONFIGFS_FS
8 select IP_SCTP 8 select IP_SCTP
9 help 9 help
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 604cf7dc5f..d248e60951 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -8,6 +8,7 @@ dlm-y := ast.o \
8 member.o \ 8 member.o \
9 memory.o \ 9 memory.o \
10 midcomms.o \ 10 midcomms.o \
11 netlink.o \
11 lowcomms.o \ 12 lowcomms.o \
12 rcom.o \ 13 rcom.o \
13 recover.o \ 14 recover.o \
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 822abdcd14..2f8e3c81bc 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -90,6 +90,7 @@ struct cluster {
90 unsigned int cl_scan_secs; 90 unsigned int cl_scan_secs;
91 unsigned int cl_log_debug; 91 unsigned int cl_log_debug;
92 unsigned int cl_protocol; 92 unsigned int cl_protocol;
93 unsigned int cl_timewarn_cs;
93}; 94};
94 95
95enum { 96enum {
@@ -103,6 +104,7 @@ enum {
103 CLUSTER_ATTR_SCAN_SECS, 104 CLUSTER_ATTR_SCAN_SECS,
104 CLUSTER_ATTR_LOG_DEBUG, 105 CLUSTER_ATTR_LOG_DEBUG,
105 CLUSTER_ATTR_PROTOCOL, 106 CLUSTER_ATTR_PROTOCOL,
107 CLUSTER_ATTR_TIMEWARN_CS,
106}; 108};
107 109
108struct cluster_attribute { 110struct cluster_attribute {
@@ -131,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
131 return len; 133 return len;
132} 134}
133 135
134#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \
135 .attr = { .ca_name = __stringify(_name), \
136 .ca_mode = _mode, \
137 .ca_owner = THIS_MODULE }, \
138 .show = _read, \
139 .store = _write, \
140}
141
142#define CLUSTER_ATTR(name, check_zero) \ 136#define CLUSTER_ATTR(name, check_zero) \
143static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ 137static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
144{ \ 138{ \
@@ -162,6 +156,7 @@ CLUSTER_ATTR(toss_secs, 1);
162CLUSTER_ATTR(scan_secs, 1); 156CLUSTER_ATTR(scan_secs, 1);
163CLUSTER_ATTR(log_debug, 0); 157CLUSTER_ATTR(log_debug, 0);
164CLUSTER_ATTR(protocol, 0); 158CLUSTER_ATTR(protocol, 0);
159CLUSTER_ATTR(timewarn_cs, 1);
165 160
166static struct configfs_attribute *cluster_attrs[] = { 161static struct configfs_attribute *cluster_attrs[] = {
167 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, 162 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -174,6 +169,7 @@ static struct configfs_attribute *cluster_attrs[] = {
174 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, 169 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
175 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, 170 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
176 [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, 171 [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
172 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
177 NULL, 173 NULL,
178}; 174};
179 175
@@ -429,6 +425,8 @@ static struct config_group *make_cluster(struct config_group *g,
429 cl->cl_toss_secs = dlm_config.ci_toss_secs; 425 cl->cl_toss_secs = dlm_config.ci_toss_secs;
430 cl->cl_scan_secs = dlm_config.ci_scan_secs; 426 cl->cl_scan_secs = dlm_config.ci_scan_secs;
431 cl->cl_log_debug = dlm_config.ci_log_debug; 427 cl->cl_log_debug = dlm_config.ci_log_debug;
428 cl->cl_protocol = dlm_config.ci_protocol;
429 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
432 430
433 space_list = &sps->ss_group; 431 space_list = &sps->ss_group;
434 comm_list = &cms->cs_group; 432 comm_list = &cms->cs_group;
@@ -609,7 +607,7 @@ static struct clusters clusters_root = {
609int dlm_config_init(void) 607int dlm_config_init(void)
610{ 608{
611 config_group_init(&clusters_root.subsys.su_group); 609 config_group_init(&clusters_root.subsys.su_group);
612 init_MUTEX(&clusters_root.subsys.su_sem); 610 mutex_init(&clusters_root.subsys.su_mutex);
613 return configfs_register_subsystem(&clusters_root.subsys); 611 return configfs_register_subsystem(&clusters_root.subsys);
614} 612}
615 613
@@ -748,9 +746,16 @@ static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
748 746
749static struct space *get_space(char *name) 747static struct space *get_space(char *name)
750{ 748{
749 struct config_item *i;
750
751 if (!space_list) 751 if (!space_list)
752 return NULL; 752 return NULL;
753 return to_space(config_group_find_obj(space_list, name)); 753
754 mutex_lock(&space_list->cg_subsys->su_mutex);
755 i = config_group_find_item(space_list, name);
756 mutex_unlock(&space_list->cg_subsys->su_mutex);
757
758 return to_space(i);
754} 759}
755 760
756static void put_space(struct space *sp) 761static void put_space(struct space *sp)
@@ -767,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
767 if (!comm_list) 772 if (!comm_list)
768 return NULL; 773 return NULL;
769 774
770 down(&clusters_root.subsys.su_sem); 775 mutex_lock(&clusters_root.subsys.su_mutex);
771 776
772 list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 777 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
773 cm = to_comm(i); 778 cm = to_comm(i);
@@ -776,20 +781,20 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
776 if (cm->nodeid != nodeid) 781 if (cm->nodeid != nodeid)
777 continue; 782 continue;
778 found = 1; 783 found = 1;
784 config_item_get(i);
779 break; 785 break;
780 } else { 786 } else {
781 if (!cm->addr_count || 787 if (!cm->addr_count ||
782 memcmp(cm->addr[0], addr, sizeof(*addr))) 788 memcmp(cm->addr[0], addr, sizeof(*addr)))
783 continue; 789 continue;
784 found = 1; 790 found = 1;
791 config_item_get(i);
785 break; 792 break;
786 } 793 }
787 } 794 }
788 up(&clusters_root.subsys.su_sem); 795 mutex_unlock(&clusters_root.subsys.su_mutex);
789 796
790 if (found) 797 if (!found)
791 config_item_get(i);
792 else
793 cm = NULL; 798 cm = NULL;
794 return cm; 799 return cm;
795} 800}
@@ -909,6 +914,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
909#define DEFAULT_SCAN_SECS 5 914#define DEFAULT_SCAN_SECS 5
910#define DEFAULT_LOG_DEBUG 0 915#define DEFAULT_LOG_DEBUG 0
911#define DEFAULT_PROTOCOL 0 916#define DEFAULT_PROTOCOL 0
917#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
912 918
913struct dlm_config_info dlm_config = { 919struct dlm_config_info dlm_config = {
914 .ci_tcp_port = DEFAULT_TCP_PORT, 920 .ci_tcp_port = DEFAULT_TCP_PORT,
@@ -920,6 +926,7 @@ struct dlm_config_info dlm_config = {
920 .ci_toss_secs = DEFAULT_TOSS_SECS, 926 .ci_toss_secs = DEFAULT_TOSS_SECS,
921 .ci_scan_secs = DEFAULT_SCAN_SECS, 927 .ci_scan_secs = DEFAULT_SCAN_SECS,
922 .ci_log_debug = DEFAULT_LOG_DEBUG, 928 .ci_log_debug = DEFAULT_LOG_DEBUG,
923 .ci_protocol = DEFAULT_PROTOCOL 929 .ci_protocol = DEFAULT_PROTOCOL,
930 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS
924}; 931};
925 932
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 967cc3d72e..a3170fe220 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -27,6 +27,7 @@ struct dlm_config_info {
27 int ci_scan_secs; 27 int ci_scan_secs;
28 int ci_log_debug; 28 int ci_log_debug;
29 int ci_protocol; 29 int ci_protocol;
30 int ci_timewarn_cs;
30}; 31};
31 32
32extern struct dlm_config_info dlm_config; 33extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 61ba670b9e..12c3bfd5e6 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -17,6 +17,7 @@
17#include <linux/debugfs.h> 17#include <linux/debugfs.h>
18 18
19#include "dlm_internal.h" 19#include "dlm_internal.h"
20#include "lock.h"
20 21
21#define DLM_DEBUG_BUF_LEN 4096 22#define DLM_DEBUG_BUF_LEN 4096
22static char debug_buf[DLM_DEBUG_BUF_LEN]; 23static char debug_buf[DLM_DEBUG_BUF_LEN];
@@ -26,6 +27,8 @@ static struct dentry *dlm_root;
26 27
27struct rsb_iter { 28struct rsb_iter {
28 int entry; 29 int entry;
30 int locks;
31 int header;
29 struct dlm_ls *ls; 32 struct dlm_ls *ls;
30 struct list_head *next; 33 struct list_head *next;
31 struct dlm_rsb *rsb; 34 struct dlm_rsb *rsb;
@@ -57,8 +60,8 @@ static char *print_lockmode(int mode)
57 } 60 }
58} 61}
59 62
60static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, 63static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
61 struct dlm_rsb *res) 64 struct dlm_rsb *res)
62{ 65{
63 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); 66 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
64 67
@@ -85,6 +88,8 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
85 struct dlm_lkb *lkb; 88 struct dlm_lkb *lkb;
86 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; 89 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
87 90
91 lock_rsb(res);
92
88 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length); 93 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
89 for (i = 0; i < res->res_length; i++) { 94 for (i = 0; i < res->res_length; i++) {
90 if (isprint(res->res_name[i])) 95 if (isprint(res->res_name[i]))
@@ -129,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
129 /* Print the locks attached to this resource */ 134 /* Print the locks attached to this resource */
130 seq_printf(s, "Granted Queue\n"); 135 seq_printf(s, "Granted Queue\n");
131 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) 136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
132 print_lock(s, lkb, res); 137 print_resource_lock(s, lkb, res);
133 138
134 seq_printf(s, "Conversion Queue\n"); 139 seq_printf(s, "Conversion Queue\n");
135 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) 140 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
136 print_lock(s, lkb, res); 141 print_resource_lock(s, lkb, res);
137 142
138 seq_printf(s, "Waiting Queue\n"); 143 seq_printf(s, "Waiting Queue\n");
139 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) 144 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
140 print_lock(s, lkb, res); 145 print_resource_lock(s, lkb, res);
141 146
142 if (list_empty(&res->res_lookup)) 147 if (list_empty(&res->res_lookup))
143 goto out; 148 goto out;
@@ -151,6 +156,61 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
151 seq_printf(s, "\n"); 156 seq_printf(s, "\n");
152 } 157 }
153 out: 158 out:
159 unlock_rsb(res);
160 return 0;
161}
162
163static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r)
164{
165 struct dlm_user_args *ua;
166 unsigned int waiting = 0;
167 uint64_t xid = 0;
168
169 if (lkb->lkb_flags & DLM_IFL_USER) {
170 ua = (struct dlm_user_args *) lkb->lkb_astparam;
171 if (ua)
172 xid = ua->xid;
173 }
174
175 if (lkb->lkb_timestamp)
176 waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp);
177
178 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms
179 r_nodeid r_len r_name */
180
181 seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n",
182 lkb->lkb_id,
183 lkb->lkb_nodeid,
184 lkb->lkb_remid,
185 lkb->lkb_ownpid,
186 (unsigned long long)xid,
187 lkb->lkb_exflags,
188 lkb->lkb_flags,
189 lkb->lkb_status,
190 lkb->lkb_grmode,
191 lkb->lkb_rqmode,
192 waiting,
193 r->res_nodeid,
194 r->res_length,
195 r->res_name);
196}
197
198static int print_locks(struct dlm_rsb *r, struct seq_file *s)
199{
200 struct dlm_lkb *lkb;
201
202 lock_rsb(r);
203
204 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
205 print_lock(s, lkb, r);
206
207 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
208 print_lock(s, lkb, r);
209
210 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
211 print_lock(s, lkb, r);
212
213 unlock_rsb(r);
154 return 0; 214 return 0;
155} 215}
156 216
@@ -166,6 +226,9 @@ static int rsb_iter_next(struct rsb_iter *ri)
166 read_lock(&ls->ls_rsbtbl[i].lock); 226 read_lock(&ls->ls_rsbtbl[i].lock);
167 if (!list_empty(&ls->ls_rsbtbl[i].list)) { 227 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
168 ri->next = ls->ls_rsbtbl[i].list.next; 228 ri->next = ls->ls_rsbtbl[i].list.next;
229 ri->rsb = list_entry(ri->next, struct dlm_rsb,
230 res_hashchain);
231 dlm_hold_rsb(ri->rsb);
169 read_unlock(&ls->ls_rsbtbl[i].lock); 232 read_unlock(&ls->ls_rsbtbl[i].lock);
170 break; 233 break;
171 } 234 }
@@ -176,6 +239,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
176 if (ri->entry >= ls->ls_rsbtbl_size) 239 if (ri->entry >= ls->ls_rsbtbl_size)
177 return 1; 240 return 1;
178 } else { 241 } else {
242 struct dlm_rsb *old = ri->rsb;
179 i = ri->entry; 243 i = ri->entry;
180 read_lock(&ls->ls_rsbtbl[i].lock); 244 read_lock(&ls->ls_rsbtbl[i].lock);
181 ri->next = ri->next->next; 245 ri->next = ri->next->next;
@@ -184,11 +248,14 @@ static int rsb_iter_next(struct rsb_iter *ri)
184 ri->next = NULL; 248 ri->next = NULL;
185 ri->entry++; 249 ri->entry++;
186 read_unlock(&ls->ls_rsbtbl[i].lock); 250 read_unlock(&ls->ls_rsbtbl[i].lock);
251 dlm_put_rsb(old);
187 goto top; 252 goto top;
188 } 253 }
254 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
255 dlm_hold_rsb(ri->rsb);
189 read_unlock(&ls->ls_rsbtbl[i].lock); 256 read_unlock(&ls->ls_rsbtbl[i].lock);
257 dlm_put_rsb(old);
190 } 258 }
191 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
192 259
193 return 0; 260 return 0;
194} 261}
@@ -202,7 +269,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
202{ 269{
203 struct rsb_iter *ri; 270 struct rsb_iter *ri;
204 271
205 ri = kmalloc(sizeof *ri, GFP_KERNEL); 272 ri = kzalloc(sizeof *ri, GFP_KERNEL);
206 if (!ri) 273 if (!ri)
207 return NULL; 274 return NULL;
208 275
@@ -260,7 +327,17 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
260{ 327{
261 struct rsb_iter *ri = iter_ptr; 328 struct rsb_iter *ri = iter_ptr;
262 329
263 print_resource(ri->rsb, file); 330 if (ri->locks) {
331 if (ri->header) {
332 seq_printf(file, "id nodeid remid pid xid exflags flags "
333 "sts grmode rqmode time_ms r_nodeid "
334 "r_len r_name\n");
335 ri->header = 0;
336 }
337 print_locks(ri->rsb, file);
338 } else {
339 print_resource(ri->rsb, file);
340 }
264 341
265 return 0; 342 return 0;
266} 343}
@@ -296,6 +373,83 @@ static const struct file_operations rsb_fops = {
296}; 373};
297 374
298/* 375/*
376 * Dump state in compact per-lock listing
377 */
378
379static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
380{
381 struct rsb_iter *ri;
382
383 ri = kzalloc(sizeof *ri, GFP_KERNEL);
384 if (!ri)
385 return NULL;
386
387 ri->ls = ls;
388 ri->entry = 0;
389 ri->next = NULL;
390 ri->locks = 1;
391
392 if (*pos == 0)
393 ri->header = 1;
394
395 if (rsb_iter_next(ri)) {
396 rsb_iter_free(ri);
397 return NULL;
398 }
399
400 return ri;
401}
402
403static void *locks_seq_start(struct seq_file *file, loff_t *pos)
404{
405 struct rsb_iter *ri;
406 loff_t n = *pos;
407
408 ri = locks_iter_init(file->private, pos);
409 if (!ri)
410 return NULL;
411
412 while (n--) {
413 if (rsb_iter_next(ri)) {
414 rsb_iter_free(ri);
415 return NULL;
416 }
417 }
418
419 return ri;
420}
421
422static struct seq_operations locks_seq_ops = {
423 .start = locks_seq_start,
424 .next = rsb_seq_next,
425 .stop = rsb_seq_stop,
426 .show = rsb_seq_show,
427};
428
429static int locks_open(struct inode *inode, struct file *file)
430{
431 struct seq_file *seq;
432 int ret;
433
434 ret = seq_open(file, &locks_seq_ops);
435 if (ret)
436 return ret;
437
438 seq = file->private_data;
439 seq->private = inode->i_private;
440
441 return 0;
442}
443
444static const struct file_operations locks_fops = {
445 .owner = THIS_MODULE,
446 .open = locks_open,
447 .read = seq_read,
448 .llseek = seq_lseek,
449 .release = seq_release
450};
451
452/*
299 * dump lkb's on the ls_waiters list 453 * dump lkb's on the ls_waiters list
300 */ 454 */
301 455
@@ -362,6 +516,20 @@ int dlm_create_debug_file(struct dlm_ls *ls)
362 return -ENOMEM; 516 return -ENOMEM;
363 } 517 }
364 518
519 memset(name, 0, sizeof(name));
520 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
521
522 ls->ls_debug_locks_dentry = debugfs_create_file(name,
523 S_IFREG | S_IRUGO,
524 dlm_root,
525 ls,
526 &locks_fops);
527 if (!ls->ls_debug_locks_dentry) {
528 debugfs_remove(ls->ls_debug_waiters_dentry);
529 debugfs_remove(ls->ls_debug_rsb_dentry);
530 return -ENOMEM;
531 }
532
365 return 0; 533 return 0;
366} 534}
367 535
@@ -371,6 +539,8 @@ void dlm_delete_debug_file(struct dlm_ls *ls)
371 debugfs_remove(ls->ls_debug_rsb_dentry); 539 debugfs_remove(ls->ls_debug_rsb_dentry);
372 if (ls->ls_debug_waiters_dentry) 540 if (ls->ls_debug_waiters_dentry)
373 debugfs_remove(ls->ls_debug_waiters_dentry); 541 debugfs_remove(ls->ls_debug_waiters_dentry);
542 if (ls->ls_debug_locks_dentry)
543 debugfs_remove(ls->ls_debug_locks_dentry);
374} 544}
375 545
376int dlm_register_debugfs(void) 546int dlm_register_debugfs(void)
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 30994d68f6..74901e981e 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -151,6 +151,7 @@ struct dlm_args {
151 void *bastaddr; 151 void *bastaddr;
152 int mode; 152 int mode;
153 struct dlm_lksb *lksb; 153 struct dlm_lksb *lksb;
154 unsigned long timeout;
154}; 155};
155 156
156 157
@@ -213,6 +214,9 @@ struct dlm_args {
213#define DLM_IFL_OVERLAP_UNLOCK 0x00080000 214#define DLM_IFL_OVERLAP_UNLOCK 0x00080000
214#define DLM_IFL_OVERLAP_CANCEL 0x00100000 215#define DLM_IFL_OVERLAP_CANCEL 0x00100000
215#define DLM_IFL_ENDOFLIFE 0x00200000 216#define DLM_IFL_ENDOFLIFE 0x00200000
217#define DLM_IFL_WATCH_TIMEWARN 0x00400000
218#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
219#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
216#define DLM_IFL_USER 0x00000001 220#define DLM_IFL_USER 0x00000001
217#define DLM_IFL_ORPHAN 0x00000002 221#define DLM_IFL_ORPHAN 0x00000002
218 222
@@ -243,6 +247,9 @@ struct dlm_lkb {
243 struct list_head lkb_wait_reply; /* waiting for remote reply */ 247 struct list_head lkb_wait_reply; /* waiting for remote reply */
244 struct list_head lkb_astqueue; /* need ast to be sent */ 248 struct list_head lkb_astqueue; /* need ast to be sent */
245 struct list_head lkb_ownqueue; /* list of locks for a process */ 249 struct list_head lkb_ownqueue; /* list of locks for a process */
250 struct list_head lkb_time_list;
251 unsigned long lkb_timestamp;
252 unsigned long lkb_timeout_cs;
246 253
247 char *lkb_lvbptr; 254 char *lkb_lvbptr;
248 struct dlm_lksb *lkb_lksb; /* caller's status block */ 255 struct dlm_lksb *lkb_lksb; /* caller's status block */
@@ -447,12 +454,16 @@ struct dlm_ls {
447 struct mutex ls_orphans_mutex; 454 struct mutex ls_orphans_mutex;
448 struct list_head ls_orphans; 455 struct list_head ls_orphans;
449 456
457 struct mutex ls_timeout_mutex;
458 struct list_head ls_timeout;
459
450 struct list_head ls_nodes; /* current nodes in ls */ 460 struct list_head ls_nodes; /* current nodes in ls */
451 struct list_head ls_nodes_gone; /* dead node list, recovery */ 461 struct list_head ls_nodes_gone; /* dead node list, recovery */
452 int ls_num_nodes; /* number of nodes in ls */ 462 int ls_num_nodes; /* number of nodes in ls */
453 int ls_low_nodeid; 463 int ls_low_nodeid;
454 int ls_total_weight; 464 int ls_total_weight;
455 int *ls_node_array; 465 int *ls_node_array;
466 gfp_t ls_allocation;
456 467
457 struct dlm_rsb ls_stub_rsb; /* for returning errors */ 468 struct dlm_rsb ls_stub_rsb; /* for returning errors */
458 struct dlm_lkb ls_stub_lkb; /* for returning errors */ 469 struct dlm_lkb ls_stub_lkb; /* for returning errors */
@@ -460,9 +471,12 @@ struct dlm_ls {
460 471
461 struct dentry *ls_debug_rsb_dentry; /* debugfs */ 472 struct dentry *ls_debug_rsb_dentry; /* debugfs */
462 struct dentry *ls_debug_waiters_dentry; /* debugfs */ 473 struct dentry *ls_debug_waiters_dentry; /* debugfs */
474 struct dentry *ls_debug_locks_dentry; /* debugfs */
463 475
464 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ 476 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
465 int ls_uevent_result; 477 int ls_uevent_result;
478 struct completion ls_members_done;
479 int ls_members_result;
466 480
467 struct miscdevice ls_device; 481 struct miscdevice ls_device;
468 482
@@ -472,6 +486,7 @@ struct dlm_ls {
472 struct task_struct *ls_recoverd_task; 486 struct task_struct *ls_recoverd_task;
473 struct mutex ls_recoverd_active; 487 struct mutex ls_recoverd_active;
474 spinlock_t ls_recover_lock; 488 spinlock_t ls_recover_lock;
489 unsigned long ls_recover_begin; /* jiffies timestamp */
475 uint32_t ls_recover_status; /* DLM_RS_ */ 490 uint32_t ls_recover_status; /* DLM_RS_ */
476 uint64_t ls_recover_seq; 491 uint64_t ls_recover_seq;
477 struct dlm_recover *ls_recover_args; 492 struct dlm_recover *ls_recover_args;
@@ -501,6 +516,7 @@ struct dlm_ls {
501#define LSFL_RCOM_READY 3 516#define LSFL_RCOM_READY 3
502#define LSFL_RCOM_WAIT 4 517#define LSFL_RCOM_WAIT 4
503#define LSFL_UEVENT_WAIT 5 518#define LSFL_UEVENT_WAIT 5
519#define LSFL_TIMEWARN 6
504 520
505/* much of this is just saving user space pointers associated with the 521/* much of this is just saving user space pointers associated with the
506 lock that we pass back to the user lib with an ast */ 522 lock that we pass back to the user lib with an ast */
@@ -518,6 +534,7 @@ struct dlm_user_args {
518 void __user *castaddr; 534 void __user *castaddr;
519 void __user *bastparam; 535 void __user *bastparam;
520 void __user *bastaddr; 536 void __user *bastaddr;
537 uint64_t xid;
521}; 538};
522 539
523#define DLM_PROC_FLAGS_CLOSING 1 540#define DLM_PROC_FLAGS_CLOSING 1
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index d8d6e729f9..b455919c19 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -82,10 +82,13 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
82static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb); 82static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static int send_remove(struct dlm_rsb *r); 83static int send_remove(struct dlm_rsb *r);
84static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); 84static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, 86static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms); 87 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms); 88static int receive_extralen(struct dlm_message *ms);
88static void do_purge(struct dlm_ls *ls, int nodeid, int pid); 89static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
90static void del_timeout(struct dlm_lkb *lkb);
91void dlm_timeout_warn(struct dlm_lkb *lkb);
89 92
90/* 93/*
91 * Lock compatibilty matrix - thanks Steve 94 * Lock compatibilty matrix - thanks Steve
@@ -194,17 +197,17 @@ void dlm_dump_rsb(struct dlm_rsb *r)
194 197
195/* Threads cannot use the lockspace while it's being recovered */ 198/* Threads cannot use the lockspace while it's being recovered */
196 199
197static inline void lock_recovery(struct dlm_ls *ls) 200static inline void dlm_lock_recovery(struct dlm_ls *ls)
198{ 201{
199 down_read(&ls->ls_in_recovery); 202 down_read(&ls->ls_in_recovery);
200} 203}
201 204
202static inline void unlock_recovery(struct dlm_ls *ls) 205void dlm_unlock_recovery(struct dlm_ls *ls)
203{ 206{
204 up_read(&ls->ls_in_recovery); 207 up_read(&ls->ls_in_recovery);
205} 208}
206 209
207static inline int lock_recovery_try(struct dlm_ls *ls) 210int dlm_lock_recovery_try(struct dlm_ls *ls)
208{ 211{
209 return down_read_trylock(&ls->ls_in_recovery); 212 return down_read_trylock(&ls->ls_in_recovery);
210} 213}
@@ -286,8 +289,22 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
286 if (is_master_copy(lkb)) 289 if (is_master_copy(lkb))
287 return; 290 return;
288 291
292 del_timeout(lkb);
293
289 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); 294 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
290 295
296 /* if the operation was a cancel, then return -DLM_ECANCEL, if a
297 timeout caused the cancel then return -ETIMEDOUT */
298 if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) {
299 lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL;
300 rv = -ETIMEDOUT;
301 }
302
303 if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) {
304 lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL;
305 rv = -EDEADLK;
306 }
307
291 lkb->lkb_lksb->sb_status = rv; 308 lkb->lkb_lksb->sb_status = rv;
292 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; 309 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
293 310
@@ -581,6 +598,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
581 kref_init(&lkb->lkb_ref); 598 kref_init(&lkb->lkb_ref);
582 INIT_LIST_HEAD(&lkb->lkb_ownqueue); 599 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
583 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); 600 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
601 INIT_LIST_HEAD(&lkb->lkb_time_list);
584 602
585 get_random_bytes(&bucket, sizeof(bucket)); 603 get_random_bytes(&bucket, sizeof(bucket));
586 bucket &= (ls->ls_lkbtbl_size - 1); 604 bucket &= (ls->ls_lkbtbl_size - 1);
@@ -985,15 +1003,136 @@ void dlm_scan_rsbs(struct dlm_ls *ls)
985{ 1003{
986 int i; 1004 int i;
987 1005
988 if (dlm_locking_stopped(ls))
989 return;
990
991 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 1006 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
992 shrink_bucket(ls, i); 1007 shrink_bucket(ls, i);
1008 if (dlm_locking_stopped(ls))
1009 break;
993 cond_resched(); 1010 cond_resched();
994 } 1011 }
995} 1012}
996 1013
1014static void add_timeout(struct dlm_lkb *lkb)
1015{
1016 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1017
1018 if (is_master_copy(lkb)) {
1019 lkb->lkb_timestamp = jiffies;
1020 return;
1021 }
1022
1023 if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
1024 !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
1025 lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN;
1026 goto add_it;
1027 }
1028 if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
1029 goto add_it;
1030 return;
1031
1032 add_it:
1033 DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
1034 mutex_lock(&ls->ls_timeout_mutex);
1035 hold_lkb(lkb);
1036 lkb->lkb_timestamp = jiffies;
1037 list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
1038 mutex_unlock(&ls->ls_timeout_mutex);
1039}
1040
1041static void del_timeout(struct dlm_lkb *lkb)
1042{
1043 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1044
1045 mutex_lock(&ls->ls_timeout_mutex);
1046 if (!list_empty(&lkb->lkb_time_list)) {
1047 list_del_init(&lkb->lkb_time_list);
1048 unhold_lkb(lkb);
1049 }
1050 mutex_unlock(&ls->ls_timeout_mutex);
1051}
1052
1053/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and
1054 lkb_lksb_timeout without lock_rsb? Note: we can't lock timeout_mutex
1055 and then lock rsb because of lock ordering in add_timeout. We may need
1056 to specify some special timeout-related bits in the lkb that are just to
1057 be accessed under the timeout_mutex. */
1058
1059void dlm_scan_timeout(struct dlm_ls *ls)
1060{
1061 struct dlm_rsb *r;
1062 struct dlm_lkb *lkb;
1063 int do_cancel, do_warn;
1064
1065 for (;;) {
1066 if (dlm_locking_stopped(ls))
1067 break;
1068
1069 do_cancel = 0;
1070 do_warn = 0;
1071 mutex_lock(&ls->ls_timeout_mutex);
1072 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
1073
1074 if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
1075 time_after_eq(jiffies, lkb->lkb_timestamp +
1076 lkb->lkb_timeout_cs * HZ/100))
1077 do_cancel = 1;
1078
1079 if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
1080 time_after_eq(jiffies, lkb->lkb_timestamp +
1081 dlm_config.ci_timewarn_cs * HZ/100))
1082 do_warn = 1;
1083
1084 if (!do_cancel && !do_warn)
1085 continue;
1086 hold_lkb(lkb);
1087 break;
1088 }
1089 mutex_unlock(&ls->ls_timeout_mutex);
1090
1091 if (!do_cancel && !do_warn)
1092 break;
1093
1094 r = lkb->lkb_resource;
1095 hold_rsb(r);
1096 lock_rsb(r);
1097
1098 if (do_warn) {
1099 /* clear flag so we only warn once */
1100 lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
1101 if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT))
1102 del_timeout(lkb);
1103 dlm_timeout_warn(lkb);
1104 }
1105
1106 if (do_cancel) {
1107 log_debug(ls, "timeout cancel %x node %d %s",
1108 lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
1109 lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
1110 lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL;
1111 del_timeout(lkb);
1112 _cancel_lock(r, lkb);
1113 }
1114
1115 unlock_rsb(r);
1116 unhold_rsb(r);
1117 dlm_put_lkb(lkb);
1118 }
1119}
1120
1121/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping
1122 dlm_recoverd before checking/setting ls_recover_begin. */
1123
1124void dlm_adjust_timeouts(struct dlm_ls *ls)
1125{
1126 struct dlm_lkb *lkb;
1127 long adj = jiffies - ls->ls_recover_begin;
1128
1129 ls->ls_recover_begin = 0;
1130 mutex_lock(&ls->ls_timeout_mutex);
1131 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
1132 lkb->lkb_timestamp += adj;
1133 mutex_unlock(&ls->ls_timeout_mutex);
1134}
1135
997/* lkb is master or local copy */ 1136/* lkb is master or local copy */
998 1137
999static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1138static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -1275,10 +1414,8 @@ static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1275 * queue for one resource. The granted mode of each lock blocks the requested 1414 * queue for one resource. The granted mode of each lock blocks the requested
1276 * mode of the other lock." 1415 * mode of the other lock."
1277 * 1416 *
1278 * Part 2: if the granted mode of lkb is preventing the first lkb in the 1417 * Part 2: if the granted mode of lkb is preventing an earlier lkb in the
1279 * convert queue from being granted, then demote lkb (set grmode to NL). 1418 * convert queue from being granted, then deadlk/demote lkb.
1280 * This second form requires that we check for conv-deadlk even when
1281 * now == 0 in _can_be_granted().
1282 * 1419 *
1283 * Example: 1420 * Example:
1284 * Granted Queue: empty 1421 * Granted Queue: empty
@@ -1287,41 +1424,52 @@ static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1287 * 1424 *
1288 * The first lock can't be granted because of the granted mode of the second 1425 * The first lock can't be granted because of the granted mode of the second
1289 * lock and the second lock can't be granted because it's not first in the 1426 * lock and the second lock can't be granted because it's not first in the
1290 * list. We demote the granted mode of the second lock (the lkb passed to this 1427 * list. We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we
1291 * function). 1428 * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK
1429 * flag set and return DEMOTED in the lksb flags.
1430 *
1431 * Originally, this function detected conv-deadlk in a more limited scope:
1432 * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or
1433 * - if lkb1 was the first entry in the queue (not just earlier), and was
1434 * blocked by the granted mode of lkb2, and there was nothing on the
1435 * granted queue preventing lkb1 from being granted immediately, i.e.
1436 * lkb2 was the only thing preventing lkb1 from being granted.
1437 *
1438 * That second condition meant we'd only say there was conv-deadlk if
1439 * resolving it (by demotion) would lead to the first lock on the convert
1440 * queue being granted right away. It allowed conversion deadlocks to exist
1441 * between locks on the convert queue while they couldn't be granted anyway.
1292 * 1442 *
1293 * After the resolution, the "grant pending" function needs to go back and try 1443 * Now, we detect and take action on conversion deadlocks immediately when
1294 * to grant locks on the convert queue again since the first lock can now be 1444 * they're created, even if they may not be immediately consequential. If
1295 * granted. 1445 * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted
1446 * mode that would prevent lkb1's conversion from being granted, we do a
1447 * deadlk/demote on lkb2 right away and don't let it onto the convert queue.
1448 * I think this means that the lkb_is_ahead condition below should always
1449 * be zero, i.e. there will never be conv-deadlk between two locks that are
1450 * both already on the convert queue.
1296 */ 1451 */
1297 1452
1298static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb) 1453static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2)
1299{ 1454{
1300 struct dlm_lkb *this, *first = NULL, *self = NULL; 1455 struct dlm_lkb *lkb1;
1456 int lkb_is_ahead = 0;
1301 1457
1302 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) { 1458 list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) {
1303 if (!first) 1459 if (lkb1 == lkb2) {
1304 first = this; 1460 lkb_is_ahead = 1;
1305 if (this == lkb) {
1306 self = lkb;
1307 continue; 1461 continue;
1308 } 1462 }
1309 1463
1310 if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) 1464 if (!lkb_is_ahead) {
1311 return 1; 1465 if (!modes_compat(lkb2, lkb1))
1312 } 1466 return 1;
1313 1467 } else {
1314 /* if lkb is on the convert queue and is preventing the first 1468 if (!modes_compat(lkb2, lkb1) &&
1315 from being granted, then there's deadlock and we demote lkb. 1469 !modes_compat(lkb1, lkb2))
1316 multiple converting locks may need to do this before the first 1470 return 1;
1317 converting lock can be granted. */ 1471 }
1318
1319 if (self && self != first) {
1320 if (!modes_compat(lkb, first) &&
1321 !queue_conflict(&rsb->res_grantqueue, first))
1322 return 1;
1323 } 1472 }
1324
1325 return 0; 1473 return 0;
1326} 1474}
1327 1475
@@ -1450,42 +1598,57 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1450 if (!now && !conv && list_empty(&r->res_convertqueue) && 1598 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1451 first_in_list(lkb, &r->res_waitqueue)) 1599 first_in_list(lkb, &r->res_waitqueue))
1452 return 1; 1600 return 1;
1453
1454 out: 1601 out:
1455 /*
1456 * The following, enabled by CONVDEADLK, departs from VMS.
1457 */
1458
1459 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1460 conversion_deadlock_detect(r, lkb)) {
1461 lkb->lkb_grmode = DLM_LOCK_NL;
1462 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1463 }
1464
1465 return 0; 1602 return 0;
1466} 1603}
1467 1604
1468/* 1605static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
1469 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a 1606 int *err)
1470 * simple way to provide a big optimization to applications that can use them.
1471 */
1472
1473static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1474{ 1607{
1475 uint32_t flags = lkb->lkb_exflags;
1476 int rv; 1608 int rv;
1477 int8_t alt = 0, rqmode = lkb->lkb_rqmode; 1609 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1610 int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV);
1611
1612 if (err)
1613 *err = 0;
1478 1614
1479 rv = _can_be_granted(r, lkb, now); 1615 rv = _can_be_granted(r, lkb, now);
1480 if (rv) 1616 if (rv)
1481 goto out; 1617 goto out;
1482 1618
1483 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED) 1619 /*
1620 * The CONVDEADLK flag is non-standard and tells the dlm to resolve
1621 * conversion deadlocks by demoting grmode to NL, otherwise the dlm
1622 * cancels one of the locks.
1623 */
1624
1625 if (is_convert && can_be_queued(lkb) &&
1626 conversion_deadlock_detect(r, lkb)) {
1627 if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) {
1628 lkb->lkb_grmode = DLM_LOCK_NL;
1629 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1630 } else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
1631 if (err)
1632 *err = -EDEADLK;
1633 else {
1634 log_print("can_be_granted deadlock %x now %d",
1635 lkb->lkb_id, now);
1636 dlm_dump_rsb(r);
1637 }
1638 }
1484 goto out; 1639 goto out;
1640 }
1485 1641
1486 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR) 1642 /*
1643 * The ALTPR and ALTCW flags are non-standard and tell the dlm to try
1644 * to grant a request in a mode other than the normal rqmode. It's a
1645 * simple way to provide a big optimization to applications that can
1646 * use them.
1647 */
1648
1649 if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR))
1487 alt = DLM_LOCK_PR; 1650 alt = DLM_LOCK_PR;
1488 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW) 1651 else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW))
1489 alt = DLM_LOCK_CW; 1652 alt = DLM_LOCK_CW;
1490 1653
1491 if (alt) { 1654 if (alt) {
@@ -1500,10 +1663,20 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1500 return rv; 1663 return rv;
1501} 1664}
1502 1665
1666/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock
1667 for locks pending on the convert list. Once verified (watch for these
1668 log_prints), we should be able to just call _can_be_granted() and not
1669 bother with the demote/deadlk cases here (and there's no easy way to deal
1670 with a deadlk here, we'd have to generate something like grant_lock with
1671 the deadlk error.) */
1672
1673/* returns the highest requested mode of all blocked conversions */
1674
1503static int grant_pending_convert(struct dlm_rsb *r, int high) 1675static int grant_pending_convert(struct dlm_rsb *r, int high)
1504{ 1676{
1505 struct dlm_lkb *lkb, *s; 1677 struct dlm_lkb *lkb, *s;
1506 int hi, demoted, quit, grant_restart, demote_restart; 1678 int hi, demoted, quit, grant_restart, demote_restart;
1679 int deadlk;
1507 1680
1508 quit = 0; 1681 quit = 0;
1509 restart: 1682 restart:
@@ -1513,14 +1686,29 @@ static int grant_pending_convert(struct dlm_rsb *r, int high)
1513 1686
1514 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) { 1687 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1515 demoted = is_demoted(lkb); 1688 demoted = is_demoted(lkb);
1516 if (can_be_granted(r, lkb, 0)) { 1689 deadlk = 0;
1690
1691 if (can_be_granted(r, lkb, 0, &deadlk)) {
1517 grant_lock_pending(r, lkb); 1692 grant_lock_pending(r, lkb);
1518 grant_restart = 1; 1693 grant_restart = 1;
1519 } else { 1694 continue;
1520 hi = max_t(int, lkb->lkb_rqmode, hi);
1521 if (!demoted && is_demoted(lkb))
1522 demote_restart = 1;
1523 } 1695 }
1696
1697 if (!demoted && is_demoted(lkb)) {
1698 log_print("WARN: pending demoted %x node %d %s",
1699 lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
1700 demote_restart = 1;
1701 continue;
1702 }
1703
1704 if (deadlk) {
1705 log_print("WARN: pending deadlock %x node %d %s",
1706 lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
1707 dlm_dump_rsb(r);
1708 continue;
1709 }
1710
1711 hi = max_t(int, lkb->lkb_rqmode, hi);
1524 } 1712 }
1525 1713
1526 if (grant_restart) 1714 if (grant_restart)
@@ -1538,7 +1726,7 @@ static int grant_pending_wait(struct dlm_rsb *r, int high)
1538 struct dlm_lkb *lkb, *s; 1726 struct dlm_lkb *lkb, *s;
1539 1727
1540 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { 1728 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1541 if (can_be_granted(r, lkb, 0)) 1729 if (can_be_granted(r, lkb, 0, NULL))
1542 grant_lock_pending(r, lkb); 1730 grant_lock_pending(r, lkb);
1543 else 1731 else
1544 high = max_t(int, lkb->lkb_rqmode, high); 1732 high = max_t(int, lkb->lkb_rqmode, high);
@@ -1733,7 +1921,7 @@ static void confirm_master(struct dlm_rsb *r, int error)
1733} 1921}
1734 1922
1735static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, 1923static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1736 int namelen, uint32_t parent_lkid, void *ast, 1924 int namelen, unsigned long timeout_cs, void *ast,
1737 void *astarg, void *bast, struct dlm_args *args) 1925 void *astarg, void *bast, struct dlm_args *args)
1738{ 1926{
1739 int rv = -EINVAL; 1927 int rv = -EINVAL;
@@ -1776,10 +1964,6 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1776 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr) 1964 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1777 goto out; 1965 goto out;
1778 1966
1779 /* parent/child locks not yet supported */
1780 if (parent_lkid)
1781 goto out;
1782
1783 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid) 1967 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1784 goto out; 1968 goto out;
1785 1969
@@ -1791,6 +1975,7 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1791 args->astaddr = ast; 1975 args->astaddr = ast;
1792 args->astparam = (long) astarg; 1976 args->astparam = (long) astarg;
1793 args->bastaddr = bast; 1977 args->bastaddr = bast;
1978 args->timeout = timeout_cs;
1794 args->mode = mode; 1979 args->mode = mode;
1795 args->lksb = lksb; 1980 args->lksb = lksb;
1796 rv = 0; 1981 rv = 0;
@@ -1845,6 +2030,7 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1845 lkb->lkb_lksb = args->lksb; 2030 lkb->lkb_lksb = args->lksb;
1846 lkb->lkb_lvbptr = args->lksb->sb_lvbptr; 2031 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1847 lkb->lkb_ownpid = (int) current->pid; 2032 lkb->lkb_ownpid = (int) current->pid;
2033 lkb->lkb_timeout_cs = args->timeout;
1848 rv = 0; 2034 rv = 0;
1849 out: 2035 out:
1850 return rv; 2036 return rv;
@@ -1903,6 +2089,9 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1903 if (is_overlap(lkb)) 2089 if (is_overlap(lkb))
1904 goto out; 2090 goto out;
1905 2091
2092 /* don't let scand try to do a cancel */
2093 del_timeout(lkb);
2094
1906 if (lkb->lkb_flags & DLM_IFL_RESEND) { 2095 if (lkb->lkb_flags & DLM_IFL_RESEND) {
1907 lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; 2096 lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
1908 rv = -EBUSY; 2097 rv = -EBUSY;
@@ -1934,6 +2123,9 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1934 if (is_overlap_unlock(lkb)) 2123 if (is_overlap_unlock(lkb))
1935 goto out; 2124 goto out;
1936 2125
2126 /* don't let scand try to do a cancel */
2127 del_timeout(lkb);
2128
1937 if (lkb->lkb_flags & DLM_IFL_RESEND) { 2129 if (lkb->lkb_flags & DLM_IFL_RESEND) {
1938 lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; 2130 lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
1939 rv = -EBUSY; 2131 rv = -EBUSY;
@@ -1984,7 +2176,7 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1984{ 2176{
1985 int error = 0; 2177 int error = 0;
1986 2178
1987 if (can_be_granted(r, lkb, 1)) { 2179 if (can_be_granted(r, lkb, 1, NULL)) {
1988 grant_lock(r, lkb); 2180 grant_lock(r, lkb);
1989 queue_cast(r, lkb, 0); 2181 queue_cast(r, lkb, 0);
1990 goto out; 2182 goto out;
@@ -1994,6 +2186,7 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1994 error = -EINPROGRESS; 2186 error = -EINPROGRESS;
1995 add_lkb(r, lkb, DLM_LKSTS_WAITING); 2187 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1996 send_blocking_asts(r, lkb); 2188 send_blocking_asts(r, lkb);
2189 add_timeout(lkb);
1997 goto out; 2190 goto out;
1998 } 2191 }
1999 2192
@@ -2009,16 +2202,32 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2009static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) 2202static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2010{ 2203{
2011 int error = 0; 2204 int error = 0;
2205 int deadlk = 0;
2012 2206
2013 /* changing an existing lock may allow others to be granted */ 2207 /* changing an existing lock may allow others to be granted */
2014 2208
2015 if (can_be_granted(r, lkb, 1)) { 2209 if (can_be_granted(r, lkb, 1, &deadlk)) {
2016 grant_lock(r, lkb); 2210 grant_lock(r, lkb);
2017 queue_cast(r, lkb, 0); 2211 queue_cast(r, lkb, 0);
2018 grant_pending_locks(r); 2212 grant_pending_locks(r);
2019 goto out; 2213 goto out;
2020 } 2214 }
2021 2215
2216 /* can_be_granted() detected that this lock would block in a conversion
2217 deadlock, so we leave it on the granted queue and return EDEADLK in
2218 the ast for the convert. */
2219
2220 if (deadlk) {
2221 /* it's left on the granted queue */
2222 log_debug(r->res_ls, "deadlock %x node %d sts%d g%d r%d %s",
2223 lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_status,
2224 lkb->lkb_grmode, lkb->lkb_rqmode, r->res_name);
2225 revert_lock(r, lkb);
2226 queue_cast(r, lkb, -EDEADLK);
2227 error = -EDEADLK;
2228 goto out;
2229 }
2230
2022 /* is_demoted() means the can_be_granted() above set the grmode 2231 /* is_demoted() means the can_be_granted() above set the grmode
2023 to NL, and left us on the granted queue. This auto-demotion 2232 to NL, and left us on the granted queue. This auto-demotion
2024 (due to CONVDEADLK) might mean other locks, and/or this lock, are 2233 (due to CONVDEADLK) might mean other locks, and/or this lock, are
@@ -2041,6 +2250,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2041 del_lkb(r, lkb); 2250 del_lkb(r, lkb);
2042 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 2251 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2043 send_blocking_asts(r, lkb); 2252 send_blocking_asts(r, lkb);
2253 add_timeout(lkb);
2044 goto out; 2254 goto out;
2045 } 2255 }
2046 2256
@@ -2274,7 +2484,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
2274 if (!ls) 2484 if (!ls)
2275 return -EINVAL; 2485 return -EINVAL;
2276 2486
2277 lock_recovery(ls); 2487 dlm_lock_recovery(ls);
2278 2488
2279 if (convert) 2489 if (convert)
2280 error = find_lkb(ls, lksb->sb_lkid, &lkb); 2490 error = find_lkb(ls, lksb->sb_lkid, &lkb);
@@ -2284,7 +2494,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
2284 if (error) 2494 if (error)
2285 goto out; 2495 goto out;
2286 2496
2287 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast, 2497 error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
2288 astarg, bast, &args); 2498 astarg, bast, &args);
2289 if (error) 2499 if (error)
2290 goto out_put; 2500 goto out_put;
@@ -2299,10 +2509,10 @@ int dlm_lock(dlm_lockspace_t *lockspace,
2299 out_put: 2509 out_put:
2300 if (convert || error) 2510 if (convert || error)
2301 __put_lkb(ls, lkb); 2511 __put_lkb(ls, lkb);
2302 if (error == -EAGAIN) 2512 if (error == -EAGAIN || error == -EDEADLK)
2303 error = 0; 2513 error = 0;
2304 out: 2514 out:
2305 unlock_recovery(ls); 2515 dlm_unlock_recovery(ls);
2306 dlm_put_lockspace(ls); 2516 dlm_put_lockspace(ls);
2307 return error; 2517 return error;
2308} 2518}
@@ -2322,7 +2532,7 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
2322 if (!ls) 2532 if (!ls)
2323 return -EINVAL; 2533 return -EINVAL;
2324 2534
2325 lock_recovery(ls); 2535 dlm_lock_recovery(ls);
2326 2536
2327 error = find_lkb(ls, lkid, &lkb); 2537 error = find_lkb(ls, lkid, &lkb);
2328 if (error) 2538 if (error)
@@ -2344,7 +2554,7 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
2344 out_put: 2554 out_put:
2345 dlm_put_lkb(lkb); 2555 dlm_put_lkb(lkb);
2346 out: 2556 out:
2347 unlock_recovery(ls); 2557 dlm_unlock_recovery(ls);
2348 dlm_put_lockspace(ls); 2558 dlm_put_lockspace(ls);
2349 return error; 2559 return error;
2350} 2560}
@@ -2384,7 +2594,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
2384 pass into lowcomms_commit and a message buffer (mb) that we 2594 pass into lowcomms_commit and a message buffer (mb) that we
2385 write our data into */ 2595 write our data into */
2386 2596
2387 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb); 2597 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
2388 if (!mh) 2598 if (!mh)
2389 return -ENOBUFS; 2599 return -ENOBUFS;
2390 2600
@@ -3111,9 +3321,10 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
3111 lkb->lkb_remid = ms->m_lkid; 3321 lkb->lkb_remid = ms->m_lkid;
3112 if (is_altmode(lkb)) 3322 if (is_altmode(lkb))
3113 munge_altmode(lkb, ms); 3323 munge_altmode(lkb, ms);
3114 if (result) 3324 if (result) {
3115 add_lkb(r, lkb, DLM_LKSTS_WAITING); 3325 add_lkb(r, lkb, DLM_LKSTS_WAITING);
3116 else { 3326 add_timeout(lkb);
3327 } else {
3117 grant_lock_pc(r, lkb, ms); 3328 grant_lock_pc(r, lkb, ms);
3118 queue_cast(r, lkb, 0); 3329 queue_cast(r, lkb, 0);
3119 } 3330 }
@@ -3172,6 +3383,12 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
3172 queue_cast(r, lkb, -EAGAIN); 3383 queue_cast(r, lkb, -EAGAIN);
3173 break; 3384 break;
3174 3385
3386 case -EDEADLK:
3387 receive_flags_reply(lkb, ms);
3388 revert_lock_pc(r, lkb);
3389 queue_cast(r, lkb, -EDEADLK);
3390 break;
3391
3175 case -EINPROGRESS: 3392 case -EINPROGRESS:
3176 /* convert was queued on remote master */ 3393 /* convert was queued on remote master */
3177 receive_flags_reply(lkb, ms); 3394 receive_flags_reply(lkb, ms);
@@ -3179,6 +3396,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
3179 munge_demoted(lkb, ms); 3396 munge_demoted(lkb, ms);
3180 del_lkb(r, lkb); 3397 del_lkb(r, lkb);
3181 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 3398 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
3399 add_timeout(lkb);
3182 break; 3400 break;
3183 3401
3184 case 0: 3402 case 0:
@@ -3298,8 +3516,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
3298 case -DLM_ECANCEL: 3516 case -DLM_ECANCEL:
3299 receive_flags_reply(lkb, ms); 3517 receive_flags_reply(lkb, ms);
3300 revert_lock_pc(r, lkb); 3518 revert_lock_pc(r, lkb);
3301 if (ms->m_result) 3519 queue_cast(r, lkb, -DLM_ECANCEL);
3302 queue_cast(r, lkb, -DLM_ECANCEL);
3303 break; 3520 break;
3304 case 0: 3521 case 0:
3305 break; 3522 break;
@@ -3424,7 +3641,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3424 } 3641 }
3425 } 3642 }
3426 3643
3427 if (lock_recovery_try(ls)) 3644 if (dlm_lock_recovery_try(ls))
3428 break; 3645 break;
3429 schedule(); 3646 schedule();
3430 } 3647 }
@@ -3503,7 +3720,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3503 log_error(ls, "unknown message type %d", ms->m_type); 3720 log_error(ls, "unknown message type %d", ms->m_type);
3504 } 3721 }
3505 3722
3506 unlock_recovery(ls); 3723 dlm_unlock_recovery(ls);
3507 out: 3724 out:
3508 dlm_put_lockspace(ls); 3725 dlm_put_lockspace(ls);
3509 dlm_astd_wake(); 3726 dlm_astd_wake();
@@ -4034,13 +4251,13 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
4034 4251
4035int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, 4252int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4036 int mode, uint32_t flags, void *name, unsigned int namelen, 4253 int mode, uint32_t flags, void *name, unsigned int namelen,
4037 uint32_t parent_lkid) 4254 unsigned long timeout_cs)
4038{ 4255{
4039 struct dlm_lkb *lkb; 4256 struct dlm_lkb *lkb;
4040 struct dlm_args args; 4257 struct dlm_args args;
4041 int error; 4258 int error;
4042 4259
4043 lock_recovery(ls); 4260 dlm_lock_recovery(ls);
4044 4261
4045 error = create_lkb(ls, &lkb); 4262 error = create_lkb(ls, &lkb);
4046 if (error) { 4263 if (error) {
@@ -4062,7 +4279,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4062 When DLM_IFL_USER is set, the dlm knows that this is a userspace 4279 When DLM_IFL_USER is set, the dlm knows that this is a userspace
4063 lock and that lkb_astparam is the dlm_user_args structure. */ 4280 lock and that lkb_astparam is the dlm_user_args structure. */
4064 4281
4065 error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid, 4282 error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
4066 DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args); 4283 DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
4067 lkb->lkb_flags |= DLM_IFL_USER; 4284 lkb->lkb_flags |= DLM_IFL_USER;
4068 ua->old_mode = DLM_LOCK_IV; 4285 ua->old_mode = DLM_LOCK_IV;
@@ -4094,19 +4311,20 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4094 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); 4311 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
4095 spin_unlock(&ua->proc->locks_spin); 4312 spin_unlock(&ua->proc->locks_spin);
4096 out: 4313 out:
4097 unlock_recovery(ls); 4314 dlm_unlock_recovery(ls);
4098 return error; 4315 return error;
4099} 4316}
4100 4317
4101int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, 4318int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4102 int mode, uint32_t flags, uint32_t lkid, char *lvb_in) 4319 int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
4320 unsigned long timeout_cs)
4103{ 4321{
4104 struct dlm_lkb *lkb; 4322 struct dlm_lkb *lkb;
4105 struct dlm_args args; 4323 struct dlm_args args;
4106 struct dlm_user_args *ua; 4324 struct dlm_user_args *ua;
4107 int error; 4325 int error;
4108 4326
4109 lock_recovery(ls); 4327 dlm_lock_recovery(ls);
4110 4328
4111 error = find_lkb(ls, lkid, &lkb); 4329 error = find_lkb(ls, lkid, &lkb);
4112 if (error) 4330 if (error)
@@ -4127,6 +4345,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4127 if (lvb_in && ua->lksb.sb_lvbptr) 4345 if (lvb_in && ua->lksb.sb_lvbptr)
4128 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); 4346 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
4129 4347
4348 ua->xid = ua_tmp->xid;
4130 ua->castparam = ua_tmp->castparam; 4349 ua->castparam = ua_tmp->castparam;
4131 ua->castaddr = ua_tmp->castaddr; 4350 ua->castaddr = ua_tmp->castaddr;
4132 ua->bastparam = ua_tmp->bastparam; 4351 ua->bastparam = ua_tmp->bastparam;
@@ -4134,19 +4353,19 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4134 ua->user_lksb = ua_tmp->user_lksb; 4353 ua->user_lksb = ua_tmp->user_lksb;
4135 ua->old_mode = lkb->lkb_grmode; 4354 ua->old_mode = lkb->lkb_grmode;
4136 4355
4137 error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST, 4356 error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
4138 ua, DLM_FAKE_USER_AST, &args); 4357 DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
4139 if (error) 4358 if (error)
4140 goto out_put; 4359 goto out_put;
4141 4360
4142 error = convert_lock(ls, lkb, &args); 4361 error = convert_lock(ls, lkb, &args);
4143 4362
4144 if (error == -EINPROGRESS || error == -EAGAIN) 4363 if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK)
4145 error = 0; 4364 error = 0;
4146 out_put: 4365 out_put:
4147 dlm_put_lkb(lkb); 4366 dlm_put_lkb(lkb);
4148 out: 4367 out:
4149 unlock_recovery(ls); 4368 dlm_unlock_recovery(ls);
4150 kfree(ua_tmp); 4369 kfree(ua_tmp);
4151 return error; 4370 return error;
4152} 4371}
@@ -4159,7 +4378,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4159 struct dlm_user_args *ua; 4378 struct dlm_user_args *ua;
4160 int error; 4379 int error;
4161 4380
4162 lock_recovery(ls); 4381 dlm_lock_recovery(ls);
4163 4382
4164 error = find_lkb(ls, lkid, &lkb); 4383 error = find_lkb(ls, lkid, &lkb);
4165 if (error) 4384 if (error)
@@ -4194,7 +4413,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4194 out_put: 4413 out_put:
4195 dlm_put_lkb(lkb); 4414 dlm_put_lkb(lkb);
4196 out: 4415 out:
4197 unlock_recovery(ls); 4416 dlm_unlock_recovery(ls);
4198 kfree(ua_tmp); 4417 kfree(ua_tmp);
4199 return error; 4418 return error;
4200} 4419}
@@ -4207,7 +4426,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4207 struct dlm_user_args *ua; 4426 struct dlm_user_args *ua;
4208 int error; 4427 int error;
4209 4428
4210 lock_recovery(ls); 4429 dlm_lock_recovery(ls);
4211 4430
4212 error = find_lkb(ls, lkid, &lkb); 4431 error = find_lkb(ls, lkid, &lkb);
4213 if (error) 4432 if (error)
@@ -4231,11 +4450,59 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4231 out_put: 4450 out_put:
4232 dlm_put_lkb(lkb); 4451 dlm_put_lkb(lkb);
4233 out: 4452 out:
4234 unlock_recovery(ls); 4453 dlm_unlock_recovery(ls);
4235 kfree(ua_tmp); 4454 kfree(ua_tmp);
4236 return error; 4455 return error;
4237} 4456}
4238 4457
4458int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
4459{
4460 struct dlm_lkb *lkb;
4461 struct dlm_args args;
4462 struct dlm_user_args *ua;
4463 struct dlm_rsb *r;
4464 int error;
4465
4466 dlm_lock_recovery(ls);
4467
4468 error = find_lkb(ls, lkid, &lkb);
4469 if (error)
4470 goto out;
4471
4472 ua = (struct dlm_user_args *)lkb->lkb_astparam;
4473
4474 error = set_unlock_args(flags, ua, &args);
4475 if (error)
4476 goto out_put;
4477
4478 /* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */
4479
4480 r = lkb->lkb_resource;
4481 hold_rsb(r);
4482 lock_rsb(r);
4483
4484 error = validate_unlock_args(lkb, &args);
4485 if (error)
4486 goto out_r;
4487 lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL;
4488
4489 error = _cancel_lock(r, lkb);
4490 out_r:
4491 unlock_rsb(r);
4492 put_rsb(r);
4493
4494 if (error == -DLM_ECANCEL)
4495 error = 0;
4496 /* from validate_unlock_args() */
4497 if (error == -EBUSY)
4498 error = 0;
4499 out_put:
4500 dlm_put_lkb(lkb);
4501 out:
4502 dlm_unlock_recovery(ls);
4503 return error;
4504}
4505
4239/* lkb's that are removed from the waiters list by revert are just left on the 4506/* lkb's that are removed from the waiters list by revert are just left on the
4240 orphans list with the granted orphan locks, to be freed by purge */ 4507 orphans list with the granted orphan locks, to be freed by purge */
4241 4508
@@ -4314,12 +4581,13 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4314{ 4581{
4315 struct dlm_lkb *lkb, *safe; 4582 struct dlm_lkb *lkb, *safe;
4316 4583
4317 lock_recovery(ls); 4584 dlm_lock_recovery(ls);
4318 4585
4319 while (1) { 4586 while (1) {
4320 lkb = del_proc_lock(ls, proc); 4587 lkb = del_proc_lock(ls, proc);
4321 if (!lkb) 4588 if (!lkb)
4322 break; 4589 break;
4590 del_timeout(lkb);
4323 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) 4591 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
4324 orphan_proc_lock(ls, lkb); 4592 orphan_proc_lock(ls, lkb);
4325 else 4593 else
@@ -4347,7 +4615,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4347 } 4615 }
4348 4616
4349 mutex_unlock(&ls->ls_clear_proc_locks); 4617 mutex_unlock(&ls->ls_clear_proc_locks);
4350 unlock_recovery(ls); 4618 dlm_unlock_recovery(ls);
4351} 4619}
4352 4620
4353static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) 4621static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
@@ -4429,12 +4697,12 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
4429 if (nodeid != dlm_our_nodeid()) { 4697 if (nodeid != dlm_our_nodeid()) {
4430 error = send_purge(ls, nodeid, pid); 4698 error = send_purge(ls, nodeid, pid);
4431 } else { 4699 } else {
4432 lock_recovery(ls); 4700 dlm_lock_recovery(ls);
4433 if (pid == current->pid) 4701 if (pid == current->pid)
4434 purge_proc_locks(ls, proc); 4702 purge_proc_locks(ls, proc);
4435 else 4703 else
4436 do_purge(ls, nodeid, pid); 4704 do_purge(ls, nodeid, pid);
4437 unlock_recovery(ls); 4705 dlm_unlock_recovery(ls);
4438 } 4706 }
4439 return error; 4707 return error;
4440} 4708}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 64fc4ec406..1720313c22 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -24,6 +24,10 @@ void dlm_put_rsb(struct dlm_rsb *r);
24void dlm_hold_rsb(struct dlm_rsb *r); 24void dlm_hold_rsb(struct dlm_rsb *r);
25int dlm_put_lkb(struct dlm_lkb *lkb); 25int dlm_put_lkb(struct dlm_lkb *lkb);
26void dlm_scan_rsbs(struct dlm_ls *ls); 26void dlm_scan_rsbs(struct dlm_ls *ls);
27int dlm_lock_recovery_try(struct dlm_ls *ls);
28void dlm_unlock_recovery(struct dlm_ls *ls);
29void dlm_scan_timeout(struct dlm_ls *ls);
30void dlm_adjust_timeouts(struct dlm_ls *ls);
27 31
28int dlm_purge_locks(struct dlm_ls *ls); 32int dlm_purge_locks(struct dlm_ls *ls);
29void dlm_purge_mstcpy_locks(struct dlm_rsb *r); 33void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
@@ -34,15 +38,18 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
34int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); 38int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
35 39
36int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, 40int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
37 uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid); 41 uint32_t flags, void *name, unsigned int namelen,
42 unsigned long timeout_cs);
38int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, 43int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
39 int mode, uint32_t flags, uint32_t lkid, char *lvb_in); 44 int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
45 unsigned long timeout_cs);
40int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, 46int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
41 uint32_t flags, uint32_t lkid, char *lvb_in); 47 uint32_t flags, uint32_t lkid, char *lvb_in);
42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, 48int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
43 uint32_t flags, uint32_t lkid); 49 uint32_t flags, uint32_t lkid);
44int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, 50int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
45 int nodeid, int pid); 51 int nodeid, int pid);
52int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid);
46void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); 53void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
47 54
48static inline int is_master(struct dlm_rsb *r) 55static inline int is_master(struct dlm_rsb *r)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index a677b2a5ee..1dc72105ab 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -197,13 +197,24 @@ static int do_uevent(struct dlm_ls *ls, int in)
197 else 197 else
198 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); 198 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
199 199
200 log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving");
201
202 /* dlm_controld will see the uevent, do the necessary group management
203 and then write to sysfs to wake us */
204
200 error = wait_event_interruptible(ls->ls_uevent_wait, 205 error = wait_event_interruptible(ls->ls_uevent_wait,
201 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); 206 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
207
208 log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result);
209
202 if (error) 210 if (error)
203 goto out; 211 goto out;
204 212
205 error = ls->ls_uevent_result; 213 error = ls->ls_uevent_result;
206 out: 214 out:
215 if (error)
216 log_error(ls, "group %s failed %d %d", in ? "join" : "leave",
217 error, ls->ls_uevent_result);
207 return error; 218 return error;
208} 219}
209 220
@@ -234,8 +245,13 @@ static int dlm_scand(void *data)
234 struct dlm_ls *ls; 245 struct dlm_ls *ls;
235 246
236 while (!kthread_should_stop()) { 247 while (!kthread_should_stop()) {
237 list_for_each_entry(ls, &lslist, ls_list) 248 list_for_each_entry(ls, &lslist, ls_list) {
238 dlm_scan_rsbs(ls); 249 if (dlm_lock_recovery_try(ls)) {
250 dlm_scan_rsbs(ls);
251 dlm_scan_timeout(ls);
252 dlm_unlock_recovery(ls);
253 }
254 }
239 schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); 255 schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
240 } 256 }
241 return 0; 257 return 0;
@@ -395,6 +411,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
395{ 411{
396 struct dlm_ls *ls; 412 struct dlm_ls *ls;
397 int i, size, error = -ENOMEM; 413 int i, size, error = -ENOMEM;
414 int do_unreg = 0;
398 415
399 if (namelen > DLM_LOCKSPACE_LEN) 416 if (namelen > DLM_LOCKSPACE_LEN)
400 return -EINVAL; 417 return -EINVAL;
@@ -417,11 +434,22 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
417 goto out; 434 goto out;
418 memcpy(ls->ls_name, name, namelen); 435 memcpy(ls->ls_name, name, namelen);
419 ls->ls_namelen = namelen; 436 ls->ls_namelen = namelen;
420 ls->ls_exflags = flags;
421 ls->ls_lvblen = lvblen; 437 ls->ls_lvblen = lvblen;
422 ls->ls_count = 0; 438 ls->ls_count = 0;
423 ls->ls_flags = 0; 439 ls->ls_flags = 0;
424 440
441 if (flags & DLM_LSFL_TIMEWARN)
442 set_bit(LSFL_TIMEWARN, &ls->ls_flags);
443
444 if (flags & DLM_LSFL_FS)
445 ls->ls_allocation = GFP_NOFS;
446 else
447 ls->ls_allocation = GFP_KERNEL;
448
449 /* ls_exflags are forced to match among nodes, and we don't
450 need to require all nodes to have TIMEWARN or FS set */
451 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS));
452
425 size = dlm_config.ci_rsbtbl_size; 453 size = dlm_config.ci_rsbtbl_size;
426 ls->ls_rsbtbl_size = size; 454 ls->ls_rsbtbl_size = size;
427 455
@@ -461,6 +489,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
461 mutex_init(&ls->ls_waiters_mutex); 489 mutex_init(&ls->ls_waiters_mutex);
462 INIT_LIST_HEAD(&ls->ls_orphans); 490 INIT_LIST_HEAD(&ls->ls_orphans);
463 mutex_init(&ls->ls_orphans_mutex); 491 mutex_init(&ls->ls_orphans_mutex);
492 INIT_LIST_HEAD(&ls->ls_timeout);
493 mutex_init(&ls->ls_timeout_mutex);
464 494
465 INIT_LIST_HEAD(&ls->ls_nodes); 495 INIT_LIST_HEAD(&ls->ls_nodes);
466 INIT_LIST_HEAD(&ls->ls_nodes_gone); 496 INIT_LIST_HEAD(&ls->ls_nodes_gone);
@@ -477,6 +507,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
477 507
478 init_waitqueue_head(&ls->ls_uevent_wait); 508 init_waitqueue_head(&ls->ls_uevent_wait);
479 ls->ls_uevent_result = 0; 509 ls->ls_uevent_result = 0;
510 init_completion(&ls->ls_members_done);
511 ls->ls_members_result = -1;
480 512
481 ls->ls_recoverd_task = NULL; 513 ls->ls_recoverd_task = NULL;
482 mutex_init(&ls->ls_recoverd_active); 514 mutex_init(&ls->ls_recoverd_active);
@@ -513,32 +545,49 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
513 error = dlm_recoverd_start(ls); 545 error = dlm_recoverd_start(ls);
514 if (error) { 546 if (error) {
515 log_error(ls, "can't start dlm_recoverd %d", error); 547 log_error(ls, "can't start dlm_recoverd %d", error);
516 goto out_rcomfree; 548 goto out_delist;
517 } 549 }
518 550
519 dlm_create_debug_file(ls);
520
521 error = kobject_setup(ls); 551 error = kobject_setup(ls);
522 if (error) 552 if (error)
523 goto out_del; 553 goto out_stop;
524 554
525 error = kobject_register(&ls->ls_kobj); 555 error = kobject_register(&ls->ls_kobj);
526 if (error) 556 if (error)
527 goto out_del; 557 goto out_stop;
558
559 /* let kobject handle freeing of ls if there's an error */
560 do_unreg = 1;
561
562 /* This uevent triggers dlm_controld in userspace to add us to the
563 group of nodes that are members of this lockspace (managed by the
564 cluster infrastructure.) Once it's done that, it tells us who the
565 current lockspace members are (via configfs) and then tells the
566 lockspace to start running (via sysfs) in dlm_ls_start(). */
528 567
529 error = do_uevent(ls, 1); 568 error = do_uevent(ls, 1);
530 if (error) 569 if (error)
531 goto out_unreg; 570 goto out_stop;
571
572 wait_for_completion(&ls->ls_members_done);
573 error = ls->ls_members_result;
574 if (error)
575 goto out_members;
576
577 dlm_create_debug_file(ls);
578
579 log_debug(ls, "join complete");
532 580
533 *lockspace = ls; 581 *lockspace = ls;
534 return 0; 582 return 0;
535 583
536 out_unreg: 584 out_members:
537 kobject_unregister(&ls->ls_kobj); 585 do_uevent(ls, 0);
538 out_del: 586 dlm_clear_members(ls);
539 dlm_delete_debug_file(ls); 587 kfree(ls->ls_node_array);
588 out_stop:
540 dlm_recoverd_stop(ls); 589 dlm_recoverd_stop(ls);
541 out_rcomfree: 590 out_delist:
542 spin_lock(&lslist_lock); 591 spin_lock(&lslist_lock);
543 list_del(&ls->ls_list); 592 list_del(&ls->ls_list);
544 spin_unlock(&lslist_lock); 593 spin_unlock(&lslist_lock);
@@ -550,7 +599,10 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
550 out_rsbfree: 599 out_rsbfree:
551 kfree(ls->ls_rsbtbl); 600 kfree(ls->ls_rsbtbl);
552 out_lsfree: 601 out_lsfree:
553 kfree(ls); 602 if (do_unreg)
603 kobject_unregister(&ls->ls_kobj);
604 else
605 kfree(ls);
554 out: 606 out:
555 module_put(THIS_MODULE); 607 module_put(THIS_MODULE);
556 return error; 608 return error;
@@ -570,6 +622,8 @@ int dlm_new_lockspace(char *name, int namelen, void **lockspace,
570 error = new_lockspace(name, namelen, lockspace, flags, lvblen); 622 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
571 if (!error) 623 if (!error)
572 ls_count++; 624 ls_count++;
625 else if (!ls_count)
626 threads_stop();
573 out: 627 out:
574 mutex_unlock(&ls_lock); 628 mutex_unlock(&ls_lock);
575 return error; 629 return error;
@@ -696,7 +750,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
696 dlm_clear_members_gone(ls); 750 dlm_clear_members_gone(ls);
697 kfree(ls->ls_node_array); 751 kfree(ls->ls_node_array);
698 kobject_unregister(&ls->ls_kobj); 752 kobject_unregister(&ls->ls_kobj);
699 /* The ls structure will be freed when the kobject is done with */ 753 /* The ls structure will be freed when the kobject is done with */
700 754
701 mutex_lock(&ls_lock); 755 mutex_lock(&ls_lock);
702 ls_count--; 756 ls_count--;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 27970a58d2..0553a6158d 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -260,7 +260,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
260static void lowcomms_data_ready(struct sock *sk, int count_unused) 260static void lowcomms_data_ready(struct sock *sk, int count_unused)
261{ 261{
262 struct connection *con = sock2con(sk); 262 struct connection *con = sock2con(sk);
263 if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) 263 if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
264 queue_work(recv_workqueue, &con->rwork); 264 queue_work(recv_workqueue, &con->rwork);
265} 265}
266 266
@@ -268,7 +268,7 @@ static void lowcomms_write_space(struct sock *sk)
268{ 268{
269 struct connection *con = sock2con(sk); 269 struct connection *con = sock2con(sk);
270 270
271 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) 271 if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags))
272 queue_work(send_workqueue, &con->swork); 272 queue_work(send_workqueue, &con->swork);
273} 273}
274 274
@@ -720,11 +720,17 @@ static int tcp_accept_from_sock(struct connection *con)
720 INIT_WORK(&othercon->rwork, process_recv_sockets); 720 INIT_WORK(&othercon->rwork, process_recv_sockets);
721 set_bit(CF_IS_OTHERCON, &othercon->flags); 721 set_bit(CF_IS_OTHERCON, &othercon->flags);
722 newcon->othercon = othercon; 722 newcon->othercon = othercon;
723 othercon->sock = newsock;
724 newsock->sk->sk_user_data = othercon;
725 add_sock(newsock, othercon);
726 addcon = othercon;
727 }
728 else {
729 printk("Extra connection from node %d attempted\n", nodeid);
730 result = -EAGAIN;
731 mutex_unlock(&newcon->sock_mutex);
732 goto accept_err;
723 } 733 }
724 othercon->sock = newsock;
725 newsock->sk->sk_user_data = othercon;
726 add_sock(newsock, othercon);
727 addcon = othercon;
728 } 734 }
729 else { 735 else {
730 newsock->sk->sk_user_data = newcon; 736 newsock->sk->sk_user_data = newcon;
@@ -1400,8 +1406,11 @@ void dlm_lowcomms_stop(void)
1400 down(&connections_lock); 1406 down(&connections_lock);
1401 for (i = 0; i <= max_nodeid; i++) { 1407 for (i = 0; i <= max_nodeid; i++) {
1402 con = __nodeid2con(i, 0); 1408 con = __nodeid2con(i, 0);
1403 if (con) 1409 if (con) {
1404 con->flags |= 0xFF; 1410 con->flags |= 0xFF;
1411 if (con->sock)
1412 con->sock->sk->sk_user_data = NULL;
1413 }
1405 } 1414 }
1406 up(&connections_lock); 1415 up(&connections_lock);
1407 1416
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 162fbae58f..eca2907f23 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -25,6 +25,8 @@ void dlm_unregister_debugfs(void);
25static inline int dlm_register_debugfs(void) { return 0; } 25static inline int dlm_register_debugfs(void) { return 0; }
26static inline void dlm_unregister_debugfs(void) { } 26static inline void dlm_unregister_debugfs(void) { }
27#endif 27#endif
28int dlm_netlink_init(void);
29void dlm_netlink_exit(void);
28 30
29static int __init init_dlm(void) 31static int __init init_dlm(void)
30{ 32{
@@ -50,10 +52,16 @@ static int __init init_dlm(void)
50 if (error) 52 if (error)
51 goto out_debug; 53 goto out_debug;
52 54
55 error = dlm_netlink_init();
56 if (error)
57 goto out_user;
58
53 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__); 59 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
54 60
55 return 0; 61 return 0;
56 62
63 out_user:
64 dlm_user_exit();
57 out_debug: 65 out_debug:
58 dlm_unregister_debugfs(); 66 dlm_unregister_debugfs();
59 out_config: 67 out_config:
@@ -68,6 +76,7 @@ static int __init init_dlm(void)
68 76
69static void __exit exit_dlm(void) 77static void __exit exit_dlm(void)
70{ 78{
79 dlm_netlink_exit();
71 dlm_user_exit(); 80 dlm_user_exit();
72 dlm_config_exit(); 81 dlm_config_exit();
73 dlm_memory_exit(); 82 dlm_memory_exit();
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 85e2897bd7..073599dced 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -233,6 +233,12 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
233 *neg_out = neg; 233 *neg_out = neg;
234 234
235 error = ping_members(ls); 235 error = ping_members(ls);
236 if (!error || error == -EPROTO) {
237 /* new_lockspace() may be waiting to know if the config
238 is good or bad */
239 ls->ls_members_result = error;
240 complete(&ls->ls_members_done);
241 }
236 if (error) 242 if (error)
237 goto out; 243 goto out;
238 244
@@ -284,6 +290,9 @@ int dlm_ls_stop(struct dlm_ls *ls)
284 dlm_recoverd_suspend(ls); 290 dlm_recoverd_suspend(ls);
285 ls->ls_recover_status = 0; 291 ls->ls_recover_status = 0;
286 dlm_recoverd_resume(ls); 292 dlm_recoverd_resume(ls);
293
294 if (!ls->ls_recover_begin)
295 ls->ls_recover_begin = jiffies;
287 return 0; 296 return 0;
288} 297}
289 298
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
new file mode 100644
index 0000000000..863b87d0dc
--- /dev/null
+++ b/fs/dlm/netlink.c
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <net/genetlink.h>
10#include <linux/dlm.h>
11#include <linux/dlm_netlink.h>
12
13#include "dlm_internal.h"
14
15static uint32_t dlm_nl_seqnum;
16static uint32_t listener_nlpid;
17
18static struct genl_family family = {
19 .id = GENL_ID_GENERATE,
20 .name = DLM_GENL_NAME,
21 .version = DLM_GENL_VERSION,
22};
23
24static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
25{
26 struct sk_buff *skb;
27 void *data;
28
29 skb = genlmsg_new(size, GFP_KERNEL);
30 if (!skb)
31 return -ENOMEM;
32
33 /* add the message headers */
34 data = genlmsg_put(skb, 0, dlm_nl_seqnum++, &family, 0, cmd);
35 if (!data) {
36 nlmsg_free(skb);
37 return -EINVAL;
38 }
39
40 *skbp = skb;
41 return 0;
42}
43
44static struct dlm_lock_data *mk_data(struct sk_buff *skb)
45{
46 struct nlattr *ret;
47
48 ret = nla_reserve(skb, DLM_TYPE_LOCK, sizeof(struct dlm_lock_data));
49 if (!ret)
50 return NULL;
51 return nla_data(ret);
52}
53
54static int send_data(struct sk_buff *skb)
55{
56 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
57 void *data = genlmsg_data(genlhdr);
58 int rv;
59
60 rv = genlmsg_end(skb, data);
61 if (rv < 0) {
62 nlmsg_free(skb);
63 return rv;
64 }
65
66 return genlmsg_unicast(skb, listener_nlpid);
67}
68
69static int user_cmd(struct sk_buff *skb, struct genl_info *info)
70{
71 listener_nlpid = info->snd_pid;
72 printk("user_cmd nlpid %u\n", listener_nlpid);
73 return 0;
74}
75
76static struct genl_ops dlm_nl_ops = {
77 .cmd = DLM_CMD_HELLO,
78 .doit = user_cmd,
79};
80
81int dlm_netlink_init(void)
82{
83 int rv;
84
85 rv = genl_register_family(&family);
86 if (rv)
87 return rv;
88
89 rv = genl_register_ops(&family, &dlm_nl_ops);
90 if (rv < 0)
91 goto err;
92 return 0;
93 err:
94 genl_unregister_family(&family);
95 return rv;
96}
97
98void dlm_netlink_exit(void)
99{
100 genl_unregister_ops(&family, &dlm_nl_ops);
101 genl_unregister_family(&family);
102}
103
104static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
105{
106 struct dlm_rsb *r = lkb->lkb_resource;
107 struct dlm_user_args *ua = (struct dlm_user_args *) lkb->lkb_astparam;
108
109 memset(data, 0, sizeof(struct dlm_lock_data));
110
111 data->version = DLM_LOCK_DATA_VERSION;
112 data->nodeid = lkb->lkb_nodeid;
113 data->ownpid = lkb->lkb_ownpid;
114 data->id = lkb->lkb_id;
115 data->remid = lkb->lkb_remid;
116 data->status = lkb->lkb_status;
117 data->grmode = lkb->lkb_grmode;
118 data->rqmode = lkb->lkb_rqmode;
119 data->timestamp = lkb->lkb_timestamp;
120 if (ua)
121 data->xid = ua->xid;
122 if (r) {
123 data->lockspace_id = r->res_ls->ls_global_id;
124 data->resource_namelen = r->res_length;
125 memcpy(data->resource_name, r->res_name, r->res_length);
126 }
127}
128
129void dlm_timeout_warn(struct dlm_lkb *lkb)
130{
131 struct dlm_lock_data *data;
132 struct sk_buff *send_skb;
133 size_t size;
134 int rv;
135
136 size = nla_total_size(sizeof(struct dlm_lock_data)) +
137 nla_total_size(0); /* why this? */
138
139 rv = prepare_data(DLM_CMD_TIMEOUT, &send_skb, size);
140 if (rv < 0)
141 return;
142
143 data = mk_data(send_skb);
144 if (!data) {
145 nlmsg_free(send_skb);
146 return;
147 }
148
149 fill_data(data, lkb);
150
151 send_data(send_skb);
152}
153
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 6bfbd61538..e3a1527cbd 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
38 char *mb; 38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len; 39 int mb_len = sizeof(struct dlm_rcom) + len;
40 40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb); 41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
42 if (!mh) { 42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS", 43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len); 44 to_nodeid, type, len);
@@ -90,7 +90,7 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
90 log_error(ls, "version mismatch: %x nodeid %d: %x", 90 log_error(ls, "version mismatch: %x nodeid %d: %x",
91 DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid, 91 DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
92 rc->rc_header.h_version); 92 rc->rc_header.h_version);
93 return -EINVAL; 93 return -EPROTO;
94 } 94 }
95 95
96 if (rf->rf_lvblen != ls->ls_lvblen || 96 if (rf->rf_lvblen != ls->ls_lvblen ||
@@ -98,7 +98,7 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
98 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", 98 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
99 ls->ls_lvblen, ls->ls_exflags, 99 ls->ls_lvblen, ls->ls_exflags,
100 nodeid, rf->rf_lvblen, rf->rf_lsflags); 100 nodeid, rf->rf_lvblen, rf->rf_lsflags);
101 return -EINVAL; 101 return -EPROTO;
102 } 102 }
103 return 0; 103 return 0;
104} 104}
@@ -386,7 +386,8 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
386 dlm_recover_process_copy(ls, rc_in); 386 dlm_recover_process_copy(ls, rc_in);
387} 387}
388 388
389static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) 389static int send_ls_not_ready(struct dlm_ls *ls, int nodeid,
390 struct dlm_rcom *rc_in)
390{ 391{
391 struct dlm_rcom *rc; 392 struct dlm_rcom *rc;
392 struct rcom_config *rf; 393 struct rcom_config *rf;
@@ -394,7 +395,7 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
394 char *mb; 395 char *mb;
395 int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); 396 int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
396 397
397 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb); 398 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, ls->ls_allocation, &mb);
398 if (!mh) 399 if (!mh)
399 return -ENOBUFS; 400 return -ENOBUFS;
400 memset(mb, 0, mb_len); 401 memset(mb, 0, mb_len);
@@ -464,7 +465,7 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
464 log_print("lockspace %x from %d type %x not found", 465 log_print("lockspace %x from %d type %x not found",
465 hd->h_lockspace, nodeid, rc->rc_type); 466 hd->h_lockspace, nodeid, rc->rc_type);
466 if (rc->rc_type == DLM_RCOM_STATUS) 467 if (rc->rc_type == DLM_RCOM_STATUS)
467 send_ls_not_ready(nodeid, rc); 468 send_ls_not_ready(ls, nodeid, rc);
468 return; 469 return;
469 } 470 }
470 471
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 3cb636d602..6657599786 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -190,6 +190,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
190 190
191 dlm_clear_members_gone(ls); 191 dlm_clear_members_gone(ls);
192 192
193 dlm_adjust_timeouts(ls);
194
193 error = enable_locking(ls, rv->seq); 195 error = enable_locking(ls, rv->seq);
194 if (error) { 196 if (error) {
195 log_debug(ls, "enable_locking failed %d", error); 197 log_debug(ls, "enable_locking failed %d", error);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b0201ec325..6438941ab1 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -33,16 +33,17 @@ static const struct file_operations device_fops;
33struct dlm_lock_params32 { 33struct dlm_lock_params32 {
34 __u8 mode; 34 __u8 mode;
35 __u8 namelen; 35 __u8 namelen;
36 __u16 flags; 36 __u16 unused;
37 __u32 flags;
37 __u32 lkid; 38 __u32 lkid;
38 __u32 parent; 39 __u32 parent;
39 40 __u64 xid;
41 __u64 timeout;
40 __u32 castparam; 42 __u32 castparam;
41 __u32 castaddr; 43 __u32 castaddr;
42 __u32 bastparam; 44 __u32 bastparam;
43 __u32 bastaddr; 45 __u32 bastaddr;
44 __u32 lksb; 46 __u32 lksb;
45
46 char lvb[DLM_USER_LVB_LEN]; 47 char lvb[DLM_USER_LVB_LEN];
47 char name[0]; 48 char name[0];
48}; 49};
@@ -68,6 +69,7 @@ struct dlm_lksb32 {
68}; 69};
69 70
70struct dlm_lock_result32 { 71struct dlm_lock_result32 {
72 __u32 version[3];
71 __u32 length; 73 __u32 length;
72 __u32 user_astaddr; 74 __u32 user_astaddr;
73 __u32 user_astparam; 75 __u32 user_astparam;
@@ -102,6 +104,8 @@ static void compat_input(struct dlm_write_request *kb,
102 kb->i.lock.flags = kb32->i.lock.flags; 104 kb->i.lock.flags = kb32->i.lock.flags;
103 kb->i.lock.lkid = kb32->i.lock.lkid; 105 kb->i.lock.lkid = kb32->i.lock.lkid;
104 kb->i.lock.parent = kb32->i.lock.parent; 106 kb->i.lock.parent = kb32->i.lock.parent;
107 kb->i.lock.xid = kb32->i.lock.xid;
108 kb->i.lock.timeout = kb32->i.lock.timeout;
105 kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam; 109 kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
106 kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr; 110 kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
107 kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam; 111 kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
@@ -115,6 +119,10 @@ static void compat_input(struct dlm_write_request *kb,
115static void compat_output(struct dlm_lock_result *res, 119static void compat_output(struct dlm_lock_result *res,
116 struct dlm_lock_result32 *res32) 120 struct dlm_lock_result32 *res32)
117{ 121{
122 res32->version[0] = res->version[0];
123 res32->version[1] = res->version[1];
124 res32->version[2] = res->version[2];
125
118 res32->user_astaddr = (__u32)(long)res->user_astaddr; 126 res32->user_astaddr = (__u32)(long)res->user_astaddr;
119 res32->user_astparam = (__u32)(long)res->user_astparam; 127 res32->user_astparam = (__u32)(long)res->user_astparam;
120 res32->user_lksb = (__u32)(long)res->user_lksb; 128 res32->user_lksb = (__u32)(long)res->user_lksb;
@@ -130,6 +138,36 @@ static void compat_output(struct dlm_lock_result *res,
130} 138}
131#endif 139#endif
132 140
141/* Figure out if this lock is at the end of its life and no longer
142 available for the application to use. The lkb still exists until
143 the final ast is read. A lock becomes EOL in three situations:
144 1. a noqueue request fails with EAGAIN
145 2. an unlock completes with EUNLOCK
146 3. a cancel of a waiting request completes with ECANCEL/EDEADLK
147 An EOL lock needs to be removed from the process's list of locks.
148 And we can't allow any new operation on an EOL lock. This is
149 not related to the lifetime of the lkb struct which is managed
150 entirely by refcount. */
151
152static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
153{
154 switch (sb_status) {
155 case -DLM_EUNLOCK:
156 return 1;
157 case -DLM_ECANCEL:
158 case -ETIMEDOUT:
159 case -EDEADLK:
160 if (lkb->lkb_grmode == DLM_LOCK_IV)
161 return 1;
162 break;
163 case -EAGAIN:
164 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV)
165 return 1;
166 break;
167 }
168 return 0;
169}
170
133/* we could possibly check if the cancel of an orphan has resulted in the lkb 171/* we could possibly check if the cancel of an orphan has resulted in the lkb
134 being removed and then remove that lkb from the orphans list and free it */ 172 being removed and then remove that lkb from the orphans list and free it */
135 173
@@ -176,25 +214,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
176 log_debug(ls, "ast overlap %x status %x %x", 214 log_debug(ls, "ast overlap %x status %x %x",
177 lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags); 215 lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
178 216
179 /* Figure out if this lock is at the end of its life and no longer 217 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
180 available for the application to use. The lkb still exists until
181 the final ast is read. A lock becomes EOL in three situations:
182 1. a noqueue request fails with EAGAIN
183 2. an unlock completes with EUNLOCK
184 3. a cancel of a waiting request completes with ECANCEL
185 An EOL lock needs to be removed from the process's list of locks.
186 And we can't allow any new operation on an EOL lock. This is
187 not related to the lifetime of the lkb struct which is managed
188 entirely by refcount. */
189
190 if (type == AST_COMP &&
191 lkb->lkb_grmode == DLM_LOCK_IV &&
192 ua->lksb.sb_status == -EAGAIN)
193 eol = 1;
194 else if (ua->lksb.sb_status == -DLM_EUNLOCK ||
195 (ua->lksb.sb_status == -DLM_ECANCEL &&
196 lkb->lkb_grmode == DLM_LOCK_IV))
197 eol = 1;
198 if (eol) { 218 if (eol) {
199 lkb->lkb_ast_type &= ~AST_BAST; 219 lkb->lkb_ast_type &= ~AST_BAST;
200 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; 220 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
@@ -252,16 +272,18 @@ static int device_user_lock(struct dlm_user_proc *proc,
252 ua->castaddr = params->castaddr; 272 ua->castaddr = params->castaddr;
253 ua->bastparam = params->bastparam; 273 ua->bastparam = params->bastparam;
254 ua->bastaddr = params->bastaddr; 274 ua->bastaddr = params->bastaddr;
275 ua->xid = params->xid;
255 276
256 if (params->flags & DLM_LKF_CONVERT) 277 if (params->flags & DLM_LKF_CONVERT)
257 error = dlm_user_convert(ls, ua, 278 error = dlm_user_convert(ls, ua,
258 params->mode, params->flags, 279 params->mode, params->flags,
259 params->lkid, params->lvb); 280 params->lkid, params->lvb,
281 (unsigned long) params->timeout);
260 else { 282 else {
261 error = dlm_user_request(ls, ua, 283 error = dlm_user_request(ls, ua,
262 params->mode, params->flags, 284 params->mode, params->flags,
263 params->name, params->namelen, 285 params->name, params->namelen,
264 params->parent); 286 (unsigned long) params->timeout);
265 if (!error) 287 if (!error)
266 error = ua->lksb.sb_lkid; 288 error = ua->lksb.sb_lkid;
267 } 289 }
@@ -299,6 +321,22 @@ static int device_user_unlock(struct dlm_user_proc *proc,
299 return error; 321 return error;
300} 322}
301 323
324static int device_user_deadlock(struct dlm_user_proc *proc,
325 struct dlm_lock_params *params)
326{
327 struct dlm_ls *ls;
328 int error;
329
330 ls = dlm_find_lockspace_local(proc->lockspace);
331 if (!ls)
332 return -ENOENT;
333
334 error = dlm_user_deadlock(ls, params->flags, params->lkid);
335
336 dlm_put_lockspace(ls);
337 return error;
338}
339
302static int create_misc_device(struct dlm_ls *ls, char *name) 340static int create_misc_device(struct dlm_ls *ls, char *name)
303{ 341{
304 int error, len; 342 int error, len;
@@ -348,7 +386,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
348 return -EPERM; 386 return -EPERM;
349 387
350 error = dlm_new_lockspace(params->name, strlen(params->name), 388 error = dlm_new_lockspace(params->name, strlen(params->name),
351 &lockspace, 0, DLM_USER_LVB_LEN); 389 &lockspace, params->flags, DLM_USER_LVB_LEN);
352 if (error) 390 if (error)
353 return error; 391 return error;
354 392
@@ -524,6 +562,14 @@ static ssize_t device_write(struct file *file, const char __user *buf,
524 error = device_user_unlock(proc, &kbuf->i.lock); 562 error = device_user_unlock(proc, &kbuf->i.lock);
525 break; 563 break;
526 564
565 case DLM_USER_DEADLOCK:
566 if (!proc) {
567 log_print("no locking on control device");
568 goto out_sig;
569 }
570 error = device_user_deadlock(proc, &kbuf->i.lock);
571 break;
572
527 case DLM_USER_CREATE_LOCKSPACE: 573 case DLM_USER_CREATE_LOCKSPACE:
528 if (proc) { 574 if (proc) {
529 log_print("create/remove only on control device"); 575 log_print("create/remove only on control device");
@@ -641,6 +687,9 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
641 int struct_len; 687 int struct_len;
642 688
643 memset(&result, 0, sizeof(struct dlm_lock_result)); 689 memset(&result, 0, sizeof(struct dlm_lock_result));
690 result.version[0] = DLM_DEVICE_VERSION_MAJOR;
691 result.version[1] = DLM_DEVICE_VERSION_MINOR;
692 result.version[2] = DLM_DEVICE_VERSION_PATCH;
644 memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb)); 693 memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
645 result.user_lksb = ua->user_lksb; 694 result.user_lksb = ua->user_lksb;
646 695
@@ -699,6 +748,20 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
699 return error; 748 return error;
700} 749}
701 750
751static int copy_version_to_user(char __user *buf, size_t count)
752{
753 struct dlm_device_version ver;
754
755 memset(&ver, 0, sizeof(struct dlm_device_version));
756 ver.version[0] = DLM_DEVICE_VERSION_MAJOR;
757 ver.version[1] = DLM_DEVICE_VERSION_MINOR;
758 ver.version[2] = DLM_DEVICE_VERSION_PATCH;
759
760 if (copy_to_user(buf, &ver, sizeof(struct dlm_device_version)))
761 return -EFAULT;
762 return sizeof(struct dlm_device_version);
763}
764
702/* a read returns a single ast described in a struct dlm_lock_result */ 765/* a read returns a single ast described in a struct dlm_lock_result */
703 766
704static ssize_t device_read(struct file *file, char __user *buf, size_t count, 767static ssize_t device_read(struct file *file, char __user *buf, size_t count,
@@ -710,6 +773,16 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
710 DECLARE_WAITQUEUE(wait, current); 773 DECLARE_WAITQUEUE(wait, current);
711 int error, type=0, bmode=0, removed = 0; 774 int error, type=0, bmode=0, removed = 0;
712 775
776 if (count == sizeof(struct dlm_device_version)) {
777 error = copy_version_to_user(buf, count);
778 return error;
779 }
780
781 if (!proc) {
782 log_print("non-version read from control device %zu", count);
783 return -EINVAL;
784 }
785
713#ifdef CONFIG_COMPAT 786#ifdef CONFIG_COMPAT
714 if (count < sizeof(struct dlm_lock_result32)) 787 if (count < sizeof(struct dlm_lock_result32))
715#else 788#else
@@ -747,11 +820,6 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
747 } 820 }
748 } 821 }
749 822
750 if (list_empty(&proc->asts)) {
751 spin_unlock(&proc->asts_spin);
752 return -EAGAIN;
753 }
754
755 /* there may be both completion and blocking asts to return for 823 /* there may be both completion and blocking asts to return for
756 the lkb, don't remove lkb from asts list unless no asts remain */ 824 the lkb, don't remove lkb from asts list unless no asts remain */
757 825
@@ -823,6 +891,7 @@ static const struct file_operations device_fops = {
823static const struct file_operations ctl_device_fops = { 891static const struct file_operations ctl_device_fops = {
824 .open = ctl_device_open, 892 .open = ctl_device_open,
825 .release = ctl_device_close, 893 .release = ctl_device_close,
894 .read = device_read,
826 .write = device_write, 895 .write = device_write,
827 .owner = THIS_MODULE, 896 .owner = THIS_MODULE,
828}; 897};
diff --git a/fs/dquot.c b/fs/dquot.c
index 8819d28150..7e273151f5 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -538,6 +538,11 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
538 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; 538 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure;
539} 539}
540 540
541static struct shrinker dqcache_shrinker = {
542 .shrink = shrink_dqcache_memory,
543 .seeks = DEFAULT_SEEKS,
544};
545
541/* 546/*
542 * Put reference to dquot 547 * Put reference to dquot
543 * NOTE: If you change this function please check whether dqput_blocks() works right... 548 * NOTE: If you change this function please check whether dqput_blocks() works right...
@@ -1870,7 +1875,7 @@ static int __init dquot_init(void)
1870 printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n", 1875 printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n",
1871 nr_hash, order, (PAGE_SIZE << order)); 1876 nr_hash, order, (PAGE_SIZE << order));
1872 1877
1873 set_shrinker(DEFAULT_SEEKS, shrink_dqcache_memory); 1878 register_shrinker(&dqcache_shrinker);
1874 1879
1875 return 0; 1880 return 0;
1876} 1881}
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 03ea7696fe..59375efcf3 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -20,7 +20,7 @@ static void drop_pagecache_sb(struct super_block *sb)
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE)) 21 if (inode->i_state & (I_FREEING|I_WILL_FREE))
22 continue; 22 continue;
23 invalidate_mapping_pages(inode->i_mapping, 0, -1); 23 __invalidate_mapping_pages(inode->i_mapping, 0, -1, true);
24 } 24 }
25 spin_unlock(&inode_lock); 25 spin_unlock(&inode_lock);
26} 26}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 403e3bad14..1b9dd9a96f 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -580,5 +580,7 @@ void
580ecryptfs_write_header_metadata(char *virt, 580ecryptfs_write_header_metadata(char *virt,
581 struct ecryptfs_crypt_stat *crypt_stat, 581 struct ecryptfs_crypt_stat *crypt_stat,
582 size_t *written); 582 size_t *written);
583int ecryptfs_write_zeros(struct file *file, pgoff_t index, int start,
584 int num_zeros);
583 585
584#endif /* #ifndef ECRYPTFS_KERNEL_H */ 586#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 59288d8170..94f456fe4d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -338,16 +338,17 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
338 return rc; 338 return rc;
339} 339}
340 340
341static ssize_t ecryptfs_sendfile(struct file *file, loff_t * ppos, 341static ssize_t ecryptfs_splice_read(struct file *file, loff_t * ppos,
342 size_t count, read_actor_t actor, void *target) 342 struct pipe_inode_info *pipe, size_t count,
343 unsigned int flags)
343{ 344{
344 struct file *lower_file = NULL; 345 struct file *lower_file = NULL;
345 int rc = -EINVAL; 346 int rc = -EINVAL;
346 347
347 lower_file = ecryptfs_file_to_lower(file); 348 lower_file = ecryptfs_file_to_lower(file);
348 if (lower_file->f_op && lower_file->f_op->sendfile) 349 if (lower_file->f_op && lower_file->f_op->splice_read)
349 rc = lower_file->f_op->sendfile(lower_file, ppos, count, 350 rc = lower_file->f_op->splice_read(lower_file, ppos, pipe,
350 actor, target); 351 count, flags);
351 352
352 return rc; 353 return rc;
353} 354}
@@ -364,7 +365,7 @@ const struct file_operations ecryptfs_dir_fops = {
364 .release = ecryptfs_release, 365 .release = ecryptfs_release,
365 .fsync = ecryptfs_fsync, 366 .fsync = ecryptfs_fsync,
366 .fasync = ecryptfs_fasync, 367 .fasync = ecryptfs_fasync,
367 .sendfile = ecryptfs_sendfile, 368 .splice_read = ecryptfs_splice_read,
368}; 369};
369 370
370const struct file_operations ecryptfs_main_fops = { 371const struct file_operations ecryptfs_main_fops = {
@@ -381,7 +382,7 @@ const struct file_operations ecryptfs_main_fops = {
381 .release = ecryptfs_release, 382 .release = ecryptfs_release,
382 .fsync = ecryptfs_fsync, 383 .fsync = ecryptfs_fsync,
383 .fasync = ecryptfs_fasync, 384 .fasync = ecryptfs_fasync,
384 .sendfile = ecryptfs_sendfile, 385 .splice_read = ecryptfs_splice_read,
385}; 386};
386 387
387static int 388static int
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1548be26b5..e77a2ec71a 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -282,7 +282,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
282 struct dentry *lower_dentry; 282 struct dentry *lower_dentry;
283 struct vfsmount *lower_mnt; 283 struct vfsmount *lower_mnt;
284 char *encoded_name; 284 char *encoded_name;
285 unsigned int encoded_namelen; 285 int encoded_namelen;
286 struct ecryptfs_crypt_stat *crypt_stat = NULL; 286 struct ecryptfs_crypt_stat *crypt_stat = NULL;
287 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 287 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
288 char *page_virt = NULL; 288 char *page_virt = NULL;
@@ -473,7 +473,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
473 struct dentry *lower_dir_dentry; 473 struct dentry *lower_dir_dentry;
474 umode_t mode; 474 umode_t mode;
475 char *encoded_symname; 475 char *encoded_symname;
476 unsigned int encoded_symlen; 476 int encoded_symlen;
477 struct ecryptfs_crypt_stat *crypt_stat = NULL; 477 struct ecryptfs_crypt_stat *crypt_stat = NULL;
478 478
479 lower_dentry = ecryptfs_dentry_to_lower(dentry); 479 lower_dentry = ecryptfs_dentry_to_lower(dentry);
@@ -800,6 +800,25 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
800 goto out_fput; 800 goto out_fput;
801 } 801 }
802 } else { /* new_length < i_size_read(inode) */ 802 } else { /* new_length < i_size_read(inode) */
803 pgoff_t index = 0;
804 int end_pos_in_page = -1;
805
806 if (new_length != 0) {
807 index = ((new_length - 1) >> PAGE_CACHE_SHIFT);
808 end_pos_in_page = ((new_length - 1) & ~PAGE_CACHE_MASK);
809 }
810 if (end_pos_in_page != (PAGE_CACHE_SIZE - 1)) {
811 if ((rc = ecryptfs_write_zeros(&fake_ecryptfs_file,
812 index,
813 (end_pos_in_page + 1),
814 ((PAGE_CACHE_SIZE - 1)
815 - end_pos_in_page)))) {
816 printk(KERN_ERR "Error attempting to zero out "
817 "the remainder of the end page on "
818 "reducing truncate; rc = [%d]\n", rc);
819 goto out_fput;
820 }
821 }
803 vmtruncate(inode, new_length); 822 vmtruncate(inode, new_length);
804 rc = ecryptfs_write_inode_size_to_metadata( 823 rc = ecryptfs_write_inode_size_to_metadata(
805 lower_file, lower_dentry->d_inode, inode, dentry, 824 lower_file, lower_dentry->d_inode, inode, dentry,
@@ -875,9 +894,54 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
875 struct ecryptfs_crypt_stat *crypt_stat; 894 struct ecryptfs_crypt_stat *crypt_stat;
876 895
877 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 896 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
878 lower_dentry = ecryptfs_dentry_to_lower(dentry); 897 if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
898 ecryptfs_init_crypt_stat(crypt_stat);
879 inode = dentry->d_inode; 899 inode = dentry->d_inode;
880 lower_inode = ecryptfs_inode_to_lower(inode); 900 lower_inode = ecryptfs_inode_to_lower(inode);
901 lower_dentry = ecryptfs_dentry_to_lower(dentry);
902 mutex_lock(&crypt_stat->cs_mutex);
903 if (S_ISDIR(dentry->d_inode->i_mode))
904 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
905 else if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)
906 || !(crypt_stat->flags & ECRYPTFS_KEY_VALID)) {
907 struct vfsmount *lower_mnt;
908 struct file *lower_file = NULL;
909 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
910 int lower_flags;
911
912 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
913 lower_flags = O_RDONLY;
914 if ((rc = ecryptfs_open_lower_file(&lower_file, lower_dentry,
915 lower_mnt, lower_flags))) {
916 printk(KERN_ERR
917 "Error opening lower file; rc = [%d]\n", rc);
918 mutex_unlock(&crypt_stat->cs_mutex);
919 goto out;
920 }
921 mount_crypt_stat = &ecryptfs_superblock_to_private(
922 dentry->d_sb)->mount_crypt_stat;
923 if ((rc = ecryptfs_read_metadata(dentry, lower_file))) {
924 if (!(mount_crypt_stat->flags
925 & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
926 rc = -EIO;
927 printk(KERN_WARNING "Attempt to read file that "
928 "is not in a valid eCryptfs format, "
929 "and plaintext passthrough mode is not "
930 "enabled; returning -EIO\n");
931
932 mutex_unlock(&crypt_stat->cs_mutex);
933 fput(lower_file);
934 goto out;
935 }
936 rc = 0;
937 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
938 mutex_unlock(&crypt_stat->cs_mutex);
939 fput(lower_file);
940 goto out;
941 }
942 fput(lower_file);
943 }
944 mutex_unlock(&crypt_stat->cs_mutex);
881 if (ia->ia_valid & ATTR_SIZE) { 945 if (ia->ia_valid & ATTR_SIZE) {
882 ecryptfs_printk(KERN_DEBUG, 946 ecryptfs_printk(KERN_DEBUG,
883 "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n", 947 "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 606128f5c9..02ca6f1e55 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -840,8 +840,6 @@ static int __init ecryptfs_init(void)
840 goto out; 840 goto out;
841 } 841 }
842 kobj_set_kset_s(&ecryptfs_subsys, fs_subsys); 842 kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
843 sysfs_attr_version.attr.owner = THIS_MODULE;
844 sysfs_attr_version_str.attr.owner = THIS_MODULE;
845 rc = do_sysfs_registration(); 843 rc = do_sysfs_registration();
846 if (rc) { 844 if (rc) {
847 printk(KERN_ERR "sysfs registration failed\n"); 845 printk(KERN_ERR "sysfs registration failed\n");
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 55cec98a84..7d5a43cb0d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -56,9 +56,6 @@ static struct page *ecryptfs_get1page(struct file *file, int index)
56 return read_mapping_page(mapping, index, (void *)file); 56 return read_mapping_page(mapping, index, (void *)file);
57} 57}
58 58
59static
60int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros);
61
62/** 59/**
63 * ecryptfs_fill_zeros 60 * ecryptfs_fill_zeros
64 * @file: The ecryptfs file 61 * @file: The ecryptfs file
@@ -101,10 +98,13 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length)
101 if (old_end_page_index == new_end_page_index) { 98 if (old_end_page_index == new_end_page_index) {
102 /* Start and end are in the same page; we just need to 99 /* Start and end are in the same page; we just need to
103 * set a portion of the existing page to zero's */ 100 * set a portion of the existing page to zero's */
104 rc = write_zeros(file, index, (old_end_pos_in_page + 1), 101 rc = ecryptfs_write_zeros(file, index,
105 (new_end_pos_in_page - old_end_pos_in_page)); 102 (old_end_pos_in_page + 1),
103 (new_end_pos_in_page
104 - old_end_pos_in_page));
106 if (rc) 105 if (rc)
107 ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], " 106 ecryptfs_printk(KERN_ERR, "ecryptfs_write_zeros("
107 "file=[%p], "
108 "index=[0x%.16x], " 108 "index=[0x%.16x], "
109 "old_end_pos_in_page=[d], " 109 "old_end_pos_in_page=[d], "
110 "(PAGE_CACHE_SIZE - new_end_pos_in_page" 110 "(PAGE_CACHE_SIZE - new_end_pos_in_page"
@@ -117,10 +117,10 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length)
117 goto out; 117 goto out;
118 } 118 }
119 /* Fill the remainder of the previous last page with zeros */ 119 /* Fill the remainder of the previous last page with zeros */
120 rc = write_zeros(file, index, (old_end_pos_in_page + 1), 120 rc = ecryptfs_write_zeros(file, index, (old_end_pos_in_page + 1),
121 ((PAGE_CACHE_SIZE - 1) - old_end_pos_in_page)); 121 ((PAGE_CACHE_SIZE - 1) - old_end_pos_in_page));
122 if (rc) { 122 if (rc) {
123 ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], " 123 ecryptfs_printk(KERN_ERR, "ecryptfs_write_zeros(file=[%p], "
124 "index=[0x%.16x], old_end_pos_in_page=[d], " 124 "index=[0x%.16x], old_end_pos_in_page=[d], "
125 "(PAGE_CACHE_SIZE - old_end_pos_in_page)=[d]) " 125 "(PAGE_CACHE_SIZE - old_end_pos_in_page)=[d]) "
126 "returned [%d]\n", file, index, 126 "returned [%d]\n", file, index,
@@ -131,9 +131,10 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length)
131 index++; 131 index++;
132 while (index < new_end_page_index) { 132 while (index < new_end_page_index) {
133 /* Fill all intermediate pages with zeros */ 133 /* Fill all intermediate pages with zeros */
134 rc = write_zeros(file, index, 0, PAGE_CACHE_SIZE); 134 rc = ecryptfs_write_zeros(file, index, 0, PAGE_CACHE_SIZE);
135 if (rc) { 135 if (rc) {
136 ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], " 136 ecryptfs_printk(KERN_ERR, "ecryptfs_write_zeros("
137 "file=[%p], "
137 "index=[0x%.16x], " 138 "index=[0x%.16x], "
138 "old_end_pos_in_page=[d], " 139 "old_end_pos_in_page=[d], "
139 "(PAGE_CACHE_SIZE - new_end_pos_in_page" 140 "(PAGE_CACHE_SIZE - new_end_pos_in_page"
@@ -149,9 +150,9 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length)
149 } 150 }
150 /* Fill the portion at the beginning of the last new page with 151 /* Fill the portion at the beginning of the last new page with
151 * zero's */ 152 * zero's */
152 rc = write_zeros(file, index, 0, (new_end_pos_in_page + 1)); 153 rc = ecryptfs_write_zeros(file, index, 0, (new_end_pos_in_page + 1));
153 if (rc) { 154 if (rc) {
154 ecryptfs_printk(KERN_ERR, "write_zeros(file=" 155 ecryptfs_printk(KERN_ERR, "ecryptfs_write_zeros(file="
155 "[%p], index=[0x%.16x], 0, " 156 "[%p], index=[0x%.16x], 0, "
156 "new_end_pos_in_page=[%d]" 157 "new_end_pos_in_page=[%d]"
157 "returned [%d]\n", file, index, 158 "returned [%d]\n", file, index,
@@ -400,7 +401,6 @@ out:
400static int ecryptfs_prepare_write(struct file *file, struct page *page, 401static int ecryptfs_prepare_write(struct file *file, struct page *page,
401 unsigned from, unsigned to) 402 unsigned from, unsigned to)
402{ 403{
403 loff_t pos;
404 int rc = 0; 404 int rc = 0;
405 405
406 if (from == 0 && to == PAGE_CACHE_SIZE) 406 if (from == 0 && to == PAGE_CACHE_SIZE)
@@ -408,15 +408,22 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
408 up to date. */ 408 up to date. */
409 if (!PageUptodate(page)) 409 if (!PageUptodate(page))
410 rc = ecryptfs_do_readpage(file, page, page->index); 410 rc = ecryptfs_do_readpage(file, page, page->index);
411 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 411 if (page->index != 0) {
412 if (pos > i_size_read(page->mapping->host)) { 412 loff_t end_of_prev_pg_pos =
413 rc = ecryptfs_truncate(file->f_path.dentry, pos); 413 (((loff_t)page->index << PAGE_CACHE_SHIFT) - 1);
414 if (rc) { 414
415 printk(KERN_ERR "Error on attempt to " 415 if (end_of_prev_pg_pos > i_size_read(page->mapping->host)) {
416 "truncate to (higher) offset [%lld];" 416 rc = ecryptfs_truncate(file->f_path.dentry,
417 " rc = [%d]\n", pos, rc); 417 end_of_prev_pg_pos);
418 goto out; 418 if (rc) {
419 printk(KERN_ERR "Error on attempt to "
420 "truncate to (higher) offset [%lld];"
421 " rc = [%d]\n", end_of_prev_pg_pos, rc);
422 goto out;
423 }
419 } 424 }
425 if (end_of_prev_pg_pos + 1 > i_size_read(page->mapping->host))
426 zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
420 } 427 }
421out: 428out:
422 return rc; 429 return rc;
@@ -753,7 +760,7 @@ out:
753} 760}
754 761
755/** 762/**
756 * write_zeros 763 * ecryptfs_write_zeros
757 * @file: The ecryptfs file 764 * @file: The ecryptfs file
758 * @index: The index in which we are writing 765 * @index: The index in which we are writing
759 * @start: The position after the last block of data 766 * @start: The position after the last block of data
@@ -763,8 +770,8 @@ out:
763 * 770 *
764 * (start + num_zeros) must be less than or equal to PAGE_CACHE_SIZE 771 * (start + num_zeros) must be less than or equal to PAGE_CACHE_SIZE
765 */ 772 */
766static 773int
767int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros) 774ecryptfs_write_zeros(struct file *file, pgoff_t index, int start, int num_zeros)
768{ 775{
769 int rc = 0; 776 int rc = 0;
770 struct page *tmp_page; 777 struct page *tmp_page;
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index ed4a207fe2..5276b19423 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -75,6 +75,38 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
75 return NULL; 75 return NULL;
76} 76}
77 77
78struct dentry *efs_get_dentry(struct super_block *sb, void *vobjp)
79{
80 __u32 *objp = vobjp;
81 unsigned long ino = objp[0];
82 __u32 generation = objp[1];
83 struct inode *inode;
84 struct dentry *result;
85
86 if (ino == 0)
87 return ERR_PTR(-ESTALE);
88 inode = iget(sb, ino);
89 if (inode == NULL)
90 return ERR_PTR(-ENOMEM);
91
92 if (is_bad_inode(inode) ||
93 (generation && inode->i_generation != generation)) {
94 result = ERR_PTR(-ESTALE);
95 goto out_iput;
96 }
97
98 result = d_alloc_anon(inode);
99 if (!result) {
100 result = ERR_PTR(-ENOMEM);
101 goto out_iput;
102 }
103 return result;
104
105 out_iput:
106 iput(inode);
107 return result;
108}
109
78struct dentry *efs_get_parent(struct dentry *child) 110struct dentry *efs_get_parent(struct dentry *child)
79{ 111{
80 struct dentry *parent; 112 struct dentry *parent;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index e0a6839e68..d360c81f3a 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -11,6 +11,7 @@
11#include <linux/efs_fs.h> 11#include <linux/efs_fs.h>
12#include <linux/efs_vh.h> 12#include <linux/efs_vh.h>
13#include <linux/efs_fs_sb.h> 13#include <linux/efs_fs_sb.h>
14#include <linux/exportfs.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
16#include <linux/vfs.h> 17#include <linux/vfs.h>
@@ -113,6 +114,7 @@ static const struct super_operations efs_superblock_operations = {
113}; 114};
114 115
115static struct export_operations efs_export_ops = { 116static struct export_operations efs_export_ops = {
117 .get_dentry = efs_get_dentry,
116 .get_parent = efs_get_parent, 118 .get_parent = efs_get_parent,
117}; 119};
118 120
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e98f6cd720..8adb32a938 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -1,15 +1,45 @@
1 1
2#include <linux/exportfs.h>
2#include <linux/fs.h> 3#include <linux/fs.h>
3#include <linux/file.h> 4#include <linux/file.h>
4#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/mount.h>
5#include <linux/namei.h> 7#include <linux/namei.h>
6 8
7struct export_operations export_op_default; 9#define dprintk(fmt, args...) do{}while(0)
8 10
9#define CALL(ops,fun) ((ops->fun)?(ops->fun):export_op_default.fun)
10 11
11#define dprintk(fmt, args...) do{}while(0) 12static int get_name(struct dentry *dentry, char *name,
13 struct dentry *child);
14
15
16static struct dentry *exportfs_get_dentry(struct super_block *sb, void *obj)
17{
18 struct dentry *result = ERR_PTR(-ESTALE);
19
20 if (sb->s_export_op->get_dentry) {
21 result = sb->s_export_op->get_dentry(sb, obj);
22 if (!result)
23 result = ERR_PTR(-ESTALE);
24 }
25
26 return result;
27}
28
29static int exportfs_get_name(struct dentry *dir, char *name,
30 struct dentry *child)
31{
32 struct export_operations *nop = dir->d_sb->s_export_op;
12 33
34 if (nop->get_name)
35 return nop->get_name(dir, name, child);
36 else
37 return get_name(dir, name, child);
38}
39
40/*
41 * Check if the dentry or any of it's aliases is acceptable.
42 */
13static struct dentry * 43static struct dentry *
14find_acceptable_alias(struct dentry *result, 44find_acceptable_alias(struct dentry *result,
15 int (*acceptable)(void *context, struct dentry *dentry), 45 int (*acceptable)(void *context, struct dentry *dentry),
@@ -17,6 +47,9 @@ find_acceptable_alias(struct dentry *result,
17{ 47{
18 struct dentry *dentry, *toput = NULL; 48 struct dentry *dentry, *toput = NULL;
19 49
50 if (acceptable(context, result))
51 return result;
52
20 spin_lock(&dcache_lock); 53 spin_lock(&dcache_lock);
21 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { 54 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
22 dget_locked(dentry); 55 dget_locked(dentry);
@@ -37,130 +70,50 @@ find_acceptable_alias(struct dentry *result,
37 return NULL; 70 return NULL;
38} 71}
39 72
40/** 73/*
41 * find_exported_dentry - helper routine to implement export_operations->decode_fh 74 * Find root of a disconnected subtree and return a reference to it.
42 * @sb: The &super_block identifying the filesystem
43 * @obj: An opaque identifier of the object to be found - passed to
44 * get_inode
45 * @parent: An optional opqaue identifier of the parent of the object.
46 * @acceptable: A function used to test possible &dentries to see if they are
47 * acceptable
48 * @context: A parameter to @acceptable so that it knows on what basis to
49 * judge.
50 *
51 * find_exported_dentry is the central helper routine to enable file systems
52 * to provide the decode_fh() export_operation. It's main task is to take
53 * an &inode, find or create an appropriate &dentry structure, and possibly
54 * splice this into the dcache in the correct place.
55 *
56 * The decode_fh() operation provided by the filesystem should call
57 * find_exported_dentry() with the same parameters that it received except
58 * that instead of the file handle fragment, pointers to opaque identifiers
59 * for the object and optionally its parent are passed. The default decode_fh
60 * routine passes one pointer to the start of the filehandle fragment, and
61 * one 8 bytes into the fragment. It is expected that most filesystems will
62 * take this approach, though the offset to the parent identifier may well be
63 * different.
64 *
65 * find_exported_dentry() will call get_dentry to get an dentry pointer from
66 * the file system. If any &dentry in the d_alias list is acceptable, it will
67 * be returned. Otherwise find_exported_dentry() will attempt to splice a new
68 * &dentry into the dcache using get_name() and get_parent() to find the
69 * appropriate place.
70 */ 75 */
71 76static struct dentry *
72struct dentry * 77find_disconnected_root(struct dentry *dentry)
73find_exported_dentry(struct super_block *sb, void *obj, void *parent,
74 int (*acceptable)(void *context, struct dentry *de),
75 void *context)
76{ 78{
77 struct dentry *result = NULL; 79 dget(dentry);
78 struct dentry *target_dir; 80 spin_lock(&dentry->d_lock);
79 int err; 81 while (!IS_ROOT(dentry) &&
80 struct export_operations *nops = sb->s_export_op; 82 (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) {
81 struct dentry *alias; 83 struct dentry *parent = dentry->d_parent;
82 int noprogress; 84 dget(parent);
83 char nbuf[NAME_MAX+1]; 85 spin_unlock(&dentry->d_lock);
84 86 dput(dentry);
85 /* 87 dentry = parent;
86 * Attempt to find the inode. 88 spin_lock(&dentry->d_lock);
87 */
88 result = CALL(sb->s_export_op,get_dentry)(sb,obj);
89 err = -ESTALE;
90 if (result == NULL)
91 goto err_out;
92 if (IS_ERR(result)) {
93 err = PTR_ERR(result);
94 goto err_out;
95 } 89 }
96 if (S_ISDIR(result->d_inode->i_mode) && 90 spin_unlock(&dentry->d_lock);
97 (result->d_flags & DCACHE_DISCONNECTED)) { 91 return dentry;
98 /* it is an unconnected directory, we must connect it */ 92}
99 ;
100 } else {
101 if (acceptable(context, result))
102 return result;
103 if (S_ISDIR(result->d_inode->i_mode)) {
104 err = -EACCES;
105 goto err_result;
106 }
107 93
108 alias = find_acceptable_alias(result, acceptable, context);
109 if (alias)
110 return alias;
111 }
112
113 /* It's a directory, or we are required to confirm the file's
114 * location in the tree based on the parent information
115 */
116 dprintk("find_exported_dentry: need to look harder for %s/%d\n",sb->s_id,*(int*)obj);
117 if (S_ISDIR(result->d_inode->i_mode))
118 target_dir = dget(result);
119 else {
120 if (parent == NULL)
121 goto err_result;
122 94
123 target_dir = CALL(sb->s_export_op,get_dentry)(sb,parent); 95/*
124 if (IS_ERR(target_dir)) 96 * Make sure target_dir is fully connected to the dentry tree.
125 err = PTR_ERR(target_dir); 97 *
126 if (target_dir == NULL || IS_ERR(target_dir)) 98 * It may already be, as the flag isn't always updated when connection happens.
127 goto err_result; 99 */
128 } 100static int
129 /* 101reconnect_path(struct super_block *sb, struct dentry *target_dir)
130 * Now we need to make sure that target_dir is properly connected. 102{
131 * It may already be, as the flag isn't always updated when connection 103 char nbuf[NAME_MAX+1];
132 * happens. 104 int noprogress = 0;
133 * So, we walk up parent links until we find a connected directory, 105 int err = -ESTALE;
134 * or we run out of directories. Then we find the parent, find
135 * the name of the child in that parent, and do a lookup.
136 * This should connect the child into the parent
137 * We then repeat.
138 */
139 106
140 /* it is possible that a confused file system might not let us complete 107 /*
108 * It is possible that a confused file system might not let us complete
141 * the path to the root. For example, if get_parent returns a directory 109 * the path to the root. For example, if get_parent returns a directory
142 * in which we cannot find a name for the child. While this implies a 110 * in which we cannot find a name for the child. While this implies a
143 * very sick filesystem we don't want it to cause knfsd to spin. Hence 111 * very sick filesystem we don't want it to cause knfsd to spin. Hence
144 * the noprogress counter. If we go through the loop 10 times (2 is 112 * the noprogress counter. If we go through the loop 10 times (2 is
145 * probably enough) without getting anywhere, we just give up 113 * probably enough) without getting anywhere, we just give up
146 */ 114 */
147 noprogress= 0;
148 while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) { 115 while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) {
149 struct dentry *pd = target_dir; 116 struct dentry *pd = find_disconnected_root(target_dir);
150
151 dget(pd);
152 spin_lock(&pd->d_lock);
153 while (!IS_ROOT(pd) &&
154 (pd->d_parent->d_flags&DCACHE_DISCONNECTED)) {
155 struct dentry *parent = pd->d_parent;
156
157 dget(parent);
158 spin_unlock(&pd->d_lock);
159 dput(pd);
160 pd = parent;
161 spin_lock(&pd->d_lock);
162 }
163 spin_unlock(&pd->d_lock);
164 117
165 if (!IS_ROOT(pd)) { 118 if (!IS_ROOT(pd)) {
166 /* must have found a connected parent - great */ 119 /* must have found a connected parent - great */
@@ -175,29 +128,40 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
175 spin_unlock(&pd->d_lock); 128 spin_unlock(&pd->d_lock);
176 noprogress = 0; 129 noprogress = 0;
177 } else { 130 } else {
178 /* we have hit the top of a disconnected path. Try 131 /*
179 * to find parent and connect 132 * We have hit the top of a disconnected path, try to
180 * note: racing with some other process renaming a 133 * find parent and connect.
181 * directory isn't much of a problem here. If someone 134 *
182 * renames the directory, it will end up properly 135 * Racing with some other process renaming a directory
183 * connected, which is what we want 136 * isn't much of a problem here. If someone renames
137 * the directory, it will end up properly connected,
138 * which is what we want
139 *
140 * Getting the parent can't be supported generically,
141 * the locking is too icky.
142 *
143 * Instead we just return EACCES. If server reboots
144 * or inodes get flushed, you lose
184 */ 145 */
185 struct dentry *ppd; 146 struct dentry *ppd = ERR_PTR(-EACCES);
186 struct dentry *npd; 147 struct dentry *npd;
187 148
188 mutex_lock(&pd->d_inode->i_mutex); 149 mutex_lock(&pd->d_inode->i_mutex);
189 ppd = CALL(nops,get_parent)(pd); 150 if (sb->s_export_op->get_parent)
151 ppd = sb->s_export_op->get_parent(pd);
190 mutex_unlock(&pd->d_inode->i_mutex); 152 mutex_unlock(&pd->d_inode->i_mutex);
191 153
192 if (IS_ERR(ppd)) { 154 if (IS_ERR(ppd)) {
193 err = PTR_ERR(ppd); 155 err = PTR_ERR(ppd);
194 dprintk("find_exported_dentry: get_parent of %ld failed, err %d\n", 156 dprintk("%s: get_parent of %ld failed, err %d\n",
195 pd->d_inode->i_ino, err); 157 __FUNCTION__, pd->d_inode->i_ino, err);
196 dput(pd); 158 dput(pd);
197 break; 159 break;
198 } 160 }
199 dprintk("find_exported_dentry: find name of %lu in %lu\n", pd->d_inode->i_ino, ppd->d_inode->i_ino); 161
200 err = CALL(nops,get_name)(ppd, nbuf, pd); 162 dprintk("%s: find name of %lu in %lu\n", __FUNCTION__,
163 pd->d_inode->i_ino, ppd->d_inode->i_ino);
164 err = exportfs_get_name(ppd, nbuf, pd);
201 if (err) { 165 if (err) {
202 dput(ppd); 166 dput(ppd);
203 dput(pd); 167 dput(pd);
@@ -208,13 +172,14 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
208 continue; 172 continue;
209 break; 173 break;
210 } 174 }
211 dprintk("find_exported_dentry: found name: %s\n", nbuf); 175 dprintk("%s: found name: %s\n", __FUNCTION__, nbuf);
212 mutex_lock(&ppd->d_inode->i_mutex); 176 mutex_lock(&ppd->d_inode->i_mutex);
213 npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); 177 npd = lookup_one_len(nbuf, ppd, strlen(nbuf));
214 mutex_unlock(&ppd->d_inode->i_mutex); 178 mutex_unlock(&ppd->d_inode->i_mutex);
215 if (IS_ERR(npd)) { 179 if (IS_ERR(npd)) {
216 err = PTR_ERR(npd); 180 err = PTR_ERR(npd);
217 dprintk("find_exported_dentry: lookup failed: %d\n", err); 181 dprintk("%s: lookup failed: %d\n",
182 __FUNCTION__, err);
218 dput(ppd); 183 dput(ppd);
219 dput(pd); 184 dput(pd);
220 break; 185 break;
@@ -227,7 +192,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
227 if (npd == pd) 192 if (npd == pd)
228 noprogress = 0; 193 noprogress = 0;
229 else 194 else
230 printk("find_exported_dentry: npd != pd\n"); 195 printk("%s: npd != pd\n", __FUNCTION__);
231 dput(npd); 196 dput(npd);
232 dput(ppd); 197 dput(ppd);
233 if (IS_ROOT(pd)) { 198 if (IS_ROOT(pd)) {
@@ -243,15 +208,101 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
243 /* something went wrong - oh-well */ 208 /* something went wrong - oh-well */
244 if (!err) 209 if (!err)
245 err = -ESTALE; 210 err = -ESTALE;
246 goto err_target; 211 return err;
247 } 212 }
248 /* if we weren't after a directory, have one more step to go */ 213
249 if (result != target_dir) { 214 return 0;
250 struct dentry *nresult; 215}
251 err = CALL(nops,get_name)(target_dir, nbuf, result); 216
217/**
218 * find_exported_dentry - helper routine to implement export_operations->decode_fh
219 * @sb: The &super_block identifying the filesystem
220 * @obj: An opaque identifier of the object to be found - passed to
221 * get_inode
222 * @parent: An optional opqaue identifier of the parent of the object.
223 * @acceptable: A function used to test possible &dentries to see if they are
224 * acceptable
225 * @context: A parameter to @acceptable so that it knows on what basis to
226 * judge.
227 *
228 * find_exported_dentry is the central helper routine to enable file systems
229 * to provide the decode_fh() export_operation. It's main task is to take
230 * an &inode, find or create an appropriate &dentry structure, and possibly
231 * splice this into the dcache in the correct place.
232 *
233 * The decode_fh() operation provided by the filesystem should call
234 * find_exported_dentry() with the same parameters that it received except
235 * that instead of the file handle fragment, pointers to opaque identifiers
236 * for the object and optionally its parent are passed. The default decode_fh
237 * routine passes one pointer to the start of the filehandle fragment, and
238 * one 8 bytes into the fragment. It is expected that most filesystems will
239 * take this approach, though the offset to the parent identifier may well be
240 * different.
241 *
242 * find_exported_dentry() will call get_dentry to get an dentry pointer from
243 * the file system. If any &dentry in the d_alias list is acceptable, it will
244 * be returned. Otherwise find_exported_dentry() will attempt to splice a new
245 * &dentry into the dcache using get_name() and get_parent() to find the
246 * appropriate place.
247 */
248
249struct dentry *
250find_exported_dentry(struct super_block *sb, void *obj, void *parent,
251 int (*acceptable)(void *context, struct dentry *de),
252 void *context)
253{
254 struct dentry *result, *alias;
255 int err = -ESTALE;
256
257 /*
258 * Attempt to find the inode.
259 */
260 result = exportfs_get_dentry(sb, obj);
261 if (IS_ERR(result))
262 return result;
263
264 if (S_ISDIR(result->d_inode->i_mode)) {
265 if (!(result->d_flags & DCACHE_DISCONNECTED)) {
266 if (acceptable(context, result))
267 return result;
268 err = -EACCES;
269 goto err_result;
270 }
271
272 err = reconnect_path(sb, result);
273 if (err)
274 goto err_result;
275 } else {
276 struct dentry *target_dir, *nresult;
277 char nbuf[NAME_MAX+1];
278
279 alias = find_acceptable_alias(result, acceptable, context);
280 if (alias)
281 return alias;
282
283 if (parent == NULL)
284 goto err_result;
285
286 target_dir = exportfs_get_dentry(sb,parent);
287 if (IS_ERR(target_dir)) {
288 err = PTR_ERR(target_dir);
289 goto err_result;
290 }
291
292 err = reconnect_path(sb, target_dir);
293 if (err) {
294 dput(target_dir);
295 goto err_result;
296 }
297
298 /*
299 * As we weren't after a directory, have one more step to go.
300 */
301 err = exportfs_get_name(target_dir, nbuf, result);
252 if (!err) { 302 if (!err) {
253 mutex_lock(&target_dir->d_inode->i_mutex); 303 mutex_lock(&target_dir->d_inode->i_mutex);
254 nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); 304 nresult = lookup_one_len(nbuf, target_dir,
305 strlen(nbuf));
255 mutex_unlock(&target_dir->d_inode->i_mutex); 306 mutex_unlock(&target_dir->d_inode->i_mutex);
256 if (!IS_ERR(nresult)) { 307 if (!IS_ERR(nresult)) {
257 if (nresult->d_inode) { 308 if (nresult->d_inode) {
@@ -261,11 +312,8 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
261 dput(nresult); 312 dput(nresult);
262 } 313 }
263 } 314 }
315 dput(target_dir);
264 } 316 }
265 dput(target_dir);
266 /* now result is properly connected, it is our best bet */
267 if (acceptable(context, result))
268 return result;
269 317
270 alias = find_acceptable_alias(result, acceptable, context); 318 alias = find_acceptable_alias(result, acceptable, context);
271 if (alias) 319 if (alias)
@@ -275,32 +323,16 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
275 dput(result); 323 dput(result);
276 /* It might be justifiable to return ESTALE here, 324 /* It might be justifiable to return ESTALE here,
277 * but the filehandle at-least looks reasonable good 325 * but the filehandle at-least looks reasonable good
278 * and it just be a permission problem, so returning 326 * and it may just be a permission problem, so returning
279 * -EACCESS is safer 327 * -EACCESS is safer
280 */ 328 */
281 return ERR_PTR(-EACCES); 329 return ERR_PTR(-EACCES);
282 330
283 err_target:
284 dput(target_dir);
285 err_result: 331 err_result:
286 dput(result); 332 dput(result);
287 err_out:
288 return ERR_PTR(err); 333 return ERR_PTR(err);
289} 334}
290 335
291
292
293static struct dentry *get_parent(struct dentry *child)
294{
295 /* get_parent cannot be supported generically, the locking
296 * is too icky.
297 * instead, we just return EACCES. If server reboots or inodes
298 * get flushed, you lose
299 */
300 return ERR_PTR(-EACCES);
301}
302
303
304struct getdents_callback { 336struct getdents_callback {
305 char *name; /* name that was found. It already points to a 337 char *name; /* name that was found. It already points to a
306 buffer NAME_MAX+1 is size */ 338 buffer NAME_MAX+1 is size */
@@ -390,61 +422,6 @@ out:
390 return error; 422 return error;
391} 423}
392 424
393
394static struct dentry *export_iget(struct super_block *sb, unsigned long ino, __u32 generation)
395{
396
397 /* iget isn't really right if the inode is currently unallocated!!
398 * This should really all be done inside each filesystem
399 *
400 * ext2fs' read_inode has been strengthed to return a bad_inode if
401 * the inode had been deleted.
402 *
403 * Currently we don't know the generation for parent directory, so
404 * a generation of 0 means "accept any"
405 */
406 struct inode *inode;
407 struct dentry *result;
408 if (ino == 0)
409 return ERR_PTR(-ESTALE);
410 inode = iget(sb, ino);
411 if (inode == NULL)
412 return ERR_PTR(-ENOMEM);
413 if (is_bad_inode(inode)
414 || (generation && inode->i_generation != generation)
415 ) {
416 /* we didn't find the right inode.. */
417 dprintk("fh_verify: Inode %lu, Bad count: %d %d or version %u %u\n",
418 inode->i_ino,
419 inode->i_nlink, atomic_read(&inode->i_count),
420 inode->i_generation,
421 generation);
422
423 iput(inode);
424 return ERR_PTR(-ESTALE);
425 }
426 /* now to find a dentry.
427 * If possible, get a well-connected one
428 */
429 result = d_alloc_anon(inode);
430 if (!result) {
431 iput(inode);
432 return ERR_PTR(-ENOMEM);
433 }
434 return result;
435}
436
437
438static struct dentry *get_object(struct super_block *sb, void *vobjp)
439{
440 __u32 *objp = vobjp;
441 unsigned long ino = objp[0];
442 __u32 generation = objp[1];
443
444 return export_iget(sb, ino, generation);
445}
446
447
448/** 425/**
449 * export_encode_fh - default export_operations->encode_fh function 426 * export_encode_fh - default export_operations->encode_fh function
450 * @dentry: the dentry to encode 427 * @dentry: the dentry to encode
@@ -517,16 +494,40 @@ static struct dentry *export_decode_fh(struct super_block *sb, __u32 *fh, int fh
517 acceptable, context); 494 acceptable, context);
518} 495}
519 496
520struct export_operations export_op_default = { 497int exportfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len,
521 .decode_fh = export_decode_fh, 498 int connectable)
522 .encode_fh = export_encode_fh, 499{
500 struct export_operations *nop = dentry->d_sb->s_export_op;
501 int error;
502
503 if (nop->encode_fh)
504 error = nop->encode_fh(dentry, fh, max_len, connectable);
505 else
506 error = export_encode_fh(dentry, fh, max_len, connectable);
523 507
524 .get_name = get_name, 508 return error;
525 .get_parent = get_parent, 509}
526 .get_dentry = get_object, 510EXPORT_SYMBOL_GPL(exportfs_encode_fh);
527}; 511
512struct dentry *exportfs_decode_fh(struct vfsmount *mnt, __u32 *fh, int fh_len,
513 int fileid_type, int (*acceptable)(void *, struct dentry *),
514 void *context)
515{
516 struct export_operations *nop = mnt->mnt_sb->s_export_op;
517 struct dentry *result;
518
519 if (nop->decode_fh) {
520 result = nop->decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type,
521 acceptable, context);
522 } else {
523 result = export_decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type,
524 acceptable, context);
525 }
526
527 return result;
528}
529EXPORT_SYMBOL_GPL(exportfs_decode_fh);
528 530
529EXPORT_SYMBOL(export_op_default);
530EXPORT_SYMBOL(find_exported_dentry); 531EXPORT_SYMBOL(find_exported_dentry);
531 532
532MODULE_LICENSE("GPL"); 533MODULE_LICENSE("GPL");
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 7c420b800c..e58669e1b8 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -464,7 +464,7 @@ ext2_xattr_set_acl(struct inode *inode, int type, const void *value,
464 464
465 if (!test_opt(inode->i_sb, POSIX_ACL)) 465 if (!test_opt(inode->i_sb, POSIX_ACL))
466 return -EOPNOTSUPP; 466 return -EOPNOTSUPP;
467 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 467 if (!is_owner_or_cap(inode))
468 return -EPERM; 468 return -EPERM;
469 469
470 if (value) { 470 if (value) {
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 566d4e2d38..ab7961260c 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -24,9 +24,9 @@
24#include "acl.h" 24#include "acl.h"
25 25
26/* 26/*
27 * Called when an inode is released. Note that this is different 27 * Called when filp is released. This happens when all file descriptors
28 * from ext2_open_file: open gets called at every open, but release 28 * for a single struct file are closed. Note that different open() calls
29 * gets called only when /all/ the files are closed. 29 * for the same file yield different struct file structures.
30 */ 30 */
31static int ext2_release_file (struct inode * inode, struct file * filp) 31static int ext2_release_file (struct inode * inode, struct file * filp)
32{ 32{
@@ -53,7 +53,6 @@ const struct file_operations ext2_file_operations = {
53 .open = generic_file_open, 53 .open = generic_file_open,
54 .release = ext2_release_file, 54 .release = ext2_release_file,
55 .fsync = ext2_sync_file, 55 .fsync = ext2_sync_file,
56 .sendfile = generic_file_sendfile,
57 .splice_read = generic_file_splice_read, 56 .splice_read = generic_file_splice_read,
58 .splice_write = generic_file_splice_write, 57 .splice_write = generic_file_splice_write,
59}; 58};
@@ -71,7 +70,6 @@ const struct file_operations ext2_xip_file_operations = {
71 .open = generic_file_open, 70 .open = generic_file_open,
72 .release = ext2_release_file, 71 .release = ext2_release_file,
73 .fsync = ext2_sync_file, 72 .fsync = ext2_sync_file,
74 .sendfile = xip_file_sendfile,
75}; 73};
76#endif 74#endif
77 75
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index e85c482182..3bcd25422e 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -36,7 +36,7 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
36 if (IS_RDONLY(inode)) 36 if (IS_RDONLY(inode))
37 return -EROFS; 37 return -EROFS;
38 38
39 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 39 if (!is_owner_or_cap(inode))
40 return -EACCES; 40 return -EACCES;
41 41
42 if (get_user(flags, (int __user *) arg)) 42 if (get_user(flags, (int __user *) arg))
@@ -74,7 +74,7 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
74 case EXT2_IOC_GETVERSION: 74 case EXT2_IOC_GETVERSION:
75 return put_user(inode->i_generation, (int __user *) arg); 75 return put_user(inode->i_generation, (int __user *) arg);
76 case EXT2_IOC_SETVERSION: 76 case EXT2_IOC_SETVERSION:
77 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 77 if (!is_owner_or_cap(inode))
78 return -EPERM; 78 return -EPERM;
79 if (IS_RDONLY(inode)) 79 if (IS_RDONLY(inode))
80 return -EROFS; 80 return -EROFS;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 16337bff02..3eefa97fe2 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -25,6 +25,7 @@
25#include <linux/parser.h> 25#include <linux/parser.h>
26#include <linux/random.h> 26#include <linux/random.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/exportfs.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/vfs.h> 30#include <linux/vfs.h>
30#include <linux/seq_file.h> 31#include <linux/seq_file.h>
@@ -1038,6 +1039,15 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1038 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1039 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1039 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1040 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1040 1041
1042 ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
1043 EXT2_MOUNT_XIP if not */
1044
1045 if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
1046 printk("XIP: Unsupported blocksize\n");
1047 err = -EINVAL;
1048 goto restore_opts;
1049 }
1050
1041 es = sbi->s_es; 1051 es = sbi->s_es;
1042 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1052 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
1043 (old_mount_opt & EXT2_MOUNT_XIP)) && 1053 (old_mount_opt & EXT2_MOUNT_XIP)) &&
@@ -1090,15 +1100,18 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1090 struct super_block *sb = dentry->d_sb; 1100 struct super_block *sb = dentry->d_sb;
1091 struct ext2_sb_info *sbi = EXT2_SB(sb); 1101 struct ext2_sb_info *sbi = EXT2_SB(sb);
1092 struct ext2_super_block *es = sbi->s_es; 1102 struct ext2_super_block *es = sbi->s_es;
1093 unsigned long overhead;
1094 int i;
1095 u64 fsid; 1103 u64 fsid;
1096 1104
1097 if (test_opt (sb, MINIX_DF)) 1105 if (test_opt (sb, MINIX_DF))
1098 overhead = 0; 1106 sbi->s_overhead_last = 0;
1099 else { 1107 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
1108 unsigned long i, overhead = 0;
1109 smp_rmb();
1110
1100 /* 1111 /*
1101 * Compute the overhead (FS structures) 1112 * Compute the overhead (FS structures). This is constant
1113 * for a given filesystem unless the number of block groups
1114 * changes so we cache the previous value until it does.
1102 */ 1115 */
1103 1116
1104 /* 1117 /*
@@ -1122,17 +1135,22 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1122 */ 1135 */
1123 overhead += (sbi->s_groups_count * 1136 overhead += (sbi->s_groups_count *
1124 (2 + sbi->s_itb_per_group)); 1137 (2 + sbi->s_itb_per_group));
1138 sbi->s_overhead_last = overhead;
1139 smp_wmb();
1140 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
1125 } 1141 }
1126 1142
1127 buf->f_type = EXT2_SUPER_MAGIC; 1143 buf->f_type = EXT2_SUPER_MAGIC;
1128 buf->f_bsize = sb->s_blocksize; 1144 buf->f_bsize = sb->s_blocksize;
1129 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; 1145 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
1130 buf->f_bfree = ext2_count_free_blocks(sb); 1146 buf->f_bfree = ext2_count_free_blocks(sb);
1147 es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
1131 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); 1148 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
1132 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) 1149 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
1133 buf->f_bavail = 0; 1150 buf->f_bavail = 0;
1134 buf->f_files = le32_to_cpu(es->s_inodes_count); 1151 buf->f_files = le32_to_cpu(es->s_inodes_count);
1135 buf->f_ffree = ext2_count_free_inodes(sb); 1152 buf->f_ffree = ext2_count_free_inodes(sb);
1153 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
1136 buf->f_namelen = EXT2_NAME_LEN; 1154 buf->f_namelen = EXT2_NAME_LEN;
1137 fsid = le64_to_cpup((void *)es->s_uuid) ^ 1155 fsid = le64_to_cpup((void *)es->s_uuid) ^
1138 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 1156 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 1e5038d9a0..d34e996743 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -489,7 +489,7 @@ ext3_xattr_set_acl(struct inode *inode, int type, const void *value,
489 489
490 if (!test_opt(inode->i_sb, POSIX_ACL)) 490 if (!test_opt(inode->i_sb, POSIX_ACL))
491 return -EOPNOTSUPP; 491 return -EOPNOTSUPP;
492 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 492 if (!is_owner_or_cap(inode))
493 return -EPERM; 493 return -EPERM;
494 494
495 if (value) { 495 if (value) {
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 1e6f138645..acc4913d30 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -120,7 +120,6 @@ const struct file_operations ext3_file_operations = {
120 .open = generic_file_open, 120 .open = generic_file_open,
121 .release = ext3_release_file, 121 .release = ext3_release_file,
122 .fsync = ext3_sync_file, 122 .fsync = ext3_sync_file,
123 .sendfile = generic_file_sendfile,
124 .splice_read = generic_file_splice_read, 123 .splice_read = generic_file_splice_read,
125 .splice_write = generic_file_splice_write, 124 .splice_write = generic_file_splice_write,
126}; 125};
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a6cb6171c3..de4e3161e4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2677,8 +2677,10 @@ void ext3_read_inode(struct inode * inode)
2677 */ 2677 */
2678 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 2678 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2679 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 2679 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2680 EXT3_INODE_SIZE(inode->i_sb)) 2680 EXT3_INODE_SIZE(inode->i_sb)) {
2681 brelse (bh);
2681 goto bad_inode; 2682 goto bad_inode;
2683 }
2682 if (ei->i_extra_isize == 0) { 2684 if (ei->i_extra_isize == 0) {
2683 /* The extra space is currently unused. Use it. */ 2685 /* The extra space is currently unused. Use it. */
2684 ei->i_extra_isize = sizeof(struct ext3_inode) - 2686 ei->i_extra_isize = sizeof(struct ext3_inode) -
@@ -3193,7 +3195,7 @@ int ext3_change_inode_journal_flag(struct inode *inode, int val)
3193 */ 3195 */
3194 3196
3195 journal = EXT3_JOURNAL(inode); 3197 journal = EXT3_JOURNAL(inode);
3196 if (is_journal_aborted(journal) || IS_RDONLY(inode)) 3198 if (is_journal_aborted(journal))
3197 return -EROFS; 3199 return -EROFS;
3198 3200
3199 journal_lock_updates(journal); 3201 journal_lock_updates(journal);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 965006dba6..4a2a02c95b 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -41,7 +41,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
41 if (IS_RDONLY(inode)) 41 if (IS_RDONLY(inode))
42 return -EROFS; 42 return -EROFS;
43 43
44 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 44 if (!is_owner_or_cap(inode))
45 return -EACCES; 45 return -EACCES;
46 46
47 if (get_user(flags, (int __user *) arg)) 47 if (get_user(flags, (int __user *) arg))
@@ -122,7 +122,7 @@ flags_err:
122 __u32 generation; 122 __u32 generation;
123 int err; 123 int err;
124 124
125 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 125 if (!is_owner_or_cap(inode))
126 return -EPERM; 126 return -EPERM;
127 if (IS_RDONLY(inode)) 127 if (IS_RDONLY(inode))
128 return -EROFS; 128 return -EROFS;
@@ -181,7 +181,7 @@ flags_err:
181 if (IS_RDONLY(inode)) 181 if (IS_RDONLY(inode))
182 return -EROFS; 182 return -EROFS;
183 183
184 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 184 if (!is_owner_or_cap(inode))
185 return -EACCES; 185 return -EACCES;
186 186
187 if (get_user(rsv_window_size, (int __user *)arg)) 187 if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 9bb046df82..1586807b81 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1019,6 +1019,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1019 1019
1020 if (!inode) 1020 if (!inode)
1021 return ERR_PTR(-EACCES); 1021 return ERR_PTR(-EACCES);
1022
1023 if (is_bad_inode(inode)) {
1024 iput(inode);
1025 return ERR_PTR(-ENOENT);
1026 }
1022 } 1027 }
1023 return d_splice_alias(inode, dentry); 1028 return d_splice_alias(inode, dentry);
1024} 1029}
@@ -1054,6 +1059,11 @@ struct dentry *ext3_get_parent(struct dentry *child)
1054 if (!inode) 1059 if (!inode)
1055 return ERR_PTR(-EACCES); 1060 return ERR_PTR(-EACCES);
1056 1061
1062 if (is_bad_inode(inode)) {
1063 iput(inode);
1064 return ERR_PTR(-ENOENT);
1065 }
1066
1057 parent = d_alloc_anon(inode); 1067 parent = d_alloc_anon(inode);
1058 if (!parent) { 1068 if (!parent) {
1059 iput(inode); 1069 iput(inode);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6e3062913a..4f84dc8662 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -29,12 +29,14 @@
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
32#include <linux/exportfs.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
33#include <linux/random.h> 34#include <linux/random.h>
34#include <linux/mount.h> 35#include <linux/mount.h>
35#include <linux/namei.h> 36#include <linux/namei.h>
36#include <linux/quotaops.h> 37#include <linux/quotaops.h>
37#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/log2.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40 42
@@ -459,6 +461,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
459 461
460static void ext3_destroy_inode(struct inode *inode) 462static void ext3_destroy_inode(struct inode *inode)
461{ 463{
464 if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
465 printk("EXT3 Inode %p: orphan list check failed!\n",
466 EXT3_I(inode));
467 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
468 EXT3_I(inode), sizeof(struct ext3_inode_info),
469 false);
470 dump_stack();
471 }
462 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); 472 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
463} 473}
464 474
@@ -1566,7 +1576,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1566 sbi->s_inode_size = le16_to_cpu(es->s_inode_size); 1576 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1567 sbi->s_first_ino = le32_to_cpu(es->s_first_ino); 1577 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1568 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || 1578 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
1569 (sbi->s_inode_size & (sbi->s_inode_size - 1)) || 1579 (!is_power_of_2(sbi->s_inode_size)) ||
1570 (sbi->s_inode_size > blocksize)) { 1580 (sbi->s_inode_size > blocksize)) {
1571 printk (KERN_ERR 1581 printk (KERN_ERR
1572 "EXT3-fs: unsupported inode size: %d\n", 1582 "EXT3-fs: unsupported inode size: %d\n",
@@ -2075,6 +2085,7 @@ static int ext3_create_journal(struct super_block * sb,
2075 unsigned int journal_inum) 2085 unsigned int journal_inum)
2076{ 2086{
2077 journal_t *journal; 2087 journal_t *journal;
2088 int err;
2078 2089
2079 if (sb->s_flags & MS_RDONLY) { 2090 if (sb->s_flags & MS_RDONLY) {
2080 printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " 2091 printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
@@ -2082,13 +2093,15 @@ static int ext3_create_journal(struct super_block * sb,
2082 return -EROFS; 2093 return -EROFS;
2083 } 2094 }
2084 2095
2085 if (!(journal = ext3_get_journal(sb, journal_inum))) 2096 journal = ext3_get_journal(sb, journal_inum);
2097 if (!journal)
2086 return -EINVAL; 2098 return -EINVAL;
2087 2099
2088 printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n", 2100 printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
2089 journal_inum); 2101 journal_inum);
2090 2102
2091 if (journal_create(journal)) { 2103 err = journal_create(journal);
2104 if (err) {
2092 printk(KERN_ERR "EXT3-fs: error creating journal.\n"); 2105 printk(KERN_ERR "EXT3-fs: error creating journal.\n");
2093 journal_destroy(journal); 2106 journal_destroy(journal);
2094 return -EIO; 2107 return -EIO;
@@ -2139,12 +2152,14 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
2139 2152
2140 journal_lock_updates(journal); 2153 journal_lock_updates(journal);
2141 journal_flush(journal); 2154 journal_flush(journal);
2155 lock_super(sb);
2142 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && 2156 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
2143 sb->s_flags & MS_RDONLY) { 2157 sb->s_flags & MS_RDONLY) {
2144 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2158 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2145 sb->s_dirt = 0; 2159 sb->s_dirt = 0;
2146 ext3_commit_super(sb, es, 1); 2160 ext3_commit_super(sb, es, 1);
2147 } 2161 }
2162 unlock_super(sb);
2148 journal_unlock_updates(journal); 2163 journal_unlock_updates(journal);
2149} 2164}
2150 2165
@@ -2333,7 +2348,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2333 (sbi->s_mount_state & EXT3_VALID_FS)) 2348 (sbi->s_mount_state & EXT3_VALID_FS))
2334 es->s_state = cpu_to_le16(sbi->s_mount_state); 2349 es->s_state = cpu_to_le16(sbi->s_mount_state);
2335 2350
2351 /*
2352 * We have to unlock super so that we can wait for
2353 * transactions.
2354 */
2355 unlock_super(sb);
2336 ext3_mark_recovery_complete(sb, es); 2356 ext3_mark_recovery_complete(sb, es);
2357 lock_super(sb);
2337 } else { 2358 } else {
2338 __le32 ret; 2359 __le32 ret;
2339 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, 2360 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -2406,19 +2427,19 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2406 struct super_block *sb = dentry->d_sb; 2427 struct super_block *sb = dentry->d_sb;
2407 struct ext3_sb_info *sbi = EXT3_SB(sb); 2428 struct ext3_sb_info *sbi = EXT3_SB(sb);
2408 struct ext3_super_block *es = sbi->s_es; 2429 struct ext3_super_block *es = sbi->s_es;
2409 ext3_fsblk_t overhead;
2410 int i;
2411 u64 fsid; 2430 u64 fsid;
2412 2431
2413 if (test_opt (sb, MINIX_DF)) 2432 if (test_opt(sb, MINIX_DF)) {
2414 overhead = 0; 2433 sbi->s_overhead_last = 0;
2415 else { 2434 } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
2416 unsigned long ngroups; 2435 unsigned long ngroups = sbi->s_groups_count, i;
2417 ngroups = EXT3_SB(sb)->s_groups_count; 2436 ext3_fsblk_t overhead = 0;
2418 smp_rmb(); 2437 smp_rmb();
2419 2438
2420 /* 2439 /*
2421 * Compute the overhead (FS structures) 2440 * Compute the overhead (FS structures). This is constant
2441 * for a given filesystem unless the number of block groups
2442 * changes so we cache the previous value until it does.
2422 */ 2443 */
2423 2444
2424 /* 2445 /*
@@ -2442,18 +2463,23 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2442 * Every block group has an inode bitmap, a block 2463 * Every block group has an inode bitmap, a block
2443 * bitmap, and an inode table. 2464 * bitmap, and an inode table.
2444 */ 2465 */
2445 overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group)); 2466 overhead += ngroups * (2 + sbi->s_itb_per_group);
2467 sbi->s_overhead_last = overhead;
2468 smp_wmb();
2469 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
2446 } 2470 }
2447 2471
2448 buf->f_type = EXT3_SUPER_MAGIC; 2472 buf->f_type = EXT3_SUPER_MAGIC;
2449 buf->f_bsize = sb->s_blocksize; 2473 buf->f_bsize = sb->s_blocksize;
2450 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; 2474 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
2451 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); 2475 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2476 es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
2452 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); 2477 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
2453 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) 2478 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
2454 buf->f_bavail = 0; 2479 buf->f_bavail = 0;
2455 buf->f_files = le32_to_cpu(es->s_inodes_count); 2480 buf->f_files = le32_to_cpu(es->s_inodes_count);
2456 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); 2481 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2482 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
2457 buf->f_namelen = EXT3_NAME_LEN; 2483 buf->f_namelen = EXT3_NAME_LEN;
2458 fsid = le64_to_cpup((void *)es->s_uuid) ^ 2484 fsid = le64_to_cpup((void *)es->s_uuid) ^
2459 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 2485 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 9e882546d9..a8bae8cd1d 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -489,7 +489,7 @@ ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
489 489
490 if (!test_opt(inode->i_sb, POSIX_ACL)) 490 if (!test_opt(inode->i_sb, POSIX_ACL))
491 return -EOPNOTSUPP; 491 return -EOPNOTSUPP;
492 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 492 if (!is_owner_or_cap(inode))
493 return -EPERM; 493 return -EPERM;
494 494
495 if (value) { 495 if (value) {
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 3b64bb16c7..e53b4af52f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -517,7 +517,7 @@ do_more:
517 /* 517 /*
518 * An HJ special. This is expensive... 518 * An HJ special. This is expensive...
519 */ 519 */
520#ifdef CONFIG_JBD_DEBUG 520#ifdef CONFIG_JBD2_DEBUG
521 jbd_unlock_bh_state(bitmap_bh); 521 jbd_unlock_bh_state(bitmap_bh);
522 { 522 {
523 struct buffer_head *debug_bh; 523 struct buffer_head *debug_bh;
@@ -1585,7 +1585,7 @@ allocated:
1585 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no); 1585 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1586 1586
1587 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || 1587 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1588 in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || 1588 in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1589 in_range(ret_block, ext4_inode_table(sb, gdp), 1589 in_range(ret_block, ext4_inode_table(sb, gdp),
1590 EXT4_SB(sb)->s_itb_per_group) || 1590 EXT4_SB(sb)->s_itb_per_group) ||
1591 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), 1591 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
@@ -1597,7 +1597,7 @@ allocated:
1597 1597
1598 performed_allocation = 1; 1598 performed_allocation = 1;
1599 1599
1600#ifdef CONFIG_JBD_DEBUG 1600#ifdef CONFIG_JBD2_DEBUG
1601 { 1601 {
1602 struct buffer_head *debug_bh; 1602 struct buffer_head *debug_bh;
1603 1603
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b9ce241290..750c46f7d8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
39#include <linux/quotaops.h> 39#include <linux/quotaops.h>
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/falloc.h>
42#include <linux/ext4_fs_extents.h> 43#include <linux/ext4_fs_extents.h>
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
@@ -91,36 +92,6 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
92} 93}
93 94
94static int ext4_ext_check_header(const char *function, struct inode *inode,
95 struct ext4_extent_header *eh)
96{
97 const char *error_msg = NULL;
98
99 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
100 error_msg = "invalid magic";
101 goto corrupted;
102 }
103 if (unlikely(eh->eh_max == 0)) {
104 error_msg = "invalid eh_max";
105 goto corrupted;
106 }
107 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
108 error_msg = "invalid eh_entries";
109 goto corrupted;
110 }
111 return 0;
112
113corrupted:
114 ext4_error(inode->i_sb, function,
115 "bad header in inode #%lu: %s - magic %x, "
116 "entries %u, max %u, depth %u",
117 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
118 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
119 le16_to_cpu(eh->eh_depth));
120
121 return -EIO;
122}
123
124static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) 95static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
125{ 96{
126 int err; 97 int err;
@@ -269,6 +240,70 @@ static int ext4_ext_space_root_idx(struct inode *inode)
269 return size; 240 return size;
270} 241}
271 242
243static int
244ext4_ext_max_entries(struct inode *inode, int depth)
245{
246 int max;
247
248 if (depth == ext_depth(inode)) {
249 if (depth == 0)
250 max = ext4_ext_space_root(inode);
251 else
252 max = ext4_ext_space_root_idx(inode);
253 } else {
254 if (depth == 0)
255 max = ext4_ext_space_block(inode);
256 else
257 max = ext4_ext_space_block_idx(inode);
258 }
259
260 return max;
261}
262
263static int __ext4_ext_check_header(const char *function, struct inode *inode,
264 struct ext4_extent_header *eh,
265 int depth)
266{
267 const char *error_msg;
268 int max = 0;
269
270 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
271 error_msg = "invalid magic";
272 goto corrupted;
273 }
274 if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
275 error_msg = "unexpected eh_depth";
276 goto corrupted;
277 }
278 if (unlikely(eh->eh_max == 0)) {
279 error_msg = "invalid eh_max";
280 goto corrupted;
281 }
282 max = ext4_ext_max_entries(inode, depth);
283 if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
284 error_msg = "too large eh_max";
285 goto corrupted;
286 }
287 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
288 error_msg = "invalid eh_entries";
289 goto corrupted;
290 }
291 return 0;
292
293corrupted:
294 ext4_error(inode->i_sb, function,
295 "bad header in inode #%lu: %s - magic %x, "
296 "entries %u, max %u(%u), depth %u(%u)",
297 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
298 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
299 max, le16_to_cpu(eh->eh_depth), depth);
300
301 return -EIO;
302}
303
304#define ext4_ext_check_header(inode, eh, depth) \
305 __ext4_ext_check_header(__FUNCTION__, inode, eh, depth)
306
272#ifdef EXT_DEBUG 307#ifdef EXT_DEBUG
273static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 308static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
274{ 309{
@@ -282,7 +317,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
282 } else if (path->p_ext) { 317 } else if (path->p_ext) {
283 ext_debug(" %d:%d:%llu ", 318 ext_debug(" %d:%d:%llu ",
284 le32_to_cpu(path->p_ext->ee_block), 319 le32_to_cpu(path->p_ext->ee_block),
285 le16_to_cpu(path->p_ext->ee_len), 320 ext4_ext_get_actual_len(path->p_ext),
286 ext_pblock(path->p_ext)); 321 ext_pblock(path->p_ext));
287 } else 322 } else
288 ext_debug(" []"); 323 ext_debug(" []");
@@ -305,7 +340,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
305 340
306 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 341 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
307 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), 342 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
308 le16_to_cpu(ex->ee_len), ext_pblock(ex)); 343 ext4_ext_get_actual_len(ex), ext_pblock(ex));
309 } 344 }
310 ext_debug("\n"); 345 ext_debug("\n");
311} 346}
@@ -329,6 +364,7 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path)
329/* 364/*
330 * ext4_ext_binsearch_idx: 365 * ext4_ext_binsearch_idx:
331 * binary search for the closest index of the given block 366 * binary search for the closest index of the given block
367 * the header must be checked before calling this
332 */ 368 */
333static void 369static void
334ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block) 370ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
@@ -336,27 +372,25 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc
336 struct ext4_extent_header *eh = path->p_hdr; 372 struct ext4_extent_header *eh = path->p_hdr;
337 struct ext4_extent_idx *r, *l, *m; 373 struct ext4_extent_idx *r, *l, *m;
338 374
339 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
340 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
341 BUG_ON(le16_to_cpu(eh->eh_entries) <= 0);
342 375
343 ext_debug("binsearch for %d(idx): ", block); 376 ext_debug("binsearch for %d(idx): ", block);
344 377
345 l = EXT_FIRST_INDEX(eh) + 1; 378 l = EXT_FIRST_INDEX(eh) + 1;
346 r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1; 379 r = EXT_LAST_INDEX(eh);
347 while (l <= r) { 380 while (l <= r) {
348 m = l + (r - l) / 2; 381 m = l + (r - l) / 2;
349 if (block < le32_to_cpu(m->ei_block)) 382 if (block < le32_to_cpu(m->ei_block))
350 r = m - 1; 383 r = m - 1;
351 else 384 else
352 l = m + 1; 385 l = m + 1;
353 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block, 386 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
354 m, m->ei_block, r, r->ei_block); 387 m, le32_to_cpu(m->ei_block),
388 r, le32_to_cpu(r->ei_block));
355 } 389 }
356 390
357 path->p_idx = l - 1; 391 path->p_idx = l - 1;
358 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), 392 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
359 idx_block(path->p_idx)); 393 idx_pblock(path->p_idx));
360 394
361#ifdef CHECK_BINSEARCH 395#ifdef CHECK_BINSEARCH
362 { 396 {
@@ -388,6 +422,7 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc
388/* 422/*
389 * ext4_ext_binsearch: 423 * ext4_ext_binsearch:
390 * binary search for closest extent of the given block 424 * binary search for closest extent of the given block
425 * the header must be checked before calling this
391 */ 426 */
392static void 427static void
393ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) 428ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
@@ -395,9 +430,6 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
395 struct ext4_extent_header *eh = path->p_hdr; 430 struct ext4_extent_header *eh = path->p_hdr;
396 struct ext4_extent *r, *l, *m; 431 struct ext4_extent *r, *l, *m;
397 432
398 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
399 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
400
401 if (eh->eh_entries == 0) { 433 if (eh->eh_entries == 0) {
402 /* 434 /*
403 * this leaf is empty: 435 * this leaf is empty:
@@ -409,7 +441,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
409 ext_debug("binsearch for %d: ", block); 441 ext_debug("binsearch for %d: ", block);
410 442
411 l = EXT_FIRST_EXTENT(eh) + 1; 443 l = EXT_FIRST_EXTENT(eh) + 1;
412 r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1; 444 r = EXT_LAST_EXTENT(eh);
413 445
414 while (l <= r) { 446 while (l <= r) {
415 m = l + (r - l) / 2; 447 m = l + (r - l) / 2;
@@ -417,15 +449,16 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
417 r = m - 1; 449 r = m - 1;
418 else 450 else
419 l = m + 1; 451 l = m + 1;
420 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block, 452 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
421 m, m->ee_block, r, r->ee_block); 453 m, le32_to_cpu(m->ee_block),
454 r, le32_to_cpu(r->ee_block));
422 } 455 }
423 456
424 path->p_ext = l - 1; 457 path->p_ext = l - 1;
425 ext_debug(" -> %d:%llu:%d ", 458 ext_debug(" -> %d:%llu:%d ",
426 le32_to_cpu(path->p_ext->ee_block), 459 le32_to_cpu(path->p_ext->ee_block),
427 ext_pblock(path->p_ext), 460 ext_pblock(path->p_ext),
428 le16_to_cpu(path->p_ext->ee_len)); 461 ext4_ext_get_actual_len(path->p_ext));
429 462
430#ifdef CHECK_BINSEARCH 463#ifdef CHECK_BINSEARCH
431 { 464 {
@@ -468,11 +501,10 @@ ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
468 short int depth, i, ppos = 0, alloc = 0; 501 short int depth, i, ppos = 0, alloc = 0;
469 502
470 eh = ext_inode_hdr(inode); 503 eh = ext_inode_hdr(inode);
471 BUG_ON(eh == NULL); 504 depth = ext_depth(inode);
472 if (ext4_ext_check_header(__FUNCTION__, inode, eh)) 505 if (ext4_ext_check_header(inode, eh, depth))
473 return ERR_PTR(-EIO); 506 return ERR_PTR(-EIO);
474 507
475 i = depth = ext_depth(inode);
476 508
477 /* account possible depth increase */ 509 /* account possible depth increase */
478 if (!path) { 510 if (!path) {
@@ -484,10 +516,12 @@ ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
484 } 516 }
485 path[0].p_hdr = eh; 517 path[0].p_hdr = eh;
486 518
519 i = depth;
487 /* walk through the tree */ 520 /* walk through the tree */
488 while (i) { 521 while (i) {
489 ext_debug("depth %d: num %d, max %d\n", 522 ext_debug("depth %d: num %d, max %d\n",
490 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 523 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
524
491 ext4_ext_binsearch_idx(inode, path + ppos, block); 525 ext4_ext_binsearch_idx(inode, path + ppos, block);
492 path[ppos].p_block = idx_pblock(path[ppos].p_idx); 526 path[ppos].p_block = idx_pblock(path[ppos].p_idx);
493 path[ppos].p_depth = i; 527 path[ppos].p_depth = i;
@@ -504,7 +538,7 @@ ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
504 path[ppos].p_hdr = eh; 538 path[ppos].p_hdr = eh;
505 i--; 539 i--;
506 540
507 if (ext4_ext_check_header(__FUNCTION__, inode, eh)) 541 if (ext4_ext_check_header(inode, eh, i))
508 goto err; 542 goto err;
509 } 543 }
510 544
@@ -513,9 +547,6 @@ ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
513 path[ppos].p_ext = NULL; 547 path[ppos].p_ext = NULL;
514 path[ppos].p_idx = NULL; 548 path[ppos].p_idx = NULL;
515 549
516 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
517 goto err;
518
519 /* find extent */ 550 /* find extent */
520 ext4_ext_binsearch(inode, path + ppos, block); 551 ext4_ext_binsearch(inode, path + ppos, block);
521 552
@@ -553,7 +584,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
553 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { 584 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
554 len = (len - 1) * sizeof(struct ext4_extent_idx); 585 len = (len - 1) * sizeof(struct ext4_extent_idx);
555 len = len < 0 ? 0 : len; 586 len = len < 0 ? 0 : len;
556 ext_debug("insert new index %d after: %d. " 587 ext_debug("insert new index %d after: %llu. "
557 "move %d from 0x%p to 0x%p\n", 588 "move %d from 0x%p to 0x%p\n",
558 logical, ptr, len, 589 logical, ptr, len,
559 (curp->p_idx + 1), (curp->p_idx + 2)); 590 (curp->p_idx + 1), (curp->p_idx + 2));
@@ -564,7 +595,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
564 /* insert before */ 595 /* insert before */
565 len = len * sizeof(struct ext4_extent_idx); 596 len = len * sizeof(struct ext4_extent_idx);
566 len = len < 0 ? 0 : len; 597 len = len < 0 ? 0 : len;
567 ext_debug("insert new index %d before: %d. " 598 ext_debug("insert new index %d before: %llu. "
568 "move %d from 0x%p to 0x%p\n", 599 "move %d from 0x%p to 0x%p\n",
569 logical, ptr, len, 600 logical, ptr, len,
570 curp->p_idx, (curp->p_idx + 1)); 601 curp->p_idx, (curp->p_idx + 1));
@@ -686,7 +717,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
686 ext_debug("move %d:%llu:%d in new leaf %llu\n", 717 ext_debug("move %d:%llu:%d in new leaf %llu\n",
687 le32_to_cpu(path[depth].p_ext->ee_block), 718 le32_to_cpu(path[depth].p_ext->ee_block),
688 ext_pblock(path[depth].p_ext), 719 ext_pblock(path[depth].p_ext),
689 le16_to_cpu(path[depth].p_ext->ee_len), 720 ext4_ext_get_actual_len(path[depth].p_ext),
690 newblock); 721 newblock);
691 /*memmove(ex++, path[depth].p_ext++, 722 /*memmove(ex++, path[depth].p_ext++,
692 sizeof(struct ext4_extent)); 723 sizeof(struct ext4_extent));
@@ -764,7 +795,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
764 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != 795 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
765 EXT_LAST_INDEX(path[i].p_hdr)); 796 EXT_LAST_INDEX(path[i].p_hdr));
766 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 797 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
767 ext_debug("%d: move %d:%d in new index %llu\n", i, 798 ext_debug("%d: move %d:%llu in new index %llu\n", i,
768 le32_to_cpu(path[i].p_idx->ei_block), 799 le32_to_cpu(path[i].p_idx->ei_block),
769 idx_pblock(path[i].p_idx), 800 idx_pblock(path[i].p_idx),
770 newblock); 801 newblock);
@@ -893,8 +924,13 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
893 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); 924 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
894 curp->p_hdr->eh_entries = cpu_to_le16(1); 925 curp->p_hdr->eh_entries = cpu_to_le16(1);
895 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); 926 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
896 /* FIXME: it works, but actually path[0] can be index */ 927
897 curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; 928 if (path[0].p_hdr->eh_depth)
929 curp->p_idx->ei_block =
930 EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
931 else
932 curp->p_idx->ei_block =
933 EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
898 ext4_idx_store_pblock(curp->p_idx, newblock); 934 ext4_idx_store_pblock(curp->p_idx, newblock);
899 935
900 neh = ext_inode_hdr(inode); 936 neh = ext_inode_hdr(inode);
@@ -1106,7 +1142,24 @@ static int
1106ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, 1142ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1107 struct ext4_extent *ex2) 1143 struct ext4_extent *ex2)
1108{ 1144{
1109 if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) != 1145 unsigned short ext1_ee_len, ext2_ee_len, max_len;
1146
1147 /*
1148 * Make sure that either both extents are uninitialized, or
1149 * both are _not_.
1150 */
1151 if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1152 return 0;
1153
1154 if (ext4_ext_is_uninitialized(ex1))
1155 max_len = EXT_UNINIT_MAX_LEN;
1156 else
1157 max_len = EXT_INIT_MAX_LEN;
1158
1159 ext1_ee_len = ext4_ext_get_actual_len(ex1);
1160 ext2_ee_len = ext4_ext_get_actual_len(ex2);
1161
1162 if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1110 le32_to_cpu(ex2->ee_block)) 1163 le32_to_cpu(ex2->ee_block))
1111 return 0; 1164 return 0;
1112 1165
@@ -1115,19 +1168,66 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1115 * as an RO_COMPAT feature, refuse to merge to extents if 1168 * as an RO_COMPAT feature, refuse to merge to extents if
1116 * this can result in the top bit of ee_len being set. 1169 * this can result in the top bit of ee_len being set.
1117 */ 1170 */
1118 if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN) 1171 if (ext1_ee_len + ext2_ee_len > max_len)
1119 return 0; 1172 return 0;
1120#ifdef AGGRESSIVE_TEST 1173#ifdef AGGRESSIVE_TEST
1121 if (le16_to_cpu(ex1->ee_len) >= 4) 1174 if (le16_to_cpu(ex1->ee_len) >= 4)
1122 return 0; 1175 return 0;
1123#endif 1176#endif
1124 1177
1125 if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2)) 1178 if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1126 return 1; 1179 return 1;
1127 return 0; 1180 return 0;
1128} 1181}
1129 1182
1130/* 1183/*
1184 * This function tries to merge the "ex" extent to the next extent in the tree.
1185 * It always tries to merge towards right. If you want to merge towards
1186 * left, pass "ex - 1" as argument instead of "ex".
1187 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1188 * 1 if they got merged.
1189 */
1190int ext4_ext_try_to_merge(struct inode *inode,
1191 struct ext4_ext_path *path,
1192 struct ext4_extent *ex)
1193{
1194 struct ext4_extent_header *eh;
1195 unsigned int depth, len;
1196 int merge_done = 0;
1197 int uninitialized = 0;
1198
1199 depth = ext_depth(inode);
1200 BUG_ON(path[depth].p_hdr == NULL);
1201 eh = path[depth].p_hdr;
1202
1203 while (ex < EXT_LAST_EXTENT(eh)) {
1204 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1205 break;
1206 /* merge with next extent! */
1207 if (ext4_ext_is_uninitialized(ex))
1208 uninitialized = 1;
1209 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1210 + ext4_ext_get_actual_len(ex + 1));
1211 if (uninitialized)
1212 ext4_ext_mark_uninitialized(ex);
1213
1214 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1215 len = (EXT_LAST_EXTENT(eh) - ex - 1)
1216 * sizeof(struct ext4_extent);
1217 memmove(ex + 1, ex + 2, len);
1218 }
1219 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1);
1220 merge_done = 1;
1221 WARN_ON(eh->eh_entries == 0);
1222 if (!eh->eh_entries)
1223 ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
1224 "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
1225 }
1226
1227 return merge_done;
1228}
1229
1230/*
1131 * check if a portion of the "newext" extent overlaps with an 1231 * check if a portion of the "newext" extent overlaps with an
1132 * existing extent. 1232 * existing extent.
1133 * 1233 *
@@ -1144,7 +1244,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1144 unsigned int ret = 0; 1244 unsigned int ret = 0;
1145 1245
1146 b1 = le32_to_cpu(newext->ee_block); 1246 b1 = le32_to_cpu(newext->ee_block);
1147 len1 = le16_to_cpu(newext->ee_len); 1247 len1 = ext4_ext_get_actual_len(newext);
1148 depth = ext_depth(inode); 1248 depth = ext_depth(inode);
1149 if (!path[depth].p_ext) 1249 if (!path[depth].p_ext)
1150 goto out; 1250 goto out;
@@ -1191,8 +1291,9 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1191 struct ext4_extent *nearex; /* nearest extent */ 1291 struct ext4_extent *nearex; /* nearest extent */
1192 struct ext4_ext_path *npath = NULL; 1292 struct ext4_ext_path *npath = NULL;
1193 int depth, len, err, next; 1293 int depth, len, err, next;
1294 unsigned uninitialized = 0;
1194 1295
1195 BUG_ON(newext->ee_len == 0); 1296 BUG_ON(ext4_ext_get_actual_len(newext) == 0);
1196 depth = ext_depth(inode); 1297 depth = ext_depth(inode);
1197 ex = path[depth].p_ext; 1298 ex = path[depth].p_ext;
1198 BUG_ON(path[depth].p_hdr == NULL); 1299 BUG_ON(path[depth].p_hdr == NULL);
@@ -1200,14 +1301,24 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1200 /* try to insert block into found extent and return */ 1301 /* try to insert block into found extent and return */
1201 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1302 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1202 ext_debug("append %d block to %d:%d (from %llu)\n", 1303 ext_debug("append %d block to %d:%d (from %llu)\n",
1203 le16_to_cpu(newext->ee_len), 1304 ext4_ext_get_actual_len(newext),
1204 le32_to_cpu(ex->ee_block), 1305 le32_to_cpu(ex->ee_block),
1205 le16_to_cpu(ex->ee_len), ext_pblock(ex)); 1306 ext4_ext_get_actual_len(ex), ext_pblock(ex));
1206 err = ext4_ext_get_access(handle, inode, path + depth); 1307 err = ext4_ext_get_access(handle, inode, path + depth);
1207 if (err) 1308 if (err)
1208 return err; 1309 return err;
1209 ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len) 1310
1210 + le16_to_cpu(newext->ee_len)); 1311 /*
1312 * ext4_can_extents_be_merged should have checked that either
1313 * both extents are uninitialized, or both aren't. Thus we
1314 * need to check only one of them here.
1315 */
1316 if (ext4_ext_is_uninitialized(ex))
1317 uninitialized = 1;
1318 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1319 + ext4_ext_get_actual_len(newext));
1320 if (uninitialized)
1321 ext4_ext_mark_uninitialized(ex);
1211 eh = path[depth].p_hdr; 1322 eh = path[depth].p_hdr;
1212 nearex = ex; 1323 nearex = ex;
1213 goto merge; 1324 goto merge;
@@ -1263,7 +1374,7 @@ has_space:
1263 ext_debug("first extent in the leaf: %d:%llu:%d\n", 1374 ext_debug("first extent in the leaf: %d:%llu:%d\n",
1264 le32_to_cpu(newext->ee_block), 1375 le32_to_cpu(newext->ee_block),
1265 ext_pblock(newext), 1376 ext_pblock(newext),
1266 le16_to_cpu(newext->ee_len)); 1377 ext4_ext_get_actual_len(newext));
1267 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1378 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1268 } else if (le32_to_cpu(newext->ee_block) 1379 } else if (le32_to_cpu(newext->ee_block)
1269 > le32_to_cpu(nearex->ee_block)) { 1380 > le32_to_cpu(nearex->ee_block)) {
@@ -1276,7 +1387,7 @@ has_space:
1276 "move %d from 0x%p to 0x%p\n", 1387 "move %d from 0x%p to 0x%p\n",
1277 le32_to_cpu(newext->ee_block), 1388 le32_to_cpu(newext->ee_block),
1278 ext_pblock(newext), 1389 ext_pblock(newext),
1279 le16_to_cpu(newext->ee_len), 1390 ext4_ext_get_actual_len(newext),
1280 nearex, len, nearex + 1, nearex + 2); 1391 nearex, len, nearex + 1, nearex + 2);
1281 memmove(nearex + 2, nearex + 1, len); 1392 memmove(nearex + 2, nearex + 1, len);
1282 } 1393 }
@@ -1289,7 +1400,7 @@ has_space:
1289 "move %d from 0x%p to 0x%p\n", 1400 "move %d from 0x%p to 0x%p\n",
1290 le32_to_cpu(newext->ee_block), 1401 le32_to_cpu(newext->ee_block),
1291 ext_pblock(newext), 1402 ext_pblock(newext),
1292 le16_to_cpu(newext->ee_len), 1403 ext4_ext_get_actual_len(newext),
1293 nearex, len, nearex + 1, nearex + 2); 1404 nearex, len, nearex + 1, nearex + 2);
1294 memmove(nearex + 1, nearex, len); 1405 memmove(nearex + 1, nearex, len);
1295 path[depth].p_ext = nearex; 1406 path[depth].p_ext = nearex;
@@ -1304,20 +1415,7 @@ has_space:
1304 1415
1305merge: 1416merge:
1306 /* try to merge extents to the right */ 1417 /* try to merge extents to the right */
1307 while (nearex < EXT_LAST_EXTENT(eh)) { 1418 ext4_ext_try_to_merge(inode, path, nearex);
1308 if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
1309 break;
1310 /* merge with next extent! */
1311 nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
1312 + le16_to_cpu(nearex[1].ee_len));
1313 if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
1314 len = (EXT_LAST_EXTENT(eh) - nearex - 1)
1315 * sizeof(struct ext4_extent);
1316 memmove(nearex + 1, nearex + 2, len);
1317 }
1318 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1319 BUG_ON(eh->eh_entries == 0);
1320 }
1321 1419
1322 /* try to merge extents to the left */ 1420 /* try to merge extents to the left */
1323 1421
@@ -1379,8 +1477,8 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block,
1379 end = le32_to_cpu(ex->ee_block); 1477 end = le32_to_cpu(ex->ee_block);
1380 if (block + num < end) 1478 if (block + num < end)
1381 end = block + num; 1479 end = block + num;
1382 } else if (block >= 1480 } else if (block >= le32_to_cpu(ex->ee_block)
1383 le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) { 1481 + ext4_ext_get_actual_len(ex)) {
1384 /* need to allocate space after found extent */ 1482 /* need to allocate space after found extent */
1385 start = block; 1483 start = block;
1386 end = block + num; 1484 end = block + num;
@@ -1392,7 +1490,8 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block,
1392 * by found extent 1490 * by found extent
1393 */ 1491 */
1394 start = block; 1492 start = block;
1395 end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len); 1493 end = le32_to_cpu(ex->ee_block)
1494 + ext4_ext_get_actual_len(ex);
1396 if (block + num < end) 1495 if (block + num < end)
1397 end = block + num; 1496 end = block + num;
1398 exists = 1; 1497 exists = 1;
@@ -1408,7 +1507,7 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block,
1408 cbex.ec_type = EXT4_EXT_CACHE_GAP; 1507 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1409 } else { 1508 } else {
1410 cbex.ec_block = le32_to_cpu(ex->ee_block); 1509 cbex.ec_block = le32_to_cpu(ex->ee_block);
1411 cbex.ec_len = le16_to_cpu(ex->ee_len); 1510 cbex.ec_len = ext4_ext_get_actual_len(ex);
1412 cbex.ec_start = ext_pblock(ex); 1511 cbex.ec_start = ext_pblock(ex);
1413 cbex.ec_type = EXT4_EXT_CACHE_EXTENT; 1512 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1414 } 1513 }
@@ -1481,15 +1580,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1481 ext_debug("cache gap(before): %lu [%lu:%lu]", 1580 ext_debug("cache gap(before): %lu [%lu:%lu]",
1482 (unsigned long) block, 1581 (unsigned long) block,
1483 (unsigned long) le32_to_cpu(ex->ee_block), 1582 (unsigned long) le32_to_cpu(ex->ee_block),
1484 (unsigned long) le16_to_cpu(ex->ee_len)); 1583 (unsigned long) ext4_ext_get_actual_len(ex));
1485 } else if (block >= le32_to_cpu(ex->ee_block) 1584 } else if (block >= le32_to_cpu(ex->ee_block)
1486 + le16_to_cpu(ex->ee_len)) { 1585 + ext4_ext_get_actual_len(ex)) {
1487 lblock = le32_to_cpu(ex->ee_block) 1586 lblock = le32_to_cpu(ex->ee_block)
1488 + le16_to_cpu(ex->ee_len); 1587 + ext4_ext_get_actual_len(ex);
1489 len = ext4_ext_next_allocated_block(path); 1588 len = ext4_ext_next_allocated_block(path);
1490 ext_debug("cache gap(after): [%lu:%lu] %lu", 1589 ext_debug("cache gap(after): [%lu:%lu] %lu",
1491 (unsigned long) le32_to_cpu(ex->ee_block), 1590 (unsigned long) le32_to_cpu(ex->ee_block),
1492 (unsigned long) le16_to_cpu(ex->ee_len), 1591 (unsigned long) ext4_ext_get_actual_len(ex),
1493 (unsigned long) block); 1592 (unsigned long) block);
1494 BUG_ON(len == lblock); 1593 BUG_ON(len == lblock);
1495 len = len - lblock; 1594 len = len - lblock;
@@ -1619,12 +1718,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1619 unsigned long from, unsigned long to) 1718 unsigned long from, unsigned long to)
1620{ 1719{
1621 struct buffer_head *bh; 1720 struct buffer_head *bh;
1721 unsigned short ee_len = ext4_ext_get_actual_len(ex);
1622 int i; 1722 int i;
1623 1723
1624#ifdef EXTENTS_STATS 1724#ifdef EXTENTS_STATS
1625 { 1725 {
1626 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1726 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1627 unsigned short ee_len = le16_to_cpu(ex->ee_len);
1628 spin_lock(&sbi->s_ext_stats_lock); 1727 spin_lock(&sbi->s_ext_stats_lock);
1629 sbi->s_ext_blocks += ee_len; 1728 sbi->s_ext_blocks += ee_len;
1630 sbi->s_ext_extents++; 1729 sbi->s_ext_extents++;
@@ -1638,12 +1737,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1638 } 1737 }
1639#endif 1738#endif
1640 if (from >= le32_to_cpu(ex->ee_block) 1739 if (from >= le32_to_cpu(ex->ee_block)
1641 && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { 1740 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
1642 /* tail removal */ 1741 /* tail removal */
1643 unsigned long num; 1742 unsigned long num;
1644 ext4_fsblk_t start; 1743 ext4_fsblk_t start;
1645 num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from; 1744 num = le32_to_cpu(ex->ee_block) + ee_len - from;
1646 start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num; 1745 start = ext_pblock(ex) + ee_len - num;
1647 ext_debug("free last %lu blocks starting %llu\n", num, start); 1746 ext_debug("free last %lu blocks starting %llu\n", num, start);
1648 for (i = 0; i < num; i++) { 1747 for (i = 0; i < num; i++) {
1649 bh = sb_find_get_block(inode->i_sb, start + i); 1748 bh = sb_find_get_block(inode->i_sb, start + i);
@@ -1651,12 +1750,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1651 } 1750 }
1652 ext4_free_blocks(handle, inode, start, num); 1751 ext4_free_blocks(handle, inode, start, num);
1653 } else if (from == le32_to_cpu(ex->ee_block) 1752 } else if (from == le32_to_cpu(ex->ee_block)
1654 && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { 1753 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
1655 printk("strange request: removal %lu-%lu from %u:%u\n", 1754 printk("strange request: removal %lu-%lu from %u:%u\n",
1656 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); 1755 from, to, le32_to_cpu(ex->ee_block), ee_len);
1657 } else { 1756 } else {
1658 printk("strange request: removal(2) %lu-%lu from %u:%u\n", 1757 printk("strange request: removal(2) %lu-%lu from %u:%u\n",
1659 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); 1758 from, to, le32_to_cpu(ex->ee_block), ee_len);
1660 } 1759 }
1661 return 0; 1760 return 0;
1662} 1761}
@@ -1671,21 +1770,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1671 unsigned a, b, block, num; 1770 unsigned a, b, block, num;
1672 unsigned long ex_ee_block; 1771 unsigned long ex_ee_block;
1673 unsigned short ex_ee_len; 1772 unsigned short ex_ee_len;
1773 unsigned uninitialized = 0;
1674 struct ext4_extent *ex; 1774 struct ext4_extent *ex;
1675 1775
1776 /* the header must be checked already in ext4_ext_remove_space() */
1676 ext_debug("truncate since %lu in leaf\n", start); 1777 ext_debug("truncate since %lu in leaf\n", start);
1677 if (!path[depth].p_hdr) 1778 if (!path[depth].p_hdr)
1678 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 1779 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1679 eh = path[depth].p_hdr; 1780 eh = path[depth].p_hdr;
1680 BUG_ON(eh == NULL); 1781 BUG_ON(eh == NULL);
1681 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
1682 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
1683 1782
1684 /* find where to start removing */ 1783 /* find where to start removing */
1685 ex = EXT_LAST_EXTENT(eh); 1784 ex = EXT_LAST_EXTENT(eh);
1686 1785
1687 ex_ee_block = le32_to_cpu(ex->ee_block); 1786 ex_ee_block = le32_to_cpu(ex->ee_block);
1688 ex_ee_len = le16_to_cpu(ex->ee_len); 1787 if (ext4_ext_is_uninitialized(ex))
1788 uninitialized = 1;
1789 ex_ee_len = ext4_ext_get_actual_len(ex);
1689 1790
1690 while (ex >= EXT_FIRST_EXTENT(eh) && 1791 while (ex >= EXT_FIRST_EXTENT(eh) &&
1691 ex_ee_block + ex_ee_len > start) { 1792 ex_ee_block + ex_ee_len > start) {
@@ -1753,6 +1854,12 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1753 1854
1754 ex->ee_block = cpu_to_le32(block); 1855 ex->ee_block = cpu_to_le32(block);
1755 ex->ee_len = cpu_to_le16(num); 1856 ex->ee_len = cpu_to_le16(num);
1857 /*
1858 * Do not mark uninitialized if all the blocks in the
1859 * extent have been removed.
1860 */
1861 if (uninitialized && num)
1862 ext4_ext_mark_uninitialized(ex);
1756 1863
1757 err = ext4_ext_dirty(handle, inode, path + depth); 1864 err = ext4_ext_dirty(handle, inode, path + depth);
1758 if (err) 1865 if (err)
@@ -1762,7 +1869,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1762 ext_pblock(ex)); 1869 ext_pblock(ex));
1763 ex--; 1870 ex--;
1764 ex_ee_block = le32_to_cpu(ex->ee_block); 1871 ex_ee_block = le32_to_cpu(ex->ee_block);
1765 ex_ee_len = le16_to_cpu(ex->ee_len); 1872 ex_ee_len = ext4_ext_get_actual_len(ex);
1766 } 1873 }
1767 1874
1768 if (correct_index && eh->eh_entries) 1875 if (correct_index && eh->eh_entries)
@@ -1825,7 +1932,7 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start)
1825 return -ENOMEM; 1932 return -ENOMEM;
1826 } 1933 }
1827 path[0].p_hdr = ext_inode_hdr(inode); 1934 path[0].p_hdr = ext_inode_hdr(inode);
1828 if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) { 1935 if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
1829 err = -EIO; 1936 err = -EIO;
1830 goto out; 1937 goto out;
1831 } 1938 }
@@ -1846,17 +1953,8 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start)
1846 if (!path[i].p_hdr) { 1953 if (!path[i].p_hdr) {
1847 ext_debug("initialize header\n"); 1954 ext_debug("initialize header\n");
1848 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 1955 path[i].p_hdr = ext_block_hdr(path[i].p_bh);
1849 if (ext4_ext_check_header(__FUNCTION__, inode,
1850 path[i].p_hdr)) {
1851 err = -EIO;
1852 goto out;
1853 }
1854 } 1956 }
1855 1957
1856 BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries)
1857 > le16_to_cpu(path[i].p_hdr->eh_max));
1858 BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC);
1859
1860 if (!path[i].p_idx) { 1958 if (!path[i].p_idx) {
1861 /* this level hasn't been touched yet */ 1959 /* this level hasn't been touched yet */
1862 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 1960 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
@@ -1873,17 +1971,27 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start)
1873 i, EXT_FIRST_INDEX(path[i].p_hdr), 1971 i, EXT_FIRST_INDEX(path[i].p_hdr),
1874 path[i].p_idx); 1972 path[i].p_idx);
1875 if (ext4_ext_more_to_rm(path + i)) { 1973 if (ext4_ext_more_to_rm(path + i)) {
1974 struct buffer_head *bh;
1876 /* go to the next level */ 1975 /* go to the next level */
1877 ext_debug("move to level %d (block %llu)\n", 1976 ext_debug("move to level %d (block %llu)\n",
1878 i + 1, idx_pblock(path[i].p_idx)); 1977 i + 1, idx_pblock(path[i].p_idx));
1879 memset(path + i + 1, 0, sizeof(*path)); 1978 memset(path + i + 1, 0, sizeof(*path));
1880 path[i+1].p_bh = 1979 bh = sb_bread(sb, idx_pblock(path[i].p_idx));
1881 sb_bread(sb, idx_pblock(path[i].p_idx)); 1980 if (!bh) {
1882 if (!path[i+1].p_bh) {
1883 /* should we reset i_size? */ 1981 /* should we reset i_size? */
1884 err = -EIO; 1982 err = -EIO;
1885 break; 1983 break;
1886 } 1984 }
1985 if (WARN_ON(i + 1 > depth)) {
1986 err = -EIO;
1987 break;
1988 }
1989 if (ext4_ext_check_header(inode, ext_block_hdr(bh),
1990 depth - i - 1)) {
1991 err = -EIO;
1992 break;
1993 }
1994 path[i + 1].p_bh = bh;
1887 1995
1888 /* save actual number of indexes since this 1996 /* save actual number of indexes since this
1889 * number is changed at the next iteration */ 1997 * number is changed at the next iteration */
@@ -1977,15 +2085,158 @@ void ext4_ext_release(struct super_block *sb)
1977#endif 2085#endif
1978} 2086}
1979 2087
2088/*
2089 * This function is called by ext4_ext_get_blocks() if someone tries to write
2090 * to an uninitialized extent. It may result in splitting the uninitialized
2091 * extent into multiple extents (upto three - one initialized and two
2092 * uninitialized).
2093 * There are three possibilities:
2094 * a> There is no split required: Entire extent should be initialized
2095 * b> Splits in two extents: Write is happening at either end of the extent
2096 * c> Splits in three extents: Somone is writing in middle of the extent
2097 */
2098int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
2099 struct ext4_ext_path *path,
2100 ext4_fsblk_t iblock,
2101 unsigned long max_blocks)
2102{
2103 struct ext4_extent *ex, newex;
2104 struct ext4_extent *ex1 = NULL;
2105 struct ext4_extent *ex2 = NULL;
2106 struct ext4_extent *ex3 = NULL;
2107 struct ext4_extent_header *eh;
2108 unsigned int allocated, ee_block, ee_len, depth;
2109 ext4_fsblk_t newblock;
2110 int err = 0;
2111 int ret = 0;
2112
2113 depth = ext_depth(inode);
2114 eh = path[depth].p_hdr;
2115 ex = path[depth].p_ext;
2116 ee_block = le32_to_cpu(ex->ee_block);
2117 ee_len = ext4_ext_get_actual_len(ex);
2118 allocated = ee_len - (iblock - ee_block);
2119 newblock = iblock - ee_block + ext_pblock(ex);
2120 ex2 = ex;
2121
2122 /* ex1: ee_block to iblock - 1 : uninitialized */
2123 if (iblock > ee_block) {
2124 ex1 = ex;
2125 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2126 ext4_ext_mark_uninitialized(ex1);
2127 ex2 = &newex;
2128 }
2129 /*
2130 * for sanity, update the length of the ex2 extent before
2131 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2132 * overlap of blocks.
2133 */
2134 if (!ex1 && allocated > max_blocks)
2135 ex2->ee_len = cpu_to_le16(max_blocks);
2136 /* ex3: to ee_block + ee_len : uninitialised */
2137 if (allocated > max_blocks) {
2138 unsigned int newdepth;
2139 ex3 = &newex;
2140 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2141 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2142 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2143 ext4_ext_mark_uninitialized(ex3);
2144 err = ext4_ext_insert_extent(handle, inode, path, ex3);
2145 if (err)
2146 goto out;
2147 /*
2148 * The depth, and hence eh & ex might change
2149 * as part of the insert above.
2150 */
2151 newdepth = ext_depth(inode);
2152 if (newdepth != depth) {
2153 depth = newdepth;
2154 path = ext4_ext_find_extent(inode, iblock, NULL);
2155 if (IS_ERR(path)) {
2156 err = PTR_ERR(path);
2157 path = NULL;
2158 goto out;
2159 }
2160 eh = path[depth].p_hdr;
2161 ex = path[depth].p_ext;
2162 if (ex2 != &newex)
2163 ex2 = ex;
2164 }
2165 allocated = max_blocks;
2166 }
2167 /*
2168 * If there was a change of depth as part of the
2169 * insertion of ex3 above, we need to update the length
2170 * of the ex1 extent again here
2171 */
2172 if (ex1 && ex1 != ex) {
2173 ex1 = ex;
2174 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2175 ext4_ext_mark_uninitialized(ex1);
2176 ex2 = &newex;
2177 }
2178 /* ex2: iblock to iblock + maxblocks-1 : initialised */
2179 ex2->ee_block = cpu_to_le32(iblock);
2180 ex2->ee_start = cpu_to_le32(newblock);
2181 ext4_ext_store_pblock(ex2, newblock);
2182 ex2->ee_len = cpu_to_le16(allocated);
2183 if (ex2 != ex)
2184 goto insert;
2185 err = ext4_ext_get_access(handle, inode, path + depth);
2186 if (err)
2187 goto out;
2188 /*
2189 * New (initialized) extent starts from the first block
2190 * in the current extent. i.e., ex2 == ex
2191 * We have to see if it can be merged with the extent
2192 * on the left.
2193 */
2194 if (ex2 > EXT_FIRST_EXTENT(eh)) {
2195 /*
2196 * To merge left, pass "ex2 - 1" to try_to_merge(),
2197 * since it merges towards right _only_.
2198 */
2199 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2200 if (ret) {
2201 err = ext4_ext_correct_indexes(handle, inode, path);
2202 if (err)
2203 goto out;
2204 depth = ext_depth(inode);
2205 ex2--;
2206 }
2207 }
2208 /*
2209 * Try to Merge towards right. This might be required
2210 * only when the whole extent is being written to.
2211 * i.e. ex2 == ex and ex3 == NULL.
2212 */
2213 if (!ex3) {
2214 ret = ext4_ext_try_to_merge(inode, path, ex2);
2215 if (ret) {
2216 err = ext4_ext_correct_indexes(handle, inode, path);
2217 if (err)
2218 goto out;
2219 }
2220 }
2221 /* Mark modified extent as dirty */
2222 err = ext4_ext_dirty(handle, inode, path + depth);
2223 goto out;
2224insert:
2225 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2226out:
2227 return err ? err : allocated;
2228}
2229
1980int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2230int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1981 ext4_fsblk_t iblock, 2231 ext4_fsblk_t iblock,
1982 unsigned long max_blocks, struct buffer_head *bh_result, 2232 unsigned long max_blocks, struct buffer_head *bh_result,
1983 int create, int extend_disksize) 2233 int create, int extend_disksize)
1984{ 2234{
1985 struct ext4_ext_path *path = NULL; 2235 struct ext4_ext_path *path = NULL;
2236 struct ext4_extent_header *eh;
1986 struct ext4_extent newex, *ex; 2237 struct ext4_extent newex, *ex;
1987 ext4_fsblk_t goal, newblock; 2238 ext4_fsblk_t goal, newblock;
1988 int err = 0, depth; 2239 int err = 0, depth, ret;
1989 unsigned long allocated = 0; 2240 unsigned long allocated = 0;
1990 2241
1991 __clear_bit(BH_New, &bh_result->b_state); 2242 __clear_bit(BH_New, &bh_result->b_state);
@@ -1998,8 +2249,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1998 if (goal) { 2249 if (goal) {
1999 if (goal == EXT4_EXT_CACHE_GAP) { 2250 if (goal == EXT4_EXT_CACHE_GAP) {
2000 if (!create) { 2251 if (!create) {
2001 /* block isn't allocated yet and 2252 /*
2002 * user doesn't want to allocate it */ 2253 * block isn't allocated yet and
2254 * user doesn't want to allocate it
2255 */
2003 goto out2; 2256 goto out2;
2004 } 2257 }
2005 /* we should allocate requested block */ 2258 /* we should allocate requested block */
@@ -2033,21 +2286,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2033 * this is why assert can't be put in ext4_ext_find_extent() 2286 * this is why assert can't be put in ext4_ext_find_extent()
2034 */ 2287 */
2035 BUG_ON(path[depth].p_ext == NULL && depth != 0); 2288 BUG_ON(path[depth].p_ext == NULL && depth != 0);
2289 eh = path[depth].p_hdr;
2036 2290
2037 ex = path[depth].p_ext; 2291 ex = path[depth].p_ext;
2038 if (ex) { 2292 if (ex) {
2039 unsigned long ee_block = le32_to_cpu(ex->ee_block); 2293 unsigned long ee_block = le32_to_cpu(ex->ee_block);
2040 ext4_fsblk_t ee_start = ext_pblock(ex); 2294 ext4_fsblk_t ee_start = ext_pblock(ex);
2041 unsigned short ee_len = le16_to_cpu(ex->ee_len); 2295 unsigned short ee_len;
2042 2296
2043 /* 2297 /*
2044 * Allow future support for preallocated extents to be added
2045 * as an RO_COMPAT feature:
2046 * Uninitialized extents are treated as holes, except that 2298 * Uninitialized extents are treated as holes, except that
2047 * we avoid (fail) allocating new blocks during a write. 2299 * we split out initialized portions during a write.
2048 */ 2300 */
2049 if (ee_len > EXT_MAX_LEN) 2301 ee_len = ext4_ext_get_actual_len(ex);
2050 goto out2;
2051 /* if found extent covers block, simply return it */ 2302 /* if found extent covers block, simply return it */
2052 if (iblock >= ee_block && iblock < ee_block + ee_len) { 2303 if (iblock >= ee_block && iblock < ee_block + ee_len) {
2053 newblock = iblock - ee_block + ee_start; 2304 newblock = iblock - ee_block + ee_start;
@@ -2055,9 +2306,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2055 allocated = ee_len - (iblock - ee_block); 2306 allocated = ee_len - (iblock - ee_block);
2056 ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, 2307 ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
2057 ee_block, ee_len, newblock); 2308 ee_block, ee_len, newblock);
2058 ext4_ext_put_in_cache(inode, ee_block, ee_len, 2309
2059 ee_start, EXT4_EXT_CACHE_EXTENT); 2310 /* Do not put uninitialized extent in the cache */
2060 goto out; 2311 if (!ext4_ext_is_uninitialized(ex)) {
2312 ext4_ext_put_in_cache(inode, ee_block,
2313 ee_len, ee_start,
2314 EXT4_EXT_CACHE_EXTENT);
2315 goto out;
2316 }
2317 if (create == EXT4_CREATE_UNINITIALIZED_EXT)
2318 goto out;
2319 if (!create)
2320 goto out2;
2321
2322 ret = ext4_ext_convert_to_initialized(handle, inode,
2323 path, iblock,
2324 max_blocks);
2325 if (ret <= 0)
2326 goto out2;
2327 else
2328 allocated = ret;
2329 goto outnew;
2061 } 2330 }
2062 } 2331 }
2063 2332
@@ -2066,8 +2335,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2066 * we couldn't try to create block if create flag is zero 2335 * we couldn't try to create block if create flag is zero
2067 */ 2336 */
2068 if (!create) { 2337 if (!create) {
2069 /* put just found gap into cache to speed up 2338 /*
2070 * subsequent requests */ 2339 * put just found gap into cache to speed up
2340 * subsequent requests
2341 */
2071 ext4_ext_put_gap_in_cache(inode, path, iblock); 2342 ext4_ext_put_gap_in_cache(inode, path, iblock);
2072 goto out2; 2343 goto out2;
2073 } 2344 }
@@ -2081,6 +2352,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2081 /* allocate new block */ 2352 /* allocate new block */
2082 goal = ext4_ext_find_goal(inode, path, iblock); 2353 goal = ext4_ext_find_goal(inode, path, iblock);
2083 2354
2355 /*
2356 * See if request is beyond maximum number of blocks we can have in
2357 * a single extent. For an initialized extent this limit is
2358 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
2359 * EXT_UNINIT_MAX_LEN.
2360 */
2361 if (max_blocks > EXT_INIT_MAX_LEN &&
2362 create != EXT4_CREATE_UNINITIALIZED_EXT)
2363 max_blocks = EXT_INIT_MAX_LEN;
2364 else if (max_blocks > EXT_UNINIT_MAX_LEN &&
2365 create == EXT4_CREATE_UNINITIALIZED_EXT)
2366 max_blocks = EXT_UNINIT_MAX_LEN;
2367
2084 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 2368 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
2085 newex.ee_block = cpu_to_le32(iblock); 2369 newex.ee_block = cpu_to_le32(iblock);
2086 newex.ee_len = cpu_to_le16(max_blocks); 2370 newex.ee_len = cpu_to_le16(max_blocks);
@@ -2098,6 +2382,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2098 /* try to insert new extent into found leaf and return */ 2382 /* try to insert new extent into found leaf and return */
2099 ext4_ext_store_pblock(&newex, newblock); 2383 ext4_ext_store_pblock(&newex, newblock);
2100 newex.ee_len = cpu_to_le16(allocated); 2384 newex.ee_len = cpu_to_le16(allocated);
2385 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
2386 ext4_ext_mark_uninitialized(&newex);
2101 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2387 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2102 if (err) { 2388 if (err) {
2103 /* free data blocks we just allocated */ 2389 /* free data blocks we just allocated */
@@ -2111,10 +2397,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2111 2397
2112 /* previous routine could use block we allocated */ 2398 /* previous routine could use block we allocated */
2113 newblock = ext_pblock(&newex); 2399 newblock = ext_pblock(&newex);
2400outnew:
2114 __set_bit(BH_New, &bh_result->b_state); 2401 __set_bit(BH_New, &bh_result->b_state);
2115 2402
2116 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 2403 /* Cache only when it is _not_ an uninitialized extent */
2117 EXT4_EXT_CACHE_EXTENT); 2404 if (create != EXT4_CREATE_UNINITIALIZED_EXT)
2405 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2406 EXT4_EXT_CACHE_EXTENT);
2118out: 2407out:
2119 if (allocated > max_blocks) 2408 if (allocated > max_blocks)
2120 allocated = max_blocks; 2409 allocated = max_blocks;
@@ -2178,7 +2467,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2178 err = ext4_ext_remove_space(inode, last_block); 2467 err = ext4_ext_remove_space(inode, last_block);
2179 2468
2180 /* In a multi-transaction truncate, we only make the final 2469 /* In a multi-transaction truncate, we only make the final
2181 * transaction synchronous. */ 2470 * transaction synchronous.
2471 */
2182 if (IS_SYNC(inode)) 2472 if (IS_SYNC(inode))
2183 handle->h_sync = 1; 2473 handle->h_sync = 1;
2184 2474
@@ -2217,3 +2507,127 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2217 2507
2218 return needed; 2508 return needed;
2219} 2509}
2510
/*
 * preallocate space for a file. This implements ext4's fallocate inode
 * operation, which gets called from sys_fallocate system call.
 * For block-mapped files, posix_fallocate should fall back to the method
 * of writing zeroes to the required new blocks (the same behavior which is
 * expected for file systems which do not support fallocate() system call).
 *
 * @inode:  file to preallocate blocks for (must be extent-mapped)
 * @mode:   fallocate mode flags; FALLOC_FL_KEEP_SIZE suppresses i_size update
 * @offset: byte offset of the range to preallocate
 * @len:    byte length of the range to preallocate
 *
 * Returns the number of blocks mapped in the final pass on success (or the
 * journal-stop status if non-zero), or a negative error number on failure.
 */
long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
{
	handle_t *handle;
	ext4_fsblk_t block, max_blocks;
	ext4_fsblk_t nblocks = 0;	/* new blocks allocated past current EOF */
	int ret = 0;			/* blocks mapped by last get_blocks call */
	int ret2 = 0;			/* journal stop status */
	int retries = 0;
	struct buffer_head map_bh;	/* scratch mapping target for get_blocks */
	unsigned int credits, blkbits = inode->i_blkbits;

	/*
	 * currently supporting (pre)allocate mode for extent-based
	 * files _only_
	 */
	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
		return -EOPNOTSUPP;

	/* preallocation to directories is currently not supported */
	if (S_ISDIR(inode->i_mode))
		return -ENODEV;

	/* Convert the byte range to a block range, rounding the end up. */
	block = offset >> blkbits;
	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
			- block;

	/*
	 * credits to insert 1 extent into extent tree + buffers to be able to
	 * modify 1 super block, 1 block bitmap and 1 group descriptor.
	 */
	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
retry:
	/*
	 * Allocate the range piecewise: each pass runs under its own journal
	 * handle, asks for everything still missing, then advances past
	 * whatever ext4_ext_get_blocks actually mapped.
	 */
	while (ret >= 0 && ret < max_blocks) {
		block = block + ret;
		max_blocks = max_blocks - ret;
		handle = ext4_journal_start(inode, credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			break;
		}

		ret = ext4_ext_get_blocks(handle, inode, block,
					max_blocks, &map_bh,
					EXT4_CREATE_UNINITIALIZED_EXT, 0);
		WARN_ON(!ret);
		if (!ret) {
			/* zero with no error indicates a mapping bug; abort */
			ext4_error(inode->i_sb, "ext4_fallocate",
				    "ext4_ext_get_blocks returned 0! inode#%lu"
				    ", block=%llu, max_blocks=%llu",
				    inode->i_ino, block, max_blocks);
			ret = -EIO;
			ext4_mark_inode_dirty(handle, inode);
			ret2 = ext4_journal_stop(handle);
			break;
		}
		if (ret > 0) {
			/* check wrap through sign-bit/zero here */
			/*
			 * NOTE(review): if ext4_fsblk_t is unsigned, the
			 * "(block + ret) < 0" half of this test can never be
			 * true and only the wrap-past-'block' half matters —
			 * verify against the ext4_fsblk_t typedef.
			 */
			if ((block + ret) < 0 || (block + ret) < block) {
				ret = -EIO;
				ext4_mark_inode_dirty(handle, inode);
				ret2 = ext4_journal_stop(handle);
				break;
			}
			/* count only freshly allocated blocks beyond EOF */
			if (buffer_new(&map_bh) && ((block + ret) >
			    (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits)
			    >> blkbits)))
				nblocks = nblocks + ret;
		}

		/* Update ctime if new blocks get allocated */
		if (nblocks) {
			struct timespec now;

			now = current_fs_time(inode->i_sb);
			if (!timespec_equal(&inode->i_ctime, &now))
				inode->i_ctime = now;
		}

		ext4_mark_inode_dirty(handle, inode);
		ret2 = ext4_journal_stop(handle);
		if (ret2)
			break;
	}

	/* a transient ENOSPC may clear once committing frees journal space */
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;

	/*
	 * Time to update the file size.
	 * Update only when preallocation was requested beyond the file size.
	 */
	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    (offset + len) > i_size_read(inode)) {
		if (ret > 0) {
			/*
			 * if no error, we assume preallocation succeeded
			 * completely
			 */
			mutex_lock(&inode->i_mutex);
			i_size_write(inode, offset + len);
			EXT4_I(inode)->i_disksize = i_size_read(inode);
			mutex_unlock(&inode->i_mutex);
		} else if (ret < 0 && nblocks) {
			/* Handle partial allocation scenario */
			loff_t newsize;

			mutex_lock(&inode->i_mutex);
			newsize = (nblocks << blkbits) + i_size_read(inode);
			i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits));
			EXT4_I(inode)->i_disksize = i_size_read(inode);
			mutex_unlock(&inode->i_mutex);
		}
	}

	return ret > 0 ? ret2 : ret;
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3c6c1fd2be..1a81cd66d6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -120,7 +120,6 @@ const struct file_operations ext4_file_operations = {
120 .open = generic_file_open, 120 .open = generic_file_open,
121 .release = ext4_release_file, 121 .release = ext4_release_file,
122 .fsync = ext4_sync_file, 122 .fsync = ext4_sync_file,
123 .sendfile = generic_file_sendfile,
124 .splice_read = generic_file_splice_read, 123 .splice_read = generic_file_splice_read,
125 .splice_write = generic_file_splice_write, 124 .splice_write = generic_file_splice_write,
126}; 125};
@@ -135,5 +134,6 @@ const struct inode_operations ext4_file_inode_operations = {
135 .removexattr = generic_removexattr, 134 .removexattr = generic_removexattr,
136#endif 135#endif
137 .permission = ext4_permission, 136 .permission = ext4_permission,
137 .fallocate = ext4_fallocate,
138}; 138};
139 139
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c88b439ba5..427f83066a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -563,7 +563,8 @@ got:
563 inode->i_ino = ino; 563 inode->i_ino = ino;
564 /* This is the optimal IO size (for stat), not the fs block size */ 564 /* This is the optimal IO size (for stat), not the fs block size */
565 inode->i_blocks = 0; 565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 566 inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
567 ext4_current_time(inode);
567 568
568 memset(ei->i_data, 0, sizeof(ei->i_data)); 569 memset(ei->i_data, 0, sizeof(ei->i_data));
569 ei->i_dir_start_lookup = 0; 570 ei->i_dir_start_lookup = 0;
@@ -595,9 +596,8 @@ got:
595 spin_unlock(&sbi->s_next_gen_lock); 596 spin_unlock(&sbi->s_next_gen_lock);
596 597
597 ei->i_state = EXT4_STATE_NEW; 598 ei->i_state = EXT4_STATE_NEW;
598 ei->i_extra_isize = 599
599 (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ? 600 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
600 sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0;
601 601
602 ret = inode; 602 ret = inode;
603 if(DQUOT_ALLOC_INODE(inode)) { 603 if(DQUOT_ALLOC_INODE(inode)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0bcf62a750..de26c25d6a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -726,7 +726,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
726 726
727 /* We are done with atomic stuff, now do the rest of housekeeping */ 727 /* We are done with atomic stuff, now do the rest of housekeeping */
728 728
729 inode->i_ctime = CURRENT_TIME_SEC; 729 inode->i_ctime = ext4_current_time(inode);
730 ext4_mark_inode_dirty(handle, inode); 730 ext4_mark_inode_dirty(handle, inode);
731 731
732 /* had we spliced it onto indirect block? */ 732 /* had we spliced it onto indirect block? */
@@ -1766,7 +1766,6 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1766 struct inode *inode = mapping->host; 1766 struct inode *inode = mapping->host;
1767 struct buffer_head *bh; 1767 struct buffer_head *bh;
1768 int err = 0; 1768 int err = 0;
1769 void *kaddr;
1770 1769
1771 blocksize = inode->i_sb->s_blocksize; 1770 blocksize = inode->i_sb->s_blocksize;
1772 length = blocksize - (offset & (blocksize - 1)); 1771 length = blocksize - (offset & (blocksize - 1));
@@ -1778,10 +1777,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1778 */ 1777 */
1779 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 1778 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1780 ext4_should_writeback_data(inode) && PageUptodate(page)) { 1779 ext4_should_writeback_data(inode) && PageUptodate(page)) {
1781 kaddr = kmap_atomic(page, KM_USER0); 1780 zero_user_page(page, offset, length, KM_USER0);
1782 memset(kaddr + offset, 0, length);
1783 flush_dcache_page(page);
1784 kunmap_atomic(kaddr, KM_USER0);
1785 set_page_dirty(page); 1781 set_page_dirty(page);
1786 goto unlock; 1782 goto unlock;
1787 } 1783 }
@@ -1834,10 +1830,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1834 goto unlock; 1830 goto unlock;
1835 } 1831 }
1836 1832
1837 kaddr = kmap_atomic(page, KM_USER0); 1833 zero_user_page(page, offset, length, KM_USER0);
1838 memset(kaddr + offset, 0, length);
1839 flush_dcache_page(page);
1840 kunmap_atomic(kaddr, KM_USER0);
1841 1834
1842 BUFFER_TRACE(bh, "zeroed end of block"); 1835 BUFFER_TRACE(bh, "zeroed end of block");
1843 1836
@@ -2375,7 +2368,7 @@ do_indirects:
2375 ext4_discard_reservation(inode); 2368 ext4_discard_reservation(inode);
2376 2369
2377 mutex_unlock(&ei->truncate_mutex); 2370 mutex_unlock(&ei->truncate_mutex);
2378 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 2371 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
2379 ext4_mark_inode_dirty(handle, inode); 2372 ext4_mark_inode_dirty(handle, inode);
2380 2373
2381 /* 2374 /*
@@ -2583,6 +2576,25 @@ void ext4_set_inode_flags(struct inode *inode)
2583 inode->i_flags |= S_DIRSYNC; 2576 inode->i_flags |= S_DIRSYNC;
2584} 2577}
2585 2578
2579/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
2580void ext4_get_inode_flags(struct ext4_inode_info *ei)
2581{
2582 unsigned int flags = ei->vfs_inode.i_flags;
2583
2584 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
2585 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
2586 if (flags & S_SYNC)
2587 ei->i_flags |= EXT4_SYNC_FL;
2588 if (flags & S_APPEND)
2589 ei->i_flags |= EXT4_APPEND_FL;
2590 if (flags & S_IMMUTABLE)
2591 ei->i_flags |= EXT4_IMMUTABLE_FL;
2592 if (flags & S_NOATIME)
2593 ei->i_flags |= EXT4_NOATIME_FL;
2594 if (flags & S_DIRSYNC)
2595 ei->i_flags |= EXT4_DIRSYNC_FL;
2596}
2597
2586void ext4_read_inode(struct inode * inode) 2598void ext4_read_inode(struct inode * inode)
2587{ 2599{
2588 struct ext4_iloc iloc; 2600 struct ext4_iloc iloc;
@@ -2610,10 +2622,6 @@ void ext4_read_inode(struct inode * inode)
2610 } 2622 }
2611 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 2623 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2612 inode->i_size = le32_to_cpu(raw_inode->i_size); 2624 inode->i_size = le32_to_cpu(raw_inode->i_size);
2613 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
2614 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
2615 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2616 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2617 2625
2618 ei->i_state = 0; 2626 ei->i_state = 0;
2619 ei->i_dir_start_lookup = 0; 2627 ei->i_dir_start_lookup = 0;
@@ -2673,8 +2681,10 @@ void ext4_read_inode(struct inode * inode)
2673 */ 2681 */
2674 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 2682 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2675 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 2683 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2676 EXT4_INODE_SIZE(inode->i_sb)) 2684 EXT4_INODE_SIZE(inode->i_sb)) {
2685 brelse (bh);
2677 goto bad_inode; 2686 goto bad_inode;
2687 }
2678 if (ei->i_extra_isize == 0) { 2688 if (ei->i_extra_isize == 0) {
2679 /* The extra space is currently unused. Use it. */ 2689 /* The extra space is currently unused. Use it. */
2680 ei->i_extra_isize = sizeof(struct ext4_inode) - 2690 ei->i_extra_isize = sizeof(struct ext4_inode) -
@@ -2689,6 +2699,11 @@ void ext4_read_inode(struct inode * inode)
2689 } else 2699 } else
2690 ei->i_extra_isize = 0; 2700 ei->i_extra_isize = 0;
2691 2701
2702 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
2703 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
2704 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
2705 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
2706
2692 if (S_ISREG(inode->i_mode)) { 2707 if (S_ISREG(inode->i_mode)) {
2693 inode->i_op = &ext4_file_inode_operations; 2708 inode->i_op = &ext4_file_inode_operations;
2694 inode->i_fop = &ext4_file_operations; 2709 inode->i_fop = &ext4_file_operations;
@@ -2742,6 +2757,7 @@ static int ext4_do_update_inode(handle_t *handle,
2742 if (ei->i_state & EXT4_STATE_NEW) 2757 if (ei->i_state & EXT4_STATE_NEW)
2743 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 2758 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
2744 2759
2760 ext4_get_inode_flags(ei);
2745 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 2761 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2746 if(!(test_opt(inode->i_sb, NO_UID32))) { 2762 if(!(test_opt(inode->i_sb, NO_UID32))) {
2747 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 2763 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
@@ -2769,9 +2785,12 @@ static int ext4_do_update_inode(handle_t *handle,
2769 } 2785 }
2770 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 2786 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2771 raw_inode->i_size = cpu_to_le32(ei->i_disksize); 2787 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2772 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 2788
2773 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 2789 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
2774 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 2790 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
2791 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
2792 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
2793
2775 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); 2794 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2776 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 2795 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2777 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 2796 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
@@ -3080,6 +3099,39 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
3080} 3099}
3081 3100
3082/* 3101/*
3102 * Expand an inode by new_extra_isize bytes.
3103 * Returns 0 on success or negative error number on failure.
3104 */
3105int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize,
3106 struct ext4_iloc iloc, handle_t *handle)
3107{
3108 struct ext4_inode *raw_inode;
3109 struct ext4_xattr_ibody_header *header;
3110 struct ext4_xattr_entry *entry;
3111
3112 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
3113 return 0;
3114
3115 raw_inode = ext4_raw_inode(&iloc);
3116
3117 header = IHDR(inode, raw_inode);
3118 entry = IFIRST(header);
3119
3120 /* No extended attributes present */
3121 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
3122 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
3123 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
3124 new_extra_isize);
3125 EXT4_I(inode)->i_extra_isize = new_extra_isize;
3126 return 0;
3127 }
3128
3129 /* try to expand with EAs present */
3130 return ext4_expand_extra_isize_ea(inode, new_extra_isize,
3131 raw_inode, handle);
3132}
3133
3134/*
3083 * What we do here is to mark the in-core inode as clean with respect to inode 3135 * What we do here is to mark the in-core inode as clean with respect to inode
3084 * dirtiness (it may still be data-dirty). 3136 * dirtiness (it may still be data-dirty).
3085 * This means that the in-core inode may be reaped by prune_icache 3137 * This means that the in-core inode may be reaped by prune_icache
@@ -3103,10 +3155,38 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
3103int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 3155int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
3104{ 3156{
3105 struct ext4_iloc iloc; 3157 struct ext4_iloc iloc;
3106 int err; 3158 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3159 static unsigned int mnt_count;
3160 int err, ret;
3107 3161
3108 might_sleep(); 3162 might_sleep();
3109 err = ext4_reserve_inode_write(handle, inode, &iloc); 3163 err = ext4_reserve_inode_write(handle, inode, &iloc);
3164 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
3165 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
3166 /*
3167 * We need extra buffer credits since we may write into EA block
3168 * with this same handle. If journal_extend fails, then it will
3169 * only result in a minor loss of functionality for that inode.
3170 * If this is felt to be critical, then e2fsck should be run to
3171 * force a large enough s_min_extra_isize.
3172 */
3173 if ((jbd2_journal_extend(handle,
3174 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
3175 ret = ext4_expand_extra_isize(inode,
3176 sbi->s_want_extra_isize,
3177 iloc, handle);
3178 if (ret) {
3179 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
3180 if (mnt_count != sbi->s_es->s_mnt_count) {
3181 ext4_warning(inode->i_sb, __FUNCTION__,
3182 "Unable to expand inode %lu. Delete"
3183 " some EAs or run e2fsck.",
3184 inode->i_ino);
3185 mnt_count = sbi->s_es->s_mnt_count;
3186 }
3187 }
3188 }
3189 }
3110 if (!err) 3190 if (!err)
3111 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 3191 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
3112 return err; 3192 return err;
@@ -3195,7 +3275,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
3195 */ 3275 */
3196 3276
3197 journal = EXT4_JOURNAL(inode); 3277 journal = EXT4_JOURNAL(inode);
3198 if (is_journal_aborted(journal) || IS_RDONLY(inode)) 3278 if (is_journal_aborted(journal))
3199 return -EROFS; 3279 return -EROFS;
3200 3280
3201 jbd2_journal_lock_updates(journal); 3281 jbd2_journal_lock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 500567dd53..c04c7ccba9 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -28,6 +28,7 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
28 28
29 switch (cmd) { 29 switch (cmd) {
30 case EXT4_IOC_GETFLAGS: 30 case EXT4_IOC_GETFLAGS:
31 ext4_get_inode_flags(ei);
31 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; 32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
32 return put_user(flags, (int __user *) arg); 33 return put_user(flags, (int __user *) arg);
33 case EXT4_IOC_SETFLAGS: { 34 case EXT4_IOC_SETFLAGS: {
@@ -40,7 +41,7 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
40 if (IS_RDONLY(inode)) 41 if (IS_RDONLY(inode))
41 return -EROFS; 42 return -EROFS;
42 43
43 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 44 if (!is_owner_or_cap(inode))
44 return -EACCES; 45 return -EACCES;
45 46
46 if (get_user(flags, (int __user *) arg)) 47 if (get_user(flags, (int __user *) arg))
@@ -96,7 +97,7 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
96 ei->i_flags = flags; 97 ei->i_flags = flags;
97 98
98 ext4_set_inode_flags(inode); 99 ext4_set_inode_flags(inode);
99 inode->i_ctime = CURRENT_TIME_SEC; 100 inode->i_ctime = ext4_current_time(inode);
100 101
101 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 102 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
102flags_err: 103flags_err:
@@ -121,7 +122,7 @@ flags_err:
121 __u32 generation; 122 __u32 generation;
122 int err; 123 int err;
123 124
124 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 125 if (!is_owner_or_cap(inode))
125 return -EPERM; 126 return -EPERM;
126 if (IS_RDONLY(inode)) 127 if (IS_RDONLY(inode))
127 return -EROFS; 128 return -EROFS;
@@ -133,14 +134,14 @@ flags_err:
133 return PTR_ERR(handle); 134 return PTR_ERR(handle);
134 err = ext4_reserve_inode_write(handle, inode, &iloc); 135 err = ext4_reserve_inode_write(handle, inode, &iloc);
135 if (err == 0) { 136 if (err == 0) {
136 inode->i_ctime = CURRENT_TIME_SEC; 137 inode->i_ctime = ext4_current_time(inode);
137 inode->i_generation = generation; 138 inode->i_generation = generation;
138 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 139 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
139 } 140 }
140 ext4_journal_stop(handle); 141 ext4_journal_stop(handle);
141 return err; 142 return err;
142 } 143 }
143#ifdef CONFIG_JBD_DEBUG 144#ifdef CONFIG_JBD2_DEBUG
144 case EXT4_IOC_WAIT_FOR_READONLY: 145 case EXT4_IOC_WAIT_FOR_READONLY:
145 /* 146 /*
146 * This is racy - by the time we're woken up and running, 147 * This is racy - by the time we're woken up and running,
@@ -180,7 +181,7 @@ flags_err:
180 if (IS_RDONLY(inode)) 181 if (IS_RDONLY(inode))
181 return -EROFS; 182 return -EROFS;
182 183
183 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 184 if (!is_owner_or_cap(inode))
184 return -EACCES; 185 return -EACCES;
185 186
186 if (get_user(rsv_window_size, (int __user *)arg)) 187 if (get_user(rsv_window_size, (int __user *)arg))
@@ -282,7 +283,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
282 case EXT4_IOC32_SETVERSION_OLD: 283 case EXT4_IOC32_SETVERSION_OLD:
283 cmd = EXT4_IOC_SETVERSION_OLD; 284 cmd = EXT4_IOC_SETVERSION_OLD;
284 break; 285 break;
285#ifdef CONFIG_JBD_DEBUG 286#ifdef CONFIG_JBD2_DEBUG
286 case EXT4_IOC32_WAIT_FOR_READONLY: 287 case EXT4_IOC32_WAIT_FOR_READONLY:
287 cmd = EXT4_IOC_WAIT_FOR_READONLY; 288 cmd = EXT4_IOC_WAIT_FOR_READONLY;
288 break; 289 break;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2811e5720a..da224974af 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1017,6 +1017,11 @@ static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, str
1017 1017
1018 if (!inode) 1018 if (!inode)
1019 return ERR_PTR(-EACCES); 1019 return ERR_PTR(-EACCES);
1020
1021 if (is_bad_inode(inode)) {
1022 iput(inode);
1023 return ERR_PTR(-ENOENT);
1024 }
1020 } 1025 }
1021 return d_splice_alias(inode, dentry); 1026 return d_splice_alias(inode, dentry);
1022} 1027}
@@ -1052,6 +1057,11 @@ struct dentry *ext4_get_parent(struct dentry *child)
1052 if (!inode) 1057 if (!inode)
1053 return ERR_PTR(-EACCES); 1058 return ERR_PTR(-EACCES);
1054 1059
1060 if (is_bad_inode(inode)) {
1061 iput(inode);
1062 return ERR_PTR(-ENOENT);
1063 }
1064
1055 parent = d_alloc_anon(inode); 1065 parent = d_alloc_anon(inode);
1056 if (!parent) { 1066 if (!parent) {
1057 iput(inode); 1067 iput(inode);
@@ -1285,7 +1295,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1285 * happen is that the times are slightly out of date 1295 * happen is that the times are slightly out of date
1286 * and/or different from the directory change time. 1296 * and/or different from the directory change time.
1287 */ 1297 */
1288 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 1298 dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
1289 ext4_update_dx_flag(dir); 1299 ext4_update_dx_flag(dir);
1290 dir->i_version++; 1300 dir->i_version++;
1291 ext4_mark_inode_dirty(handle, dir); 1301 ext4_mark_inode_dirty(handle, dir);
@@ -1619,6 +1629,35 @@ static int ext4_delete_entry (handle_t *handle,
1619 return -ENOENT; 1629 return -ENOENT;
1620} 1630}
1621 1631
1632/*
1633 * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
1634 * since this indicates that nlinks count was previously 1.
1635 */
1636static void ext4_inc_count(handle_t *handle, struct inode *inode)
1637{
1638 inc_nlink(inode);
1639 if (is_dx(inode) && inode->i_nlink > 1) {
1640 /* limit is 16-bit i_links_count */
1641 if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
1642 inode->i_nlink = 1;
1643 EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
1644 EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
1645 }
1646 }
1647}
1648
1649/*
1650 * If a directory had nlink == 1, then we should let it be 1. This indicates
1651 * directory has >EXT4_LINK_MAX subdirs.
1652 */
1653static void ext4_dec_count(handle_t *handle, struct inode *inode)
1654{
1655 drop_nlink(inode);
1656 if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
1657 inc_nlink(inode);
1658}
1659
1660
1622static int ext4_add_nondir(handle_t *handle, 1661static int ext4_add_nondir(handle_t *handle,
1623 struct dentry *dentry, struct inode *inode) 1662 struct dentry *dentry, struct inode *inode)
1624{ 1663{
@@ -1715,7 +1754,7 @@ static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1715 struct ext4_dir_entry_2 * de; 1754 struct ext4_dir_entry_2 * de;
1716 int err, retries = 0; 1755 int err, retries = 0;
1717 1756
1718 if (dir->i_nlink >= EXT4_LINK_MAX) 1757 if (EXT4_DIR_LINK_MAX(dir))
1719 return -EMLINK; 1758 return -EMLINK;
1720 1759
1721retry: 1760retry:
@@ -1738,7 +1777,7 @@ retry:
1738 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1777 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1739 dir_block = ext4_bread (handle, inode, 0, 1, &err); 1778 dir_block = ext4_bread (handle, inode, 0, 1, &err);
1740 if (!dir_block) { 1779 if (!dir_block) {
1741 drop_nlink(inode); /* is this nlink == 0? */ 1780 ext4_dec_count(handle, inode); /* is this nlink == 0? */
1742 ext4_mark_inode_dirty(handle, inode); 1781 ext4_mark_inode_dirty(handle, inode);
1743 iput (inode); 1782 iput (inode);
1744 goto out_stop; 1783 goto out_stop;
@@ -1770,7 +1809,7 @@ retry:
1770 iput (inode); 1809 iput (inode);
1771 goto out_stop; 1810 goto out_stop;
1772 } 1811 }
1773 inc_nlink(dir); 1812 ext4_inc_count(handle, dir);
1774 ext4_update_dx_flag(dir); 1813 ext4_update_dx_flag(dir);
1775 ext4_mark_inode_dirty(handle, dir); 1814 ext4_mark_inode_dirty(handle, dir);
1776 d_instantiate(dentry, inode); 1815 d_instantiate(dentry, inode);
@@ -2035,9 +2074,9 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2035 retval = ext4_delete_entry(handle, dir, de, bh); 2074 retval = ext4_delete_entry(handle, dir, de, bh);
2036 if (retval) 2075 if (retval)
2037 goto end_rmdir; 2076 goto end_rmdir;
2038 if (inode->i_nlink != 2) 2077 if (!EXT4_DIR_LINK_EMPTY(inode))
2039 ext4_warning (inode->i_sb, "ext4_rmdir", 2078 ext4_warning (inode->i_sb, "ext4_rmdir",
2040 "empty directory has nlink!=2 (%d)", 2079 "empty directory has too many links (%d)",
2041 inode->i_nlink); 2080 inode->i_nlink);
2042 inode->i_version++; 2081 inode->i_version++;
2043 clear_nlink(inode); 2082 clear_nlink(inode);
@@ -2046,9 +2085,9 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2046 * recovery. */ 2085 * recovery. */
2047 inode->i_size = 0; 2086 inode->i_size = 0;
2048 ext4_orphan_add(handle, inode); 2087 ext4_orphan_add(handle, inode);
2049 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 2088 inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
2050 ext4_mark_inode_dirty(handle, inode); 2089 ext4_mark_inode_dirty(handle, inode);
2051 drop_nlink(dir); 2090 ext4_dec_count(handle, dir);
2052 ext4_update_dx_flag(dir); 2091 ext4_update_dx_flag(dir);
2053 ext4_mark_inode_dirty(handle, dir); 2092 ext4_mark_inode_dirty(handle, dir);
2054 2093
@@ -2096,13 +2135,13 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2096 retval = ext4_delete_entry(handle, dir, de, bh); 2135 retval = ext4_delete_entry(handle, dir, de, bh);
2097 if (retval) 2136 if (retval)
2098 goto end_unlink; 2137 goto end_unlink;
2099 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 2138 dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
2100 ext4_update_dx_flag(dir); 2139 ext4_update_dx_flag(dir);
2101 ext4_mark_inode_dirty(handle, dir); 2140 ext4_mark_inode_dirty(handle, dir);
2102 drop_nlink(inode); 2141 ext4_dec_count(handle, inode);
2103 if (!inode->i_nlink) 2142 if (!inode->i_nlink)
2104 ext4_orphan_add(handle, inode); 2143 ext4_orphan_add(handle, inode);
2105 inode->i_ctime = dir->i_ctime; 2144 inode->i_ctime = ext4_current_time(inode);
2106 ext4_mark_inode_dirty(handle, inode); 2145 ext4_mark_inode_dirty(handle, inode);
2107 retval = 0; 2146 retval = 0;
2108 2147
@@ -2149,7 +2188,7 @@ retry:
2149 err = __page_symlink(inode, symname, l, 2188 err = __page_symlink(inode, symname, l,
2150 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 2189 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2151 if (err) { 2190 if (err) {
2152 drop_nlink(inode); 2191 ext4_dec_count(handle, inode);
2153 ext4_mark_inode_dirty(handle, inode); 2192 ext4_mark_inode_dirty(handle, inode);
2154 iput (inode); 2193 iput (inode);
2155 goto out_stop; 2194 goto out_stop;
@@ -2175,8 +2214,9 @@ static int ext4_link (struct dentry * old_dentry,
2175 struct inode *inode = old_dentry->d_inode; 2214 struct inode *inode = old_dentry->d_inode;
2176 int err, retries = 0; 2215 int err, retries = 0;
2177 2216
2178 if (inode->i_nlink >= EXT4_LINK_MAX) 2217 if (EXT4_DIR_LINK_MAX(inode))
2179 return -EMLINK; 2218 return -EMLINK;
2219
2180 /* 2220 /*
2181 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing 2221 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2182 * otherwise has the potential to corrupt the orphan inode list. 2222 * otherwise has the potential to corrupt the orphan inode list.
@@ -2193,8 +2233,8 @@ retry:
2193 if (IS_DIRSYNC(dir)) 2233 if (IS_DIRSYNC(dir))
2194 handle->h_sync = 1; 2234 handle->h_sync = 1;
2195 2235
2196 inode->i_ctime = CURRENT_TIME_SEC; 2236 inode->i_ctime = ext4_current_time(inode);
2197 inc_nlink(inode); 2237 ext4_inc_count(handle, inode);
2198 atomic_inc(&inode->i_count); 2238 atomic_inc(&inode->i_count);
2199 2239
2200 err = ext4_add_nondir(handle, dentry, inode); 2240 err = ext4_add_nondir(handle, dentry, inode);
@@ -2295,7 +2335,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2295 * Like most other Unix systems, set the ctime for inodes on a 2335 * Like most other Unix systems, set the ctime for inodes on a
2296 * rename. 2336 * rename.
2297 */ 2337 */
2298 old_inode->i_ctime = CURRENT_TIME_SEC; 2338 old_inode->i_ctime = ext4_current_time(old_inode);
2299 ext4_mark_inode_dirty(handle, old_inode); 2339 ext4_mark_inode_dirty(handle, old_inode);
2300 2340
2301 /* 2341 /*
@@ -2327,10 +2367,10 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2327 } 2367 }
2328 2368
2329 if (new_inode) { 2369 if (new_inode) {
2330 drop_nlink(new_inode); 2370 ext4_dec_count(handle, new_inode);
2331 new_inode->i_ctime = CURRENT_TIME_SEC; 2371 new_inode->i_ctime = ext4_current_time(new_inode);
2332 } 2372 }
2333 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; 2373 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2334 ext4_update_dx_flag(old_dir); 2374 ext4_update_dx_flag(old_dir);
2335 if (dir_bh) { 2375 if (dir_bh) {
2336 BUFFER_TRACE(dir_bh, "get_write_access"); 2376 BUFFER_TRACE(dir_bh, "get_write_access");
@@ -2338,11 +2378,13 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2338 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2378 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2339 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); 2379 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
2340 ext4_journal_dirty_metadata(handle, dir_bh); 2380 ext4_journal_dirty_metadata(handle, dir_bh);
2341 drop_nlink(old_dir); 2381 ext4_dec_count(handle, old_dir);
2342 if (new_inode) { 2382 if (new_inode) {
2343 drop_nlink(new_inode); 2383 /* checked empty_dir above, can't have another parent,
2384 * ext3_dec_count() won't work for many-linked dirs */
2385 new_inode->i_nlink = 0;
2344 } else { 2386 } else {
2345 inc_nlink(new_dir); 2387 ext4_inc_count(handle, new_dir);
2346 ext4_update_dx_flag(new_dir); 2388 ext4_update_dx_flag(new_dir);
2347 ext4_mark_inode_dirty(handle, new_dir); 2389 ext4_mark_inode_dirty(handle, new_dir);
2348 } 2390 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 175b68c609..6dcbb28dc0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -29,12 +29,14 @@
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
32#include <linux/exportfs.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
33#include <linux/random.h> 34#include <linux/random.h>
34#include <linux/mount.h> 35#include <linux/mount.h>
35#include <linux/namei.h> 36#include <linux/namei.h>
36#include <linux/quotaops.h> 37#include <linux/quotaops.h>
37#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/log2.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40 42
@@ -510,6 +512,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
510 512
511static void ext4_destroy_inode(struct inode *inode) 513static void ext4_destroy_inode(struct inode *inode)
512{ 514{
515 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
516 printk("EXT4 Inode %p: orphan list check failed!\n",
517 EXT4_I(inode));
518 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
519 EXT4_I(inode), sizeof(struct ext4_inode_info),
520 true);
521 dump_stack();
522 }
513 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 523 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
514} 524}
515 525
@@ -725,7 +735,7 @@ enum {
725 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 735 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
726 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 736 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
727 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 737 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
728 Opt_grpquota, Opt_extents, 738 Opt_grpquota, Opt_extents, Opt_noextents,
729}; 739};
730 740
731static match_table_t tokens = { 741static match_table_t tokens = {
@@ -776,6 +786,7 @@ static match_table_t tokens = {
776 {Opt_usrquota, "usrquota"}, 786 {Opt_usrquota, "usrquota"},
777 {Opt_barrier, "barrier=%u"}, 787 {Opt_barrier, "barrier=%u"},
778 {Opt_extents, "extents"}, 788 {Opt_extents, "extents"},
789 {Opt_noextents, "noextents"},
779 {Opt_err, NULL}, 790 {Opt_err, NULL},
780 {Opt_resize, "resize"}, 791 {Opt_resize, "resize"},
781}; 792};
@@ -1111,6 +1122,9 @@ clear_qf_name:
1111 case Opt_extents: 1122 case Opt_extents:
1112 set_opt (sbi->s_mount_opt, EXTENTS); 1123 set_opt (sbi->s_mount_opt, EXTENTS);
1113 break; 1124 break;
1125 case Opt_noextents:
1126 clear_opt (sbi->s_mount_opt, EXTENTS);
1127 break;
1114 default: 1128 default:
1115 printk (KERN_ERR 1129 printk (KERN_ERR
1116 "EXT4-fs: Unrecognized mount option \"%s\" " 1130 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1542,6 +1556,12 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1542 1556
1543 set_opt(sbi->s_mount_opt, RESERVATION); 1557 set_opt(sbi->s_mount_opt, RESERVATION);
1544 1558
1559 /*
1560 * turn on extents feature by default in ext4 filesystem
1561 * User -o noextents to turn it off
1562 */
1563 set_opt(sbi->s_mount_opt, EXTENTS);
1564
1545 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, 1565 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1546 NULL, 0)) 1566 NULL, 0))
1547 goto failed_mount; 1567 goto failed_mount;
@@ -1625,13 +1645,15 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1625 sbi->s_inode_size = le16_to_cpu(es->s_inode_size); 1645 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1626 sbi->s_first_ino = le32_to_cpu(es->s_first_ino); 1646 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1627 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || 1647 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
1628 (sbi->s_inode_size & (sbi->s_inode_size - 1)) || 1648 (!is_power_of_2(sbi->s_inode_size)) ||
1629 (sbi->s_inode_size > blocksize)) { 1649 (sbi->s_inode_size > blocksize)) {
1630 printk (KERN_ERR 1650 printk (KERN_ERR
1631 "EXT4-fs: unsupported inode size: %d\n", 1651 "EXT4-fs: unsupported inode size: %d\n",
1632 sbi->s_inode_size); 1652 sbi->s_inode_size);
1633 goto failed_mount; 1653 goto failed_mount;
1634 } 1654 }
1655 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
1656 sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
1635 } 1657 }
1636 sbi->s_frag_size = EXT4_MIN_FRAG_SIZE << 1658 sbi->s_frag_size = EXT4_MIN_FRAG_SIZE <<
1637 le32_to_cpu(es->s_log_frag_size); 1659 le32_to_cpu(es->s_log_frag_size);
@@ -1794,6 +1816,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1794 goto failed_mount3; 1816 goto failed_mount3;
1795 } 1817 }
1796 1818
1819 if (ext4_blocks_count(es) > 0xffffffffULL &&
1820 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
1821 JBD2_FEATURE_INCOMPAT_64BIT)) {
1822 printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n");
1823 goto failed_mount4;
1824 }
1825
1797 /* We have now updated the journal if required, so we can 1826 /* We have now updated the journal if required, so we can
1798 * validate the data journaling mode. */ 1827 * validate the data journaling mode. */
1799 switch (test_opt(sb, DATA_FLAGS)) { 1828 switch (test_opt(sb, DATA_FLAGS)) {
@@ -1848,6 +1877,32 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1848 } 1877 }
1849 1878
1850 ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY); 1879 ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1880
1881 /* determine the minimum size of new large inodes, if present */
1882 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
1883 sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
1884 EXT4_GOOD_OLD_INODE_SIZE;
1885 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1886 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
1887 if (sbi->s_want_extra_isize <
1888 le16_to_cpu(es->s_want_extra_isize))
1889 sbi->s_want_extra_isize =
1890 le16_to_cpu(es->s_want_extra_isize);
1891 if (sbi->s_want_extra_isize <
1892 le16_to_cpu(es->s_min_extra_isize))
1893 sbi->s_want_extra_isize =
1894 le16_to_cpu(es->s_min_extra_isize);
1895 }
1896 }
1897 /* Check if enough inode space is available */
1898 if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
1899 sbi->s_inode_size) {
1900 sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
1901 EXT4_GOOD_OLD_INODE_SIZE;
1902 printk(KERN_INFO "EXT4-fs: required extra inode space not"
1903 "available.\n");
1904 }
1905
1851 /* 1906 /*
1852 * akpm: core read_super() calls in here with the superblock locked. 1907 * akpm: core read_super() calls in here with the superblock locked.
1853 * That deadlocks, because orphan cleanup needs to lock the superblock 1908 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2150,6 +2205,7 @@ static int ext4_create_journal(struct super_block * sb,
2150 unsigned int journal_inum) 2205 unsigned int journal_inum)
2151{ 2206{
2152 journal_t *journal; 2207 journal_t *journal;
2208 int err;
2153 2209
2154 if (sb->s_flags & MS_RDONLY) { 2210 if (sb->s_flags & MS_RDONLY) {
2155 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " 2211 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
@@ -2157,13 +2213,15 @@ static int ext4_create_journal(struct super_block * sb,
2157 return -EROFS; 2213 return -EROFS;
2158 } 2214 }
2159 2215
2160 if (!(journal = ext4_get_journal(sb, journal_inum))) 2216 journal = ext4_get_journal(sb, journal_inum);
2217 if (!journal)
2161 return -EINVAL; 2218 return -EINVAL;
2162 2219
2163 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", 2220 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2164 journal_inum); 2221 journal_inum);
2165 2222
2166 if (jbd2_journal_create(journal)) { 2223 err = jbd2_journal_create(journal);
2224 if (err) {
2167 printk(KERN_ERR "EXT4-fs: error creating journal.\n"); 2225 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2168 jbd2_journal_destroy(journal); 2226 jbd2_journal_destroy(journal);
2169 return -EIO; 2227 return -EIO;
@@ -2214,12 +2272,14 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
2214 2272
2215 jbd2_journal_lock_updates(journal); 2273 jbd2_journal_lock_updates(journal);
2216 jbd2_journal_flush(journal); 2274 jbd2_journal_flush(journal);
2275 lock_super(sb);
2217 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 2276 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2218 sb->s_flags & MS_RDONLY) { 2277 sb->s_flags & MS_RDONLY) {
2219 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 2278 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2220 sb->s_dirt = 0; 2279 sb->s_dirt = 0;
2221 ext4_commit_super(sb, es, 1); 2280 ext4_commit_super(sb, es, 1);
2222 } 2281 }
2282 unlock_super(sb);
2223 jbd2_journal_unlock_updates(journal); 2283 jbd2_journal_unlock_updates(journal);
2224} 2284}
2225 2285
@@ -2408,7 +2468,13 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
2408 (sbi->s_mount_state & EXT4_VALID_FS)) 2468 (sbi->s_mount_state & EXT4_VALID_FS))
2409 es->s_state = cpu_to_le16(sbi->s_mount_state); 2469 es->s_state = cpu_to_le16(sbi->s_mount_state);
2410 2470
2471 /*
2472 * We have to unlock super so that we can wait for
2473 * transactions.
2474 */
2475 unlock_super(sb);
2411 ext4_mark_recovery_complete(sb, es); 2476 ext4_mark_recovery_complete(sb, es);
2477 lock_super(sb);
2412 } else { 2478 } else {
2413 __le32 ret; 2479 __le32 ret;
2414 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 2480 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2481,19 +2547,19 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2481 struct super_block *sb = dentry->d_sb; 2547 struct super_block *sb = dentry->d_sb;
2482 struct ext4_sb_info *sbi = EXT4_SB(sb); 2548 struct ext4_sb_info *sbi = EXT4_SB(sb);
2483 struct ext4_super_block *es = sbi->s_es; 2549 struct ext4_super_block *es = sbi->s_es;
2484 ext4_fsblk_t overhead;
2485 int i;
2486 u64 fsid; 2550 u64 fsid;
2487 2551
2488 if (test_opt (sb, MINIX_DF)) 2552 if (test_opt(sb, MINIX_DF)) {
2489 overhead = 0; 2553 sbi->s_overhead_last = 0;
2490 else { 2554 } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
2491 unsigned long ngroups; 2555 unsigned long ngroups = sbi->s_groups_count, i;
2492 ngroups = EXT4_SB(sb)->s_groups_count; 2556 ext4_fsblk_t overhead = 0;
2493 smp_rmb(); 2557 smp_rmb();
2494 2558
2495 /* 2559 /*
2496 * Compute the overhead (FS structures) 2560 * Compute the overhead (FS structures). This is constant
2561 * for a given filesystem unless the number of block groups
2562 * changes so we cache the previous value until it does.
2497 */ 2563 */
2498 2564
2499 /* 2565 /*
@@ -2517,18 +2583,23 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2517 * Every block group has an inode bitmap, a block 2583 * Every block group has an inode bitmap, a block
2518 * bitmap, and an inode table. 2584 * bitmap, and an inode table.
2519 */ 2585 */
2520 overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group)); 2586 overhead += ngroups * (2 + sbi->s_itb_per_group);
2587 sbi->s_overhead_last = overhead;
2588 smp_wmb();
2589 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
2521 } 2590 }
2522 2591
2523 buf->f_type = EXT4_SUPER_MAGIC; 2592 buf->f_type = EXT4_SUPER_MAGIC;
2524 buf->f_bsize = sb->s_blocksize; 2593 buf->f_bsize = sb->s_blocksize;
2525 buf->f_blocks = ext4_blocks_count(es) - overhead; 2594 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
2526 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); 2595 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2596 es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
2527 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 2597 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
2528 if (buf->f_bfree < ext4_r_blocks_count(es)) 2598 if (buf->f_bfree < ext4_r_blocks_count(es))
2529 buf->f_bavail = 0; 2599 buf->f_bavail = 0;
2530 buf->f_files = le32_to_cpu(es->s_inodes_count); 2600 buf->f_files = le32_to_cpu(es->s_inodes_count);
2531 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); 2601 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2602 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
2532 buf->f_namelen = EXT4_NAME_LEN; 2603 buf->f_namelen = EXT4_NAME_LEN;
2533 fsid = le64_to_cpup((void *)es->s_uuid) ^ 2604 fsid = le64_to_cpup((void *)es->s_uuid) ^
2534 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 2605 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e832e96095..b10d68fffb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -66,13 +66,6 @@
66#define BFIRST(bh) ENTRY(BHDR(bh)+1) 66#define BFIRST(bh) ENTRY(BHDR(bh)+1)
67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) 67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
68 68
69#define IHDR(inode, raw_inode) \
70 ((struct ext4_xattr_ibody_header *) \
71 ((void *)raw_inode + \
72 EXT4_GOOD_OLD_INODE_SIZE + \
73 EXT4_I(inode)->i_extra_isize))
74#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
75
76#ifdef EXT4_XATTR_DEBUG 69#ifdef EXT4_XATTR_DEBUG
77# define ea_idebug(inode, f...) do { \ 70# define ea_idebug(inode, f...) do { \
78 printk(KERN_DEBUG "inode %s:%lu: ", \ 71 printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -508,6 +501,24 @@ out:
508 return; 501 return;
509} 502}
510 503
504/*
505 * Find the available free space for EAs. This also returns the total number of
506 * bytes used by EA entries.
507 */
508static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
509 size_t *min_offs, void *base, int *total)
510{
511 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
512 *total += EXT4_XATTR_LEN(last->e_name_len);
513 if (!last->e_value_block && last->e_value_size) {
514 size_t offs = le16_to_cpu(last->e_value_offs);
515 if (offs < *min_offs)
516 *min_offs = offs;
517 }
518 }
519 return (*min_offs - ((void *)last - base) - sizeof(__u32));
520}
521
511struct ext4_xattr_info { 522struct ext4_xattr_info {
512 int name_index; 523 int name_index;
513 const char *name; 524 const char *name;
@@ -1013,7 +1024,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1013 } 1024 }
1014 if (!error) { 1025 if (!error) {
1015 ext4_xattr_update_super_block(handle, inode->i_sb); 1026 ext4_xattr_update_super_block(handle, inode->i_sb);
1016 inode->i_ctime = CURRENT_TIME_SEC; 1027 inode->i_ctime = ext4_current_time(inode);
1028 if (!value)
1029 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
1017 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 1030 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1018 /* 1031 /*
1019 * The bh is consumed by ext4_mark_iloc_dirty, even with 1032 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,6 +1080,253 @@ retry:
1067} 1080}
1068 1081
1069/* 1082/*
1083 * Shift the EA entries in the inode to create space for the increased
1084 * i_extra_isize.
1085 */
1086static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
1087 int value_offs_shift, void *to,
1088 void *from, size_t n, int blocksize)
1089{
1090 struct ext4_xattr_entry *last = entry;
1091 int new_offs;
1092
1093 /* Adjust the value offsets of the entries */
1094 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
1095 if (!last->e_value_block && last->e_value_size) {
1096 new_offs = le16_to_cpu(last->e_value_offs) +
1097 value_offs_shift;
1098 BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
1099 > blocksize);
1100 last->e_value_offs = cpu_to_le16(new_offs);
1101 }
1102 }
1103 /* Shift the entries by n bytes */
1104 memmove(to, from, n);
1105}
1106
1107/*
1108 * Expand an inode by new_extra_isize bytes when EAs are present.
1109 * Returns 0 on success or negative error number on failure.
1110 */
1111int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
1112 struct ext4_inode *raw_inode, handle_t *handle)
1113{
1114 struct ext4_xattr_ibody_header *header;
1115 struct ext4_xattr_entry *entry, *last, *first;
1116 struct buffer_head *bh = NULL;
1117 struct ext4_xattr_ibody_find *is = NULL;
1118 struct ext4_xattr_block_find *bs = NULL;
1119 char *buffer = NULL, *b_entry_name = NULL;
1120 size_t min_offs, free;
1121 int total_ino, total_blk;
1122 void *base, *start, *end;
1123 int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
1124 int s_min_extra_isize = EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize;
1125
1126 down_write(&EXT4_I(inode)->xattr_sem);
1127retry:
1128 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
1129 up_write(&EXT4_I(inode)->xattr_sem);
1130 return 0;
1131 }
1132
1133 header = IHDR(inode, raw_inode);
1134 entry = IFIRST(header);
1135
1136 /*
1137 * Check if enough free space is available in the inode to shift the
1138 * entries ahead by new_extra_isize.
1139 */
1140
1141 base = start = entry;
1142 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
1143 min_offs = end - base;
1144 last = entry;
1145 total_ino = sizeof(struct ext4_xattr_ibody_header);
1146
1147 free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
1148 if (free >= new_extra_isize) {
1149 entry = IFIRST(header);
1150 ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
1151 - new_extra_isize, (void *)raw_inode +
1152 EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
1153 (void *)header, total_ino,
1154 inode->i_sb->s_blocksize);
1155 EXT4_I(inode)->i_extra_isize = new_extra_isize;
1156 error = 0;
1157 goto cleanup;
1158 }
1159
1160 /*
1161 * Enough free space isn't available in the inode, check if
1162 * EA block can hold new_extra_isize bytes.
1163 */
1164 if (EXT4_I(inode)->i_file_acl) {
1165 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1166 error = -EIO;
1167 if (!bh)
1168 goto cleanup;
1169 if (ext4_xattr_check_block(bh)) {
1170 ext4_error(inode->i_sb, __FUNCTION__,
1171 "inode %lu: bad block %llu", inode->i_ino,
1172 EXT4_I(inode)->i_file_acl);
1173 error = -EIO;
1174 goto cleanup;
1175 }
1176 base = BHDR(bh);
1177 first = BFIRST(bh);
1178 end = bh->b_data + bh->b_size;
1179 min_offs = end - base;
1180 free = ext4_xattr_free_space(first, &min_offs, base,
1181 &total_blk);
1182 if (free < new_extra_isize) {
1183 if (!tried_min_extra_isize && s_min_extra_isize) {
1184 tried_min_extra_isize++;
1185 new_extra_isize = s_min_extra_isize;
1186 brelse(bh);
1187 goto retry;
1188 }
1189 error = -1;
1190 goto cleanup;
1191 }
1192 } else {
1193 free = inode->i_sb->s_blocksize;
1194 }
1195
1196 while (new_extra_isize > 0) {
1197 size_t offs, size, entry_size;
1198 struct ext4_xattr_entry *small_entry = NULL;
1199 struct ext4_xattr_info i = {
1200 .value = NULL,
1201 .value_len = 0,
1202 };
1203 unsigned int total_size; /* EA entry size + value size */
1204 unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
1205 unsigned int min_total_size = ~0U;
1206
1207 is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
1208 bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
1209 if (!is || !bs) {
1210 error = -ENOMEM;
1211 goto cleanup;
1212 }
1213
1214 is->s.not_found = -ENODATA;
1215 bs->s.not_found = -ENODATA;
1216 is->iloc.bh = NULL;
1217 bs->bh = NULL;
1218
1219 last = IFIRST(header);
1220 /* Find the entry best suited to be pushed into EA block */
1221 entry = NULL;
1222 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
1223 total_size =
1224 EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
1225 EXT4_XATTR_LEN(last->e_name_len);
1226 if (total_size <= free && total_size < min_total_size) {
1227 if (total_size < new_extra_isize) {
1228 small_entry = last;
1229 } else {
1230 entry = last;
1231 min_total_size = total_size;
1232 }
1233 }
1234 }
1235
1236 if (entry == NULL) {
1237 if (small_entry) {
1238 entry = small_entry;
1239 } else {
1240 if (!tried_min_extra_isize &&
1241 s_min_extra_isize) {
1242 tried_min_extra_isize++;
1243 new_extra_isize = s_min_extra_isize;
1244 goto retry;
1245 }
1246 error = -1;
1247 goto cleanup;
1248 }
1249 }
1250 offs = le16_to_cpu(entry->e_value_offs);
1251 size = le32_to_cpu(entry->e_value_size);
1252 entry_size = EXT4_XATTR_LEN(entry->e_name_len);
1253 i.name_index = entry->e_name_index,
1254 buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
1255 b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
1256 if (!buffer || !b_entry_name) {
1257 error = -ENOMEM;
1258 goto cleanup;
1259 }
1260 /* Save the entry name and the entry value */
1261 memcpy(buffer, (void *)IFIRST(header) + offs,
1262 EXT4_XATTR_SIZE(size));
1263 memcpy(b_entry_name, entry->e_name, entry->e_name_len);
1264 b_entry_name[entry->e_name_len] = '\0';
1265 i.name = b_entry_name;
1266
1267 error = ext4_get_inode_loc(inode, &is->iloc);
1268 if (error)
1269 goto cleanup;
1270
1271 error = ext4_xattr_ibody_find(inode, &i, is);
1272 if (error)
1273 goto cleanup;
1274
1275 /* Remove the chosen entry from the inode */
1276 error = ext4_xattr_ibody_set(handle, inode, &i, is);
1277
1278 entry = IFIRST(header);
1279 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
1280 shift_bytes = new_extra_isize;
1281 else
1282 shift_bytes = entry_size + size;
1283 /* Adjust the offsets and shift the remaining entries ahead */
1284 ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize -
1285 shift_bytes, (void *)raw_inode +
1286 EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
1287 (void *)header, total_ino - entry_size,
1288 inode->i_sb->s_blocksize);
1289
1290 extra_isize += shift_bytes;
1291 new_extra_isize -= shift_bytes;
1292 EXT4_I(inode)->i_extra_isize = extra_isize;
1293
1294 i.name = b_entry_name;
1295 i.value = buffer;
1296 i.value_len = cpu_to_le32(size);
1297 error = ext4_xattr_block_find(inode, &i, bs);
1298 if (error)
1299 goto cleanup;
1300
1301 /* Add entry which was removed from the inode into the block */
1302 error = ext4_xattr_block_set(handle, inode, &i, bs);
1303 if (error)
1304 goto cleanup;
1305 kfree(b_entry_name);
1306 kfree(buffer);
1307 brelse(is->iloc.bh);
1308 kfree(is);
1309 kfree(bs);
1310 }
1311 brelse(bh);
1312 up_write(&EXT4_I(inode)->xattr_sem);
1313 return 0;
1314
1315cleanup:
1316 kfree(b_entry_name);
1317 kfree(buffer);
1318 if (is)
1319 brelse(is->iloc.bh);
1320 kfree(is);
1321 kfree(bs);
1322 brelse(bh);
1323 up_write(&EXT4_I(inode)->xattr_sem);
1324 return error;
1325}
1326
1327
1328
1329/*
1070 * ext4_xattr_delete_inode() 1330 * ext4_xattr_delete_inode()
1071 * 1331 *
1072 * Free extended attribute resources associated with this inode. This 1332 * Free extended attribute resources associated with this inode. This
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 79432b3539..d7f5d6a126 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -56,6 +56,13 @@ struct ext4_xattr_entry {
56#define EXT4_XATTR_SIZE(size) \ 56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) 57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
58 58
59#define IHDR(inode, raw_inode) \
60 ((struct ext4_xattr_ibody_header *) \
61 ((void *)raw_inode + \
62 EXT4_GOOD_OLD_INODE_SIZE + \
63 EXT4_I(inode)->i_extra_isize))
64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
65
59# ifdef CONFIG_EXT4DEV_FS_XATTR 66# ifdef CONFIG_EXT4DEV_FS_XATTR
60 67
61extern struct xattr_handler ext4_xattr_user_handler; 68extern struct xattr_handler ext4_xattr_user_handler;
@@ -74,6 +81,9 @@ extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *,
74extern void ext4_xattr_delete_inode(handle_t *, struct inode *); 81extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
75extern void ext4_xattr_put_super(struct super_block *); 82extern void ext4_xattr_put_super(struct super_block *);
76 83
84extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
85 struct ext4_inode *raw_inode, handle_t *handle);
86
77extern int init_ext4_xattr(void); 87extern int init_ext4_xattr(void);
78extern void exit_ext4_xattr(void); 88extern void exit_ext4_xattr(void);
79 89
@@ -129,6 +139,13 @@ exit_ext4_xattr(void)
129{ 139{
130} 140}
131 141
142static inline int
143ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
144 struct ext4_inode *raw_inode, handle_t *handle)
145{
146 return -EOPNOTSUPP;
147}
148
132#define ext4_xattr_handlers NULL 149#define ext4_xattr_handlers NULL
133 150
134# endif /* CONFIG_EXT4DEV_FS_XATTR */ 151# endif /* CONFIG_EXT4DEV_FS_XATTR */
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ccf161dffb..72cbcd61bd 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -313,7 +313,7 @@ int fat_search_long(struct inode *inode, const unsigned char *name,
313 wchar_t bufuname[14]; 313 wchar_t bufuname[14];
314 unsigned char xlate_len, nr_slots; 314 unsigned char xlate_len, nr_slots;
315 wchar_t *unicode = NULL; 315 wchar_t *unicode = NULL;
316 unsigned char work[8], bufname[260]; /* 256 + 4 */ 316 unsigned char work[MSDOS_NAME], bufname[260]; /* 256 + 4 */
317 int uni_xlate = sbi->options.unicode_xlate; 317 int uni_xlate = sbi->options.unicode_xlate;
318 int utf8 = sbi->options.utf8; 318 int utf8 = sbi->options.utf8;
319 int anycase = (sbi->options.name_check != 's'); 319 int anycase = (sbi->options.name_check != 's');
@@ -351,7 +351,8 @@ parse_record:
351 if (work[0] == 0x05) 351 if (work[0] == 0x05)
352 work[0] = 0xE5; 352 work[0] = 0xE5;
353 for (i = 0, j = 0, last_u = 0; i < 8;) { 353 for (i = 0, j = 0, last_u = 0; i < 8;) {
354 if (!work[i]) break; 354 if (!work[i])
355 break;
355 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, 356 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
356 &bufuname[j++], opt_shortname, 357 &bufuname[j++], opt_shortname,
357 de->lcase & CASE_LOWER_BASE); 358 de->lcase & CASE_LOWER_BASE);
@@ -365,13 +366,15 @@ parse_record:
365 } 366 }
366 j = last_u; 367 j = last_u;
367 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); 368 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]);
368 for (i = 0; i < 3;) { 369 for (i = 8; i < MSDOS_NAME;) {
369 if (!de->ext[i]) break; 370 if (!work[i])
370 chl = fat_shortname2uni(nls_disk, &de->ext[i], 3 - i, 371 break;
372 chl = fat_shortname2uni(nls_disk, &work[i],
373 MSDOS_NAME - i,
371 &bufuname[j++], opt_shortname, 374 &bufuname[j++], opt_shortname,
372 de->lcase & CASE_LOWER_EXT); 375 de->lcase & CASE_LOWER_EXT);
373 if (chl <= 1) { 376 if (chl <= 1) {
374 if (de->ext[i] != ' ') 377 if (work[i] != ' ')
375 last_u = j; 378 last_u = j;
376 } else { 379 } else {
377 last_u = j; 380 last_u = j;
@@ -445,7 +448,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
445 int fill_len; 448 int fill_len;
446 wchar_t bufuname[14]; 449 wchar_t bufuname[14];
447 wchar_t *unicode = NULL; 450 wchar_t *unicode = NULL;
448 unsigned char c, work[8], bufname[56], *ptname = bufname; 451 unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname;
449 unsigned long lpos, dummy, *furrfu = &lpos; 452 unsigned long lpos, dummy, *furrfu = &lpos;
450 int uni_xlate = sbi->options.unicode_xlate; 453 int uni_xlate = sbi->options.unicode_xlate;
451 int isvfat = sbi->options.isvfat; 454 int isvfat = sbi->options.isvfat;
@@ -527,7 +530,8 @@ parse_record:
527 if (work[0] == 0x05) 530 if (work[0] == 0x05)
528 work[0] = 0xE5; 531 work[0] = 0xE5;
529 for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) { 532 for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) {
530 if (!(c = work[i])) break; 533 if (!(c = work[i]))
534 break;
531 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, 535 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
532 &bufuname[j++], opt_shortname, 536 &bufuname[j++], opt_shortname,
533 de->lcase & CASE_LOWER_BASE); 537 de->lcase & CASE_LOWER_BASE);
@@ -549,9 +553,10 @@ parse_record:
549 j = last_u; 553 j = last_u;
550 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); 554 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]);
551 ptname[i++] = '.'; 555 ptname[i++] = '.';
552 for (i2 = 0; i2 < 3;) { 556 for (i2 = 8; i2 < MSDOS_NAME;) {
553 if (!(c = de->ext[i2])) break; 557 if (!(c = work[i2]))
554 chl = fat_shortname2uni(nls_disk, &de->ext[i2], 3 - i2, 558 break;
559 chl = fat_shortname2uni(nls_disk, &work[i2], MSDOS_NAME - i2,
555 &bufuname[j++], opt_shortname, 560 &bufuname[j++], opt_shortname,
556 de->lcase & CASE_LOWER_EXT); 561 de->lcase & CASE_LOWER_EXT);
557 if (chl <= 1) { 562 if (chl <= 1) {
@@ -563,8 +568,8 @@ parse_record:
563 } 568 }
564 } else { 569 } else {
565 last_u = j; 570 last_u = j;
566 for (chi = 0; chi < chl && i2 < 3; chi++) { 571 for (chi = 0; chi < chl && i2 < MSDOS_NAME; chi++) {
567 ptname[i++] = de->ext[i2++]; 572 ptname[i++] = work[i2++];
568 last = i; 573 last = i;
569 } 574 }
570 } 575 }
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index ab171ea8e8..2c1b73fb82 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -17,6 +17,8 @@ struct fatent_operations {
17 int (*ent_next)(struct fat_entry *); 17 int (*ent_next)(struct fat_entry *);
18}; 18};
19 19
20static DEFINE_SPINLOCK(fat12_entry_lock);
21
20static void fat12_ent_blocknr(struct super_block *sb, int entry, 22static void fat12_ent_blocknr(struct super_block *sb, int entry,
21 int *offset, sector_t *blocknr) 23 int *offset, sector_t *blocknr)
22{ 24{
@@ -116,10 +118,13 @@ static int fat12_ent_get(struct fat_entry *fatent)
116 u8 **ent12_p = fatent->u.ent12_p; 118 u8 **ent12_p = fatent->u.ent12_p;
117 int next; 119 int next;
118 120
121 spin_lock(&fat12_entry_lock);
119 if (fatent->entry & 1) 122 if (fatent->entry & 1)
120 next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); 123 next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4);
121 else 124 else
122 next = (*ent12_p[1] << 8) | *ent12_p[0]; 125 next = (*ent12_p[1] << 8) | *ent12_p[0];
126 spin_unlock(&fat12_entry_lock);
127
123 next &= 0x0fff; 128 next &= 0x0fff;
124 if (next >= BAD_FAT12) 129 if (next >= BAD_FAT12)
125 next = FAT_ENT_EOF; 130 next = FAT_ENT_EOF;
@@ -151,6 +156,7 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
151 if (new == FAT_ENT_EOF) 156 if (new == FAT_ENT_EOF)
152 new = EOF_FAT12; 157 new = EOF_FAT12;
153 158
159 spin_lock(&fat12_entry_lock);
154 if (fatent->entry & 1) { 160 if (fatent->entry & 1) {
155 *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); 161 *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f);
156 *ent12_p[1] = new >> 4; 162 *ent12_p[1] = new >> 4;
@@ -158,6 +164,7 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
158 *ent12_p[0] = new & 0xff; 164 *ent12_p[0] = new & 0xff;
159 *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); 165 *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8);
160 } 166 }
167 spin_unlock(&fat12_entry_lock);
161 168
162 mark_buffer_dirty(fatent->bhs[0]); 169 mark_buffer_dirty(fatent->bhs[0]);
163 if (fatent->nr_bhs == 2) 170 if (fatent->nr_bhs == 2)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 55d3c7461c..69a83b59dc 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -134,7 +134,7 @@ const struct file_operations fat_file_operations = {
134 .release = fat_file_release, 134 .release = fat_file_release,
135 .ioctl = fat_generic_ioctl, 135 .ioctl = fat_generic_ioctl,
136 .fsync = file_fsync, 136 .fsync = file_fsync,
137 .sendfile = generic_file_sendfile, 137 .splice_read = generic_file_splice_read,
138}; 138};
139 139
140static int fat_cont_expand(struct inode *inode, loff_t size) 140static int fat_cont_expand(struct inode *inode, loff_t size)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 479722d896..0a7ddb39a5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -20,6 +20,7 @@
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/mpage.h> 21#include <linux/mpage.h>
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/exportfs.h>
23#include <linux/mount.h> 24#include <linux/mount.h>
24#include <linux/vfs.h> 25#include <linux/vfs.h>
25#include <linux/parser.h> 26#include <linux/parser.h>
@@ -354,8 +355,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
354 } else { /* not a directory */ 355 } else { /* not a directory */
355 inode->i_generation |= 1; 356 inode->i_generation |= 1;
356 inode->i_mode = MSDOS_MKMODE(de->attr, 357 inode->i_mode = MSDOS_MKMODE(de->attr,
357 ((sbi->options.showexec && 358 ((sbi->options.showexec && !is_exec(de->name + 8))
358 !is_exec(de->ext))
359 ? S_IRUGO|S_IWUGO : S_IRWXUGO) 359 ? S_IRUGO|S_IWUGO : S_IRWXUGO)
360 & ~sbi->options.fs_fmask) | S_IFREG; 360 & ~sbi->options.fs_fmask) | S_IFREG;
361 MSDOS_I(inode)->i_start = le16_to_cpu(de->start); 361 MSDOS_I(inode)->i_start = le16_to_cpu(de->start);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8e382a5d51..3f22e9f4f6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -215,7 +215,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
215 215
216 /* O_NOATIME can only be set by the owner or superuser */ 216 /* O_NOATIME can only be set by the owner or superuser */
217 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) 217 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
218 if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) 218 if (!is_owner_or_cap(inode))
219 return -EPERM; 219 return -EPERM;
220 220
221 /* required for strict SunOS emulation */ 221 /* required for strict SunOS emulation */
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index 8a4dfef1dd..3c96d6e639 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -80,7 +80,7 @@ struct vxfs_direct {
80 * a d_name with size len. 80 * a d_name with size len.
81 */ 81 */
82#define VXFS_DIRPAD 4 82#define VXFS_DIRPAD 4
83#define VXFS_NAMEMIN ((int)((struct vxfs_direct *)0)->d_name) 83#define VXFS_NAMEMIN offsetof(struct vxfs_direct, d_name)
84#define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1)) 84#define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1))
85#define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len))) 85#define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len)))
86 86
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index adf7995232..f79de7c8cd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -802,7 +802,7 @@ static const struct file_operations fuse_file_operations = {
802 .release = fuse_release, 802 .release = fuse_release,
803 .fsync = fuse_fsync, 803 .fsync = fuse_fsync,
804 .lock = fuse_file_lock, 804 .lock = fuse_file_lock,
805 .sendfile = generic_file_sendfile, 805 .splice_read = generic_file_splice_read,
806}; 806};
807 807
808static const struct file_operations fuse_direct_io_file_operations = { 808static const struct file_operations fuse_direct_io_file_operations = {
@@ -814,7 +814,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
814 .release = fuse_release, 814 .release = fuse_release,
815 .fsync = fuse_fsync, 815 .fsync = fuse_fsync,
816 .lock = fuse_file_lock, 816 .lock = fuse_file_lock,
817 /* no mmap and sendfile */ 817 /* no mmap and splice_read */
818}; 818};
819 819
820static const struct address_space_operations fuse_file_aops = { 820static const struct address_space_operations fuse_file_aops = {
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 9ccb789471..995d63b2e7 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -78,7 +78,7 @@ generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
78 78
79 if (S_ISLNK(inode->i_mode)) 79 if (S_ISLNK(inode->i_mode))
80 return -EOPNOTSUPP; 80 return -EOPNOTSUPP;
81 if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) 81 if (!is_owner_or_cap(inode))
82 return -EPERM; 82 return -EPERM;
83 if (value) { 83 if (value) {
84 acl = posix_acl_from_xattr(value, size); 84 acl = posix_acl_from_xattr(value, size);
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index e3f1ada643..04ad0caebe 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,7 +1,7 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o 1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ 2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ 3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 6 recovery.o rgrp.o super.o sys.o trans.o util.o
7 7
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 6e80844367..1047a8c722 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -74,7 +74,7 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
74{ 74{
75 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl) 75 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
76 return -EOPNOTSUPP; 76 return -EOPNOTSUPP;
77 if (current->fsuid != ip->i_inode.i_uid && !capable(CAP_FOWNER)) 77 if (!is_owner_or_cap(&ip->i_inode))
78 return -EPERM; 78 return -EPERM;
79 if (S_ISLNK(ip->i_inode.i_mode)) 79 if (S_ISLNK(ip->i_inode.i_mode))
80 return -EOPNOTSUPP; 80 return -EOPNOTSUPP;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c53a5d2d05..cd805a6688 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -718,7 +718,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
718 for (x = 0; x < rlist.rl_rgrps; x++) { 718 for (x = 0; x < rlist.rl_rgrps; x++) {
719 struct gfs2_rgrpd *rgd; 719 struct gfs2_rgrpd *rgd;
720 rgd = rlist.rl_ghs[x].gh_gl->gl_object; 720 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
721 rg_blocks += rgd->rd_ri.ri_length; 721 rg_blocks += rgd->rd_length;
722 } 722 }
723 723
724 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); 724 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
@@ -772,7 +772,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
772 gfs2_free_data(ip, bstart, blen); 772 gfs2_free_data(ip, bstart, blen);
773 } 773 }
774 774
775 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; 775 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
776 776
777 gfs2_dinode_out(ip, dibh->b_data); 777 gfs2_dinode_out(ip, dibh->b_data);
778 778
@@ -824,7 +824,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
824 goto out_gunlock_q; 824 goto out_gunlock_q;
825 825
826 error = gfs2_trans_begin(sdp, 826 error = gfs2_trans_begin(sdp,
827 sdp->sd_max_height + al->al_rgd->rd_ri.ri_length + 827 sdp->sd_max_height + al->al_rgd->rd_length +
828 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0); 828 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
829 if (error) 829 if (error)
830 goto out_ipres; 830 goto out_ipres;
@@ -847,7 +847,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
847 } 847 }
848 848
849 ip->i_di.di_size = size; 849 ip->i_di.di_size = size;
850 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; 850 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
851 851
852 error = gfs2_meta_inode_buffer(ip, &dibh); 852 error = gfs2_meta_inode_buffer(ip, &dibh);
853 if (error) 853 if (error)
@@ -885,7 +885,6 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
885 unsigned blocksize, iblock, length, pos; 885 unsigned blocksize, iblock, length, pos;
886 struct buffer_head *bh; 886 struct buffer_head *bh;
887 struct page *page; 887 struct page *page;
888 void *kaddr;
889 int err; 888 int err;
890 889
891 page = grab_cache_page(mapping, index); 890 page = grab_cache_page(mapping, index);
@@ -928,15 +927,13 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
928 /* Uhhuh. Read error. Complain and punt. */ 927 /* Uhhuh. Read error. Complain and punt. */
929 if (!buffer_uptodate(bh)) 928 if (!buffer_uptodate(bh))
930 goto unlock; 929 goto unlock;
930 err = 0;
931 } 931 }
932 932
933 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) 933 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
934 gfs2_trans_add_bh(ip->i_gl, bh, 0); 934 gfs2_trans_add_bh(ip->i_gl, bh, 0);
935 935
936 kaddr = kmap_atomic(page, KM_USER0); 936 zero_user_page(page, offset, length, KM_USER0);
937 memset(kaddr + offset, 0, length);
938 flush_dcache_page(page);
939 kunmap_atomic(kaddr, KM_USER0);
940 937
941unlock: 938unlock:
942 unlock_page(page); 939 unlock_page(page);
@@ -962,7 +959,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
962 959
963 if (gfs2_is_stuffed(ip)) { 960 if (gfs2_is_stuffed(ip)) {
964 ip->i_di.di_size = size; 961 ip->i_di.di_size = size;
965 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; 962 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
966 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 963 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
967 gfs2_dinode_out(ip, dibh->b_data); 964 gfs2_dinode_out(ip, dibh->b_data);
968 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); 965 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
@@ -974,7 +971,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
974 971
975 if (!error) { 972 if (!error) {
976 ip->i_di.di_size = size; 973 ip->i_di.di_size = size;
977 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; 974 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
978 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; 975 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
979 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 976 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
980 gfs2_dinode_out(ip, dibh->b_data); 977 gfs2_dinode_out(ip, dibh->b_data);
@@ -1044,10 +1041,10 @@ static int trunc_end(struct gfs2_inode *ip)
1044 ip->i_di.di_height = 0; 1041 ip->i_di.di_height = 0;
1045 ip->i_di.di_goal_meta = 1042 ip->i_di.di_goal_meta =
1046 ip->i_di.di_goal_data = 1043 ip->i_di.di_goal_data =
1047 ip->i_num.no_addr; 1044 ip->i_no_addr;
1048 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1045 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1049 } 1046 }
1050 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; 1047 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1051 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; 1048 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
1052 1049
1053 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1050 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 683cb5bda8..3548d9f31e 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -16,6 +16,7 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h> 17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h> 18#include <linux/lm_interface.h>
19#include <linux/freezer.h>
19 20
20#include "gfs2.h" 21#include "gfs2.h"
21#include "incore.h" 22#include "incore.h"
@@ -49,6 +50,8 @@ int gfs2_scand(void *data)
49 while (!kthread_should_stop()) { 50 while (!kthread_should_stop()) {
50 gfs2_scand_internal(sdp); 51 gfs2_scand_internal(sdp);
51 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ; 52 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
53 if (freezing(current))
54 refrigerator();
52 schedule_timeout_interruptible(t); 55 schedule_timeout_interruptible(t);
53 } 56 }
54 57
@@ -74,6 +77,8 @@ int gfs2_glockd(void *data)
74 wait_event_interruptible(sdp->sd_reclaim_wq, 77 wait_event_interruptible(sdp->sd_reclaim_wq,
75 (atomic_read(&sdp->sd_reclaim_count) || 78 (atomic_read(&sdp->sd_reclaim_count) ||
76 kthread_should_stop())); 79 kthread_should_stop()));
80 if (freezing(current))
81 refrigerator();
77 } 82 }
78 83
79 return 0; 84 return 0;
@@ -93,6 +98,8 @@ int gfs2_recoverd(void *data)
93 while (!kthread_should_stop()) { 98 while (!kthread_should_stop()) {
94 gfs2_check_journals(sdp); 99 gfs2_check_journals(sdp);
95 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; 100 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
101 if (freezing(current))
102 refrigerator();
96 schedule_timeout_interruptible(t); 103 schedule_timeout_interruptible(t);
97 } 104 }
98 105
@@ -141,6 +148,8 @@ int gfs2_logd(void *data)
141 } 148 }
142 149
143 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; 150 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
151 if (freezing(current))
152 refrigerator();
144 schedule_timeout_interruptible(t); 153 schedule_timeout_interruptible(t);
145 } 154 }
146 155
@@ -191,6 +200,8 @@ int gfs2_quotad(void *data)
191 gfs2_quota_scan(sdp); 200 gfs2_quota_scan(sdp);
192 201
193 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ; 202 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
203 if (freezing(current))
204 refrigerator();
194 schedule_timeout_interruptible(t); 205 schedule_timeout_interruptible(t);
195 } 206 }
196 207
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index a96fa07b3f..2beb2f401a 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
131 if (ip->i_di.di_size < offset + size) 131 if (ip->i_di.di_size < offset + size)
132 ip->i_di.di_size = offset + size; 132 ip->i_di.di_size = offset + size;
133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; 133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
134 gfs2_dinode_out(ip, dibh->b_data); 134 gfs2_dinode_out(ip, dibh->b_data);
135 135
136 brelse(dibh); 136 brelse(dibh);
@@ -228,7 +228,7 @@ out:
228 228
229 if (ip->i_di.di_size < offset + copied) 229 if (ip->i_di.di_size < offset + copied)
230 ip->i_di.di_size = offset + copied; 230 ip->i_di.di_size = offset + copied;
231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; 231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
232 232
233 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 233 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
234 gfs2_dinode_out(ip, dibh->b_data); 234 gfs2_dinode_out(ip, dibh->b_data);
@@ -1456,7 +1456,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1456 if (dip->i_di.di_entries != g.offset) { 1456 if (dip->i_di.di_entries != g.offset) {
1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, " 1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, "
1458 "ip->i_di.di_entries (%u) != g.offset (%u)\n", 1458 "ip->i_di.di_entries (%u) != g.offset (%u)\n",
1459 (unsigned long long)dip->i_num.no_addr, 1459 (unsigned long long)dip->i_no_addr,
1460 dip->i_di.di_entries, 1460 dip->i_di.di_entries,
1461 g.offset); 1461 g.offset);
1462 error = -EIO; 1462 error = -EIO;
@@ -1488,24 +1488,55 @@ out:
1488 * Returns: errno 1488 * Returns: errno
1489 */ 1489 */
1490 1490
1491int gfs2_dir_search(struct inode *dir, const struct qstr *name, 1491struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1492 struct gfs2_inum_host *inum, unsigned int *type)
1493{ 1492{
1494 struct buffer_head *bh; 1493 struct buffer_head *bh;
1495 struct gfs2_dirent *dent; 1494 struct gfs2_dirent *dent;
1495 struct inode *inode;
1496
1497 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1498 if (dent) {
1499 if (IS_ERR(dent))
1500 return ERR_PTR(PTR_ERR(dent));
1501 inode = gfs2_inode_lookup(dir->i_sb,
1502 be16_to_cpu(dent->de_type),
1503 be64_to_cpu(dent->de_inum.no_addr),
1504 be64_to_cpu(dent->de_inum.no_formal_ino));
1505 brelse(bh);
1506 return inode;
1507 }
1508 return ERR_PTR(-ENOENT);
1509}
1510
1511int gfs2_dir_check(struct inode *dir, const struct qstr *name,
1512 const struct gfs2_inode *ip)
1513{
1514 struct buffer_head *bh;
1515 struct gfs2_dirent *dent;
1516 int ret = -ENOENT;
1496 1517
1497 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); 1518 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1498 if (dent) { 1519 if (dent) {
1499 if (IS_ERR(dent)) 1520 if (IS_ERR(dent))
1500 return PTR_ERR(dent); 1521 return PTR_ERR(dent);
1501 if (inum) 1522 if (ip) {
1502 gfs2_inum_in(inum, (char *)&dent->de_inum); 1523 if (be64_to_cpu(dent->de_inum.no_addr) != ip->i_no_addr)
1503 if (type) 1524 goto out;
1504 *type = be16_to_cpu(dent->de_type); 1525 if (be64_to_cpu(dent->de_inum.no_formal_ino) !=
1526 ip->i_no_formal_ino)
1527 goto out;
1528 if (unlikely(IF2DT(ip->i_inode.i_mode) !=
1529 be16_to_cpu(dent->de_type))) {
1530 gfs2_consist_inode(GFS2_I(dir));
1531 ret = -EIO;
1532 goto out;
1533 }
1534 }
1535 ret = 0;
1536out:
1505 brelse(bh); 1537 brelse(bh);
1506 return 0;
1507 } 1538 }
1508 return -ENOENT; 1539 return ret;
1509} 1540}
1510 1541
1511static int dir_new_leaf(struct inode *inode, const struct qstr *name) 1542static int dir_new_leaf(struct inode *inode, const struct qstr *name)
@@ -1565,7 +1596,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1565 */ 1596 */
1566 1597
1567int gfs2_dir_add(struct inode *inode, const struct qstr *name, 1598int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1568 const struct gfs2_inum_host *inum, unsigned type) 1599 const struct gfs2_inode *nip, unsigned type)
1569{ 1600{
1570 struct gfs2_inode *ip = GFS2_I(inode); 1601 struct gfs2_inode *ip = GFS2_I(inode);
1571 struct buffer_head *bh; 1602 struct buffer_head *bh;
@@ -1580,7 +1611,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1580 if (IS_ERR(dent)) 1611 if (IS_ERR(dent))
1581 return PTR_ERR(dent); 1612 return PTR_ERR(dent);
1582 dent = gfs2_init_dirent(inode, dent, name, bh); 1613 dent = gfs2_init_dirent(inode, dent, name, bh);
1583 gfs2_inum_out(inum, (char *)&dent->de_inum); 1614 gfs2_inum_out(nip, dent);
1584 dent->de_type = cpu_to_be16(type); 1615 dent->de_type = cpu_to_be16(type);
1585 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 1616 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1586 leaf = (struct gfs2_leaf *)bh->b_data; 1617 leaf = (struct gfs2_leaf *)bh->b_data;
@@ -1592,7 +1623,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1592 break; 1623 break;
1593 gfs2_trans_add_bh(ip->i_gl, bh, 1); 1624 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1594 ip->i_di.di_entries++; 1625 ip->i_di.di_entries++;
1595 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; 1626 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1596 gfs2_dinode_out(ip, bh->b_data); 1627 gfs2_dinode_out(ip, bh->b_data);
1597 brelse(bh); 1628 brelse(bh);
1598 error = 0; 1629 error = 0;
@@ -1678,7 +1709,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1678 gfs2_consist_inode(dip); 1709 gfs2_consist_inode(dip);
1679 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1710 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1680 dip->i_di.di_entries--; 1711 dip->i_di.di_entries--;
1681 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC; 1712 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1682 gfs2_dinode_out(dip, bh->b_data); 1713 gfs2_dinode_out(dip, bh->b_data);
1683 brelse(bh); 1714 brelse(bh);
1684 mark_inode_dirty(&dip->i_inode); 1715 mark_inode_dirty(&dip->i_inode);
@@ -1700,7 +1731,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1700 */ 1731 */
1701 1732
1702int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 1733int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1703 struct gfs2_inum_host *inum, unsigned int new_type) 1734 const struct gfs2_inode *nip, unsigned int new_type)
1704{ 1735{
1705 struct buffer_head *bh; 1736 struct buffer_head *bh;
1706 struct gfs2_dirent *dent; 1737 struct gfs2_dirent *dent;
@@ -1715,7 +1746,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1715 return PTR_ERR(dent); 1746 return PTR_ERR(dent);
1716 1747
1717 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1748 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1718 gfs2_inum_out(inum, (char *)&dent->de_inum); 1749 gfs2_inum_out(nip, dent);
1719 dent->de_type = cpu_to_be16(new_type); 1750 dent->de_type = cpu_to_be16(new_type);
1720 1751
1721 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { 1752 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
@@ -1726,7 +1757,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1726 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1757 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1727 } 1758 }
1728 1759
1729 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC; 1760 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1730 gfs2_dinode_out(dip, bh->b_data); 1761 gfs2_dinode_out(dip, bh->b_data);
1731 brelse(bh); 1762 brelse(bh);
1732 return 0; 1763 return 0;
@@ -1867,7 +1898,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1867 for (x = 0; x < rlist.rl_rgrps; x++) { 1898 for (x = 0; x < rlist.rl_rgrps; x++) {
1868 struct gfs2_rgrpd *rgd; 1899 struct gfs2_rgrpd *rgd;
1869 rgd = rlist.rl_ghs[x].gh_gl->gl_object; 1900 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1870 rg_blocks += rgd->rd_ri.ri_length; 1901 rg_blocks += rgd->rd_length;
1871 } 1902 }
1872 1903
1873 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); 1904 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 48fe89046b..8a468cac93 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,15 +16,16 @@ struct inode;
16struct gfs2_inode; 16struct gfs2_inode;
17struct gfs2_inum; 17struct gfs2_inum;
18 18
19int gfs2_dir_search(struct inode *dir, const struct qstr *filename, 19struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename);
20 struct gfs2_inum_host *inum, unsigned int *type); 20int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
21 const struct gfs2_inode *ip);
21int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 22int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
22 const struct gfs2_inum_host *inum, unsigned int type); 23 const struct gfs2_inode *ip, unsigned int type);
23int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 24int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
24int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 25int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
25 filldir_t filldir); 26 filldir_t filldir);
26int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 27int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
27 struct gfs2_inum_host *new_inum, unsigned int new_type); 28 const struct gfs2_inode *nip, unsigned int new_type);
28 29
29int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 30int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
30 31
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index c1f4400985..1ab3e9d738 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -11,6 +11,7 @@
11#include <linux/spinlock.h> 11#include <linux/spinlock.h>
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/capability.h>
14#include <linux/xattr.h> 15#include <linux/xattr.h>
15#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h> 17#include <linux/lm_interface.h>
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 5b83ca6aca..2a7435b5c4 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -254,7 +254,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
254 if (error) 254 if (error)
255 return error; 255 return error;
256 256
257 error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE + 257 error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE +
258 RES_EATTR + RES_STATFS + RES_QUOTA, blks); 258 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
259 if (error) 259 if (error)
260 goto out_gunlock; 260 goto out_gunlock;
@@ -300,7 +300,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
300 300
301 error = gfs2_meta_inode_buffer(ip, &dibh); 301 error = gfs2_meta_inode_buffer(ip, &dibh);
302 if (!error) { 302 if (!error) {
303 ip->i_inode.i_ctime = CURRENT_TIME_SEC; 303 ip->i_inode.i_ctime = CURRENT_TIME;
304 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 304 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
305 gfs2_dinode_out(ip, dibh->b_data); 305 gfs2_dinode_out(ip, dibh->b_data);
306 brelse(dibh); 306 brelse(dibh);
@@ -700,7 +700,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
700 goto out_gunlock_q; 700 goto out_gunlock_q;
701 701
702 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), 702 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
703 blks + al->al_rgd->rd_ri.ri_length + 703 blks + al->al_rgd->rd_length +
704 RES_DINODE + RES_STATFS + RES_QUOTA, 0); 704 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
705 if (error) 705 if (error)
706 goto out_ipres; 706 goto out_ipres;
@@ -717,7 +717,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
717 (er->er_mode & S_IFMT)); 717 (er->er_mode & S_IFMT));
718 ip->i_inode.i_mode = er->er_mode; 718 ip->i_inode.i_mode = er->er_mode;
719 } 719 }
720 ip->i_inode.i_ctime = CURRENT_TIME_SEC; 720 ip->i_inode.i_ctime = CURRENT_TIME;
721 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 721 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
722 gfs2_dinode_out(ip, dibh->b_data); 722 gfs2_dinode_out(ip, dibh->b_data);
723 brelse(dibh); 723 brelse(dibh);
@@ -852,7 +852,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
852 (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); 852 (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
853 ip->i_inode.i_mode = er->er_mode; 853 ip->i_inode.i_mode = er->er_mode;
854 } 854 }
855 ip->i_inode.i_ctime = CURRENT_TIME_SEC; 855 ip->i_inode.i_ctime = CURRENT_TIME;
856 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 856 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
857 gfs2_dinode_out(ip, dibh->b_data); 857 gfs2_dinode_out(ip, dibh->b_data);
858 brelse(dibh); 858 brelse(dibh);
@@ -1133,7 +1133,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1133 1133
1134 error = gfs2_meta_inode_buffer(ip, &dibh); 1134 error = gfs2_meta_inode_buffer(ip, &dibh);
1135 if (!error) { 1135 if (!error) {
1136 ip->i_inode.i_ctime = CURRENT_TIME_SEC; 1136 ip->i_inode.i_ctime = CURRENT_TIME;
1137 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1137 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1138 gfs2_dinode_out(ip, dibh->b_data); 1138 gfs2_dinode_out(ip, dibh->b_data);
1139 brelse(dibh); 1139 brelse(dibh);
@@ -1352,7 +1352,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1352 for (x = 0; x < rlist.rl_rgrps; x++) { 1352 for (x = 0; x < rlist.rl_rgrps; x++) {
1353 struct gfs2_rgrpd *rgd; 1353 struct gfs2_rgrpd *rgd;
1354 rgd = rlist.rl_ghs[x].gh_gl->gl_object; 1354 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1355 rg_blocks += rgd->rd_ri.ri_length; 1355 rg_blocks += rgd->rd_length;
1356 } 1356 }
1357 1357
1358 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); 1358 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 1815429a29..3f0974e1af 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -422,11 +422,11 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
422static void gfs2_holder_wake(struct gfs2_holder *gh) 422static void gfs2_holder_wake(struct gfs2_holder *gh)
423{ 423{
424 clear_bit(HIF_WAIT, &gh->gh_iflags); 424 clear_bit(HIF_WAIT, &gh->gh_iflags);
425 smp_mb(); 425 smp_mb__after_clear_bit();
426 wake_up_bit(&gh->gh_iflags, HIF_WAIT); 426 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
427} 427}
428 428
429static int holder_wait(void *word) 429static int just_schedule(void *word)
430{ 430{
431 schedule(); 431 schedule();
432 return 0; 432 return 0;
@@ -435,7 +435,20 @@ static int holder_wait(void *word)
435static void wait_on_holder(struct gfs2_holder *gh) 435static void wait_on_holder(struct gfs2_holder *gh)
436{ 436{
437 might_sleep(); 437 might_sleep();
438 wait_on_bit(&gh->gh_iflags, HIF_WAIT, holder_wait, TASK_UNINTERRUPTIBLE); 438 wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
439}
440
441static void gfs2_demote_wake(struct gfs2_glock *gl)
442{
443 clear_bit(GLF_DEMOTE, &gl->gl_flags);
444 smp_mb__after_clear_bit();
445 wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
446}
447
448static void wait_on_demote(struct gfs2_glock *gl)
449{
450 might_sleep();
451 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE);
439} 452}
440 453
441/** 454/**
@@ -528,7 +541,7 @@ static int rq_demote(struct gfs2_glock *gl)
528 541
529 if (gl->gl_state == gl->gl_demote_state || 542 if (gl->gl_state == gl->gl_demote_state ||
530 gl->gl_state == LM_ST_UNLOCKED) { 543 gl->gl_state == LM_ST_UNLOCKED) {
531 clear_bit(GLF_DEMOTE, &gl->gl_flags); 544 gfs2_demote_wake(gl);
532 return 0; 545 return 0;
533 } 546 }
534 set_bit(GLF_LOCK, &gl->gl_flags); 547 set_bit(GLF_LOCK, &gl->gl_flags);
@@ -666,12 +679,22 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
666 * practise: LM_ST_SHARED and LM_ST_UNLOCKED 679 * practise: LM_ST_SHARED and LM_ST_UNLOCKED
667 */ 680 */
668 681
669static void handle_callback(struct gfs2_glock *gl, unsigned int state) 682static void handle_callback(struct gfs2_glock *gl, unsigned int state, int remote)
670{ 683{
671 spin_lock(&gl->gl_spin); 684 spin_lock(&gl->gl_spin);
672 if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) { 685 if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) {
673 gl->gl_demote_state = state; 686 gl->gl_demote_state = state;
674 gl->gl_demote_time = jiffies; 687 gl->gl_demote_time = jiffies;
688 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
689 gl->gl_object) {
690 struct inode *inode = igrab(gl->gl_object);
691 spin_unlock(&gl->gl_spin);
692 if (inode) {
693 d_prune_aliases(inode);
694 iput(inode);
695 }
696 return;
697 }
675 } else if (gl->gl_demote_state != LM_ST_UNLOCKED) { 698 } else if (gl->gl_demote_state != LM_ST_UNLOCKED) {
676 gl->gl_demote_state = state; 699 gl->gl_demote_state = state;
677 } 700 }
@@ -740,7 +763,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
740 if (ret & LM_OUT_CANCELED) 763 if (ret & LM_OUT_CANCELED)
741 op_done = 0; 764 op_done = 0;
742 else 765 else
743 clear_bit(GLF_DEMOTE, &gl->gl_flags); 766 gfs2_demote_wake(gl);
744 } else { 767 } else {
745 spin_lock(&gl->gl_spin); 768 spin_lock(&gl->gl_spin);
746 list_del_init(&gh->gh_list); 769 list_del_init(&gh->gh_list);
@@ -848,7 +871,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
848 gfs2_assert_warn(sdp, !ret); 871 gfs2_assert_warn(sdp, !ret);
849 872
850 state_change(gl, LM_ST_UNLOCKED); 873 state_change(gl, LM_ST_UNLOCKED);
851 clear_bit(GLF_DEMOTE, &gl->gl_flags); 874 gfs2_demote_wake(gl);
852 875
853 if (glops->go_inval) 876 if (glops->go_inval)
854 glops->go_inval(gl, DIO_METADATA); 877 glops->go_inval(gl, DIO_METADATA);
@@ -1174,7 +1197,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1174 const struct gfs2_glock_operations *glops = gl->gl_ops; 1197 const struct gfs2_glock_operations *glops = gl->gl_ops;
1175 1198
1176 if (gh->gh_flags & GL_NOCACHE) 1199 if (gh->gh_flags & GL_NOCACHE)
1177 handle_callback(gl, LM_ST_UNLOCKED); 1200 handle_callback(gl, LM_ST_UNLOCKED, 0);
1178 1201
1179 gfs2_glmutex_lock(gl); 1202 gfs2_glmutex_lock(gl);
1180 1203
@@ -1196,6 +1219,13 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1196 spin_unlock(&gl->gl_spin); 1219 spin_unlock(&gl->gl_spin);
1197} 1220}
1198 1221
1222void gfs2_glock_dq_wait(struct gfs2_holder *gh)
1223{
1224 struct gfs2_glock *gl = gh->gh_gl;
1225 gfs2_glock_dq(gh);
1226 wait_on_demote(gl);
1227}
1228
1199/** 1229/**
1200 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it 1230 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
1201 * @gh: the holder structure 1231 * @gh: the holder structure
@@ -1297,10 +1327,6 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1297 * @num_gh: the number of structures 1327 * @num_gh: the number of structures
1298 * @ghs: an array of struct gfs2_holder structures 1328 * @ghs: an array of struct gfs2_holder structures
1299 * 1329 *
1300 * Figure out how big an impact this function has. Either:
1301 * 1) Replace this code with code that calls gfs2_glock_prefetch()
1302 * 2) Forget async stuff and just call nq_m_sync()
1303 * 3) Leave it like it is
1304 * 1330 *
1305 * Returns: 0 on success (all glocks acquired), 1331 * Returns: 0 on success (all glocks acquired),
1306 * errno on failure (no glocks acquired) 1332 * errno on failure (no glocks acquired)
@@ -1308,62 +1334,28 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1308 1334
1309int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) 1335int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1310{ 1336{
1311 int *e; 1337 struct gfs2_holder *tmp[4];
1312 unsigned int x; 1338 struct gfs2_holder **pph = tmp;
1313 int borked = 0, serious = 0;
1314 int error = 0; 1339 int error = 0;
1315 1340
1316 if (!num_gh) 1341 switch(num_gh) {
1342 case 0:
1317 return 0; 1343 return 0;
1318 1344 case 1:
1319 if (num_gh == 1) {
1320 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); 1345 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1321 return gfs2_glock_nq(ghs); 1346 return gfs2_glock_nq(ghs);
1322 } 1347 default:
1323 1348 if (num_gh <= 4)
1324 e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1325 if (!e)
1326 return -ENOMEM;
1327
1328 for (x = 0; x < num_gh; x++) {
1329 ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
1330 error = gfs2_glock_nq(&ghs[x]);
1331 if (error) {
1332 borked = 1;
1333 serious = error;
1334 num_gh = x;
1335 break; 1349 break;
1336 } 1350 pph = kmalloc(num_gh * sizeof(struct gfs2_holder *), GFP_NOFS);
1337 } 1351 if (!pph)
1338 1352 return -ENOMEM;
1339 for (x = 0; x < num_gh; x++) {
1340 error = e[x] = glock_wait_internal(&ghs[x]);
1341 if (error) {
1342 borked = 1;
1343 if (error != GLR_TRYFAILED && error != GLR_CANCELED)
1344 serious = error;
1345 }
1346 } 1353 }
1347 1354
1348 if (!borked) { 1355 error = nq_m_sync(num_gh, ghs, pph);
1349 kfree(e);
1350 return 0;
1351 }
1352
1353 for (x = 0; x < num_gh; x++)
1354 if (!e[x])
1355 gfs2_glock_dq(&ghs[x]);
1356
1357 if (serious)
1358 error = serious;
1359 else {
1360 for (x = 0; x < num_gh; x++)
1361 gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
1362 &ghs[x]);
1363 error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
1364 }
1365 1356
1366 kfree(e); 1357 if (pph != tmp)
1358 kfree(pph);
1367 1359
1368 return error; 1360 return error;
1369} 1361}
@@ -1456,7 +1448,7 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1456 if (!gl) 1448 if (!gl)
1457 return; 1449 return;
1458 1450
1459 handle_callback(gl, state); 1451 handle_callback(gl, state, 1);
1460 1452
1461 spin_lock(&gl->gl_spin); 1453 spin_lock(&gl->gl_spin);
1462 run_queue(gl); 1454 run_queue(gl);
@@ -1596,7 +1588,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1596 if (gfs2_glmutex_trylock(gl)) { 1588 if (gfs2_glmutex_trylock(gl)) {
1597 if (list_empty(&gl->gl_holders) && 1589 if (list_empty(&gl->gl_holders) &&
1598 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) 1590 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1599 handle_callback(gl, LM_ST_UNLOCKED); 1591 handle_callback(gl, LM_ST_UNLOCKED, 0);
1600 gfs2_glmutex_unlock(gl); 1592 gfs2_glmutex_unlock(gl);
1601 } 1593 }
1602 1594
@@ -1709,7 +1701,7 @@ static void clear_glock(struct gfs2_glock *gl)
1709 if (gfs2_glmutex_trylock(gl)) { 1701 if (gfs2_glmutex_trylock(gl)) {
1710 if (list_empty(&gl->gl_holders) && 1702 if (list_empty(&gl->gl_holders) &&
1711 gl->gl_state != LM_ST_UNLOCKED) 1703 gl->gl_state != LM_ST_UNLOCKED)
1712 handle_callback(gl, LM_ST_UNLOCKED); 1704 handle_callback(gl, LM_ST_UNLOCKED, 0);
1713 gfs2_glmutex_unlock(gl); 1705 gfs2_glmutex_unlock(gl);
1714 } 1706 }
1715} 1707}
@@ -1823,7 +1815,8 @@ static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
1823 1815
1824 print_dbg(gi, " Inode:\n"); 1816 print_dbg(gi, " Inode:\n");
1825 print_dbg(gi, " num = %llu/%llu\n", 1817 print_dbg(gi, " num = %llu/%llu\n",
1826 ip->i_num.no_formal_ino, ip->i_num.no_addr); 1818 (unsigned long long)ip->i_no_formal_ino,
1819 (unsigned long long)ip->i_no_addr);
1827 print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode)); 1820 print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode));
1828 print_dbg(gi, " i_flags ="); 1821 print_dbg(gi, " i_flags =");
1829 for (x = 0; x < 32; x++) 1822 for (x = 0; x < 32; x++)
@@ -1909,8 +1902,8 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
1909 } 1902 }
1910 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { 1903 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
1911 print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n", 1904 print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n",
1912 gl->gl_demote_state, 1905 gl->gl_demote_state, (unsigned long long)
1913 (u64)(jiffies - gl->gl_demote_time)*(1000000/HZ)); 1906 (jiffies - gl->gl_demote_time)*(1000000/HZ));
1914 } 1907 }
1915 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { 1908 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
1916 if (!test_bit(GLF_LOCK, &gl->gl_flags) && 1909 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index b3e152db70..7721ca3fff 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,6 +87,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh);
87int gfs2_glock_poll(struct gfs2_holder *gh); 87int gfs2_glock_poll(struct gfs2_holder *gh);
88int gfs2_glock_wait(struct gfs2_holder *gh); 88int gfs2_glock_wait(struct gfs2_holder *gh);
89void gfs2_glock_dq(struct gfs2_holder *gh); 89void gfs2_glock_dq(struct gfs2_holder *gh);
90void gfs2_glock_dq_wait(struct gfs2_holder *gh);
90 91
91void gfs2_glock_dq_uninit(struct gfs2_holder *gh); 92void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
92int gfs2_glock_nq_num(struct gfs2_sbd *sdp, 93int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 7b82657a99..777ca46010 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -156,9 +156,9 @@ static void inode_go_sync(struct gfs2_glock *gl)
156 ip = NULL; 156 ip = NULL;
157 157
158 if (test_bit(GLF_DIRTY, &gl->gl_flags)) { 158 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
159 gfs2_log_flush(gl->gl_sbd, gl);
160 if (ip) 159 if (ip)
161 filemap_fdatawrite(ip->i_inode.i_mapping); 160 filemap_fdatawrite(ip->i_inode.i_mapping);
161 gfs2_log_flush(gl->gl_sbd, gl);
162 gfs2_meta_sync(gl); 162 gfs2_meta_sync(gl);
163 if (ip) { 163 if (ip) {
164 struct address_space *mapping = ip->i_inode.i_mapping; 164 struct address_space *mapping = ip->i_inode.i_mapping;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index d995441373..170ba93829 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -28,6 +28,14 @@ struct gfs2_sbd;
28 28
29typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); 29typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
30 30
31struct gfs2_log_header_host {
32 u64 lh_sequence; /* Sequence number of this transaction */
33 u32 lh_flags; /* GFS2_LOG_HEAD_... */
34 u32 lh_tail; /* Block number of log tail */
35 u32 lh_blkno;
36 u32 lh_hash;
37};
38
31/* 39/*
32 * Structure of operations that are associated with each 40 * Structure of operations that are associated with each
33 * type of element in the log. 41 * type of element in the log.
@@ -60,12 +68,23 @@ struct gfs2_bitmap {
60 u32 bi_len; 68 u32 bi_len;
61}; 69};
62 70
71struct gfs2_rgrp_host {
72 u32 rg_flags;
73 u32 rg_free;
74 u32 rg_dinodes;
75 u64 rg_igeneration;
76};
77
63struct gfs2_rgrpd { 78struct gfs2_rgrpd {
64 struct list_head rd_list; /* Link with superblock */ 79 struct list_head rd_list; /* Link with superblock */
65 struct list_head rd_list_mru; 80 struct list_head rd_list_mru;
66 struct list_head rd_recent; /* Recently used rgrps */ 81 struct list_head rd_recent; /* Recently used rgrps */
67 struct gfs2_glock *rd_gl; /* Glock for this rgrp */ 82 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
68 struct gfs2_rindex_host rd_ri; 83 u64 rd_addr; /* grp block disk address */
84 u64 rd_data0; /* first data location */
85 u32 rd_length; /* length of rgrp header in fs blocks */
86 u32 rd_data; /* num of data blocks in rgrp */
87 u32 rd_bitbytes; /* number of bytes in data bitmaps */
69 struct gfs2_rgrp_host rd_rg; 88 struct gfs2_rgrp_host rd_rg;
70 u64 rd_rg_vn; 89 u64 rd_rg_vn;
71 struct gfs2_bitmap *rd_bits; 90 struct gfs2_bitmap *rd_bits;
@@ -76,6 +95,8 @@ struct gfs2_rgrpd {
76 u32 rd_last_alloc_data; 95 u32 rd_last_alloc_data;
77 u32 rd_last_alloc_meta; 96 u32 rd_last_alloc_meta;
78 struct gfs2_sbd *rd_sbd; 97 struct gfs2_sbd *rd_sbd;
98 unsigned long rd_flags;
99#define GFS2_RDF_CHECK 0x0001 /* Need to check for unlinked inodes */
79}; 100};
80 101
81enum gfs2_state_bits { 102enum gfs2_state_bits {
@@ -211,10 +232,24 @@ enum {
211 GIF_SW_PAGED = 3, 232 GIF_SW_PAGED = 3,
212}; 233};
213 234
235struct gfs2_dinode_host {
236 u64 di_size; /* number of bytes in file */
237 u64 di_blocks; /* number of blocks in file */
238 u64 di_goal_meta; /* rgrp to alloc from next */
239 u64 di_goal_data; /* data block goal */
240 u64 di_generation; /* generation number for NFS */
241 u32 di_flags; /* GFS2_DIF_... */
242 u16 di_height; /* height of metadata */
243 /* These only apply to directories */
244 u16 di_depth; /* Number of bits in the table */
245 u32 di_entries; /* The number of entries in the directory */
246 u64 di_eattr; /* extended attribute block number */
247};
248
214struct gfs2_inode { 249struct gfs2_inode {
215 struct inode i_inode; 250 struct inode i_inode;
216 struct gfs2_inum_host i_num; 251 u64 i_no_addr;
217 252 u64 i_no_formal_ino;
218 unsigned long i_flags; /* GIF_... */ 253 unsigned long i_flags; /* GIF_... */
219 254
220 struct gfs2_dinode_host i_di; /* To be replaced by ref to block */ 255 struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
@@ -275,14 +310,6 @@ enum {
275 QDF_LOCKED = 2, 310 QDF_LOCKED = 2,
276}; 311};
277 312
278struct gfs2_quota_lvb {
279 __be32 qb_magic;
280 u32 __pad;
281 __be64 qb_limit; /* Hard limit of # blocks to alloc */
282 __be64 qb_warn; /* Warn user when alloc is above this # */
283 __be64 qb_value; /* Current # blocks allocated */
284};
285
286struct gfs2_quota_data { 313struct gfs2_quota_data {
287 struct list_head qd_list; 314 struct list_head qd_list;
288 unsigned int qd_count; 315 unsigned int qd_count;
@@ -327,7 +354,9 @@ struct gfs2_trans {
327 354
328 unsigned int tr_num_buf; 355 unsigned int tr_num_buf;
329 unsigned int tr_num_buf_new; 356 unsigned int tr_num_buf_new;
357 unsigned int tr_num_databuf_new;
330 unsigned int tr_num_buf_rm; 358 unsigned int tr_num_buf_rm;
359 unsigned int tr_num_databuf_rm;
331 struct list_head tr_list_buf; 360 struct list_head tr_list_buf;
332 361
333 unsigned int tr_num_revoke; 362 unsigned int tr_num_revoke;
@@ -354,6 +383,12 @@ struct gfs2_jdesc {
354 unsigned int jd_blocks; 383 unsigned int jd_blocks;
355}; 384};
356 385
386struct gfs2_statfs_change_host {
387 s64 sc_total;
388 s64 sc_free;
389 s64 sc_dinodes;
390};
391
357#define GFS2_GLOCKD_DEFAULT 1 392#define GFS2_GLOCKD_DEFAULT 1
358#define GFS2_GLOCKD_MAX 16 393#define GFS2_GLOCKD_MAX 16
359 394
@@ -426,6 +461,28 @@ enum {
426 461
427#define GFS2_FSNAME_LEN 256 462#define GFS2_FSNAME_LEN 256
428 463
464struct gfs2_inum_host {
465 u64 no_formal_ino;
466 u64 no_addr;
467};
468
469struct gfs2_sb_host {
470 u32 sb_magic;
471 u32 sb_type;
472 u32 sb_format;
473
474 u32 sb_fs_format;
475 u32 sb_multihost_format;
476 u32 sb_bsize;
477 u32 sb_bsize_shift;
478
479 struct gfs2_inum_host sb_master_dir;
480 struct gfs2_inum_host sb_root_dir;
481
482 char sb_lockproto[GFS2_LOCKNAME_LEN];
483 char sb_locktable[GFS2_LOCKNAME_LEN];
484};
485
429struct gfs2_sbd { 486struct gfs2_sbd {
430 struct super_block *sd_vfs; 487 struct super_block *sd_vfs;
431 struct super_block *sd_vfs_meta; 488 struct super_block *sd_vfs_meta;
@@ -544,6 +601,7 @@ struct gfs2_sbd {
544 601
545 unsigned int sd_log_blks_reserved; 602 unsigned int sd_log_blks_reserved;
546 unsigned int sd_log_commited_buf; 603 unsigned int sd_log_commited_buf;
604 unsigned int sd_log_commited_databuf;
547 unsigned int sd_log_commited_revoke; 605 unsigned int sd_log_commited_revoke;
548 606
549 unsigned int sd_log_num_gl; 607 unsigned int sd_log_num_gl;
@@ -552,7 +610,6 @@ struct gfs2_sbd {
552 unsigned int sd_log_num_rg; 610 unsigned int sd_log_num_rg;
553 unsigned int sd_log_num_databuf; 611 unsigned int sd_log_num_databuf;
554 unsigned int sd_log_num_jdata; 612 unsigned int sd_log_num_jdata;
555 unsigned int sd_log_num_hdrs;
556 613
557 struct list_head sd_log_le_gl; 614 struct list_head sd_log_le_gl;
558 struct list_head sd_log_le_buf; 615 struct list_head sd_log_le_buf;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index df0b8b3018..34f7bcdea1 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -38,12 +38,17 @@
38#include "trans.h" 38#include "trans.h"
39#include "util.h" 39#include "util.h"
40 40
41struct gfs2_inum_range_host {
42 u64 ir_start;
43 u64 ir_length;
44};
45
41static int iget_test(struct inode *inode, void *opaque) 46static int iget_test(struct inode *inode, void *opaque)
42{ 47{
43 struct gfs2_inode *ip = GFS2_I(inode); 48 struct gfs2_inode *ip = GFS2_I(inode);
44 struct gfs2_inum_host *inum = opaque; 49 u64 *no_addr = opaque;
45 50
46 if (ip->i_num.no_addr == inum->no_addr && 51 if (ip->i_no_addr == *no_addr &&
47 inode->i_private != NULL) 52 inode->i_private != NULL)
48 return 1; 53 return 1;
49 54
@@ -53,37 +58,70 @@ static int iget_test(struct inode *inode, void *opaque)
53static int iget_set(struct inode *inode, void *opaque) 58static int iget_set(struct inode *inode, void *opaque)
54{ 59{
55 struct gfs2_inode *ip = GFS2_I(inode); 60 struct gfs2_inode *ip = GFS2_I(inode);
56 struct gfs2_inum_host *inum = opaque; 61 u64 *no_addr = opaque;
57 62
58 ip->i_num = *inum; 63 inode->i_ino = (unsigned long)*no_addr;
59 inode->i_ino = inum->no_addr; 64 ip->i_no_addr = *no_addr;
60 return 0; 65 return 0;
61} 66}
62 67
63struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum_host *inum) 68struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
69{
70 unsigned long hash = (unsigned long)no_addr;
71 return ilookup5(sb, hash, iget_test, &no_addr);
72}
73
74static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
64{ 75{
65 return ilookup5(sb, (unsigned long)inum->no_addr, 76 unsigned long hash = (unsigned long)no_addr;
66 iget_test, inum); 77 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
67} 78}
68 79
69static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum_host *inum) 80/**
81 * GFS2 lookup code fills in vfs inode contents based on info obtained
82 * from directory entry inside gfs2_inode_lookup(). This has caused issues
83 * with NFS code path since its get_dentry routine doesn't have the relevant
84 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
85 * segment inside gfs2_inode_lookup code needs to get moved around.
86 *
87 * Clean up I_LOCK and I_NEW as well.
88 **/
89
90void gfs2_set_iop(struct inode *inode)
70{ 91{
71 return iget5_locked(sb, (unsigned long)inum->no_addr, 92 umode_t mode = inode->i_mode;
72 iget_test, iget_set, inum); 93
94 if (S_ISREG(mode)) {
95 inode->i_op = &gfs2_file_iops;
96 inode->i_fop = &gfs2_file_fops;
97 inode->i_mapping->a_ops = &gfs2_file_aops;
98 } else if (S_ISDIR(mode)) {
99 inode->i_op = &gfs2_dir_iops;
100 inode->i_fop = &gfs2_dir_fops;
101 } else if (S_ISLNK(mode)) {
102 inode->i_op = &gfs2_symlink_iops;
103 } else {
104 inode->i_op = &gfs2_dev_iops;
105 }
106
107 unlock_new_inode(inode);
73} 108}
74 109
75/** 110/**
76 * gfs2_inode_lookup - Lookup an inode 111 * gfs2_inode_lookup - Lookup an inode
77 * @sb: The super block 112 * @sb: The super block
78 * @inum: The inode number 113 * @no_addr: The inode number
79 * @type: The type of the inode 114 * @type: The type of the inode
80 * 115 *
81 * Returns: A VFS inode, or an error 116 * Returns: A VFS inode, or an error
82 */ 117 */
83 118
84struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *inum, unsigned int type) 119struct inode *gfs2_inode_lookup(struct super_block *sb,
120 unsigned int type,
121 u64 no_addr,
122 u64 no_formal_ino)
85{ 123{
86 struct inode *inode = gfs2_iget(sb, inum); 124 struct inode *inode = gfs2_iget(sb, no_addr);
87 struct gfs2_inode *ip = GFS2_I(inode); 125 struct gfs2_inode *ip = GFS2_I(inode);
88 struct gfs2_glock *io_gl; 126 struct gfs2_glock *io_gl;
89 int error; 127 int error;
@@ -93,29 +131,15 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *i
93 131
94 if (inode->i_state & I_NEW) { 132 if (inode->i_state & I_NEW) {
95 struct gfs2_sbd *sdp = GFS2_SB(inode); 133 struct gfs2_sbd *sdp = GFS2_SB(inode);
96 umode_t mode = DT2IF(type);
97 inode->i_private = ip; 134 inode->i_private = ip;
98 inode->i_mode = mode; 135 ip->i_no_formal_ino = no_formal_ino;
99
100 if (S_ISREG(mode)) {
101 inode->i_op = &gfs2_file_iops;
102 inode->i_fop = &gfs2_file_fops;
103 inode->i_mapping->a_ops = &gfs2_file_aops;
104 } else if (S_ISDIR(mode)) {
105 inode->i_op = &gfs2_dir_iops;
106 inode->i_fop = &gfs2_dir_fops;
107 } else if (S_ISLNK(mode)) {
108 inode->i_op = &gfs2_symlink_iops;
109 } else {
110 inode->i_op = &gfs2_dev_iops;
111 }
112 136
113 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 137 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
114 if (unlikely(error)) 138 if (unlikely(error))
115 goto fail; 139 goto fail;
116 ip->i_gl->gl_object = ip; 140 ip->i_gl->gl_object = ip;
117 141
118 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl); 142 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
119 if (unlikely(error)) 143 if (unlikely(error))
120 goto fail_put; 144 goto fail_put;
121 145
@@ -123,12 +147,38 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *i
123 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); 147 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
124 if (unlikely(error)) 148 if (unlikely(error))
125 goto fail_iopen; 149 goto fail_iopen;
150 ip->i_iopen_gh.gh_gl->gl_object = ip;
126 151
127 gfs2_glock_put(io_gl); 152 gfs2_glock_put(io_gl);
128 unlock_new_inode(inode); 153
154 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
155 goto gfs2_nfsbypass;
156
157 inode->i_mode = DT2IF(type);
158
159 /*
160 * We must read the inode in order to work out its type in
161 * this case. Note that this doesn't happen often as we normally
162 * know the type beforehand. This code path only occurs during
163 * unlinked inode recovery (where it is safe to do this glock,
164 * which is not true in the general case).
165 */
166 if (type == DT_UNKNOWN) {
167 struct gfs2_holder gh;
168 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
169 if (unlikely(error))
170 goto fail_glock;
171 /* Inode is now uptodate */
172 gfs2_glock_dq_uninit(&gh);
173 }
174
175 gfs2_set_iop(inode);
129 } 176 }
130 177
178gfs2_nfsbypass:
131 return inode; 179 return inode;
180fail_glock:
181 gfs2_glock_dq(&ip->i_iopen_gh);
132fail_iopen: 182fail_iopen:
133 gfs2_glock_put(io_gl); 183 gfs2_glock_put(io_gl);
134fail_put: 184fail_put:
@@ -144,14 +194,12 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
144 struct gfs2_dinode_host *di = &ip->i_di; 194 struct gfs2_dinode_host *di = &ip->i_di;
145 const struct gfs2_dinode *str = buf; 195 const struct gfs2_dinode *str = buf;
146 196
147 if (ip->i_num.no_addr != be64_to_cpu(str->di_num.no_addr)) { 197 if (ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)) {
148 if (gfs2_consist_inode(ip)) 198 if (gfs2_consist_inode(ip))
149 gfs2_dinode_print(ip); 199 gfs2_dinode_print(ip);
150 return -EIO; 200 return -EIO;
151 } 201 }
152 if (ip->i_num.no_formal_ino != be64_to_cpu(str->di_num.no_formal_ino)) 202 ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
153 return -ESTALE;
154
155 ip->i_inode.i_mode = be32_to_cpu(str->di_mode); 203 ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
156 ip->i_inode.i_rdev = 0; 204 ip->i_inode.i_rdev = 0;
157 switch (ip->i_inode.i_mode & S_IFMT) { 205 switch (ip->i_inode.i_mode & S_IFMT) {
@@ -175,11 +223,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
175 di->di_blocks = be64_to_cpu(str->di_blocks); 223 di->di_blocks = be64_to_cpu(str->di_blocks);
176 gfs2_set_inode_blocks(&ip->i_inode); 224 gfs2_set_inode_blocks(&ip->i_inode);
177 ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); 225 ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
178 ip->i_inode.i_atime.tv_nsec = 0; 226 ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
179 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); 227 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
180 ip->i_inode.i_mtime.tv_nsec = 0; 228 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
181 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); 229 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
182 ip->i_inode.i_ctime.tv_nsec = 0; 230 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
183 231
184 di->di_goal_meta = be64_to_cpu(str->di_goal_meta); 232 di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
185 di->di_goal_data = be64_to_cpu(str->di_goal_data); 233 di->di_goal_data = be64_to_cpu(str->di_goal_data);
@@ -247,7 +295,7 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
247 if (error) 295 if (error)
248 goto out_qs; 296 goto out_qs;
249 297
250 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); 298 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
251 if (!rgd) { 299 if (!rgd) {
252 gfs2_consist_inode(ip); 300 gfs2_consist_inode(ip);
253 error = -EIO; 301 error = -EIO;
@@ -314,7 +362,7 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
314 else 362 else
315 drop_nlink(&ip->i_inode); 363 drop_nlink(&ip->i_inode);
316 364
317 ip->i_inode.i_ctime = CURRENT_TIME_SEC; 365 ip->i_inode.i_ctime = CURRENT_TIME;
318 366
319 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 367 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
320 gfs2_dinode_out(ip, dibh->b_data); 368 gfs2_dinode_out(ip, dibh->b_data);
@@ -366,9 +414,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
366 struct super_block *sb = dir->i_sb; 414 struct super_block *sb = dir->i_sb;
367 struct gfs2_inode *dip = GFS2_I(dir); 415 struct gfs2_inode *dip = GFS2_I(dir);
368 struct gfs2_holder d_gh; 416 struct gfs2_holder d_gh;
369 struct gfs2_inum_host inum; 417 int error = 0;
370 unsigned int type;
371 int error;
372 struct inode *inode = NULL; 418 struct inode *inode = NULL;
373 int unlock = 0; 419 int unlock = 0;
374 420
@@ -395,12 +441,9 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
395 goto out; 441 goto out;
396 } 442 }
397 443
398 error = gfs2_dir_search(dir, name, &inum, &type); 444 inode = gfs2_dir_search(dir, name);
399 if (error) 445 if (IS_ERR(inode))
400 goto out; 446 error = PTR_ERR(inode);
401
402 inode = gfs2_inode_lookup(sb, &inum, type);
403
404out: 447out:
405 if (unlock) 448 if (unlock)
406 gfs2_glock_dq_uninit(&d_gh); 449 gfs2_glock_dq_uninit(&d_gh);
@@ -409,6 +452,22 @@ out:
409 return inode ? inode : ERR_PTR(error); 452 return inode ? inode : ERR_PTR(error);
410} 453}
411 454
455static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
456{
457 const struct gfs2_inum_range *str = buf;
458
459 ir->ir_start = be64_to_cpu(str->ir_start);
460 ir->ir_length = be64_to_cpu(str->ir_length);
461}
462
463static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
464{
465 struct gfs2_inum_range *str = buf;
466
467 str->ir_start = cpu_to_be64(ir->ir_start);
468 str->ir_length = cpu_to_be64(ir->ir_length);
469}
470
412static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) 471static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
413{ 472{
414 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); 473 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
@@ -548,7 +607,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
548 if (!dip->i_inode.i_nlink) 607 if (!dip->i_inode.i_nlink)
549 return -EPERM; 608 return -EPERM;
550 609
551 error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL); 610 error = gfs2_dir_check(&dip->i_inode, name, NULL);
552 switch (error) { 611 switch (error) {
553 case -ENOENT: 612 case -ENOENT:
554 error = 0; 613 error = 0;
@@ -588,8 +647,7 @@ static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
588 *gid = current->fsgid; 647 *gid = current->fsgid;
589} 648}
590 649
591static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum_host *inum, 650static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
592 u64 *generation)
593{ 651{
594 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 652 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
595 int error; 653 int error;
@@ -605,7 +663,7 @@ static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum_host *inum,
605 if (error) 663 if (error)
606 goto out_ipreserv; 664 goto out_ipreserv;
607 665
608 inum->no_addr = gfs2_alloc_di(dip, generation); 666 *no_addr = gfs2_alloc_di(dip, generation);
609 667
610 gfs2_trans_end(sdp); 668 gfs2_trans_end(sdp);
611 669
@@ -635,6 +693,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
635 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 693 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
636 struct gfs2_dinode *di; 694 struct gfs2_dinode *di;
637 struct buffer_head *dibh; 695 struct buffer_head *dibh;
696 struct timespec tv = CURRENT_TIME;
638 697
639 dibh = gfs2_meta_new(gl, inum->no_addr); 698 dibh = gfs2_meta_new(gl, inum->no_addr);
640 gfs2_trans_add_bh(gl, dibh, 1); 699 gfs2_trans_add_bh(gl, dibh, 1);
@@ -650,7 +709,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
650 di->di_nlink = 0; 709 di->di_nlink = 0;
651 di->di_size = 0; 710 di->di_size = 0;
652 di->di_blocks = cpu_to_be64(1); 711 di->di_blocks = cpu_to_be64(1);
653 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds()); 712 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec);
654 di->di_major = cpu_to_be32(MAJOR(dev)); 713 di->di_major = cpu_to_be32(MAJOR(dev));
655 di->di_minor = cpu_to_be32(MINOR(dev)); 714 di->di_minor = cpu_to_be32(MINOR(dev));
656 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); 715 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
@@ -680,6 +739,9 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
680 di->di_entries = 0; 739 di->di_entries = 0;
681 memset(&di->__pad4, 0, sizeof(di->__pad4)); 740 memset(&di->__pad4, 0, sizeof(di->__pad4));
682 di->di_eattr = 0; 741 di->di_eattr = 0;
742 di->di_atime_nsec = cpu_to_be32(tv.tv_nsec);
743 di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec);
744 di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
683 memset(&di->di_reserved, 0, sizeof(di->di_reserved)); 745 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
684 746
685 brelse(dibh); 747 brelse(dibh);
@@ -749,7 +811,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
749 goto fail_quota_locks; 811 goto fail_quota_locks;
750 812
751 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 813 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
752 al->al_rgd->rd_ri.ri_length + 814 al->al_rgd->rd_length +
753 2 * RES_DINODE + 815 2 * RES_DINODE +
754 RES_STATFS + RES_QUOTA, 0); 816 RES_STATFS + RES_QUOTA, 0);
755 if (error) 817 if (error)
@@ -760,7 +822,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
760 goto fail_quota_locks; 822 goto fail_quota_locks;
761 } 823 }
762 824
763 error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_inode.i_mode)); 825 error = gfs2_dir_add(&dip->i_inode, name, ip, IF2DT(ip->i_inode.i_mode));
764 if (error) 826 if (error)
765 goto fail_end_trans; 827 goto fail_end_trans;
766 828
@@ -840,11 +902,11 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
840struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, 902struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
841 unsigned int mode, dev_t dev) 903 unsigned int mode, dev_t dev)
842{ 904{
843 struct inode *inode; 905 struct inode *inode = NULL;
844 struct gfs2_inode *dip = ghs->gh_gl->gl_object; 906 struct gfs2_inode *dip = ghs->gh_gl->gl_object;
845 struct inode *dir = &dip->i_inode; 907 struct inode *dir = &dip->i_inode;
846 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 908 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
847 struct gfs2_inum_host inum; 909 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
848 int error; 910 int error;
849 u64 generation; 911 u64 generation;
850 912
@@ -864,7 +926,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
864 if (error) 926 if (error)
865 goto fail_gunlock; 927 goto fail_gunlock;
866 928
867 error = alloc_dinode(dip, &inum, &generation); 929 error = alloc_dinode(dip, &inum.no_addr, &generation);
868 if (error) 930 if (error)
869 goto fail_gunlock; 931 goto fail_gunlock;
870 932
@@ -877,34 +939,36 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
877 if (error) 939 if (error)
878 goto fail_gunlock2; 940 goto fail_gunlock2;
879 941
880 inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode)); 942 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
943 inum.no_addr,
944 inum.no_formal_ino);
881 if (IS_ERR(inode)) 945 if (IS_ERR(inode))
882 goto fail_gunlock2; 946 goto fail_gunlock2;
883 947
884 error = gfs2_inode_refresh(GFS2_I(inode)); 948 error = gfs2_inode_refresh(GFS2_I(inode));
885 if (error) 949 if (error)
886 goto fail_iput; 950 goto fail_gunlock2;
887 951
888 error = gfs2_acl_create(dip, GFS2_I(inode)); 952 error = gfs2_acl_create(dip, GFS2_I(inode));
889 if (error) 953 if (error)
890 goto fail_iput; 954 goto fail_gunlock2;
891 955
892 error = gfs2_security_init(dip, GFS2_I(inode)); 956 error = gfs2_security_init(dip, GFS2_I(inode));
893 if (error) 957 if (error)
894 goto fail_iput; 958 goto fail_gunlock2;
895 959
896 error = link_dinode(dip, name, GFS2_I(inode)); 960 error = link_dinode(dip, name, GFS2_I(inode));
897 if (error) 961 if (error)
898 goto fail_iput; 962 goto fail_gunlock2;
899 963
900 if (!inode) 964 if (!inode)
901 return ERR_PTR(-ENOMEM); 965 return ERR_PTR(-ENOMEM);
902 return inode; 966 return inode;
903 967
904fail_iput:
905 iput(inode);
906fail_gunlock2: 968fail_gunlock2:
907 gfs2_glock_dq_uninit(ghs + 1); 969 gfs2_glock_dq_uninit(ghs + 1);
970 if (inode)
971 iput(inode);
908fail_gunlock: 972fail_gunlock:
909 gfs2_glock_dq(ghs); 973 gfs2_glock_dq(ghs);
910fail: 974fail:
@@ -976,10 +1040,8 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
976 */ 1040 */
977 1041
978int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 1042int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
979 struct gfs2_inode *ip) 1043 const struct gfs2_inode *ip)
980{ 1044{
981 struct gfs2_inum_host inum;
982 unsigned int type;
983 int error; 1045 int error;
984 1046
985 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) 1047 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
@@ -997,18 +1059,10 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
997 if (error) 1059 if (error)
998 return error; 1060 return error;
999 1061
1000 error = gfs2_dir_search(&dip->i_inode, name, &inum, &type); 1062 error = gfs2_dir_check(&dip->i_inode, name, ip);
1001 if (error) 1063 if (error)
1002 return error; 1064 return error;
1003 1065
1004 if (!gfs2_inum_equal(&inum, &ip->i_num))
1005 return -ENOENT;
1006
1007 if (IF2DT(ip->i_inode.i_mode) != type) {
1008 gfs2_consist_inode(dip);
1009 return -EIO;
1010 }
1011
1012 return 0; 1066 return 0;
1013} 1067}
1014 1068
@@ -1132,10 +1186,11 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1132 struct gfs2_glock *gl = gh->gh_gl; 1186 struct gfs2_glock *gl = gh->gh_gl;
1133 struct gfs2_sbd *sdp = gl->gl_sbd; 1187 struct gfs2_sbd *sdp = gl->gl_sbd;
1134 struct gfs2_inode *ip = gl->gl_object; 1188 struct gfs2_inode *ip = gl->gl_object;
1135 s64 curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum); 1189 s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1136 unsigned int state; 1190 unsigned int state;
1137 int flags; 1191 int flags;
1138 int error; 1192 int error;
1193 struct timespec tv = CURRENT_TIME;
1139 1194
1140 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) || 1195 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1141 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) || 1196 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
@@ -1153,8 +1208,7 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1153 (sdp->sd_vfs->s_flags & MS_RDONLY)) 1208 (sdp->sd_vfs->s_flags & MS_RDONLY))
1154 return 0; 1209 return 0;
1155 1210
1156 curtime = get_seconds(); 1211 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1157 if (curtime - ip->i_inode.i_atime.tv_sec >= quantum) {
1158 gfs2_glock_dq(gh); 1212 gfs2_glock_dq(gh);
1159 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY, 1213 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1160 gh); 1214 gh);
@@ -1165,8 +1219,8 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1165 /* Verify that atime hasn't been updated while we were 1219 /* Verify that atime hasn't been updated while we were
1166 trying to get exclusive lock. */ 1220 trying to get exclusive lock. */
1167 1221
1168 curtime = get_seconds(); 1222 tv = CURRENT_TIME;
1169 if (curtime - ip->i_inode.i_atime.tv_sec >= quantum) { 1223 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1170 struct buffer_head *dibh; 1224 struct buffer_head *dibh;
1171 struct gfs2_dinode *di; 1225 struct gfs2_dinode *di;
1172 1226
@@ -1180,11 +1234,12 @@ int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1180 if (error) 1234 if (error)
1181 goto fail_end_trans; 1235 goto fail_end_trans;
1182 1236
1183 ip->i_inode.i_atime.tv_sec = curtime; 1237 ip->i_inode.i_atime = tv;
1184 1238
1185 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1239 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1186 di = (struct gfs2_dinode *)dibh->b_data; 1240 di = (struct gfs2_dinode *)dibh->b_data;
1187 di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1241 di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1242 di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1188 brelse(dibh); 1243 brelse(dibh);
1189 1244
1190 gfs2_trans_end(sdp); 1245 gfs2_trans_end(sdp);
@@ -1252,3 +1307,66 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1252 return error; 1307 return error;
1253} 1308}
1254 1309
1310void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1311{
1312 const struct gfs2_dinode_host *di = &ip->i_di;
1313 struct gfs2_dinode *str = buf;
1314
1315 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
1316 str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
1317 str->di_header.__pad0 = 0;
1318 str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
1319 str->di_header.__pad1 = 0;
1320 str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
1321 str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
1322 str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
1323 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1324 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1325 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1326 str->di_size = cpu_to_be64(di->di_size);
1327 str->di_blocks = cpu_to_be64(di->di_blocks);
1328 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1329 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
1330 str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
1331
1332 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
1333 str->di_goal_data = cpu_to_be64(di->di_goal_data);
1334 str->di_generation = cpu_to_be64(di->di_generation);
1335
1336 str->di_flags = cpu_to_be32(di->di_flags);
1337 str->di_height = cpu_to_be16(di->di_height);
1338 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
1339 !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
1340 GFS2_FORMAT_DE : 0);
1341 str->di_depth = cpu_to_be16(di->di_depth);
1342 str->di_entries = cpu_to_be32(di->di_entries);
1343
1344 str->di_eattr = cpu_to_be64(di->di_eattr);
1345 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1346 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
1347 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
1348}
1349
1350void gfs2_dinode_print(const struct gfs2_inode *ip)
1351{
1352 const struct gfs2_dinode_host *di = &ip->i_di;
1353
1354 printk(KERN_INFO " no_formal_ino = %llu\n",
1355 (unsigned long long)ip->i_no_formal_ino);
1356 printk(KERN_INFO " no_addr = %llu\n",
1357 (unsigned long long)ip->i_no_addr);
1358 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
1359 printk(KERN_INFO " di_blocks = %llu\n",
1360 (unsigned long long)di->di_blocks);
1361 printk(KERN_INFO " di_goal_meta = %llu\n",
1362 (unsigned long long)di->di_goal_meta);
1363 printk(KERN_INFO " di_goal_data = %llu\n",
1364 (unsigned long long)di->di_goal_data);
1365 printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags);
1366 printk(KERN_INFO " di_height = %u\n", di->di_height);
1367 printk(KERN_INFO " di_depth = %u\n", di->di_depth);
1368 printk(KERN_INFO " di_entries = %u\n", di->di_entries);
1369 printk(KERN_INFO " di_eattr = %llu\n",
1370 (unsigned long long)di->di_eattr);
1371}
1372
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index b57f448b15..4517ac82c0 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,17 +10,17 @@
10#ifndef __INODE_DOT_H__ 10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__ 11#define __INODE_DOT_H__
12 12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip) 13static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
14{ 14{
15 return !ip->i_di.di_height; 15 return !ip->i_di.di_height;
16} 16}
17 17
18static inline int gfs2_is_jdata(struct gfs2_inode *ip) 18static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
19{ 19{
20 return ip->i_di.di_flags & GFS2_DIF_JDATA; 20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21} 21}
22 22
23static inline int gfs2_is_dir(struct gfs2_inode *ip) 23static inline int gfs2_is_dir(const struct gfs2_inode *ip)
24{ 24{
25 return S_ISDIR(ip->i_inode.i_mode); 25 return S_ISDIR(ip->i_inode.i_mode);
26} 26}
@@ -32,9 +32,25 @@ static inline void gfs2_set_inode_blocks(struct inode *inode)
32 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); 32 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
33} 33}
34 34
35static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr,
36 u64 no_formal_ino)
37{
38 return ip->i_no_addr == no_addr && ip->i_no_formal_ino == no_formal_ino;
39}
40
41static inline void gfs2_inum_out(const struct gfs2_inode *ip,
42 struct gfs2_dirent *dent)
43{
44 dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
45 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
46}
47
48
35void gfs2_inode_attr_in(struct gfs2_inode *ip); 49void gfs2_inode_attr_in(struct gfs2_inode *ip);
36struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum_host *inum, unsigned type); 50void gfs2_set_iop(struct inode *inode);
37struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum_host *inum); 51struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
52 u64 no_addr, u64 no_formal_ino);
53struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
38 54
39int gfs2_inode_refresh(struct gfs2_inode *ip); 55int gfs2_inode_refresh(struct gfs2_inode *ip);
40 56
@@ -47,12 +63,14 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
47int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 63int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
48 struct gfs2_inode *ip); 64 struct gfs2_inode *ip);
49int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 65int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
50 struct gfs2_inode *ip); 66 const struct gfs2_inode *ip);
51int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); 67int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
52int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 68int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
53int gfs2_glock_nq_atime(struct gfs2_holder *gh); 69int gfs2_glock_nq_atime(struct gfs2_holder *gh);
54int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 70int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
55struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 71struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
72void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
73void gfs2_dinode_print(const struct gfs2_inode *ip);
56 74
57#endif /* __INODE_DOT_H__ */ 75#endif /* __INODE_DOT_H__ */
58 76
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index c305255bfe..542a797ac8 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -174,7 +174,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
174 lp->cur = DLM_LOCK_IV; 174 lp->cur = DLM_LOCK_IV;
175 lp->lvb = NULL; 175 lp->lvb = NULL;
176 lp->hold_null = NULL; 176 lp->hold_null = NULL;
177 init_completion(&lp->ast_wait);
178 INIT_LIST_HEAD(&lp->clist); 177 INIT_LIST_HEAD(&lp->clist);
179 INIT_LIST_HEAD(&lp->blist); 178 INIT_LIST_HEAD(&lp->blist);
180 INIT_LIST_HEAD(&lp->delay_list); 179 INIT_LIST_HEAD(&lp->delay_list);
@@ -399,6 +398,12 @@ static void gdlm_del_lvb(struct gdlm_lock *lp)
399 lp->lksb.sb_lvbptr = NULL; 398 lp->lksb.sb_lvbptr = NULL;
400} 399}
401 400
401static int gdlm_ast_wait(void *word)
402{
403 schedule();
404 return 0;
405}
406
402/* This can do a synchronous dlm request (requiring a lock_dlm thread to get 407/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
403 the completion) because gfs won't call hold_lvb() during a callback (from 408 the completion) because gfs won't call hold_lvb() during a callback (from
404 the context of a lock_dlm thread). */ 409 the context of a lock_dlm thread). */
@@ -424,10 +429,10 @@ static int hold_null_lock(struct gdlm_lock *lp)
424 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE; 429 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
425 set_bit(LFL_NOBAST, &lpn->flags); 430 set_bit(LFL_NOBAST, &lpn->flags);
426 set_bit(LFL_INLOCK, &lpn->flags); 431 set_bit(LFL_INLOCK, &lpn->flags);
432 set_bit(LFL_AST_WAIT, &lpn->flags);
427 433
428 init_completion(&lpn->ast_wait);
429 gdlm_do_lock(lpn); 434 gdlm_do_lock(lpn);
430 wait_for_completion(&lpn->ast_wait); 435 wait_on_bit(&lpn->flags, LFL_AST_WAIT, gdlm_ast_wait, TASK_UNINTERRUPTIBLE);
431 error = lpn->lksb.sb_status; 436 error = lpn->lksb.sb_status;
432 if (error) { 437 if (error) {
433 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n", 438 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index d074c6e6f9..24d70f73b6 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -101,6 +101,7 @@ enum {
101 LFL_NOBAST = 10, 101 LFL_NOBAST = 10,
102 LFL_HEADQUE = 11, 102 LFL_HEADQUE = 11,
103 LFL_UNLOCK_DELETE = 12, 103 LFL_UNLOCK_DELETE = 12,
104 LFL_AST_WAIT = 13,
104}; 105};
105 106
106struct gdlm_lock { 107struct gdlm_lock {
@@ -117,7 +118,6 @@ struct gdlm_lock {
117 unsigned long flags; /* lock_dlm flags LFL_ */ 118 unsigned long flags; /* lock_dlm flags LFL_ */
118 119
119 int bast_mode; /* protected by async_lock */ 120 int bast_mode; /* protected by async_lock */
120 struct completion ast_wait;
121 121
122 struct list_head clist; /* complete */ 122 struct list_head clist; /* complete */
123 struct list_head blist; /* blocking */ 123 struct list_head blist; /* blocking */
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 1d8faa3da8..41c5b04caa 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -147,7 +147,7 @@ static int gdlm_mount(char *table_name, char *host_data,
147 147
148 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), 148 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
149 &ls->dlm_lockspace, 149 &ls->dlm_lockspace,
150 nodir ? DLM_LSFL_NODIR : 0, 150 DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0),
151 GDLM_LVB_SIZE); 151 GDLM_LVB_SIZE);
152 if (error) { 152 if (error) {
153 log_error("dlm_new_lockspace error %d", error); 153 log_error("dlm_new_lockspace error %d", error);
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index f82495e18c..fba1f1d87e 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -242,7 +242,7 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
242 op->info.number = name->ln_number; 242 op->info.number = name->ln_number;
243 op->info.start = fl->fl_start; 243 op->info.start = fl->fl_start;
244 op->info.end = fl->fl_end; 244 op->info.end = fl->fl_end;
245 245 op->info.owner = (__u64)(long) fl->fl_owner;
246 246
247 send_op(op); 247 send_op(op);
248 wait_event(recv_wq, (op->done != 0)); 248 wait_event(recv_wq, (op->done != 0));
@@ -254,16 +254,20 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
254 } 254 }
255 spin_unlock(&ops_lock); 255 spin_unlock(&ops_lock);
256 256
257 /* info.rv from userspace is 1 for conflict, 0 for no-conflict,
258 -ENOENT if there are no locks on the file */
259
257 rv = op->info.rv; 260 rv = op->info.rv;
258 261
259 fl->fl_type = F_UNLCK; 262 fl->fl_type = F_UNLCK;
260 if (rv == -ENOENT) 263 if (rv == -ENOENT)
261 rv = 0; 264 rv = 0;
262 else if (rv == 0 && op->info.pid != fl->fl_pid) { 265 else if (rv > 0) {
263 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; 266 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
264 fl->fl_pid = op->info.pid; 267 fl->fl_pid = op->info.pid;
265 fl->fl_start = op->info.start; 268 fl->fl_start = op->info.start;
266 fl->fl_end = op->info.end; 269 fl->fl_end = op->info.end;
270 rv = 0;
267 } 271 }
268 272
269 kfree(op); 273 kfree(op);
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index 9cf1f168ea..1aca51e450 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -44,6 +44,13 @@ static void process_blocking(struct gdlm_lock *lp, int bast_mode)
44 ls->fscb(ls->sdp, cb, &lp->lockname); 44 ls->fscb(ls->sdp, cb, &lp->lockname);
45} 45}
46 46
47static void wake_up_ast(struct gdlm_lock *lp)
48{
49 clear_bit(LFL_AST_WAIT, &lp->flags);
50 smp_mb__after_clear_bit();
51 wake_up_bit(&lp->flags, LFL_AST_WAIT);
52}
53
47static void process_complete(struct gdlm_lock *lp) 54static void process_complete(struct gdlm_lock *lp)
48{ 55{
49 struct gdlm_ls *ls = lp->ls; 56 struct gdlm_ls *ls = lp->ls;
@@ -136,7 +143,7 @@ static void process_complete(struct gdlm_lock *lp)
136 */ 143 */
137 144
138 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { 145 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
139 complete(&lp->ast_wait); 146 wake_up_ast(lp);
140 return; 147 return;
141 } 148 }
142 149
@@ -214,7 +221,7 @@ out:
214 if (test_bit(LFL_INLOCK, &lp->flags)) { 221 if (test_bit(LFL_INLOCK, &lp->flags)) {
215 clear_bit(LFL_NOBLOCK, &lp->flags); 222 clear_bit(LFL_NOBLOCK, &lp->flags);
216 lp->cur = lp->req; 223 lp->cur = lp->req;
217 complete(&lp->ast_wait); 224 wake_up_ast(lp);
218 return; 225 return;
219 } 226 }
220 227
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 291415ddfe..f49a12e240 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -83,6 +83,11 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
83 83
84 gfs2_assert(sdp, bd->bd_ail == ai); 84 gfs2_assert(sdp, bd->bd_ail == ai);
85 85
86 if (!bh){
87 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
88 continue;
89 }
90
86 if (!buffer_busy(bh)) { 91 if (!buffer_busy(bh)) {
87 if (!buffer_uptodate(bh)) { 92 if (!buffer_uptodate(bh)) {
88 gfs2_log_unlock(sdp); 93 gfs2_log_unlock(sdp);
@@ -125,6 +130,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
125 bd_ail_st_list) { 130 bd_ail_st_list) {
126 bh = bd->bd_bh; 131 bh = bd->bd_bh;
127 132
133 if (!bh){
134 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
135 continue;
136 }
137
128 gfs2_assert(sdp, bd->bd_ail == ai); 138 gfs2_assert(sdp, bd->bd_ail == ai);
129 139
130 if (buffer_busy(bh)) { 140 if (buffer_busy(bh)) {
@@ -262,8 +272,8 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
262 * @sdp: The GFS2 superblock 272 * @sdp: The GFS2 superblock
263 * @blks: The number of blocks to reserve 273 * @blks: The number of blocks to reserve
264 * 274 *
265 * Note that we never give out the last 6 blocks of the journal. Thats 275 * Note that we never give out the last few blocks of the journal. Thats
266 * due to the fact that there is are a small number of header blocks 276 * due to the fact that there is a small number of header blocks
267 * associated with each log flush. The exact number can't be known until 277 * associated with each log flush. The exact number can't be known until
268 * flush time, so we ensure that we have just enough free blocks at all 278 * flush time, so we ensure that we have just enough free blocks at all
269 * times to avoid running out during a log flush. 279 * times to avoid running out during a log flush.
@@ -274,6 +284,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
274int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) 284int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
275{ 285{
276 unsigned int try = 0; 286 unsigned int try = 0;
287 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
277 288
278 if (gfs2_assert_warn(sdp, blks) || 289 if (gfs2_assert_warn(sdp, blks) ||
279 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) 290 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
@@ -281,7 +292,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
281 292
282 mutex_lock(&sdp->sd_log_reserve_mutex); 293 mutex_lock(&sdp->sd_log_reserve_mutex);
283 gfs2_log_lock(sdp); 294 gfs2_log_lock(sdp);
284 while(sdp->sd_log_blks_free <= (blks + 6)) { 295 while(sdp->sd_log_blks_free <= (blks + reserved_blks)) {
285 gfs2_log_unlock(sdp); 296 gfs2_log_unlock(sdp);
286 gfs2_ail1_empty(sdp, 0); 297 gfs2_ail1_empty(sdp, 0);
287 gfs2_log_flush(sdp, NULL); 298 gfs2_log_flush(sdp, NULL);
@@ -357,6 +368,58 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer
357 return dist; 368 return dist;
358} 369}
359 370
371/**
372 * calc_reserved - Calculate the number of blocks to reserve when
373 * refunding a transaction's unused buffers.
374 * @sdp: The GFS2 superblock
375 *
376 * This is complex. We need to reserve room for all our currently used
377 * metadata buffers (e.g. normal file I/O rewriting file time stamps) and
378 * all our journaled data buffers for journaled files (e.g. files in the
379 * meta_fs like rindex, or files for which chattr +j was done.)
380 * If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush
381 * will count it as free space (sd_log_blks_free) and corruption will follow.
382 *
383 * We can have metadata bufs and jdata bufs in the same journal. So each
384 * type gets its own log header, for which we need to reserve a block.
385 * In fact, each type has the potential for needing more than one header
386 * in cases where we have more buffers than will fit on a journal page.
387 * Metadata journal entries take up half the space of journaled buffer entries.
388 * Thus, metadata entries have buf_limit (502) and journaled buffers have
389 * databuf_limit (251) before they cause a wrap around.
390 *
391 * Also, we need to reserve blocks for revoke journal entries and one for an
392 * overall header for the lot.
393 *
394 * Returns: the number of blocks reserved
395 */
396static unsigned int calc_reserved(struct gfs2_sbd *sdp)
397{
398 unsigned int reserved = 0;
399 unsigned int mbuf_limit, metabufhdrs_needed;
400 unsigned int dbuf_limit, databufhdrs_needed;
401 unsigned int revokes = 0;
402
403 mbuf_limit = buf_limit(sdp);
404 metabufhdrs_needed = (sdp->sd_log_commited_buf +
405 (mbuf_limit - 1)) / mbuf_limit;
406 dbuf_limit = databuf_limit(sdp);
407 databufhdrs_needed = (sdp->sd_log_commited_databuf +
408 (dbuf_limit - 1)) / dbuf_limit;
409
410 if (sdp->sd_log_commited_revoke)
411 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
412 sizeof(u64));
413
414 reserved = sdp->sd_log_commited_buf + metabufhdrs_needed +
415 sdp->sd_log_commited_databuf + databufhdrs_needed +
416 revokes;
417 /* One for the overall header */
418 if (reserved)
419 reserved++;
420 return reserved;
421}
422
360static unsigned int current_tail(struct gfs2_sbd *sdp) 423static unsigned int current_tail(struct gfs2_sbd *sdp)
361{ 424{
362 struct gfs2_ail *ai; 425 struct gfs2_ail *ai;
@@ -447,14 +510,14 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
447 return bh; 510 return bh;
448} 511}
449 512
450static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull) 513static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
451{ 514{
452 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); 515 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
453 516
454 ail2_empty(sdp, new_tail); 517 ail2_empty(sdp, new_tail);
455 518
456 gfs2_log_lock(sdp); 519 gfs2_log_lock(sdp);
457 sdp->sd_log_blks_free += dist - (pull ? 1 : 0); 520 sdp->sd_log_blks_free += dist;
458 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); 521 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
459 gfs2_log_unlock(sdp); 522 gfs2_log_unlock(sdp);
460 523
@@ -504,7 +567,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
504 brelse(bh); 567 brelse(bh);
505 568
506 if (sdp->sd_log_tail != tail) 569 if (sdp->sd_log_tail != tail)
507 log_pull_tail(sdp, tail, pull); 570 log_pull_tail(sdp, tail);
508 else 571 else
509 gfs2_assert_withdraw(sdp, !pull); 572 gfs2_assert_withdraw(sdp, !pull);
510 573
@@ -517,6 +580,7 @@ static void log_flush_commit(struct gfs2_sbd *sdp)
517 struct list_head *head = &sdp->sd_log_flush_list; 580 struct list_head *head = &sdp->sd_log_flush_list;
518 struct gfs2_log_buf *lb; 581 struct gfs2_log_buf *lb;
519 struct buffer_head *bh; 582 struct buffer_head *bh;
583 int flushcount = 0;
520 584
521 while (!list_empty(head)) { 585 while (!list_empty(head)) {
522 lb = list_entry(head->next, struct gfs2_log_buf, lb_list); 586 lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
@@ -533,9 +597,20 @@ static void log_flush_commit(struct gfs2_sbd *sdp)
533 } else 597 } else
534 brelse(bh); 598 brelse(bh);
535 kfree(lb); 599 kfree(lb);
600 flushcount++;
536 } 601 }
537 602
538 log_write_header(sdp, 0, 0); 603 /* If nothing was journaled, the header is unplanned and unwanted. */
604 if (flushcount) {
605 log_write_header(sdp, 0, 0);
606 } else {
607 unsigned int tail;
608 tail = current_tail(sdp);
609
610 gfs2_ail1_empty(sdp, 0);
611 if (sdp->sd_log_tail != tail)
612 log_pull_tail(sdp, tail);
613 }
539} 614}
540 615
541/** 616/**
@@ -565,7 +640,10 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
565 INIT_LIST_HEAD(&ai->ai_ail1_list); 640 INIT_LIST_HEAD(&ai->ai_ail1_list);
566 INIT_LIST_HEAD(&ai->ai_ail2_list); 641 INIT_LIST_HEAD(&ai->ai_ail2_list);
567 642
568 gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf == sdp->sd_log_commited_buf); 643 gfs2_assert_withdraw(sdp,
644 sdp->sd_log_num_buf + sdp->sd_log_num_jdata ==
645 sdp->sd_log_commited_buf +
646 sdp->sd_log_commited_databuf);
569 gfs2_assert_withdraw(sdp, 647 gfs2_assert_withdraw(sdp,
570 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); 648 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
571 649
@@ -576,16 +654,19 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
576 lops_before_commit(sdp); 654 lops_before_commit(sdp);
577 if (!list_empty(&sdp->sd_log_flush_list)) 655 if (!list_empty(&sdp->sd_log_flush_list))
578 log_flush_commit(sdp); 656 log_flush_commit(sdp);
579 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle) 657 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
658 gfs2_log_lock(sdp);
659 sdp->sd_log_blks_free--; /* Adjust for unreserved buffer */
660 gfs2_log_unlock(sdp);
580 log_write_header(sdp, 0, PULL); 661 log_write_header(sdp, 0, PULL);
662 }
581 lops_after_commit(sdp, ai); 663 lops_after_commit(sdp, ai);
582 664
583 gfs2_log_lock(sdp); 665 gfs2_log_lock(sdp);
584 sdp->sd_log_head = sdp->sd_log_flush_head; 666 sdp->sd_log_head = sdp->sd_log_flush_head;
585 sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
586 sdp->sd_log_blks_reserved = 0; 667 sdp->sd_log_blks_reserved = 0;
587 sdp->sd_log_commited_buf = 0; 668 sdp->sd_log_commited_buf = 0;
588 sdp->sd_log_num_hdrs = 0; 669 sdp->sd_log_commited_databuf = 0;
589 sdp->sd_log_commited_revoke = 0; 670 sdp->sd_log_commited_revoke = 0;
590 671
591 if (!list_empty(&ai->ai_ail1_list)) { 672 if (!list_empty(&ai->ai_ail1_list)) {
@@ -602,32 +683,26 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
602 683
603static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 684static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
604{ 685{
605 unsigned int reserved = 0; 686 unsigned int reserved;
606 unsigned int old; 687 unsigned int old;
607 688
608 gfs2_log_lock(sdp); 689 gfs2_log_lock(sdp);
609 690
610 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; 691 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
611 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0); 692 sdp->sd_log_commited_databuf += tr->tr_num_databuf_new -
693 tr->tr_num_databuf_rm;
694 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
695 (((int)sdp->sd_log_commited_databuf) >= 0));
612 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 696 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
613 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); 697 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
614 698 reserved = calc_reserved(sdp);
615 if (sdp->sd_log_commited_buf)
616 reserved += sdp->sd_log_commited_buf;
617 if (sdp->sd_log_commited_revoke)
618 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
619 sizeof(u64));
620 if (reserved)
621 reserved++;
622
623 old = sdp->sd_log_blks_free; 699 old = sdp->sd_log_blks_free;
624 sdp->sd_log_blks_free += tr->tr_reserved - 700 sdp->sd_log_blks_free += tr->tr_reserved -
625 (reserved - sdp->sd_log_blks_reserved); 701 (reserved - sdp->sd_log_blks_reserved);
626 702
627 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old); 703 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
628 gfs2_assert_withdraw(sdp, 704 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <=
629 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks + 705 sdp->sd_jdesc->jd_blocks);
630 sdp->sd_log_num_hdrs);
631 706
632 sdp->sd_log_blks_reserved = reserved; 707 sdp->sd_log_blks_reserved = reserved;
633 708
@@ -673,13 +748,13 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
673 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 748 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
674 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); 749 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
675 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); 750 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
676 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
677 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); 751 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
678 752
679 sdp->sd_log_flush_head = sdp->sd_log_head; 753 sdp->sd_log_flush_head = sdp->sd_log_head;
680 sdp->sd_log_flush_wrapped = 0; 754 sdp->sd_log_flush_wrapped = 0;
681 755
682 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0); 756 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT,
757 (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL);
683 758
684 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks); 759 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
685 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); 760 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index f82d84d05d..aff70f0698 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -17,6 +17,7 @@
17 17
18#include "gfs2.h" 18#include "gfs2.h"
19#include "incore.h" 19#include "incore.h"
20#include "inode.h"
20#include "glock.h" 21#include "glock.h"
21#include "log.h" 22#include "log.h"
22#include "lops.h" 23#include "lops.h"
@@ -117,15 +118,13 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
117 struct gfs2_log_descriptor *ld; 118 struct gfs2_log_descriptor *ld;
118 struct gfs2_bufdata *bd1 = NULL, *bd2; 119 struct gfs2_bufdata *bd1 = NULL, *bd2;
119 unsigned int total = sdp->sd_log_num_buf; 120 unsigned int total = sdp->sd_log_num_buf;
120 unsigned int offset = sizeof(struct gfs2_log_descriptor); 121 unsigned int offset = BUF_OFFSET;
121 unsigned int limit; 122 unsigned int limit;
122 unsigned int num; 123 unsigned int num;
123 unsigned n; 124 unsigned n;
124 __be64 *ptr; 125 __be64 *ptr;
125 126
126 offset += sizeof(__be64) - 1; 127 limit = buf_limit(sdp);
127 offset &= ~(sizeof(__be64) - 1);
128 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
129 /* for 4k blocks, limit = 503 */ 128 /* for 4k blocks, limit = 503 */
130 129
131 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); 130 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
@@ -134,7 +133,6 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
134 if (total > limit) 133 if (total > limit)
135 num = limit; 134 num = limit;
136 bh = gfs2_log_get_buf(sdp); 135 bh = gfs2_log_get_buf(sdp);
137 sdp->sd_log_num_hdrs++;
138 ld = (struct gfs2_log_descriptor *)bh->b_data; 136 ld = (struct gfs2_log_descriptor *)bh->b_data;
139 ptr = (__be64 *)(bh->b_data + offset); 137 ptr = (__be64 *)(bh->b_data + offset);
140 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 138 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -469,25 +467,28 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
469 struct gfs2_inode *ip = GFS2_I(mapping->host); 467 struct gfs2_inode *ip = GFS2_I(mapping->host);
470 468
471 gfs2_log_lock(sdp); 469 gfs2_log_lock(sdp);
470 if (!list_empty(&bd->bd_list_tr)) {
471 gfs2_log_unlock(sdp);
472 return;
473 }
472 tr->tr_touched = 1; 474 tr->tr_touched = 1;
473 if (list_empty(&bd->bd_list_tr) && 475 if (gfs2_is_jdata(ip)) {
474 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
475 tr->tr_num_buf++; 476 tr->tr_num_buf++;
476 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 477 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
477 gfs2_log_unlock(sdp);
478 gfs2_pin(sdp, bd->bd_bh);
479 tr->tr_num_buf_new++;
480 } else {
481 gfs2_log_unlock(sdp);
482 } 478 }
479 gfs2_log_unlock(sdp);
480 if (!list_empty(&le->le_list))
481 return;
482
483 gfs2_trans_add_gl(bd->bd_gl); 483 gfs2_trans_add_gl(bd->bd_gl);
484 gfs2_log_lock(sdp); 484 if (gfs2_is_jdata(ip)) {
485 if (list_empty(&le->le_list)) { 485 sdp->sd_log_num_jdata++;
486 if (ip->i_di.di_flags & GFS2_DIF_JDATA) 486 gfs2_pin(sdp, bd->bd_bh);
487 sdp->sd_log_num_jdata++; 487 tr->tr_num_databuf_new++;
488 sdp->sd_log_num_databuf++;
489 list_add(&le->le_list, &sdp->sd_log_le_databuf);
490 } 488 }
489 sdp->sd_log_num_databuf++;
490 gfs2_log_lock(sdp);
491 list_add(&le->le_list, &sdp->sd_log_le_databuf);
491 gfs2_log_unlock(sdp); 492 gfs2_log_unlock(sdp);
492} 493}
493 494
@@ -520,7 +521,6 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
520 LIST_HEAD(started); 521 LIST_HEAD(started);
521 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt; 522 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
522 struct buffer_head *bh = NULL,*bh1 = NULL; 523 struct buffer_head *bh = NULL,*bh1 = NULL;
523 unsigned int offset = sizeof(struct gfs2_log_descriptor);
524 struct gfs2_log_descriptor *ld; 524 struct gfs2_log_descriptor *ld;
525 unsigned int limit; 525 unsigned int limit;
526 unsigned int total_dbuf = sdp->sd_log_num_databuf; 526 unsigned int total_dbuf = sdp->sd_log_num_databuf;
@@ -528,9 +528,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
528 unsigned int num, n; 528 unsigned int num, n;
529 __be64 *ptr = NULL; 529 __be64 *ptr = NULL;
530 530
531 offset += 2*sizeof(__be64) - 1; 531 limit = databuf_limit(sdp);
532 offset &= ~(2*sizeof(__be64) - 1);
533 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
534 532
535 /* 533 /*
536 * Start writing ordered buffers, write journaled buffers 534 * Start writing ordered buffers, write journaled buffers
@@ -581,10 +579,10 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
581 gfs2_log_unlock(sdp); 579 gfs2_log_unlock(sdp);
582 if (!bh) { 580 if (!bh) {
583 bh = gfs2_log_get_buf(sdp); 581 bh = gfs2_log_get_buf(sdp);
584 sdp->sd_log_num_hdrs++;
585 ld = (struct gfs2_log_descriptor *) 582 ld = (struct gfs2_log_descriptor *)
586 bh->b_data; 583 bh->b_data;
587 ptr = (__be64 *)(bh->b_data + offset); 584 ptr = (__be64 *)(bh->b_data +
585 DATABUF_OFFSET);
588 ld->ld_header.mh_magic = 586 ld->ld_header.mh_magic =
589 cpu_to_be32(GFS2_MAGIC); 587 cpu_to_be32(GFS2_MAGIC);
590 ld->ld_header.mh_type = 588 ld->ld_header.mh_type =
@@ -605,7 +603,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
605 if (unlikely(magic != 0)) 603 if (unlikely(magic != 0))
606 set_buffer_escaped(bh1); 604 set_buffer_escaped(bh1);
607 gfs2_log_lock(sdp); 605 gfs2_log_lock(sdp);
608 if (n++ > num) 606 if (++n >= num)
609 break; 607 break;
610 } else if (!bh1) { 608 } else if (!bh1) {
611 total_dbuf--; 609 total_dbuf--;
@@ -622,6 +620,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
622 } 620 }
623 gfs2_log_unlock(sdp); 621 gfs2_log_unlock(sdp);
624 if (bh) { 622 if (bh) {
623 set_buffer_mapped(bh);
625 set_buffer_dirty(bh); 624 set_buffer_dirty(bh);
626 ll_rw_block(WRITE, 1, &bh); 625 ll_rw_block(WRITE, 1, &bh);
627 bh = NULL; 626 bh = NULL;
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 965bc65c7c..41a00df755 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -13,6 +13,13 @@
13#include <linux/list.h> 13#include <linux/list.h>
14#include "incore.h" 14#include "incore.h"
15 15
16#define BUF_OFFSET \
17 ((sizeof(struct gfs2_log_descriptor) + sizeof(__be64) - 1) & \
18 ~(sizeof(__be64) - 1))
19#define DATABUF_OFFSET \
20 ((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \
21 ~(2 * sizeof(__be64) - 1))
22
16extern const struct gfs2_log_operations gfs2_glock_lops; 23extern const struct gfs2_log_operations gfs2_glock_lops;
17extern const struct gfs2_log_operations gfs2_buf_lops; 24extern const struct gfs2_log_operations gfs2_buf_lops;
18extern const struct gfs2_log_operations gfs2_revoke_lops; 25extern const struct gfs2_log_operations gfs2_revoke_lops;
@@ -21,6 +28,22 @@ extern const struct gfs2_log_operations gfs2_databuf_lops;
21 28
22extern const struct gfs2_log_operations *gfs2_log_ops[]; 29extern const struct gfs2_log_operations *gfs2_log_ops[];
23 30
31static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
32{
33 unsigned int limit;
34
35 limit = (sdp->sd_sb.sb_bsize - BUF_OFFSET) / sizeof(__be64);
36 return limit;
37}
38
39static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
40{
41 unsigned int limit;
42
43 limit = (sdp->sd_sb.sb_bsize - DATABUF_OFFSET) / (2 * sizeof(__be64));
44 return limit;
45}
46
24static inline void lops_init_le(struct gfs2_log_element *le, 47static inline void lops_init_le(struct gfs2_log_element *le,
25 const struct gfs2_log_operations *lops) 48 const struct gfs2_log_operations *lops)
26{ 49{
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index e62d4f620c..8da343b34a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -387,12 +387,18 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
387 387
388 if (test_clear_buffer_pinned(bh)) { 388 if (test_clear_buffer_pinned(bh)) {
389 struct gfs2_trans *tr = current->journal_info; 389 struct gfs2_trans *tr = current->journal_info;
390 struct gfs2_inode *bh_ip =
391 GFS2_I(bh->b_page->mapping->host);
392
390 gfs2_log_lock(sdp); 393 gfs2_log_lock(sdp);
391 list_del_init(&bd->bd_le.le_list); 394 list_del_init(&bd->bd_le.le_list);
392 gfs2_assert_warn(sdp, sdp->sd_log_num_buf); 395 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
393 sdp->sd_log_num_buf--; 396 sdp->sd_log_num_buf--;
394 gfs2_log_unlock(sdp); 397 gfs2_log_unlock(sdp);
395 tr->tr_num_buf_rm++; 398 if (bh_ip->i_inode.i_private != NULL)
399 tr->tr_num_databuf_rm++;
400 else
401 tr->tr_num_buf_rm++;
396 brelse(bh); 402 brelse(bh);
397 } 403 }
398 if (bd) { 404 if (bd) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index e037425bc0..527bf19d96 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -63,7 +63,7 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
63static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, 63static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
64 struct buffer_head **bhp) 64 struct buffer_head **bhp)
65{ 65{
66 return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp); 66 return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, 0, bhp);
67} 67}
68 68
69struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); 69struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 4864659555..6f006a804d 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -82,20 +82,19 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
82 char *options, *o, *v; 82 char *options, *o, *v;
83 int error = 0; 83 int error = 0;
84 84
85 if (!remount) { 85 /* If someone preloaded options, use those instead */
86 /* If someone preloaded options, use those instead */ 86 spin_lock(&gfs2_sys_margs_lock);
87 spin_lock(&gfs2_sys_margs_lock); 87 if (!remount && gfs2_sys_margs) {
88 if (gfs2_sys_margs) { 88 data = gfs2_sys_margs;
89 data = gfs2_sys_margs; 89 gfs2_sys_margs = NULL;
90 gfs2_sys_margs = NULL;
91 }
92 spin_unlock(&gfs2_sys_margs_lock);
93
94 /* Set some defaults */
95 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
96 args->ar_quota = GFS2_QUOTA_DEFAULT;
97 args->ar_data = GFS2_DATA_DEFAULT;
98 } 90 }
91 spin_unlock(&gfs2_sys_margs_lock);
92
93 /* Set some defaults */
94 memset(args, 0, sizeof(struct gfs2_args));
95 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
96 args->ar_quota = GFS2_QUOTA_DEFAULT;
97 args->ar_data = GFS2_DATA_DEFAULT;
99 98
100 /* Split the options into tokens with the "," character and 99 /* Split the options into tokens with the "," character and
101 process them */ 100 process them */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
deleted file mode 100644
index d9ecfd23a4..0000000000
--- a/fs/gfs2/ondisk.c
+++ /dev/null
@@ -1,251 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14
15#include "gfs2.h"
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include "incore.h"
19
20#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
21 struct->member);
22
23/*
24 * gfs2_xxx_in - read in an xxx struct
25 * first arg: the cpu-order structure
26 * buf: the disk-order buffer
27 *
28 * gfs2_xxx_out - write out an xxx struct
29 * first arg: the cpu-order structure
30 * buf: the disk-order buffer
31 *
32 * gfs2_xxx_print - print out an xxx struct
33 * first arg: the cpu-order structure
34 */
35
36void gfs2_inum_in(struct gfs2_inum_host *no, const void *buf)
37{
38 const struct gfs2_inum *str = buf;
39
40 no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
41 no->no_addr = be64_to_cpu(str->no_addr);
42}
43
44void gfs2_inum_out(const struct gfs2_inum_host *no, void *buf)
45{
46 struct gfs2_inum *str = buf;
47
48 str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
49 str->no_addr = cpu_to_be64(no->no_addr);
50}
51
52static void gfs2_inum_print(const struct gfs2_inum_host *no)
53{
54 printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
55 printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr);
56}
57
58static void gfs2_meta_header_in(struct gfs2_meta_header_host *mh, const void *buf)
59{
60 const struct gfs2_meta_header *str = buf;
61
62 mh->mh_magic = be32_to_cpu(str->mh_magic);
63 mh->mh_type = be32_to_cpu(str->mh_type);
64 mh->mh_format = be32_to_cpu(str->mh_format);
65}
66
67void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
68{
69 const struct gfs2_sb *str = buf;
70
71 gfs2_meta_header_in(&sb->sb_header, buf);
72
73 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
74 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
75 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
76 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
77
78 gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
79 gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
80
81 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
82 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
83}
84
85void gfs2_rindex_in(struct gfs2_rindex_host *ri, const void *buf)
86{
87 const struct gfs2_rindex *str = buf;
88
89 ri->ri_addr = be64_to_cpu(str->ri_addr);
90 ri->ri_length = be32_to_cpu(str->ri_length);
91 ri->ri_data0 = be64_to_cpu(str->ri_data0);
92 ri->ri_data = be32_to_cpu(str->ri_data);
93 ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
94
95}
96
97void gfs2_rindex_print(const struct gfs2_rindex_host *ri)
98{
99 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
100 pv(ri, ri_length, "%u");
101
102 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
103 pv(ri, ri_data, "%u");
104
105 pv(ri, ri_bitbytes, "%u");
106}
107
108void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf)
109{
110 const struct gfs2_rgrp *str = buf;
111
112 rg->rg_flags = be32_to_cpu(str->rg_flags);
113 rg->rg_free = be32_to_cpu(str->rg_free);
114 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
115 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
116}
117
118void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf)
119{
120 struct gfs2_rgrp *str = buf;
121
122 str->rg_flags = cpu_to_be32(rg->rg_flags);
123 str->rg_free = cpu_to_be32(rg->rg_free);
124 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
125 str->__pad = cpu_to_be32(0);
126 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
127 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
128}
129
130void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
131{
132 const struct gfs2_quota *str = buf;
133
134 qu->qu_limit = be64_to_cpu(str->qu_limit);
135 qu->qu_warn = be64_to_cpu(str->qu_warn);
136 qu->qu_value = be64_to_cpu(str->qu_value);
137}
138
139void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
140{
141 const struct gfs2_dinode_host *di = &ip->i_di;
142 struct gfs2_dinode *str = buf;
143
144 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
145 str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
146 str->di_header.__pad0 = 0;
147 str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
148 str->di_header.__pad1 = 0;
149
150 gfs2_inum_out(&ip->i_num, &str->di_num);
151
152 str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
153 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
154 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
155 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
156 str->di_size = cpu_to_be64(di->di_size);
157 str->di_blocks = cpu_to_be64(di->di_blocks);
158 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
159 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
160 str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
161
162 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
163 str->di_goal_data = cpu_to_be64(di->di_goal_data);
164 str->di_generation = cpu_to_be64(di->di_generation);
165
166 str->di_flags = cpu_to_be32(di->di_flags);
167 str->di_height = cpu_to_be16(di->di_height);
168 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
169 !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
170 GFS2_FORMAT_DE : 0);
171 str->di_depth = cpu_to_be16(di->di_depth);
172 str->di_entries = cpu_to_be32(di->di_entries);
173
174 str->di_eattr = cpu_to_be64(di->di_eattr);
175}
176
177void gfs2_dinode_print(const struct gfs2_inode *ip)
178{
179 const struct gfs2_dinode_host *di = &ip->i_di;
180
181 gfs2_inum_print(&ip->i_num);
182
183 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
184 printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks);
185 printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
186 printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
187
188 pv(di, di_flags, "0x%.8X");
189 pv(di, di_height, "%u");
190
191 pv(di, di_depth, "%u");
192 pv(di, di_entries, "%u");
193
194 printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr);
195}
196
197void gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf)
198{
199 const struct gfs2_log_header *str = buf;
200
201 gfs2_meta_header_in(&lh->lh_header, buf);
202 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
203 lh->lh_flags = be32_to_cpu(str->lh_flags);
204 lh->lh_tail = be32_to_cpu(str->lh_tail);
205 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
206 lh->lh_hash = be32_to_cpu(str->lh_hash);
207}
208
209void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
210{
211 const struct gfs2_inum_range *str = buf;
212
213 ir->ir_start = be64_to_cpu(str->ir_start);
214 ir->ir_length = be64_to_cpu(str->ir_length);
215}
216
217void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
218{
219 struct gfs2_inum_range *str = buf;
220
221 str->ir_start = cpu_to_be64(ir->ir_start);
222 str->ir_length = cpu_to_be64(ir->ir_length);
223}
224
225void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
226{
227 const struct gfs2_statfs_change *str = buf;
228
229 sc->sc_total = be64_to_cpu(str->sc_total);
230 sc->sc_free = be64_to_cpu(str->sc_free);
231 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
232}
233
234void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf)
235{
236 struct gfs2_statfs_change *str = buf;
237
238 str->sc_total = cpu_to_be64(sc->sc_total);
239 str->sc_free = cpu_to_be64(sc->sc_free);
240 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
241}
242
243void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
244{
245 const struct gfs2_quota_change *str = buf;
246
247 qc->qc_change = be64_to_cpu(str->qc_change);
248 qc->qc_flags = be32_to_cpu(str->qc_flags);
249 qc->qc_id = be32_to_cpu(str->qc_id);
250}
251
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 30c1562217..26c888890c 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -32,6 +32,7 @@
32#include "trans.h" 32#include "trans.h"
33#include "rgrp.h" 33#include "rgrp.h"
34#include "ops_file.h" 34#include "ops_file.h"
35#include "super.h"
35#include "util.h" 36#include "util.h"
36#include "glops.h" 37#include "glops.h"
37 38
@@ -49,6 +50,8 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
49 end = start + bsize; 50 end = start + bsize;
50 if (end <= from || start >= to) 51 if (end <= from || start >= to)
51 continue; 52 continue;
53 if (gfs2_is_jdata(ip))
54 set_buffer_uptodate(bh);
52 gfs2_trans_add_bh(ip->i_gl, bh, 0); 55 gfs2_trans_add_bh(ip->i_gl, bh, 0);
53 } 56 }
54} 57}
@@ -134,7 +137,9 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
134 return 0; /* don't care */ 137 return 0; /* don't care */
135 } 138 }
136 139
137 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) { 140 if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) &&
141 PageChecked(page)) {
142 ClearPageChecked(page);
138 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); 143 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
139 if (error) 144 if (error)
140 goto out_ignore; 145 goto out_ignore;
@@ -203,11 +208,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
203 * so we need to supply one here. It doesn't happen often. 208 * so we need to supply one here. It doesn't happen often.
204 */ 209 */
205 if (unlikely(page->index)) { 210 if (unlikely(page->index)) {
206 kaddr = kmap_atomic(page, KM_USER0); 211 zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
207 memset(kaddr, 0, PAGE_CACHE_SIZE);
208 kunmap_atomic(kaddr, KM_USER0);
209 flush_dcache_page(page);
210 SetPageUptodate(page);
211 return 0; 212 return 0;
212 } 213 }
213 214
@@ -450,6 +451,31 @@ out_uninit:
450} 451}
451 452
452/** 453/**
454 * adjust_fs_space - Adjusts the free space available due to gfs2_grow
455 * @inode: the rindex inode
456 */
457static void adjust_fs_space(struct inode *inode)
458{
459 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
460 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
461 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
462 u64 fs_total, new_free;
463
464 /* Total up the file system space, according to the latest rindex. */
465 fs_total = gfs2_ri_total(sdp);
466
467 spin_lock(&sdp->sd_statfs_spin);
468 if (fs_total > (m_sc->sc_total + l_sc->sc_total))
469 new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
470 else
471 new_free = 0;
472 spin_unlock(&sdp->sd_statfs_spin);
473 fs_warn(sdp, "File system extended by %llu blocks.\n",
474 (unsigned long long)new_free);
475 gfs2_statfs_change(sdp, new_free, new_free, 0);
476}
477
478/**
453 * gfs2_commit_write - Commit write to a file 479 * gfs2_commit_write - Commit write to a file
454 * @file: The file to write to 480 * @file: The file to write to
455 * @page: The page containing the data 481 * @page: The page containing the data
@@ -511,6 +537,9 @@ static int gfs2_commit_write(struct file *file, struct page *page,
511 di->di_size = cpu_to_be64(inode->i_size); 537 di->di_size = cpu_to_be64(inode->i_size);
512 } 538 }
513 539
540 if (inode == sdp->sd_rindex)
541 adjust_fs_space(inode);
542
514 brelse(dibh); 543 brelse(dibh);
515 gfs2_trans_end(sdp); 544 gfs2_trans_end(sdp);
516 if (al->al_requested) { 545 if (al->al_requested) {
@@ -543,6 +572,23 @@ fail_nounlock:
543} 572}
544 573
545/** 574/**
575 * gfs2_set_page_dirty - Page dirtying function
576 * @page: The page to dirty
577 *
578 * Returns: 1 if it dirtyed the page, or 0 otherwise
579 */
580
581static int gfs2_set_page_dirty(struct page *page)
582{
583 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
584 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
585
586 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
587 SetPageChecked(page);
588 return __set_page_dirty_buffers(page);
589}
590
591/**
546 * gfs2_bmap - Block map function 592 * gfs2_bmap - Block map function
547 * @mapping: Address space info 593 * @mapping: Address space info
548 * @lblock: The block to map 594 * @lblock: The block to map
@@ -578,6 +624,8 @@ static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
578 if (bd) { 624 if (bd) {
579 bd->bd_bh = NULL; 625 bd->bd_bh = NULL;
580 bh->b_private = NULL; 626 bh->b_private = NULL;
627 if (!bd->bd_ail && list_empty(&bd->bd_le.le_list))
628 kmem_cache_free(gfs2_bufdata_cachep, bd);
581 } 629 }
582 gfs2_log_unlock(sdp); 630 gfs2_log_unlock(sdp);
583 631
@@ -598,6 +646,8 @@ static void gfs2_invalidatepage(struct page *page, unsigned long offset)
598 unsigned int curr_off = 0; 646 unsigned int curr_off = 0;
599 647
600 BUG_ON(!PageLocked(page)); 648 BUG_ON(!PageLocked(page));
649 if (offset == 0)
650 ClearPageChecked(page);
601 if (!page_has_buffers(page)) 651 if (!page_has_buffers(page))
602 return; 652 return;
603 653
@@ -728,8 +778,8 @@ static unsigned limit = 0;
728 return; 778 return;
729 779
730 fs_warn(sdp, "ip = %llu %llu\n", 780 fs_warn(sdp, "ip = %llu %llu\n",
731 (unsigned long long)ip->i_num.no_formal_ino, 781 (unsigned long long)ip->i_no_formal_ino,
732 (unsigned long long)ip->i_num.no_addr); 782 (unsigned long long)ip->i_no_addr);
733 783
734 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) 784 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
735 fs_warn(sdp, "ip->i_cache[%u] = %s\n", 785 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
@@ -810,6 +860,7 @@ const struct address_space_operations gfs2_file_aops = {
810 .sync_page = block_sync_page, 860 .sync_page = block_sync_page,
811 .prepare_write = gfs2_prepare_write, 861 .prepare_write = gfs2_prepare_write,
812 .commit_write = gfs2_commit_write, 862 .commit_write = gfs2_commit_write,
863 .set_page_dirty = gfs2_set_page_dirty,
813 .bmap = gfs2_bmap, 864 .bmap = gfs2_bmap,
814 .invalidatepage = gfs2_invalidatepage, 865 .invalidatepage = gfs2_invalidatepage,
815 .releasepage = gfs2_releasepage, 866 .releasepage = gfs2_releasepage,
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
index 35aaee4aa7..fa1b5b3d28 100644
--- a/fs/gfs2/ops_address.h
+++ b/fs/gfs2/ops_address.h
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index a6fdc52f55..793e334d09 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -21,6 +21,7 @@
21#include "glock.h" 21#include "glock.h"
22#include "ops_dentry.h" 22#include "ops_dentry.h"
23#include "util.h" 23#include "util.h"
24#include "inode.h"
24 25
25/** 26/**
26 * gfs2_drevalidate - Check directory lookup consistency 27 * gfs2_drevalidate - Check directory lookup consistency
@@ -40,14 +41,15 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
40 struct gfs2_inode *dip = GFS2_I(parent->d_inode); 41 struct gfs2_inode *dip = GFS2_I(parent->d_inode);
41 struct inode *inode = dentry->d_inode; 42 struct inode *inode = dentry->d_inode;
42 struct gfs2_holder d_gh; 43 struct gfs2_holder d_gh;
43 struct gfs2_inode *ip; 44 struct gfs2_inode *ip = NULL;
44 struct gfs2_inum_host inum;
45 unsigned int type;
46 int error; 45 int error;
47 int had_lock=0; 46 int had_lock=0;
48 47
49 if (inode && is_bad_inode(inode)) 48 if (inode) {
50 goto invalid; 49 if (is_bad_inode(inode))
50 goto invalid;
51 ip = GFS2_I(inode);
52 }
51 53
52 if (sdp->sd_args.ar_localcaching) 54 if (sdp->sd_args.ar_localcaching)
53 goto valid; 55 goto valid;
@@ -59,7 +61,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
59 goto fail; 61 goto fail;
60 } 62 }
61 63
62 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type); 64 error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip);
63 switch (error) { 65 switch (error) {
64 case 0: 66 case 0:
65 if (!inode) 67 if (!inode)
@@ -73,16 +75,6 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
73 goto fail_gunlock; 75 goto fail_gunlock;
74 } 76 }
75 77
76 ip = GFS2_I(inode);
77
78 if (!gfs2_inum_equal(&ip->i_num, &inum))
79 goto invalid_gunlock;
80
81 if (IF2DT(ip->i_inode.i_mode) != type) {
82 gfs2_consist_inode(dip);
83 goto fail_gunlock;
84 }
85
86valid_gunlock: 78valid_gunlock:
87 if (!had_lock) 79 if (!had_lock)
88 gfs2_glock_dq_uninit(&d_gh); 80 gfs2_glock_dq_uninit(&d_gh);
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index aad918337a..b8312edee0 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -11,6 +11,7 @@
11#include <linux/spinlock.h> 11#include <linux/spinlock.h>
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/exportfs.h>
14#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 16#include <linux/crc32.h>
16#include <linux/lm_interface.h> 17#include <linux/lm_interface.h>
@@ -22,10 +23,14 @@
22#include "glops.h" 23#include "glops.h"
23#include "inode.h" 24#include "inode.h"
24#include "ops_dentry.h" 25#include "ops_dentry.h"
25#include "ops_export.h" 26#include "ops_fstype.h"
26#include "rgrp.h" 27#include "rgrp.h"
27#include "util.h" 28#include "util.h"
28 29
30#define GFS2_SMALL_FH_SIZE 4
31#define GFS2_LARGE_FH_SIZE 8
32#define GFS2_OLD_FH_SIZE 10
33
29static struct dentry *gfs2_decode_fh(struct super_block *sb, 34static struct dentry *gfs2_decode_fh(struct super_block *sb,
30 __u32 *p, 35 __u32 *p,
31 int fh_len, 36 int fh_len,
@@ -35,31 +40,28 @@ static struct dentry *gfs2_decode_fh(struct super_block *sb,
35 void *context) 40 void *context)
36{ 41{
37 __be32 *fh = (__force __be32 *)p; 42 __be32 *fh = (__force __be32 *)p;
38 struct gfs2_fh_obj fh_obj; 43 struct gfs2_inum_host inum, parent;
39 struct gfs2_inum_host *this, parent;
40 44
41 this = &fh_obj.this;
42 fh_obj.imode = DT_UNKNOWN;
43 memset(&parent, 0, sizeof(struct gfs2_inum)); 45 memset(&parent, 0, sizeof(struct gfs2_inum));
44 46
45 switch (fh_len) { 47 switch (fh_len) {
46 case GFS2_LARGE_FH_SIZE: 48 case GFS2_LARGE_FH_SIZE:
49 case GFS2_OLD_FH_SIZE:
47 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; 50 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
48 parent.no_formal_ino |= be32_to_cpu(fh[5]); 51 parent.no_formal_ino |= be32_to_cpu(fh[5]);
49 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; 52 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
50 parent.no_addr |= be32_to_cpu(fh[7]); 53 parent.no_addr |= be32_to_cpu(fh[7]);
51 fh_obj.imode = be32_to_cpu(fh[8]);
52 case GFS2_SMALL_FH_SIZE: 54 case GFS2_SMALL_FH_SIZE:
53 this->no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; 55 inum.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
54 this->no_formal_ino |= be32_to_cpu(fh[1]); 56 inum.no_formal_ino |= be32_to_cpu(fh[1]);
55 this->no_addr = ((u64)be32_to_cpu(fh[2])) << 32; 57 inum.no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
56 this->no_addr |= be32_to_cpu(fh[3]); 58 inum.no_addr |= be32_to_cpu(fh[3]);
57 break; 59 break;
58 default: 60 default:
59 return NULL; 61 return NULL;
60 } 62 }
61 63
62 return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent, 64 return gfs2_export_ops.find_exported_dentry(sb, &inum, &parent,
63 acceptable, context); 65 acceptable, context);
64} 66}
65 67
@@ -75,10 +77,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
75 (connectable && *len < GFS2_LARGE_FH_SIZE)) 77 (connectable && *len < GFS2_LARGE_FH_SIZE))
76 return 255; 78 return 255;
77 79
78 fh[0] = cpu_to_be32(ip->i_num.no_formal_ino >> 32); 80 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
79 fh[1] = cpu_to_be32(ip->i_num.no_formal_ino & 0xFFFFFFFF); 81 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
80 fh[2] = cpu_to_be32(ip->i_num.no_addr >> 32); 82 fh[2] = cpu_to_be32(ip->i_no_addr >> 32);
81 fh[3] = cpu_to_be32(ip->i_num.no_addr & 0xFFFFFFFF); 83 fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
82 *len = GFS2_SMALL_FH_SIZE; 84 *len = GFS2_SMALL_FH_SIZE;
83 85
84 if (!connectable || inode == sb->s_root->d_inode) 86 if (!connectable || inode == sb->s_root->d_inode)
@@ -90,13 +92,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
90 igrab(inode); 92 igrab(inode);
91 spin_unlock(&dentry->d_lock); 93 spin_unlock(&dentry->d_lock);
92 94
93 fh[4] = cpu_to_be32(ip->i_num.no_formal_ino >> 32); 95 fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32);
94 fh[5] = cpu_to_be32(ip->i_num.no_formal_ino & 0xFFFFFFFF); 96 fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
95 fh[6] = cpu_to_be32(ip->i_num.no_addr >> 32); 97 fh[6] = cpu_to_be32(ip->i_no_addr >> 32);
96 fh[7] = cpu_to_be32(ip->i_num.no_addr & 0xFFFFFFFF); 98 fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
97
98 fh[8] = cpu_to_be32(inode->i_mode);
99 fh[9] = 0; /* pad to double word */
100 *len = GFS2_LARGE_FH_SIZE; 99 *len = GFS2_LARGE_FH_SIZE;
101 100
102 iput(inode); 101 iput(inode);
@@ -144,7 +143,8 @@ static int gfs2_get_name(struct dentry *parent, char *name,
144 ip = GFS2_I(inode); 143 ip = GFS2_I(inode);
145 144
146 *name = 0; 145 *name = 0;
147 gnfd.inum = ip->i_num; 146 gnfd.inum.no_addr = ip->i_no_addr;
147 gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
148 gnfd.name = name; 148 gnfd.name = name;
149 149
150 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); 150 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
@@ -192,8 +192,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
192static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) 192static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
193{ 193{
194 struct gfs2_sbd *sdp = sb->s_fs_info; 194 struct gfs2_sbd *sdp = sb->s_fs_info;
195 struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj; 195 struct gfs2_inum_host *inum = inum_obj;
196 struct gfs2_inum_host *inum = &fh_obj->this;
197 struct gfs2_holder i_gh, ri_gh, rgd_gh; 196 struct gfs2_holder i_gh, ri_gh, rgd_gh;
198 struct gfs2_rgrpd *rgd; 197 struct gfs2_rgrpd *rgd;
199 struct inode *inode; 198 struct inode *inode;
@@ -202,9 +201,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
202 201
203 /* System files? */ 202 /* System files? */
204 203
205 inode = gfs2_ilookup(sb, inum); 204 inode = gfs2_ilookup(sb, inum->no_addr);
206 if (inode) { 205 if (inode) {
207 if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) { 206 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
208 iput(inode); 207 iput(inode);
209 return ERR_PTR(-ESTALE); 208 return ERR_PTR(-ESTALE);
210 } 209 }
@@ -236,7 +235,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
236 gfs2_glock_dq_uninit(&rgd_gh); 235 gfs2_glock_dq_uninit(&rgd_gh);
237 gfs2_glock_dq_uninit(&ri_gh); 236 gfs2_glock_dq_uninit(&ri_gh);
238 237
239 inode = gfs2_inode_lookup(sb, inum, fh_obj->imode); 238 inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
239 inum->no_addr,
240 0);
240 if (!inode) 241 if (!inode)
241 goto fail; 242 goto fail;
242 if (IS_ERR(inode)) { 243 if (IS_ERR(inode)) {
@@ -250,6 +251,15 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
250 goto fail; 251 goto fail;
251 } 252 }
252 253
254 /* Pick up the works we bypass in gfs2_inode_lookup */
255 if (inode->i_state & I_NEW)
256 gfs2_set_iop(inode);
257
258 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
259 iput(inode);
260 goto fail;
261 }
262
253 error = -EIO; 263 error = -EIO;
254 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { 264 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
255 iput(inode); 265 iput(inode);
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
deleted file mode 100644
index f925a955b3..0000000000
--- a/fs/gfs2/ops_export.h
+++ /dev/null
@@ -1,22 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
13#define GFS2_SMALL_FH_SIZE 4
14#define GFS2_LARGE_FH_SIZE 10
15
16extern struct export_operations gfs2_export_ops;
17struct gfs2_fh_obj {
18 struct gfs2_inum_host this;
19 __u32 imode;
20};
21
22#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 064df88045..196d83266e 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -502,7 +502,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
502 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 502 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
503 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); 503 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
504 struct lm_lockname name = 504 struct lm_lockname name =
505 { .ln_number = ip->i_num.no_addr, 505 { .ln_number = ip->i_no_addr,
506 .ln_type = LM_TYPE_PLOCK }; 506 .ln_type = LM_TYPE_PLOCK };
507 507
508 if (!(fl->fl_flags & FL_POSIX)) 508 if (!(fl->fl_flags & FL_POSIX))
@@ -557,7 +557,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
557 gfs2_glock_dq_uninit(fl_gh); 557 gfs2_glock_dq_uninit(fl_gh);
558 } else { 558 } else {
559 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), 559 error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
560 ip->i_num.no_addr, &gfs2_flock_glops, 560 ip->i_no_addr, &gfs2_flock_glops,
561 CREATE, &gl); 561 CREATE, &gl);
562 if (error) 562 if (error)
563 goto out; 563 goto out;
@@ -635,7 +635,6 @@ const struct file_operations gfs2_file_fops = {
635 .release = gfs2_close, 635 .release = gfs2_close,
636 .fsync = gfs2_fsync, 636 .fsync = gfs2_fsync,
637 .lock = gfs2_lock, 637 .lock = gfs2_lock,
638 .sendfile = generic_file_sendfile,
639 .flock = gfs2_flock, 638 .flock = gfs2_flock,
640 .splice_read = generic_file_splice_read, 639 .splice_read = generic_file_splice_read,
641 .splice_write = generic_file_splice_write, 640 .splice_write = generic_file_splice_write,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 2c5f8e7def..cf5aa50505 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -27,7 +27,6 @@
27#include "inode.h" 27#include "inode.h"
28#include "lm.h" 28#include "lm.h"
29#include "mount.h" 29#include "mount.h"
30#include "ops_export.h"
31#include "ops_fstype.h" 30#include "ops_fstype.h"
32#include "ops_super.h" 31#include "ops_super.h"
33#include "recovery.h" 32#include "recovery.h"
@@ -105,6 +104,7 @@ static void init_vfs(struct super_block *sb, unsigned noatime)
105 sb->s_magic = GFS2_MAGIC; 104 sb->s_magic = GFS2_MAGIC;
106 sb->s_op = &gfs2_super_ops; 105 sb->s_op = &gfs2_super_ops;
107 sb->s_export_op = &gfs2_export_ops; 106 sb->s_export_op = &gfs2_export_ops;
107 sb->s_time_gran = 1;
108 sb->s_maxbytes = MAX_LFS_FILESIZE; 108 sb->s_maxbytes = MAX_LFS_FILESIZE;
109 109
110 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) 110 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
@@ -116,7 +116,6 @@ static void init_vfs(struct super_block *sb, unsigned noatime)
116 116
117static int init_names(struct gfs2_sbd *sdp, int silent) 117static int init_names(struct gfs2_sbd *sdp, int silent)
118{ 118{
119 struct page *page;
120 char *proto, *table; 119 char *proto, *table;
121 int error = 0; 120 int error = 0;
122 121
@@ -126,14 +125,9 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
126 /* Try to autodetect */ 125 /* Try to autodetect */
127 126
128 if (!proto[0] || !table[0]) { 127 if (!proto[0] || !table[0]) {
129 struct gfs2_sb *sb; 128 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
130 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); 129 if (error)
131 if (!page) 130 return error;
132 return -ENOBUFS;
133 sb = kmap(page);
134 gfs2_sb_in(&sdp->sd_sb, sb);
135 kunmap(page);
136 __free_page(page);
137 131
138 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); 132 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
139 if (error) 133 if (error)
@@ -151,6 +145,9 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
151 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto); 145 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
152 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table); 146 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
153 147
148 while ((table = strchr(sdp->sd_table_name, '/')))
149 *table = '_';
150
154out: 151out:
155 return error; 152 return error;
156} 153}
@@ -236,17 +233,17 @@ fail:
236 return error; 233 return error;
237} 234}
238 235
239static struct inode *gfs2_lookup_root(struct super_block *sb, 236static inline struct inode *gfs2_lookup_root(struct super_block *sb,
240 struct gfs2_inum_host *inum) 237 u64 no_addr)
241{ 238{
242 return gfs2_inode_lookup(sb, inum, DT_DIR); 239 return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
243} 240}
244 241
245static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) 242static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
246{ 243{
247 struct super_block *sb = sdp->sd_vfs; 244 struct super_block *sb = sdp->sd_vfs;
248 struct gfs2_holder sb_gh; 245 struct gfs2_holder sb_gh;
249 struct gfs2_inum_host *inum; 246 u64 no_addr;
250 struct inode *inode; 247 struct inode *inode;
251 int error = 0; 248 int error = 0;
252 249
@@ -289,10 +286,10 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
289 sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); 286 sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
290 287
291 /* Get the root inode */ 288 /* Get the root inode */
292 inum = &sdp->sd_sb.sb_root_dir; 289 no_addr = sdp->sd_sb.sb_root_dir.no_addr;
293 if (sb->s_type == &gfs2meta_fs_type) 290 if (sb->s_type == &gfs2meta_fs_type)
294 inum = &sdp->sd_sb.sb_master_dir; 291 no_addr = sdp->sd_sb.sb_master_dir.no_addr;
295 inode = gfs2_lookup_root(sb, inum); 292 inode = gfs2_lookup_root(sb, no_addr);
296 if (IS_ERR(inode)) { 293 if (IS_ERR(inode)) {
297 error = PTR_ERR(inode); 294 error = PTR_ERR(inode);
298 fs_err(sdp, "can't read in root inode: %d\n", error); 295 fs_err(sdp, "can't read in root inode: %d\n", error);
@@ -449,7 +446,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
449 if (undo) 446 if (undo)
450 goto fail_qinode; 447 goto fail_qinode;
451 448
452 inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir); 449 inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr);
453 if (IS_ERR(inode)) { 450 if (IS_ERR(inode)) {
454 error = PTR_ERR(inode); 451 error = PTR_ERR(inode);
455 fs_err(sdp, "can't read in master directory: %d\n", error); 452 fs_err(sdp, "can't read in master directory: %d\n", error);
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
index 7cc2c29627..407029b3b2 100644
--- a/fs/gfs2/ops_fstype.h
+++ b/fs/gfs2/ops_fstype.h
@@ -14,5 +14,6 @@
14 14
15extern struct file_system_type gfs2_fs_type; 15extern struct file_system_type gfs2_fs_type;
16extern struct file_system_type gfs2meta_fs_type; 16extern struct file_system_type gfs2meta_fs_type;
17extern struct export_operations gfs2_export_ops;
17 18
18#endif /* __OPS_FSTYPE_DOT_H__ */ 19#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d85f6e05cb..911c115b5c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -157,7 +157,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
157 if (error) 157 if (error)
158 goto out_gunlock; 158 goto out_gunlock;
159 159
160 error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL); 160 error = gfs2_dir_check(dir, &dentry->d_name, NULL);
161 switch (error) { 161 switch (error) {
162 case -ENOENT: 162 case -ENOENT:
163 break; 163 break;
@@ -206,7 +206,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
206 goto out_gunlock_q; 206 goto out_gunlock_q;
207 207
208 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 208 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
209 al->al_rgd->rd_ri.ri_length + 209 al->al_rgd->rd_length +
210 2 * RES_DINODE + RES_STATFS + 210 2 * RES_DINODE + RES_STATFS +
211 RES_QUOTA, 0); 211 RES_QUOTA, 0);
212 if (error) 212 if (error)
@@ -217,8 +217,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
217 goto out_ipres; 217 goto out_ipres;
218 } 218 }
219 219
220 error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num, 220 error = gfs2_dir_add(dir, &dentry->d_name, ip, IF2DT(inode->i_mode));
221 IF2DT(inode->i_mode));
222 if (error) 221 if (error)
223 goto out_end_trans; 222 goto out_end_trans;
224 223
@@ -275,7 +274,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
275 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 274 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
276 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 275 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
277 276
278 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); 277 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
279 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); 278 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
280 279
281 280
@@ -420,7 +419,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
420 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); 419 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
421 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); 420 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
422 421
423 gfs2_inum_out(&dip->i_num, &dent->de_inum); 422 gfs2_inum_out(dip, dent);
424 dent->de_type = cpu_to_be16(DT_DIR); 423 dent->de_type = cpu_to_be16(DT_DIR);
425 424
426 gfs2_dinode_out(ip, di); 425 gfs2_dinode_out(ip, di);
@@ -472,7 +471,7 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
472 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 471 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
473 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 472 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
474 473
475 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); 474 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
476 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); 475 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
477 476
478 error = gfs2_glock_nq_m(3, ghs); 477 error = gfs2_glock_nq_m(3, ghs);
@@ -614,7 +613,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
614 * this is the case of the target file already existing 613 * this is the case of the target file already existing
615 * so we unlink before doing the rename 614 * so we unlink before doing the rename
616 */ 615 */
617 nrgd = gfs2_blk2rgrpd(sdp, nip->i_num.no_addr); 616 nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr);
618 if (nrgd) 617 if (nrgd)
619 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); 618 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
620 } 619 }
@@ -653,7 +652,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
653 if (error) 652 if (error)
654 goto out_gunlock; 653 goto out_gunlock;
655 654
656 error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL); 655 error = gfs2_dir_check(ndir, &ndentry->d_name, NULL);
657 switch (error) { 656 switch (error) {
658 case -ENOENT: 657 case -ENOENT:
659 error = 0; 658 error = 0;
@@ -712,7 +711,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
712 goto out_gunlock_q; 711 goto out_gunlock_q;
713 712
714 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 713 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
715 al->al_rgd->rd_ri.ri_length + 714 al->al_rgd->rd_length +
716 4 * RES_DINODE + 4 * RES_LEAF + 715 4 * RES_DINODE + 4 * RES_LEAF +
717 RES_STATFS + RES_QUOTA + 4, 0); 716 RES_STATFS + RES_QUOTA + 4, 0);
718 if (error) 717 if (error)
@@ -750,7 +749,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
750 if (error) 749 if (error)
751 goto out_end_trans; 750 goto out_end_trans;
752 751
753 error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR); 752 error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR);
754 if (error) 753 if (error)
755 goto out_end_trans; 754 goto out_end_trans;
756 } else { 755 } else {
@@ -758,7 +757,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 error = gfs2_meta_inode_buffer(ip, &dibh); 757 error = gfs2_meta_inode_buffer(ip, &dibh);
759 if (error) 758 if (error)
760 goto out_end_trans; 759 goto out_end_trans;
761 ip->i_inode.i_ctime = CURRENT_TIME_SEC; 760 ip->i_inode.i_ctime = CURRENT_TIME;
762 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 761 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
763 gfs2_dinode_out(ip, dibh->b_data); 762 gfs2_dinode_out(ip, dibh->b_data);
764 brelse(dibh); 763 brelse(dibh);
@@ -768,8 +767,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
768 if (error) 767 if (error)
769 goto out_end_trans; 768 goto out_end_trans;
770 769
771 error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num, 770 error = gfs2_dir_add(ndir, &ndentry->d_name, ip, IF2DT(ip->i_inode.i_mode));
772 IF2DT(ip->i_inode.i_mode));
773 if (error) 771 if (error)
774 goto out_end_trans; 772 goto out_end_trans;
775 773
@@ -905,8 +903,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
905 } 903 }
906 904
907 error = gfs2_truncatei(ip, attr->ia_size); 905 error = gfs2_truncatei(ip, attr->ia_size);
908 if (error) 906 if (error && (inode->i_size != ip->i_di.di_size))
909 return error; 907 i_size_write(inode, ip->i_di.di_size);
910 908
911 return error; 909 return error;
912} 910}
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 485ce3d499..603d940f11 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -326,8 +326,10 @@ static void gfs2_clear_inode(struct inode *inode)
326 gfs2_glock_schedule_for_reclaim(ip->i_gl); 326 gfs2_glock_schedule_for_reclaim(ip->i_gl);
327 gfs2_glock_put(ip->i_gl); 327 gfs2_glock_put(ip->i_gl);
328 ip->i_gl = NULL; 328 ip->i_gl = NULL;
329 if (ip->i_iopen_gh.gh_gl) 329 if (ip->i_iopen_gh.gh_gl) {
330 ip->i_iopen_gh.gh_gl->gl_object = NULL;
330 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 331 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
332 }
331 } 333 }
332} 334}
333 335
@@ -422,13 +424,13 @@ static void gfs2_delete_inode(struct inode *inode)
422 if (!inode->i_private) 424 if (!inode->i_private)
423 goto out; 425 goto out;
424 426
425 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB, &gh); 427 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
426 if (unlikely(error)) { 428 if (unlikely(error)) {
427 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 429 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
428 goto out; 430 goto out;
429 } 431 }
430 432
431 gfs2_glock_dq(&ip->i_iopen_gh); 433 gfs2_glock_dq_wait(&ip->i_iopen_gh);
432 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 434 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
433 error = gfs2_glock_nq(&ip->i_iopen_gh); 435 error = gfs2_glock_nq(&ip->i_iopen_gh);
434 if (error) 436 if (error)
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index aa0dbd2aac..404b7cc9f8 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -66,7 +66,7 @@ static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
66 if (error) 66 if (error)
67 goto out_gunlock_q; 67 goto out_gunlock_q;
68 68
69 error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length + 69 error = gfs2_trans_begin(sdp, al->al_rgd->rd_length +
70 ind_blocks + RES_DINODE + 70 ind_blocks + RES_DINODE +
71 RES_STATFS + RES_QUOTA, 0); 71 RES_STATFS + RES_QUOTA, 0);
72 if (error) 72 if (error)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c186857e48..6e546ee8f3 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -66,6 +66,18 @@
66#define QUOTA_USER 1 66#define QUOTA_USER 1
67#define QUOTA_GROUP 0 67#define QUOTA_GROUP 0
68 68
69struct gfs2_quota_host {
70 u64 qu_limit;
71 u64 qu_warn;
72 s64 qu_value;
73};
74
75struct gfs2_quota_change_host {
76 u64 qc_change;
77 u32 qc_flags; /* GFS2_QCF_... */
78 u32 qc_id;
79};
80
69static u64 qd2offset(struct gfs2_quota_data *qd) 81static u64 qd2offset(struct gfs2_quota_data *qd)
70{ 82{
71 u64 offset; 83 u64 offset;
@@ -561,6 +573,25 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
561 mutex_unlock(&sdp->sd_quota_mutex); 573 mutex_unlock(&sdp->sd_quota_mutex);
562} 574}
563 575
576static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
577{
578 const struct gfs2_quota *str = buf;
579
580 qu->qu_limit = be64_to_cpu(str->qu_limit);
581 qu->qu_warn = be64_to_cpu(str->qu_warn);
582 qu->qu_value = be64_to_cpu(str->qu_value);
583}
584
585static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
586{
587 struct gfs2_quota *str = buf;
588
589 str->qu_limit = cpu_to_be64(qu->qu_limit);
590 str->qu_warn = cpu_to_be64(qu->qu_warn);
591 str->qu_value = cpu_to_be64(qu->qu_value);
592 memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
593}
594
564/** 595/**
565 * gfs2_adjust_quota 596 * gfs2_adjust_quota
566 * 597 *
@@ -573,12 +604,13 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
573 struct inode *inode = &ip->i_inode; 604 struct inode *inode = &ip->i_inode;
574 struct address_space *mapping = inode->i_mapping; 605 struct address_space *mapping = inode->i_mapping;
575 unsigned long index = loc >> PAGE_CACHE_SHIFT; 606 unsigned long index = loc >> PAGE_CACHE_SHIFT;
576 unsigned offset = loc & (PAGE_CACHE_SHIFT - 1); 607 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
577 unsigned blocksize, iblock, pos; 608 unsigned blocksize, iblock, pos;
578 struct buffer_head *bh; 609 struct buffer_head *bh;
579 struct page *page; 610 struct page *page;
580 void *kaddr; 611 void *kaddr;
581 __be64 *ptr; 612 char *ptr;
613 struct gfs2_quota_host qp;
582 s64 value; 614 s64 value;
583 int err = -EIO; 615 int err = -EIO;
584 616
@@ -620,13 +652,17 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
620 652
621 kaddr = kmap_atomic(page, KM_USER0); 653 kaddr = kmap_atomic(page, KM_USER0);
622 ptr = kaddr + offset; 654 ptr = kaddr + offset;
623 value = (s64)be64_to_cpu(*ptr) + change; 655 gfs2_quota_in(&qp, ptr);
624 *ptr = cpu_to_be64(value); 656 qp.qu_value += change;
657 value = qp.qu_value;
658 gfs2_quota_out(&qp, ptr);
625 flush_dcache_page(page); 659 flush_dcache_page(page);
626 kunmap_atomic(kaddr, KM_USER0); 660 kunmap_atomic(kaddr, KM_USER0);
627 err = 0; 661 err = 0;
628 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC); 662 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
629 qd->qd_qb.qb_value = cpu_to_be64(value); 663 qd->qd_qb.qb_value = cpu_to_be64(value);
664 ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC);
665 ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value);
630unlock: 666unlock:
631 unlock_page(page); 667 unlock_page(page);
632 page_cache_release(page); 668 page_cache_release(page);
@@ -689,7 +725,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
689 goto out_alloc; 725 goto out_alloc;
690 726
691 error = gfs2_trans_begin(sdp, 727 error = gfs2_trans_begin(sdp,
692 al->al_rgd->rd_ri.ri_length + 728 al->al_rgd->rd_length +
693 num_qd * data_blocks + 729 num_qd * data_blocks +
694 nalloc * ind_blocks + 730 nalloc * ind_blocks +
695 RES_DINODE + num_qd + 731 RES_DINODE + num_qd +
@@ -709,7 +745,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
709 offset = qd2offset(qd); 745 offset = qd2offset(qd);
710 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, 746 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
711 (struct gfs2_quota_data *) 747 (struct gfs2_quota_data *)
712 qd->qd_gl->gl_lvb); 748 qd);
713 if (error) 749 if (error)
714 goto out_end_trans; 750 goto out_end_trans;
715 751
@@ -1050,6 +1086,15 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1050 return error; 1086 return error;
1051} 1087}
1052 1088
1089static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
1090{
1091 const struct gfs2_quota_change *str = buf;
1092
1093 qc->qc_change = be64_to_cpu(str->qc_change);
1094 qc->qc_flags = be32_to_cpu(str->qc_flags);
1095 qc->qc_id = be32_to_cpu(str->qc_id);
1096}
1097
1053int gfs2_quota_init(struct gfs2_sbd *sdp) 1098int gfs2_quota_init(struct gfs2_sbd *sdp)
1054{ 1099{
1055 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1100 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 8bc182c7e2..5ada38c99a 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -116,6 +116,22 @@ void gfs2_revoke_clean(struct gfs2_sbd *sdp)
116 } 116 }
117} 117}
118 118
119static int gfs2_log_header_in(struct gfs2_log_header_host *lh, const void *buf)
120{
121 const struct gfs2_log_header *str = buf;
122
123 if (str->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
124 str->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH))
125 return 1;
126
127 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
128 lh->lh_flags = be32_to_cpu(str->lh_flags);
129 lh->lh_tail = be32_to_cpu(str->lh_tail);
130 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
131 lh->lh_hash = be32_to_cpu(str->lh_hash);
132 return 0;
133}
134
119/** 135/**
120 * get_log_header - read the log header for a given segment 136 * get_log_header - read the log header for a given segment
121 * @jd: the journal 137 * @jd: the journal
@@ -147,12 +163,10 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
147 sizeof(u32)); 163 sizeof(u32));
148 hash = crc32_le(hash, (unsigned char const *)&nothing, sizeof(nothing)); 164 hash = crc32_le(hash, (unsigned char const *)&nothing, sizeof(nothing));
149 hash ^= (u32)~0; 165 hash ^= (u32)~0;
150 gfs2_log_header_in(&lh, bh->b_data); 166 error = gfs2_log_header_in(&lh, bh->b_data);
151 brelse(bh); 167 brelse(bh);
152 168
153 if (lh.lh_header.mh_magic != GFS2_MAGIC || 169 if (error || lh.lh_blkno != blk || lh.lh_hash != hash)
154 lh.lh_header.mh_type != GFS2_METATYPE_LH ||
155 lh.lh_blkno != blk || lh.lh_hash != hash)
156 return 1; 170 return 1;
157 171
158 *head = lh; 172 *head = lh;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 1727f5012e..e4e0406251 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -28,6 +28,7 @@
28#include "ops_file.h" 28#include "ops_file.h"
29#include "util.h" 29#include "util.h"
30#include "log.h" 30#include "log.h"
31#include "inode.h"
31 32
32#define BFITNOENT ((u32)~0) 33#define BFITNOENT ((u32)~0)
33 34
@@ -50,6 +51,9 @@ static const char valid_change[16] = {
50 1, 0, 0, 0 51 1, 0, 0, 0
51}; 52};
52 53
54static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
55 unsigned char old_state, unsigned char new_state);
56
53/** 57/**
54 * gfs2_setbit - Set a bit in the bitmaps 58 * gfs2_setbit - Set a bit in the bitmaps
55 * @buffer: the buffer that holds the bitmaps 59 * @buffer: the buffer that holds the bitmaps
@@ -204,7 +208,7 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
204{ 208{
205 struct gfs2_sbd *sdp = rgd->rd_sbd; 209 struct gfs2_sbd *sdp = rgd->rd_sbd;
206 struct gfs2_bitmap *bi = NULL; 210 struct gfs2_bitmap *bi = NULL;
207 u32 length = rgd->rd_ri.ri_length; 211 u32 length = rgd->rd_length;
208 u32 count[4], tmp; 212 u32 count[4], tmp;
209 int buf, x; 213 int buf, x;
210 214
@@ -227,7 +231,7 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
227 return; 231 return;
228 } 232 }
229 233
230 tmp = rgd->rd_ri.ri_data - 234 tmp = rgd->rd_data -
231 rgd->rd_rg.rg_free - 235 rgd->rd_rg.rg_free -
232 rgd->rd_rg.rg_dinodes; 236 rgd->rd_rg.rg_dinodes;
233 if (count[1] + count[2] != tmp) { 237 if (count[1] + count[2] != tmp) {
@@ -253,10 +257,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
253 257
254} 258}
255 259
256static inline int rgrp_contains_block(struct gfs2_rindex_host *ri, u64 block) 260static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
257{ 261{
258 u64 first = ri->ri_data0; 262 u64 first = rgd->rd_data0;
259 u64 last = first + ri->ri_data; 263 u64 last = first + rgd->rd_data;
260 return first <= block && block < last; 264 return first <= block && block < last;
261} 265}
262 266
@@ -275,7 +279,7 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
275 spin_lock(&sdp->sd_rindex_spin); 279 spin_lock(&sdp->sd_rindex_spin);
276 280
277 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { 281 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
278 if (rgrp_contains_block(&rgd->rd_ri, blk)) { 282 if (rgrp_contains_block(rgd, blk)) {
279 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); 283 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
280 spin_unlock(&sdp->sd_rindex_spin); 284 spin_unlock(&sdp->sd_rindex_spin);
281 return rgd; 285 return rgd;
@@ -354,6 +358,15 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
354 mutex_unlock(&sdp->sd_rindex_mutex); 358 mutex_unlock(&sdp->sd_rindex_mutex);
355} 359}
356 360
361static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
362{
363 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
364 printk(KERN_INFO " ri_length = %u\n", rgd->rd_length);
365 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
366 printk(KERN_INFO " ri_data = %u\n", rgd->rd_data);
367 printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes);
368}
369
357/** 370/**
358 * gfs2_compute_bitstructs - Compute the bitmap sizes 371 * gfs2_compute_bitstructs - Compute the bitmap sizes
359 * @rgd: The resource group descriptor 372 * @rgd: The resource group descriptor
@@ -367,7 +380,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
367{ 380{
368 struct gfs2_sbd *sdp = rgd->rd_sbd; 381 struct gfs2_sbd *sdp = rgd->rd_sbd;
369 struct gfs2_bitmap *bi; 382 struct gfs2_bitmap *bi;
370 u32 length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */ 383 u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
371 u32 bytes_left, bytes; 384 u32 bytes_left, bytes;
372 int x; 385 int x;
373 386
@@ -378,7 +391,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
378 if (!rgd->rd_bits) 391 if (!rgd->rd_bits)
379 return -ENOMEM; 392 return -ENOMEM;
380 393
381 bytes_left = rgd->rd_ri.ri_bitbytes; 394 bytes_left = rgd->rd_bitbytes;
382 395
383 for (x = 0; x < length; x++) { 396 for (x = 0; x < length; x++) {
384 bi = rgd->rd_bits + x; 397 bi = rgd->rd_bits + x;
@@ -399,14 +412,14 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
399 } else if (x + 1 == length) { 412 } else if (x + 1 == length) {
400 bytes = bytes_left; 413 bytes = bytes_left;
401 bi->bi_offset = sizeof(struct gfs2_meta_header); 414 bi->bi_offset = sizeof(struct gfs2_meta_header);
402 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left; 415 bi->bi_start = rgd->rd_bitbytes - bytes_left;
403 bi->bi_len = bytes; 416 bi->bi_len = bytes;
404 /* other blocks */ 417 /* other blocks */
405 } else { 418 } else {
406 bytes = sdp->sd_sb.sb_bsize - 419 bytes = sdp->sd_sb.sb_bsize -
407 sizeof(struct gfs2_meta_header); 420 sizeof(struct gfs2_meta_header);
408 bi->bi_offset = sizeof(struct gfs2_meta_header); 421 bi->bi_offset = sizeof(struct gfs2_meta_header);
409 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left; 422 bi->bi_start = rgd->rd_bitbytes - bytes_left;
410 bi->bi_len = bytes; 423 bi->bi_len = bytes;
411 } 424 }
412 425
@@ -418,9 +431,9 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
418 return -EIO; 431 return -EIO;
419 } 432 }
420 bi = rgd->rd_bits + (length - 1); 433 bi = rgd->rd_bits + (length - 1);
421 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) { 434 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) {
422 if (gfs2_consist_rgrpd(rgd)) { 435 if (gfs2_consist_rgrpd(rgd)) {
423 gfs2_rindex_print(&rgd->rd_ri); 436 gfs2_rindex_print(rgd);
424 fs_err(sdp, "start=%u len=%u offset=%u\n", 437 fs_err(sdp, "start=%u len=%u offset=%u\n",
425 bi->bi_start, bi->bi_len, bi->bi_offset); 438 bi->bi_start, bi->bi_len, bi->bi_offset);
426 } 439 }
@@ -431,9 +444,104 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
431} 444}
432 445
433/** 446/**
434 * gfs2_ri_update - Pull in a new resource index from the disk 447 * gfs2_ri_total - Total up the file system space, according to the rindex.
448 *
449 */
450u64 gfs2_ri_total(struct gfs2_sbd *sdp)
451{
452 u64 total_data = 0;
453 struct inode *inode = sdp->sd_rindex;
454 struct gfs2_inode *ip = GFS2_I(inode);
455 char buf[sizeof(struct gfs2_rindex)];
456 struct file_ra_state ra_state;
457 int error, rgrps;
458
459 mutex_lock(&sdp->sd_rindex_mutex);
460 file_ra_state_init(&ra_state, inode->i_mapping);
461 for (rgrps = 0;; rgrps++) {
462 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
463
464 if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size)
465 break;
466 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
467 sizeof(struct gfs2_rindex));
468 if (error != sizeof(struct gfs2_rindex))
469 break;
470 total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
471 }
472 mutex_unlock(&sdp->sd_rindex_mutex);
473 return total_data;
474}
475
476static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf)
477{
478 const struct gfs2_rindex *str = buf;
479
480 rgd->rd_addr = be64_to_cpu(str->ri_addr);
481 rgd->rd_length = be32_to_cpu(str->ri_length);
482 rgd->rd_data0 = be64_to_cpu(str->ri_data0);
483 rgd->rd_data = be32_to_cpu(str->ri_data);
484 rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes);
485}
486
487/**
488 * read_rindex_entry - Pull in a new resource index entry from the disk
435 * @gl: The glock covering the rindex inode 489 * @gl: The glock covering the rindex inode
436 * 490 *
491 * Returns: 0 on success, error code otherwise
492 */
493
494static int read_rindex_entry(struct gfs2_inode *ip,
495 struct file_ra_state *ra_state)
496{
497 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
498 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
499 char buf[sizeof(struct gfs2_rindex)];
500 int error;
501 struct gfs2_rgrpd *rgd;
502
503 error = gfs2_internal_read(ip, ra_state, buf, &pos,
504 sizeof(struct gfs2_rindex));
505 if (!error)
506 return 0;
507 if (error != sizeof(struct gfs2_rindex)) {
508 if (error > 0)
509 error = -EIO;
510 return error;
511 }
512
513 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
514 error = -ENOMEM;
515 if (!rgd)
516 return error;
517
518 mutex_init(&rgd->rd_mutex);
519 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
520 rgd->rd_sbd = sdp;
521
522 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
523 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
524
525 gfs2_rindex_in(rgd, buf);
526 error = compute_bitstructs(rgd);
527 if (error)
528 return error;
529
530 error = gfs2_glock_get(sdp, rgd->rd_addr,
531 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
532 if (error)
533 return error;
534
535 rgd->rd_gl->gl_object = rgd;
536 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
537 rgd->rd_flags |= GFS2_RDF_CHECK;
538 return error;
539}
540
541/**
542 * gfs2_ri_update - Pull in a new resource index from the disk
543 * @ip: pointer to the rindex inode
544 *
437 * Returns: 0 on successful update, error code otherwise 545 * Returns: 0 on successful update, error code otherwise
438 */ 546 */
439 547
@@ -441,13 +549,11 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
441{ 549{
442 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 550 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
443 struct inode *inode = &ip->i_inode; 551 struct inode *inode = &ip->i_inode;
444 struct gfs2_rgrpd *rgd;
445 char buf[sizeof(struct gfs2_rindex)];
446 struct file_ra_state ra_state; 552 struct file_ra_state ra_state;
447 u64 junk = ip->i_di.di_size; 553 u64 rgrp_count = ip->i_di.di_size;
448 int error; 554 int error;
449 555
450 if (do_div(junk, sizeof(struct gfs2_rindex))) { 556 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
451 gfs2_consist_inode(ip); 557 gfs2_consist_inode(ip);
452 return -EIO; 558 return -EIO;
453 } 559 }
@@ -455,50 +561,50 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
455 clear_rgrpdi(sdp); 561 clear_rgrpdi(sdp);
456 562
457 file_ra_state_init(&ra_state, inode->i_mapping); 563 file_ra_state_init(&ra_state, inode->i_mapping);
458 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { 564 for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) {
459 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); 565 error = read_rindex_entry(ip, &ra_state);
460 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 566 if (error) {
461 sizeof(struct gfs2_rindex)); 567 clear_rgrpdi(sdp);
462 if (!error) 568 return error;
463 break;
464 if (error != sizeof(struct gfs2_rindex)) {
465 if (error > 0)
466 error = -EIO;
467 goto fail;
468 } 569 }
570 }
469 571
470 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS); 572 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
471 error = -ENOMEM; 573 return 0;
472 if (!rgd) 574}
473 goto fail;
474
475 mutex_init(&rgd->rd_mutex);
476 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
477 rgd->rd_sbd = sdp;
478
479 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
480 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
481
482 gfs2_rindex_in(&rgd->rd_ri, buf);
483 error = compute_bitstructs(rgd);
484 if (error)
485 goto fail;
486 575
487 error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr, 576/**
488 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); 577 * gfs2_ri_update_special - Pull in a new resource index from the disk
489 if (error) 578 *
490 goto fail; 579 * This is a special version that's safe to call from gfs2_inplace_reserve_i.
580 * In this case we know that we don't have any resource groups in memory yet.
581 *
582 * @ip: pointer to the rindex inode
583 *
584 * Returns: 0 on successful update, error code otherwise
585 */
586static int gfs2_ri_update_special(struct gfs2_inode *ip)
587{
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode;
590 struct file_ra_state ra_state;
591 int error;
491 592
492 rgd->rd_gl->gl_object = rgd; 593 file_ra_state_init(&ra_state, inode->i_mapping);
493 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; 594 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
595 /* Ignore partials */
596 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
597 ip->i_di.di_size)
598 break;
599 error = read_rindex_entry(ip, &ra_state);
600 if (error) {
601 clear_rgrpdi(sdp);
602 return error;
603 }
494 } 604 }
495 605
496 sdp->sd_rindex_vn = ip->i_gl->gl_vn; 606 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
497 return 0; 607 return 0;
498
499fail:
500 clear_rgrpdi(sdp);
501 return error;
502} 608}
503 609
504/** 610/**
@@ -543,6 +649,28 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
543 return error; 649 return error;
544} 650}
545 651
652static void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf)
653{
654 const struct gfs2_rgrp *str = buf;
655
656 rg->rg_flags = be32_to_cpu(str->rg_flags);
657 rg->rg_free = be32_to_cpu(str->rg_free);
658 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
659 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
660}
661
662static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf)
663{
664 struct gfs2_rgrp *str = buf;
665
666 str->rg_flags = cpu_to_be32(rg->rg_flags);
667 str->rg_free = cpu_to_be32(rg->rg_free);
668 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
669 str->__pad = cpu_to_be32(0);
670 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
671 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
672}
673
546/** 674/**
547 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps 675 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
548 * @rgd: the struct gfs2_rgrpd describing the RG to read in 676 * @rgd: the struct gfs2_rgrpd describing the RG to read in
@@ -557,7 +685,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
557{ 685{
558 struct gfs2_sbd *sdp = rgd->rd_sbd; 686 struct gfs2_sbd *sdp = rgd->rd_sbd;
559 struct gfs2_glock *gl = rgd->rd_gl; 687 struct gfs2_glock *gl = rgd->rd_gl;
560 unsigned int length = rgd->rd_ri.ri_length; 688 unsigned int length = rgd->rd_length;
561 struct gfs2_bitmap *bi; 689 struct gfs2_bitmap *bi;
562 unsigned int x, y; 690 unsigned int x, y;
563 int error; 691 int error;
@@ -575,7 +703,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
575 703
576 for (x = 0; x < length; x++) { 704 for (x = 0; x < length; x++) {
577 bi = rgd->rd_bits + x; 705 bi = rgd->rd_bits + x;
578 error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, 0, &bi->bi_bh); 706 error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
579 if (error) 707 if (error)
580 goto fail; 708 goto fail;
581 } 709 }
@@ -637,7 +765,7 @@ void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
637void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) 765void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
638{ 766{
639 struct gfs2_sbd *sdp = rgd->rd_sbd; 767 struct gfs2_sbd *sdp = rgd->rd_sbd;
640 int x, length = rgd->rd_ri.ri_length; 768 int x, length = rgd->rd_length;
641 769
642 spin_lock(&sdp->sd_rindex_spin); 770 spin_lock(&sdp->sd_rindex_spin);
643 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); 771 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
@@ -660,7 +788,7 @@ void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
660void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) 788void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
661{ 789{
662 struct gfs2_sbd *sdp = rgd->rd_sbd; 790 struct gfs2_sbd *sdp = rgd->rd_sbd;
663 unsigned int length = rgd->rd_ri.ri_length; 791 unsigned int length = rgd->rd_length;
664 unsigned int x; 792 unsigned int x;
665 793
666 for (x = 0; x < length; x++) { 794 for (x = 0; x < length; x++) {
@@ -722,6 +850,38 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
722} 850}
723 851
724/** 852/**
853 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
854 * @rgd: The rgrp
855 *
856 * Returns: The inode, if one has been found
857 */
858
859static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
860{
861 struct inode *inode;
862 u32 goal = 0;
863 u64 no_addr;
864
865 for(;;) {
866 goal = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
867 GFS2_BLKST_UNLINKED);
868 if (goal == 0)
869 return 0;
870 no_addr = goal + rgd->rd_data0;
871 if (no_addr <= *last_unlinked)
872 continue;
873 *last_unlinked = no_addr;
874 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
875 no_addr, -1);
876 if (!IS_ERR(inode))
877 return inode;
878 }
879
880 rgd->rd_flags &= ~GFS2_RDF_CHECK;
881 return NULL;
882}
883
884/**
725 * recent_rgrp_first - get first RG from "recent" list 885 * recent_rgrp_first - get first RG from "recent" list
726 * @sdp: The GFS2 superblock 886 * @sdp: The GFS2 superblock
727 * @rglast: address of the rgrp used last 887 * @rglast: address of the rgrp used last
@@ -743,7 +903,7 @@ static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
743 goto first; 903 goto first;
744 904
745 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { 905 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
746 if (rgd->rd_ri.ri_addr == rglast) 906 if (rgd->rd_addr == rglast)
747 goto out; 907 goto out;
748 } 908 }
749 909
@@ -882,8 +1042,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
882 * Returns: errno 1042 * Returns: errno
883 */ 1043 */
884 1044
885static int get_local_rgrp(struct gfs2_inode *ip) 1045static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
886{ 1046{
1047 struct inode *inode = NULL;
887 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1048 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
888 struct gfs2_rgrpd *rgd, *begin = NULL; 1049 struct gfs2_rgrpd *rgd, *begin = NULL;
889 struct gfs2_alloc *al = &ip->i_alloc; 1050 struct gfs2_alloc *al = &ip->i_alloc;
@@ -903,7 +1064,11 @@ static int get_local_rgrp(struct gfs2_inode *ip)
903 case 0: 1064 case 0:
904 if (try_rgrp_fit(rgd, al)) 1065 if (try_rgrp_fit(rgd, al))
905 goto out; 1066 goto out;
1067 if (rgd->rd_flags & GFS2_RDF_CHECK)
1068 inode = try_rgrp_unlink(rgd, last_unlinked);
906 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1069 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1070 if (inode)
1071 return inode;
907 rgd = recent_rgrp_next(rgd, 1); 1072 rgd = recent_rgrp_next(rgd, 1);
908 break; 1073 break;
909 1074
@@ -912,7 +1077,7 @@ static int get_local_rgrp(struct gfs2_inode *ip)
912 break; 1077 break;
913 1078
914 default: 1079 default:
915 return error; 1080 return ERR_PTR(error);
916 } 1081 }
917 } 1082 }
918 1083
@@ -927,7 +1092,11 @@ static int get_local_rgrp(struct gfs2_inode *ip)
927 case 0: 1092 case 0:
928 if (try_rgrp_fit(rgd, al)) 1093 if (try_rgrp_fit(rgd, al))
929 goto out; 1094 goto out;
1095 if (rgd->rd_flags & GFS2_RDF_CHECK)
1096 inode = try_rgrp_unlink(rgd, last_unlinked);
930 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1097 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1098 if (inode)
1099 return inode;
931 break; 1100 break;
932 1101
933 case GLR_TRYFAILED: 1102 case GLR_TRYFAILED:
@@ -935,7 +1104,7 @@ static int get_local_rgrp(struct gfs2_inode *ip)
935 break; 1104 break;
936 1105
937 default: 1106 default:
938 return error; 1107 return ERR_PTR(error);
939 } 1108 }
940 1109
941 rgd = gfs2_rgrpd_get_next(rgd); 1110 rgd = gfs2_rgrpd_get_next(rgd);
@@ -944,7 +1113,7 @@ static int get_local_rgrp(struct gfs2_inode *ip)
944 1113
945 if (rgd == begin) { 1114 if (rgd == begin) {
946 if (++loops >= 3) 1115 if (++loops >= 3)
947 return -ENOSPC; 1116 return ERR_PTR(-ENOSPC);
948 if (!skipped) 1117 if (!skipped)
949 loops++; 1118 loops++;
950 flags = 0; 1119 flags = 0;
@@ -954,7 +1123,7 @@ static int get_local_rgrp(struct gfs2_inode *ip)
954 } 1123 }
955 1124
956out: 1125out:
957 ip->i_last_rg_alloc = rgd->rd_ri.ri_addr; 1126 ip->i_last_rg_alloc = rgd->rd_addr;
958 1127
959 if (begin) { 1128 if (begin) {
960 recent_rgrp_add(rgd); 1129 recent_rgrp_add(rgd);
@@ -964,7 +1133,7 @@ out:
964 forward_rgrp_set(sdp, rgd); 1133 forward_rgrp_set(sdp, rgd);
965 } 1134 }
966 1135
967 return 0; 1136 return NULL;
968} 1137}
969 1138
970/** 1139/**
@@ -978,19 +1147,33 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
978{ 1147{
979 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1148 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
980 struct gfs2_alloc *al = &ip->i_alloc; 1149 struct gfs2_alloc *al = &ip->i_alloc;
981 int error; 1150 struct inode *inode;
1151 int error = 0;
1152 u64 last_unlinked = 0;
982 1153
983 if (gfs2_assert_warn(sdp, al->al_requested)) 1154 if (gfs2_assert_warn(sdp, al->al_requested))
984 return -EINVAL; 1155 return -EINVAL;
985 1156
986 error = gfs2_rindex_hold(sdp, &al->al_ri_gh); 1157try_again:
1158 /* We need to hold the rindex unless the inode we're using is
1159 the rindex itself, in which case it's already held. */
1160 if (ip != GFS2_I(sdp->sd_rindex))
1161 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1162 else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
1163 error = gfs2_ri_update_special(ip);
1164
987 if (error) 1165 if (error)
988 return error; 1166 return error;
989 1167
990 error = get_local_rgrp(ip); 1168 inode = get_local_rgrp(ip, &last_unlinked);
991 if (error) { 1169 if (inode) {
992 gfs2_glock_dq_uninit(&al->al_ri_gh); 1170 if (ip != GFS2_I(sdp->sd_rindex))
993 return error; 1171 gfs2_glock_dq_uninit(&al->al_ri_gh);
1172 if (IS_ERR(inode))
1173 return PTR_ERR(inode);
1174 iput(inode);
1175 gfs2_log_flush(sdp, NULL);
1176 goto try_again;
994 } 1177 }
995 1178
996 al->al_file = file; 1179 al->al_file = file;
@@ -1019,7 +1202,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1019 1202
1020 al->al_rgd = NULL; 1203 al->al_rgd = NULL;
1021 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1204 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1022 gfs2_glock_dq_uninit(&al->al_ri_gh); 1205 if (ip != GFS2_I(sdp->sd_rindex))
1206 gfs2_glock_dq_uninit(&al->al_ri_gh);
1023} 1207}
1024 1208
1025/** 1209/**
@@ -1037,8 +1221,8 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1037 unsigned int buf; 1221 unsigned int buf;
1038 unsigned char type; 1222 unsigned char type;
1039 1223
1040 length = rgd->rd_ri.ri_length; 1224 length = rgd->rd_length;
1041 rgrp_block = block - rgd->rd_ri.ri_data0; 1225 rgrp_block = block - rgd->rd_data0;
1042 1226
1043 for (buf = 0; buf < length; buf++) { 1227 for (buf = 0; buf < length; buf++) {
1044 bi = rgd->rd_bits + buf; 1228 bi = rgd->rd_bits + buf;
@@ -1077,10 +1261,10 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1077 */ 1261 */
1078 1262
1079static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, 1263static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1080 unsigned char old_state, unsigned char new_state) 1264 unsigned char old_state, unsigned char new_state)
1081{ 1265{
1082 struct gfs2_bitmap *bi = NULL; 1266 struct gfs2_bitmap *bi = NULL;
1083 u32 length = rgd->rd_ri.ri_length; 1267 u32 length = rgd->rd_length;
1084 u32 blk = 0; 1268 u32 blk = 0;
1085 unsigned int buf, x; 1269 unsigned int buf, x;
1086 1270
@@ -1118,17 +1302,18 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1118 goal = 0; 1302 goal = 0;
1119 } 1303 }
1120 1304
1121 if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length)) 1305 if (old_state != new_state) {
1122 blk = 0; 1306 gfs2_assert_withdraw(rgd->rd_sbd, blk != BFITNOENT);
1123 1307
1124 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1308 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1125 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, 1309 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1126 bi->bi_len, blk, new_state);
1127 if (bi->bi_clone)
1128 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
1129 bi->bi_len, blk, new_state); 1310 bi->bi_len, blk, new_state);
1311 if (bi->bi_clone)
1312 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
1313 bi->bi_len, blk, new_state);
1314 }
1130 1315
1131 return bi->bi_start * GFS2_NBBY + blk; 1316 return (blk == BFITNOENT) ? 0 : (bi->bi_start * GFS2_NBBY) + blk;
1132} 1317}
1133 1318
1134/** 1319/**
@@ -1156,9 +1341,9 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1156 return NULL; 1341 return NULL;
1157 } 1342 }
1158 1343
1159 length = rgd->rd_ri.ri_length; 1344 length = rgd->rd_length;
1160 1345
1161 rgrp_blk = bstart - rgd->rd_ri.ri_data0; 1346 rgrp_blk = bstart - rgd->rd_data0;
1162 1347
1163 while (blen--) { 1348 while (blen--) {
1164 for (buf = 0; buf < length; buf++) { 1349 for (buf = 0; buf < length; buf++) {
@@ -1202,15 +1387,15 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
1202 u32 goal, blk; 1387 u32 goal, blk;
1203 u64 block; 1388 u64 block;
1204 1389
1205 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data)) 1390 if (rgrp_contains_block(rgd, ip->i_di.di_goal_data))
1206 goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0; 1391 goal = ip->i_di.di_goal_data - rgd->rd_data0;
1207 else 1392 else
1208 goal = rgd->rd_last_alloc_data; 1393 goal = rgd->rd_last_alloc_data;
1209 1394
1210 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); 1395 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1211 rgd->rd_last_alloc_data = blk; 1396 rgd->rd_last_alloc_data = blk;
1212 1397
1213 block = rgd->rd_ri.ri_data0 + blk; 1398 block = rgd->rd_data0 + blk;
1214 ip->i_di.di_goal_data = block; 1399 ip->i_di.di_goal_data = block;
1215 1400
1216 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); 1401 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
@@ -1246,15 +1431,15 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1246 u32 goal, blk; 1431 u32 goal, blk;
1247 u64 block; 1432 u64 block;
1248 1433
1249 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta)) 1434 if (rgrp_contains_block(rgd, ip->i_di.di_goal_meta))
1250 goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0; 1435 goal = ip->i_di.di_goal_meta - rgd->rd_data0;
1251 else 1436 else
1252 goal = rgd->rd_last_alloc_meta; 1437 goal = rgd->rd_last_alloc_meta;
1253 1438
1254 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); 1439 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1255 rgd->rd_last_alloc_meta = blk; 1440 rgd->rd_last_alloc_meta = blk;
1256 1441
1257 block = rgd->rd_ri.ri_data0 + blk; 1442 block = rgd->rd_data0 + blk;
1258 ip->i_di.di_goal_meta = block; 1443 ip->i_di.di_goal_meta = block;
1259 1444
1260 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); 1445 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
@@ -1296,7 +1481,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1296 1481
1297 rgd->rd_last_alloc_meta = blk; 1482 rgd->rd_last_alloc_meta = blk;
1298 1483
1299 block = rgd->rd_ri.ri_data0 + blk; 1484 block = rgd->rd_data0 + blk;
1300 1485
1301 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); 1486 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1302 rgd->rd_rg.rg_free--; 1487 rgd->rd_rg.rg_free--;
@@ -1379,7 +1564,7 @@ void gfs2_unlink_di(struct inode *inode)
1379 struct gfs2_inode *ip = GFS2_I(inode); 1564 struct gfs2_inode *ip = GFS2_I(inode);
1380 struct gfs2_sbd *sdp = GFS2_SB(inode); 1565 struct gfs2_sbd *sdp = GFS2_SB(inode);
1381 struct gfs2_rgrpd *rgd; 1566 struct gfs2_rgrpd *rgd;
1382 u64 blkno = ip->i_num.no_addr; 1567 u64 blkno = ip->i_no_addr;
1383 1568
1384 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); 1569 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1385 if (!rgd) 1570 if (!rgd)
@@ -1414,9 +1599,9 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1414 1599
1415void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) 1600void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1416{ 1601{
1417 gfs2_free_uninit_di(rgd, ip->i_num.no_addr); 1602 gfs2_free_uninit_di(rgd, ip->i_no_addr);
1418 gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); 1603 gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
1419 gfs2_meta_wipe(ip, ip->i_num.no_addr, 1); 1604 gfs2_meta_wipe(ip, ip->i_no_addr, 1);
1420} 1605}
1421 1606
1422/** 1607/**
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b01e0cfc99..b4c6adfc6f 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -65,5 +65,6 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
65void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, 65void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
66 int flags); 66 int flags);
67void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 67void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
68u64 gfs2_ri_total(struct gfs2_sbd *sdp);
68 69
69#endif /* __RGRP_DOT_H__ */ 70#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 4fdda974dc..f916b9740c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -95,8 +95,8 @@ int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
95{ 95{
96 unsigned int x; 96 unsigned int x;
97 97
98 if (sb->sb_header.mh_magic != GFS2_MAGIC || 98 if (sb->sb_magic != GFS2_MAGIC ||
99 sb->sb_header.mh_type != GFS2_METATYPE_SB) { 99 sb->sb_type != GFS2_METATYPE_SB) {
100 if (!silent) 100 if (!silent)
101 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); 101 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
102 return -EINVAL; 102 return -EINVAL;
@@ -174,10 +174,31 @@ static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
174 return 0; 174 return 0;
175} 175}
176 176
177static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
178{
179 const struct gfs2_sb *str = buf;
180
181 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
182 sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
183 sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
184 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
185 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
186 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
187 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
188 sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
189 sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
190 sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
191 sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
192
193 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
194 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
195}
196
177/** 197/**
178 * gfs2_read_super - Read the gfs2 super block from disk 198 * gfs2_read_super - Read the gfs2 super block from disk
179 * @sb: The VFS super block 199 * @sdp: The GFS2 super block
180 * @sector: The location of the super block 200 * @sector: The location of the super block
201 * @error: The error code to return
181 * 202 *
182 * This uses the bio functions to read the super block from disk 203 * This uses the bio functions to read the super block from disk
183 * because we want to be 100% sure that we never read cached data. 204 * because we want to be 100% sure that we never read cached data.
@@ -189,17 +210,19 @@ static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
189 * the master directory (contains pointers to journals etc) and the 210 * the master directory (contains pointers to journals etc) and the
190 * root directory. 211 * root directory.
191 * 212 *
192 * Returns: A page containing the sb or NULL 213 * Returns: 0 on success or error
193 */ 214 */
194 215
195struct page *gfs2_read_super(struct super_block *sb, sector_t sector) 216int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
196{ 217{
218 struct super_block *sb = sdp->sd_vfs;
219 struct gfs2_sb *p;
197 struct page *page; 220 struct page *page;
198 struct bio *bio; 221 struct bio *bio;
199 222
200 page = alloc_page(GFP_KERNEL); 223 page = alloc_page(GFP_KERNEL);
201 if (unlikely(!page)) 224 if (unlikely(!page))
202 return NULL; 225 return -ENOBUFS;
203 226
204 ClearPageUptodate(page); 227 ClearPageUptodate(page);
205 ClearPageDirty(page); 228 ClearPageDirty(page);
@@ -208,7 +231,7 @@ struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
208 bio = bio_alloc(GFP_KERNEL, 1); 231 bio = bio_alloc(GFP_KERNEL, 1);
209 if (unlikely(!bio)) { 232 if (unlikely(!bio)) {
210 __free_page(page); 233 __free_page(page);
211 return NULL; 234 return -ENOBUFS;
212 } 235 }
213 236
214 bio->bi_sector = sector * (sb->s_blocksize >> 9); 237 bio->bi_sector = sector * (sb->s_blocksize >> 9);
@@ -222,9 +245,13 @@ struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
222 bio_put(bio); 245 bio_put(bio);
223 if (!PageUptodate(page)) { 246 if (!PageUptodate(page)) {
224 __free_page(page); 247 __free_page(page);
225 return NULL; 248 return -EIO;
226 } 249 }
227 return page; 250 p = kmap(page);
251 gfs2_sb_in(&sdp->sd_sb, p);
252 kunmap(page);
253 __free_page(page);
254 return 0;
228} 255}
229 256
230/** 257/**
@@ -241,19 +268,13 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
241 u32 tmp_blocks; 268 u32 tmp_blocks;
242 unsigned int x; 269 unsigned int x;
243 int error; 270 int error;
244 struct page *page;
245 char *sb;
246 271
247 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); 272 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
248 if (!page) { 273 if (error) {
249 if (!silent) 274 if (!silent)
250 fs_err(sdp, "can't read superblock\n"); 275 fs_err(sdp, "can't read superblock\n");
251 return -EIO; 276 return error;
252 } 277 }
253 sb = kmap(page);
254 gfs2_sb_in(&sdp->sd_sb, sb);
255 kunmap(page);
256 __free_page(page);
257 278
258 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); 279 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
259 if (error) 280 if (error)
@@ -360,7 +381,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
360 name.len = sprintf(buf, "journal%u", sdp->sd_journals); 381 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
361 name.hash = gfs2_disk_hash(name.name, name.len); 382 name.hash = gfs2_disk_hash(name.name, name.len);
362 383
363 error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL); 384 error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
364 if (error == -ENOENT) { 385 if (error == -ENOENT) {
365 error = 0; 386 error = 0;
366 break; 387 break;
@@ -593,6 +614,24 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
593 return error; 614 return error;
594} 615}
595 616
617static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
618{
619 const struct gfs2_statfs_change *str = buf;
620
621 sc->sc_total = be64_to_cpu(str->sc_total);
622 sc->sc_free = be64_to_cpu(str->sc_free);
623 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
624}
625
626static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf)
627{
628 struct gfs2_statfs_change *str = buf;
629
630 str->sc_total = cpu_to_be64(sc->sc_total);
631 str->sc_free = cpu_to_be64(sc->sc_free);
632 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
633}
634
596int gfs2_statfs_init(struct gfs2_sbd *sdp) 635int gfs2_statfs_init(struct gfs2_sbd *sdp)
597{ 636{
598 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); 637 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -772,7 +811,7 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
772 struct gfs2_statfs_change_host *sc) 811 struct gfs2_statfs_change_host *sc)
773{ 812{
774 gfs2_rgrp_verify(rgd); 813 gfs2_rgrp_verify(rgd);
775 sc->sc_total += rgd->rd_ri.ri_data; 814 sc->sc_total += rgd->rd_data;
776 sc->sc_free += rgd->rd_rg.rg_free; 815 sc->sc_free += rgd->rd_rg.rg_free;
777 sc->sc_dinodes += rgd->rd_rg.rg_dinodes; 816 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
778 return 0; 817 return 0;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index e590b2df11..60a870e430 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -16,7 +16,7 @@ void gfs2_tune_init(struct gfs2_tune *gt);
16 16
17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent); 17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); 18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
19struct page *gfs2_read_super(struct super_block *sb, sector_t sector); 19int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
20 20
21static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) 21static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
22{ 22{
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 601eaa1b9e..424a0774ed 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -115,8 +115,8 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
115 "GFS2: fsid=%s: inode = %llu %llu\n" 115 "GFS2: fsid=%s: inode = %llu %llu\n"
116 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 116 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
117 sdp->sd_fsname, 117 sdp->sd_fsname,
118 sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino, 118 sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino,
119 (unsigned long long)ip->i_num.no_addr, 119 (unsigned long long)ip->i_no_addr,
120 sdp->sd_fsname, function, file, line); 120 sdp->sd_fsname, function, file, line);
121 return rv; 121 return rv;
122} 122}
@@ -137,7 +137,7 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
137 "GFS2: fsid=%s: RG = %llu\n" 137 "GFS2: fsid=%s: RG = %llu\n"
138 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 138 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
139 sdp->sd_fsname, 139 sdp->sd_fsname,
140 sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr, 140 sdp->sd_fsname, (unsigned long long)rgd->rd_addr,
141 sdp->sd_fsname, function, file, line); 141 sdp->sd_fsname, function, file, line);
142 return rv; 142 return rv;
143} 143}
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9a934db0bd..bc835f272a 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -607,7 +607,7 @@ static const struct file_operations hfs_file_operations = {
607 .write = do_sync_write, 607 .write = do_sync_write,
608 .aio_write = generic_file_aio_write, 608 .aio_write = generic_file_aio_write,
609 .mmap = generic_file_mmap, 609 .mmap = generic_file_mmap,
610 .sendfile = generic_file_sendfile, 610 .splice_read = generic_file_splice_read,
611 .fsync = file_fsync, 611 .fsync = file_fsync,
612 .open = hfs_file_open, 612 .open = hfs_file_open,
613 .release = hfs_file_release, 613 .release = hfs_file_release,
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 90ebab753d..050d29c0a5 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -62,8 +62,10 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
62 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && 62 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
63 (head->key_type == HFSPLUS_KEY_BINARY)) 63 (head->key_type == HFSPLUS_KEY_BINARY))
64 tree->keycmp = hfsplus_cat_bin_cmp_key; 64 tree->keycmp = hfsplus_cat_bin_cmp_key;
65 else 65 else {
66 tree->keycmp = hfsplus_cat_case_cmp_key; 66 tree->keycmp = hfsplus_cat_case_cmp_key;
67 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
68 }
67 } else { 69 } else {
68 printk(KERN_ERR "hfs: unknown B*Tree requested\n"); 70 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
69 goto fail_page; 71 goto fail_page;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 80b5682a22..1955ee6125 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -36,6 +36,8 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
36 u16 type; 36 u16 type;
37 37
38 sb = dir->i_sb; 38 sb = dir->i_sb;
39
40 dentry->d_op = &hfsplus_dentry_operations;
39 dentry->d_fsdata = NULL; 41 dentry->d_fsdata = NULL;
40 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 42 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
41 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 3915635b44..d9f5eda6d0 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -150,6 +150,7 @@ struct hfsplus_sb_info {
150#define HFSPLUS_SB_NODECOMPOSE 0x0002 150#define HFSPLUS_SB_NODECOMPOSE 0x0002
151#define HFSPLUS_SB_FORCE 0x0004 151#define HFSPLUS_SB_FORCE 0x0004
152#define HFSPLUS_SB_HFSX 0x0008 152#define HFSPLUS_SB_HFSX 0x0008
153#define HFSPLUS_SB_CASEFOLD 0x0010
153 154
154 155
155struct hfsplus_inode_info { 156struct hfsplus_inode_info {
@@ -321,6 +322,7 @@ void hfsplus_file_truncate(struct inode *);
321/* inode.c */ 322/* inode.c */
322extern const struct address_space_operations hfsplus_aops; 323extern const struct address_space_operations hfsplus_aops;
323extern const struct address_space_operations hfsplus_btree_aops; 324extern const struct address_space_operations hfsplus_btree_aops;
325extern struct dentry_operations hfsplus_dentry_operations;
324 326
325void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); 327void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
326void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); 328void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
@@ -353,6 +355,8 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unist
353int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); 355int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
354int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); 356int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *);
355int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); 357int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int);
358int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str);
359int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2);
356 360
357/* wrapper.c */ 361/* wrapper.c */
358int hfsplus_read_wrapper(struct super_block *); 362int hfsplus_read_wrapper(struct super_block *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 45dab5d6cc..6f7c662174 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -131,6 +131,11 @@ const struct address_space_operations hfsplus_aops = {
131 .writepages = hfsplus_writepages, 131 .writepages = hfsplus_writepages,
132}; 132};
133 133
134struct dentry_operations hfsplus_dentry_operations = {
135 .d_hash = hfsplus_hash_dentry,
136 .d_compare = hfsplus_compare_dentry,
137};
138
134static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry, 139static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry,
135 struct nameidata *nd) 140 struct nameidata *nd)
136{ 141{
@@ -288,7 +293,7 @@ static const struct file_operations hfsplus_file_operations = {
288 .write = do_sync_write, 293 .write = do_sync_write,
289 .aio_write = generic_file_aio_write, 294 .aio_write = generic_file_aio_write,
290 .mmap = generic_file_mmap, 295 .mmap = generic_file_mmap,
291 .sendfile = generic_file_sendfile, 296 .splice_read = generic_file_splice_read,
292 .fsync = file_fsync, 297 .fsync = file_fsync,
293 .open = hfsplus_file_open, 298 .open = hfsplus_file_open,
294 .release = hfsplus_file_release, 299 .release = hfsplus_file_release,
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 79fd10402e..b60c0affbe 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -38,7 +38,7 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 if (IS_RDONLY(inode)) 38 if (IS_RDONLY(inode))
39 return -EROFS; 39 return -EROFS;
40 40
41 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 41 if (!is_owner_or_cap(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *)arg)) 44 if (get_user(flags, (int __user *)arg))
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index ebd1b380cb..6d87a2a953 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -283,11 +283,10 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
283 struct nls_table *nls = NULL; 283 struct nls_table *nls = NULL;
284 int err = -EINVAL; 284 int err = -EINVAL;
285 285
286 sbi = kmalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); 286 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
287 if (!sbi) 287 if (!sbi)
288 return -ENOMEM; 288 return -ENOMEM;
289 289
290 memset(sbi, 0, sizeof(HFSPLUS_SB(sb)));
291 sb->s_fs_info = sbi; 290 sb->s_fs_info = sbi;
292 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 291 INIT_HLIST_HEAD(&sbi->rsrc_inodes);
293 hfsplus_fill_defaults(sbi); 292 hfsplus_fill_defaults(sbi);
@@ -381,6 +380,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
381 iput(root); 380 iput(root);
382 goto cleanup; 381 goto cleanup;
383 } 382 }
383 sb->s_root->d_op = &hfsplus_dentry_operations;
384 384
385 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 385 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
386 str.name = HFSP_HIDDENDIR_NAME; 386 str.name = HFSP_HIDDENDIR_NAME;
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 689c8bd721..9e10f9444b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -239,61 +239,201 @@ out:
239 return res; 239 return res;
240} 240}
241 241
242int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, const char *astr, int len) 242/*
243 * Convert one or more ASCII characters into a single unicode character.
244 * Returns the number of ASCII characters corresponding to the unicode char.
245 */
246static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
247 wchar_t *uc)
243{ 248{
244 struct nls_table *nls = HFSPLUS_SB(sb).nls; 249 int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
245 int size, off, decompose; 250 if (size <= 0) {
251 *uc = '?';
252 size = 1;
253 }
254 switch (*uc) {
255 case 0x2400:
256 *uc = 0;
257 break;
258 case ':':
259 *uc = '/';
260 break;
261 }
262 return size;
263}
264
265/* Decomposes a single unicode character. */
266static inline u16 *decompose_unichar(wchar_t uc, int *size)
267{
268 int off;
269
270 off = hfsplus_decompose_table[(uc >> 12) & 0xf];
271 if (off == 0 || off == 0xffff)
272 return NULL;
273
274 off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)];
275 if (!off)
276 return NULL;
277
278 off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)];
279 if (!off)
280 return NULL;
281
282 off = hfsplus_decompose_table[off + (uc & 0xf)];
283 *size = off & 3;
284 if (*size == 0)
285 return NULL;
286 return hfsplus_decompose_table + (off / 4);
287}
288
289int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
290 const char *astr, int len)
291{
292 int size, dsize, decompose;
293 u16 *dstr, outlen = 0;
246 wchar_t c; 294 wchar_t c;
247 u16 outlen = 0;
248 295
249 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 296 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
250
251 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { 297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
252 size = nls->char2uni(astr, len, &c); 298 size = asc2unichar(sb, astr, len, &c);
253 if (size <= 0) { 299
254 c = '?'; 300 if (decompose && (dstr = decompose_unichar(c, &dsize))) {
255 size = 1; 301 if (outlen + dsize > HFSPLUS_MAX_STRLEN)
256 }
257 astr += size;
258 len -= size;
259 switch (c) {
260 case 0x2400:
261 c = 0;
262 break;
263 case ':':
264 c = '/';
265 break;
266 }
267 if (c >= 0xc0 && decompose) {
268 off = hfsplus_decompose_table[(c >> 12) & 0xf];
269 if (!off)
270 goto done;
271 if (off == 0xffff) {
272 goto done;
273 }
274 off = hfsplus_decompose_table[off + ((c >> 8) & 0xf)];
275 if (!off)
276 goto done;
277 off = hfsplus_decompose_table[off + ((c >> 4) & 0xf)];
278 if (!off)
279 goto done;
280 off = hfsplus_decompose_table[off + (c & 0xf)];
281 size = off & 3;
282 if (!size)
283 goto done;
284 off /= 4;
285 if (outlen + size > HFSPLUS_MAX_STRLEN)
286 break; 302 break;
287 do { 303 do {
288 ustr->unicode[outlen++] = cpu_to_be16(hfsplus_decompose_table[off++]); 304 ustr->unicode[outlen++] = cpu_to_be16(*dstr++);
289 } while (--size > 0); 305 } while (--dsize > 0);
290 continue; 306 } else
291 } 307 ustr->unicode[outlen++] = cpu_to_be16(c);
292 done: 308
293 ustr->unicode[outlen++] = cpu_to_be16(c); 309 astr += size;
310 len -= size;
294 } 311 }
295 ustr->length = cpu_to_be16(outlen); 312 ustr->length = cpu_to_be16(outlen);
296 if (len > 0) 313 if (len > 0)
297 return -ENAMETOOLONG; 314 return -ENAMETOOLONG;
298 return 0; 315 return 0;
299} 316}
317
318/*
319 * Hash a string to an integer as appropriate for the HFS+ filesystem.
320 * Composed unicode characters are decomposed and case-folding is performed
321 * if the appropriate bits are (un)set on the superblock.
322 */
323int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
324{
325 struct super_block *sb = dentry->d_sb;
326 const char *astr;
327 const u16 *dstr;
328 int casefold, decompose, size, dsize, len;
329 unsigned long hash;
330 wchar_t c;
331 u16 c2;
332
333 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
334 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
335 hash = init_name_hash();
336 astr = str->name;
337 len = str->len;
338 while (len > 0) {
339 size = asc2unichar(sb, astr, len, &c);
340 astr += size;
341 len -= size;
342
343 if (decompose && (dstr = decompose_unichar(c, &dsize))) {
344 do {
345 c2 = *dstr++;
346 if (!casefold || (c2 = case_fold(c2)))
347 hash = partial_name_hash(c2, hash);
348 } while (--dsize > 0);
349 } else {
350 c2 = c;
351 if (!casefold || (c2 = case_fold(c2)))
352 hash = partial_name_hash(c2, hash);
353 }
354 }
355 str->hash = end_name_hash(hash);
356
357 return 0;
358}
359
360/*
361 * Compare strings with HFS+ filename ordering.
362 * Composed unicode characters are decomposed and case-folding is performed
363 * if the appropriate bits are (un)set on the superblock.
364 */
365int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
366{
367 struct super_block *sb = dentry->d_sb;
368 int casefold, decompose, size;
369 int dsize1, dsize2, len1, len2;
370 const u16 *dstr1, *dstr2;
371 const char *astr1, *astr2;
372 u16 c1, c2;
373 wchar_t c;
374
375 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
376 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
377 astr1 = s1->name;
378 len1 = s1->len;
379 astr2 = s2->name;
380 len2 = s2->len;
381 dsize1 = dsize2 = 0;
382 dstr1 = dstr2 = NULL;
383
384 while (len1 > 0 && len2 > 0) {
385 if (!dsize1) {
386 size = asc2unichar(sb, astr1, len1, &c);
387 astr1 += size;
388 len1 -= size;
389
390 if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) {
391 c1 = c;
392 dstr1 = &c1;
393 dsize1 = 1;
394 }
395 }
396
397 if (!dsize2) {
398 size = asc2unichar(sb, astr2, len2, &c);
399 astr2 += size;
400 len2 -= size;
401
402 if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) {
403 c2 = c;
404 dstr2 = &c2;
405 dsize2 = 1;
406 }
407 }
408
409 c1 = *dstr1;
410 c2 = *dstr2;
411 if (casefold) {
412 if (!(c1 = case_fold(c1))) {
413 dstr1++;
414 dsize1--;
415 continue;
416 }
417 if (!(c2 = case_fold(c2))) {
418 dstr2++;
419 dsize2--;
420 continue;
421 }
422 }
423 if (c1 < c2)
424 return -1;
425 else if (c1 > c2)
426 return 1;
427
428 dstr1++;
429 dsize1--;
430 dstr2++;
431 dsize2--;
432 }
433
434 if (len1 < len2)
435 return -1;
436 if (len1 > len2)
437 return 1;
438 return 0;
439}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 8286491dbf..c77862032e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -390,7 +390,7 @@ int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
390static const struct file_operations hostfs_file_fops = { 390static const struct file_operations hostfs_file_fops = {
391 .llseek = generic_file_llseek, 391 .llseek = generic_file_llseek,
392 .read = do_sync_read, 392 .read = do_sync_read,
393 .sendfile = generic_file_sendfile, 393 .splice_read = generic_file_splice_read,
394 .aio_read = generic_file_aio_read, 394 .aio_read = generic_file_aio_read,
395 .aio_write = generic_file_aio_write, 395 .aio_write = generic_file_aio_write,
396 .write = do_sync_write, 396 .write = do_sync_write,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index b4eafc0f1e..5b53e5c5d8 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -129,7 +129,7 @@ const struct file_operations hpfs_file_ops =
129 .mmap = generic_file_mmap, 129 .mmap = generic_file_mmap,
130 .release = hpfs_file_release, 130 .release = hpfs_file_release,
131 .fsync = hpfs_file_fsync, 131 .fsync = hpfs_file_fsync,
132 .sendfile = generic_file_sendfile, 132 .splice_read = generic_file_splice_read,
133}; 133};
134 134
135const struct inode_operations hpfs_file_iops = 135const struct inode_operations hpfs_file_iops =
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e6b46b3ac2..d145cb79c3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -13,15 +13,18 @@
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/kernel.h>
16#include <linux/writeback.h> 17#include <linux/writeback.h>
17#include <linux/pagemap.h> 18#include <linux/pagemap.h>
18#include <linux/highmem.h> 19#include <linux/highmem.h>
19#include <linux/init.h> 20#include <linux/init.h>
20#include <linux/string.h> 21#include <linux/string.h>
21#include <linux/capability.h> 22#include <linux/capability.h>
23#include <linux/ctype.h>
22#include <linux/backing-dev.h> 24#include <linux/backing-dev.h>
23#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
24#include <linux/pagevec.h> 26#include <linux/pagevec.h>
27#include <linux/parser.h>
25#include <linux/mman.h> 28#include <linux/mman.h>
26#include <linux/quotaops.h> 29#include <linux/quotaops.h>
27#include <linux/slab.h> 30#include <linux/slab.h>
@@ -47,6 +50,21 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
47 50
48int sysctl_hugetlb_shm_group; 51int sysctl_hugetlb_shm_group;
49 52
53enum {
54 Opt_size, Opt_nr_inodes,
55 Opt_mode, Opt_uid, Opt_gid,
56 Opt_err,
57};
58
59static match_table_t tokens = {
60 {Opt_size, "size=%s"},
61 {Opt_nr_inodes, "nr_inodes=%s"},
62 {Opt_mode, "mode=%o"},
63 {Opt_uid, "uid=%u"},
64 {Opt_gid, "gid=%u"},
65 {Opt_err, NULL},
66};
67
50static void huge_pagevec_release(struct pagevec *pvec) 68static void huge_pagevec_release(struct pagevec *pvec)
51{ 69{
52 int i; 70 int i;
@@ -594,46 +612,73 @@ static const struct super_operations hugetlbfs_ops = {
594static int 612static int
595hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) 613hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
596{ 614{
597 char *opt, *value, *rest; 615 char *p, *rest;
616 substring_t args[MAX_OPT_ARGS];
617 int option;
598 618
599 if (!options) 619 if (!options)
600 return 0; 620 return 0;
601 while ((opt = strsep(&options, ",")) != NULL) { 621
602 if (!*opt) 622 while ((p = strsep(&options, ",")) != NULL) {
623 int token;
624 if (!*p)
603 continue; 625 continue;
604 626
605 value = strchr(opt, '='); 627 token = match_token(p, tokens, args);
606 if (!value || !*value) 628 switch (token) {
607 return -EINVAL; 629 case Opt_uid:
608 else 630 if (match_int(&args[0], &option))
609 *value++ = '\0'; 631 goto bad_val;
610 632 pconfig->uid = option;
611 if (!strcmp(opt, "uid")) 633 break;
612 pconfig->uid = simple_strtoul(value, &value, 0); 634
613 else if (!strcmp(opt, "gid")) 635 case Opt_gid:
614 pconfig->gid = simple_strtoul(value, &value, 0); 636 if (match_int(&args[0], &option))
615 else if (!strcmp(opt, "mode")) 637 goto bad_val;
616 pconfig->mode = simple_strtoul(value,&value,0) & 0777U; 638 pconfig->gid = option;
617 else if (!strcmp(opt, "size")) { 639 break;
618 unsigned long long size = memparse(value, &rest); 640
641 case Opt_mode:
642 if (match_octal(&args[0], &option))
643 goto bad_val;
644 pconfig->mode = option & 0777U;
645 break;
646
647 case Opt_size: {
648 unsigned long long size;
649 /* memparse() will accept a K/M/G without a digit */
650 if (!isdigit(*args[0].from))
651 goto bad_val;
652 size = memparse(args[0].from, &rest);
619 if (*rest == '%') { 653 if (*rest == '%') {
620 size <<= HPAGE_SHIFT; 654 size <<= HPAGE_SHIFT;
621 size *= max_huge_pages; 655 size *= max_huge_pages;
622 do_div(size, 100); 656 do_div(size, 100);
623 rest++;
624 } 657 }
625 pconfig->nr_blocks = (size >> HPAGE_SHIFT); 658 pconfig->nr_blocks = (size >> HPAGE_SHIFT);
626 value = rest; 659 break;
627 } else if (!strcmp(opt,"nr_inodes")) { 660 }
628 pconfig->nr_inodes = memparse(value, &rest); 661
629 value = rest; 662 case Opt_nr_inodes:
630 } else 663 /* memparse() will accept a K/M/G without a digit */
631 return -EINVAL; 664 if (!isdigit(*args[0].from))
665 goto bad_val;
666 pconfig->nr_inodes = memparse(args[0].from, &rest);
667 break;
632 668
633 if (*value) 669 default:
670 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
671 p);
634 return -EINVAL; 672 return -EINVAL;
673 break;
674 }
635 } 675 }
636 return 0; 676 return 0;
677
678bad_val:
679 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
680 args[0].from, p);
681 return 1;
637} 682}
638 683
639static int 684static int
@@ -651,7 +696,6 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
651 config.gid = current->fsgid; 696 config.gid = current->fsgid;
652 config.mode = 0755; 697 config.mode = 0755;
653 ret = hugetlbfs_parse_options(data, &config); 698 ret = hugetlbfs_parse_options(data, &config);
654
655 if (ret) 699 if (ret)
656 return ret; 700 return ret;
657 701
diff --git a/fs/inode.c b/fs/inode.c
index 9a012cc5b6..320e088d0b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -145,7 +145,7 @@ static struct inode *alloc_inode(struct super_block *sb)
145 mapping->a_ops = &empty_aops; 145 mapping->a_ops = &empty_aops;
146 mapping->host = inode; 146 mapping->host = inode;
147 mapping->flags = 0; 147 mapping->flags = 0;
148 mapping_set_gfp_mask(mapping, GFP_HIGHUSER); 148 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
149 mapping->assoc_mapping = NULL; 149 mapping->assoc_mapping = NULL;
150 mapping->backing_dev_info = &default_backing_dev_info; 150 mapping->backing_dev_info = &default_backing_dev_info;
151 151
@@ -462,6 +462,11 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
462 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 462 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
463} 463}
464 464
465static struct shrinker icache_shrinker = {
466 .shrink = shrink_icache_memory,
467 .seeks = DEFAULT_SEEKS,
468};
469
465static void __wait_on_freeing_inode(struct inode *inode); 470static void __wait_on_freeing_inode(struct inode *inode);
466/* 471/*
467 * Called with the inode lock held. 472 * Called with the inode lock held.
@@ -519,7 +524,13 @@ repeat:
519 * new_inode - obtain an inode 524 * new_inode - obtain an inode
520 * @sb: superblock 525 * @sb: superblock
521 * 526 *
522 * Allocates a new inode for given superblock. 527 * Allocates a new inode for given superblock. The default gfp_mask
528 * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE.
529 * If HIGHMEM pages are unsuitable or it is known that pages allocated
530 * for the page cache are not reclaimable or migratable,
531 * mapping_set_gfp_mask() must be called with suitable flags on the
532 * newly created inode's mapping
533 *
523 */ 534 */
524struct inode *new_inode(struct super_block *sb) 535struct inode *new_inode(struct super_block *sb)
525{ 536{
@@ -1379,7 +1390,7 @@ void __init inode_init(unsigned long mempages)
1379 SLAB_MEM_SPREAD), 1390 SLAB_MEM_SPREAD),
1380 init_once, 1391 init_once,
1381 NULL); 1392 NULL);
1382 set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); 1393 register_shrinker(&icache_shrinker);
1383 1394
1384 /* Hash may have been set up in inode_init_early */ 1395 /* Hash may have been set up in inode_init_early */
1385 if (!hashdist) 1396 if (!hashdist)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 8c90cbc903..c2a773e862 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -12,7 +12,6 @@
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kallsyms.h>
16 15
17#include <asm/uaccess.h> 16#include <asm/uaccess.h>
18#include <asm/ioctls.h> 17#include <asm/ioctls.h>
@@ -21,7 +20,6 @@ static long do_ioctl(struct file *filp, unsigned int cmd,
21 unsigned long arg) 20 unsigned long arg)
22{ 21{
23 int error = -ENOTTY; 22 int error = -ENOTTY;
24 void *f;
25 23
26 if (!filp->f_op) 24 if (!filp->f_op)
27 goto out; 25 goto out;
@@ -31,16 +29,10 @@ static long do_ioctl(struct file *filp, unsigned int cmd,
31 if (error == -ENOIOCTLCMD) 29 if (error == -ENOIOCTLCMD)
32 error = -EINVAL; 30 error = -EINVAL;
33 goto out; 31 goto out;
34 } else if ((f = filp->f_op->ioctl)) { 32 } else if (filp->f_op->ioctl) {
35 lock_kernel(); 33 lock_kernel();
36 if (!filp->f_op->ioctl) { 34 error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
37 printk("%s: ioctl %p disappeared\n", __FUNCTION__, f); 35 filp, cmd, arg);
38 print_symbol("symbol: %s\n", (unsigned long)f);
39 dump_stack();
40 } else {
41 error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
42 filp, cmd, arg);
43 }
44 unlock_kernel(); 36 unlock_kernel();
45 } 37 }
46 38
@@ -182,11 +174,3 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
182 out: 174 out:
183 return error; 175 return error;
184} 176}
185
186/*
187 * Platforms implementing 32 bit compatibility ioctl handlers in
188 * modules need this exported
189 */
190#ifdef CONFIG_COMPAT
191EXPORT_SYMBOL(sys_ioctl);
192#endif
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 0e94c31cad..1ba407c64d 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -7,34 +7,18 @@
7 * 7 *
8 * Steve Beynon : Missing last directory entries fixed 8 * Steve Beynon : Missing last directory entries fixed
9 * (stephen@askone.demon.co.uk) : 21st June 1996 9 * (stephen@askone.demon.co.uk) : 21st June 1996
10 * 10 *
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include "isofs.h" 14#include "isofs.h"
15 15
16static int isofs_readdir(struct file *, void *, filldir_t);
17
18const struct file_operations isofs_dir_operations =
19{
20 .read = generic_read_dir,
21 .readdir = isofs_readdir,
22};
23
24/*
25 * directories can handle most operations...
26 */
27const struct inode_operations isofs_dir_inode_operations =
28{
29 .lookup = isofs_lookup,
30};
31
32int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) 16int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
33{ 17{
34 char * old = de->name; 18 char * old = de->name;
35 int len = de->name_len[0]; 19 int len = de->name_len[0];
36 int i; 20 int i;
37 21
38 for (i = 0; i < len; i++) { 22 for (i = 0; i < len; i++) {
39 unsigned char c = old[i]; 23 unsigned char c = old[i];
40 if (!c) 24 if (!c)
@@ -62,22 +46,27 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod
62} 46}
63 47
64/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */ 48/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */
65int get_acorn_filename(struct iso_directory_record * de, 49int get_acorn_filename(struct iso_directory_record *de,
66 char * retname, struct inode * inode) 50 char *retname, struct inode *inode)
67{ 51{
68 int std; 52 int std;
69 unsigned char * chr; 53 unsigned char *chr;
70 int retnamlen = isofs_name_translate(de, retname, inode); 54 int retnamlen = isofs_name_translate(de, retname, inode);
71 if (retnamlen == 0) return 0; 55
56 if (retnamlen == 0)
57 return 0;
72 std = sizeof(struct iso_directory_record) + de->name_len[0]; 58 std = sizeof(struct iso_directory_record) + de->name_len[0];
73 if (std & 1) std++; 59 if (std & 1)
74 if ((*((unsigned char *) de) - std) != 32) return retnamlen; 60 std++;
61 if ((*((unsigned char *) de) - std) != 32)
62 return retnamlen;
75 chr = ((unsigned char *) de) + std; 63 chr = ((unsigned char *) de) + std;
76 if (strncmp(chr, "ARCHIMEDES", 10)) return retnamlen; 64 if (strncmp(chr, "ARCHIMEDES", 10))
77 if ((*retname == '_') && ((chr[19] & 1) == 1)) *retname = '!'; 65 return retnamlen;
66 if ((*retname == '_') && ((chr[19] & 1) == 1))
67 *retname = '!';
78 if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff) 68 if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff)
79 && ((chr[12] & 0xf0) == 0xf0)) 69 && ((chr[12] & 0xf0) == 0xf0)) {
80 {
81 retname[retnamlen] = ','; 70 retname[retnamlen] = ',';
82 sprintf(retname+retnamlen+1, "%3.3x", 71 sprintf(retname+retnamlen+1, "%3.3x",
83 ((chr[12] & 0xf) << 8) | chr[11]); 72 ((chr[12] & 0xf) << 8) | chr[11]);
@@ -91,7 +80,7 @@ int get_acorn_filename(struct iso_directory_record * de,
91 */ 80 */
92static int do_isofs_readdir(struct inode *inode, struct file *filp, 81static int do_isofs_readdir(struct inode *inode, struct file *filp,
93 void *dirent, filldir_t filldir, 82 void *dirent, filldir_t filldir,
94 char * tmpname, struct iso_directory_record * tmpde) 83 char *tmpname, struct iso_directory_record *tmpde)
95{ 84{
96 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 85 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
97 unsigned char bufbits = ISOFS_BUFFER_BITS(inode); 86 unsigned char bufbits = ISOFS_BUFFER_BITS(inode);
@@ -121,9 +110,11 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
121 110
122 de_len = *(unsigned char *) de; 111 de_len = *(unsigned char *) de;
123 112
124 /* If the length byte is zero, we should move on to the next 113 /*
125 CDROM sector. If we are at the end of the directory, we 114 * If the length byte is zero, we should move on to the next
126 kick out of the while loop. */ 115 * CDROM sector. If we are at the end of the directory, we
116 * kick out of the while loop.
117 */
127 118
128 if (de_len == 0) { 119 if (de_len == 0) {
129 brelse(bh); 120 brelse(bh);
@@ -157,11 +148,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
157 148
158 if (first_de) { 149 if (first_de) {
159 isofs_normalize_block_and_offset(de, 150 isofs_normalize_block_and_offset(de,
160 &block_saved, 151 &block_saved,
161 &offset_saved); 152 &offset_saved);
162 inode_number = isofs_get_ino(block_saved, 153 inode_number = isofs_get_ino(block_saved,
163 offset_saved, 154 offset_saved, bufbits);
164 bufbits);
165 } 155 }
166 156
167 if (de->flags[-sbi->s_high_sierra] & 0x80) { 157 if (de->flags[-sbi->s_high_sierra] & 0x80) {
@@ -199,7 +189,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
199 */ 189 */
200 if ((sbi->s_hide == 'y' && 190 if ((sbi->s_hide == 'y' &&
201 (de->flags[-sbi->s_high_sierra] & 1)) || 191 (de->flags[-sbi->s_high_sierra] & 1)) ||
202 (sbi->s_showassoc =='n' && 192 (sbi->s_showassoc =='n' &&
203 (de->flags[-sbi->s_high_sierra] & 4))) { 193 (de->flags[-sbi->s_high_sierra] & 4))) {
204 filp->f_pos += de_len; 194 filp->f_pos += de_len;
205 continue; 195 continue;
@@ -240,7 +230,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
240 230
241 continue; 231 continue;
242 } 232 }
243 if (bh) brelse(bh); 233 if (bh)
234 brelse(bh);
244 return 0; 235 return 0;
245} 236}
246 237
@@ -253,8 +244,8 @@ static int isofs_readdir(struct file *filp,
253 void *dirent, filldir_t filldir) 244 void *dirent, filldir_t filldir)
254{ 245{
255 int result; 246 int result;
256 char * tmpname; 247 char *tmpname;
257 struct iso_directory_record * tmpde; 248 struct iso_directory_record *tmpde;
258 struct inode *inode = filp->f_path.dentry->d_inode; 249 struct inode *inode = filp->f_path.dentry->d_inode;
259 250
260 tmpname = (char *)__get_free_page(GFP_KERNEL); 251 tmpname = (char *)__get_free_page(GFP_KERNEL);
@@ -270,3 +261,19 @@ static int isofs_readdir(struct file *filp,
270 unlock_kernel(); 261 unlock_kernel();
271 return result; 262 return result;
272} 263}
264
265const struct file_operations isofs_dir_operations =
266{
267 .read = generic_read_dir,
268 .readdir = isofs_readdir,
269};
270
271/*
272 * directories can handle most operations...
273 */
274const struct inode_operations isofs_dir_inode_operations =
275{
276 .lookup = isofs_lookup,
277};
278
279
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 5c3eecf754..4f5418be05 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -73,20 +73,20 @@ static void isofs_destroy_inode(struct inode *inode)
73 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); 73 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
74} 74}
75 75
76static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags) 76static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
77{ 77{
78 struct iso_inode_info *ei = foo; 78 struct iso_inode_info *ei = foo;
79 79
80 inode_init_once(&ei->vfs_inode); 80 inode_init_once(&ei->vfs_inode);
81} 81}
82 82
83static int init_inodecache(void) 83static int init_inodecache(void)
84{ 84{
85 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", 85 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
86 sizeof(struct iso_inode_info), 86 sizeof(struct iso_inode_info),
87 0, (SLAB_RECLAIM_ACCOUNT| 87 0, (SLAB_RECLAIM_ACCOUNT|
88 SLAB_MEM_SPREAD), 88 SLAB_MEM_SPREAD),
89 init_once, NULL); 89 init_once, NULL);
90 if (isofs_inode_cachep == NULL) 90 if (isofs_inode_cachep == NULL)
91 return -ENOMEM; 91 return -ENOMEM;
92 return 0; 92 return 0;
@@ -150,9 +150,9 @@ struct iso9660_options{
150 uid_t uid; 150 uid_t uid;
151 char *iocharset; 151 char *iocharset;
152 unsigned char utf8; 152 unsigned char utf8;
153 /* LVE */ 153 /* LVE */
154 s32 session; 154 s32 session;
155 s32 sbsector; 155 s32 sbsector;
156}; 156};
157 157
158/* 158/*
@@ -197,7 +197,7 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
197 hash = init_name_hash(); 197 hash = init_name_hash();
198 while (len--) { 198 while (len--) {
199 c = tolower(*name++); 199 c = tolower(*name++);
200 hash = partial_name_hash(tolower(c), hash); 200 hash = partial_name_hash(c, hash);
201 } 201 }
202 qstr->hash = end_name_hash(hash); 202 qstr->hash = end_name_hash(hash);
203 203
@@ -360,10 +360,12 @@ static int parse_options(char *options, struct iso9660_options *popt)
360 popt->check = 'u'; /* unset */ 360 popt->check = 'u'; /* unset */
361 popt->nocompress = 0; 361 popt->nocompress = 0;
362 popt->blocksize = 1024; 362 popt->blocksize = 1024;
363 popt->mode = S_IRUGO | S_IXUGO; /* r-x for all. The disc could 363 popt->mode = S_IRUGO | S_IXUGO; /*
364 be shared with DOS machines so 364 * r-x for all. The disc could
365 virtually anything could be 365 * be shared with DOS machines so
366 a valid executable. */ 366 * virtually anything could be
367 * a valid executable.
368 */
367 popt->gid = 0; 369 popt->gid = 0;
368 popt->uid = 0; 370 popt->uid = 0;
369 popt->iocharset = NULL; 371 popt->iocharset = NULL;
@@ -503,30 +505,30 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
503 Te.cdte_format=CDROM_LBA; 505 Te.cdte_format=CDROM_LBA;
504 i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te); 506 i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te);
505 if (!i) { 507 if (!i) {
506 printk(KERN_DEBUG "Session %d start %d type %d\n", 508 printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n",
507 session, Te.cdte_addr.lba, 509 session, Te.cdte_addr.lba,
508 Te.cdte_ctrl&CDROM_DATA_TRACK); 510 Te.cdte_ctrl&CDROM_DATA_TRACK);
509 if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4) 511 if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4)
510 return Te.cdte_addr.lba; 512 return Te.cdte_addr.lba;
511 } 513 }
512 514
513 printk(KERN_ERR "Invalid session number or type of track\n"); 515 printk(KERN_ERR "ISOFS: Invalid session number or type of track\n");
514 } 516 }
515 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); 517 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info);
516 if (session > 0) 518 if (session > 0)
517 printk(KERN_ERR "Invalid session number\n"); 519 printk(KERN_ERR "ISOFS: Invalid session number\n");
518#if 0 520#if 0
519 printk("isofs.inode: CDROMMULTISESSION: rc=%d\n",i); 521 printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i);
520 if (i==0) { 522 if (i==0) {
521 printk("isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); 523 printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no");
522 printk("isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); 524 printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba);
523 } 525 }
524#endif 526#endif
525 if (i==0) 527 if (i==0)
526#if WE_OBEY_THE_WRITTEN_STANDARDS 528#if WE_OBEY_THE_WRITTEN_STANDARDS
527 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ 529 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
528#endif 530#endif
529 vol_desc_start=ms_info.addr.lba; 531 vol_desc_start=ms_info.addr.lba;
530 return vol_desc_start; 532 return vol_desc_start;
531} 533}
532 534
@@ -538,20 +540,20 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
538 */ 540 */
539static int isofs_fill_super(struct super_block *s, void *data, int silent) 541static int isofs_fill_super(struct super_block *s, void *data, int silent)
540{ 542{
541 struct buffer_head * bh = NULL, *pri_bh = NULL; 543 struct buffer_head *bh = NULL, *pri_bh = NULL;
542 struct hs_primary_descriptor * h_pri = NULL; 544 struct hs_primary_descriptor *h_pri = NULL;
543 struct iso_primary_descriptor * pri = NULL; 545 struct iso_primary_descriptor *pri = NULL;
544 struct iso_supplementary_descriptor *sec = NULL; 546 struct iso_supplementary_descriptor *sec = NULL;
545 struct iso_directory_record * rootp; 547 struct iso_directory_record *rootp;
546 int joliet_level = 0; 548 struct inode *inode;
547 int iso_blknum, block; 549 struct iso9660_options opt;
548 int orig_zonesize; 550 struct isofs_sb_info *sbi;
549 int table; 551 unsigned long first_data_zone;
550 unsigned int vol_desc_start; 552 int joliet_level = 0;
551 unsigned long first_data_zone; 553 int iso_blknum, block;
552 struct inode * inode; 554 int orig_zonesize;
553 struct iso9660_options opt; 555 int table;
554 struct isofs_sb_info * sbi; 556 unsigned int vol_desc_start;
555 557
556 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 558 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
557 if (!sbi) 559 if (!sbi)
@@ -577,72 +579,73 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
577 vol_desc_start = (opt.sbsector != -1) ? 579 vol_desc_start = (opt.sbsector != -1) ?
578 opt.sbsector : isofs_get_last_session(s,opt.session); 580 opt.sbsector : isofs_get_last_session(s,opt.session);
579 581
580 for (iso_blknum = vol_desc_start+16; 582 for (iso_blknum = vol_desc_start+16;
581 iso_blknum < vol_desc_start+100; iso_blknum++) 583 iso_blknum < vol_desc_start+100; iso_blknum++) {
582 { 584 struct hs_volume_descriptor *hdp;
583 struct hs_volume_descriptor * hdp; 585 struct iso_volume_descriptor *vdp;
584 struct iso_volume_descriptor * vdp; 586
585 587 block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits);
586 block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); 588 if (!(bh = sb_bread(s, block)))
587 if (!(bh = sb_bread(s, block))) 589 goto out_no_read;
588 goto out_no_read; 590
589 591 vdp = (struct iso_volume_descriptor *)bh->b_data;
590 vdp = (struct iso_volume_descriptor *)bh->b_data; 592 hdp = (struct hs_volume_descriptor *)bh->b_data;
591 hdp = (struct hs_volume_descriptor *)bh->b_data; 593
592 594 /*
593 /* Due to the overlapping physical location of the descriptors, 595 * Due to the overlapping physical location of the descriptors,
594 * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure 596 * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure
595 * proper identification in this case, we first check for ISO. 597 * proper identification in this case, we first check for ISO.
596 */ 598 */
597 if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { 599 if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) {
598 if (isonum_711 (vdp->type) == ISO_VD_END) 600 if (isonum_711(vdp->type) == ISO_VD_END)
599 break; 601 break;
600 if (isonum_711 (vdp->type) == ISO_VD_PRIMARY) { 602 if (isonum_711(vdp->type) == ISO_VD_PRIMARY) {
601 if (pri == NULL) { 603 if (pri == NULL) {
602 pri = (struct iso_primary_descriptor *)vdp; 604 pri = (struct iso_primary_descriptor *)vdp;
603 /* Save the buffer in case we need it ... */ 605 /* Save the buffer in case we need it ... */
604 pri_bh = bh; 606 pri_bh = bh;
605 bh = NULL; 607 bh = NULL;
606 } 608 }
607 } 609 }
608#ifdef CONFIG_JOLIET 610#ifdef CONFIG_JOLIET
609 else if (isonum_711 (vdp->type) == ISO_VD_SUPPLEMENTARY) { 611 else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
610 sec = (struct iso_supplementary_descriptor *)vdp; 612 sec = (struct iso_supplementary_descriptor *)vdp;
611 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { 613 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
612 if (opt.joliet == 'y') { 614 if (opt.joliet == 'y') {
613 if (sec->escape[2] == 0x40) { 615 if (sec->escape[2] == 0x40)
614 joliet_level = 1; 616 joliet_level = 1;
615 } else if (sec->escape[2] == 0x43) { 617 else if (sec->escape[2] == 0x43)
616 joliet_level = 2; 618 joliet_level = 2;
617 } else if (sec->escape[2] == 0x45) { 619 else if (sec->escape[2] == 0x45)
618 joliet_level = 3; 620 joliet_level = 3;
619 } 621
620 printk(KERN_DEBUG"ISO 9660 Extensions: Microsoft Joliet Level %d\n", 622 printk(KERN_DEBUG "ISO 9660 Extensions: "
621 joliet_level); 623 "Microsoft Joliet Level %d\n",
624 joliet_level);
625 }
626 goto root_found;
627 } else {
628 /* Unknown supplementary volume descriptor */
629 sec = NULL;
630 }
622 } 631 }
623 goto root_found;
624 } else {
625 /* Unknown supplementary volume descriptor */
626 sec = NULL;
627 }
628 }
629#endif 632#endif
630 } else { 633 } else {
631 if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { 634 if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) {
632 if (isonum_711 (hdp->type) != ISO_VD_PRIMARY) 635 if (isonum_711(hdp->type) != ISO_VD_PRIMARY)
633 goto out_freebh; 636 goto out_freebh;
634 637
635 sbi->s_high_sierra = 1; 638 sbi->s_high_sierra = 1;
636 opt.rock = 'n'; 639 opt.rock = 'n';
637 h_pri = (struct hs_primary_descriptor *)vdp; 640 h_pri = (struct hs_primary_descriptor *)vdp;
638 goto root_found; 641 goto root_found;
642 }
639 } 643 }
640 }
641 644
642 /* Just skip any volume descriptors we don't recognize */ 645 /* Just skip any volume descriptors we don't recognize */
643 646
644 brelse(bh); 647 brelse(bh);
645 bh = NULL; 648 bh = NULL;
646 } 649 }
647 /* 650 /*
648 * If we fall through, either no volume descriptor was found, 651 * If we fall through, either no volume descriptor was found,
@@ -657,24 +660,24 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
657root_found: 660root_found:
658 661
659 if (joliet_level && (pri == NULL || opt.rock == 'n')) { 662 if (joliet_level && (pri == NULL || opt.rock == 'n')) {
660 /* This is the case of Joliet with the norock mount flag. 663 /* This is the case of Joliet with the norock mount flag.
661 * A disc with both Joliet and Rock Ridge is handled later 664 * A disc with both Joliet and Rock Ridge is handled later
662 */ 665 */
663 pri = (struct iso_primary_descriptor *) sec; 666 pri = (struct iso_primary_descriptor *) sec;
664 } 667 }
665 668
666 if(sbi->s_high_sierra){ 669 if(sbi->s_high_sierra){
667 rootp = (struct iso_directory_record *) h_pri->root_directory_record; 670 rootp = (struct iso_directory_record *) h_pri->root_directory_record;
668 sbi->s_nzones = isonum_733 (h_pri->volume_space_size); 671 sbi->s_nzones = isonum_733(h_pri->volume_space_size);
669 sbi->s_log_zone_size = isonum_723 (h_pri->logical_block_size); 672 sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size);
670 sbi->s_max_size = isonum_733(h_pri->volume_space_size); 673 sbi->s_max_size = isonum_733(h_pri->volume_space_size);
671 } else { 674 } else {
672 if (!pri) 675 if (!pri)
673 goto out_freebh; 676 goto out_freebh;
674 rootp = (struct iso_directory_record *) pri->root_directory_record; 677 rootp = (struct iso_directory_record *) pri->root_directory_record;
675 sbi->s_nzones = isonum_733 (pri->volume_space_size); 678 sbi->s_nzones = isonum_733(pri->volume_space_size);
676 sbi->s_log_zone_size = isonum_723 (pri->logical_block_size); 679 sbi->s_log_zone_size = isonum_723(pri->logical_block_size);
677 sbi->s_max_size = isonum_733(pri->volume_space_size); 680 sbi->s_max_size = isonum_733(pri->volume_space_size);
678 } 681 }
679 682
680 sbi->s_ninodes = 0; /* No way to figure this out easily */ 683 sbi->s_ninodes = 0; /* No way to figure this out easily */
@@ -687,42 +690,43 @@ root_found:
687 * blocks that were 512 bytes (which should only very rarely 690 * blocks that were 512 bytes (which should only very rarely
688 * happen.) 691 * happen.)
689 */ 692 */
690 if(orig_zonesize < opt.blocksize) 693 if (orig_zonesize < opt.blocksize)
691 goto out_bad_size; 694 goto out_bad_size;
692 695
693 /* RDE: convert log zone size to bit shift */ 696 /* RDE: convert log zone size to bit shift */
694 switch (sbi->s_log_zone_size) 697 switch (sbi->s_log_zone_size) {
695 { case 512: sbi->s_log_zone_size = 9; break; 698 case 512: sbi->s_log_zone_size = 9; break;
696 case 1024: sbi->s_log_zone_size = 10; break; 699 case 1024: sbi->s_log_zone_size = 10; break;
697 case 2048: sbi->s_log_zone_size = 11; break; 700 case 2048: sbi->s_log_zone_size = 11; break;
698 701
699 default: 702 default:
700 goto out_bad_zone_size; 703 goto out_bad_zone_size;
701 } 704 }
702 705
703 s->s_magic = ISOFS_SUPER_MAGIC; 706 s->s_magic = ISOFS_SUPER_MAGIC;
704 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */ 707 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */
705 708
706 /* The CDROM is read-only, has no nodes (devices) on it, and since 709 /*
707 all of the files appear to be owned by root, we really do not want 710 * The CDROM is read-only, has no nodes (devices) on it, and since
708 to allow suid. (suid or devices will not show up unless we have 711 * all of the files appear to be owned by root, we really do not want
709 Rock Ridge extensions) */ 712 * to allow suid. (suid or devices will not show up unless we have
713 * Rock Ridge extensions)
714 */
710 715
711 s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */; 716 s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */;
712 717
713 /* Set this for reference. Its not currently used except on write 718 /* Set this for reference. Its not currently used except on write
714 which we don't have .. */ 719 which we don't have .. */
715 720
716 first_data_zone = isonum_733 (rootp->extent) + 721 first_data_zone = isonum_733(rootp->extent) +
717 isonum_711 (rootp->ext_attr_length); 722 isonum_711(rootp->ext_attr_length);
718 sbi->s_firstdatazone = first_data_zone; 723 sbi->s_firstdatazone = first_data_zone;
719#ifndef BEQUIET 724#ifndef BEQUIET
720 printk(KERN_DEBUG "Max size:%ld Log zone size:%ld\n", 725 printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n",
721 sbi->s_max_size, 726 sbi->s_max_size, 1UL << sbi->s_log_zone_size);
722 1UL << sbi->s_log_zone_size); 727 printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone);
723 printk(KERN_DEBUG "First datazone:%ld\n", sbi->s_firstdatazone);
724 if(sbi->s_high_sierra) 728 if(sbi->s_high_sierra)
725 printk(KERN_DEBUG "Disc in High Sierra format.\n"); 729 printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n");
726#endif 730#endif
727 731
728 /* 732 /*
@@ -737,8 +741,8 @@ root_found:
737 pri = (struct iso_primary_descriptor *) sec; 741 pri = (struct iso_primary_descriptor *) sec;
738 rootp = (struct iso_directory_record *) 742 rootp = (struct iso_directory_record *)
739 pri->root_directory_record; 743 pri->root_directory_record;
740 first_data_zone = isonum_733 (rootp->extent) + 744 first_data_zone = isonum_733(rootp->extent) +
741 isonum_711 (rootp->ext_attr_length); 745 isonum_711(rootp->ext_attr_length);
742 } 746 }
743 747
744 /* 748 /*
@@ -771,7 +775,7 @@ root_found:
771 775
772#ifdef CONFIG_JOLIET 776#ifdef CONFIG_JOLIET
773 if (joliet_level && opt.utf8 == 0) { 777 if (joliet_level && opt.utf8 == 0) {
774 char * p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; 778 char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
775 sbi->s_nls_iocharset = load_nls(p); 779 sbi->s_nls_iocharset = load_nls(p);
776 if (! sbi->s_nls_iocharset) { 780 if (! sbi->s_nls_iocharset) {
777 /* Fail only if explicit charset specified */ 781 /* Fail only if explicit charset specified */
@@ -821,7 +825,7 @@ root_found:
821 sbi->s_rock = 0; 825 sbi->s_rock = 0;
822 if (sbi->s_firstdatazone != first_data_zone) { 826 if (sbi->s_firstdatazone != first_data_zone) {
823 sbi->s_firstdatazone = first_data_zone; 827 sbi->s_firstdatazone = first_data_zone;
824 printk(KERN_DEBUG 828 printk(KERN_DEBUG
825 "ISOFS: changing to secondary root\n"); 829 "ISOFS: changing to secondary root\n");
826 iput(inode); 830 iput(inode);
827 inode = isofs_iget(s, sbi->s_firstdatazone, 0); 831 inode = isofs_iget(s, sbi->s_firstdatazone, 0);
@@ -830,8 +834,10 @@ root_found:
830 834
831 if (opt.check == 'u') { 835 if (opt.check == 'u') {
832 /* Only Joliet is case insensitive by default */ 836 /* Only Joliet is case insensitive by default */
833 if (joliet_level) opt.check = 'r'; 837 if (joliet_level)
834 else opt.check = 's'; 838 opt.check = 'r';
839 else
840 opt.check = 's';
835 } 841 }
836 sbi->s_joliet_level = joliet_level; 842 sbi->s_joliet_level = joliet_level;
837 843
@@ -846,8 +852,10 @@ root_found:
846 goto out_no_root; 852 goto out_no_root;
847 853
848 table = 0; 854 table = 0;
849 if (joliet_level) table += 2; 855 if (joliet_level)
850 if (opt.check == 'r') table++; 856 table += 2;
857 if (opt.check == 'r')
858 table++;
851 s->s_root->d_op = &isofs_dentry_ops[table]; 859 s->s_root->d_op = &isofs_dentry_ops[table];
852 860
853 kfree(opt.iocharset); 861 kfree(opt.iocharset);
@@ -858,10 +866,10 @@ root_found:
858 * Display error messages and free resources. 866 * Display error messages and free resources.
859 */ 867 */
860out_bad_root: 868out_bad_root:
861 printk(KERN_WARNING "isofs_fill_super: root inode not initialized\n"); 869 printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
862 goto out_iput; 870 goto out_iput;
863out_no_root: 871out_no_root:
864 printk(KERN_WARNING "isofs_fill_super: get root inode failed\n"); 872 printk(KERN_WARNING "%s: get root inode failed\n", __func__);
865out_iput: 873out_iput:
866 iput(inode); 874 iput(inode);
867#ifdef CONFIG_JOLIET 875#ifdef CONFIG_JOLIET
@@ -870,21 +878,20 @@ out_iput:
870#endif 878#endif
871 goto out_freesbi; 879 goto out_freesbi;
872out_no_read: 880out_no_read:
873 printk(KERN_WARNING "isofs_fill_super: " 881 printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n",
874 "bread failed, dev=%s, iso_blknum=%d, block=%d\n", 882 __func__, s->s_id, iso_blknum, block);
875 s->s_id, iso_blknum, block);
876 goto out_freesbi; 883 goto out_freesbi;
877out_bad_zone_size: 884out_bad_zone_size:
878 printk(KERN_WARNING "Bad logical zone size %ld\n", 885 printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n",
879 sbi->s_log_zone_size); 886 sbi->s_log_zone_size);
880 goto out_freebh; 887 goto out_freebh;
881out_bad_size: 888out_bad_size:
882 printk(KERN_WARNING "Logical zone size(%d) < hardware blocksize(%u)\n", 889 printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n",
883 orig_zonesize, opt.blocksize); 890 orig_zonesize, opt.blocksize);
884 goto out_freebh; 891 goto out_freebh;
885out_unknown_format: 892out_unknown_format:
886 if (!silent) 893 if (!silent)
887 printk(KERN_WARNING "Unable to identify CD-ROM format.\n"); 894 printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n");
888 895
889out_freebh: 896out_freebh:
890 brelse(bh); 897 brelse(bh);
@@ -902,7 +909,7 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
902 buf->f_type = ISOFS_SUPER_MAGIC; 909 buf->f_type = ISOFS_SUPER_MAGIC;
903 buf->f_bsize = sb->s_blocksize; 910 buf->f_bsize = sb->s_blocksize;
904 buf->f_blocks = (ISOFS_SB(sb)->s_nzones 911 buf->f_blocks = (ISOFS_SB(sb)->s_nzones
905 << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); 912 << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits));
906 buf->f_bfree = 0; 913 buf->f_bfree = 0;
907 buf->f_bavail = 0; 914 buf->f_bavail = 0;
908 buf->f_files = ISOFS_SB(sb)->s_ninodes; 915 buf->f_files = ISOFS_SB(sb)->s_ninodes;
@@ -931,20 +938,20 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
931 938
932 rv = 0; 939 rv = 0;
933 if (iblock < 0 || iblock != iblock_s) { 940 if (iblock < 0 || iblock != iblock_s) {
934 printk("isofs_get_blocks: block number too large\n"); 941 printk(KERN_DEBUG "%s: block number too large\n", __func__);
935 goto abort; 942 goto abort;
936 } 943 }
937 944
938 b_off = iblock; 945 b_off = iblock;
939 946
940 offset = 0; 947 offset = 0;
941 firstext = ei->i_first_extent; 948 firstext = ei->i_first_extent;
942 sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); 949 sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode);
943 nextblk = ei->i_next_section_block; 950 nextblk = ei->i_next_section_block;
944 nextoff = ei->i_next_section_offset; 951 nextoff = ei->i_next_section_offset;
945 section = 0; 952 section = 0;
946 953
947 while ( nblocks ) { 954 while (nblocks) {
948 /* If we are *way* beyond the end of the file, print a message. 955 /* If we are *way* beyond the end of the file, print a message.
949 * Access beyond the end of the file up to the next page boundary 956 * Access beyond the end of the file up to the next page boundary
950 * is normal, however because of the way the page cache works. 957 * is normal, however because of the way the page cache works.
@@ -953,11 +960,11 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
953 * I/O errors. 960 * I/O errors.
954 */ 961 */
955 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { 962 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
956 printk("isofs_get_blocks: block >= EOF (%ld, %ld)\n", 963 printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n",
957 iblock, (unsigned long) inode->i_size); 964 __func__, iblock, (unsigned long) inode->i_size);
958 goto abort; 965 goto abort;
959 } 966 }
960 967
961 /* On the last section, nextblk == 0, section size is likely to 968 /* On the last section, nextblk == 0, section size is likely to
962 * exceed sect_size by a partial block, and access beyond the 969 * exceed sect_size by a partial block, and access beyond the
963 * end of the file will reach beyond the section size, too. 970 * end of the file will reach beyond the section size, too.
@@ -976,20 +983,21 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
976 iput(ninode); 983 iput(ninode);
977 984
978 if (++section > 100) { 985 if (++section > 100) {
979 printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n"); 986 printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
980 printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u " 987 " aborting...\n", __func__);
981 "nextblk=%lu nextoff=%lu\n", 988 printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u "
982 iblock, firstext, (unsigned) sect_size, 989 "nextblk=%lu nextoff=%lu\n", __func__,
983 nextblk, nextoff); 990 iblock, firstext, (unsigned) sect_size,
991 nextblk, nextoff);
984 goto abort; 992 goto abort;
985 } 993 }
986 } 994 }
987 995
988 if ( *bh ) { 996 if (*bh) {
989 map_bh(*bh, inode->i_sb, firstext + b_off - offset); 997 map_bh(*bh, inode->i_sb, firstext + b_off - offset);
990 } else { 998 } else {
991 *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); 999 *bh = sb_getblk(inode->i_sb, firstext+b_off-offset);
992 if ( !*bh ) 1000 if (!*bh)
993 goto abort; 1001 goto abort;
994 } 1002 }
995 bh++; /* Next buffer head */ 1003 bh++; /* Next buffer head */
@@ -1010,7 +1018,7 @@ static int isofs_get_block(struct inode *inode, sector_t iblock,
1010 struct buffer_head *bh_result, int create) 1018 struct buffer_head *bh_result, int create)
1011{ 1019{
1012 if (create) { 1020 if (create) {
1013 printk("isofs_get_block: Kernel tries to allocate a block\n"); 1021 printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__);
1014 return -EROFS; 1022 return -EROFS;
1015 } 1023 }
1016 1024
@@ -1070,11 +1078,11 @@ static int isofs_read_level3_size(struct inode *inode)
1070{ 1078{
1071 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 1079 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
1072 int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; 1080 int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra;
1073 struct buffer_head * bh = NULL; 1081 struct buffer_head *bh = NULL;
1074 unsigned long block, offset, block_saved, offset_saved; 1082 unsigned long block, offset, block_saved, offset_saved;
1075 int i = 0; 1083 int i = 0;
1076 int more_entries = 0; 1084 int more_entries = 0;
1077 struct iso_directory_record * tmpde = NULL; 1085 struct iso_directory_record *tmpde = NULL;
1078 struct iso_inode_info *ei = ISOFS_I(inode); 1086 struct iso_inode_info *ei = ISOFS_I(inode);
1079 1087
1080 inode->i_size = 0; 1088 inode->i_size = 0;
@@ -1089,7 +1097,7 @@ static int isofs_read_level3_size(struct inode *inode)
1089 offset = ei->i_iget5_offset; 1097 offset = ei->i_iget5_offset;
1090 1098
1091 do { 1099 do {
1092 struct iso_directory_record * de; 1100 struct iso_directory_record *de;
1093 unsigned int de_len; 1101 unsigned int de_len;
1094 1102
1095 if (!bh) { 1103 if (!bh) {
@@ -1163,10 +1171,9 @@ out_noread:
1163 return -EIO; 1171 return -EIO;
1164 1172
1165out_toomany: 1173out_toomany:
1166 printk(KERN_INFO "isofs_read_level3_size: " 1174 printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n"
1167 "More than 100 file sections ?!?, aborting...\n" 1175 "isofs_read_level3_size: inode=%lu\n",
1168 "isofs_read_level3_size: inode=%lu\n", 1176 __func__, inode->i_ino);
1169 inode->i_ino);
1170 goto out; 1177 goto out;
1171} 1178}
1172 1179
@@ -1177,9 +1184,9 @@ static void isofs_read_inode(struct inode *inode)
1177 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 1184 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
1178 unsigned long block; 1185 unsigned long block;
1179 int high_sierra = sbi->s_high_sierra; 1186 int high_sierra = sbi->s_high_sierra;
1180 struct buffer_head * bh = NULL; 1187 struct buffer_head *bh = NULL;
1181 struct iso_directory_record * de; 1188 struct iso_directory_record *de;
1182 struct iso_directory_record * tmpde = NULL; 1189 struct iso_directory_record *tmpde = NULL;
1183 unsigned int de_len; 1190 unsigned int de_len;
1184 unsigned long offset; 1191 unsigned long offset;
1185 struct iso_inode_info *ei = ISOFS_I(inode); 1192 struct iso_inode_info *ei = ISOFS_I(inode);
@@ -1199,7 +1206,7 @@ static void isofs_read_inode(struct inode *inode)
1199 1206
1200 tmpde = kmalloc(de_len, GFP_KERNEL); 1207 tmpde = kmalloc(de_len, GFP_KERNEL);
1201 if (tmpde == NULL) { 1208 if (tmpde == NULL) {
1202 printk(KERN_INFO "isofs_read_inode: out of memory\n"); 1209 printk(KERN_INFO "%s: out of memory\n", __func__);
1203 goto fail; 1210 goto fail;
1204 } 1211 }
1205 memcpy(tmpde, bh->b_data + offset, frag1); 1212 memcpy(tmpde, bh->b_data + offset, frag1);
@@ -1212,24 +1219,26 @@ static void isofs_read_inode(struct inode *inode)
1212 } 1219 }
1213 1220
1214 inode->i_ino = isofs_get_ino(ei->i_iget5_block, 1221 inode->i_ino = isofs_get_ino(ei->i_iget5_block,
1215 ei->i_iget5_offset, 1222 ei->i_iget5_offset,
1216 ISOFS_BUFFER_BITS(inode)); 1223 ISOFS_BUFFER_BITS(inode));
1217 1224
1218 /* Assume it is a normal-format file unless told otherwise */ 1225 /* Assume it is a normal-format file unless told otherwise */
1219 ei->i_file_format = isofs_file_normal; 1226 ei->i_file_format = isofs_file_normal;
1220 1227
1221 if (de->flags[-high_sierra] & 2) { 1228 if (de->flags[-high_sierra] & 2) {
1222 inode->i_mode = S_IRUGO | S_IXUGO | S_IFDIR; 1229 inode->i_mode = S_IRUGO | S_IXUGO | S_IFDIR;
1223 inode->i_nlink = 1; /* Set to 1. We know there are 2, but 1230 inode->i_nlink = 1; /*
1224 the find utility tries to optimize 1231 * Set to 1. We know there are 2, but
1225 if it is 2, and it screws up. It is 1232 * the find utility tries to optimize
1226 easier to give 1 which tells find to 1233 * if it is 2, and it screws up. It is
1227 do it the hard way. */ 1234 * easier to give 1 which tells find to
1235 * do it the hard way.
1236 */
1228 } else { 1237 } else {
1229 /* Everybody gets to read the file. */ 1238 /* Everybody gets to read the file. */
1230 inode->i_mode = sbi->s_mode; 1239 inode->i_mode = sbi->s_mode;
1231 inode->i_nlink = 1; 1240 inode->i_nlink = 1;
1232 inode->i_mode |= S_IFREG; 1241 inode->i_mode |= S_IFREG;
1233 } 1242 }
1234 inode->i_uid = sbi->s_uid; 1243 inode->i_uid = sbi->s_uid;
1235 inode->i_gid = sbi->s_gid; 1244 inode->i_gid = sbi->s_gid;
@@ -1239,13 +1248,14 @@ static void isofs_read_inode(struct inode *inode)
1239 ei->i_format_parm[1] = 0; 1248 ei->i_format_parm[1] = 0;
1240 ei->i_format_parm[2] = 0; 1249 ei->i_format_parm[2] = 0;
1241 1250
1242 ei->i_section_size = isonum_733 (de->size); 1251 ei->i_section_size = isonum_733(de->size);
1243 if (de->flags[-high_sierra] & 0x80) { 1252 if (de->flags[-high_sierra] & 0x80) {
1244 if(isofs_read_level3_size(inode)) goto fail; 1253 if(isofs_read_level3_size(inode))
1254 goto fail;
1245 } else { 1255 } else {
1246 ei->i_next_section_block = 0; 1256 ei->i_next_section_block = 0;
1247 ei->i_next_section_offset = 0; 1257 ei->i_next_section_offset = 0;
1248 inode->i_size = isonum_733 (de->size); 1258 inode->i_size = isonum_733(de->size);
1249 } 1259 }
1250 1260
1251 /* 1261 /*
@@ -1258,23 +1268,24 @@ static void isofs_read_inode(struct inode *inode)
1258 inode->i_size &= 0x00ffffff; 1268 inode->i_size &= 0x00ffffff;
1259 1269
1260 if (de->interleave[0]) { 1270 if (de->interleave[0]) {
1261 printk("Interleaved files not (yet) supported.\n"); 1271 printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n");
1262 inode->i_size = 0; 1272 inode->i_size = 0;
1263 } 1273 }
1264 1274
1265 /* I have no idea what file_unit_size is used for, so 1275 /* I have no idea what file_unit_size is used for, so
1266 we will flag it for now */ 1276 we will flag it for now */
1267 if (de->file_unit_size[0] != 0) { 1277 if (de->file_unit_size[0] != 0) {
1268 printk("File unit size != 0 for ISO file (%ld).\n", 1278 printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n",
1269 inode->i_ino); 1279 inode->i_ino);
1270 } 1280 }
1271 1281
1272 /* I have no idea what other flag bits are used for, so 1282 /* I have no idea what other flag bits are used for, so
1273 we will flag it for now */ 1283 we will flag it for now */
1274#ifdef DEBUG 1284#ifdef DEBUG
1275 if((de->flags[-high_sierra] & ~2)!= 0){ 1285 if((de->flags[-high_sierra] & ~2)!= 0){
1276 printk("Unusual flag settings for ISO file (%ld %x).\n", 1286 printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file "
1277 inode->i_ino, de->flags[-high_sierra]); 1287 "(%ld %x).\n",
1288 inode->i_ino, de->flags[-high_sierra]);
1278 } 1289 }
1279#endif 1290#endif
1280 1291
@@ -1285,11 +1296,11 @@ static void isofs_read_inode(struct inode *inode)
1285 inode->i_atime.tv_nsec = 1296 inode->i_atime.tv_nsec =
1286 inode->i_ctime.tv_nsec = 0; 1297 inode->i_ctime.tv_nsec = 0;
1287 1298
1288 ei->i_first_extent = (isonum_733 (de->extent) + 1299 ei->i_first_extent = (isonum_733(de->extent) +
1289 isonum_711 (de->ext_attr_length)); 1300 isonum_711(de->ext_attr_length));
1290 1301
1291 /* Set the number of blocks for stat() - should be done before RR */ 1302 /* Set the number of blocks for stat() - should be done before RR */
1292 inode->i_blocks = (inode->i_size + 511) >> 9; 1303 inode->i_blocks = (inode->i_size + 511) >> 9;
1293 1304
1294 /* 1305 /*
1295 * Now test for possible Rock Ridge extensions which will override 1306 * Now test for possible Rock Ridge extensions which will override
@@ -1306,7 +1317,7 @@ static void isofs_read_inode(struct inode *inode)
1306 /* Install the inode operations vector */ 1317 /* Install the inode operations vector */
1307 if (S_ISREG(inode->i_mode)) { 1318 if (S_ISREG(inode->i_mode)) {
1308 inode->i_fop = &generic_ro_fops; 1319 inode->i_fop = &generic_ro_fops;
1309 switch ( ei->i_file_format ) { 1320 switch (ei->i_file_format) {
1310#ifdef CONFIG_ZISOFS 1321#ifdef CONFIG_ZISOFS
1311 case isofs_file_compressed: 1322 case isofs_file_compressed:
1312 inode->i_data.a_ops = &zisofs_aops; 1323 inode->i_data.a_ops = &zisofs_aops;
@@ -1350,7 +1361,7 @@ static int isofs_iget5_test(struct inode *ino, void *data)
1350 struct isofs_iget5_callback_data *d = 1361 struct isofs_iget5_callback_data *d =
1351 (struct isofs_iget5_callback_data*)data; 1362 (struct isofs_iget5_callback_data*)data;
1352 return (i->i_iget5_block == d->block) 1363 return (i->i_iget5_block == d->block)
1353 && (i->i_iget5_offset == d->offset); 1364 && (i->i_iget5_offset == d->offset);
1354} 1365}
1355 1366
1356static int isofs_iget5_set(struct inode *ino, void *data) 1367static int isofs_iget5_set(struct inode *ino, void *data)
@@ -1384,7 +1395,7 @@ struct inode *isofs_iget(struct super_block *sb,
1384 hashval = (block << sb->s_blocksize_bits) | offset; 1395 hashval = (block << sb->s_blocksize_bits) | offset;
1385 1396
1386 inode = iget5_locked(sb, hashval, &isofs_iget5_test, 1397 inode = iget5_locked(sb, hashval, &isofs_iget5_test,
1387 &isofs_iget5_set, &data); 1398 &isofs_iget5_set, &data);
1388 1399
1389 if (inode && (inode->i_state & I_NEW)) { 1400 if (inode && (inode->i_state & I_NEW)) {
1390 sb->s_op->read_inode(inode); 1401 sb->s_op->read_inode(inode);
@@ -1398,7 +1409,7 @@ static int isofs_get_sb(struct file_system_type *fs_type,
1398 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1409 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1399{ 1410{
1400 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, 1411 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
1401 mnt); 1412 mnt);
1402} 1413}
1403 1414
1404static struct file_system_type iso9660_fs_type = { 1415static struct file_system_type iso9660_fs_type = {
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index efe2872cd4..a07e67b1ea 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -1,5 +1,6 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/buffer_head.h> 2#include <linux/buffer_head.h>
3#include <linux/exportfs.h>
3#include <linux/iso_fs.h> 4#include <linux/iso_fs.h>
4#include <asm/unaligned.h> 5#include <asm/unaligned.h>
5 6
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index fb8fe7a9dd..92c14b850e 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -80,22 +80,20 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
80 80
81 if (utf8) { 81 if (utf8) {
82 len = wcsntombs_be(outname, de->name, 82 len = wcsntombs_be(outname, de->name,
83 de->name_len[0] >> 1, PAGE_SIZE); 83 de->name_len[0] >> 1, PAGE_SIZE);
84 } else { 84 } else {
85 len = uni16_to_x8(outname, (__be16 *) de->name, 85 len = uni16_to_x8(outname, (__be16 *) de->name,
86 de->name_len[0] >> 1, nls); 86 de->name_len[0] >> 1, nls);
87 } 87 }
88 if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) { 88 if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1'))
89 len -= 2; 89 len -= 2;
90 }
91 90
92 /* 91 /*
93 * Windows doesn't like periods at the end of a name, 92 * Windows doesn't like periods at the end of a name,
94 * so neither do we 93 * so neither do we
95 */ 94 */
96 while (len >= 2 && (outname[len-1] == '.')) { 95 while (len >= 2 && (outname[len-1] == '.'))
97 len--; 96 len--;
98 }
99 97
100 return len; 98 return len;
101} 99}
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c04b3a14a3..c8c7e5138a 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -15,7 +15,7 @@
15 * some sanity tests. 15 * some sanity tests.
16 */ 16 */
17static int 17static int
18isofs_cmp(struct dentry * dentry, const char * compare, int dlen) 18isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
19{ 19{
20 struct qstr qstr; 20 struct qstr qstr;
21 21
@@ -48,24 +48,24 @@ isofs_cmp(struct dentry * dentry, const char * compare, int dlen)
48 */ 48 */
49static unsigned long 49static unsigned long
50isofs_find_entry(struct inode *dir, struct dentry *dentry, 50isofs_find_entry(struct inode *dir, struct dentry *dentry,
51 unsigned long *block_rv, unsigned long* offset_rv, 51 unsigned long *block_rv, unsigned long *offset_rv,
52 char * tmpname, struct iso_directory_record * tmpde) 52 char *tmpname, struct iso_directory_record *tmpde)
53{ 53{
54 unsigned long bufsize = ISOFS_BUFFER_SIZE(dir); 54 unsigned long bufsize = ISOFS_BUFFER_SIZE(dir);
55 unsigned char bufbits = ISOFS_BUFFER_BITS(dir); 55 unsigned char bufbits = ISOFS_BUFFER_BITS(dir);
56 unsigned long block, f_pos, offset, block_saved, offset_saved; 56 unsigned long block, f_pos, offset, block_saved, offset_saved;
57 struct buffer_head * bh = NULL; 57 struct buffer_head *bh = NULL;
58 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); 58 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
59 59
60 if (!ISOFS_I(dir)->i_first_extent) 60 if (!ISOFS_I(dir)->i_first_extent)
61 return 0; 61 return 0;
62 62
63 f_pos = 0; 63 f_pos = 0;
64 offset = 0; 64 offset = 0;
65 block = 0; 65 block = 0;
66 66
67 while (f_pos < dir->i_size) { 67 while (f_pos < dir->i_size) {
68 struct iso_directory_record * de; 68 struct iso_directory_record *de;
69 int de_len, match, i, dlen; 69 int de_len, match, i, dlen;
70 char *dpnt; 70 char *dpnt;
71 71
@@ -114,7 +114,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
114 114
115 if (sbi->s_rock && 115 if (sbi->s_rock &&
116 ((i = get_rock_ridge_filename(de, tmpname, dir)))) { 116 ((i = get_rock_ridge_filename(de, tmpname, dir)))) {
117 dlen = i; /* possibly -1 */ 117 dlen = i; /* possibly -1 */
118 dpnt = tmpname; 118 dpnt = tmpname;
119#ifdef CONFIG_JOLIET 119#ifdef CONFIG_JOLIET
120 } else if (sbi->s_joliet_level) { 120 } else if (sbi->s_joliet_level) {
@@ -145,8 +145,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
145 isofs_normalize_block_and_offset(de, 145 isofs_normalize_block_and_offset(de,
146 &block_saved, 146 &block_saved,
147 &offset_saved); 147 &offset_saved);
148 *block_rv = block_saved; 148 *block_rv = block_saved;
149 *offset_rv = offset_saved; 149 *offset_rv = offset_saved;
150 brelse(bh); 150 brelse(bh);
151 return 1; 151 return 1;
152 } 152 }
@@ -155,7 +155,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
155 return 0; 155 return 0;
156} 156}
157 157
158struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) 158struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
159{ 159{
160 int found; 160 int found;
161 unsigned long block, offset; 161 unsigned long block, offset;
@@ -170,9 +170,9 @@ struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct n
170 170
171 lock_kernel(); 171 lock_kernel();
172 found = isofs_find_entry(dir, dentry, 172 found = isofs_find_entry(dir, dentry,
173 &block, &offset, 173 &block, &offset,
174 page_address(page), 174 page_address(page),
175 1024 + page_address(page)); 175 1024 + page_address(page));
176 __free_page(page); 176 __free_page(page);
177 177
178 inode = NULL; 178 inode = NULL;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 1facfaff97..a003d50edc 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -887,7 +887,8 @@ restart_loop:
887 journal->j_committing_transaction = NULL; 887 journal->j_committing_transaction = NULL;
888 spin_unlock(&journal->j_state_lock); 888 spin_unlock(&journal->j_state_lock);
889 889
890 if (commit_transaction->t_checkpoint_list == NULL) { 890 if (commit_transaction->t_checkpoint_list == NULL &&
891 commit_transaction->t_checkpoint_io_list == NULL) {
891 __journal_drop_transaction(journal, commit_transaction); 892 __journal_drop_transaction(journal, commit_transaction);
892 } else { 893 } else {
893 if (journal->j_checkpoint_transactions == NULL) { 894 if (journal->j_checkpoint_transactions == NULL) {
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 824e3b7d4e..8db2fa2517 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -68,6 +68,7 @@
68#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/init.h> 69#include <linux/init.h>
70#endif 70#endif
71#include <linux/log2.h>
71 72
72static struct kmem_cache *revoke_record_cache; 73static struct kmem_cache *revoke_record_cache;
73static struct kmem_cache *revoke_table_cache; 74static struct kmem_cache *revoke_table_cache;
@@ -211,7 +212,7 @@ int journal_init_revoke(journal_t *journal, int hash_size)
211 journal->j_revoke = journal->j_revoke_table[0]; 212 journal->j_revoke = journal->j_revoke_table[0];
212 213
213 /* Check that the hash_size is a power of two */ 214 /* Check that the hash_size is a power of two */
214 J_ASSERT ((hash_size & (hash_size-1)) == 0); 215 J_ASSERT(is_power_of_2(hash_size));
215 216
216 journal->j_revoke->hash_size = hash_size; 217 journal->j_revoke->hash_size = hash_size;
217 218
@@ -238,7 +239,7 @@ int journal_init_revoke(journal_t *journal, int hash_size)
238 journal->j_revoke = journal->j_revoke_table[1]; 239 journal->j_revoke = journal->j_revoke_table[1];
239 240
240 /* Check that the hash_size is a power of two */ 241 /* Check that the hash_size is a power of two */
241 J_ASSERT ((hash_size & (hash_size-1)) == 0); 242 J_ASSERT(is_power_of_2(hash_size));
242 243
243 journal->j_revoke->hash_size = hash_size; 244 journal->j_revoke->hash_size = hash_size;
244 245
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2856e1100a..c0f59d1b13 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -896,7 +896,8 @@ restart_loop:
896 journal->j_committing_transaction = NULL; 896 journal->j_committing_transaction = NULL;
897 spin_unlock(&journal->j_state_lock); 897 spin_unlock(&journal->j_state_lock);
898 898
899 if (commit_transaction->t_checkpoint_list == NULL) { 899 if (commit_transaction->t_checkpoint_list == NULL &&
900 commit_transaction->t_checkpoint_io_list == NULL) {
900 __jbd2_journal_drop_transaction(journal, commit_transaction); 901 __jbd2_journal_drop_transaction(journal, commit_transaction);
901 } else { 902 } else {
902 if (journal->j_checkpoint_transactions == NULL) { 903 if (journal->j_checkpoint_transactions == NULL) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 78d63b818f..f290cb7cb8 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -35,6 +35,7 @@
35#include <linux/kthread.h> 35#include <linux/kthread.h>
36#include <linux/poison.h> 36#include <linux/poison.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40#include <asm/page.h> 41#include <asm/page.h>
@@ -528,7 +529,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
528{ 529{
529 int err = 0; 530 int err = 0;
530 531
531#ifdef CONFIG_JBD_DEBUG 532#ifdef CONFIG_JBD2_DEBUG
532 spin_lock(&journal->j_state_lock); 533 spin_lock(&journal->j_state_lock);
533 if (!tid_geq(journal->j_commit_request, tid)) { 534 if (!tid_geq(journal->j_commit_request, tid)) {
534 printk(KERN_EMERG 535 printk(KERN_EMERG
@@ -1709,7 +1710,7 @@ void jbd2_slab_free(void *ptr, size_t size)
1709 * Journal_head storage management 1710 * Journal_head storage management
1710 */ 1711 */
1711static struct kmem_cache *jbd2_journal_head_cache; 1712static struct kmem_cache *jbd2_journal_head_cache;
1712#ifdef CONFIG_JBD_DEBUG 1713#ifdef CONFIG_JBD2_DEBUG
1713static atomic_t nr_journal_heads = ATOMIC_INIT(0); 1714static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1714#endif 1715#endif
1715 1716
@@ -1747,7 +1748,7 @@ static struct journal_head *journal_alloc_journal_head(void)
1747 struct journal_head *ret; 1748 struct journal_head *ret;
1748 static unsigned long last_warning; 1749 static unsigned long last_warning;
1749 1750
1750#ifdef CONFIG_JBD_DEBUG 1751#ifdef CONFIG_JBD2_DEBUG
1751 atomic_inc(&nr_journal_heads); 1752 atomic_inc(&nr_journal_heads);
1752#endif 1753#endif
1753 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 1754 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
@@ -1768,7 +1769,7 @@ static struct journal_head *journal_alloc_journal_head(void)
1768 1769
1769static void journal_free_journal_head(struct journal_head *jh) 1770static void journal_free_journal_head(struct journal_head *jh)
1770{ 1771{
1771#ifdef CONFIG_JBD_DEBUG 1772#ifdef CONFIG_JBD2_DEBUG
1772 atomic_dec(&nr_journal_heads); 1773 atomic_dec(&nr_journal_heads);
1773 memset(jh, JBD_POISON_FREE, sizeof(*jh)); 1774 memset(jh, JBD_POISON_FREE, sizeof(*jh));
1774#endif 1775#endif
@@ -1951,64 +1952,50 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
1951} 1952}
1952 1953
1953/* 1954/*
1954 * /proc tunables 1955 * debugfs tunables
1955 */ 1956 */
1956#if defined(CONFIG_JBD_DEBUG) 1957#if defined(CONFIG_JBD2_DEBUG)
1957int jbd2_journal_enable_debug; 1958u8 jbd2_journal_enable_debug;
1958EXPORT_SYMBOL(jbd2_journal_enable_debug); 1959EXPORT_SYMBOL(jbd2_journal_enable_debug);
1959#endif 1960#endif
1960 1961
1961#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) 1962#if defined(CONFIG_JBD2_DEBUG) && defined(CONFIG_DEBUG_FS)
1962 1963
1963static struct proc_dir_entry *proc_jbd_debug; 1964#define JBD2_DEBUG_NAME "jbd2-debug"
1964 1965
1965static int read_jbd_debug(char *page, char **start, off_t off, 1966struct dentry *jbd2_debugfs_dir, *jbd2_debug;
1966 int count, int *eof, void *data)
1967{
1968 int ret;
1969 1967
1970 ret = sprintf(page + off, "%d\n", jbd2_journal_enable_debug); 1968static void __init jbd2_create_debugfs_entry(void)
1971 *eof = 1; 1969{
1972 return ret; 1970 jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL);
1971 if (jbd2_debugfs_dir)
1972 jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO,
1973 jbd2_debugfs_dir,
1974 &jbd2_journal_enable_debug);
1973} 1975}
1974 1976
1975static int write_jbd_debug(struct file *file, const char __user *buffer, 1977static void __exit jbd2_remove_debugfs_entry(void)
1976 unsigned long count, void *data)
1977{ 1978{
1978 char buf[32]; 1979 if (jbd2_debug)
1979 1980 debugfs_remove(jbd2_debug);
1980 if (count > ARRAY_SIZE(buf) - 1) 1981 if (jbd2_debugfs_dir)
1981 count = ARRAY_SIZE(buf) - 1; 1982 debugfs_remove(jbd2_debugfs_dir);
1982 if (copy_from_user(buf, buffer, count))
1983 return -EFAULT;
1984 buf[ARRAY_SIZE(buf) - 1] = '\0';
1985 jbd2_journal_enable_debug = simple_strtoul(buf, NULL, 10);
1986 return count;
1987} 1983}
1988 1984
1989#define JBD_PROC_NAME "sys/fs/jbd2-debug" 1985#else
1990 1986
1991static void __init create_jbd_proc_entry(void) 1987static void __init jbd2_create_debugfs_entry(void)
1992{ 1988{
1993 proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); 1989 do {
1994 if (proc_jbd_debug) { 1990 } while (0);
1995 /* Why is this so hard? */
1996 proc_jbd_debug->read_proc = read_jbd_debug;
1997 proc_jbd_debug->write_proc = write_jbd_debug;
1998 }
1999} 1991}
2000 1992
2001static void __exit jbd2_remove_jbd_proc_entry(void) 1993static void __exit jbd2_remove_debugfs_entry(void)
2002{ 1994{
2003 if (proc_jbd_debug) 1995 do {
2004 remove_proc_entry(JBD_PROC_NAME, NULL); 1996 } while (0);
2005} 1997}
2006 1998
2007#else
2008
2009#define create_jbd_proc_entry() do {} while (0)
2010#define jbd2_remove_jbd_proc_entry() do {} while (0)
2011
2012#endif 1999#endif
2013 2000
2014struct kmem_cache *jbd2_handle_cache; 2001struct kmem_cache *jbd2_handle_cache;
@@ -2067,18 +2054,18 @@ static int __init journal_init(void)
2067 ret = journal_init_caches(); 2054 ret = journal_init_caches();
2068 if (ret != 0) 2055 if (ret != 0)
2069 jbd2_journal_destroy_caches(); 2056 jbd2_journal_destroy_caches();
2070 create_jbd_proc_entry(); 2057 jbd2_create_debugfs_entry();
2071 return ret; 2058 return ret;
2072} 2059}
2073 2060
2074static void __exit journal_exit(void) 2061static void __exit journal_exit(void)
2075{ 2062{
2076#ifdef CONFIG_JBD_DEBUG 2063#ifdef CONFIG_JBD2_DEBUG
2077 int n = atomic_read(&nr_journal_heads); 2064 int n = atomic_read(&nr_journal_heads);
2078 if (n) 2065 if (n)
2079 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2066 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
2080#endif 2067#endif
2081 jbd2_remove_jbd_proc_entry(); 2068 jbd2_remove_debugfs_entry();
2082 jbd2_journal_destroy_caches(); 2069 jbd2_journal_destroy_caches();
2083} 2070}
2084 2071
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 395c92a04a..e7730a045b 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -295,7 +295,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
295 printk(KERN_ERR "JBD: error %d scanning journal\n", err); 295 printk(KERN_ERR "JBD: error %d scanning journal\n", err);
296 ++journal->j_transaction_sequence; 296 ++journal->j_transaction_sequence;
297 } else { 297 } else {
298#ifdef CONFIG_JBD_DEBUG 298#ifdef CONFIG_JBD2_DEBUG
299 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 299 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
300#endif 300#endif
301 jbd_debug(0, 301 jbd_debug(0,
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 9246e763da..28cac049a5 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -68,6 +68,7 @@
68#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/init.h> 69#include <linux/init.h>
70#endif 70#endif
71#include <linux/log2.h>
71 72
72static struct kmem_cache *jbd2_revoke_record_cache; 73static struct kmem_cache *jbd2_revoke_record_cache;
73static struct kmem_cache *jbd2_revoke_table_cache; 74static struct kmem_cache *jbd2_revoke_table_cache;
@@ -212,7 +213,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
212 journal->j_revoke = journal->j_revoke_table[0]; 213 journal->j_revoke = journal->j_revoke_table[0];
213 214
214 /* Check that the hash_size is a power of two */ 215 /* Check that the hash_size is a power of two */
215 J_ASSERT ((hash_size & (hash_size-1)) == 0); 216 J_ASSERT(is_power_of_2(hash_size));
216 217
217 journal->j_revoke->hash_size = hash_size; 218 journal->j_revoke->hash_size = hash_size;
218 219
@@ -239,7 +240,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
239 journal->j_revoke = journal->j_revoke_table[1]; 240 journal->j_revoke = journal->j_revoke_table[1];
240 241
241 /* Check that the hash_size is a power of two */ 242 /* Check that the hash_size is a power of two */
242 J_ASSERT ((hash_size & (hash_size-1)) == 0); 243 J_ASSERT(is_power_of_2(hash_size));
243 244
244 journal->j_revoke->hash_size = hash_size; 245 journal->j_revoke->hash_size = hash_size;
245 246
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index a46101ee86..65b3a1b5b8 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -435,7 +435,7 @@ static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value,
435 struct posix_acl *acl; 435 struct posix_acl *acl;
436 int rc; 436 int rc;
437 437
438 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 438 if (!is_owner_or_cap(inode))
439 return -EPERM; 439 return -EPERM;
440 440
441 if (value) { 441 if (value) {
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 0c82dfcfd2..143c5530ca 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -81,6 +81,7 @@ static int jffs2_garbage_collect_thread(void *_c)
81 81
82 set_user_nice(current, 10); 82 set_user_nice(current, 10);
83 83
84 set_freezable();
84 for (;;) { 85 for (;;) {
85 allow_signal(SIGHUP); 86 allow_signal(SIGHUP);
86 87
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 99871279a1..c2530197be 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -47,7 +47,7 @@ const struct file_operations jffs2_file_operations =
47 .ioctl = jffs2_ioctl, 47 .ioctl = jffs2_ioctl,
48 .mmap = generic_file_readonly_mmap, 48 .mmap = generic_file_readonly_mmap,
49 .fsync = jffs2_fsync, 49 .fsync = jffs2_fsync,
50 .sendfile = generic_file_sendfile 50 .splice_read = generic_file_splice_read,
51}; 51};
52 52
53/* jffs2_file_inode_operations */ 53/* jffs2_file_inode_operations */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 12e83f67ee..7b363786c2 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -210,8 +210,7 @@ static void jffs2_kill_tn(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *
210 * offset, and the one with the smallest length will come first in the 210 * offset, and the one with the smallest length will come first in the
211 * ordering. 211 * ordering.
212 * 212 *
213 * Returns 0 if the node was inserted 213 * Returns 0 if the node was handled (including marking it obsolete)
214 * 1 if the node is obsolete (because we can't mark it so yet)
215 * < 0 an if error occurred 214 * < 0 an if error occurred
216 */ 215 */
217static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, 216static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
@@ -572,8 +571,7 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r
572 * Helper function for jffs2_get_inode_nodes(). 571 * Helper function for jffs2_get_inode_nodes().
573 * It is called every time an directory entry node is found. 572 * It is called every time an directory entry node is found.
574 * 573 *
575 * Returns: 0 on succes; 574 * Returns: 0 on success;
576 * 1 if the node should be marked obsolete;
577 * negative error code on failure. 575 * negative error code on failure.
578 */ 576 */
579static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 577static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
@@ -680,8 +678,7 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
680 * Helper function for jffs2_get_inode_nodes(). 678 * Helper function for jffs2_get_inode_nodes().
681 * It is called every time an inode node is found. 679 * It is called every time an inode node is found.
682 * 680 *
683 * Returns: 0 on success; 681 * Returns: 0 on success (possibly after marking a bad node obsolete);
684 * 1 if the node should be marked obsolete;
685 * negative error code on failure. 682 * negative error code on failure.
686 */ 683 */
687static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 684static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
@@ -690,7 +687,7 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
690{ 687{
691 struct jffs2_tmp_dnode_info *tn; 688 struct jffs2_tmp_dnode_info *tn;
692 uint32_t len, csize; 689 uint32_t len, csize;
693 int ret = 1; 690 int ret = 0;
694 uint32_t crc; 691 uint32_t crc;
695 692
696 /* Obsoleted. This cannot happen, surely? dwmw2 20020308 */ 693 /* Obsoleted. This cannot happen, surely? dwmw2 20020308 */
@@ -719,8 +716,9 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
719 /* Sanity checks */ 716 /* Sanity checks */
720 if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) || 717 if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) ||
721 unlikely(PAD(je32_to_cpu(rd->csize) + sizeof(*rd)) != PAD(je32_to_cpu(rd->totlen)))) { 718 unlikely(PAD(je32_to_cpu(rd->csize) + sizeof(*rd)) != PAD(je32_to_cpu(rd->totlen)))) {
722 JFFS2_WARNING("inode node header CRC is corrupted at %#08x\n", ref_offset(ref)); 719 JFFS2_WARNING("inode node header CRC is corrupted at %#08x\n", ref_offset(ref));
723 jffs2_dbg_dump_node(c, ref_offset(ref)); 720 jffs2_dbg_dump_node(c, ref_offset(ref));
721 jffs2_mark_node_obsolete(c, ref);
724 goto free_out; 722 goto free_out;
725 } 723 }
726 724
@@ -775,6 +773,7 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
775 if (len >= csize && unlikely(tn->partial_crc != je32_to_cpu(rd->data_crc))) { 773 if (len >= csize && unlikely(tn->partial_crc != je32_to_cpu(rd->data_crc))) {
776 JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n", 774 JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n",
777 ref_offset(ref), tn->partial_crc, je32_to_cpu(rd->data_crc)); 775 ref_offset(ref), tn->partial_crc, je32_to_cpu(rd->data_crc));
776 jffs2_mark_node_obsolete(c, ref);
778 goto free_out; 777 goto free_out;
779 } 778 }
780 779
@@ -854,7 +853,6 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
854 * It is called every time an unknown node is found. 853 * It is called every time an unknown node is found.
855 * 854 *
856 * Returns: 0 on success; 855 * Returns: 0 on success;
857 * 1 if the node should be marked obsolete;
858 * negative error code on failure. 856 * negative error code on failure.
859 */ 857 */
860static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, struct jffs2_unknown_node *un) 858static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, struct jffs2_unknown_node *un)
@@ -1088,10 +1086,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
1088 } 1086 }
1089 1087
1090 err = read_unknown(c, ref, &node->u); 1088 err = read_unknown(c, ref, &node->u);
1091 if (err == 1) { 1089 if (unlikely(err))
1092 jffs2_mark_node_obsolete(c, ref);
1093 break;
1094 } else if (unlikely(err))
1095 goto free_out; 1090 goto free_out;
1096 1091
1097 } 1092 }
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
index 79494c4f2b..fa92f7f1d0 100644
--- a/fs/jfs/endian24.h
+++ b/fs/jfs/endian24.h
@@ -29,7 +29,7 @@
29 __u32 __x = (x); \ 29 __u32 __x = (x); \
30 ((__u32)( \ 30 ((__u32)( \
31 ((__x & (__u32)0x000000ffUL) << 16) | \ 31 ((__x & (__u32)0x000000ffUL) << 16) | \
32 (__x & (__u32)0x0000ff00UL) | \ 32 (__x & (__u32)0x0000ff00UL) | \
33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \ 33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \
34}) 34})
35 35
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index f7f8eff19b..87eb93694a 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -108,7 +108,6 @@ const struct file_operations jfs_file_operations = {
108 .aio_read = generic_file_aio_read, 108 .aio_read = generic_file_aio_read,
109 .aio_write = generic_file_aio_write, 109 .aio_write = generic_file_aio_write,
110 .mmap = generic_file_mmap, 110 .mmap = generic_file_mmap,
111 .sendfile = generic_file_sendfile,
112 .splice_read = generic_file_splice_read, 111 .splice_read = generic_file_splice_read,
113 .splice_write = generic_file_splice_write, 112 .splice_write = generic_file_splice_write,
114 .fsync = jfs_fsync, 113 .fsync = jfs_fsync,
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index fe063af6fd..3c8663bea9 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -69,7 +69,7 @@ int jfs_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
69 if (IS_RDONLY(inode)) 69 if (IS_RDONLY(inode))
70 return -EROFS; 70 return -EROFS;
71 71
72 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 72 if (!is_owner_or_cap(inode))
73 return -EACCES; 73 return -EACCES;
74 74
75 if (get_user(flags, (int __user *) arg)) 75 if (get_user(flags, (int __user *) arg))
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index 9c5d59632a..887f5759e5 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -26,34 +26,6 @@
26#include "jfs_filsys.h" 26#include "jfs_filsys.h"
27#include "jfs_debug.h" 27#include "jfs_debug.h"
28 28
29#ifdef CONFIG_JFS_DEBUG
30void dump_mem(char *label, void *data, int length)
31{
32 int i, j;
33 int *intptr = data;
34 char *charptr = data;
35 char buf[10], line[80];
36
37 printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length,
38 data);
39 for (i = 0; i < length; i += 16) {
40 line[0] = 0;
41 for (j = 0; (j < 4) && (i + j * 4 < length); j++) {
42 sprintf(buf, " %08x", intptr[i / 4 + j]);
43 strcat(line, buf);
44 }
45 buf[0] = ' ';
46 buf[2] = 0;
47 for (j = 0; (j < 16) && (i + j < length); j++) {
48 buf[1] =
49 isprint(charptr[i + j]) ? charptr[i + j] : '.';
50 strcat(line, buf);
51 }
52 printk("%s\n", line);
53 }
54}
55#endif
56
57#ifdef PROC_FS_JFS /* see jfs_debug.h */ 29#ifdef PROC_FS_JFS /* see jfs_debug.h */
58 30
59static struct proc_dir_entry *base; 31static struct proc_dir_entry *base;
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 7378798f0b..044c1e654c 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,6 @@ extern void jfs_proc_clean(void);
62 62
63extern int jfsloglevel; 63extern int jfsloglevel;
64 64
65extern void dump_mem(char *label, void *data, int length);
66extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); 65extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
67 66
68/* information message: e.g., configuration, major event */ 67/* information message: e.g., configuration, major event */
@@ -94,7 +93,6 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
94 * --------- 93 * ---------
95 */ 94 */
96#else /* CONFIG_JFS_DEBUG */ 95#else /* CONFIG_JFS_DEBUG */
97#define dump_mem(label,data,length) do {} while (0)
98#define ASSERT(p) do {} while (0) 96#define ASSERT(p) do {} while (0)
99#define jfs_info(fmt, arg...) do {} while (0) 97#define jfs_info(fmt, arg...) do {} while (0)
100#define jfs_debug(fmt, arg...) do {} while (0) 98#define jfs_debug(fmt, arg...) do {} while (0)
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index 40b2011138..c387540d34 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -19,23 +19,23 @@
19#define _H_JFS_DINODE 19#define _H_JFS_DINODE
20 20
21/* 21/*
22 * jfs_dinode.h: on-disk inode manager 22 * jfs_dinode.h: on-disk inode manager
23 */ 23 */
24 24
25#define INODESLOTSIZE 128 25#define INODESLOTSIZE 128
26#define L2INODESLOTSIZE 7 26#define L2INODESLOTSIZE 7
27#define log2INODESIZE 9 /* log2(bytes per dinode) */ 27#define log2INODESIZE 9 /* log2(bytes per dinode) */
28 28
29 29
30/* 30/*
31 * on-disk inode : 512 bytes 31 * on-disk inode : 512 bytes
32 * 32 *
33 * note: align 64-bit fields on 8-byte boundary. 33 * note: align 64-bit fields on 8-byte boundary.
34 */ 34 */
35struct dinode { 35struct dinode {
36 /* 36 /*
37 * I. base area (128 bytes) 37 * I. base area (128 bytes)
38 * ------------------------ 38 * ------------------------
39 * 39 *
40 * define generic/POSIX attributes 40 * define generic/POSIX attributes
41 */ 41 */
@@ -70,16 +70,16 @@ struct dinode {
70 __le32 di_acltype; /* 4: Type of ACL */ 70 __le32 di_acltype; /* 4: Type of ACL */
71 71
72 /* 72 /*
73 * Extension Areas. 73 * Extension Areas.
74 * 74 *
75 * Historically, the inode was partitioned into 4 128-byte areas, 75 * Historically, the inode was partitioned into 4 128-byte areas,
76 * the last 3 being defined as unions which could have multiple 76 * the last 3 being defined as unions which could have multiple
77 * uses. The first 96 bytes had been completely unused until 77 * uses. The first 96 bytes had been completely unused until
78 * an index table was added to the directory. It is now more 78 * an index table was added to the directory. It is now more
79 * useful to describe the last 3/4 of the inode as a single 79 * useful to describe the last 3/4 of the inode as a single
80 * union. We would probably be better off redesigning the 80 * union. We would probably be better off redesigning the
81 * entire structure from scratch, but we don't want to break 81 * entire structure from scratch, but we don't want to break
82 * commonality with OS/2's JFS at this time. 82 * commonality with OS/2's JFS at this time.
83 */ 83 */
84 union { 84 union {
85 struct { 85 struct {
@@ -95,7 +95,7 @@ struct dinode {
95 } _dir; /* (384) */ 95 } _dir; /* (384) */
96#define di_dirtable u._dir._table 96#define di_dirtable u._dir._table
97#define di_dtroot u._dir._dtroot 97#define di_dtroot u._dir._dtroot
98#define di_parent di_dtroot.header.idotdot 98#define di_parent di_dtroot.header.idotdot
99#define di_DASD di_dtroot.header.DASD 99#define di_DASD di_dtroot.header.DASD
100 100
101 struct { 101 struct {
@@ -127,14 +127,14 @@ struct dinode {
127#define di_inlinedata u._file._u2._special._u 127#define di_inlinedata u._file._u2._special._u
128#define di_rdev u._file._u2._special._u._rdev 128#define di_rdev u._file._u2._special._u._rdev
129#define di_fastsymlink u._file._u2._special._u._fastsymlink 129#define di_fastsymlink u._file._u2._special._u._fastsymlink
130#define di_inlineea u._file._u2._special._inlineea 130#define di_inlineea u._file._u2._special._inlineea
131 } u; 131 } u;
132}; 132};
133 133
134/* extended mode bits (on-disk inode di_mode) */ 134/* extended mode bits (on-disk inode di_mode) */
135#define IFJOURNAL 0x00010000 /* journalled file */ 135#define IFJOURNAL 0x00010000 /* journalled file */
136#define ISPARSE 0x00020000 /* sparse file enabled */ 136#define ISPARSE 0x00020000 /* sparse file enabled */
137#define INLINEEA 0x00040000 /* inline EA area free */ 137#define INLINEEA 0x00040000 /* inline EA area free */
138#define ISWAPFILE 0x00800000 /* file open for pager swap space */ 138#define ISWAPFILE 0x00800000 /* file open for pager swap space */
139 139
140/* more extended mode bits: attributes for OS/2 */ 140/* more extended mode bits: attributes for OS/2 */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index f3b1ebb222..e1985066b1 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -154,12 +154,12 @@ static const s8 budtab[256] = {
154 * the in-core descriptor is initialized from disk. 154 * the in-core descriptor is initialized from disk.
155 * 155 *
156 * PARAMETERS: 156 * PARAMETERS:
157 * ipbmap - pointer to in-core inode for the block map. 157 * ipbmap - pointer to in-core inode for the block map.
158 * 158 *
159 * RETURN VALUES: 159 * RETURN VALUES:
160 * 0 - success 160 * 0 - success
161 * -ENOMEM - insufficient memory 161 * -ENOMEM - insufficient memory
162 * -EIO - i/o error 162 * -EIO - i/o error
163 */ 163 */
164int dbMount(struct inode *ipbmap) 164int dbMount(struct inode *ipbmap)
165{ 165{
@@ -232,11 +232,11 @@ int dbMount(struct inode *ipbmap)
232 * the memory for this descriptor is freed. 232 * the memory for this descriptor is freed.
233 * 233 *
234 * PARAMETERS: 234 * PARAMETERS:
235 * ipbmap - pointer to in-core inode for the block map. 235 * ipbmap - pointer to in-core inode for the block map.
236 * 236 *
237 * RETURN VALUES: 237 * RETURN VALUES:
238 * 0 - success 238 * 0 - success
239 * -EIO - i/o error 239 * -EIO - i/o error
240 */ 240 */
241int dbUnmount(struct inode *ipbmap, int mounterror) 241int dbUnmount(struct inode *ipbmap, int mounterror)
242{ 242{
@@ -320,13 +320,13 @@ int dbSync(struct inode *ipbmap)
320 * at a time. 320 * at a time.
321 * 321 *
322 * PARAMETERS: 322 * PARAMETERS:
323 * ip - pointer to in-core inode; 323 * ip - pointer to in-core inode;
324 * blkno - starting block number to be freed. 324 * blkno - starting block number to be freed.
325 * nblocks - number of blocks to be freed. 325 * nblocks - number of blocks to be freed.
326 * 326 *
327 * RETURN VALUES: 327 * RETURN VALUES:
328 * 0 - success 328 * 0 - success
329 * -EIO - i/o error 329 * -EIO - i/o error
330 */ 330 */
331int dbFree(struct inode *ip, s64 blkno, s64 nblocks) 331int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
332{ 332{
@@ -395,23 +395,23 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
395/* 395/*
396 * NAME: dbUpdatePMap() 396 * NAME: dbUpdatePMap()
397 * 397 *
398 * FUNCTION: update the allocation state (free or allocate) of the 398 * FUNCTION: update the allocation state (free or allocate) of the
399 * specified block range in the persistent block allocation map. 399 * specified block range in the persistent block allocation map.
400 * 400 *
401 * the blocks will be updated in the persistent map one 401 * the blocks will be updated in the persistent map one
402 * dmap at a time. 402 * dmap at a time.
403 * 403 *
404 * PARAMETERS: 404 * PARAMETERS:
405 * ipbmap - pointer to in-core inode for the block map. 405 * ipbmap - pointer to in-core inode for the block map.
406 * free - 'true' if block range is to be freed from the persistent 406 * free - 'true' if block range is to be freed from the persistent
407 * map; 'false' if it is to be allocated. 407 * map; 'false' if it is to be allocated.
408 * blkno - starting block number of the range. 408 * blkno - starting block number of the range.
409 * nblocks - number of contiguous blocks in the range. 409 * nblocks - number of contiguous blocks in the range.
410 * tblk - transaction block; 410 * tblk - transaction block;
411 * 411 *
412 * RETURN VALUES: 412 * RETURN VALUES:
413 * 0 - success 413 * 0 - success
414 * -EIO - i/o error 414 * -EIO - i/o error
415 */ 415 */
416int 416int
417dbUpdatePMap(struct inode *ipbmap, 417dbUpdatePMap(struct inode *ipbmap,
@@ -573,7 +573,7 @@ dbUpdatePMap(struct inode *ipbmap,
573/* 573/*
574 * NAME: dbNextAG() 574 * NAME: dbNextAG()
575 * 575 *
576 * FUNCTION: find the preferred allocation group for new allocations. 576 * FUNCTION: find the preferred allocation group for new allocations.
577 * 577 *
578 * Within the allocation groups, we maintain a preferred 578 * Within the allocation groups, we maintain a preferred
579 * allocation group which consists of a group with at least 579 * allocation group which consists of a group with at least
@@ -589,10 +589,10 @@ dbUpdatePMap(struct inode *ipbmap,
589 * empty ags around for large allocations. 589 * empty ags around for large allocations.
590 * 590 *
591 * PARAMETERS: 591 * PARAMETERS:
592 * ipbmap - pointer to in-core inode for the block map. 592 * ipbmap - pointer to in-core inode for the block map.
593 * 593 *
594 * RETURN VALUES: 594 * RETURN VALUES:
595 * the preferred allocation group number. 595 * the preferred allocation group number.
596 */ 596 */
597int dbNextAG(struct inode *ipbmap) 597int dbNextAG(struct inode *ipbmap)
598{ 598{
@@ -656,7 +656,7 @@ unlock:
656/* 656/*
657 * NAME: dbAlloc() 657 * NAME: dbAlloc()
658 * 658 *
659 * FUNCTION: attempt to allocate a specified number of contiguous free 659 * FUNCTION: attempt to allocate a specified number of contiguous free
660 * blocks from the working allocation block map. 660 * blocks from the working allocation block map.
661 * 661 *
662 * the block allocation policy uses hints and a multi-step 662 * the block allocation policy uses hints and a multi-step
@@ -680,16 +680,16 @@ unlock:
680 * size or requests that specify no hint value. 680 * size or requests that specify no hint value.
681 * 681 *
682 * PARAMETERS: 682 * PARAMETERS:
683 * ip - pointer to in-core inode; 683 * ip - pointer to in-core inode;
684 * hint - allocation hint. 684 * hint - allocation hint.
685 * nblocks - number of contiguous blocks in the range. 685 * nblocks - number of contiguous blocks in the range.
686 * results - on successful return, set to the starting block number 686 * results - on successful return, set to the starting block number
687 * of the newly allocated contiguous range. 687 * of the newly allocated contiguous range.
688 * 688 *
689 * RETURN VALUES: 689 * RETURN VALUES:
690 * 0 - success 690 * 0 - success
691 * -ENOSPC - insufficient disk resources 691 * -ENOSPC - insufficient disk resources
692 * -EIO - i/o error 692 * -EIO - i/o error
693 */ 693 */
694int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) 694int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
695{ 695{
@@ -706,12 +706,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
706 /* assert that nblocks is valid */ 706 /* assert that nblocks is valid */
707 assert(nblocks > 0); 707 assert(nblocks > 0);
708 708
709#ifdef _STILL_TO_PORT
710 /* DASD limit check F226941 */
711 if (OVER_LIMIT(ip, nblocks))
712 return -ENOSPC;
713#endif /* _STILL_TO_PORT */
714
715 /* get the log2 number of blocks to be allocated. 709 /* get the log2 number of blocks to be allocated.
716 * if the number of blocks is not a log2 multiple, 710 * if the number of blocks is not a log2 multiple,
717 * it will be rounded up to the next log2 multiple. 711 * it will be rounded up to the next log2 multiple.
@@ -720,7 +714,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
720 714
721 bmp = JFS_SBI(ip->i_sb)->bmap; 715 bmp = JFS_SBI(ip->i_sb)->bmap;
722 716
723//retry: /* serialize w.r.t.extendfs() */
724 mapSize = bmp->db_mapsize; 717 mapSize = bmp->db_mapsize;
725 718
726 /* the hint should be within the map */ 719 /* the hint should be within the map */
@@ -879,17 +872,17 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
879/* 872/*
880 * NAME: dbAllocExact() 873 * NAME: dbAllocExact()
881 * 874 *
882 * FUNCTION: try to allocate the requested extent; 875 * FUNCTION: try to allocate the requested extent;
883 * 876 *
884 * PARAMETERS: 877 * PARAMETERS:
885 * ip - pointer to in-core inode; 878 * ip - pointer to in-core inode;
886 * blkno - extent address; 879 * blkno - extent address;
887 * nblocks - extent length; 880 * nblocks - extent length;
888 * 881 *
889 * RETURN VALUES: 882 * RETURN VALUES:
890 * 0 - success 883 * 0 - success
891 * -ENOSPC - insufficient disk resources 884 * -ENOSPC - insufficient disk resources
892 * -EIO - i/o error 885 * -EIO - i/o error
893 */ 886 */
894int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) 887int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
895{ 888{
@@ -946,7 +939,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
946/* 939/*
947 * NAME: dbReAlloc() 940 * NAME: dbReAlloc()
948 * 941 *
949 * FUNCTION: attempt to extend a current allocation by a specified 942 * FUNCTION: attempt to extend a current allocation by a specified
950 * number of blocks. 943 * number of blocks.
951 * 944 *
952 * this routine attempts to satisfy the allocation request 945 * this routine attempts to satisfy the allocation request
@@ -959,21 +952,21 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
959 * number of blocks required. 952 * number of blocks required.
960 * 953 *
961 * PARAMETERS: 954 * PARAMETERS:
962 * ip - pointer to in-core inode requiring allocation. 955 * ip - pointer to in-core inode requiring allocation.
963 * blkno - starting block of the current allocation. 956 * blkno - starting block of the current allocation.
964 * nblocks - number of contiguous blocks within the current 957 * nblocks - number of contiguous blocks within the current
965 * allocation. 958 * allocation.
966 * addnblocks - number of blocks to add to the allocation. 959 * addnblocks - number of blocks to add to the allocation.
967 * results - on successful return, set to the starting block number 960 * results - on successful return, set to the starting block number
968 * of the existing allocation if the existing allocation 961 * of the existing allocation if the existing allocation
969 * was extended in place or to a newly allocated contiguous 962 * was extended in place or to a newly allocated contiguous
970 * range if the existing allocation could not be extended 963 * range if the existing allocation could not be extended
971 * in place. 964 * in place.
972 * 965 *
973 * RETURN VALUES: 966 * RETURN VALUES:
974 * 0 - success 967 * 0 - success
975 * -ENOSPC - insufficient disk resources 968 * -ENOSPC - insufficient disk resources
976 * -EIO - i/o error 969 * -EIO - i/o error
977 */ 970 */
978int 971int
979dbReAlloc(struct inode *ip, 972dbReAlloc(struct inode *ip,
@@ -1004,7 +997,7 @@ dbReAlloc(struct inode *ip,
1004/* 997/*
1005 * NAME: dbExtend() 998 * NAME: dbExtend()
1006 * 999 *
1007 * FUNCTION: attempt to extend a current allocation by a specified 1000 * FUNCTION: attempt to extend a current allocation by a specified
1008 * number of blocks. 1001 * number of blocks.
1009 * 1002 *
1010 * this routine attempts to satisfy the allocation request 1003 * this routine attempts to satisfy the allocation request
@@ -1013,16 +1006,16 @@ dbReAlloc(struct inode *ip,
1013 * immediately following the current allocation. 1006 * immediately following the current allocation.
1014 * 1007 *
1015 * PARAMETERS: 1008 * PARAMETERS:
1016 * ip - pointer to in-core inode requiring allocation. 1009 * ip - pointer to in-core inode requiring allocation.
1017 * blkno - starting block of the current allocation. 1010 * blkno - starting block of the current allocation.
1018 * nblocks - number of contiguous blocks within the current 1011 * nblocks - number of contiguous blocks within the current
1019 * allocation. 1012 * allocation.
1020 * addnblocks - number of blocks to add to the allocation. 1013 * addnblocks - number of blocks to add to the allocation.
1021 * 1014 *
1022 * RETURN VALUES: 1015 * RETURN VALUES:
1023 * 0 - success 1016 * 0 - success
1024 * -ENOSPC - insufficient disk resources 1017 * -ENOSPC - insufficient disk resources
1025 * -EIO - i/o error 1018 * -EIO - i/o error
1026 */ 1019 */
1027static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) 1020static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1028{ 1021{
@@ -1109,19 +1102,19 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1109/* 1102/*
1110 * NAME: dbAllocNext() 1103 * NAME: dbAllocNext()
1111 * 1104 *
1112 * FUNCTION: attempt to allocate the blocks of the specified block 1105 * FUNCTION: attempt to allocate the blocks of the specified block
1113 * range within a dmap. 1106 * range within a dmap.
1114 * 1107 *
1115 * PARAMETERS: 1108 * PARAMETERS:
1116 * bmp - pointer to bmap descriptor 1109 * bmp - pointer to bmap descriptor
1117 * dp - pointer to dmap. 1110 * dp - pointer to dmap.
1118 * blkno - starting block number of the range. 1111 * blkno - starting block number of the range.
1119 * nblocks - number of contiguous free blocks of the range. 1112 * nblocks - number of contiguous free blocks of the range.
1120 * 1113 *
1121 * RETURN VALUES: 1114 * RETURN VALUES:
1122 * 0 - success 1115 * 0 - success
1123 * -ENOSPC - insufficient disk resources 1116 * -ENOSPC - insufficient disk resources
1124 * -EIO - i/o error 1117 * -EIO - i/o error
1125 * 1118 *
1126 * serialization: IREAD_LOCK(ipbmap) held on entry/exit; 1119 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1127 */ 1120 */
@@ -1233,7 +1226,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1233/* 1226/*
1234 * NAME: dbAllocNear() 1227 * NAME: dbAllocNear()
1235 * 1228 *
1236 * FUNCTION: attempt to allocate a number of contiguous free blocks near 1229 * FUNCTION: attempt to allocate a number of contiguous free blocks near
1237 * a specified block (hint) within a dmap. 1230 * a specified block (hint) within a dmap.
1238 * 1231 *
1239 * starting with the dmap leaf that covers the hint, we'll 1232 * starting with the dmap leaf that covers the hint, we'll
@@ -1242,18 +1235,18 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1242 * the desired free space. 1235 * the desired free space.
1243 * 1236 *
1244 * PARAMETERS: 1237 * PARAMETERS:
1245 * bmp - pointer to bmap descriptor 1238 * bmp - pointer to bmap descriptor
1246 * dp - pointer to dmap. 1239 * dp - pointer to dmap.
1247 * blkno - block number to allocate near. 1240 * blkno - block number to allocate near.
1248 * nblocks - actual number of contiguous free blocks desired. 1241 * nblocks - actual number of contiguous free blocks desired.
1249 * l2nb - log2 number of contiguous free blocks desired. 1242 * l2nb - log2 number of contiguous free blocks desired.
1250 * results - on successful return, set to the starting block number 1243 * results - on successful return, set to the starting block number
1251 * of the newly allocated range. 1244 * of the newly allocated range.
1252 * 1245 *
1253 * RETURN VALUES: 1246 * RETURN VALUES:
1254 * 0 - success 1247 * 0 - success
1255 * -ENOSPC - insufficient disk resources 1248 * -ENOSPC - insufficient disk resources
1256 * -EIO - i/o error 1249 * -EIO - i/o error
1257 * 1250 *
1258 * serialization: IREAD_LOCK(ipbmap) held on entry/exit; 1251 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1259 */ 1252 */
@@ -1316,7 +1309,7 @@ dbAllocNear(struct bmap * bmp,
1316/* 1309/*
1317 * NAME: dbAllocAG() 1310 * NAME: dbAllocAG()
1318 * 1311 *
1319 * FUNCTION: attempt to allocate the specified number of contiguous 1312 * FUNCTION: attempt to allocate the specified number of contiguous
1320 * free blocks within the specified allocation group. 1313 * free blocks within the specified allocation group.
1321 * 1314 *
1322 * unless the allocation group size is equal to the number 1315 * unless the allocation group size is equal to the number
@@ -1353,17 +1346,17 @@ dbAllocNear(struct bmap * bmp,
1353 * the allocation group. 1346 * the allocation group.
1354 * 1347 *
1355 * PARAMETERS: 1348 * PARAMETERS:
1356 * bmp - pointer to bmap descriptor 1349 * bmp - pointer to bmap descriptor
1357 * agno - allocation group number. 1350 * agno - allocation group number.
1358 * nblocks - actual number of contiguous free blocks desired. 1351 * nblocks - actual number of contiguous free blocks desired.
1359 * l2nb - log2 number of contiguous free blocks desired. 1352 * l2nb - log2 number of contiguous free blocks desired.
1360 * results - on successful return, set to the starting block number 1353 * results - on successful return, set to the starting block number
1361 * of the newly allocated range. 1354 * of the newly allocated range.
1362 * 1355 *
1363 * RETURN VALUES: 1356 * RETURN VALUES:
1364 * 0 - success 1357 * 0 - success
1365 * -ENOSPC - insufficient disk resources 1358 * -ENOSPC - insufficient disk resources
1366 * -EIO - i/o error 1359 * -EIO - i/o error
1367 * 1360 *
1368 * note: IWRITE_LOCK(ipmap) held on entry/exit; 1361 * note: IWRITE_LOCK(ipmap) held on entry/exit;
1369 */ 1362 */
@@ -1546,7 +1539,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1546/* 1539/*
1547 * NAME: dbAllocAny() 1540 * NAME: dbAllocAny()
1548 * 1541 *
1549 * FUNCTION: attempt to allocate the specified number of contiguous 1542 * FUNCTION: attempt to allocate the specified number of contiguous
1550 * free blocks anywhere in the file system. 1543 * free blocks anywhere in the file system.
1551 * 1544 *
1552 * dbAllocAny() attempts to find the sufficient free space by 1545 * dbAllocAny() attempts to find the sufficient free space by
@@ -1556,16 +1549,16 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1556 * desired free space is allocated. 1549 * desired free space is allocated.
1557 * 1550 *
1558 * PARAMETERS: 1551 * PARAMETERS:
1559 * bmp - pointer to bmap descriptor 1552 * bmp - pointer to bmap descriptor
1560 * nblocks - actual number of contiguous free blocks desired. 1553 * nblocks - actual number of contiguous free blocks desired.
1561 * l2nb - log2 number of contiguous free blocks desired. 1554 * l2nb - log2 number of contiguous free blocks desired.
1562 * results - on successful return, set to the starting block number 1555 * results - on successful return, set to the starting block number
1563 * of the newly allocated range. 1556 * of the newly allocated range.
1564 * 1557 *
1565 * RETURN VALUES: 1558 * RETURN VALUES:
1566 * 0 - success 1559 * 0 - success
1567 * -ENOSPC - insufficient disk resources 1560 * -ENOSPC - insufficient disk resources
1568 * -EIO - i/o error 1561 * -EIO - i/o error
1569 * 1562 *
1570 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1563 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1571 */ 1564 */
@@ -1598,9 +1591,9 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1598/* 1591/*
1599 * NAME: dbFindCtl() 1592 * NAME: dbFindCtl()
1600 * 1593 *
1601 * FUNCTION: starting at a specified dmap control page level and block 1594 * FUNCTION: starting at a specified dmap control page level and block
1602 * number, search down the dmap control levels for a range of 1595 * number, search down the dmap control levels for a range of
1603 * contiguous free blocks large enough to satisfy an allocation 1596 * contiguous free blocks large enough to satisfy an allocation
1604 * request for the specified number of free blocks. 1597 * request for the specified number of free blocks.
1605 * 1598 *
1606 * if sufficient contiguous free blocks are found, this routine 1599 * if sufficient contiguous free blocks are found, this routine
@@ -1609,17 +1602,17 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1609 * is sufficient in size. 1602 * is sufficient in size.
1610 * 1603 *
1611 * PARAMETERS: 1604 * PARAMETERS:
1612 * bmp - pointer to bmap descriptor 1605 * bmp - pointer to bmap descriptor
1613 * level - starting dmap control page level. 1606 * level - starting dmap control page level.
1614 * l2nb - log2 number of contiguous free blocks desired. 1607 * l2nb - log2 number of contiguous free blocks desired.
1615 * *blkno - on entry, starting block number for conducting the search. 1608 * *blkno - on entry, starting block number for conducting the search.
1616 * on successful return, the first block within a dmap page 1609 * on successful return, the first block within a dmap page
1617 * that contains or starts a range of contiguous free blocks. 1610 * that contains or starts a range of contiguous free blocks.
1618 * 1611 *
1619 * RETURN VALUES: 1612 * RETURN VALUES:
1620 * 0 - success 1613 * 0 - success
1621 * -ENOSPC - insufficient disk resources 1614 * -ENOSPC - insufficient disk resources
1622 * -EIO - i/o error 1615 * -EIO - i/o error
1623 * 1616 *
1624 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1617 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1625 */ 1618 */
@@ -1699,7 +1692,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1699/* 1692/*
1700 * NAME: dbAllocCtl() 1693 * NAME: dbAllocCtl()
1701 * 1694 *
1702 * FUNCTION: attempt to allocate a specified number of contiguous 1695 * FUNCTION: attempt to allocate a specified number of contiguous
1703 * blocks starting within a specific dmap. 1696 * blocks starting within a specific dmap.
1704 * 1697 *
1705 * this routine is called by higher level routines that search 1698 * this routine is called by higher level routines that search
@@ -1726,18 +1719,18 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1726 * first dmap (i.e. blkno). 1719 * first dmap (i.e. blkno).
1727 * 1720 *
1728 * PARAMETERS: 1721 * PARAMETERS:
1729 * bmp - pointer to bmap descriptor 1722 * bmp - pointer to bmap descriptor
1730 * nblocks - actual number of contiguous free blocks to allocate. 1723 * nblocks - actual number of contiguous free blocks to allocate.
1731 * l2nb - log2 number of contiguous free blocks to allocate. 1724 * l2nb - log2 number of contiguous free blocks to allocate.
1732 * blkno - starting block number of the dmap to start the allocation 1725 * blkno - starting block number of the dmap to start the allocation
1733 * from. 1726 * from.
1734 * results - on successful return, set to the starting block number 1727 * results - on successful return, set to the starting block number
1735 * of the newly allocated range. 1728 * of the newly allocated range.
1736 * 1729 *
1737 * RETURN VALUES: 1730 * RETURN VALUES:
1738 * 0 - success 1731 * 0 - success
1739 * -ENOSPC - insufficient disk resources 1732 * -ENOSPC - insufficient disk resources
1740 * -EIO - i/o error 1733 * -EIO - i/o error
1741 * 1734 *
1742 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1735 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1743 */ 1736 */
@@ -1870,7 +1863,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1870/* 1863/*
1871 * NAME: dbAllocDmapLev() 1864 * NAME: dbAllocDmapLev()
1872 * 1865 *
1873 * FUNCTION: attempt to allocate a specified number of contiguous blocks 1866 * FUNCTION: attempt to allocate a specified number of contiguous blocks
1874 * from a specified dmap. 1867 * from a specified dmap.
1875 * 1868 *
1876 * this routine checks if the contiguous blocks are available. 1869 * this routine checks if the contiguous blocks are available.
@@ -1878,17 +1871,17 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1878 * returned. 1871 * returned.
1879 * 1872 *
1880 * PARAMETERS: 1873 * PARAMETERS:
1881 * mp - pointer to bmap descriptor 1874 * mp - pointer to bmap descriptor
1882 * dp - pointer to dmap to attempt to allocate blocks from. 1875 * dp - pointer to dmap to attempt to allocate blocks from.
1883 * l2nb - log2 number of contiguous block desired. 1876 * l2nb - log2 number of contiguous block desired.
1884 * nblocks - actual number of contiguous block desired. 1877 * nblocks - actual number of contiguous block desired.
1885 * results - on successful return, set to the starting block number 1878 * results - on successful return, set to the starting block number
1886 * of the newly allocated range. 1879 * of the newly allocated range.
1887 * 1880 *
1888 * RETURN VALUES: 1881 * RETURN VALUES:
1889 * 0 - success 1882 * 0 - success
1890 * -ENOSPC - insufficient disk resources 1883 * -ENOSPC - insufficient disk resources
1891 * -EIO - i/o error 1884 * -EIO - i/o error
1892 * 1885 *
1893 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or 1886 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
1894 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; 1887 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
@@ -1933,7 +1926,7 @@ dbAllocDmapLev(struct bmap * bmp,
1933/* 1926/*
1934 * NAME: dbAllocDmap() 1927 * NAME: dbAllocDmap()
1935 * 1928 *
1936 * FUNCTION: adjust the disk allocation map to reflect the allocation 1929 * FUNCTION: adjust the disk allocation map to reflect the allocation
1937 * of a specified block range within a dmap. 1930 * of a specified block range within a dmap.
1938 * 1931 *
1939 * this routine allocates the specified blocks from the dmap 1932 * this routine allocates the specified blocks from the dmap
@@ -1946,14 +1939,14 @@ dbAllocDmapLev(struct bmap * bmp,
1946 * covers this dmap. 1939 * covers this dmap.
1947 * 1940 *
1948 * PARAMETERS: 1941 * PARAMETERS:
1949 * bmp - pointer to bmap descriptor 1942 * bmp - pointer to bmap descriptor
1950 * dp - pointer to dmap to allocate the block range from. 1943 * dp - pointer to dmap to allocate the block range from.
1951 * blkno - starting block number of the block to be allocated. 1944 * blkno - starting block number of the block to be allocated.
1952 * nblocks - number of blocks to be allocated. 1945 * nblocks - number of blocks to be allocated.
1953 * 1946 *
1954 * RETURN VALUES: 1947 * RETURN VALUES:
1955 * 0 - success 1948 * 0 - success
1956 * -EIO - i/o error 1949 * -EIO - i/o error
1957 * 1950 *
1958 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 1951 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
1959 */ 1952 */
@@ -1989,7 +1982,7 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
1989/* 1982/*
1990 * NAME: dbFreeDmap() 1983 * NAME: dbFreeDmap()
1991 * 1984 *
1992 * FUNCTION: adjust the disk allocation map to reflect the allocation 1985 * FUNCTION: adjust the disk allocation map to reflect the allocation
1993 * of a specified block range within a dmap. 1986 * of a specified block range within a dmap.
1994 * 1987 *
1995 * this routine frees the specified blocks from the dmap through 1988 * this routine frees the specified blocks from the dmap through
@@ -1997,18 +1990,18 @@ static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
1997 * causes the maximum string of free blocks within the dmap to 1990 * causes the maximum string of free blocks within the dmap to
1998 * change (i.e. the value of the root of the dmap's dmtree), this 1991 * change (i.e. the value of the root of the dmap's dmtree), this
1999 * routine will cause this change to be reflected up through the 1992 * routine will cause this change to be reflected up through the
2000 * appropriate levels of the dmap control pages by a call to 1993 * appropriate levels of the dmap control pages by a call to
2001 * dbAdjCtl() for the L0 dmap control page that covers this dmap. 1994 * dbAdjCtl() for the L0 dmap control page that covers this dmap.
2002 * 1995 *
2003 * PARAMETERS: 1996 * PARAMETERS:
2004 * bmp - pointer to bmap descriptor 1997 * bmp - pointer to bmap descriptor
2005 * dp - pointer to dmap to free the block range from. 1998 * dp - pointer to dmap to free the block range from.
2006 * blkno - starting block number of the block to be freed. 1999 * blkno - starting block number of the block to be freed.
2007 * nblocks - number of blocks to be freed. 2000 * nblocks - number of blocks to be freed.
2008 * 2001 *
2009 * RETURN VALUES: 2002 * RETURN VALUES:
2010 * 0 - success 2003 * 0 - success
2011 * -EIO - i/o error 2004 * -EIO - i/o error
2012 * 2005 *
2013 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2006 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2014 */ 2007 */
@@ -2055,7 +2048,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2055/* 2048/*
2056 * NAME: dbAllocBits() 2049 * NAME: dbAllocBits()
2057 * 2050 *
2058 * FUNCTION: allocate a specified block range from a dmap. 2051 * FUNCTION: allocate a specified block range from a dmap.
2059 * 2052 *
2060 * this routine updates the dmap to reflect the working 2053 * this routine updates the dmap to reflect the working
2061 * state allocation of the specified block range. it directly 2054 * state allocation of the specified block range. it directly
@@ -2065,10 +2058,10 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2065 * dmap's dmtree, as a whole, to reflect the allocated range. 2058 * dmap's dmtree, as a whole, to reflect the allocated range.
2066 * 2059 *
2067 * PARAMETERS: 2060 * PARAMETERS:
2068 * bmp - pointer to bmap descriptor 2061 * bmp - pointer to bmap descriptor
2069 * dp - pointer to dmap to allocate bits from. 2062 * dp - pointer to dmap to allocate bits from.
2070 * blkno - starting block number of the bits to be allocated. 2063 * blkno - starting block number of the bits to be allocated.
2071 * nblocks - number of bits to be allocated. 2064 * nblocks - number of bits to be allocated.
2072 * 2065 *
2073 * RETURN VALUES: none 2066 * RETURN VALUES: none
2074 * 2067 *
@@ -2149,7 +2142,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2149 * the allocated words. 2142 * the allocated words.
2150 */ 2143 */
2151 for (; nwords > 0; nwords -= nw) { 2144 for (; nwords > 0; nwords -= nw) {
2152 if (leaf[word] < BUDMIN) { 2145 if (leaf[word] < BUDMIN) {
2153 jfs_error(bmp->db_ipbmap->i_sb, 2146 jfs_error(bmp->db_ipbmap->i_sb,
2154 "dbAllocBits: leaf page " 2147 "dbAllocBits: leaf page "
2155 "corrupt"); 2148 "corrupt");
@@ -2202,7 +2195,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2202/* 2195/*
2203 * NAME: dbFreeBits() 2196 * NAME: dbFreeBits()
2204 * 2197 *
2205 * FUNCTION: free a specified block range from a dmap. 2198 * FUNCTION: free a specified block range from a dmap.
2206 * 2199 *
2207 * this routine updates the dmap to reflect the working 2200 * this routine updates the dmap to reflect the working
2208 * state allocation of the specified block range. it directly 2201 * state allocation of the specified block range. it directly
@@ -2212,10 +2205,10 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2212 * dmtree, as a whole, to reflect the deallocated range. 2205 * dmtree, as a whole, to reflect the deallocated range.
2213 * 2206 *
2214 * PARAMETERS: 2207 * PARAMETERS:
2215 * bmp - pointer to bmap descriptor 2208 * bmp - pointer to bmap descriptor
2216 * dp - pointer to dmap to free bits from. 2209 * dp - pointer to dmap to free bits from.
2217 * blkno - starting block number of the bits to be freed. 2210 * blkno - starting block number of the bits to be freed.
2218 * nblocks - number of bits to be freed. 2211 * nblocks - number of bits to be freed.
2219 * 2212 *
2220 * RETURN VALUES: 0 for success 2213 * RETURN VALUES: 0 for success
2221 * 2214 *
@@ -2388,19 +2381,19 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2388 * the new root value and the next dmap control page level to 2381 * the new root value and the next dmap control page level to
2389 * be adjusted. 2382 * be adjusted.
2390 * PARAMETERS: 2383 * PARAMETERS:
2391 * bmp - pointer to bmap descriptor 2384 * bmp - pointer to bmap descriptor
2392 * blkno - the first block of a block range within a dmap. it is 2385 * blkno - the first block of a block range within a dmap. it is
2393 * the allocation or deallocation of this block range that 2386 * the allocation or deallocation of this block range that
2394 * requires the dmap control page to be adjusted. 2387 * requires the dmap control page to be adjusted.
2395 * newval - the new value of the lower level dmap or dmap control 2388 * newval - the new value of the lower level dmap or dmap control
2396 * page root. 2389 * page root.
2397 * alloc - 'true' if adjustment is due to an allocation. 2390 * alloc - 'true' if adjustment is due to an allocation.
2398 * level - current level of dmap control page (i.e. L0, L1, L2) to 2391 * level - current level of dmap control page (i.e. L0, L1, L2) to
2399 * be adjusted. 2392 * be adjusted.
2400 * 2393 *
2401 * RETURN VALUES: 2394 * RETURN VALUES:
2402 * 0 - success 2395 * 0 - success
2403 * -EIO - i/o error 2396 * -EIO - i/o error
2404 * 2397 *
2405 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2398 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2406 */ 2399 */
@@ -2544,16 +2537,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2544/* 2537/*
2545 * NAME: dbSplit() 2538 * NAME: dbSplit()
2546 * 2539 *
2547 * FUNCTION: update the leaf of a dmtree with a new value, splitting 2540 * FUNCTION: update the leaf of a dmtree with a new value, splitting
2548 * the leaf from the binary buddy system of the dmtree's 2541 * the leaf from the binary buddy system of the dmtree's
2549 * leaves, as required. 2542 * leaves, as required.
2550 * 2543 *
2551 * PARAMETERS: 2544 * PARAMETERS:
2552 * tp - pointer to the tree containing the leaf. 2545 * tp - pointer to the tree containing the leaf.
2553 * leafno - the number of the leaf to be updated. 2546 * leafno - the number of the leaf to be updated.
2554 * splitsz - the size the binary buddy system starting at the leaf 2547 * splitsz - the size the binary buddy system starting at the leaf
2555 * must be split to, specified as the log2 number of blocks. 2548 * must be split to, specified as the log2 number of blocks.
2556 * newval - the new value for the leaf. 2549 * newval - the new value for the leaf.
2557 * 2550 *
2558 * RETURN VALUES: none 2551 * RETURN VALUES: none
2559 * 2552 *
@@ -2600,7 +2593,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2600/* 2593/*
2601 * NAME: dbBackSplit() 2594 * NAME: dbBackSplit()
2602 * 2595 *
2603 * FUNCTION: back split the binary buddy system of dmtree leaves 2596 * FUNCTION: back split the binary buddy system of dmtree leaves
2604 * that hold a specified leaf until the specified leaf 2597 * that hold a specified leaf until the specified leaf
2605 * starts its own binary buddy system. 2598 * starts its own binary buddy system.
2606 * 2599 *
@@ -2617,8 +2610,8 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2617 * in which a previous join operation must be backed out. 2610 * in which a previous join operation must be backed out.
2618 * 2611 *
2619 * PARAMETERS: 2612 * PARAMETERS:
2620 * tp - pointer to the tree containing the leaf. 2613 * tp - pointer to the tree containing the leaf.
2621 * leafno - the number of the leaf to be updated. 2614 * leafno - the number of the leaf to be updated.
2622 * 2615 *
2623 * RETURN VALUES: none 2616 * RETURN VALUES: none
2624 * 2617 *
@@ -2692,14 +2685,14 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
2692/* 2685/*
2693 * NAME: dbJoin() 2686 * NAME: dbJoin()
2694 * 2687 *
2695 * FUNCTION: update the leaf of a dmtree with a new value, joining 2688 * FUNCTION: update the leaf of a dmtree with a new value, joining
2696 * the leaf with other leaves of the dmtree into a multi-leaf 2689 * the leaf with other leaves of the dmtree into a multi-leaf
2697 * binary buddy system, as required. 2690 * binary buddy system, as required.
2698 * 2691 *
2699 * PARAMETERS: 2692 * PARAMETERS:
2700 * tp - pointer to the tree containing the leaf. 2693 * tp - pointer to the tree containing the leaf.
2701 * leafno - the number of the leaf to be updated. 2694 * leafno - the number of the leaf to be updated.
2702 * newval - the new value for the leaf. 2695 * newval - the new value for the leaf.
2703 * 2696 *
2704 * RETURN VALUES: none 2697 * RETURN VALUES: none
2705 */ 2698 */
@@ -2785,15 +2778,15 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
2785/* 2778/*
2786 * NAME: dbAdjTree() 2779 * NAME: dbAdjTree()
2787 * 2780 *
2788 * FUNCTION: update a leaf of a dmtree with a new value, adjusting 2781 * FUNCTION: update a leaf of a dmtree with a new value, adjusting
2789 * the dmtree, as required, to reflect the new leaf value. 2782 * the dmtree, as required, to reflect the new leaf value.
2790 * the combination of any buddies must already be done before 2783 * the combination of any buddies must already be done before
2791 * this is called. 2784 * this is called.
2792 * 2785 *
2793 * PARAMETERS: 2786 * PARAMETERS:
2794 * tp - pointer to the tree to be adjusted. 2787 * tp - pointer to the tree to be adjusted.
2795 * leafno - the number of the leaf to be updated. 2788 * leafno - the number of the leaf to be updated.
2796 * newval - the new value for the leaf. 2789 * newval - the new value for the leaf.
2797 * 2790 *
2798 * RETURN VALUES: none 2791 * RETURN VALUES: none
2799 */ 2792 */
@@ -2852,7 +2845,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2852/* 2845/*
2853 * NAME: dbFindLeaf() 2846 * NAME: dbFindLeaf()
2854 * 2847 *
2855 * FUNCTION: search a dmtree_t for sufficient free blocks, returning 2848 * FUNCTION: search a dmtree_t for sufficient free blocks, returning
2856 * the index of a leaf describing the free blocks if 2849 * the index of a leaf describing the free blocks if
2857 * sufficient free blocks are found. 2850 * sufficient free blocks are found.
2858 * 2851 *
@@ -2861,15 +2854,15 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2861 * free space. 2854 * free space.
2862 * 2855 *
2863 * PARAMETERS: 2856 * PARAMETERS:
2864 * tp - pointer to the tree to be searched. 2857 * tp - pointer to the tree to be searched.
2865 * l2nb - log2 number of free blocks to search for. 2858 * l2nb - log2 number of free blocks to search for.
2866 * leafidx - return pointer to be set to the index of the leaf 2859 * leafidx - return pointer to be set to the index of the leaf
2867 * describing at least l2nb free blocks if sufficient 2860 * describing at least l2nb free blocks if sufficient
2868 * free blocks are found. 2861 * free blocks are found.
2869 * 2862 *
2870 * RETURN VALUES: 2863 * RETURN VALUES:
2871 * 0 - success 2864 * 0 - success
2872 * -ENOSPC - insufficient free blocks. 2865 * -ENOSPC - insufficient free blocks.
2873 */ 2866 */
2874static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) 2867static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2875{ 2868{
@@ -2916,18 +2909,18 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2916/* 2909/*
2917 * NAME: dbFindBits() 2910 * NAME: dbFindBits()
2918 * 2911 *
2919 * FUNCTION: find a specified number of binary buddy free bits within a 2912 * FUNCTION: find a specified number of binary buddy free bits within a
2920 * dmap bitmap word value. 2913 * dmap bitmap word value.
2921 * 2914 *
2922 * this routine searches the bitmap value for (1 << l2nb) free 2915 * this routine searches the bitmap value for (1 << l2nb) free
2923 * bits at (1 << l2nb) alignments within the value. 2916 * bits at (1 << l2nb) alignments within the value.
2924 * 2917 *
2925 * PARAMETERS: 2918 * PARAMETERS:
2926 * word - dmap bitmap word value. 2919 * word - dmap bitmap word value.
2927 * l2nb - number of free bits specified as a log2 number. 2920 * l2nb - number of free bits specified as a log2 number.
2928 * 2921 *
2929 * RETURN VALUES: 2922 * RETURN VALUES:
2930 * starting bit number of free bits. 2923 * starting bit number of free bits.
2931 */ 2924 */
2932static int dbFindBits(u32 word, int l2nb) 2925static int dbFindBits(u32 word, int l2nb)
2933{ 2926{
@@ -2963,14 +2956,14 @@ static int dbFindBits(u32 word, int l2nb)
2963/* 2956/*
2964 * NAME: dbMaxBud(u8 *cp) 2957 * NAME: dbMaxBud(u8 *cp)
2965 * 2958 *
2966 * FUNCTION: determine the largest binary buddy string of free 2959 * FUNCTION: determine the largest binary buddy string of free
2967 * bits within 32-bits of the map. 2960 * bits within 32-bits of the map.
2968 * 2961 *
2969 * PARAMETERS: 2962 * PARAMETERS:
2970 * cp - pointer to the 32-bit value. 2963 * cp - pointer to the 32-bit value.
2971 * 2964 *
2972 * RETURN VALUES: 2965 * RETURN VALUES:
2973 * largest binary buddy of free bits within a dmap word. 2966 * largest binary buddy of free bits within a dmap word.
2974 */ 2967 */
2975static int dbMaxBud(u8 * cp) 2968static int dbMaxBud(u8 * cp)
2976{ 2969{
@@ -3000,14 +2993,14 @@ static int dbMaxBud(u8 * cp)
3000/* 2993/*
3001 * NAME: cnttz(uint word) 2994 * NAME: cnttz(uint word)
3002 * 2995 *
3003 * FUNCTION: determine the number of trailing zeros within a 32-bit 2996 * FUNCTION: determine the number of trailing zeros within a 32-bit
3004 * value. 2997 * value.
3005 * 2998 *
3006 * PARAMETERS: 2999 * PARAMETERS:
3007 * value - 32-bit value to be examined. 3000 * value - 32-bit value to be examined.
3008 * 3001 *
3009 * RETURN VALUES: 3002 * RETURN VALUES:
3010 * count of trailing zeros 3003 * count of trailing zeros
3011 */ 3004 */
3012static int cnttz(u32 word) 3005static int cnttz(u32 word)
3013{ 3006{
@@ -3025,14 +3018,14 @@ static int cnttz(u32 word)
3025/* 3018/*
3026 * NAME: cntlz(u32 value) 3019 * NAME: cntlz(u32 value)
3027 * 3020 *
3028 * FUNCTION: determine the number of leading zeros within a 32-bit 3021 * FUNCTION: determine the number of leading zeros within a 32-bit
3029 * value. 3022 * value.
3030 * 3023 *
3031 * PARAMETERS: 3024 * PARAMETERS:
3032 * value - 32-bit value to be examined. 3025 * value - 32-bit value to be examined.
3033 * 3026 *
3034 * RETURN VALUES: 3027 * RETURN VALUES:
3035 * count of leading zeros 3028 * count of leading zeros
3036 */ 3029 */
3037static int cntlz(u32 value) 3030static int cntlz(u32 value)
3038{ 3031{
@@ -3050,14 +3043,14 @@ static int cntlz(u32 value)
3050 * NAME: blkstol2(s64 nb) 3043 * NAME: blkstol2(s64 nb)
3051 * 3044 *
3052 * FUNCTION: convert a block count to its log2 value. if the block 3045 * FUNCTION: convert a block count to its log2 value. if the block
3053 * count is not a l2 multiple, it is rounded up to the next 3046 * count is not a l2 multiple, it is rounded up to the next
3054 * larger l2 multiple. 3047 * larger l2 multiple.
3055 * 3048 *
3056 * PARAMETERS: 3049 * PARAMETERS:
3057 * nb - number of blocks 3050 * nb - number of blocks
3058 * 3051 *
3059 * RETURN VALUES: 3052 * RETURN VALUES:
3060 * log2 number of blocks 3053 * log2 number of blocks
3061 */ 3054 */
3062static int blkstol2(s64 nb) 3055static int blkstol2(s64 nb)
3063{ 3056{
@@ -3099,13 +3092,13 @@ static int blkstol2(s64 nb)
3099 * at a time. 3092 * at a time.
3100 * 3093 *
3101 * PARAMETERS: 3094 * PARAMETERS:
3102 * ip - pointer to in-core inode; 3095 * ip - pointer to in-core inode;
3103 * blkno - starting block number to be freed. 3096 * blkno - starting block number to be freed.
3104 * nblocks - number of blocks to be freed. 3097 * nblocks - number of blocks to be freed.
3105 * 3098 *
3106 * RETURN VALUES: 3099 * RETURN VALUES:
3107 * 0 - success 3100 * 0 - success
3108 * -EIO - i/o error 3101 * -EIO - i/o error
3109 */ 3102 */
3110int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) 3103int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
3111{ 3104{
@@ -3278,10 +3271,10 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3278 * L2 3271 * L2
3279 * | 3272 * |
3280 * L1---------------------------------L1 3273 * L1---------------------------------L1
3281 * | | 3274 * | |
3282 * L0---------L0---------L0 L0---------L0---------L0 3275 * L0---------L0---------L0 L0---------L0---------L0
3283 * | | | | | | 3276 * | | | | | |
3284 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; 3277 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm;
3285 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm 3278 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
3286 * 3279 *
3287 * <---old---><----------------------------extend-----------------------> 3280 * <---old---><----------------------------extend----------------------->
@@ -3307,7 +3300,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3307 (long long) blkno, (long long) nblocks, (long long) newsize); 3300 (long long) blkno, (long long) nblocks, (long long) newsize);
3308 3301
3309 /* 3302 /*
3310 * initialize bmap control page. 3303 * initialize bmap control page.
3311 * 3304 *
3312 * all the data in bmap control page should exclude 3305 * all the data in bmap control page should exclude
3313 * the mkfs hidden dmap page. 3306 * the mkfs hidden dmap page.
@@ -3330,7 +3323,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3330 bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; 3323 bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
3331 3324
3332 /* 3325 /*
3333 * reconfigure db_agfree[] 3326 * reconfigure db_agfree[]
3334 * from old AG configuration to new AG configuration; 3327 * from old AG configuration to new AG configuration;
3335 * 3328 *
3336 * coalesce contiguous k (newAGSize/oldAGSize) AGs; 3329 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
@@ -3362,7 +3355,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3362 bmp->db_maxag = bmp->db_maxag / k; 3355 bmp->db_maxag = bmp->db_maxag / k;
3363 3356
3364 /* 3357 /*
3365 * extend bmap 3358 * extend bmap
3366 * 3359 *
3367 * update bit maps and corresponding level control pages; 3360 * update bit maps and corresponding level control pages;
3368 * global control page db_nfree, db_agfree[agno], db_maxfreebud; 3361 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
@@ -3410,7 +3403,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3410 /* compute start L0 */ 3403 /* compute start L0 */
3411 j = 0; 3404 j = 0;
3412 l1leaf = l1dcp->stree + CTLLEAFIND; 3405 l1leaf = l1dcp->stree + CTLLEAFIND;
3413 p += nbperpage; /* 1st L0 of L1.k */ 3406 p += nbperpage; /* 1st L0 of L1.k */
3414 } 3407 }
3415 3408
3416 /* 3409 /*
@@ -3548,7 +3541,7 @@ errout:
3548 return -EIO; 3541 return -EIO;
3549 3542
3550 /* 3543 /*
3551 * finalize bmap control page 3544 * finalize bmap control page
3552 */ 3545 */
3553finalize: 3546finalize:
3554 3547
@@ -3567,7 +3560,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3567 int i, n; 3560 int i, n;
3568 3561
3569 /* 3562 /*
3570 * finalize bmap control page 3563 * finalize bmap control page
3571 */ 3564 */
3572//finalize: 3565//finalize:
3573 /* 3566 /*
@@ -3953,8 +3946,8 @@ static int dbGetL2AGSize(s64 nblocks)
3953 * convert number of map pages to the zero origin top dmapctl level 3946 * convert number of map pages to the zero origin top dmapctl level
3954 */ 3947 */
3955#define BMAPPGTOLEV(npages) \ 3948#define BMAPPGTOLEV(npages) \
3956 (((npages) <= 3 + MAXL0PAGES) ? 0 \ 3949 (((npages) <= 3 + MAXL0PAGES) ? 0 : \
3957 : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) 3950 ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
3958 3951
3959s64 dbMapFileSizeToMapSize(struct inode * ipbmap) 3952s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
3960{ 3953{
@@ -3981,8 +3974,8 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
3981 factor = 3974 factor =
3982 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); 3975 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
3983 complete = (u32) npages / factor; 3976 complete = (u32) npages / factor;
3984 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL 3977 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL :
3985 : ((i == 1) ? LPERCTL : 1)); 3978 ((i == 1) ? LPERCTL : 1));
3986 3979
3987 /* pages in last/incomplete child */ 3980 /* pages in last/incomplete child */
3988 npages = (u32) npages % factor; 3981 npages = (u32) npages % factor;
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 45ea454c74..11e6d471b3 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -83,7 +83,7 @@ static __inline signed char TREEMAX(signed char *cp)
83 * - 1 is added to account for the control page of the map. 83 * - 1 is added to account for the control page of the map.
84 */ 84 */
85#define BLKTODMAP(b,s) \ 85#define BLKTODMAP(b,s) \
86 ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) 86 ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))
87 87
88/* 88/*
89 * convert disk block number to the logical block number of the LEVEL 0 89 * convert disk block number to the logical block number of the LEVEL 0
@@ -98,7 +98,7 @@ static __inline signed char TREEMAX(signed char *cp)
98 * - 1 is added to account for the control page of the map. 98 * - 1 is added to account for the control page of the map.
99 */ 99 */
100#define BLKTOL0(b,s) \ 100#define BLKTOL0(b,s) \
101 (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) 101 (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s))
102 102
103/* 103/*
104 * convert disk block number to the logical block number of the LEVEL 1 104 * convert disk block number to the logical block number of the LEVEL 1
@@ -120,7 +120,7 @@ static __inline signed char TREEMAX(signed char *cp)
120 * at the specified level which describes the disk block. 120 * at the specified level which describes the disk block.
121 */ 121 */
122#define BLKTOCTL(b,s,l) \ 122#define BLKTOCTL(b,s,l) \
123 (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) 123 (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s)))
124 124
125/* 125/*
126 * convert aggregate map size to the zero origin dmapctl level of the 126 * convert aggregate map size to the zero origin dmapctl level of the
@@ -145,27 +145,27 @@ static __inline signed char TREEMAX(signed char *cp)
145 * dmaptree must be consistent with dmapctl. 145 * dmaptree must be consistent with dmapctl.
146 */ 146 */
147struct dmaptree { 147struct dmaptree {
148 __le32 nleafs; /* 4: number of tree leafs */ 148 __le32 nleafs; /* 4: number of tree leafs */
149 __le32 l2nleafs; /* 4: l2 number of tree leafs */ 149 __le32 l2nleafs; /* 4: l2 number of tree leafs */
150 __le32 leafidx; /* 4: index of first tree leaf */ 150 __le32 leafidx; /* 4: index of first tree leaf */
151 __le32 height; /* 4: height of the tree */ 151 __le32 height; /* 4: height of the tree */
152 s8 budmin; /* 1: min l2 tree leaf value to combine */ 152 s8 budmin; /* 1: min l2 tree leaf value to combine */
153 s8 stree[TREESIZE]; /* TREESIZE: tree */ 153 s8 stree[TREESIZE]; /* TREESIZE: tree */
154 u8 pad[2]; /* 2: pad to word boundary */ 154 u8 pad[2]; /* 2: pad to word boundary */
155}; /* - 360 - */ 155}; /* - 360 - */
156 156
157/* 157/*
158 * dmap page per 8K blocks bitmap 158 * dmap page per 8K blocks bitmap
159 */ 159 */
160struct dmap { 160struct dmap {
161 __le32 nblocks; /* 4: num blks covered by this dmap */ 161 __le32 nblocks; /* 4: num blks covered by this dmap */
162 __le32 nfree; /* 4: num of free blks in this dmap */ 162 __le32 nfree; /* 4: num of free blks in this dmap */
163 __le64 start; /* 8: starting blkno for this dmap */ 163 __le64 start; /* 8: starting blkno for this dmap */
164 struct dmaptree tree; /* 360: dmap tree */ 164 struct dmaptree tree; /* 360: dmap tree */
165 u8 pad[1672]; /* 1672: pad to 2048 bytes */ 165 u8 pad[1672]; /* 1672: pad to 2048 bytes */
166 __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ 166 __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */
167 __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ 167 __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */
168}; /* - 4096 - */ 168}; /* - 4096 - */
169 169
170/* 170/*
171 * disk map control page per level. 171 * disk map control page per level.
@@ -173,14 +173,14 @@ struct dmap {
173 * dmapctl must be consistent with dmaptree. 173 * dmapctl must be consistent with dmaptree.
174 */ 174 */
175struct dmapctl { 175struct dmapctl {
176 __le32 nleafs; /* 4: number of tree leafs */ 176 __le32 nleafs; /* 4: number of tree leafs */
177 __le32 l2nleafs; /* 4: l2 number of tree leafs */ 177 __le32 l2nleafs; /* 4: l2 number of tree leafs */
178 __le32 leafidx; /* 4: index of the first tree leaf */ 178 __le32 leafidx; /* 4: index of the first tree leaf */
179 __le32 height; /* 4: height of tree */ 179 __le32 height; /* 4: height of tree */
180 s8 budmin; /* 1: minimum l2 tree leaf value */ 180 s8 budmin; /* 1: minimum l2 tree leaf value */
181 s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ 181 s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */
182 u8 pad[2714]; /* 2714: pad to 4096 */ 182 u8 pad[2714]; /* 2714: pad to 4096 */
183}; /* - 4096 - */ 183}; /* - 4096 - */
184 184
185/* 185/*
186 * common definition for dmaptree within dmap and dmapctl 186 * common definition for dmaptree within dmap and dmapctl
@@ -202,41 +202,41 @@ typedef union dmtree {
202 * on-disk aggregate disk allocation map descriptor. 202 * on-disk aggregate disk allocation map descriptor.
203 */ 203 */
204struct dbmap_disk { 204struct dbmap_disk {
205 __le64 dn_mapsize; /* 8: number of blocks in aggregate */ 205 __le64 dn_mapsize; /* 8: number of blocks in aggregate */
206 __le64 dn_nfree; /* 8: num free blks in aggregate map */ 206 __le64 dn_nfree; /* 8: num free blks in aggregate map */
207 __le32 dn_l2nbperpage; /* 4: number of blks per page */ 207 __le32 dn_l2nbperpage; /* 4: number of blks per page */
208 __le32 dn_numag; /* 4: total number of ags */ 208 __le32 dn_numag; /* 4: total number of ags */
209 __le32 dn_maxlevel; /* 4: number of active ags */ 209 __le32 dn_maxlevel; /* 4: number of active ags */
210 __le32 dn_maxag; /* 4: max active alloc group number */ 210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */ 211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ 212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ 213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ 214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */ 215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ 216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
217 __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ 217 __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */
218 __le64 dn_agsize; /* 8: num of blks per alloc group */ 218 __le64 dn_agsize; /* 8: num of blks per alloc group */
219 s8 dn_maxfreebud; /* 1: max free buddy system */ 219 s8 dn_maxfreebud; /* 1: max free buddy system */
220 u8 pad[3007]; /* 3007: pad to 4096 */ 220 u8 pad[3007]; /* 3007: pad to 4096 */
221}; /* - 4096 - */ 221}; /* - 4096 - */
222 222
223struct dbmap { 223struct dbmap {
224 s64 dn_mapsize; /* number of blocks in aggregate */ 224 s64 dn_mapsize; /* number of blocks in aggregate */
225 s64 dn_nfree; /* num free blks in aggregate map */ 225 s64 dn_nfree; /* num free blks in aggregate map */
226 int dn_l2nbperpage; /* number of blks per page */ 226 int dn_l2nbperpage; /* number of blks per page */
227 int dn_numag; /* total number of ags */ 227 int dn_numag; /* total number of ags */
228 int dn_maxlevel; /* number of active ags */ 228 int dn_maxlevel; /* number of active ags */
229 int dn_maxag; /* max active alloc group number */ 229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */ 230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */ 231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */ 232 int dn_agheigth; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */ 233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */ 234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */ 235 int dn_agl2size; /* l2 num of blks per alloc group */
236 s64 dn_agfree[MAXAG]; /* per AG free count */ 236 s64 dn_agfree[MAXAG]; /* per AG free count */
237 s64 dn_agsize; /* num of blks per alloc group */ 237 s64 dn_agsize; /* num of blks per alloc group */
238 signed char dn_maxfreebud; /* max free buddy system */ 238 signed char dn_maxfreebud; /* max free buddy system */
239}; /* - 4096 - */ 239}; /* - 4096 - */
240/* 240/*
241 * in-memory aggregate disk allocation map descriptor. 241 * in-memory aggregate disk allocation map descriptor.
242 */ 242 */
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 6d62f32228..c14ba3cfa8 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -315,8 +315,8 @@ static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
315 lv = &llck->lv[llck->index]; 315 lv = &llck->lv[llck->index];
316 316
317 /* 317 /*
318 * Linelock slot size is twice the size of directory table 318 * Linelock slot size is twice the size of directory table
319 * slot size. 512 entries per page. 319 * slot size. 512 entries per page.
320 */ 320 */
321 lv->offset = ((index - 2) & 511) >> 1; 321 lv->offset = ((index - 2) & 511) >> 1;
322 lv->length = 1; 322 lv->length = 1;
@@ -615,7 +615,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
615 btstack->nsplit = 1; 615 btstack->nsplit = 1;
616 616
617 /* 617 /*
618 * search down tree from root: 618 * search down tree from root:
619 * 619 *
620 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 620 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
621 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 621 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
@@ -659,7 +659,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
659 } 659 }
660 if (cmp == 0) { 660 if (cmp == 0) {
661 /* 661 /*
662 * search hit 662 * search hit
663 */ 663 */
664 /* search hit - leaf page: 664 /* search hit - leaf page:
665 * return the entry found 665 * return the entry found
@@ -723,7 +723,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
723 } 723 }
724 724
725 /* 725 /*
726 * search miss 726 * search miss
727 * 727 *
728 * base is the smallest index with key (Kj) greater than 728 * base is the smallest index with key (Kj) greater than
729 * search key (K) and may be zero or (maxindex + 1) index. 729 * search key (K) and may be zero or (maxindex + 1) index.
@@ -834,7 +834,7 @@ int dtInsert(tid_t tid, struct inode *ip,
834 struct lv *lv; 834 struct lv *lv;
835 835
836 /* 836 /*
837 * retrieve search result 837 * retrieve search result
838 * 838 *
839 * dtSearch() returns (leaf page pinned, index at which to insert). 839 * dtSearch() returns (leaf page pinned, index at which to insert).
840 * n.b. dtSearch() may return index of (maxindex + 1) of 840 * n.b. dtSearch() may return index of (maxindex + 1) of
@@ -843,7 +843,7 @@ int dtInsert(tid_t tid, struct inode *ip,
843 DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); 843 DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
844 844
845 /* 845 /*
846 * insert entry for new key 846 * insert entry for new key
847 */ 847 */
848 if (DO_INDEX(ip)) { 848 if (DO_INDEX(ip)) {
849 if (JFS_IP(ip)->next_index == DIREND) { 849 if (JFS_IP(ip)->next_index == DIREND) {
@@ -860,9 +860,9 @@ int dtInsert(tid_t tid, struct inode *ip,
860 data.leaf.ino = *fsn; 860 data.leaf.ino = *fsn;
861 861
862 /* 862 /*
863 * leaf page does not have enough room for new entry: 863 * leaf page does not have enough room for new entry:
864 * 864 *
865 * extend/split the leaf page; 865 * extend/split the leaf page;
866 * 866 *
867 * dtSplitUp() will insert the entry and unpin the leaf page. 867 * dtSplitUp() will insert the entry and unpin the leaf page.
868 */ 868 */
@@ -877,9 +877,9 @@ int dtInsert(tid_t tid, struct inode *ip,
877 } 877 }
878 878
879 /* 879 /*
880 * leaf page does have enough room for new entry: 880 * leaf page does have enough room for new entry:
881 * 881 *
882 * insert the new data entry into the leaf page; 882 * insert the new data entry into the leaf page;
883 */ 883 */
884 BT_MARK_DIRTY(mp, ip); 884 BT_MARK_DIRTY(mp, ip);
885 /* 885 /*
@@ -967,13 +967,13 @@ static int dtSplitUp(tid_t tid,
967 } 967 }
968 968
969 /* 969 /*
970 * split leaf page 970 * split leaf page
971 * 971 *
972 * The split routines insert the new entry, and 972 * The split routines insert the new entry, and
973 * acquire txLock as appropriate. 973 * acquire txLock as appropriate.
974 */ 974 */
975 /* 975 /*
976 * split root leaf page: 976 * split root leaf page:
977 */ 977 */
978 if (sp->header.flag & BT_ROOT) { 978 if (sp->header.flag & BT_ROOT) {
979 /* 979 /*
@@ -1012,7 +1012,7 @@ static int dtSplitUp(tid_t tid,
1012 } 1012 }
1013 1013
1014 /* 1014 /*
1015 * extend first leaf page 1015 * extend first leaf page
1016 * 1016 *
1017 * extend the 1st extent if less than buffer page size 1017 * extend the 1st extent if less than buffer page size
1018 * (dtExtendPage() reurns leaf page unpinned) 1018 * (dtExtendPage() reurns leaf page unpinned)
@@ -1068,7 +1068,7 @@ static int dtSplitUp(tid_t tid,
1068 } 1068 }
1069 1069
1070 /* 1070 /*
1071 * split leaf page <sp> into <sp> and a new right page <rp>. 1071 * split leaf page <sp> into <sp> and a new right page <rp>.
1072 * 1072 *
1073 * return <rp> pinned and its extent descriptor <rpxd> 1073 * return <rp> pinned and its extent descriptor <rpxd>
1074 */ 1074 */
@@ -1433,7 +1433,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1433 rp->header.freecnt = rp->header.maxslot - fsi; 1433 rp->header.freecnt = rp->header.maxslot - fsi;
1434 1434
1435 /* 1435 /*
1436 * sequential append at tail: append without split 1436 * sequential append at tail: append without split
1437 * 1437 *
1438 * If splitting the last page on a level because of appending 1438 * If splitting the last page on a level because of appending
1439 * a entry to it (skip is maxentry), it's likely that the access is 1439 * a entry to it (skip is maxentry), it's likely that the access is
@@ -1467,7 +1467,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1467 } 1467 }
1468 1468
1469 /* 1469 /*
1470 * non-sequential insert (at possibly middle page) 1470 * non-sequential insert (at possibly middle page)
1471 */ 1471 */
1472 1472
1473 /* 1473 /*
@@ -1508,7 +1508,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1508 left = 0; 1508 left = 0;
1509 1509
1510 /* 1510 /*
1511 * compute fill factor for split pages 1511 * compute fill factor for split pages
1512 * 1512 *
1513 * <nxt> traces the next entry to move to rp 1513 * <nxt> traces the next entry to move to rp
1514 * <off> traces the next entry to stay in sp 1514 * <off> traces the next entry to stay in sp
@@ -1551,7 +1551,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1551 /* <nxt> poins to the 1st entry to move */ 1551 /* <nxt> poins to the 1st entry to move */
1552 1552
1553 /* 1553 /*
1554 * move entries to right page 1554 * move entries to right page
1555 * 1555 *
1556 * dtMoveEntry() initializes rp and reserves entry for insertion 1556 * dtMoveEntry() initializes rp and reserves entry for insertion
1557 * 1557 *
@@ -1677,7 +1677,7 @@ static int dtExtendPage(tid_t tid,
1677 return (rc); 1677 return (rc);
1678 1678
1679 /* 1679 /*
1680 * extend the extent 1680 * extend the extent
1681 */ 1681 */
1682 pxdlist = split->pxdlist; 1682 pxdlist = split->pxdlist;
1683 pxd = &pxdlist->pxd[pxdlist->npxd]; 1683 pxd = &pxdlist->pxd[pxdlist->npxd];
@@ -1722,7 +1722,7 @@ static int dtExtendPage(tid_t tid,
1722 } 1722 }
1723 1723
1724 /* 1724 /*
1725 * extend the page 1725 * extend the page
1726 */ 1726 */
1727 sp->header.self = *pxd; 1727 sp->header.self = *pxd;
1728 1728
@@ -1739,9 +1739,6 @@ static int dtExtendPage(tid_t tid,
1739 /* update buffer extent descriptor of extended page */ 1739 /* update buffer extent descriptor of extended page */
1740 xlen = lengthPXD(pxd); 1740 xlen = lengthPXD(pxd);
1741 xsize = xlen << JFS_SBI(sb)->l2bsize; 1741 xsize = xlen << JFS_SBI(sb)->l2bsize;
1742#ifdef _STILL_TO_PORT
1743 bmSetXD(smp, xaddr, xsize);
1744#endif /* _STILL_TO_PORT */
1745 1742
1746 /* 1743 /*
1747 * copy old stbl to new stbl at start of extended area 1744 * copy old stbl to new stbl at start of extended area
@@ -1836,7 +1833,7 @@ static int dtExtendPage(tid_t tid,
1836 } 1833 }
1837 1834
1838 /* 1835 /*
1839 * update parent entry on the parent/root page 1836 * update parent entry on the parent/root page
1840 */ 1837 */
1841 /* 1838 /*
1842 * acquire a transaction lock on the parent/root page 1839 * acquire a transaction lock on the parent/root page
@@ -1904,7 +1901,7 @@ static int dtSplitRoot(tid_t tid,
1904 sp = &JFS_IP(ip)->i_dtroot; 1901 sp = &JFS_IP(ip)->i_dtroot;
1905 1902
1906 /* 1903 /*
1907 * allocate/initialize a single (right) child page 1904 * allocate/initialize a single (right) child page
1908 * 1905 *
1909 * N.B. at first split, a one (or two) block to fit new entry 1906 * N.B. at first split, a one (or two) block to fit new entry
1910 * is allocated; at subsequent split, a full page is allocated; 1907 * is allocated; at subsequent split, a full page is allocated;
@@ -1943,7 +1940,7 @@ static int dtSplitRoot(tid_t tid,
1943 rp->header.prev = 0; 1940 rp->header.prev = 0;
1944 1941
1945 /* 1942 /*
1946 * move in-line root page into new right page extent 1943 * move in-line root page into new right page extent
1947 */ 1944 */
1948 /* linelock header + copied entries + new stbl (1st slot) in new page */ 1945 /* linelock header + copied entries + new stbl (1st slot) in new page */
1949 ASSERT(dtlck->index == 0); 1946 ASSERT(dtlck->index == 0);
@@ -2016,7 +2013,7 @@ static int dtSplitRoot(tid_t tid,
2016 dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); 2013 dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
2017 2014
2018 /* 2015 /*
2019 * reset parent/root page 2016 * reset parent/root page
2020 * 2017 *
2021 * set the 1st entry offset to 0, which force the left-most key 2018 * set the 1st entry offset to 0, which force the left-most key
2022 * at any level of the tree to be less than any search key. 2019 * at any level of the tree to be less than any search key.
@@ -2102,7 +2099,7 @@ int dtDelete(tid_t tid,
2102 dtpage_t *np; 2099 dtpage_t *np;
2103 2100
2104 /* 2101 /*
2105 * search for the entry to delete: 2102 * search for the entry to delete:
2106 * 2103 *
2107 * dtSearch() returns (leaf page pinned, index at which to delete). 2104 * dtSearch() returns (leaf page pinned, index at which to delete).
2108 */ 2105 */
@@ -2253,7 +2250,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2253 int i; 2250 int i;
2254 2251
2255 /* 2252 /*
2256 * keep the root leaf page which has become empty 2253 * keep the root leaf page which has become empty
2257 */ 2254 */
2258 if (BT_IS_ROOT(fmp)) { 2255 if (BT_IS_ROOT(fmp)) {
2259 /* 2256 /*
@@ -2269,7 +2266,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2269 } 2266 }
2270 2267
2271 /* 2268 /*
2272 * free the non-root leaf page 2269 * free the non-root leaf page
2273 */ 2270 */
2274 /* 2271 /*
2275 * acquire a transaction lock on the page 2272 * acquire a transaction lock on the page
@@ -2299,7 +2296,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2299 discard_metapage(fmp); 2296 discard_metapage(fmp);
2300 2297
2301 /* 2298 /*
2302 * propagate page deletion up the directory tree 2299 * propagate page deletion up the directory tree
2303 * 2300 *
2304 * If the delete from the parent page makes it empty, 2301 * If the delete from the parent page makes it empty,
2305 * continue all the way up the tree. 2302 * continue all the way up the tree.
@@ -2440,10 +2437,10 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2440 2437
2441#ifdef _NOTYET 2438#ifdef _NOTYET
2442/* 2439/*
2443 * NAME: dtRelocate() 2440 * NAME: dtRelocate()
2444 * 2441 *
2445 * FUNCTION: relocate dtpage (internal or leaf) of directory; 2442 * FUNCTION: relocate dtpage (internal or leaf) of directory;
2446 * This function is mainly used by defragfs utility. 2443 * This function is mainly used by defragfs utility.
2447 */ 2444 */
2448int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, 2445int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2449 s64 nxaddr) 2446 s64 nxaddr)
@@ -2471,8 +2468,8 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2471 xlen); 2468 xlen);
2472 2469
2473 /* 2470 /*
2474 * 1. get the internal parent dtpage covering 2471 * 1. get the internal parent dtpage covering
2475 * router entry for the tartget page to be relocated; 2472 * router entry for the tartget page to be relocated;
2476 */ 2473 */
2477 rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); 2474 rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
2478 if (rc) 2475 if (rc)
@@ -2483,7 +2480,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2483 jfs_info("dtRelocate: parent router entry validated."); 2480 jfs_info("dtRelocate: parent router entry validated.");
2484 2481
2485 /* 2482 /*
2486 * 2. relocate the target dtpage 2483 * 2. relocate the target dtpage
2487 */ 2484 */
2488 /* read in the target page from src extent */ 2485 /* read in the target page from src extent */
2489 DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); 2486 DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
@@ -2581,9 +2578,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2581 2578
2582 /* update the buffer extent descriptor of the dtpage */ 2579 /* update the buffer extent descriptor of the dtpage */
2583 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; 2580 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2584#ifdef _STILL_TO_PORT 2581
2585 bmSetXD(mp, nxaddr, xsize);
2586#endif /* _STILL_TO_PORT */
2587 /* unpin the relocated page */ 2582 /* unpin the relocated page */
2588 DT_PUTPAGE(mp); 2583 DT_PUTPAGE(mp);
2589 jfs_info("dtRelocate: target dtpage relocated."); 2584 jfs_info("dtRelocate: target dtpage relocated.");
@@ -2594,7 +2589,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2594 */ 2589 */
2595 2590
2596 /* 2591 /*
2597 * 3. acquire maplock for the source extent to be freed; 2592 * 3. acquire maplock for the source extent to be freed;
2598 */ 2593 */
2599 /* for dtpage relocation, write a LOG_NOREDOPAGE record 2594 /* for dtpage relocation, write a LOG_NOREDOPAGE record
2600 * for the source dtpage (logredo() will init NoRedoPage 2595 * for the source dtpage (logredo() will init NoRedoPage
@@ -2609,7 +2604,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2609 pxdlock->index = 1; 2604 pxdlock->index = 1;
2610 2605
2611 /* 2606 /*
2612 * 4. update the parent router entry for relocation; 2607 * 4. update the parent router entry for relocation;
2613 * 2608 *
2614 * acquire tlck for the parent entry covering the target dtpage; 2609 * acquire tlck for the parent entry covering the target dtpage;
2615 * write LOG_REDOPAGE to apply after image only; 2610 * write LOG_REDOPAGE to apply after image only;
@@ -2637,7 +2632,7 @@ int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2637 * NAME: dtSearchNode() 2632 * NAME: dtSearchNode()
2638 * 2633 *
2639 * FUNCTION: Search for an dtpage containing a specified address 2634 * FUNCTION: Search for an dtpage containing a specified address
2640 * This function is mainly used by defragfs utility. 2635 * This function is mainly used by defragfs utility.
2641 * 2636 *
2642 * NOTE: Search result on stack, the found page is pinned at exit. 2637 * NOTE: Search result on stack, the found page is pinned at exit.
2643 * The result page must be an internal dtpage. 2638 * The result page must be an internal dtpage.
@@ -2660,7 +2655,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2660 BT_CLR(btstack); /* reset stack */ 2655 BT_CLR(btstack); /* reset stack */
2661 2656
2662 /* 2657 /*
2663 * descend tree to the level with specified leftmost page 2658 * descend tree to the level with specified leftmost page
2664 * 2659 *
2665 * by convention, root bn = 0. 2660 * by convention, root bn = 0.
2666 */ 2661 */
@@ -2699,7 +2694,7 @@ static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2699 } 2694 }
2700 2695
2701 /* 2696 /*
2702 * search each page at the current levevl 2697 * search each page at the current levevl
2703 */ 2698 */
2704 loop: 2699 loop:
2705 stbl = DT_GETSTBL(p); 2700 stbl = DT_GETSTBL(p);
@@ -3044,9 +3039,9 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3044 if (DO_INDEX(ip)) { 3039 if (DO_INDEX(ip)) {
3045 /* 3040 /*
3046 * persistent index is stored in directory entries. 3041 * persistent index is stored in directory entries.
3047 * Special cases: 0 = . 3042 * Special cases: 0 = .
3048 * 1 = .. 3043 * 1 = ..
3049 * -1 = End of directory 3044 * -1 = End of directory
3050 */ 3045 */
3051 do_index = 1; 3046 do_index = 1;
3052 3047
@@ -3128,10 +3123,10 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3128 /* 3123 /*
3129 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 3124 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
3130 * 3125 *
3131 * pn = index = 0: First entry "." 3126 * pn = index = 0: First entry "."
3132 * pn = 0; index = 1: Second entry ".." 3127 * pn = 0; index = 1: Second entry ".."
3133 * pn > 0: Real entries, pn=1 -> leftmost page 3128 * pn > 0: Real entries, pn=1 -> leftmost page
3134 * pn = index = -1: No more entries 3129 * pn = index = -1: No more entries
3135 */ 3130 */
3136 dtpos = filp->f_pos; 3131 dtpos = filp->f_pos;
3137 if (dtpos == 0) { 3132 if (dtpos == 0) {
@@ -3351,7 +3346,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3351 BT_CLR(btstack); /* reset stack */ 3346 BT_CLR(btstack); /* reset stack */
3352 3347
3353 /* 3348 /*
3354 * descend leftmost path of the tree 3349 * descend leftmost path of the tree
3355 * 3350 *
3356 * by convention, root bn = 0. 3351 * by convention, root bn = 0.
3357 */ 3352 */
@@ -4531,7 +4526,7 @@ int dtModify(tid_t tid, struct inode *ip,
4531 struct ldtentry *entry; 4526 struct ldtentry *entry;
4532 4527
4533 /* 4528 /*
4534 * search for the entry to modify: 4529 * search for the entry to modify:
4535 * 4530 *
4536 * dtSearch() returns (leaf page pinned, index at which to modify). 4531 * dtSearch() returns (leaf page pinned, index at which to modify).
4537 */ 4532 */
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index af8513f786..8561c6ecec 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -35,7 +35,7 @@ typedef union {
35 35
36 36
37/* 37/*
38 * entry segment/slot 38 * entry segment/slot
39 * 39 *
40 * an entry consists of type dependent head/only segment/slot and 40 * an entry consists of type dependent head/only segment/slot and
41 * additional segments/slots linked vi next field; 41 * additional segments/slots linked vi next field;
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index a35bdca6a8..7ae1e3281d 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -34,8 +34,8 @@ static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *);
34#endif 34#endif
35static s64 extRoundDown(s64 nb); 35static s64 extRoundDown(s64 nb);
36 36
37#define DPD(a) (printk("(a): %d\n",(a))) 37#define DPD(a) (printk("(a): %d\n",(a)))
38#define DPC(a) (printk("(a): %c\n",(a))) 38#define DPC(a) (printk("(a): %c\n",(a)))
39#define DPL1(a) \ 39#define DPL1(a) \
40{ \ 40{ \
41 if ((a) >> 32) \ 41 if ((a) >> 32) \
@@ -51,19 +51,19 @@ static s64 extRoundDown(s64 nb);
51 printk("(a): %x\n",(a) << 32); \ 51 printk("(a): %x\n",(a) << 32); \
52} 52}
53 53
54#define DPD1(a) (printk("(a): %d ",(a))) 54#define DPD1(a) (printk("(a): %d ",(a)))
55#define DPX(a) (printk("(a): %08x\n",(a))) 55#define DPX(a) (printk("(a): %08x\n",(a)))
56#define DPX1(a) (printk("(a): %08x ",(a))) 56#define DPX1(a) (printk("(a): %08x ",(a)))
57#define DPS(a) (printk("%s\n",(a))) 57#define DPS(a) (printk("%s\n",(a)))
58#define DPE(a) (printk("\nENTERING: %s\n",(a))) 58#define DPE(a) (printk("\nENTERING: %s\n",(a)))
59#define DPE1(a) (printk("\nENTERING: %s",(a))) 59#define DPE1(a) (printk("\nENTERING: %s",(a)))
60#define DPS1(a) (printk(" %s ",(a))) 60#define DPS1(a) (printk(" %s ",(a)))
61 61
62 62
63/* 63/*
64 * NAME: extAlloc() 64 * NAME: extAlloc()
65 * 65 *
66 * FUNCTION: allocate an extent for a specified page range within a 66 * FUNCTION: allocate an extent for a specified page range within a
67 * file. 67 * file.
68 * 68 *
69 * PARAMETERS: 69 * PARAMETERS:
@@ -78,9 +78,9 @@ static s64 extRoundDown(s64 nb);
78 * should be marked as allocated but not recorded. 78 * should be marked as allocated but not recorded.
79 * 79 *
80 * RETURN VALUES: 80 * RETURN VALUES:
81 * 0 - success 81 * 0 - success
82 * -EIO - i/o error. 82 * -EIO - i/o error.
83 * -ENOSPC - insufficient disk resources. 83 * -ENOSPC - insufficient disk resources.
84 */ 84 */
85int 85int
86extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) 86extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
@@ -192,9 +192,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
192 192
193#ifdef _NOTYET 193#ifdef _NOTYET
194/* 194/*
195 * NAME: extRealloc() 195 * NAME: extRealloc()
196 * 196 *
197 * FUNCTION: extend the allocation of a file extent containing a 197 * FUNCTION: extend the allocation of a file extent containing a
198 * partial back last page. 198 * partial back last page.
199 * 199 *
200 * PARAMETERS: 200 * PARAMETERS:
@@ -207,9 +207,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
207 * should be marked as allocated but not recorded. 207 * should be marked as allocated but not recorded.
208 * 208 *
209 * RETURN VALUES: 209 * RETURN VALUES:
210 * 0 - success 210 * 0 - success
211 * -EIO - i/o error. 211 * -EIO - i/o error.
212 * -ENOSPC - insufficient disk resources. 212 * -ENOSPC - insufficient disk resources.
213 */ 213 */
214int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) 214int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
215{ 215{
@@ -345,9 +345,9 @@ exit:
345 345
346 346
347/* 347/*
348 * NAME: extHint() 348 * NAME: extHint()
349 * 349 *
350 * FUNCTION: produce an extent allocation hint for a file offset. 350 * FUNCTION: produce an extent allocation hint for a file offset.
351 * 351 *
352 * PARAMETERS: 352 * PARAMETERS:
353 * ip - the inode of the file. 353 * ip - the inode of the file.
@@ -356,8 +356,8 @@ exit:
356 * the hint. 356 * the hint.
357 * 357 *
358 * RETURN VALUES: 358 * RETURN VALUES:
359 * 0 - success 359 * 0 - success
360 * -EIO - i/o error. 360 * -EIO - i/o error.
361 */ 361 */
362int extHint(struct inode *ip, s64 offset, xad_t * xp) 362int extHint(struct inode *ip, s64 offset, xad_t * xp)
363{ 363{
@@ -387,7 +387,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
387 lxdl.nlxd = 1; 387 lxdl.nlxd = 1;
388 lxdl.lxd = &lxd; 388 lxdl.lxd = &lxd;
389 LXDoffset(&lxd, prev) 389 LXDoffset(&lxd, prev)
390 LXDlength(&lxd, nbperpage); 390 LXDlength(&lxd, nbperpage);
391 391
392 xadl.maxnxad = 1; 392 xadl.maxnxad = 1;
393 xadl.nxad = 0; 393 xadl.nxad = 0;
@@ -397,11 +397,11 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
397 if ((rc = xtLookupList(ip, &lxdl, &xadl, 0))) 397 if ((rc = xtLookupList(ip, &lxdl, &xadl, 0)))
398 return (rc); 398 return (rc);
399 399
400 /* check if not extent exists for the previous page. 400 /* check if no extent exists for the previous page.
401 * this is possible for sparse files. 401 * this is possible for sparse files.
402 */ 402 */
403 if (xadl.nxad == 0) { 403 if (xadl.nxad == 0) {
404// assert(ISSPARSE(ip)); 404// assert(ISSPARSE(ip));
405 return (0); 405 return (0);
406 } 406 }
407 407
@@ -410,28 +410,28 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
410 */ 410 */
411 xp->flag &= XAD_NOTRECORDED; 411 xp->flag &= XAD_NOTRECORDED;
412 412
413 if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { 413 if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) {
414 jfs_error(ip->i_sb, "extHint: corrupt xtree"); 414 jfs_error(ip->i_sb, "extHint: corrupt xtree");
415 return -EIO; 415 return -EIO;
416 } 416 }
417 417
418 return (0); 418 return (0);
419} 419}
420 420
421 421
422/* 422/*
423 * NAME: extRecord() 423 * NAME: extRecord()
424 * 424 *
425 * FUNCTION: change a page with a file from not recorded to recorded. 425 * FUNCTION: change a page with a file from not recorded to recorded.
426 * 426 *
427 * PARAMETERS: 427 * PARAMETERS:
428 * ip - inode of the file. 428 * ip - inode of the file.
429 * cp - cbuf of the file page. 429 * cp - cbuf of the file page.
430 * 430 *
431 * RETURN VALUES: 431 * RETURN VALUES:
432 * 0 - success 432 * 0 - success
433 * -EIO - i/o error. 433 * -EIO - i/o error.
434 * -ENOSPC - insufficient disk resources. 434 * -ENOSPC - insufficient disk resources.
435 */ 435 */
436int extRecord(struct inode *ip, xad_t * xp) 436int extRecord(struct inode *ip, xad_t * xp)
437{ 437{
@@ -451,9 +451,9 @@ int extRecord(struct inode *ip, xad_t * xp)
451 451
452#ifdef _NOTYET 452#ifdef _NOTYET
453/* 453/*
454 * NAME: extFill() 454 * NAME: extFill()
455 * 455 *
456 * FUNCTION: allocate disk space for a file page that represents 456 * FUNCTION: allocate disk space for a file page that represents
457 * a file hole. 457 * a file hole.
458 * 458 *
459 * PARAMETERS: 459 * PARAMETERS:
@@ -461,16 +461,16 @@ int extRecord(struct inode *ip, xad_t * xp)
461 * cp - cbuf of the file page represent the hole. 461 * cp - cbuf of the file page represent the hole.
462 * 462 *
463 * RETURN VALUES: 463 * RETURN VALUES:
464 * 0 - success 464 * 0 - success
465 * -EIO - i/o error. 465 * -EIO - i/o error.
466 * -ENOSPC - insufficient disk resources. 466 * -ENOSPC - insufficient disk resources.
467 */ 467 */
468int extFill(struct inode *ip, xad_t * xp) 468int extFill(struct inode *ip, xad_t * xp)
469{ 469{
470 int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; 470 int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
471 s64 blkno = offsetXAD(xp) >> ip->i_blkbits; 471 s64 blkno = offsetXAD(xp) >> ip->i_blkbits;
472 472
473// assert(ISSPARSE(ip)); 473// assert(ISSPARSE(ip));
474 474
475 /* initialize the extent allocation hint */ 475 /* initialize the extent allocation hint */
476 XADaddress(xp, 0); 476 XADaddress(xp, 0);
@@ -489,7 +489,7 @@ int extFill(struct inode *ip, xad_t * xp)
489/* 489/*
490 * NAME: extBalloc() 490 * NAME: extBalloc()
491 * 491 *
492 * FUNCTION: allocate disk blocks to form an extent. 492 * FUNCTION: allocate disk blocks to form an extent.
493 * 493 *
494 * initially, we will try to allocate disk blocks for the 494 * initially, we will try to allocate disk blocks for the
495 * requested size (nblocks). if this fails (nblocks 495 * requested size (nblocks). if this fails (nblocks
@@ -513,9 +513,9 @@ int extFill(struct inode *ip, xad_t * xp)
513 * allocated block range. 513 * allocated block range.
514 * 514 *
515 * RETURN VALUES: 515 * RETURN VALUES:
516 * 0 - success 516 * 0 - success
517 * -EIO - i/o error. 517 * -EIO - i/o error.
518 * -ENOSPC - insufficient disk resources. 518 * -ENOSPC - insufficient disk resources.
519 */ 519 */
520static int 520static int
521extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) 521extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
@@ -580,7 +580,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
580/* 580/*
581 * NAME: extBrealloc() 581 * NAME: extBrealloc()
582 * 582 *
583 * FUNCTION: attempt to extend an extent's allocation. 583 * FUNCTION: attempt to extend an extent's allocation.
584 * 584 *
585 * Initially, we will try to extend the extent's allocation 585 * Initially, we will try to extend the extent's allocation
586 * in place. If this fails, we'll try to move the extent 586 * in place. If this fails, we'll try to move the extent
@@ -597,8 +597,8 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
597 * 597 *
598 * PARAMETERS: 598 * PARAMETERS:
599 * ip - the inode of the file. 599 * ip - the inode of the file.
600 * blkno - starting block number of the extents current allocation. 600 * blkno - starting block number of the extents current allocation.
601 * nblks - number of blocks within the extents current allocation. 601 * nblks - number of blocks within the extents current allocation.
602 * newnblks - pointer to a s64 value. on entry, this value is the 602 * newnblks - pointer to a s64 value. on entry, this value is the
603 * the new desired extent size (number of blocks). on 603 * the new desired extent size (number of blocks). on
604 * successful exit, this value is set to the extent's actual 604 * successful exit, this value is set to the extent's actual
@@ -606,9 +606,9 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
606 * newblkno - the starting block number of the extents new allocation. 606 * newblkno - the starting block number of the extents new allocation.
607 * 607 *
608 * RETURN VALUES: 608 * RETURN VALUES:
609 * 0 - success 609 * 0 - success
610 * -EIO - i/o error. 610 * -EIO - i/o error.
611 * -ENOSPC - insufficient disk resources. 611 * -ENOSPC - insufficient disk resources.
612 */ 612 */
613static int 613static int
614extBrealloc(struct inode *ip, 614extBrealloc(struct inode *ip,
@@ -634,16 +634,16 @@ extBrealloc(struct inode *ip,
634 634
635 635
636/* 636/*
637 * NAME: extRoundDown() 637 * NAME: extRoundDown()
638 * 638 *
639 * FUNCTION: round down a specified number of blocks to the next 639 * FUNCTION: round down a specified number of blocks to the next
640 * smallest power of 2 number. 640 * smallest power of 2 number.
641 * 641 *
642 * PARAMETERS: 642 * PARAMETERS:
643 * nb - the inode of the file. 643 * nb - the inode of the file.
644 * 644 *
645 * RETURN VALUES: 645 * RETURN VALUES:
646 * next smallest power of 2 number. 646 * next smallest power of 2 number.
647 */ 647 */
648static s64 extRoundDown(s64 nb) 648static s64 extRoundDown(s64 nb)
649{ 649{
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 38f70ac03b..b3f5463fbe 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -34,9 +34,9 @@
34#define JFS_UNICODE 0x00000001 /* unicode name */ 34#define JFS_UNICODE 0x00000001 /* unicode name */
35 35
36/* mount time flags for error handling */ 36/* mount time flags for error handling */
37#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ 37#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */
38#define JFS_ERR_CONTINUE 0x00000004 /* continue */ 38#define JFS_ERR_CONTINUE 0x00000004 /* continue */
39#define JFS_ERR_PANIC 0x00000008 /* panic */ 39#define JFS_ERR_PANIC 0x00000008 /* panic */
40 40
41/* Quota support */ 41/* Quota support */
42#define JFS_USRQUOTA 0x00000010 42#define JFS_USRQUOTA 0x00000010
@@ -83,7 +83,6 @@
83/* case-insensitive name/directory support */ 83/* case-insensitive name/directory support */
84 84
85#define JFS_AIX 0x80000000 /* AIX support */ 85#define JFS_AIX 0x80000000 /* AIX support */
86/* POSIX name/directory support - Never implemented*/
87 86
88/* 87/*
89 * buffer cache configuration 88 * buffer cache configuration
@@ -113,10 +112,10 @@
113#define IDATASIZE 256 /* inode inline data size */ 112#define IDATASIZE 256 /* inode inline data size */
114#define IXATTRSIZE 128 /* inode inline extended attribute size */ 113#define IXATTRSIZE 128 /* inode inline extended attribute size */
115 114
116#define XTPAGE_SIZE 4096 115#define XTPAGE_SIZE 4096
117#define log2_PAGESIZE 12 116#define log2_PAGESIZE 12
118 117
119#define IAG_SIZE 4096 118#define IAG_SIZE 4096
120#define IAG_EXTENT_SIZE 4096 119#define IAG_EXTENT_SIZE 4096
121#define INOSPERIAG 4096 /* number of disk inodes per iag */ 120#define INOSPERIAG 4096 /* number of disk inodes per iag */
122#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ 121#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index c6530227cd..3870ba8b90 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -93,21 +93,21 @@ static int copy_from_dinode(struct dinode *, struct inode *);
93static void copy_to_dinode(struct dinode *, struct inode *); 93static void copy_to_dinode(struct dinode *, struct inode *);
94 94
95/* 95/*
96 * NAME: diMount() 96 * NAME: diMount()
97 * 97 *
98 * FUNCTION: initialize the incore inode map control structures for 98 * FUNCTION: initialize the incore inode map control structures for
99 * a fileset or aggregate init time. 99 * a fileset or aggregate init time.
100 * 100 *
101 * the inode map's control structure (dinomap) is 101 * the inode map's control structure (dinomap) is
102 * brought in from disk and placed in virtual memory. 102 * brought in from disk and placed in virtual memory.
103 * 103 *
104 * PARAMETERS: 104 * PARAMETERS:
105 * ipimap - pointer to inode map inode for the aggregate or fileset. 105 * ipimap - pointer to inode map inode for the aggregate or fileset.
106 * 106 *
107 * RETURN VALUES: 107 * RETURN VALUES:
108 * 0 - success 108 * 0 - success
109 * -ENOMEM - insufficient free virtual memory. 109 * -ENOMEM - insufficient free virtual memory.
110 * -EIO - i/o error. 110 * -EIO - i/o error.
111 */ 111 */
112int diMount(struct inode *ipimap) 112int diMount(struct inode *ipimap)
113{ 113{
@@ -180,18 +180,18 @@ int diMount(struct inode *ipimap)
180 180
181 181
182/* 182/*
183 * NAME: diUnmount() 183 * NAME: diUnmount()
184 * 184 *
185 * FUNCTION: write to disk the incore inode map control structures for 185 * FUNCTION: write to disk the incore inode map control structures for
186 * a fileset or aggregate at unmount time. 186 * a fileset or aggregate at unmount time.
187 * 187 *
188 * PARAMETERS: 188 * PARAMETERS:
189 * ipimap - pointer to inode map inode for the aggregate or fileset. 189 * ipimap - pointer to inode map inode for the aggregate or fileset.
190 * 190 *
191 * RETURN VALUES: 191 * RETURN VALUES:
192 * 0 - success 192 * 0 - success
193 * -ENOMEM - insufficient free virtual memory. 193 * -ENOMEM - insufficient free virtual memory.
194 * -EIO - i/o error. 194 * -EIO - i/o error.
195 */ 195 */
196int diUnmount(struct inode *ipimap, int mounterror) 196int diUnmount(struct inode *ipimap, int mounterror)
197{ 197{
@@ -274,9 +274,9 @@ int diSync(struct inode *ipimap)
274 274
275 275
276/* 276/*
277 * NAME: diRead() 277 * NAME: diRead()
278 * 278 *
279 * FUNCTION: initialize an incore inode from disk. 279 * FUNCTION: initialize an incore inode from disk.
280 * 280 *
281 * on entry, the specifed incore inode should itself 281 * on entry, the specifed incore inode should itself
282 * specify the disk inode number corresponding to the 282 * specify the disk inode number corresponding to the
@@ -285,7 +285,7 @@ int diSync(struct inode *ipimap)
285 * this routine handles incore inode initialization for 285 * this routine handles incore inode initialization for
286 * both "special" and "regular" inodes. special inodes 286 * both "special" and "regular" inodes. special inodes
287 * are those required early in the mount process and 287 * are those required early in the mount process and
288 * require special handling since much of the file system 288 * require special handling since much of the file system
289 * is not yet initialized. these "special" inodes are 289 * is not yet initialized. these "special" inodes are
290 * identified by a NULL inode map inode pointer and are 290 * identified by a NULL inode map inode pointer and are
291 * actually initialized by a call to diReadSpecial(). 291 * actually initialized by a call to diReadSpecial().
@@ -298,12 +298,12 @@ int diSync(struct inode *ipimap)
298 * incore inode. 298 * incore inode.
299 * 299 *
300 * PARAMETERS: 300 * PARAMETERS:
301 * ip - pointer to incore inode to be initialized from disk. 301 * ip - pointer to incore inode to be initialized from disk.
302 * 302 *
303 * RETURN VALUES: 303 * RETURN VALUES:
304 * 0 - success 304 * 0 - success
305 * -EIO - i/o error. 305 * -EIO - i/o error.
306 * -ENOMEM - insufficient memory 306 * -ENOMEM - insufficient memory
307 * 307 *
308 */ 308 */
309int diRead(struct inode *ip) 309int diRead(struct inode *ip)
@@ -410,26 +410,26 @@ int diRead(struct inode *ip)
410 410
411 411
412/* 412/*
413 * NAME: diReadSpecial() 413 * NAME: diReadSpecial()
414 * 414 *
415 * FUNCTION: initialize a 'special' inode from disk. 415 * FUNCTION: initialize a 'special' inode from disk.
416 * 416 *
417 * this routines handles aggregate level inodes. The 417 * this routines handles aggregate level inodes. The
418 * inode cache cannot differentiate between the 418 * inode cache cannot differentiate between the
419 * aggregate inodes and the filesystem inodes, so we 419 * aggregate inodes and the filesystem inodes, so we
420 * handle these here. We don't actually use the aggregate 420 * handle these here. We don't actually use the aggregate
421 * inode map, since these inodes are at a fixed location 421 * inode map, since these inodes are at a fixed location
422 * and in some cases the aggregate inode map isn't initialized 422 * and in some cases the aggregate inode map isn't initialized
423 * yet. 423 * yet.
424 * 424 *
425 * PARAMETERS: 425 * PARAMETERS:
426 * sb - filesystem superblock 426 * sb - filesystem superblock
427 * inum - aggregate inode number 427 * inum - aggregate inode number
428 * secondary - 1 if secondary aggregate inode table 428 * secondary - 1 if secondary aggregate inode table
429 * 429 *
430 * RETURN VALUES: 430 * RETURN VALUES:
431 * new inode - success 431 * new inode - success
432 * NULL - i/o error. 432 * NULL - i/o error.
433 */ 433 */
434struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) 434struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
435{ 435{
@@ -502,12 +502,12 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
502} 502}
503 503
504/* 504/*
505 * NAME: diWriteSpecial() 505 * NAME: diWriteSpecial()
506 * 506 *
507 * FUNCTION: Write the special inode to disk 507 * FUNCTION: Write the special inode to disk
508 * 508 *
509 * PARAMETERS: 509 * PARAMETERS:
510 * ip - special inode 510 * ip - special inode
511 * secondary - 1 if secondary aggregate inode table 511 * secondary - 1 if secondary aggregate inode table
512 * 512 *
513 * RETURN VALUES: none 513 * RETURN VALUES: none
@@ -554,9 +554,9 @@ void diWriteSpecial(struct inode *ip, int secondary)
554} 554}
555 555
556/* 556/*
557 * NAME: diFreeSpecial() 557 * NAME: diFreeSpecial()
558 * 558 *
559 * FUNCTION: Free allocated space for special inode 559 * FUNCTION: Free allocated space for special inode
560 */ 560 */
561void diFreeSpecial(struct inode *ip) 561void diFreeSpecial(struct inode *ip)
562{ 562{
@@ -572,9 +572,9 @@ void diFreeSpecial(struct inode *ip)
572 572
573 573
574/* 574/*
575 * NAME: diWrite() 575 * NAME: diWrite()
576 * 576 *
577 * FUNCTION: write the on-disk inode portion of the in-memory inode 577 * FUNCTION: write the on-disk inode portion of the in-memory inode
578 * to its corresponding on-disk inode. 578 * to its corresponding on-disk inode.
579 * 579 *
580 * on entry, the specifed incore inode should itself 580 * on entry, the specifed incore inode should itself
@@ -589,11 +589,11 @@ void diFreeSpecial(struct inode *ip)
589 * 589 *
590 * PARAMETERS: 590 * PARAMETERS:
591 * tid - transacation id 591 * tid - transacation id
592 * ip - pointer to incore inode to be written to the inode extent. 592 * ip - pointer to incore inode to be written to the inode extent.
593 * 593 *
594 * RETURN VALUES: 594 * RETURN VALUES:
595 * 0 - success 595 * 0 - success
596 * -EIO - i/o error. 596 * -EIO - i/o error.
597 */ 597 */
598int diWrite(tid_t tid, struct inode *ip) 598int diWrite(tid_t tid, struct inode *ip)
599{ 599{
@@ -730,7 +730,7 @@ int diWrite(tid_t tid, struct inode *ip)
730 ilinelock = (struct linelock *) & tlck->lock; 730 ilinelock = (struct linelock *) & tlck->lock;
731 731
732 /* 732 /*
733 * regular file: 16 byte (XAD slot) granularity 733 * regular file: 16 byte (XAD slot) granularity
734 */ 734 */
735 if (type & tlckXTREE) { 735 if (type & tlckXTREE) {
736 xtpage_t *p, *xp; 736 xtpage_t *p, *xp;
@@ -755,7 +755,7 @@ int diWrite(tid_t tid, struct inode *ip)
755 xad->flag &= ~(XAD_NEW | XAD_EXTENDED); 755 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
756 } 756 }
757 /* 757 /*
758 * directory: 32 byte (directory entry slot) granularity 758 * directory: 32 byte (directory entry slot) granularity
759 */ 759 */
760 else if (type & tlckDTREE) { 760 else if (type & tlckDTREE) {
761 dtpage_t *p, *xp; 761 dtpage_t *p, *xp;
@@ -800,9 +800,8 @@ int diWrite(tid_t tid, struct inode *ip)
800 } 800 }
801 801
802 /* 802 /*
803 * lock/copy inode base: 128 byte slot granularity 803 * lock/copy inode base: 128 byte slot granularity
804 */ 804 */
805// baseDinode:
806 lv = & dilinelock->lv[dilinelock->index]; 805 lv = & dilinelock->lv[dilinelock->index];
807 lv->offset = dioffset >> L2INODESLOTSIZE; 806 lv->offset = dioffset >> L2INODESLOTSIZE;
808 copy_to_dinode(dp, ip); 807 copy_to_dinode(dp, ip);
@@ -813,17 +812,6 @@ int diWrite(tid_t tid, struct inode *ip)
813 lv->length = 1; 812 lv->length = 1;
814 dilinelock->index++; 813 dilinelock->index++;
815 814
816#ifdef _JFS_FASTDASD
817 /*
818 * We aren't logging changes to the DASD used in directory inodes,
819 * but we need to write them to disk. If we don't unmount cleanly,
820 * mount will recalculate the DASD used.
821 */
822 if (S_ISDIR(ip->i_mode)
823 && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED))
824 memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd));
825#endif /* _JFS_FASTDASD */
826
827 /* release the buffer holding the updated on-disk inode. 815 /* release the buffer holding the updated on-disk inode.
828 * the buffer will be later written by commit processing. 816 * the buffer will be later written by commit processing.
829 */ 817 */
@@ -834,9 +822,9 @@ int diWrite(tid_t tid, struct inode *ip)
834 822
835 823
836/* 824/*
837 * NAME: diFree(ip) 825 * NAME: diFree(ip)
838 * 826 *
839 * FUNCTION: free a specified inode from the inode working map 827 * FUNCTION: free a specified inode from the inode working map
840 * for a fileset or aggregate. 828 * for a fileset or aggregate.
841 * 829 *
842 * if the inode to be freed represents the first (only) 830 * if the inode to be freed represents the first (only)
@@ -865,11 +853,11 @@ int diWrite(tid_t tid, struct inode *ip)
865 * any updates and are held until all updates are complete. 853 * any updates and are held until all updates are complete.
866 * 854 *
867 * PARAMETERS: 855 * PARAMETERS:
868 * ip - inode to be freed. 856 * ip - inode to be freed.
869 * 857 *
870 * RETURN VALUES: 858 * RETURN VALUES:
871 * 0 - success 859 * 0 - success
872 * -EIO - i/o error. 860 * -EIO - i/o error.
873 */ 861 */
874int diFree(struct inode *ip) 862int diFree(struct inode *ip)
875{ 863{
@@ -902,7 +890,8 @@ int diFree(struct inode *ip)
902 * the map. 890 * the map.
903 */ 891 */
904 if (iagno >= imap->im_nextiag) { 892 if (iagno >= imap->im_nextiag) {
905 dump_mem("imap", imap, 32); 893 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
894 imap, 32, 0);
906 jfs_error(ip->i_sb, 895 jfs_error(ip->i_sb,
907 "diFree: inum = %d, iagno = %d, nextiag = %d", 896 "diFree: inum = %d, iagno = %d, nextiag = %d",
908 (uint) inum, iagno, imap->im_nextiag); 897 (uint) inum, iagno, imap->im_nextiag);
@@ -964,8 +953,8 @@ int diFree(struct inode *ip)
964 return -EIO; 953 return -EIO;
965 } 954 }
966 /* 955 /*
967 * inode extent still has some inodes or below low water mark: 956 * inode extent still has some inodes or below low water mark:
968 * keep the inode extent; 957 * keep the inode extent;
969 */ 958 */
970 if (bitmap || 959 if (bitmap ||
971 imap->im_agctl[agno].numfree < 96 || 960 imap->im_agctl[agno].numfree < 96 ||
@@ -1047,12 +1036,12 @@ int diFree(struct inode *ip)
1047 1036
1048 1037
1049 /* 1038 /*
1050 * inode extent has become free and above low water mark: 1039 * inode extent has become free and above low water mark:
1051 * free the inode extent; 1040 * free the inode extent;
1052 */ 1041 */
1053 1042
1054 /* 1043 /*
1055 * prepare to update iag list(s) (careful update step 1) 1044 * prepare to update iag list(s) (careful update step 1)
1056 */ 1045 */
1057 amp = bmp = cmp = dmp = NULL; 1046 amp = bmp = cmp = dmp = NULL;
1058 fwd = back = -1; 1047 fwd = back = -1;
@@ -1152,7 +1141,7 @@ int diFree(struct inode *ip)
1152 invalidate_pxd_metapages(ip, freepxd); 1141 invalidate_pxd_metapages(ip, freepxd);
1153 1142
1154 /* 1143 /*
1155 * update iag list(s) (careful update step 2) 1144 * update iag list(s) (careful update step 2)
1156 */ 1145 */
1157 /* add the iag to the ag extent free list if this is the 1146 /* add the iag to the ag extent free list if this is the
1158 * first free extent for the iag. 1147 * first free extent for the iag.
@@ -1338,20 +1327,20 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
1338 1327
1339 1328
1340/* 1329/*
1341 * NAME: diAlloc(pip,dir,ip) 1330 * NAME: diAlloc(pip,dir,ip)
1342 * 1331 *
1343 * FUNCTION: allocate a disk inode from the inode working map 1332 * FUNCTION: allocate a disk inode from the inode working map
1344 * for a fileset or aggregate. 1333 * for a fileset or aggregate.
1345 * 1334 *
1346 * PARAMETERS: 1335 * PARAMETERS:
1347 * pip - pointer to incore inode for the parent inode. 1336 * pip - pointer to incore inode for the parent inode.
1348 * dir - 'true' if the new disk inode is for a directory. 1337 * dir - 'true' if the new disk inode is for a directory.
1349 * ip - pointer to a new inode 1338 * ip - pointer to a new inode
1350 * 1339 *
1351 * RETURN VALUES: 1340 * RETURN VALUES:
1352 * 0 - success. 1341 * 0 - success.
1353 * -ENOSPC - insufficient disk resources. 1342 * -ENOSPC - insufficient disk resources.
1354 * -EIO - i/o error. 1343 * -EIO - i/o error.
1355 */ 1344 */
1356int diAlloc(struct inode *pip, bool dir, struct inode *ip) 1345int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1357{ 1346{
@@ -1433,7 +1422,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1433 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); 1422 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
1434 1423
1435 /* 1424 /*
1436 * try to allocate from the IAG 1425 * try to allocate from the IAG
1437 */ 1426 */
1438 /* check if the inode may be allocated from the iag 1427 /* check if the inode may be allocated from the iag
1439 * (i.e. the inode has free inodes or new extent can be added). 1428 * (i.e. the inode has free inodes or new extent can be added).
@@ -1633,9 +1622,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1633 1622
1634 1623
1635/* 1624/*
1636 * NAME: diAllocAG(imap,agno,dir,ip) 1625 * NAME: diAllocAG(imap,agno,dir,ip)
1637 * 1626 *
1638 * FUNCTION: allocate a disk inode from the allocation group. 1627 * FUNCTION: allocate a disk inode from the allocation group.
1639 * 1628 *
1640 * this routine first determines if a new extent of free 1629 * this routine first determines if a new extent of free
1641 * inodes should be added for the allocation group, with 1630 * inodes should be added for the allocation group, with
@@ -1649,17 +1638,17 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1649 * PRE CONDITION: Already have the AG lock for this AG. 1638 * PRE CONDITION: Already have the AG lock for this AG.
1650 * 1639 *
1651 * PARAMETERS: 1640 * PARAMETERS:
1652 * imap - pointer to inode map control structure. 1641 * imap - pointer to inode map control structure.
1653 * agno - allocation group to allocate from. 1642 * agno - allocation group to allocate from.
1654 * dir - 'true' if the new disk inode is for a directory. 1643 * dir - 'true' if the new disk inode is for a directory.
1655 * ip - pointer to the new inode to be filled in on successful return 1644 * ip - pointer to the new inode to be filled in on successful return
1656 * with the disk inode number allocated, its extent address 1645 * with the disk inode number allocated, its extent address
1657 * and the start of the ag. 1646 * and the start of the ag.
1658 * 1647 *
1659 * RETURN VALUES: 1648 * RETURN VALUES:
1660 * 0 - success. 1649 * 0 - success.
1661 * -ENOSPC - insufficient disk resources. 1650 * -ENOSPC - insufficient disk resources.
1662 * -EIO - i/o error. 1651 * -EIO - i/o error.
1663 */ 1652 */
1664static int 1653static int
1665diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) 1654diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
@@ -1709,9 +1698,9 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1709 1698
1710 1699
1711/* 1700/*
1712 * NAME: diAllocAny(imap,agno,dir,iap) 1701 * NAME: diAllocAny(imap,agno,dir,iap)
1713 * 1702 *
1714 * FUNCTION: allocate a disk inode from any other allocation group. 1703 * FUNCTION: allocate a disk inode from any other allocation group.
1715 * 1704 *
1716 * this routine is called when an allocation attempt within 1705 * this routine is called when an allocation attempt within
1717 * the primary allocation group has failed. if attempts to 1706 * the primary allocation group has failed. if attempts to
@@ -1719,17 +1708,17 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1719 * specified primary group. 1708 * specified primary group.
1720 * 1709 *
1721 * PARAMETERS: 1710 * PARAMETERS:
1722 * imap - pointer to inode map control structure. 1711 * imap - pointer to inode map control structure.
1723 * agno - primary allocation group (to avoid). 1712 * agno - primary allocation group (to avoid).
1724 * dir - 'true' if the new disk inode is for a directory. 1713 * dir - 'true' if the new disk inode is for a directory.
1725 * ip - pointer to a new inode to be filled in on successful return 1714 * ip - pointer to a new inode to be filled in on successful return
1726 * with the disk inode number allocated, its extent address 1715 * with the disk inode number allocated, its extent address
1727 * and the start of the ag. 1716 * and the start of the ag.
1728 * 1717 *
1729 * RETURN VALUES: 1718 * RETURN VALUES:
1730 * 0 - success. 1719 * 0 - success.
1731 * -ENOSPC - insufficient disk resources. 1720 * -ENOSPC - insufficient disk resources.
1732 * -EIO - i/o error. 1721 * -EIO - i/o error.
1733 */ 1722 */
1734static int 1723static int
1735diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) 1724diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
@@ -1772,9 +1761,9 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
1772 1761
1773 1762
1774/* 1763/*
1775 * NAME: diAllocIno(imap,agno,ip) 1764 * NAME: diAllocIno(imap,agno,ip)
1776 * 1765 *
1777 * FUNCTION: allocate a disk inode from the allocation group's free 1766 * FUNCTION: allocate a disk inode from the allocation group's free
1778 * inode list, returning an error if this free list is 1767 * inode list, returning an error if this free list is
1779 * empty (i.e. no iags on the list). 1768 * empty (i.e. no iags on the list).
1780 * 1769 *
@@ -1785,16 +1774,16 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
1785 * PRE CONDITION: Already have AG lock for this AG. 1774 * PRE CONDITION: Already have AG lock for this AG.
1786 * 1775 *
1787 * PARAMETERS: 1776 * PARAMETERS:
1788 * imap - pointer to inode map control structure. 1777 * imap - pointer to inode map control structure.
1789 * agno - allocation group. 1778 * agno - allocation group.
1790 * ip - pointer to new inode to be filled in on successful return 1779 * ip - pointer to new inode to be filled in on successful return
1791 * with the disk inode number allocated, its extent address 1780 * with the disk inode number allocated, its extent address
1792 * and the start of the ag. 1781 * and the start of the ag.
1793 * 1782 *
1794 * RETURN VALUES: 1783 * RETURN VALUES:
1795 * 0 - success. 1784 * 0 - success.
1796 * -ENOSPC - insufficient disk resources. 1785 * -ENOSPC - insufficient disk resources.
1797 * -EIO - i/o error. 1786 * -EIO - i/o error.
1798 */ 1787 */
1799static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) 1788static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1800{ 1789{
@@ -1890,7 +1879,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1890 1879
1891 1880
1892/* 1881/*
1893 * NAME: diAllocExt(imap,agno,ip) 1882 * NAME: diAllocExt(imap,agno,ip)
1894 * 1883 *
1895 * FUNCTION: add a new extent of free inodes to an iag, allocating 1884 * FUNCTION: add a new extent of free inodes to an iag, allocating
1896 * an inode from this extent to satisfy the current allocation 1885 * an inode from this extent to satisfy the current allocation
@@ -1910,16 +1899,16 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1910 * for the purpose of satisfying this request. 1899 * for the purpose of satisfying this request.
1911 * 1900 *
1912 * PARAMETERS: 1901 * PARAMETERS:
1913 * imap - pointer to inode map control structure. 1902 * imap - pointer to inode map control structure.
1914 * agno - allocation group number. 1903 * agno - allocation group number.
1915 * ip - pointer to new inode to be filled in on successful return 1904 * ip - pointer to new inode to be filled in on successful return
1916 * with the disk inode number allocated, its extent address 1905 * with the disk inode number allocated, its extent address
1917 * and the start of the ag. 1906 * and the start of the ag.
1918 * 1907 *
1919 * RETURN VALUES: 1908 * RETURN VALUES:
1920 * 0 - success. 1909 * 0 - success.
1921 * -ENOSPC - insufficient disk resources. 1910 * -ENOSPC - insufficient disk resources.
1922 * -EIO - i/o error. 1911 * -EIO - i/o error.
1923 */ 1912 */
1924static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) 1913static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1925{ 1914{
@@ -2010,7 +1999,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
2010 1999
2011 2000
2012/* 2001/*
2013 * NAME: diAllocBit(imap,iagp,ino) 2002 * NAME: diAllocBit(imap,iagp,ino)
2014 * 2003 *
2015 * FUNCTION: allocate a backed inode from an iag. 2004 * FUNCTION: allocate a backed inode from an iag.
2016 * 2005 *
@@ -2030,14 +2019,14 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
2030 * this AG. Must have read lock on imap inode. 2019 * this AG. Must have read lock on imap inode.
2031 * 2020 *
2032 * PARAMETERS: 2021 * PARAMETERS:
2033 * imap - pointer to inode map control structure. 2022 * imap - pointer to inode map control structure.
2034 * iagp - pointer to iag. 2023 * iagp - pointer to iag.
2035 * ino - inode number to be allocated within the iag. 2024 * ino - inode number to be allocated within the iag.
2036 * 2025 *
2037 * RETURN VALUES: 2026 * RETURN VALUES:
2038 * 0 - success. 2027 * 0 - success.
2039 * -ENOSPC - insufficient disk resources. 2028 * -ENOSPC - insufficient disk resources.
2040 * -EIO - i/o error. 2029 * -EIO - i/o error.
2041 */ 2030 */
2042static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) 2031static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2043{ 2032{
@@ -2144,11 +2133,11 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2144 2133
2145 2134
2146/* 2135/*
2147 * NAME: diNewExt(imap,iagp,extno) 2136 * NAME: diNewExt(imap,iagp,extno)
2148 * 2137 *
2149 * FUNCTION: initialize a new extent of inodes for an iag, allocating 2138 * FUNCTION: initialize a new extent of inodes for an iag, allocating
2150 * the first inode of the extent for use for the current 2139 * the first inode of the extent for use for the current
2151 * allocation request. 2140 * allocation request.
2152 * 2141 *
2153 * disk resources are allocated for the new extent of inodes 2142 * disk resources are allocated for the new extent of inodes
2154 * and the inodes themselves are initialized to reflect their 2143 * and the inodes themselves are initialized to reflect their
@@ -2177,14 +2166,14 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2177 * this AG. Must have read lock on imap inode. 2166 * this AG. Must have read lock on imap inode.
2178 * 2167 *
2179 * PARAMETERS: 2168 * PARAMETERS:
2180 * imap - pointer to inode map control structure. 2169 * imap - pointer to inode map control structure.
2181 * iagp - pointer to iag. 2170 * iagp - pointer to iag.
2182 * extno - extent number. 2171 * extno - extent number.
2183 * 2172 *
2184 * RETURN VALUES: 2173 * RETURN VALUES:
2185 * 0 - success. 2174 * 0 - success.
2186 * -ENOSPC - insufficient disk resources. 2175 * -ENOSPC - insufficient disk resources.
2187 * -EIO - i/o error. 2176 * -EIO - i/o error.
2188 */ 2177 */
2189static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) 2178static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2190{ 2179{
@@ -2430,7 +2419,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2430 2419
2431 2420
2432/* 2421/*
2433 * NAME: diNewIAG(imap,iagnop,agno) 2422 * NAME: diNewIAG(imap,iagnop,agno)
2434 * 2423 *
2435 * FUNCTION: allocate a new iag for an allocation group. 2424 * FUNCTION: allocate a new iag for an allocation group.
2436 * 2425 *
@@ -2443,16 +2432,16 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2443 * and returned to satisfy the request. 2432 * and returned to satisfy the request.
2444 * 2433 *
2445 * PARAMETERS: 2434 * PARAMETERS:
2446 * imap - pointer to inode map control structure. 2435 * imap - pointer to inode map control structure.
2447 * iagnop - pointer to an iag number set with the number of the 2436 * iagnop - pointer to an iag number set with the number of the
2448 * newly allocated iag upon successful return. 2437 * newly allocated iag upon successful return.
2449 * agno - allocation group number. 2438 * agno - allocation group number.
2450 * bpp - Buffer pointer to be filled in with new IAG's buffer 2439 * bpp - Buffer pointer to be filled in with new IAG's buffer
2451 * 2440 *
2452 * RETURN VALUES: 2441 * RETURN VALUES:
2453 * 0 - success. 2442 * 0 - success.
2454 * -ENOSPC - insufficient disk resources. 2443 * -ENOSPC - insufficient disk resources.
2455 * -EIO - i/o error. 2444 * -EIO - i/o error.
2456 * 2445 *
2457 * serialization: 2446 * serialization:
2458 * AG lock held on entry/exit; 2447 * AG lock held on entry/exit;
@@ -2461,7 +2450,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2461 * 2450 *
2462 * note: new iag transaction: 2451 * note: new iag transaction:
2463 * . synchronously write iag; 2452 * . synchronously write iag;
2464 * . write log of xtree and inode of imap; 2453 * . write log of xtree and inode of imap;
2465 * . commit; 2454 * . commit;
2466 * . synchronous write of xtree (right to left, bottom to top); 2455 * . synchronous write of xtree (right to left, bottom to top);
2467 * . at start of logredo(): init in-memory imap with one additional iag page; 2456 * . at start of logredo(): init in-memory imap with one additional iag page;
@@ -2481,9 +2470,6 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2481 s64 xaddr = 0; 2470 s64 xaddr = 0;
2482 s64 blkno; 2471 s64 blkno;
2483 tid_t tid; 2472 tid_t tid;
2484#ifdef _STILL_TO_PORT
2485 xad_t xad;
2486#endif /* _STILL_TO_PORT */
2487 struct inode *iplist[1]; 2473 struct inode *iplist[1];
2488 2474
2489 /* pick up pointers to the inode map and mount inodes */ 2475 /* pick up pointers to the inode map and mount inodes */
@@ -2674,15 +2660,15 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2674} 2660}
2675 2661
2676/* 2662/*
2677 * NAME: diIAGRead() 2663 * NAME: diIAGRead()
2678 * 2664 *
2679 * FUNCTION: get the buffer for the specified iag within a fileset 2665 * FUNCTION: get the buffer for the specified iag within a fileset
2680 * or aggregate inode map. 2666 * or aggregate inode map.
2681 * 2667 *
2682 * PARAMETERS: 2668 * PARAMETERS:
2683 * imap - pointer to inode map control structure. 2669 * imap - pointer to inode map control structure.
2684 * iagno - iag number. 2670 * iagno - iag number.
2685 * bpp - point to buffer pointer to be filled in on successful 2671 * bpp - point to buffer pointer to be filled in on successful
2686 * exit. 2672 * exit.
2687 * 2673 *
2688 * SERIALIZATION: 2674 * SERIALIZATION:
@@ -2691,8 +2677,8 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2691 * the read lock is unnecessary.) 2677 * the read lock is unnecessary.)
2692 * 2678 *
2693 * RETURN VALUES: 2679 * RETURN VALUES:
2694 * 0 - success. 2680 * 0 - success.
2695 * -EIO - i/o error. 2681 * -EIO - i/o error.
2696 */ 2682 */
2697static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) 2683static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2698{ 2684{
@@ -2712,17 +2698,17 @@ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2712} 2698}
2713 2699
2714/* 2700/*
2715 * NAME: diFindFree() 2701 * NAME: diFindFree()
2716 * 2702 *
2717 * FUNCTION: find the first free bit in a word starting at 2703 * FUNCTION: find the first free bit in a word starting at
2718 * the specified bit position. 2704 * the specified bit position.
2719 * 2705 *
2720 * PARAMETERS: 2706 * PARAMETERS:
2721 * word - word to be examined. 2707 * word - word to be examined.
2722 * start - starting bit position. 2708 * start - starting bit position.
2723 * 2709 *
2724 * RETURN VALUES: 2710 * RETURN VALUES:
2725 * bit position of first free bit in the word or 32 if 2711 * bit position of first free bit in the word or 32 if
2726 * no free bits were found. 2712 * no free bits were found.
2727 */ 2713 */
2728static int diFindFree(u32 word, int start) 2714static int diFindFree(u32 word, int start)
@@ -2897,7 +2883,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2897 atomic_read(&imap->im_numfree)); 2883 atomic_read(&imap->im_numfree));
2898 2884
2899 /* 2885 /*
2900 * reconstruct imap 2886 * reconstruct imap
2901 * 2887 *
2902 * coalesce contiguous k (newAGSize/oldAGSize) AGs; 2888 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
2903 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; 2889 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
@@ -2913,7 +2899,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2913 } 2899 }
2914 2900
2915 /* 2901 /*
2916 * process each iag page of the map. 2902 * process each iag page of the map.
2917 * 2903 *
2918 * rebuild AG Free Inode List, AG Free Inode Extent List; 2904 * rebuild AG Free Inode List, AG Free Inode Extent List;
2919 */ 2905 */
@@ -2932,7 +2918,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2932 2918
2933 /* leave free iag in the free iag list */ 2919 /* leave free iag in the free iag list */
2934 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { 2920 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2935 release_metapage(bp); 2921 release_metapage(bp);
2936 continue; 2922 continue;
2937 } 2923 }
2938 2924
@@ -3063,13 +3049,13 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno,
3063} 3049}
3064 3050
3065/* 3051/*
3066 * NAME: copy_from_dinode() 3052 * NAME: copy_from_dinode()
3067 * 3053 *
3068 * FUNCTION: Copies inode info from disk inode to in-memory inode 3054 * FUNCTION: Copies inode info from disk inode to in-memory inode
3069 * 3055 *
3070 * RETURN VALUES: 3056 * RETURN VALUES:
3071 * 0 - success 3057 * 0 - success
3072 * -ENOMEM - insufficient memory 3058 * -ENOMEM - insufficient memory
3073 */ 3059 */
3074static int copy_from_dinode(struct dinode * dip, struct inode *ip) 3060static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3075{ 3061{
@@ -3151,9 +3137,9 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3151} 3137}
3152 3138
3153/* 3139/*
3154 * NAME: copy_to_dinode() 3140 * NAME: copy_to_dinode()
3155 * 3141 *
3156 * FUNCTION: Copies inode info from in-memory inode to disk inode 3142 * FUNCTION: Copies inode info from in-memory inode to disk inode
3157 */ 3143 */
3158static void copy_to_dinode(struct dinode * dip, struct inode *ip) 3144static void copy_to_dinode(struct dinode * dip, struct inode *ip)
3159{ 3145{
diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h
index 4f9c346ed4..610a0e9d89 100644
--- a/fs/jfs/jfs_imap.h
+++ b/fs/jfs/jfs_imap.h
@@ -24,17 +24,17 @@
24 * jfs_imap.h: disk inode manager 24 * jfs_imap.h: disk inode manager
25 */ 25 */
26 26
27#define EXTSPERIAG 128 /* number of disk inode extent per iag */ 27#define EXTSPERIAG 128 /* number of disk inode extent per iag */
28#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ 28#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */
29#define SMAPSZ 4 /* number of words per summary map */ 29#define SMAPSZ 4 /* number of words per summary map */
30#define EXTSPERSUM 32 /* number of extents per summary map entry */ 30#define EXTSPERSUM 32 /* number of extents per summary map entry */
31#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ 31#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */
32#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ 32#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */
33#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ 33#define MAXIAGS ((1<<20)-1) /* maximum number of iags */
34#define MAXAG 128 /* maximum number of allocation groups */ 34#define MAXAG 128 /* maximum number of allocation groups */
35 35
36#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ 36#define AMAPSIZE 512 /* bytes in the IAG allocation maps */
37#define SMAPSIZE 16 /* bytes in the IAG summary maps */ 37#define SMAPSIZE 16 /* bytes in the IAG summary maps */
38 38
39/* convert inode number to iag number */ 39/* convert inode number to iag number */
40#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) 40#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG)
@@ -60,31 +60,31 @@
60 * inode allocation group page (per 4096 inodes of an AG) 60 * inode allocation group page (per 4096 inodes of an AG)
61 */ 61 */
62struct iag { 62struct iag {
63 __le64 agstart; /* 8: starting block of ag */ 63 __le64 agstart; /* 8: starting block of ag */
64 __le32 iagnum; /* 4: inode allocation group number */ 64 __le32 iagnum; /* 4: inode allocation group number */
65 __le32 inofreefwd; /* 4: ag inode free list forward */ 65 __le32 inofreefwd; /* 4: ag inode free list forward */
66 __le32 inofreeback; /* 4: ag inode free list back */ 66 __le32 inofreeback; /* 4: ag inode free list back */
67 __le32 extfreefwd; /* 4: ag inode extent free list forward */ 67 __le32 extfreefwd; /* 4: ag inode extent free list forward */
68 __le32 extfreeback; /* 4: ag inode extent free list back */ 68 __le32 extfreeback; /* 4: ag inode extent free list back */
69 __le32 iagfree; /* 4: iag free list */ 69 __le32 iagfree; /* 4: iag free list */
70 70
71 /* summary map: 1 bit per inode extent */ 71 /* summary map: 1 bit per inode extent */
72 __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; 72 __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes;
73 * note: this indicates free and backed 73 * note: this indicates free and backed
74 * inodes, if the extent is not backed the 74 * inodes, if the extent is not backed the
75 * value will be 1. if the extent is 75 * value will be 1. if the extent is
76 * backed but all inodes are being used the 76 * backed but all inodes are being used the
77 * value will be 1. if the extent is 77 * value will be 1. if the extent is
78 * backed but at least one of the inodes is 78 * backed but at least one of the inodes is
79 * free the value will be 0. 79 * free the value will be 0.
80 */ 80 */
81 __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ 81 __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */
82 __le32 nfreeinos; /* 4: number of free inodes */ 82 __le32 nfreeinos; /* 4: number of free inodes */
83 __le32 nfreeexts; /* 4: number of free extents */ 83 __le32 nfreeexts; /* 4: number of free extents */
84 /* (72) */ 84 /* (72) */
85 u8 pad[1976]; /* 1976: pad to 2048 bytes */ 85 u8 pad[1976]; /* 1976: pad to 2048 bytes */
86 /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ 86 /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */
87 __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ 87 __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */
88 __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ 88 __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */
89 pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ 89 pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */
90}; /* (4096) */ 90}; /* (4096) */
@@ -93,44 +93,44 @@ struct iag {
93 * per AG control information (in inode map control page) 93 * per AG control information (in inode map control page)
94 */ 94 */
95struct iagctl_disk { 95struct iagctl_disk {
96 __le32 inofree; /* 4: free inode list anchor */ 96 __le32 inofree; /* 4: free inode list anchor */
97 __le32 extfree; /* 4: free extent list anchor */ 97 __le32 extfree; /* 4: free extent list anchor */
98 __le32 numinos; /* 4: number of backed inodes */ 98 __le32 numinos; /* 4: number of backed inodes */
99 __le32 numfree; /* 4: number of free inodes */ 99 __le32 numfree; /* 4: number of free inodes */
100}; /* (16) */ 100}; /* (16) */
101 101
102struct iagctl { 102struct iagctl {
103 int inofree; /* free inode list anchor */ 103 int inofree; /* free inode list anchor */
104 int extfree; /* free extent list anchor */ 104 int extfree; /* free extent list anchor */
105 int numinos; /* number of backed inodes */ 105 int numinos; /* number of backed inodes */
106 int numfree; /* number of free inodes */ 106 int numfree; /* number of free inodes */
107}; 107};
108 108
109/* 109/*
110 * per fileset/aggregate inode map control page 110 * per fileset/aggregate inode map control page
111 */ 111 */
112struct dinomap_disk { 112struct dinomap_disk {
113 __le32 in_freeiag; /* 4: free iag list anchor */ 113 __le32 in_freeiag; /* 4: free iag list anchor */
114 __le32 in_nextiag; /* 4: next free iag number */ 114 __le32 in_nextiag; /* 4: next free iag number */
115 __le32 in_numinos; /* 4: num of backed inodes */ 115 __le32 in_numinos; /* 4: num of backed inodes */
116 __le32 in_numfree; /* 4: num of free backed inodes */ 116 __le32 in_numfree; /* 4: num of free backed inodes */
117 __le32 in_nbperiext; /* 4: num of blocks per inode extent */ 117 __le32 in_nbperiext; /* 4: num of blocks per inode extent */
118 __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ 118 __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */
119 __le32 in_diskblock; /* 4: for standalone test driver */ 119 __le32 in_diskblock; /* 4: for standalone test driver */
120 __le32 in_maxag; /* 4: for standalone test driver */ 120 __le32 in_maxag; /* 4: for standalone test driver */
121 u8 pad[2016]; /* 2016: pad to 2048 */ 121 u8 pad[2016]; /* 2016: pad to 2048 */
122 struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ 122 struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */
123}; /* (4096) */ 123}; /* (4096) */
124 124
125struct dinomap { 125struct dinomap {
126 int in_freeiag; /* free iag list anchor */ 126 int in_freeiag; /* free iag list anchor */
127 int in_nextiag; /* next free iag number */ 127 int in_nextiag; /* next free iag number */
128 int in_numinos; /* num of backed inodes */ 128 int in_numinos; /* num of backed inodes */
129 int in_numfree; /* num of free backed inodes */ 129 int in_numfree; /* num of free backed inodes */
130 int in_nbperiext; /* num of blocks per inode extent */ 130 int in_nbperiext; /* num of blocks per inode extent */
131 int in_l2nbperiext; /* l2 of in_nbperiext */ 131 int in_l2nbperiext; /* l2 of in_nbperiext */
132 int in_diskblock; /* for standalone test driver */ 132 int in_diskblock; /* for standalone test driver */
133 int in_maxag; /* for standalone test driver */ 133 int in_maxag; /* for standalone test driver */
134 struct iagctl in_agctl[MAXAG]; /* AG control information */ 134 struct iagctl in_agctl[MAXAG]; /* AG control information */
135}; 135};
136 136
@@ -139,9 +139,9 @@ struct dinomap {
139 */ 139 */
140struct inomap { 140struct inomap {
141 struct dinomap im_imap; /* 4096: inode allocation control */ 141 struct dinomap im_imap; /* 4096: inode allocation control */
142 struct inode *im_ipimap; /* 4: ptr to inode for imap */ 142 struct inode *im_ipimap; /* 4: ptr to inode for imap */
143 struct mutex im_freelock; /* 4: iag free list lock */ 143 struct mutex im_freelock; /* 4: iag free list lock */
144 struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ 144 struct mutex im_aglock[MAXAG]; /* 512: per AG locks */
145 u32 *im_DBGdimap; 145 u32 *im_DBGdimap;
146 atomic_t im_numinos; /* num of backed inodes */ 146 atomic_t im_numinos; /* num of backed inodes */
147 atomic_t im_numfree; /* num of free backed inodes */ 147 atomic_t im_numfree; /* num of free backed inodes */
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 8f453eff3c..cb8f30985a 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -40,7 +40,7 @@ struct jfs_inode_info {
40 uint mode2; /* jfs-specific mode */ 40 uint mode2; /* jfs-specific mode */
41 uint saved_uid; /* saved for uid mount option */ 41 uint saved_uid; /* saved for uid mount option */
42 uint saved_gid; /* saved for gid mount option */ 42 uint saved_gid; /* saved for gid mount option */
43 pxd_t ixpxd; /* inode extent descriptor */ 43 pxd_t ixpxd; /* inode extent descriptor */
44 dxd_t acl; /* dxd describing acl */ 44 dxd_t acl; /* dxd describing acl */
45 dxd_t ea; /* dxd describing ea */ 45 dxd_t ea; /* dxd describing ea */
46 time_t otime; /* time created */ 46 time_t otime; /* time created */
@@ -190,7 +190,7 @@ struct jfs_sb_info {
190 uint gengen; /* inode generation generator*/ 190 uint gengen; /* inode generation generator*/
191 uint inostamp; /* shows inode belongs to fileset*/ 191 uint inostamp; /* shows inode belongs to fileset*/
192 192
193 /* Formerly in ipbmap */ 193 /* Formerly in ipbmap */
194 struct bmap *bmap; /* incore bmap descriptor */ 194 struct bmap *bmap; /* incore bmap descriptor */
195 struct nls_table *nls_tab; /* current codepage */ 195 struct nls_table *nls_tab; /* current codepage */
196 struct inode *direct_inode; /* metadata inode */ 196 struct inode *direct_inode; /* metadata inode */
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 2374b595f2..f0ec72b263 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -32,6 +32,7 @@ extern void jfs_truncate_nolock(struct inode *, loff_t);
32extern void jfs_free_zero_link(struct inode *); 32extern void jfs_free_zero_link(struct inode *);
33extern struct dentry *jfs_get_parent(struct dentry *dentry); 33extern struct dentry *jfs_get_parent(struct dentry *dentry);
34extern void jfs_get_inode_flags(struct jfs_inode_info *); 34extern void jfs_get_inode_flags(struct jfs_inode_info *);
35extern struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp);
35extern void jfs_set_inode_flags(struct inode *); 36extern void jfs_set_inode_flags(struct inode *);
36extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 37extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
37 38
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 44a2f33cb9..de3e4a506d 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -244,7 +244,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
244 goto writeRecord; 244 goto writeRecord;
245 245
246 /* 246 /*
247 * initialize/update page/transaction recovery lsn 247 * initialize/update page/transaction recovery lsn
248 */ 248 */
249 lsn = log->lsn; 249 lsn = log->lsn;
250 250
@@ -263,7 +263,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
263 } 263 }
264 264
265 /* 265 /*
266 * initialize/update lsn of tblock of the page 266 * initialize/update lsn of tblock of the page
267 * 267 *
268 * transaction inherits oldest lsn of pages associated 268 * transaction inherits oldest lsn of pages associated
269 * with allocation/deallocation of resources (their 269 * with allocation/deallocation of resources (their
@@ -307,7 +307,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
307 LOGSYNC_UNLOCK(log, flags); 307 LOGSYNC_UNLOCK(log, flags);
308 308
309 /* 309 /*
310 * write the log record 310 * write the log record
311 */ 311 */
312 writeRecord: 312 writeRecord:
313 lsn = lmWriteRecord(log, tblk, lrd, tlck); 313 lsn = lmWriteRecord(log, tblk, lrd, tlck);
@@ -372,7 +372,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
372 goto moveLrd; 372 goto moveLrd;
373 373
374 /* 374 /*
375 * move log record data 375 * move log record data
376 */ 376 */
377 /* retrieve source meta-data page to log */ 377 /* retrieve source meta-data page to log */
378 if (tlck->flag & tlckPAGELOCK) { 378 if (tlck->flag & tlckPAGELOCK) {
@@ -465,7 +465,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
465 } 465 }
466 466
467 /* 467 /*
468 * move log record descriptor 468 * move log record descriptor
469 */ 469 */
470 moveLrd: 470 moveLrd:
471 lrd->length = cpu_to_le16(len); 471 lrd->length = cpu_to_le16(len);
@@ -574,7 +574,7 @@ static int lmNextPage(struct jfs_log * log)
574 LOGGC_LOCK(log); 574 LOGGC_LOCK(log);
575 575
576 /* 576 /*
577 * write or queue the full page at the tail of write queue 577 * write or queue the full page at the tail of write queue
578 */ 578 */
579 /* get the tail tblk on commit queue */ 579 /* get the tail tblk on commit queue */
580 if (list_empty(&log->cqueue)) 580 if (list_empty(&log->cqueue))
@@ -625,7 +625,7 @@ static int lmNextPage(struct jfs_log * log)
625 LOGGC_UNLOCK(log); 625 LOGGC_UNLOCK(log);
626 626
627 /* 627 /*
628 * allocate/initialize next page 628 * allocate/initialize next page
629 */ 629 */
630 /* if log wraps, the first data page of log is 2 630 /* if log wraps, the first data page of log is 2
631 * (0 never used, 1 is superblock). 631 * (0 never used, 1 is superblock).
@@ -953,7 +953,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
953 } 953 }
954 954
955 /* 955 /*
956 * forward syncpt 956 * forward syncpt
957 */ 957 */
958 /* if last sync is same as last syncpt, 958 /* if last sync is same as last syncpt,
959 * invoke sync point forward processing to update sync. 959 * invoke sync point forward processing to update sync.
@@ -989,7 +989,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
989 lsn = log->lsn; 989 lsn = log->lsn;
990 990
991 /* 991 /*
992 * setup next syncpt trigger (SWAG) 992 * setup next syncpt trigger (SWAG)
993 */ 993 */
994 logsize = log->logsize; 994 logsize = log->logsize;
995 995
@@ -1000,11 +1000,11 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
1000 if (more < 2 * LOGPSIZE) { 1000 if (more < 2 * LOGPSIZE) {
1001 jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); 1001 jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
1002 /* 1002 /*
1003 * log wrapping 1003 * log wrapping
1004 * 1004 *
1005 * option 1 - panic ? No.! 1005 * option 1 - panic ? No.!
1006 * option 2 - shutdown file systems 1006 * option 2 - shutdown file systems
1007 * associated with log ? 1007 * associated with log ?
1008 * option 3 - extend log ? 1008 * option 3 - extend log ?
1009 */ 1009 */
1010 /* 1010 /*
@@ -1062,7 +1062,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
1062/* 1062/*
1063 * NAME: lmLogOpen() 1063 * NAME: lmLogOpen()
1064 * 1064 *
1065 * FUNCTION: open the log on first open; 1065 * FUNCTION: open the log on first open;
1066 * insert filesystem in the active list of the log. 1066 * insert filesystem in the active list of the log.
1067 * 1067 *
1068 * PARAMETER: ipmnt - file system mount inode 1068 * PARAMETER: ipmnt - file system mount inode
@@ -1113,7 +1113,7 @@ int lmLogOpen(struct super_block *sb)
1113 init_waitqueue_head(&log->syncwait); 1113 init_waitqueue_head(&log->syncwait);
1114 1114
1115 /* 1115 /*
1116 * external log as separate logical volume 1116 * external log as separate logical volume
1117 * 1117 *
1118 * file systems to log may have n-to-1 relationship; 1118 * file systems to log may have n-to-1 relationship;
1119 */ 1119 */
@@ -1155,7 +1155,7 @@ journal_found:
1155 return 0; 1155 return 0;
1156 1156
1157 /* 1157 /*
1158 * unwind on error 1158 * unwind on error
1159 */ 1159 */
1160 shutdown: /* unwind lbmLogInit() */ 1160 shutdown: /* unwind lbmLogInit() */
1161 list_del(&log->journal_list); 1161 list_del(&log->journal_list);
@@ -1427,7 +1427,7 @@ int lmLogInit(struct jfs_log * log)
1427 return 0; 1427 return 0;
1428 1428
1429 /* 1429 /*
1430 * unwind on error 1430 * unwind on error
1431 */ 1431 */
1432 errout30: /* release log page */ 1432 errout30: /* release log page */
1433 log->wqueue = NULL; 1433 log->wqueue = NULL;
@@ -1480,7 +1480,7 @@ int lmLogClose(struct super_block *sb)
1480 1480
1481 if (test_bit(log_INLINELOG, &log->flag)) { 1481 if (test_bit(log_INLINELOG, &log->flag)) {
1482 /* 1482 /*
1483 * in-line log in host file system 1483 * in-line log in host file system
1484 */ 1484 */
1485 rc = lmLogShutdown(log); 1485 rc = lmLogShutdown(log);
1486 kfree(log); 1486 kfree(log);
@@ -1504,7 +1504,7 @@ int lmLogClose(struct super_block *sb)
1504 goto out; 1504 goto out;
1505 1505
1506 /* 1506 /*
1507 * external log as separate logical volume 1507 * external log as separate logical volume
1508 */ 1508 */
1509 list_del(&log->journal_list); 1509 list_del(&log->journal_list);
1510 bdev = log->bdev; 1510 bdev = log->bdev;
@@ -1622,20 +1622,26 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1622 if (!list_empty(&log->synclist)) { 1622 if (!list_empty(&log->synclist)) {
1623 struct logsyncblk *lp; 1623 struct logsyncblk *lp;
1624 1624
1625 printk(KERN_ERR "jfs_flush_journal: synclist not empty\n");
1625 list_for_each_entry(lp, &log->synclist, synclist) { 1626 list_for_each_entry(lp, &log->synclist, synclist) {
1626 if (lp->xflag & COMMIT_PAGE) { 1627 if (lp->xflag & COMMIT_PAGE) {
1627 struct metapage *mp = (struct metapage *)lp; 1628 struct metapage *mp = (struct metapage *)lp;
1628 dump_mem("orphan metapage", lp, 1629 print_hex_dump(KERN_ERR, "metapage: ",
1629 sizeof(struct metapage)); 1630 DUMP_PREFIX_ADDRESS, 16, 4,
1630 dump_mem("page", mp->page, sizeof(struct page)); 1631 mp, sizeof(struct metapage), 0);
1631 } 1632 print_hex_dump(KERN_ERR, "page: ",
1632 else 1633 DUMP_PREFIX_ADDRESS, 16,
1633 dump_mem("orphan tblock", lp, 1634 sizeof(long), mp->page,
1634 sizeof(struct tblock)); 1635 sizeof(struct page), 0);
1636 } else
1637 print_hex_dump(KERN_ERR, "tblock:",
1638 DUMP_PREFIX_ADDRESS, 16, 4,
1639 lp, sizeof(struct tblock), 0);
1635 } 1640 }
1636 } 1641 }
1642#else
1643 WARN_ON(!list_empty(&log->synclist));
1637#endif 1644#endif
1638 //assert(list_empty(&log->synclist));
1639 clear_bit(log_FLUSH, &log->flag); 1645 clear_bit(log_FLUSH, &log->flag);
1640} 1646}
1641 1647
@@ -1723,7 +1729,7 @@ int lmLogShutdown(struct jfs_log * log)
1723 * 1729 *
1724 * PARAMETE: log - pointer to logs inode. 1730 * PARAMETE: log - pointer to logs inode.
1725 * fsdev - kdev_t of filesystem. 1731 * fsdev - kdev_t of filesystem.
1726 * serial - pointer to returned log serial number 1732 * serial - pointer to returned log serial number
1727 * activate - insert/remove device from active list. 1733 * activate - insert/remove device from active list.
1728 * 1734 *
1729 * RETURN: 0 - success 1735 * RETURN: 0 - success
@@ -1963,7 +1969,7 @@ static void lbmfree(struct lbuf * bp)
1963 * FUNCTION: add a log buffer to the log redrive list 1969 * FUNCTION: add a log buffer to the log redrive list
1964 * 1970 *
1965 * PARAMETER: 1971 * PARAMETER:
1966 * bp - log buffer 1972 * bp - log buffer
1967 * 1973 *
1968 * NOTES: 1974 * NOTES:
1969 * Takes log_redrive_lock. 1975 * Takes log_redrive_lock.
@@ -2054,7 +2060,7 @@ static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
2054 bp->l_flag = flag; 2060 bp->l_flag = flag;
2055 2061
2056 /* 2062 /*
2057 * insert bp at tail of write queue associated with log 2063 * insert bp at tail of write queue associated with log
2058 * 2064 *
2059 * (request is either for bp already/currently at head of queue 2065 * (request is either for bp already/currently at head of queue
2060 * or new bp to be inserted at tail) 2066 * or new bp to be inserted at tail)
@@ -2117,7 +2123,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
2117 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); 2123 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
2118 2124
2119 /* 2125 /*
2120 * initiate pageout of the page 2126 * initiate pageout of the page
2121 */ 2127 */
2122 lbmStartIO(bp); 2128 lbmStartIO(bp);
2123} 2129}
@@ -2128,7 +2134,7 @@ static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
2128 * 2134 *
2129 * FUNCTION: Interface to DD strategy routine 2135 * FUNCTION: Interface to DD strategy routine
2130 * 2136 *
2131 * RETURN: none 2137 * RETURN: none
2132 * 2138 *
2133 * serialization: LCACHE_LOCK() is NOT held during log i/o; 2139 * serialization: LCACHE_LOCK() is NOT held during log i/o;
2134 */ 2140 */
@@ -2222,7 +2228,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2222 bio_put(bio); 2228 bio_put(bio);
2223 2229
2224 /* 2230 /*
2225 * pagein completion 2231 * pagein completion
2226 */ 2232 */
2227 if (bp->l_flag & lbmREAD) { 2233 if (bp->l_flag & lbmREAD) {
2228 bp->l_flag &= ~lbmREAD; 2234 bp->l_flag &= ~lbmREAD;
@@ -2236,7 +2242,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2236 } 2242 }
2237 2243
2238 /* 2244 /*
2239 * pageout completion 2245 * pageout completion
2240 * 2246 *
2241 * the bp at the head of write queue has completed pageout. 2247 * the bp at the head of write queue has completed pageout.
2242 * 2248 *
@@ -2302,7 +2308,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2302 } 2308 }
2303 2309
2304 /* 2310 /*
2305 * synchronous pageout: 2311 * synchronous pageout:
2306 * 2312 *
2307 * buffer has not necessarily been removed from write queue 2313 * buffer has not necessarily been removed from write queue
2308 * (e.g., synchronous write of partial-page with COMMIT): 2314 * (e.g., synchronous write of partial-page with COMMIT):
@@ -2316,7 +2322,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2316 } 2322 }
2317 2323
2318 /* 2324 /*
2319 * Group Commit pageout: 2325 * Group Commit pageout:
2320 */ 2326 */
2321 else if (bp->l_flag & lbmGC) { 2327 else if (bp->l_flag & lbmGC) {
2322 LCACHE_UNLOCK(flags); 2328 LCACHE_UNLOCK(flags);
@@ -2324,7 +2330,7 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2324 } 2330 }
2325 2331
2326 /* 2332 /*
2327 * asynchronous pageout: 2333 * asynchronous pageout:
2328 * 2334 *
2329 * buffer must have been removed from write queue: 2335 * buffer must have been removed from write queue:
2330 * insert buffer at head of freelist where it can be recycled 2336 * insert buffer at head of freelist where it can be recycled
@@ -2375,7 +2381,7 @@ int jfsIOWait(void *arg)
2375 * FUNCTION: format file system log 2381 * FUNCTION: format file system log
2376 * 2382 *
2377 * PARAMETERS: 2383 * PARAMETERS:
2378 * log - volume log 2384 * log - volume log
2379 * logAddress - start address of log space in FS block 2385 * logAddress - start address of log space in FS block
2380 * logSize - length of log space in FS block; 2386 * logSize - length of log space in FS block;
2381 * 2387 *
@@ -2407,16 +2413,16 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2407 npages = logSize >> sbi->l2nbperpage; 2413 npages = logSize >> sbi->l2nbperpage;
2408 2414
2409 /* 2415 /*
2410 * log space: 2416 * log space:
2411 * 2417 *
2412 * page 0 - reserved; 2418 * page 0 - reserved;
2413 * page 1 - log superblock; 2419 * page 1 - log superblock;
2414 * page 2 - log data page: A SYNC log record is written 2420 * page 2 - log data page: A SYNC log record is written
2415 * into this page at logform time; 2421 * into this page at logform time;
2416 * pages 3-N - log data page: set to empty log data pages; 2422 * pages 3-N - log data page: set to empty log data pages;
2417 */ 2423 */
2418 /* 2424 /*
2419 * init log superblock: log page 1 2425 * init log superblock: log page 1
2420 */ 2426 */
2421 logsuper = (struct logsuper *) bp->l_ldata; 2427 logsuper = (struct logsuper *) bp->l_ldata;
2422 2428
@@ -2436,7 +2442,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2436 goto exit; 2442 goto exit;
2437 2443
2438 /* 2444 /*
2439 * init pages 2 to npages-1 as log data pages: 2445 * init pages 2 to npages-1 as log data pages:
2440 * 2446 *
2441 * log page sequence number (lpsn) initialization: 2447 * log page sequence number (lpsn) initialization:
2442 * 2448 *
@@ -2479,7 +2485,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2479 goto exit; 2485 goto exit;
2480 2486
2481 /* 2487 /*
2482 * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) 2488 * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2)
2483 */ 2489 */
2484 for (lspn = 0; lspn < npages - 3; lspn++) { 2490 for (lspn = 0; lspn < npages - 3; lspn++) {
2485 lp->h.page = lp->t.page = cpu_to_le32(lspn); 2491 lp->h.page = lp->t.page = cpu_to_le32(lspn);
@@ -2495,7 +2501,7 @@ int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2495 rc = 0; 2501 rc = 0;
2496exit: 2502exit:
2497 /* 2503 /*
2498 * finalize log 2504 * finalize log
2499 */ 2505 */
2500 /* release the buffer */ 2506 /* release the buffer */
2501 lbmFree(bp); 2507 lbmFree(bp);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index a53fb17ea2..1f85ef0ec0 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -144,7 +144,7 @@ struct logpage {
144 * 144 *
145 * (this comment should be rewritten !) 145 * (this comment should be rewritten !)
146 * jfs uses only "after" log records (only a single writer is allowed 146 * jfs uses only "after" log records (only a single writer is allowed
147 * in a page, pages are written to temporary paging space if 147 * in a page, pages are written to temporary paging space if
148 * if they must be written to disk before commit, and i/o is 148 * if they must be written to disk before commit, and i/o is
149 * scheduled for modified pages to their home location after 149 * scheduled for modified pages to their home location after
150 * the log records containing the after values and the commit 150 * the log records containing the after values and the commit
@@ -153,7 +153,7 @@ struct logpage {
153 * 153 *
154 * a log record consists of a data area of variable length followed by 154 * a log record consists of a data area of variable length followed by
155 * a descriptor of fixed size LOGRDSIZE bytes. 155 * a descriptor of fixed size LOGRDSIZE bytes.
156 * the data area is rounded up to an integral number of 4-bytes and 156 * the data area is rounded up to an integral number of 4-bytes and
157 * must be no longer than LOGPSIZE. 157 * must be no longer than LOGPSIZE.
158 * the descriptor is of size of multiple of 4-bytes and aligned on a 158 * the descriptor is of size of multiple of 4-bytes and aligned on a
159 * 4-byte boundary. 159 * 4-byte boundary.
@@ -215,13 +215,13 @@ struct lrd {
215 union { 215 union {
216 216
217 /* 217 /*
218 * COMMIT: commit 218 * COMMIT: commit
219 * 219 *
220 * transaction commit: no type-dependent information; 220 * transaction commit: no type-dependent information;
221 */ 221 */
222 222
223 /* 223 /*
224 * REDOPAGE: after-image 224 * REDOPAGE: after-image
225 * 225 *
226 * apply after-image; 226 * apply after-image;
227 * 227 *
@@ -236,7 +236,7 @@ struct lrd {
236 } redopage; /* (20) */ 236 } redopage; /* (20) */
237 237
238 /* 238 /*
239 * NOREDOPAGE: the page is freed 239 * NOREDOPAGE: the page is freed
240 * 240 *
241 * do not apply after-image records which precede this record 241 * do not apply after-image records which precede this record
242 * in the log with the same page block number to this page. 242 * in the log with the same page block number to this page.
@@ -252,7 +252,7 @@ struct lrd {
252 } noredopage; /* (20) */ 252 } noredopage; /* (20) */
253 253
254 /* 254 /*
255 * UPDATEMAP: update block allocation map 255 * UPDATEMAP: update block allocation map
256 * 256 *
257 * either in-line PXD, 257 * either in-line PXD,
258 * or out-of-line XADLIST; 258 * or out-of-line XADLIST;
@@ -268,7 +268,7 @@ struct lrd {
268 } updatemap; /* (20) */ 268 } updatemap; /* (20) */
269 269
270 /* 270 /*
271 * NOREDOINOEXT: the inode extent is freed 271 * NOREDOINOEXT: the inode extent is freed
272 * 272 *
273 * do not apply after-image records which precede this 273 * do not apply after-image records which precede this
274 * record in the log with the any of the 4 page block 274 * record in the log with the any of the 4 page block
@@ -286,7 +286,7 @@ struct lrd {
286 } noredoinoext; /* (20) */ 286 } noredoinoext; /* (20) */
287 287
288 /* 288 /*
289 * SYNCPT: log sync point 289 * SYNCPT: log sync point
290 * 290 *
291 * replay log upto syncpt address specified; 291 * replay log upto syncpt address specified;
292 */ 292 */
@@ -295,13 +295,13 @@ struct lrd {
295 } syncpt; 295 } syncpt;
296 296
297 /* 297 /*
298 * MOUNT: file system mount 298 * MOUNT: file system mount
299 * 299 *
300 * file system mount: no type-dependent information; 300 * file system mount: no type-dependent information;
301 */ 301 */
302 302
303 /* 303 /*
304 * ? FREEXTENT: free specified extent(s) 304 * ? FREEXTENT: free specified extent(s)
305 * 305 *
306 * free specified extent(s) from block allocation map 306 * free specified extent(s) from block allocation map
307 * N.B.: nextents should be length of data/sizeof(xad_t) 307 * N.B.: nextents should be length of data/sizeof(xad_t)
@@ -314,7 +314,7 @@ struct lrd {
314 } freextent; 314 } freextent;
315 315
316 /* 316 /*
317 * ? NOREDOFILE: this file is freed 317 * ? NOREDOFILE: this file is freed
318 * 318 *
319 * do not apply records which precede this record in the log 319 * do not apply records which precede this record in the log
320 * with the same inode number. 320 * with the same inode number.
@@ -330,7 +330,7 @@ struct lrd {
330 } noredofile; 330 } noredofile;
331 331
332 /* 332 /*
333 * ? NEWPAGE: 333 * ? NEWPAGE:
334 * 334 *
335 * metadata type dependent 335 * metadata type dependent
336 */ 336 */
@@ -342,7 +342,7 @@ struct lrd {
342 } newpage; 342 } newpage;
343 343
344 /* 344 /*
345 * ? DUMMY: filler 345 * ? DUMMY: filler
346 * 346 *
347 * no type-dependent information 347 * no type-dependent information
348 */ 348 */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 43d4f69afb..77c7f1129d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -472,7 +472,8 @@ add_failed:
472 printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); 472 printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n");
473 goto skip; 473 goto skip;
474dump_bio: 474dump_bio:
475 dump_mem("bio", bio, sizeof(*bio)); 475 print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16,
476 4, bio, sizeof(*bio), 0);
476skip: 477skip:
477 bio_put(bio); 478 bio_put(bio);
478 unlock_page(page); 479 unlock_page(page);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 4dd4798348..644429acb8 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -80,7 +80,7 @@ static int logMOUNT(struct super_block *sb);
80 */ 80 */
81int jfs_mount(struct super_block *sb) 81int jfs_mount(struct super_block *sb)
82{ 82{
83 int rc = 0; /* Return code */ 83 int rc = 0; /* Return code */
84 struct jfs_sb_info *sbi = JFS_SBI(sb); 84 struct jfs_sb_info *sbi = JFS_SBI(sb);
85 struct inode *ipaimap = NULL; 85 struct inode *ipaimap = NULL;
86 struct inode *ipaimap2 = NULL; 86 struct inode *ipaimap2 = NULL;
@@ -169,7 +169,7 @@ int jfs_mount(struct super_block *sb)
169 sbi->ipaimap2 = NULL; 169 sbi->ipaimap2 = NULL;
170 170
171 /* 171 /*
172 * mount (the only/single) fileset 172 * mount (the only/single) fileset
173 */ 173 */
174 /* 174 /*
175 * open fileset inode allocation map (aka fileset inode) 175 * open fileset inode allocation map (aka fileset inode)
@@ -195,7 +195,7 @@ int jfs_mount(struct super_block *sb)
195 goto out; 195 goto out;
196 196
197 /* 197 /*
198 * unwind on error 198 * unwind on error
199 */ 199 */
200 errout41: /* close fileset inode allocation map inode */ 200 errout41: /* close fileset inode allocation map inode */
201 diFreeSpecial(ipimap); 201 diFreeSpecial(ipimap);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 25430d0b0d..7aa1f7004e 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -18,7 +18,7 @@
18 */ 18 */
19 19
20/* 20/*
21 * jfs_txnmgr.c: transaction manager 21 * jfs_txnmgr.c: transaction manager
22 * 22 *
23 * notes: 23 * notes:
24 * transaction starts with txBegin() and ends with txCommit() 24 * transaction starts with txBegin() and ends with txCommit()
@@ -60,7 +60,7 @@
60#include "jfs_debug.h" 60#include "jfs_debug.h"
61 61
62/* 62/*
63 * transaction management structures 63 * transaction management structures
64 */ 64 */
65static struct { 65static struct {
66 int freetid; /* index of a free tid structure */ 66 int freetid; /* index of a free tid structure */
@@ -103,19 +103,19 @@ module_param(nTxLock, int, 0);
103MODULE_PARM_DESC(nTxLock, 103MODULE_PARM_DESC(nTxLock,
104 "Number of transaction locks (max:65536)"); 104 "Number of transaction locks (max:65536)");
105 105
106struct tblock *TxBlock; /* transaction block table */ 106struct tblock *TxBlock; /* transaction block table */
107static int TxLockLWM; /* Low water mark for number of txLocks used */ 107static int TxLockLWM; /* Low water mark for number of txLocks used */
108static int TxLockHWM; /* High water mark for number of txLocks used */ 108static int TxLockHWM; /* High water mark for number of txLocks used */
109static int TxLockVHWM; /* Very High water mark */ 109static int TxLockVHWM; /* Very High water mark */
110struct tlock *TxLock; /* transaction lock table */ 110struct tlock *TxLock; /* transaction lock table */
111 111
112/* 112/*
113 * transaction management lock 113 * transaction management lock
114 */ 114 */
115static DEFINE_SPINLOCK(jfsTxnLock); 115static DEFINE_SPINLOCK(jfsTxnLock);
116 116
117#define TXN_LOCK() spin_lock(&jfsTxnLock) 117#define TXN_LOCK() spin_lock(&jfsTxnLock)
118#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) 118#define TXN_UNLOCK() spin_unlock(&jfsTxnLock)
119 119
120#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); 120#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock);
121#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) 121#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags)
@@ -148,7 +148,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
148#define TXN_WAKEUP(event) wake_up_all(event) 148#define TXN_WAKEUP(event) wake_up_all(event)
149 149
150/* 150/*
151 * statistics 151 * statistics
152 */ 152 */
153static struct { 153static struct {
154 tid_t maxtid; /* 4: biggest tid ever used */ 154 tid_t maxtid; /* 4: biggest tid ever used */
@@ -181,8 +181,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
181static void LogSyncRelease(struct metapage * mp); 181static void LogSyncRelease(struct metapage * mp);
182 182
183/* 183/*
184 * transaction block/lock management 184 * transaction block/lock management
185 * --------------------------------- 185 * ---------------------------------
186 */ 186 */
187 187
188/* 188/*
@@ -227,9 +227,9 @@ static void txLockFree(lid_t lid)
227} 227}
228 228
229/* 229/*
230 * NAME: txInit() 230 * NAME: txInit()
231 * 231 *
232 * FUNCTION: initialize transaction management structures 232 * FUNCTION: initialize transaction management structures
233 * 233 *
234 * RETURN: 234 * RETURN:
235 * 235 *
@@ -333,9 +333,9 @@ int txInit(void)
333} 333}
334 334
335/* 335/*
336 * NAME: txExit() 336 * NAME: txExit()
337 * 337 *
338 * FUNCTION: clean up when module is unloaded 338 * FUNCTION: clean up when module is unloaded
339 */ 339 */
340void txExit(void) 340void txExit(void)
341{ 341{
@@ -346,12 +346,12 @@ void txExit(void)
346} 346}
347 347
348/* 348/*
349 * NAME: txBegin() 349 * NAME: txBegin()
350 * 350 *
351 * FUNCTION: start a transaction. 351 * FUNCTION: start a transaction.
352 * 352 *
353 * PARAMETER: sb - superblock 353 * PARAMETER: sb - superblock
354 * flag - force for nested tx; 354 * flag - force for nested tx;
355 * 355 *
356 * RETURN: tid - transaction id 356 * RETURN: tid - transaction id
357 * 357 *
@@ -447,13 +447,13 @@ tid_t txBegin(struct super_block *sb, int flag)
447} 447}
448 448
449/* 449/*
450 * NAME: txBeginAnon() 450 * NAME: txBeginAnon()
451 * 451 *
452 * FUNCTION: start an anonymous transaction. 452 * FUNCTION: start an anonymous transaction.
453 * Blocks if logsync or available tlocks are low to prevent 453 * Blocks if logsync or available tlocks are low to prevent
454 * anonymous tlocks from depleting supply. 454 * anonymous tlocks from depleting supply.
455 * 455 *
456 * PARAMETER: sb - superblock 456 * PARAMETER: sb - superblock
457 * 457 *
458 * RETURN: none 458 * RETURN: none
459 */ 459 */
@@ -489,11 +489,11 @@ void txBeginAnon(struct super_block *sb)
489} 489}
490 490
491/* 491/*
492 * txEnd() 492 * txEnd()
493 * 493 *
494 * function: free specified transaction block. 494 * function: free specified transaction block.
495 * 495 *
496 * logsync barrier processing: 496 * logsync barrier processing:
497 * 497 *
498 * serialization: 498 * serialization:
499 */ 499 */
@@ -577,13 +577,13 @@ wakeup:
577} 577}
578 578
579/* 579/*
580 * txLock() 580 * txLock()
581 * 581 *
582 * function: acquire a transaction lock on the specified <mp> 582 * function: acquire a transaction lock on the specified <mp>
583 * 583 *
584 * parameter: 584 * parameter:
585 * 585 *
586 * return: transaction lock id 586 * return: transaction lock id
587 * 587 *
588 * serialization: 588 * serialization:
589 */ 589 */
@@ -829,12 +829,16 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
829 /* Only locks on ipimap or ipaimap should reach here */ 829 /* Only locks on ipimap or ipaimap should reach here */
830 /* assert(jfs_ip->fileset == AGGREGATE_I); */ 830 /* assert(jfs_ip->fileset == AGGREGATE_I); */
831 if (jfs_ip->fileset != AGGREGATE_I) { 831 if (jfs_ip->fileset != AGGREGATE_I) {
832 jfs_err("txLock: trying to lock locked page!"); 832 printk(KERN_ERR "txLock: trying to lock locked page!");
833 dump_mem("ip", ip, sizeof(struct inode)); 833 print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4,
834 dump_mem("mp", mp, sizeof(struct metapage)); 834 ip, sizeof(*ip), 0);
835 dump_mem("Locker's tblk", tid_to_tblock(tid), 835 print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4,
836 sizeof(struct tblock)); 836 mp, sizeof(*mp), 0);
837 dump_mem("Tlock", tlck, sizeof(struct tlock)); 837 print_hex_dump(KERN_ERR, "Locker's tblock: ",
838 DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid),
839 sizeof(struct tblock), 0);
840 print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4,
841 tlck, sizeof(*tlck), 0);
838 BUG(); 842 BUG();
839 } 843 }
840 INCREMENT(stattx.waitlock); /* statistics */ 844 INCREMENT(stattx.waitlock); /* statistics */
@@ -857,17 +861,17 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
857} 861}
858 862
859/* 863/*
860 * NAME: txRelease() 864 * NAME: txRelease()
861 * 865 *
862 * FUNCTION: Release buffers associated with transaction locks, but don't 866 * FUNCTION: Release buffers associated with transaction locks, but don't
863 * mark homeok yet. The allows other transactions to modify 867 * mark homeok yet. The allows other transactions to modify
864 * buffers, but won't let them go to disk until commit record 868 * buffers, but won't let them go to disk until commit record
865 * actually gets written. 869 * actually gets written.
866 * 870 *
867 * PARAMETER: 871 * PARAMETER:
868 * tblk - 872 * tblk -
869 * 873 *
870 * RETURN: Errors from subroutines. 874 * RETURN: Errors from subroutines.
871 */ 875 */
872static void txRelease(struct tblock * tblk) 876static void txRelease(struct tblock * tblk)
873{ 877{
@@ -896,10 +900,10 @@ static void txRelease(struct tblock * tblk)
896} 900}
897 901
898/* 902/*
899 * NAME: txUnlock() 903 * NAME: txUnlock()
900 * 904 *
901 * FUNCTION: Initiates pageout of pages modified by tid in journalled 905 * FUNCTION: Initiates pageout of pages modified by tid in journalled
902 * objects and frees their lockwords. 906 * objects and frees their lockwords.
903 */ 907 */
904static void txUnlock(struct tblock * tblk) 908static void txUnlock(struct tblock * tblk)
905{ 909{
@@ -983,10 +987,10 @@ static void txUnlock(struct tblock * tblk)
983} 987}
984 988
985/* 989/*
986 * txMaplock() 990 * txMaplock()
987 * 991 *
988 * function: allocate a transaction lock for freed page/entry; 992 * function: allocate a transaction lock for freed page/entry;
989 * for freed page, maplock is used as xtlock/dtlock type; 993 * for freed page, maplock is used as xtlock/dtlock type;
990 */ 994 */
991struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) 995struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
992{ 996{
@@ -1057,7 +1061,7 @@ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
1057} 1061}
1058 1062
1059/* 1063/*
1060 * txLinelock() 1064 * txLinelock()
1061 * 1065 *
1062 * function: allocate a transaction lock for log vector list 1066 * function: allocate a transaction lock for log vector list
1063 */ 1067 */
@@ -1092,39 +1096,39 @@ struct linelock *txLinelock(struct linelock * tlock)
1092} 1096}
1093 1097
1094/* 1098/*
1095 * transaction commit management 1099 * transaction commit management
1096 * ----------------------------- 1100 * -----------------------------
1097 */ 1101 */
1098 1102
1099/* 1103/*
1100 * NAME: txCommit() 1104 * NAME: txCommit()
1101 * 1105 *
1102 * FUNCTION: commit the changes to the objects specified in 1106 * FUNCTION: commit the changes to the objects specified in
1103 * clist. For journalled segments only the 1107 * clist. For journalled segments only the
1104 * changes of the caller are committed, ie by tid. 1108 * changes of the caller are committed, ie by tid.
1105 * for non-journalled segments the data are flushed to 1109 * for non-journalled segments the data are flushed to
1106 * disk and then the change to the disk inode and indirect 1110 * disk and then the change to the disk inode and indirect
1107 * blocks committed (so blocks newly allocated to the 1111 * blocks committed (so blocks newly allocated to the
1108 * segment will be made a part of the segment atomically). 1112 * segment will be made a part of the segment atomically).
1109 * 1113 *
1110 * all of the segments specified in clist must be in 1114 * all of the segments specified in clist must be in
1111 * one file system. no more than 6 segments are needed 1115 * one file system. no more than 6 segments are needed
1112 * to handle all unix svcs. 1116 * to handle all unix svcs.
1113 * 1117 *
1114 * if the i_nlink field (i.e. disk inode link count) 1118 * if the i_nlink field (i.e. disk inode link count)
1115 * is zero, and the type of inode is a regular file or 1119 * is zero, and the type of inode is a regular file or
1116 * directory, or symbolic link , the inode is truncated 1120 * directory, or symbolic link , the inode is truncated
1117 * to zero length. the truncation is committed but the 1121 * to zero length. the truncation is committed but the
1118 * VM resources are unaffected until it is closed (see 1122 * VM resources are unaffected until it is closed (see
1119 * iput and iclose). 1123 * iput and iclose).
1120 * 1124 *
1121 * PARAMETER: 1125 * PARAMETER:
1122 * 1126 *
1123 * RETURN: 1127 * RETURN:
1124 * 1128 *
1125 * serialization: 1129 * serialization:
1126 * on entry the inode lock on each segment is assumed 1130 * on entry the inode lock on each segment is assumed
1127 * to be held. 1131 * to be held.
1128 * 1132 *
1129 * i/o error: 1133 * i/o error:
1130 */ 1134 */
@@ -1175,7 +1179,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1175 if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) 1179 if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
1176 tblk->xflag |= COMMIT_LAZY; 1180 tblk->xflag |= COMMIT_LAZY;
1177 /* 1181 /*
1178 * prepare non-journaled objects for commit 1182 * prepare non-journaled objects for commit
1179 * 1183 *
1180 * flush data pages of non-journaled file 1184 * flush data pages of non-journaled file
1181 * to prevent the file getting non-initialized disk blocks 1185 * to prevent the file getting non-initialized disk blocks
@@ -1186,7 +1190,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1186 cd.nip = nip; 1190 cd.nip = nip;
1187 1191
1188 /* 1192 /*
1189 * acquire transaction lock on (on-disk) inodes 1193 * acquire transaction lock on (on-disk) inodes
1190 * 1194 *
1191 * update on-disk inode from in-memory inode 1195 * update on-disk inode from in-memory inode
1192 * acquiring transaction locks for AFTER records 1196 * acquiring transaction locks for AFTER records
@@ -1262,7 +1266,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1262 } 1266 }
1263 1267
1264 /* 1268 /*
1265 * write log records from transaction locks 1269 * write log records from transaction locks
1266 * 1270 *
1267 * txUpdateMap() resets XAD_NEW in XAD. 1271 * txUpdateMap() resets XAD_NEW in XAD.
1268 */ 1272 */
@@ -1294,7 +1298,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1294 !test_cflag(COMMIT_Nolink, tblk->u.ip))); 1298 !test_cflag(COMMIT_Nolink, tblk->u.ip)));
1295 1299
1296 /* 1300 /*
1297 * write COMMIT log record 1301 * write COMMIT log record
1298 */ 1302 */
1299 lrd->type = cpu_to_le16(LOG_COMMIT); 1303 lrd->type = cpu_to_le16(LOG_COMMIT);
1300 lrd->length = 0; 1304 lrd->length = 0;
@@ -1303,7 +1307,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1303 lmGroupCommit(log, tblk); 1307 lmGroupCommit(log, tblk);
1304 1308
1305 /* 1309 /*
1306 * - transaction is now committed - 1310 * - transaction is now committed -
1307 */ 1311 */
1308 1312
1309 /* 1313 /*
@@ -1314,11 +1318,11 @@ int txCommit(tid_t tid, /* transaction identifier */
1314 txForce(tblk); 1318 txForce(tblk);
1315 1319
1316 /* 1320 /*
1317 * update allocation map. 1321 * update allocation map.
1318 * 1322 *
1319 * update inode allocation map and inode: 1323 * update inode allocation map and inode:
1320 * free pager lock on memory object of inode if any. 1324 * free pager lock on memory object of inode if any.
1321 * update block allocation map. 1325 * update block allocation map.
1322 * 1326 *
1323 * txUpdateMap() resets XAD_NEW in XAD. 1327 * txUpdateMap() resets XAD_NEW in XAD.
1324 */ 1328 */
@@ -1326,7 +1330,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1326 txUpdateMap(tblk); 1330 txUpdateMap(tblk);
1327 1331
1328 /* 1332 /*
1329 * free transaction locks and pageout/free pages 1333 * free transaction locks and pageout/free pages
1330 */ 1334 */
1331 txRelease(tblk); 1335 txRelease(tblk);
1332 1336
@@ -1335,7 +1339,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1335 1339
1336 1340
1337 /* 1341 /*
1338 * reset in-memory object state 1342 * reset in-memory object state
1339 */ 1343 */
1340 for (k = 0; k < cd.nip; k++) { 1344 for (k = 0; k < cd.nip; k++) {
1341 ip = cd.iplist[k]; 1345 ip = cd.iplist[k];
@@ -1358,11 +1362,11 @@ int txCommit(tid_t tid, /* transaction identifier */
1358} 1362}
1359 1363
1360/* 1364/*
1361 * NAME: txLog() 1365 * NAME: txLog()
1362 * 1366 *
1363 * FUNCTION: Writes AFTER log records for all lines modified 1367 * FUNCTION: Writes AFTER log records for all lines modified
1364 * by tid for segments specified by inodes in comdata. 1368 * by tid for segments specified by inodes in comdata.
1365 * Code assumes only WRITELOCKS are recorded in lockwords. 1369 * Code assumes only WRITELOCKS are recorded in lockwords.
1366 * 1370 *
1367 * PARAMETERS: 1371 * PARAMETERS:
1368 * 1372 *
@@ -1421,12 +1425,12 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
1421} 1425}
1422 1426
1423/* 1427/*
1424 * diLog() 1428 * diLog()
1425 * 1429 *
1426 * function: log inode tlock and format maplock to update bmap; 1430 * function: log inode tlock and format maplock to update bmap;
1427 */ 1431 */
1428static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1432static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1429 struct tlock * tlck, struct commit * cd) 1433 struct tlock * tlck, struct commit * cd)
1430{ 1434{
1431 int rc = 0; 1435 int rc = 0;
1432 struct metapage *mp; 1436 struct metapage *mp;
@@ -1442,7 +1446,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1442 pxd = &lrd->log.redopage.pxd; 1446 pxd = &lrd->log.redopage.pxd;
1443 1447
1444 /* 1448 /*
1445 * inode after image 1449 * inode after image
1446 */ 1450 */
1447 if (tlck->type & tlckENTRY) { 1451 if (tlck->type & tlckENTRY) {
1448 /* log after-image for logredo(): */ 1452 /* log after-image for logredo(): */
@@ -1456,7 +1460,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1456 tlck->flag |= tlckWRITEPAGE; 1460 tlck->flag |= tlckWRITEPAGE;
1457 } else if (tlck->type & tlckFREE) { 1461 } else if (tlck->type & tlckFREE) {
1458 /* 1462 /*
1459 * free inode extent 1463 * free inode extent
1460 * 1464 *
1461 * (pages of the freed inode extent have been invalidated and 1465 * (pages of the freed inode extent have been invalidated and
1462 * a maplock for free of the extent has been formatted at 1466 * a maplock for free of the extent has been formatted at
@@ -1498,7 +1502,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1498 jfs_err("diLog: UFO type tlck:0x%p", tlck); 1502 jfs_err("diLog: UFO type tlck:0x%p", tlck);
1499#ifdef _JFS_WIP 1503#ifdef _JFS_WIP
1500 /* 1504 /*
1501 * alloc/free external EA extent 1505 * alloc/free external EA extent
1502 * 1506 *
1503 * a maplock for txUpdateMap() to update bPWMAP for alloc/free 1507 * a maplock for txUpdateMap() to update bPWMAP for alloc/free
1504 * of the extent has been formatted at txLock() time; 1508 * of the extent has been formatted at txLock() time;
@@ -1534,9 +1538,9 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1534} 1538}
1535 1539
1536/* 1540/*
1537 * dataLog() 1541 * dataLog()
1538 * 1542 *
1539 * function: log data tlock 1543 * function: log data tlock
1540 */ 1544 */
1541static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1545static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1542 struct tlock * tlck) 1546 struct tlock * tlck)
@@ -1580,9 +1584,9 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1580} 1584}
1581 1585
1582/* 1586/*
1583 * dtLog() 1587 * dtLog()
1584 * 1588 *
1585 * function: log dtree tlock and format maplock to update bmap; 1589 * function: log dtree tlock and format maplock to update bmap;
1586 */ 1590 */
1587static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1591static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1588 struct tlock * tlck) 1592 struct tlock * tlck)
@@ -1603,10 +1607,10 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1603 lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); 1607 lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1604 1608
1605 /* 1609 /*
1606 * page extension via relocation: entry insertion; 1610 * page extension via relocation: entry insertion;
1607 * page extension in-place: entry insertion; 1611 * page extension in-place: entry insertion;
1608 * new right page from page split, reinitialized in-line 1612 * new right page from page split, reinitialized in-line
1609 * root from root page split: entry insertion; 1613 * root from root page split: entry insertion;
1610 */ 1614 */
1611 if (tlck->type & (tlckNEW | tlckEXTEND)) { 1615 if (tlck->type & (tlckNEW | tlckEXTEND)) {
1612 /* log after-image of the new page for logredo(): 1616 /* log after-image of the new page for logredo():
@@ -1641,8 +1645,8 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1641 } 1645 }
1642 1646
1643 /* 1647 /*
1644 * entry insertion/deletion, 1648 * entry insertion/deletion,
1645 * sibling page link update (old right page before split); 1649 * sibling page link update (old right page before split);
1646 */ 1650 */
1647 if (tlck->type & (tlckENTRY | tlckRELINK)) { 1651 if (tlck->type & (tlckENTRY | tlckRELINK)) {
1648 /* log after-image for logredo(): */ 1652 /* log after-image for logredo(): */
@@ -1658,11 +1662,11 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1658 } 1662 }
1659 1663
1660 /* 1664 /*
1661 * page deletion: page has been invalidated 1665 * page deletion: page has been invalidated
1662 * page relocation: source extent 1666 * page relocation: source extent
1663 * 1667 *
1664 * a maplock for free of the page has been formatted 1668 * a maplock for free of the page has been formatted
1665 * at txLock() time); 1669 * at txLock() time);
1666 */ 1670 */
1667 if (tlck->type & (tlckFREE | tlckRELOCATE)) { 1671 if (tlck->type & (tlckFREE | tlckRELOCATE)) {
1668 /* log LOG_NOREDOPAGE of the deleted page for logredo() 1672 /* log LOG_NOREDOPAGE of the deleted page for logredo()
@@ -1683,9 +1687,9 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1683} 1687}
1684 1688
1685/* 1689/*
1686 * xtLog() 1690 * xtLog()
1687 * 1691 *
1688 * function: log xtree tlock and format maplock to update bmap; 1692 * function: log xtree tlock and format maplock to update bmap;
1689 */ 1693 */
1690static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 1694static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1691 struct tlock * tlck) 1695 struct tlock * tlck)
@@ -1725,8 +1729,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1725 xadlock = (struct xdlistlock *) maplock; 1729 xadlock = (struct xdlistlock *) maplock;
1726 1730
1727 /* 1731 /*
1728 * entry insertion/extension; 1732 * entry insertion/extension;
1729 * sibling page link update (old right page before split); 1733 * sibling page link update (old right page before split);
1730 */ 1734 */
1731 if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { 1735 if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
1732 /* log after-image for logredo(): 1736 /* log after-image for logredo():
@@ -1801,7 +1805,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1801 } 1805 }
1802 1806
1803 /* 1807 /*
1804 * page deletion: file deletion/truncation (ref. xtTruncate()) 1808 * page deletion: file deletion/truncation (ref. xtTruncate())
1805 * 1809 *
1806 * (page will be invalidated after log is written and bmap 1810 * (page will be invalidated after log is written and bmap
1807 * is updated from the page); 1811 * is updated from the page);
@@ -1908,13 +1912,13 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1908 } 1912 }
1909 1913
1910 /* 1914 /*
1911 * page/entry truncation: file truncation (ref. xtTruncate()) 1915 * page/entry truncation: file truncation (ref. xtTruncate())
1912 * 1916 *
1913 * |----------+------+------+---------------| 1917 * |----------+------+------+---------------|
1914 * | | | 1918 * | | |
1915 * | | hwm - hwm before truncation 1919 * | | hwm - hwm before truncation
1916 * | next - truncation point 1920 * | next - truncation point
1917 * lwm - lwm before truncation 1921 * lwm - lwm before truncation
1918 * header ? 1922 * header ?
1919 */ 1923 */
1920 if (tlck->type & tlckTRUNCATE) { 1924 if (tlck->type & tlckTRUNCATE) {
@@ -1937,7 +1941,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1937 twm = xtlck->twm.offset; 1941 twm = xtlck->twm.offset;
1938 1942
1939 /* 1943 /*
1940 * write log records 1944 * write log records
1941 */ 1945 */
1942 /* log after-image for logredo(): 1946 /* log after-image for logredo():
1943 * 1947 *
@@ -1997,7 +2001,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1997 } 2001 }
1998 2002
1999 /* 2003 /*
2000 * format maplock(s) for txUpdateMap() to update bmap 2004 * format maplock(s) for txUpdateMap() to update bmap
2001 */ 2005 */
2002 maplock->index = 0; 2006 maplock->index = 0;
2003 2007
@@ -2069,9 +2073,9 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2069} 2073}
2070 2074
2071/* 2075/*
2072 * mapLog() 2076 * mapLog()
2073 * 2077 *
2074 * function: log from maplock of freed data extents; 2078 * function: log from maplock of freed data extents;
2075 */ 2079 */
2076static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, 2080static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2077 struct tlock * tlck) 2081 struct tlock * tlck)
@@ -2081,7 +2085,7 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2081 pxd_t *pxd; 2085 pxd_t *pxd;
2082 2086
2083 /* 2087 /*
2084 * page relocation: free the source page extent 2088 * page relocation: free the source page extent
2085 * 2089 *
2086 * a maplock for txUpdateMap() for free of the page 2090 * a maplock for txUpdateMap() for free of the page
2087 * has been formatted at txLock() time saving the src 2091 * has been formatted at txLock() time saving the src
@@ -2155,10 +2159,10 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2155} 2159}
2156 2160
2157/* 2161/*
2158 * txEA() 2162 * txEA()
2159 * 2163 *
2160 * function: acquire maplock for EA/ACL extents or 2164 * function: acquire maplock for EA/ACL extents or
2161 * set COMMIT_INLINE flag; 2165 * set COMMIT_INLINE flag;
2162 */ 2166 */
2163void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) 2167void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2164{ 2168{
@@ -2207,10 +2211,10 @@ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2207} 2211}
2208 2212
2209/* 2213/*
2210 * txForce() 2214 * txForce()
2211 * 2215 *
2212 * function: synchronously write pages locked by transaction 2216 * function: synchronously write pages locked by transaction
2213 * after txLog() but before txUpdateMap(); 2217 * after txLog() but before txUpdateMap();
2214 */ 2218 */
2215static void txForce(struct tblock * tblk) 2219static void txForce(struct tblock * tblk)
2216{ 2220{
@@ -2273,10 +2277,10 @@ static void txForce(struct tblock * tblk)
2273} 2277}
2274 2278
2275/* 2279/*
2276 * txUpdateMap() 2280 * txUpdateMap()
2277 * 2281 *
2278 * function: update persistent allocation map (and working map 2282 * function: update persistent allocation map (and working map
2279 * if appropriate); 2283 * if appropriate);
2280 * 2284 *
2281 * parameter: 2285 * parameter:
2282 */ 2286 */
@@ -2298,7 +2302,7 @@ static void txUpdateMap(struct tblock * tblk)
2298 2302
2299 2303
2300 /* 2304 /*
2301 * update block allocation map 2305 * update block allocation map
2302 * 2306 *
2303 * update allocation state in pmap (and wmap) and 2307 * update allocation state in pmap (and wmap) and
2304 * update lsn of the pmap page; 2308 * update lsn of the pmap page;
@@ -2382,7 +2386,7 @@ static void txUpdateMap(struct tblock * tblk)
2382 } 2386 }
2383 } 2387 }
2384 /* 2388 /*
2385 * update inode allocation map 2389 * update inode allocation map
2386 * 2390 *
2387 * update allocation state in pmap and 2391 * update allocation state in pmap and
2388 * update lsn of the pmap page; 2392 * update lsn of the pmap page;
@@ -2407,24 +2411,24 @@ static void txUpdateMap(struct tblock * tblk)
2407} 2411}
2408 2412
2409/* 2413/*
2410 * txAllocPMap() 2414 * txAllocPMap()
2411 * 2415 *
2412 * function: allocate from persistent map; 2416 * function: allocate from persistent map;
2413 * 2417 *
2414 * parameter: 2418 * parameter:
2415 * ipbmap - 2419 * ipbmap -
2416 * malock - 2420 * malock -
2417 * xad list: 2421 * xad list:
2418 * pxd: 2422 * pxd:
2419 * 2423 *
2420 * maptype - 2424 * maptype -
2421 * allocate from persistent map; 2425 * allocate from persistent map;
2422 * free from persistent map; 2426 * free from persistent map;
2423 * (e.g., tmp file - free from working map at releae 2427 * (e.g., tmp file - free from working map at releae
2424 * of last reference); 2428 * of last reference);
2425 * free from persistent and working map; 2429 * free from persistent and working map;
2426 * 2430 *
2427 * lsn - log sequence number; 2431 * lsn - log sequence number;
2428 */ 2432 */
2429static void txAllocPMap(struct inode *ip, struct maplock * maplock, 2433static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2430 struct tblock * tblk) 2434 struct tblock * tblk)
@@ -2478,9 +2482,9 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2478} 2482}
2479 2483
2480/* 2484/*
2481 * txFreeMap() 2485 * txFreeMap()
2482 * 2486 *
2483 * function: free from persistent and/or working map; 2487 * function: free from persistent and/or working map;
2484 * 2488 *
2485 * todo: optimization 2489 * todo: optimization
2486 */ 2490 */
@@ -2579,9 +2583,9 @@ void txFreeMap(struct inode *ip,
2579} 2583}
2580 2584
2581/* 2585/*
2582 * txFreelock() 2586 * txFreelock()
2583 * 2587 *
2584 * function: remove tlock from inode anonymous locklist 2588 * function: remove tlock from inode anonymous locklist
2585 */ 2589 */
2586void txFreelock(struct inode *ip) 2590void txFreelock(struct inode *ip)
2587{ 2591{
@@ -2619,7 +2623,7 @@ void txFreelock(struct inode *ip)
2619} 2623}
2620 2624
2621/* 2625/*
2622 * txAbort() 2626 * txAbort()
2623 * 2627 *
2624 * function: abort tx before commit; 2628 * function: abort tx before commit;
2625 * 2629 *
@@ -2679,7 +2683,7 @@ void txAbort(tid_t tid, int dirty)
2679} 2683}
2680 2684
2681/* 2685/*
2682 * txLazyCommit(void) 2686 * txLazyCommit(void)
2683 * 2687 *
2684 * All transactions except those changing ipimap (COMMIT_FORCE) are 2688 * All transactions except those changing ipimap (COMMIT_FORCE) are
2685 * processed by this routine. This insures that the inode and block 2689 * processed by this routine. This insures that the inode and block
@@ -2728,7 +2732,7 @@ static void txLazyCommit(struct tblock * tblk)
2728} 2732}
2729 2733
2730/* 2734/*
2731 * jfs_lazycommit(void) 2735 * jfs_lazycommit(void)
2732 * 2736 *
2733 * To be run as a kernel daemon. If lbmIODone is called in an interrupt 2737 * To be run as a kernel daemon. If lbmIODone is called in an interrupt
2734 * context, or where blocking is not wanted, this routine will process 2738 * context, or where blocking is not wanted, this routine will process
@@ -2913,7 +2917,7 @@ void txResume(struct super_block *sb)
2913} 2917}
2914 2918
2915/* 2919/*
2916 * jfs_sync(void) 2920 * jfs_sync(void)
2917 * 2921 *
2918 * To be run as a kernel daemon. This is awakened when tlocks run low. 2922 * To be run as a kernel daemon. This is awakened when tlocks run low.
2919 * We write any inodes that have anonymous tlocks so they will become 2923 * We write any inodes that have anonymous tlocks so they will become
diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h
index 7863cf21af..ab72889370 100644
--- a/fs/jfs/jfs_txnmgr.h
+++ b/fs/jfs/jfs_txnmgr.h
@@ -94,7 +94,7 @@ extern struct tblock *TxBlock; /* transaction block table */
94 */ 94 */
95struct tlock { 95struct tlock {
96 lid_t next; /* 2: index next lockword on tid locklist 96 lid_t next; /* 2: index next lockword on tid locklist
97 * next lockword on freelist 97 * next lockword on freelist
98 */ 98 */
99 tid_t tid; /* 2: transaction id holding lock */ 99 tid_t tid; /* 2: transaction id holding lock */
100 100
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 09b2529586..649f9817ac 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -21,7 +21,7 @@
21/* 21/*
22 * jfs_types.h: 22 * jfs_types.h:
23 * 23 *
24 * basic type/utility definitions 24 * basic type/utility definitions
25 * 25 *
26 * note: this header file must be the 1st include file 26 * note: this header file must be the 1st include file
27 * of JFS include list in all JFS .c file. 27 * of JFS include list in all JFS .c file.
@@ -54,8 +54,8 @@ struct timestruc_t {
54 */ 54 */
55 55
56#define LEFTMOSTONE 0x80000000 56#define LEFTMOSTONE 0x80000000
57#define HIGHORDER 0x80000000u /* high order bit on */ 57#define HIGHORDER 0x80000000u /* high order bit on */
58#define ONES 0xffffffffu /* all bit on */ 58#define ONES 0xffffffffu /* all bit on */
59 59
60/* 60/*
61 * logical xd (lxd) 61 * logical xd (lxd)
@@ -148,7 +148,7 @@ typedef struct {
148#define sizeDXD(dxd) le32_to_cpu((dxd)->size) 148#define sizeDXD(dxd) le32_to_cpu((dxd)->size)
149 149
150/* 150/*
151 * directory entry argument 151 * directory entry argument
152 */ 152 */
153struct component_name { 153struct component_name {
154 int namlen; 154 int namlen;
@@ -160,14 +160,14 @@ struct component_name {
160 * DASD limit information - stored in directory inode 160 * DASD limit information - stored in directory inode
161 */ 161 */
162struct dasd { 162struct dasd {
163 u8 thresh; /* Alert Threshold (in percent) */ 163 u8 thresh; /* Alert Threshold (in percent) */
164 u8 delta; /* Alert Threshold delta (in percent) */ 164 u8 delta; /* Alert Threshold delta (in percent) */
165 u8 rsrvd1; 165 u8 rsrvd1;
166 u8 limit_hi; /* DASD limit (in logical blocks) */ 166 u8 limit_hi; /* DASD limit (in logical blocks) */
167 __le32 limit_lo; /* DASD limit (in logical blocks) */ 167 __le32 limit_lo; /* DASD limit (in logical blocks) */
168 u8 rsrvd2[3]; 168 u8 rsrvd2[3];
169 u8 used_hi; /* DASD usage (in logical blocks) */ 169 u8 used_hi; /* DASD usage (in logical blocks) */
170 __le32 used_lo; /* DASD usage (in logical blocks) */ 170 __le32 used_lo; /* DASD usage (in logical blocks) */
171}; 171};
172 172
173#define DASDLIMIT(dasdp) \ 173#define DASDLIMIT(dasdp) \
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index a386f48c73..7971f37534 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -60,7 +60,7 @@ int jfs_umount(struct super_block *sb)
60 jfs_info("UnMount JFS: sb:0x%p", sb); 60 jfs_info("UnMount JFS: sb:0x%p", sb);
61 61
62 /* 62 /*
63 * update superblock and close log 63 * update superblock and close log
64 * 64 *
65 * if mounted read-write and log based recovery was enabled 65 * if mounted read-write and log based recovery was enabled
66 */ 66 */
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index acc97c46d8..1543906a2e 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -16,7 +16,7 @@
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */ 17 */
18/* 18/*
19 * jfs_xtree.c: extent allocation descriptor B+-tree manager 19 * jfs_xtree.c: extent allocation descriptor B+-tree manager
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
@@ -32,30 +32,30 @@
32/* 32/*
33 * xtree local flag 33 * xtree local flag
34 */ 34 */
35#define XT_INSERT 0x00000001 35#define XT_INSERT 0x00000001
36 36
37/* 37/*
38 * xtree key/entry comparison: extent offset 38 * xtree key/entry comparison: extent offset
39 * 39 *
40 * return: 40 * return:
41 * -1: k < start of extent 41 * -1: k < start of extent
42 * 0: start_of_extent <= k <= end_of_extent 42 * 0: start_of_extent <= k <= end_of_extent
43 * 1: k > end_of_extent 43 * 1: k > end_of_extent
44 */ 44 */
45#define XT_CMP(CMP, K, X, OFFSET64)\ 45#define XT_CMP(CMP, K, X, OFFSET64)\
46{\ 46{\
47 OFFSET64 = offsetXAD(X);\ 47 OFFSET64 = offsetXAD(X);\
48 (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ 48 (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
49 ((K) < OFFSET64) ? -1 : 0;\ 49 ((K) < OFFSET64) ? -1 : 0;\
50} 50}
51 51
52/* write a xad entry */ 52/* write a xad entry */
53#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ 53#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
54{\ 54{\
55 (XAD)->flag = (FLAG);\ 55 (XAD)->flag = (FLAG);\
56 XADoffset((XAD), (OFF));\ 56 XADoffset((XAD), (OFF));\
57 XADlength((XAD), (LEN));\ 57 XADlength((XAD), (LEN));\
58 XADaddress((XAD), (ADDR));\ 58 XADaddress((XAD), (ADDR));\
59} 59}
60 60
61#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) 61#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
@@ -76,13 +76,13 @@
76 MP = NULL;\ 76 MP = NULL;\
77 RC = -EIO;\ 77 RC = -EIO;\
78 }\ 78 }\
79 }\ 79 }\
80} 80}
81 81
82/* for consistency */ 82/* for consistency */
83#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) 83#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
84 84
85#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ 85#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
86 BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) 86 BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
87/* xtree entry parameter descriptor */ 87/* xtree entry parameter descriptor */
88struct xtsplit { 88struct xtsplit {
@@ -97,7 +97,7 @@ struct xtsplit {
97 97
98 98
99/* 99/*
100 * statistics 100 * statistics
101 */ 101 */
102#ifdef CONFIG_JFS_STATISTICS 102#ifdef CONFIG_JFS_STATISTICS
103static struct { 103static struct {
@@ -136,7 +136,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
136#endif /* _STILL_TO_PORT */ 136#endif /* _STILL_TO_PORT */
137 137
138/* 138/*
139 * xtLookup() 139 * xtLookup()
140 * 140 *
141 * function: map a single page into a physical extent; 141 * function: map a single page into a physical extent;
142 */ 142 */
@@ -179,7 +179,7 @@ int xtLookup(struct inode *ip, s64 lstart,
179 } 179 }
180 180
181 /* 181 /*
182 * compute the physical extent covering logical extent 182 * compute the physical extent covering logical extent
183 * 183 *
184 * N.B. search may have failed (e.g., hole in sparse file), 184 * N.B. search may have failed (e.g., hole in sparse file),
185 * and returned the index of the next entry. 185 * and returned the index of the next entry.
@@ -220,27 +220,27 @@ int xtLookup(struct inode *ip, s64 lstart,
220 220
221 221
222/* 222/*
223 * xtLookupList() 223 * xtLookupList()
224 * 224 *
225 * function: map a single logical extent into a list of physical extent; 225 * function: map a single logical extent into a list of physical extent;
226 * 226 *
227 * parameter: 227 * parameter:
228 * struct inode *ip, 228 * struct inode *ip,
229 * struct lxdlist *lxdlist, lxd list (in) 229 * struct lxdlist *lxdlist, lxd list (in)
230 * struct xadlist *xadlist, xad list (in/out) 230 * struct xadlist *xadlist, xad list (in/out)
231 * int flag) 231 * int flag)
232 * 232 *
233 * coverage of lxd by xad under assumption of 233 * coverage of lxd by xad under assumption of
234 * . lxd's are ordered and disjoint. 234 * . lxd's are ordered and disjoint.
235 * . xad's are ordered and disjoint. 235 * . xad's are ordered and disjoint.
236 * 236 *
237 * return: 237 * return:
238 * 0: success 238 * 0: success
239 * 239 *
240 * note: a page being written (even a single byte) is backed fully, 240 * note: a page being written (even a single byte) is backed fully,
241 * except the last page which is only backed with blocks 241 * except the last page which is only backed with blocks
242 * required to cover the last byte; 242 * required to cover the last byte;
243 * the extent backing a page is fully contained within an xad; 243 * the extent backing a page is fully contained within an xad;
244 */ 244 */
245int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, 245int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
246 struct xadlist * xadlist, int flag) 246 struct xadlist * xadlist, int flag)
@@ -284,7 +284,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
284 return rc; 284 return rc;
285 285
286 /* 286 /*
287 * compute the physical extent covering logical extent 287 * compute the physical extent covering logical extent
288 * 288 *
289 * N.B. search may have failed (e.g., hole in sparse file), 289 * N.B. search may have failed (e.g., hole in sparse file),
290 * and returned the index of the next entry. 290 * and returned the index of the next entry.
@@ -343,7 +343,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
343 if (lstart >= size) 343 if (lstart >= size)
344 goto mapend; 344 goto mapend;
345 345
346 /* compare with the current xad */ 346 /* compare with the current xad */
347 goto compare1; 347 goto compare1;
348 } 348 }
349 /* lxd is covered by xad */ 349 /* lxd is covered by xad */
@@ -430,7 +430,7 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
430 /* 430 /*
431 * lxd is partially covered by xad 431 * lxd is partially covered by xad
432 */ 432 */
433 else { /* (xend < lend) */ 433 else { /* (xend < lend) */
434 434
435 /* 435 /*
436 * get next xad 436 * get next xad
@@ -477,22 +477,22 @@ int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
477 477
478 478
479/* 479/*
480 * xtSearch() 480 * xtSearch()
481 * 481 *
482 * function: search for the xad entry covering specified offset. 482 * function: search for the xad entry covering specified offset.
483 * 483 *
484 * parameters: 484 * parameters:
485 * ip - file object; 485 * ip - file object;
486 * xoff - extent offset; 486 * xoff - extent offset;
487 * nextp - address of next extent (if any) for search miss 487 * nextp - address of next extent (if any) for search miss
488 * cmpp - comparison result: 488 * cmpp - comparison result:
489 * btstack - traverse stack; 489 * btstack - traverse stack;
490 * flag - search process flag (XT_INSERT); 490 * flag - search process flag (XT_INSERT);
491 * 491 *
492 * returns: 492 * returns:
493 * btstack contains (bn, index) of search path traversed to the entry. 493 * btstack contains (bn, index) of search path traversed to the entry.
494 * *cmpp is set to result of comparison with the entry returned. 494 * *cmpp is set to result of comparison with the entry returned.
495 * the page containing the entry is pinned at exit. 495 * the page containing the entry is pinned at exit.
496 */ 496 */
497static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, 497static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
498 int *cmpp, struct btstack * btstack, int flag) 498 int *cmpp, struct btstack * btstack, int flag)
@@ -517,7 +517,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
517 btstack->nsplit = 0; 517 btstack->nsplit = 0;
518 518
519 /* 519 /*
520 * search down tree from root: 520 * search down tree from root:
521 * 521 *
522 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 522 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
523 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 523 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
@@ -642,7 +642,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
642 XT_CMP(cmp, xoff, &p->xad[index], t64); 642 XT_CMP(cmp, xoff, &p->xad[index], t64);
643 if (cmp == 0) { 643 if (cmp == 0) {
644 /* 644 /*
645 * search hit 645 * search hit
646 */ 646 */
647 /* search hit - leaf page: 647 /* search hit - leaf page:
648 * return the entry found 648 * return the entry found
@@ -692,7 +692,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
692 } 692 }
693 693
694 /* 694 /*
695 * search miss 695 * search miss
696 * 696 *
697 * base is the smallest index with key (Kj) greater than 697 * base is the smallest index with key (Kj) greater than
698 * search key (K) and may be zero or maxentry index. 698 * search key (K) and may be zero or maxentry index.
@@ -773,22 +773,22 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
773} 773}
774 774
775/* 775/*
776 * xtInsert() 776 * xtInsert()
777 * 777 *
778 * function: 778 * function:
779 * 779 *
780 * parameter: 780 * parameter:
781 * tid - transaction id; 781 * tid - transaction id;
782 * ip - file object; 782 * ip - file object;
783 * xflag - extent flag (XAD_NOTRECORDED): 783 * xflag - extent flag (XAD_NOTRECORDED):
784 * xoff - extent offset; 784 * xoff - extent offset;
785 * xlen - extent length; 785 * xlen - extent length;
786 * xaddrp - extent address pointer (in/out): 786 * xaddrp - extent address pointer (in/out):
787 * if (*xaddrp) 787 * if (*xaddrp)
788 * caller allocated data extent at *xaddrp; 788 * caller allocated data extent at *xaddrp;
789 * else 789 * else
790 * allocate data extent and return its xaddr; 790 * allocate data extent and return its xaddr;
791 * flag - 791 * flag -
792 * 792 *
793 * return: 793 * return:
794 */ 794 */
@@ -813,7 +813,7 @@ int xtInsert(tid_t tid, /* transaction id */
813 jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); 813 jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
814 814
815 /* 815 /*
816 * search for the entry location at which to insert: 816 * search for the entry location at which to insert:
817 * 817 *
818 * xtFastSearch() and xtSearch() both returns (leaf page 818 * xtFastSearch() and xtSearch() both returns (leaf page
819 * pinned, index at which to insert). 819 * pinned, index at which to insert).
@@ -853,13 +853,13 @@ int xtInsert(tid_t tid, /* transaction id */
853 } 853 }
854 854
855 /* 855 /*
856 * insert entry for new extent 856 * insert entry for new extent
857 */ 857 */
858 xflag |= XAD_NEW; 858 xflag |= XAD_NEW;
859 859
860 /* 860 /*
861 * if the leaf page is full, split the page and 861 * if the leaf page is full, split the page and
862 * propagate up the router entry for the new page from split 862 * propagate up the router entry for the new page from split
863 * 863 *
864 * The xtSplitUp() will insert the entry and unpin the leaf page. 864 * The xtSplitUp() will insert the entry and unpin the leaf page.
865 */ 865 */
@@ -886,7 +886,7 @@ int xtInsert(tid_t tid, /* transaction id */
886 } 886 }
887 887
888 /* 888 /*
889 * insert the new entry into the leaf page 889 * insert the new entry into the leaf page
890 */ 890 */
891 /* 891 /*
892 * acquire a transaction lock on the leaf page; 892 * acquire a transaction lock on the leaf page;
@@ -930,16 +930,16 @@ int xtInsert(tid_t tid, /* transaction id */
930 930
931 931
932/* 932/*
933 * xtSplitUp() 933 * xtSplitUp()
934 * 934 *
935 * function: 935 * function:
936 * split full pages as propagating insertion up the tree 936 * split full pages as propagating insertion up the tree
937 * 937 *
938 * parameter: 938 * parameter:
939 * tid - transaction id; 939 * tid - transaction id;
940 * ip - file object; 940 * ip - file object;
941 * split - entry parameter descriptor; 941 * split - entry parameter descriptor;
942 * btstack - traverse stack from xtSearch() 942 * btstack - traverse stack from xtSearch()
943 * 943 *
944 * return: 944 * return:
945 */ 945 */
@@ -1199,22 +1199,22 @@ xtSplitUp(tid_t tid,
1199 1199
1200 1200
1201/* 1201/*
1202 * xtSplitPage() 1202 * xtSplitPage()
1203 * 1203 *
1204 * function: 1204 * function:
1205 * split a full non-root page into 1205 * split a full non-root page into
1206 * original/split/left page and new right page 1206 * original/split/left page and new right page
1207 * i.e., the original/split page remains as left page. 1207 * i.e., the original/split page remains as left page.
1208 * 1208 *
1209 * parameter: 1209 * parameter:
1210 * int tid, 1210 * int tid,
1211 * struct inode *ip, 1211 * struct inode *ip,
1212 * struct xtsplit *split, 1212 * struct xtsplit *split,
1213 * struct metapage **rmpp, 1213 * struct metapage **rmpp,
1214 * u64 *rbnp, 1214 * u64 *rbnp,
1215 * 1215 *
1216 * return: 1216 * return:
1217 * Pointer to page in which to insert or NULL on error. 1217 * Pointer to page in which to insert or NULL on error.
1218 */ 1218 */
1219static int 1219static int
1220xtSplitPage(tid_t tid, struct inode *ip, 1220xtSplitPage(tid_t tid, struct inode *ip,
@@ -1248,9 +1248,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
1248 rbn = addressPXD(pxd); 1248 rbn = addressPXD(pxd);
1249 1249
1250 /* Allocate blocks to quota. */ 1250 /* Allocate blocks to quota. */
1251 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { 1251 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
1252 rc = -EDQUOT; 1252 rc = -EDQUOT;
1253 goto clean_up; 1253 goto clean_up;
1254 } 1254 }
1255 1255
1256 quota_allocation += lengthPXD(pxd); 1256 quota_allocation += lengthPXD(pxd);
@@ -1304,7 +1304,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1304 skip = split->index; 1304 skip = split->index;
1305 1305
1306 /* 1306 /*
1307 * sequential append at tail (after last entry of last page) 1307 * sequential append at tail (after last entry of last page)
1308 * 1308 *
1309 * if splitting the last page on a level because of appending 1309 * if splitting the last page on a level because of appending
1310 * a entry to it (skip is maxentry), it's likely that the access is 1310 * a entry to it (skip is maxentry), it's likely that the access is
@@ -1342,7 +1342,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1342 } 1342 }
1343 1343
1344 /* 1344 /*
1345 * non-sequential insert (at possibly middle page) 1345 * non-sequential insert (at possibly middle page)
1346 */ 1346 */
1347 1347
1348 /* 1348 /*
@@ -1465,25 +1465,24 @@ xtSplitPage(tid_t tid, struct inode *ip,
1465 1465
1466 1466
1467/* 1467/*
1468 * xtSplitRoot() 1468 * xtSplitRoot()
1469 * 1469 *
1470 * function: 1470 * function:
1471 * split the full root page into 1471 * split the full root page into original/root/split page and new
1472 * original/root/split page and new right page 1472 * right page
1473 * i.e., root remains fixed in tree anchor (inode) and 1473 * i.e., root remains fixed in tree anchor (inode) and the root is
1474 * the root is copied to a single new right child page 1474 * copied to a single new right child page since root page <<
1475 * since root page << non-root page, and 1475 * non-root page, and the split root page contains a single entry
1476 * the split root page contains a single entry for the 1476 * for the new right child page.
1477 * new right child page.
1478 * 1477 *
1479 * parameter: 1478 * parameter:
1480 * int tid, 1479 * int tid,
1481 * struct inode *ip, 1480 * struct inode *ip,
1482 * struct xtsplit *split, 1481 * struct xtsplit *split,
1483 * struct metapage **rmpp) 1482 * struct metapage **rmpp)
1484 * 1483 *
1485 * return: 1484 * return:
1486 * Pointer to page in which to insert or NULL on error. 1485 * Pointer to page in which to insert or NULL on error.
1487 */ 1486 */
1488static int 1487static int
1489xtSplitRoot(tid_t tid, 1488xtSplitRoot(tid_t tid,
@@ -1505,7 +1504,7 @@ xtSplitRoot(tid_t tid,
1505 INCREMENT(xtStat.split); 1504 INCREMENT(xtStat.split);
1506 1505
1507 /* 1506 /*
1508 * allocate a single (right) child page 1507 * allocate a single (right) child page
1509 */ 1508 */
1510 pxdlist = split->pxdlist; 1509 pxdlist = split->pxdlist;
1511 pxd = &pxdlist->pxd[pxdlist->npxd]; 1510 pxd = &pxdlist->pxd[pxdlist->npxd];
@@ -1573,7 +1572,7 @@ xtSplitRoot(tid_t tid,
1573 } 1572 }
1574 1573
1575 /* 1574 /*
1576 * reset the root 1575 * reset the root
1577 * 1576 *
1578 * init root with the single entry for the new right page 1577 * init root with the single entry for the new right page
1579 * set the 1st entry offset to 0, which force the left-most key 1578 * set the 1st entry offset to 0, which force the left-most key
@@ -1610,7 +1609,7 @@ xtSplitRoot(tid_t tid,
1610 1609
1611 1610
1612/* 1611/*
1613 * xtExtend() 1612 * xtExtend()
1614 * 1613 *
1615 * function: extend in-place; 1614 * function: extend in-place;
1616 * 1615 *
@@ -1677,7 +1676,7 @@ int xtExtend(tid_t tid, /* transaction id */
1677 goto extendOld; 1676 goto extendOld;
1678 1677
1679 /* 1678 /*
1680 * extent overflow: insert entry for new extent 1679 * extent overflow: insert entry for new extent
1681 */ 1680 */
1682//insertNew: 1681//insertNew:
1683 xoff = offsetXAD(xad) + MAXXLEN; 1682 xoff = offsetXAD(xad) + MAXXLEN;
@@ -1685,8 +1684,8 @@ int xtExtend(tid_t tid, /* transaction id */
1685 nextindex = le16_to_cpu(p->header.nextindex); 1684 nextindex = le16_to_cpu(p->header.nextindex);
1686 1685
1687 /* 1686 /*
1688 * if the leaf page is full, insert the new entry and 1687 * if the leaf page is full, insert the new entry and
1689 * propagate up the router entry for the new page from split 1688 * propagate up the router entry for the new page from split
1690 * 1689 *
1691 * The xtSplitUp() will insert the entry and unpin the leaf page. 1690 * The xtSplitUp() will insert the entry and unpin the leaf page.
1692 */ 1691 */
@@ -1731,7 +1730,7 @@ int xtExtend(tid_t tid, /* transaction id */
1731 } 1730 }
1732 } 1731 }
1733 /* 1732 /*
1734 * insert the new entry into the leaf page 1733 * insert the new entry into the leaf page
1735 */ 1734 */
1736 else { 1735 else {
1737 /* insert the new entry: mark the entry NEW */ 1736 /* insert the new entry: mark the entry NEW */
@@ -1771,11 +1770,11 @@ int xtExtend(tid_t tid, /* transaction id */
1771 1770
1772#ifdef _NOTYET 1771#ifdef _NOTYET
1773/* 1772/*
1774 * xtTailgate() 1773 * xtTailgate()
1775 * 1774 *
1776 * function: split existing 'tail' extent 1775 * function: split existing 'tail' extent
1777 * (split offset >= start offset of tail extent), and 1776 * (split offset >= start offset of tail extent), and
1778 * relocate and extend the split tail half; 1777 * relocate and extend the split tail half;
1779 * 1778 *
1780 * note: existing extent may or may not have been committed. 1779 * note: existing extent may or may not have been committed.
1781 * caller is responsible for pager buffer cache update, and 1780 * caller is responsible for pager buffer cache update, and
@@ -1804,7 +1803,7 @@ int xtTailgate(tid_t tid, /* transaction id */
1804 1803
1805/* 1804/*
1806printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", 1805printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1807 (ulong)xoff, xlen, (ulong)xaddr); 1806 (ulong)xoff, xlen, (ulong)xaddr);
1808*/ 1807*/
1809 1808
1810 /* there must exist extent to be tailgated */ 1809 /* there must exist extent to be tailgated */
@@ -1842,18 +1841,18 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1842 xad = &p->xad[index]; 1841 xad = &p->xad[index];
1843/* 1842/*
1844printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", 1843printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1845 (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); 1844 (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
1846*/ 1845*/
1847 if ((llen = xoff - offsetXAD(xad)) == 0) 1846 if ((llen = xoff - offsetXAD(xad)) == 0)
1848 goto updateOld; 1847 goto updateOld;
1849 1848
1850 /* 1849 /*
1851 * partially replace extent: insert entry for new extent 1850 * partially replace extent: insert entry for new extent
1852 */ 1851 */
1853//insertNew: 1852//insertNew:
1854 /* 1853 /*
1855 * if the leaf page is full, insert the new entry and 1854 * if the leaf page is full, insert the new entry and
1856 * propagate up the router entry for the new page from split 1855 * propagate up the router entry for the new page from split
1857 * 1856 *
1858 * The xtSplitUp() will insert the entry and unpin the leaf page. 1857 * The xtSplitUp() will insert the entry and unpin the leaf page.
1859 */ 1858 */
@@ -1898,7 +1897,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1898 } 1897 }
1899 } 1898 }
1900 /* 1899 /*
1901 * insert the new entry into the leaf page 1900 * insert the new entry into the leaf page
1902 */ 1901 */
1903 else { 1902 else {
1904 /* insert the new entry: mark the entry NEW */ 1903 /* insert the new entry: mark the entry NEW */
@@ -1955,17 +1954,17 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1955#endif /* _NOTYET */ 1954#endif /* _NOTYET */
1956 1955
1957/* 1956/*
1958 * xtUpdate() 1957 * xtUpdate()
1959 * 1958 *
1960 * function: update XAD; 1959 * function: update XAD;
1961 * 1960 *
1962 * update extent for allocated_but_not_recorded or 1961 * update extent for allocated_but_not_recorded or
1963 * compressed extent; 1962 * compressed extent;
1964 * 1963 *
1965 * parameter: 1964 * parameter:
1966 * nxad - new XAD; 1965 * nxad - new XAD;
1967 * logical extent of the specified XAD must be completely 1966 * logical extent of the specified XAD must be completely
1968 * contained by an existing XAD; 1967 * contained by an existing XAD;
1969 */ 1968 */
1970int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) 1969int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1971{ /* new XAD */ 1970{ /* new XAD */
@@ -2416,19 +2415,19 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p);
2416 2415
2417 2416
2418/* 2417/*
2419 * xtAppend() 2418 * xtAppend()
2420 * 2419 *
2421 * function: grow in append mode from contiguous region specified ; 2420 * function: grow in append mode from contiguous region specified ;
2422 * 2421 *
2423 * parameter: 2422 * parameter:
2424 * tid - transaction id; 2423 * tid - transaction id;
2425 * ip - file object; 2424 * ip - file object;
2426 * xflag - extent flag: 2425 * xflag - extent flag:
2427 * xoff - extent offset; 2426 * xoff - extent offset;
2428 * maxblocks - max extent length; 2427 * maxblocks - max extent length;
2429 * xlen - extent length (in/out); 2428 * xlen - extent length (in/out);
2430 * xaddrp - extent address pointer (in/out): 2429 * xaddrp - extent address pointer (in/out):
2431 * flag - 2430 * flag -
2432 * 2431 *
2433 * return: 2432 * return:
2434 */ 2433 */
@@ -2460,7 +2459,7 @@ int xtAppend(tid_t tid, /* transaction id */
2460 (ulong) xoff, maxblocks, xlen, (ulong) xaddr); 2459 (ulong) xoff, maxblocks, xlen, (ulong) xaddr);
2461 2460
2462 /* 2461 /*
2463 * search for the entry location at which to insert: 2462 * search for the entry location at which to insert:
2464 * 2463 *
2465 * xtFastSearch() and xtSearch() both returns (leaf page 2464 * xtFastSearch() and xtSearch() both returns (leaf page
2466 * pinned, index at which to insert). 2465 * pinned, index at which to insert).
@@ -2482,13 +2481,13 @@ int xtAppend(tid_t tid, /* transaction id */
2482 xlen = min(xlen, (int)(next - xoff)); 2481 xlen = min(xlen, (int)(next - xoff));
2483//insert: 2482//insert:
2484 /* 2483 /*
2485 * insert entry for new extent 2484 * insert entry for new extent
2486 */ 2485 */
2487 xflag |= XAD_NEW; 2486 xflag |= XAD_NEW;
2488 2487
2489 /* 2488 /*
2490 * if the leaf page is full, split the page and 2489 * if the leaf page is full, split the page and
2491 * propagate up the router entry for the new page from split 2490 * propagate up the router entry for the new page from split
2492 * 2491 *
2493 * The xtSplitUp() will insert the entry and unpin the leaf page. 2492 * The xtSplitUp() will insert the entry and unpin the leaf page.
2494 */ 2493 */
@@ -2545,7 +2544,7 @@ int xtAppend(tid_t tid, /* transaction id */
2545 return 0; 2544 return 0;
2546 2545
2547 /* 2546 /*
2548 * insert the new entry into the leaf page 2547 * insert the new entry into the leaf page
2549 */ 2548 */
2550 insertLeaf: 2549 insertLeaf:
2551 /* 2550 /*
@@ -2589,17 +2588,17 @@ int xtAppend(tid_t tid, /* transaction id */
2589 2588
2590/* - TBD for defragmentaion/reorganization - 2589/* - TBD for defragmentaion/reorganization -
2591 * 2590 *
2592 * xtDelete() 2591 * xtDelete()
2593 * 2592 *
2594 * function: 2593 * function:
2595 * delete the entry with the specified key. 2594 * delete the entry with the specified key.
2596 * 2595 *
2597 * N.B.: whole extent of the entry is assumed to be deleted. 2596 * N.B.: whole extent of the entry is assumed to be deleted.
2598 * 2597 *
2599 * parameter: 2598 * parameter:
2600 * 2599 *
2601 * return: 2600 * return:
2602 * ENOENT: if the entry is not found. 2601 * ENOENT: if the entry is not found.
2603 * 2602 *
2604 * exception: 2603 * exception:
2605 */ 2604 */
@@ -2665,10 +2664,10 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
2665 2664
2666/* - TBD for defragmentaion/reorganization - 2665/* - TBD for defragmentaion/reorganization -
2667 * 2666 *
2668 * xtDeleteUp() 2667 * xtDeleteUp()
2669 * 2668 *
2670 * function: 2669 * function:
2671 * free empty pages as propagating deletion up the tree 2670 * free empty pages as propagating deletion up the tree
2672 * 2671 *
2673 * parameter: 2672 * parameter:
2674 * 2673 *
@@ -2815,15 +2814,15 @@ xtDeleteUp(tid_t tid, struct inode *ip,
2815 2814
2816 2815
2817/* 2816/*
2818 * NAME: xtRelocate() 2817 * NAME: xtRelocate()
2819 * 2818 *
2820 * FUNCTION: relocate xtpage or data extent of regular file; 2819 * FUNCTION: relocate xtpage or data extent of regular file;
2821 * This function is mainly used by defragfs utility. 2820 * This function is mainly used by defragfs utility.
2822 * 2821 *
2823 * NOTE: This routine does not have the logic to handle 2822 * NOTE: This routine does not have the logic to handle
2824 * uncommitted allocated extent. The caller should call 2823 * uncommitted allocated extent. The caller should call
2825 * txCommit() to commit all the allocation before call 2824 * txCommit() to commit all the allocation before call
2826 * this routine. 2825 * this routine.
2827 */ 2826 */
2828int 2827int
2829xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ 2828xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
@@ -2865,8 +2864,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2865 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); 2864 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
2866 2865
2867 /* 2866 /*
2868 * 1. get and validate the parent xtpage/xad entry 2867 * 1. get and validate the parent xtpage/xad entry
2869 * covering the source extent to be relocated; 2868 * covering the source extent to be relocated;
2870 */ 2869 */
2871 if (xtype == DATAEXT) { 2870 if (xtype == DATAEXT) {
2872 /* search in leaf entry */ 2871 /* search in leaf entry */
@@ -2910,7 +2909,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2910 jfs_info("xtRelocate: parent xad entry validated."); 2909 jfs_info("xtRelocate: parent xad entry validated.");
2911 2910
2912 /* 2911 /*
2913 * 2. relocate the extent 2912 * 2. relocate the extent
2914 */ 2913 */
2915 if (xtype == DATAEXT) { 2914 if (xtype == DATAEXT) {
2916 /* if the extent is allocated-but-not-recorded 2915 /* if the extent is allocated-but-not-recorded
@@ -2923,7 +2922,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2923 XT_PUTPAGE(pmp); 2922 XT_PUTPAGE(pmp);
2924 2923
2925 /* 2924 /*
2926 * cmRelocate() 2925 * cmRelocate()
2927 * 2926 *
2928 * copy target data pages to be relocated; 2927 * copy target data pages to be relocated;
2929 * 2928 *
@@ -2945,8 +2944,8 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2945 pno = offset >> CM_L2BSIZE; 2944 pno = offset >> CM_L2BSIZE;
2946 npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; 2945 npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
2947/* 2946/*
2948 npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - 2947 npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
2949 (offset >> CM_L2BSIZE) + 1; 2948 (offset >> CM_L2BSIZE) + 1;
2950*/ 2949*/
2951 sxaddr = oxaddr; 2950 sxaddr = oxaddr;
2952 dxaddr = nxaddr; 2951 dxaddr = nxaddr;
@@ -2981,7 +2980,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2981 2980
2982 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); 2981 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2983 jfs_info("xtRelocate: target data extent relocated."); 2982 jfs_info("xtRelocate: target data extent relocated.");
2984 } else { /* (xtype == XTPAGE) */ 2983 } else { /* (xtype == XTPAGE) */
2985 2984
2986 /* 2985 /*
2987 * read in the target xtpage from the source extent; 2986 * read in the target xtpage from the source extent;
@@ -3026,16 +3025,14 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3026 */ 3025 */
3027 if (lmp) { 3026 if (lmp) {
3028 BT_MARK_DIRTY(lmp, ip); 3027 BT_MARK_DIRTY(lmp, ip);
3029 tlck = 3028 tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
3030 txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
3031 lp->header.next = cpu_to_le64(nxaddr); 3029 lp->header.next = cpu_to_le64(nxaddr);
3032 XT_PUTPAGE(lmp); 3030 XT_PUTPAGE(lmp);
3033 } 3031 }
3034 3032
3035 if (rmp) { 3033 if (rmp) {
3036 BT_MARK_DIRTY(rmp, ip); 3034 BT_MARK_DIRTY(rmp, ip);
3037 tlck = 3035 tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
3038 txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
3039 rp->header.prev = cpu_to_le64(nxaddr); 3036 rp->header.prev = cpu_to_le64(nxaddr);
3040 XT_PUTPAGE(rmp); 3037 XT_PUTPAGE(rmp);
3041 } 3038 }
@@ -3062,7 +3059,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3062 * scan may be skipped by commit() and logredo(); 3059 * scan may be skipped by commit() and logredo();
3063 */ 3060 */
3064 BT_MARK_DIRTY(mp, ip); 3061 BT_MARK_DIRTY(mp, ip);
3065 /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ 3062 /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
3066 tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); 3063 tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
3067 xtlck = (struct xtlock *) & tlck->lock; 3064 xtlck = (struct xtlock *) & tlck->lock;
3068 3065
@@ -3084,7 +3081,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3084 } 3081 }
3085 3082
3086 /* 3083 /*
3087 * 3. acquire maplock for the source extent to be freed; 3084 * 3. acquire maplock for the source extent to be freed;
3088 * 3085 *
3089 * acquire a maplock saving the src relocated extent address; 3086 * acquire a maplock saving the src relocated extent address;
3090 * to free of the extent at commit time; 3087 * to free of the extent at commit time;
@@ -3105,7 +3102,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3105 * is no buffer associated with this lock since the buffer 3102 * is no buffer associated with this lock since the buffer
3106 * has been redirected to the target location. 3103 * has been redirected to the target location.
3107 */ 3104 */
3108 else /* (xtype == XTPAGE) */ 3105 else /* (xtype == XTPAGE) */
3109 tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); 3106 tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
3110 3107
3111 pxdlock = (struct pxd_lock *) & tlck->lock; 3108 pxdlock = (struct pxd_lock *) & tlck->lock;
@@ -3115,7 +3112,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3115 pxdlock->index = 1; 3112 pxdlock->index = 1;
3116 3113
3117 /* 3114 /*
3118 * 4. update the parent xad entry for relocation; 3115 * 4. update the parent xad entry for relocation;
3119 * 3116 *
3120 * acquire tlck for the parent entry with XAD_NEW as entry 3117 * acquire tlck for the parent entry with XAD_NEW as entry
3121 * update which will write LOG_REDOPAGE and update bmap for 3118 * update which will write LOG_REDOPAGE and update bmap for
@@ -3143,22 +3140,22 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
3143 3140
3144 3141
3145/* 3142/*
3146 * xtSearchNode() 3143 * xtSearchNode()
3147 * 3144 *
3148 * function: search for the internal xad entry covering specified extent. 3145 * function: search for the internal xad entry covering specified extent.
3149 * This function is mainly used by defragfs utility. 3146 * This function is mainly used by defragfs utility.
3150 * 3147 *
3151 * parameters: 3148 * parameters:
3152 * ip - file object; 3149 * ip - file object;
3153 * xad - extent to find; 3150 * xad - extent to find;
3154 * cmpp - comparison result: 3151 * cmpp - comparison result:
3155 * btstack - traverse stack; 3152 * btstack - traverse stack;
3156 * flag - search process flag; 3153 * flag - search process flag;
3157 * 3154 *
3158 * returns: 3155 * returns:
3159 * btstack contains (bn, index) of search path traversed to the entry. 3156 * btstack contains (bn, index) of search path traversed to the entry.
3160 * *cmpp is set to result of comparison with the entry returned. 3157 * *cmpp is set to result of comparison with the entry returned.
3161 * the page containing the entry is pinned at exit. 3158 * the page containing the entry is pinned at exit.
3162 */ 3159 */
3163static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ 3160static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3164 int *cmpp, struct btstack * btstack, int flag) 3161 int *cmpp, struct btstack * btstack, int flag)
@@ -3181,7 +3178,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3181 xaddr = addressXAD(xad); 3178 xaddr = addressXAD(xad);
3182 3179
3183 /* 3180 /*
3184 * search down tree from root: 3181 * search down tree from root:
3185 * 3182 *
3186 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of 3183 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
3187 * internal page, child page Pi contains entry with k, Ki <= K < Kj. 3184 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
@@ -3217,7 +3214,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3217 XT_CMP(cmp, xoff, &p->xad[index], t64); 3214 XT_CMP(cmp, xoff, &p->xad[index], t64);
3218 if (cmp == 0) { 3215 if (cmp == 0) {
3219 /* 3216 /*
3220 * search hit 3217 * search hit
3221 * 3218 *
3222 * verify for exact match; 3219 * verify for exact match;
3223 */ 3220 */
@@ -3245,7 +3242,7 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3245 } 3242 }
3246 3243
3247 /* 3244 /*
3248 * search miss - non-leaf page: 3245 * search miss - non-leaf page:
3249 * 3246 *
3250 * base is the smallest index with key (Kj) greater than 3247 * base is the smallest index with key (Kj) greater than
3251 * search key (K) and may be zero or maxentry index. 3248 * search key (K) and may be zero or maxentry index.
@@ -3268,15 +3265,15 @@ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3268 3265
3269 3266
3270/* 3267/*
3271 * xtRelink() 3268 * xtRelink()
3272 * 3269 *
3273 * function: 3270 * function:
3274 * link around a freed page. 3271 * link around a freed page.
3275 * 3272 *
3276 * Parameter: 3273 * Parameter:
3277 * int tid, 3274 * int tid,
3278 * struct inode *ip, 3275 * struct inode *ip,
3279 * xtpage_t *p) 3276 * xtpage_t *p)
3280 * 3277 *
3281 * returns: 3278 * returns:
3282 */ 3279 */
@@ -3338,7 +3335,7 @@ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
3338 3335
3339 3336
3340/* 3337/*
3341 * xtInitRoot() 3338 * xtInitRoot()
3342 * 3339 *
3343 * initialize file root (inline in inode) 3340 * initialize file root (inline in inode)
3344 */ 3341 */
@@ -3385,42 +3382,42 @@ void xtInitRoot(tid_t tid, struct inode *ip)
3385#define MAX_TRUNCATE_LEAVES 50 3382#define MAX_TRUNCATE_LEAVES 50
3386 3383
3387/* 3384/*
3388 * xtTruncate() 3385 * xtTruncate()
3389 * 3386 *
3390 * function: 3387 * function:
3391 * traverse for truncation logging backward bottom up; 3388 * traverse for truncation logging backward bottom up;
3392 * terminate at the last extent entry at the current subtree 3389 * terminate at the last extent entry at the current subtree
3393 * root page covering new down size. 3390 * root page covering new down size.
3394 * truncation may occur within the last extent entry. 3391 * truncation may occur within the last extent entry.
3395 * 3392 *
3396 * parameter: 3393 * parameter:
3397 * int tid, 3394 * int tid,
3398 * struct inode *ip, 3395 * struct inode *ip,
3399 * s64 newsize, 3396 * s64 newsize,
3400 * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} 3397 * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
3401 * 3398 *
3402 * return: 3399 * return:
3403 * 3400 *
3404 * note: 3401 * note:
3405 * PWMAP: 3402 * PWMAP:
3406 * 1. truncate (non-COMMIT_NOLINK file) 3403 * 1. truncate (non-COMMIT_NOLINK file)
3407 * by jfs_truncate() or jfs_open(O_TRUNC): 3404 * by jfs_truncate() or jfs_open(O_TRUNC):
3408 * xtree is updated; 3405 * xtree is updated;
3409 * 2. truncate index table of directory when last entry removed 3406 * 2. truncate index table of directory when last entry removed
3410 * map update via tlock at commit time; 3407 * map update via tlock at commit time;
3411 * PMAP: 3408 * PMAP:
3412 * Call xtTruncate_pmap instead 3409 * Call xtTruncate_pmap instead
3413 * WMAP: 3410 * WMAP:
3414 * 1. remove (free zero link count) on last reference release 3411 * 1. remove (free zero link count) on last reference release
3415 * (pmap has been freed at commit zero link count); 3412 * (pmap has been freed at commit zero link count);
3416 * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): 3413 * 2. truncate (COMMIT_NOLINK file, i.e., tmp file):
3417 * xtree is updated; 3414 * xtree is updated;
3418 * map update directly at truncation time; 3415 * map update directly at truncation time;
3419 * 3416 *
3420 * if (DELETE) 3417 * if (DELETE)
3421 * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); 3418 * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
3422 * else if (TRUNCATE) 3419 * else if (TRUNCATE)
3423 * must write LOG_NOREDOPAGE for deleted index page; 3420 * must write LOG_NOREDOPAGE for deleted index page;
3424 * 3421 *
3425 * pages may already have been tlocked by anonymous transactions 3422 * pages may already have been tlocked by anonymous transactions
3426 * during file growth (i.e., write) before truncation; 3423 * during file growth (i.e., write) before truncation;
@@ -3493,7 +3490,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3493 * retained in the new sized file. 3490 * retained in the new sized file.
3494 * if type is PMAP, the data and index pages are NOT 3491 * if type is PMAP, the data and index pages are NOT
3495 * freed, and the data and index blocks are NOT freed 3492 * freed, and the data and index blocks are NOT freed
3496 * from working map. 3493 * from working map.
3497 * (this will allow continued access of data/index of 3494 * (this will allow continued access of data/index of
3498 * temporary file (zerolink count file truncated to zero-length)). 3495 * temporary file (zerolink count file truncated to zero-length)).
3499 */ 3496 */
@@ -3542,7 +3539,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3542 goto getChild; 3539 goto getChild;
3543 3540
3544 /* 3541 /*
3545 * leaf page 3542 * leaf page
3546 */ 3543 */
3547 freed = 0; 3544 freed = 0;
3548 3545
@@ -3916,7 +3913,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3916 } 3913 }
3917 3914
3918 /* 3915 /*
3919 * internal page: go down to child page of current entry 3916 * internal page: go down to child page of current entry
3920 */ 3917 */
3921 getChild: 3918 getChild:
3922 /* save current parent entry for the child page */ 3919 /* save current parent entry for the child page */
@@ -3965,7 +3962,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3965 3962
3966 3963
3967/* 3964/*
3968 * xtTruncate_pmap() 3965 * xtTruncate_pmap()
3969 * 3966 *
3970 * function: 3967 * function:
3971 * Perform truncate to zero lenghth for deleted file, leaving the 3968 * Perform truncate to zero lenghth for deleted file, leaving the
@@ -3974,9 +3971,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3974 * is committed to disk. 3971 * is committed to disk.
3975 * 3972 *
3976 * parameter: 3973 * parameter:
3977 * tid_t tid, 3974 * tid_t tid,
3978 * struct inode *ip, 3975 * struct inode *ip,
3979 * s64 committed_size) 3976 * s64 committed_size)
3980 * 3977 *
3981 * return: new committed size 3978 * return: new committed size
3982 * 3979 *
@@ -4050,7 +4047,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4050 } 4047 }
4051 4048
4052 /* 4049 /*
4053 * leaf page 4050 * leaf page
4054 */ 4051 */
4055 4052
4056 if (++locked_leaves > MAX_TRUNCATE_LEAVES) { 4053 if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
@@ -4062,7 +4059,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4062 xoff = offsetXAD(xad); 4059 xoff = offsetXAD(xad);
4063 xlen = lengthXAD(xad); 4060 xlen = lengthXAD(xad);
4064 XT_PUTPAGE(mp); 4061 XT_PUTPAGE(mp);
4065 return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; 4062 return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
4066 } 4063 }
4067 tlck = txLock(tid, ip, mp, tlckXTREE); 4064 tlck = txLock(tid, ip, mp, tlckXTREE);
4068 tlck->type = tlckXTREE | tlckFREE; 4065 tlck->type = tlckXTREE | tlckFREE;
@@ -4099,8 +4096,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4099 */ 4096 */
4100 tlck = txLock(tid, ip, mp, tlckXTREE); 4097 tlck = txLock(tid, ip, mp, tlckXTREE);
4101 xtlck = (struct xtlock *) & tlck->lock; 4098 xtlck = (struct xtlock *) & tlck->lock;
4102 xtlck->hwm.offset = 4099 xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
4103 le16_to_cpu(p->header.nextindex) - 1;
4104 tlck->type = tlckXTREE | tlckFREE; 4100 tlck->type = tlckXTREE | tlckFREE;
4105 4101
4106 XT_PUTPAGE(mp); 4102 XT_PUTPAGE(mp);
@@ -4118,7 +4114,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4118 else 4114 else
4119 index--; 4115 index--;
4120 /* 4116 /*
4121 * internal page: go down to child page of current entry 4117 * internal page: go down to child page of current entry
4122 */ 4118 */
4123 getChild: 4119 getChild:
4124 /* save current parent entry for the child page */ 4120 /* save current parent entry for the child page */
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 164f6f2b10..70815c8a3d 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -19,14 +19,14 @@
19#define _H_JFS_XTREE 19#define _H_JFS_XTREE
20 20
21/* 21/*
22 * jfs_xtree.h: extent allocation descriptor B+-tree manager 22 * jfs_xtree.h: extent allocation descriptor B+-tree manager
23 */ 23 */
24 24
25#include "jfs_btree.h" 25#include "jfs_btree.h"
26 26
27 27
28/* 28/*
29 * extent allocation descriptor (xad) 29 * extent allocation descriptor (xad)
30 */ 30 */
31typedef struct xad { 31typedef struct xad {
32 unsigned flag:8; /* 1: flag */ 32 unsigned flag:8; /* 1: flag */
@@ -38,30 +38,30 @@ typedef struct xad {
38 __le32 addr2; /* 4: address in unit of fsblksize */ 38 __le32 addr2; /* 4: address in unit of fsblksize */
39} xad_t; /* (16) */ 39} xad_t; /* (16) */
40 40
41#define MAXXLEN ((1 << 24) - 1) 41#define MAXXLEN ((1 << 24) - 1)
42 42
43#define XTSLOTSIZE 16 43#define XTSLOTSIZE 16
44#define L2XTSLOTSIZE 4 44#define L2XTSLOTSIZE 4
45 45
46/* xad_t field construction */ 46/* xad_t field construction */
47#define XADoffset(xad, offset64)\ 47#define XADoffset(xad, offset64)\
48{\ 48{\
49 (xad)->off1 = ((u64)offset64) >> 32;\ 49 (xad)->off1 = ((u64)offset64) >> 32;\
50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ 50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
51} 51}
52#define XADaddress(xad, address64)\ 52#define XADaddress(xad, address64)\
53{\ 53{\
54 (xad)->addr1 = ((u64)address64) >> 32;\ 54 (xad)->addr1 = ((u64)address64) >> 32;\
55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ 55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
56} 56}
57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) 57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32)
58 58
59/* xad_t field extraction */ 59/* xad_t field extraction */
60#define offsetXAD(xad)\ 60#define offsetXAD(xad)\
61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) 61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
62#define addressXAD(xad)\ 62#define addressXAD(xad)\
63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) 63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2))
64#define lengthXAD(xad) __le24_to_cpu((xad)->len) 64#define lengthXAD(xad) __le24_to_cpu((xad)->len)
65 65
66/* xad list */ 66/* xad list */
67struct xadlist { 67struct xadlist {
@@ -71,22 +71,22 @@ struct xadlist {
71}; 71};
72 72
73/* xad_t flags */ 73/* xad_t flags */
74#define XAD_NEW 0x01 /* new */ 74#define XAD_NEW 0x01 /* new */
75#define XAD_EXTENDED 0x02 /* extended */ 75#define XAD_EXTENDED 0x02 /* extended */
76#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ 76#define XAD_COMPRESSED 0x04 /* compressed with recorded length */
77#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ 77#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */
78#define XAD_COW 0x10 /* copy-on-write */ 78#define XAD_COW 0x10 /* copy-on-write */
79 79
80 80
81/* possible values for maxentry */ 81/* possible values for maxentry */
82#define XTROOTINITSLOT_DIR 6 82#define XTROOTINITSLOT_DIR 6
83#define XTROOTINITSLOT 10 83#define XTROOTINITSLOT 10
84#define XTROOTMAXSLOT 18 84#define XTROOTMAXSLOT 18
85#define XTPAGEMAXSLOT 256 85#define XTPAGEMAXSLOT 256
86#define XTENTRYSTART 2 86#define XTENTRYSTART 2
87 87
88/* 88/*
89 * xtree page: 89 * xtree page:
90 */ 90 */
91typedef union { 91typedef union {
92 struct xtheader { 92 struct xtheader {
@@ -106,7 +106,7 @@ typedef union {
106} xtpage_t; 106} xtpage_t;
107 107
108/* 108/*
109 * external declaration 109 * external declaration
110 */ 110 */
111extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, 111extern int xtLookup(struct inode *ip, s64 lstart, s64 llen,
112 int *pflag, s64 * paddr, int *plen, int flag); 112 int *pflag, s64 * paddr, int *plen, int flag);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 41c2047712..932797ba43 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -328,7 +328,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
328 * dentry - child directory dentry 328 * dentry - child directory dentry
329 * 329 *
330 * RETURN: -EINVAL - if name is . or .. 330 * RETURN: -EINVAL - if name is . or ..
331 * -EINVAL - if . or .. exist but are invalid. 331 * -EINVAL - if . or .. exist but are invalid.
332 * errors from subroutines 332 * errors from subroutines
333 * 333 *
334 * note: 334 * note:
@@ -517,7 +517,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
517 inode_dec_link_count(ip); 517 inode_dec_link_count(ip);
518 518
519 /* 519 /*
520 * commit zero link count object 520 * commit zero link count object
521 */ 521 */
522 if (ip->i_nlink == 0) { 522 if (ip->i_nlink == 0) {
523 assert(!test_cflag(COMMIT_Nolink, ip)); 523 assert(!test_cflag(COMMIT_Nolink, ip));
@@ -596,7 +596,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
596/* 596/*
597 * NAME: commitZeroLink() 597 * NAME: commitZeroLink()
598 * 598 *
599 * FUNCTION: for non-directory, called by jfs_remove(), 599 * FUNCTION: for non-directory, called by jfs_remove(),
600 * truncate a regular file, directory or symbolic 600 * truncate a regular file, directory or symbolic
601 * link to zero length. return 0 if type is not 601 * link to zero length. return 0 if type is not
602 * one of these. 602 * one of these.
@@ -676,7 +676,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip)
676/* 676/*
677 * NAME: jfs_free_zero_link() 677 * NAME: jfs_free_zero_link()
678 * 678 *
679 * FUNCTION: for non-directory, called by iClose(), 679 * FUNCTION: for non-directory, called by iClose(),
680 * free resources of a file from cache and WORKING map 680 * free resources of a file from cache and WORKING map
681 * for a file previously committed with zero link count 681 * for a file previously committed with zero link count
682 * while associated with a pager object, 682 * while associated with a pager object,
@@ -855,12 +855,12 @@ static int jfs_link(struct dentry *old_dentry,
855 * NAME: jfs_symlink(dip, dentry, name) 855 * NAME: jfs_symlink(dip, dentry, name)
856 * 856 *
857 * FUNCTION: creates a symbolic link to <symlink> by name <name> 857 * FUNCTION: creates a symbolic link to <symlink> by name <name>
858 * in directory <dip> 858 * in directory <dip>
859 * 859 *
860 * PARAMETER: dip - parent directory vnode 860 * PARAMETER: dip - parent directory vnode
861 * dentry - dentry of symbolic link 861 * dentry - dentry of symbolic link
862 * name - the path name of the existing object 862 * name - the path name of the existing object
863 * that will be the source of the link 863 * that will be the source of the link
864 * 864 *
865 * RETURN: errors from subroutines 865 * RETURN: errors from subroutines
866 * 866 *
@@ -1052,9 +1052,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1052 1052
1053 1053
1054/* 1054/*
1055 * NAME: jfs_rename 1055 * NAME: jfs_rename
1056 * 1056 *
1057 * FUNCTION: rename a file or directory 1057 * FUNCTION: rename a file or directory
1058 */ 1058 */
1059static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, 1059static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1060 struct inode *new_dir, struct dentry *new_dentry) 1060 struct inode *new_dir, struct dentry *new_dentry)
@@ -1331,9 +1331,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1331 1331
1332 1332
1333/* 1333/*
1334 * NAME: jfs_mknod 1334 * NAME: jfs_mknod
1335 * 1335 *
1336 * FUNCTION: Create a special file (device) 1336 * FUNCTION: Create a special file (device)
1337 */ 1337 */
1338static int jfs_mknod(struct inode *dir, struct dentry *dentry, 1338static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1339 int mode, dev_t rdev) 1339 int mode, dev_t rdev)
@@ -1477,6 +1477,38 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1477 return dentry; 1477 return dentry;
1478} 1478}
1479 1479
1480struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp)
1481{
1482 __u32 *objp = vobjp;
1483 unsigned long ino = objp[0];
1484 __u32 generation = objp[1];
1485 struct inode *inode;
1486 struct dentry *result;
1487
1488 if (ino == 0)
1489 return ERR_PTR(-ESTALE);
1490 inode = iget(sb, ino);
1491 if (inode == NULL)
1492 return ERR_PTR(-ENOMEM);
1493
1494 if (is_bad_inode(inode) ||
1495 (generation && inode->i_generation != generation)) {
1496 result = ERR_PTR(-ESTALE);
1497 goto out_iput;
1498 }
1499
1500 result = d_alloc_anon(inode);
1501 if (!result) {
1502 result = ERR_PTR(-ENOMEM);
1503 goto out_iput;
1504 }
1505 return result;
1506
1507 out_iput:
1508 iput(inode);
1509 return result;
1510}
1511
1480struct dentry *jfs_get_parent(struct dentry *dentry) 1512struct dentry *jfs_get_parent(struct dentry *dentry)
1481{ 1513{
1482 struct super_block *sb = dentry->d_inode->i_sb; 1514 struct super_block *sb = dentry->d_inode->i_sb;
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 79d625f3f7..71984ee953 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -29,17 +29,17 @@
29#include "jfs_txnmgr.h" 29#include "jfs_txnmgr.h"
30#include "jfs_debug.h" 30#include "jfs_debug.h"
31 31
32#define BITSPERPAGE (PSIZE << 3) 32#define BITSPERPAGE (PSIZE << 3)
33#define L2MEGABYTE 20 33#define L2MEGABYTE 20
34#define MEGABYTE (1 << L2MEGABYTE) 34#define MEGABYTE (1 << L2MEGABYTE)
35#define MEGABYTE32 (MEGABYTE << 5) 35#define MEGABYTE32 (MEGABYTE << 5)
36 36
37/* convert block number to bmap file page number */ 37/* convert block number to bmap file page number */
38#define BLKTODMAPN(b)\ 38#define BLKTODMAPN(b)\
39 (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) 39 (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1)
40 40
41/* 41/*
42 * jfs_extendfs() 42 * jfs_extendfs()
43 * 43 *
44 * function: extend file system; 44 * function: extend file system;
45 * 45 *
@@ -48,9 +48,9 @@
48 * workspace space 48 * workspace space
49 * 49 *
50 * input: 50 * input:
51 * new LVSize: in LV blocks (required) 51 * new LVSize: in LV blocks (required)
52 * new LogSize: in LV blocks (optional) 52 * new LogSize: in LV blocks (optional)
53 * new FSSize: in LV blocks (optional) 53 * new FSSize: in LV blocks (optional)
54 * 54 *
55 * new configuration: 55 * new configuration:
56 * 1. set new LogSize as specified or default from new LVSize; 56 * 1. set new LogSize as specified or default from new LVSize;
@@ -125,8 +125,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
125 } 125 }
126 126
127 /* 127 /*
128 * reconfigure LV spaces 128 * reconfigure LV spaces
129 * --------------------- 129 * ---------------------
130 * 130 *
131 * validate new size, or, if not specified, determine new size 131 * validate new size, or, if not specified, determine new size
132 */ 132 */
@@ -198,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
198 log_formatted = 1; 198 log_formatted = 1;
199 } 199 }
200 /* 200 /*
201 * quiesce file system 201 * quiesce file system
202 * 202 *
203 * (prepare to move the inline log and to prevent map update) 203 * (prepare to move the inline log and to prevent map update)
204 * 204 *
@@ -270,8 +270,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
270 } 270 }
271 271
272 /* 272 /*
273 * extend block allocation map 273 * extend block allocation map
274 * --------------------------- 274 * ---------------------------
275 * 275 *
276 * extendfs() for new extension, retry after crash recovery; 276 * extendfs() for new extension, retry after crash recovery;
277 * 277 *
@@ -283,7 +283,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
283 * s_size: aggregate size in physical blocks; 283 * s_size: aggregate size in physical blocks;
284 */ 284 */
285 /* 285 /*
286 * compute the new block allocation map configuration 286 * compute the new block allocation map configuration
287 * 287 *
288 * map dinode: 288 * map dinode:
289 * di_size: map file size in byte; 289 * di_size: map file size in byte;
@@ -301,7 +301,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
301 newNpages = BLKTODMAPN(t64) + 1; 301 newNpages = BLKTODMAPN(t64) + 1;
302 302
303 /* 303 /*
304 * extend map from current map (WITHOUT growing mapfile) 304 * extend map from current map (WITHOUT growing mapfile)
305 * 305 *
306 * map new extension with unmapped part of the last partial 306 * map new extension with unmapped part of the last partial
307 * dmap page, if applicable, and extra page(s) allocated 307 * dmap page, if applicable, and extra page(s) allocated
@@ -341,8 +341,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
341 XSize -= nblocks; 341 XSize -= nblocks;
342 342
343 /* 343 /*
344 * grow map file to cover remaining extension 344 * grow map file to cover remaining extension
345 * and/or one extra dmap page for next extendfs(); 345 * and/or one extra dmap page for next extendfs();
346 * 346 *
347 * allocate new map pages and its backing blocks, and 347 * allocate new map pages and its backing blocks, and
348 * update map file xtree 348 * update map file xtree
@@ -422,8 +422,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
422 dbFinalizeBmap(ipbmap); 422 dbFinalizeBmap(ipbmap);
423 423
424 /* 424 /*
425 * update inode allocation map 425 * update inode allocation map
426 * --------------------------- 426 * ---------------------------
427 * 427 *
428 * move iag lists from old to new iag; 428 * move iag lists from old to new iag;
429 * agstart field is not updated for logredo() to reconstruct 429 * agstart field is not updated for logredo() to reconstruct
@@ -442,8 +442,8 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
442 } 442 }
443 443
444 /* 444 /*
445 * finalize 445 * finalize
446 * -------- 446 * --------
447 * 447 *
448 * extension is committed when on-disk super block is 448 * extension is committed when on-disk super block is
449 * updated with new descriptors: logredo will recover 449 * updated with new descriptors: logredo will recover
@@ -480,7 +480,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
480 diFreeSpecial(ipbmap2); 480 diFreeSpecial(ipbmap2);
481 481
482 /* 482 /*
483 * update superblock 483 * update superblock
484 */ 484 */
485 if ((rc = readSuper(sb, &bh))) 485 if ((rc = readSuper(sb, &bh)))
486 goto error_out; 486 goto error_out;
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
530 530
531 resume: 531 resume:
532 /* 532 /*
533 * resume file system transactions 533 * resume file system transactions
534 */ 534 */
535 txResume(sb); 535 txResume(sb);
536 536
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 20e4ac1c79..929fceca79 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -27,6 +27,7 @@
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/posix_acl.h> 28#include <linux/posix_acl.h>
29#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
30#include <linux/exportfs.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <linux/seq_file.h> 32#include <linux/seq_file.h>
32 33
@@ -737,6 +738,7 @@ static const struct super_operations jfs_super_operations = {
737}; 738};
738 739
739static struct export_operations jfs_export_operations = { 740static struct export_operations jfs_export_operations = {
741 .get_dentry = jfs_get_dentry,
740 .get_parent = jfs_get_parent, 742 .get_parent = jfs_get_parent,
741}; 743};
742 744
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index b753ba2164..9b7f2cdaae 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -63,9 +63,9 @@
63 * 63 *
64 * On-disk: 64 * On-disk:
65 * 65 *
66 * FEALISTs are stored on disk using blocks allocated by dbAlloc() and 66 * FEALISTs are stored on disk using blocks allocated by dbAlloc() and
67 * written directly. An EA list may be in-lined in the inode if there is 67 * written directly. An EA list may be in-lined in the inode if there is
68 * sufficient room available. 68 * sufficient room available.
69 */ 69 */
70 70
71struct ea_buffer { 71struct ea_buffer {
@@ -590,7 +590,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
590 size_check: 590 size_check:
591 if (EALIST_SIZE(ea_buf->xattr) != ea_size) { 591 if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
592 printk(KERN_ERR "ea_get: invalid extended attribute\n"); 592 printk(KERN_ERR "ea_get: invalid extended attribute\n");
593 dump_mem("xattr", ea_buf->xattr, ea_size); 593 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
594 ea_buf->xattr, ea_size, 1);
594 ea_release(inode, ea_buf); 595 ea_release(inode, ea_buf);
595 rc = -EIO; 596 rc = -EIO;
596 goto clean_up; 597 goto clean_up;
@@ -696,7 +697,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
696 struct posix_acl *acl; 697 struct posix_acl *acl;
697 int rc; 698 int rc;
698 699
699 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 700 if (!is_owner_or_cap(inode))
700 return -EPERM; 701 return -EPERM;
701 702
702 /* 703 /*
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 96070bff93..572601e98d 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -44,9 +44,8 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin,
44 */ 44 */
45static struct nlm_host * 45static struct nlm_host *
46nlm_lookup_host(int server, const struct sockaddr_in *sin, 46nlm_lookup_host(int server, const struct sockaddr_in *sin,
47 int proto, int version, 47 int proto, int version, const char *hostname,
48 const char *hostname, 48 int hostname_len, const struct sockaddr_in *ssin)
49 int hostname_len)
50{ 49{
51 struct hlist_head *chain; 50 struct hlist_head *chain;
52 struct hlist_node *pos; 51 struct hlist_node *pos;
@@ -54,7 +53,9 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin,
54 struct nsm_handle *nsm = NULL; 53 struct nsm_handle *nsm = NULL;
55 int hash; 54 int hash;
56 55
57 dprintk("lockd: nlm_lookup_host(%u.%u.%u.%u, p=%d, v=%d, my role=%s, name=%.*s)\n", 56 dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
57 ", p=%d, v=%d, my role=%s, name=%.*s)\n",
58 NIPQUAD(ssin->sin_addr.s_addr),
58 NIPQUAD(sin->sin_addr.s_addr), proto, version, 59 NIPQUAD(sin->sin_addr.s_addr), proto, version,
59 server? "server" : "client", 60 server? "server" : "client",
60 hostname_len, 61 hostname_len,
@@ -91,6 +92,8 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin,
91 continue; 92 continue;
92 if (host->h_server != server) 93 if (host->h_server != server)
93 continue; 94 continue;
95 if (!nlm_cmp_addr(&host->h_saddr, ssin))
96 continue;
94 97
95 /* Move to head of hash chain. */ 98 /* Move to head of hash chain. */
96 hlist_del(&host->h_hash); 99 hlist_del(&host->h_hash);
@@ -118,6 +121,7 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin,
118 host->h_name = nsm->sm_name; 121 host->h_name = nsm->sm_name;
119 host->h_addr = *sin; 122 host->h_addr = *sin;
120 host->h_addr.sin_port = 0; /* ouch! */ 123 host->h_addr.sin_port = 0; /* ouch! */
124 host->h_saddr = *ssin;
121 host->h_version = version; 125 host->h_version = version;
122 host->h_proto = proto; 126 host->h_proto = proto;
123 host->h_rpcclnt = NULL; 127 host->h_rpcclnt = NULL;
@@ -161,15 +165,9 @@ nlm_destroy_host(struct nlm_host *host)
161 */ 165 */
162 nsm_unmonitor(host); 166 nsm_unmonitor(host);
163 167
164 if ((clnt = host->h_rpcclnt) != NULL) { 168 clnt = host->h_rpcclnt;
165 if (atomic_read(&clnt->cl_users)) { 169 if (clnt != NULL)
166 printk(KERN_WARNING 170 rpc_shutdown_client(clnt);
167 "lockd: active RPC handle\n");
168 clnt->cl_dead = 1;
169 } else {
170 rpc_destroy_client(host->h_rpcclnt);
171 }
172 }
173 kfree(host); 171 kfree(host);
174} 172}
175 173
@@ -180,8 +178,10 @@ struct nlm_host *
180nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, 178nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
181 const char *hostname, int hostname_len) 179 const char *hostname, int hostname_len)
182{ 180{
181 struct sockaddr_in ssin = {0};
182
183 return nlm_lookup_host(0, sin, proto, version, 183 return nlm_lookup_host(0, sin, proto, version,
184 hostname, hostname_len); 184 hostname, hostname_len, &ssin);
185} 185}
186 186
187/* 187/*
@@ -191,9 +191,12 @@ struct nlm_host *
191nlmsvc_lookup_host(struct svc_rqst *rqstp, 191nlmsvc_lookup_host(struct svc_rqst *rqstp,
192 const char *hostname, int hostname_len) 192 const char *hostname, int hostname_len)
193{ 193{
194 struct sockaddr_in ssin = {0};
195
196 ssin.sin_addr = rqstp->rq_daddr.addr;
194 return nlm_lookup_host(1, svc_addr_in(rqstp), 197 return nlm_lookup_host(1, svc_addr_in(rqstp),
195 rqstp->rq_prot, rqstp->rq_vers, 198 rqstp->rq_prot, rqstp->rq_vers,
196 hostname, hostname_len); 199 hostname, hostname_len, &ssin);
197} 200}
198 201
199/* 202/*
@@ -204,8 +207,9 @@ nlm_bind_host(struct nlm_host *host)
204{ 207{
205 struct rpc_clnt *clnt; 208 struct rpc_clnt *clnt;
206 209
207 dprintk("lockd: nlm_bind_host(%08x)\n", 210 dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n",
208 (unsigned)ntohl(host->h_addr.sin_addr.s_addr)); 211 NIPQUAD(host->h_saddr.sin_addr),
212 NIPQUAD(host->h_addr.sin_addr));
209 213
210 /* Lock host handle */ 214 /* Lock host handle */
211 mutex_lock(&host->h_mutex); 215 mutex_lock(&host->h_mutex);
@@ -232,6 +236,7 @@ nlm_bind_host(struct nlm_host *host)
232 .protocol = host->h_proto, 236 .protocol = host->h_proto,
233 .address = (struct sockaddr *)&host->h_addr, 237 .address = (struct sockaddr *)&host->h_addr,
234 .addrsize = sizeof(host->h_addr), 238 .addrsize = sizeof(host->h_addr),
239 .saddress = (struct sockaddr *)&host->h_saddr,
235 .timeout = &timeparms, 240 .timeout = &timeparms,
236 .servername = host->h_name, 241 .servername = host->h_name,
237 .program = &nlm_program, 242 .program = &nlm_program,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 2102e2d013..3353ed8421 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,6 +61,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
61 status); 61 status);
62 else 62 else
63 status = 0; 63 status = 0;
64 rpc_shutdown_client(clnt);
64 out: 65 out:
65 return status; 66 return status;
66} 67}
@@ -138,7 +139,6 @@ nsm_create(void)
138 .program = &nsm_program, 139 .program = &nsm_program,
139 .version = SM_VERSION, 140 .version = SM_VERSION,
140 .authflavor = RPC_AUTH_NULL, 141 .authflavor = RPC_AUTH_NULL,
141 .flags = (RPC_CLNT_CREATE_ONESHOT),
142 }; 142 };
143 143
144 return rpc_create(&args); 144 return rpc_create(&args);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 126b1bf02c..82e2192a0d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -25,6 +25,7 @@
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
27#include <linux/mutex.h> 27#include <linux/mutex.h>
28#include <linux/freezer.h>
28 29
29#include <linux/sunrpc/types.h> 30#include <linux/sunrpc/types.h>
30#include <linux/sunrpc/stats.h> 31#include <linux/sunrpc/stats.h>
@@ -75,18 +76,31 @@ static const int nlm_port_min = 0, nlm_port_max = 65535;
75 76
76static struct ctl_table_header * nlm_sysctl_table; 77static struct ctl_table_header * nlm_sysctl_table;
77 78
78static unsigned long set_grace_period(void) 79static unsigned long get_lockd_grace_period(void)
79{ 80{
80 unsigned long grace_period;
81
82 /* Note: nlm_timeout should always be nonzero */ 81 /* Note: nlm_timeout should always be nonzero */
83 if (nlm_grace_period) 82 if (nlm_grace_period)
84 grace_period = ((nlm_grace_period + nlm_timeout - 1) 83 return roundup(nlm_grace_period, nlm_timeout) * HZ;
85 / nlm_timeout) * nlm_timeout * HZ;
86 else 84 else
87 grace_period = nlm_timeout * 5 * HZ; 85 return nlm_timeout * 5 * HZ;
86}
87
88unsigned long get_nfs_grace_period(void)
89{
90 unsigned long lockdgrace = get_lockd_grace_period();
91 unsigned long nfsdgrace = 0;
92
93 if (nlmsvc_ops)
94 nfsdgrace = nlmsvc_ops->get_grace_period();
95
96 return max(lockdgrace, nfsdgrace);
97}
98EXPORT_SYMBOL(get_nfs_grace_period);
99
100static unsigned long set_grace_period(void)
101{
88 nlmsvc_grace_period = 1; 102 nlmsvc_grace_period = 1;
89 return grace_period + jiffies; 103 return get_nfs_grace_period() + jiffies;
90} 104}
91 105
92static inline void clear_grace_period(void) 106static inline void clear_grace_period(void)
@@ -119,13 +133,11 @@ lockd(struct svc_rqst *rqstp)
119 complete(&lockd_start_done); 133 complete(&lockd_start_done);
120 134
121 daemonize("lockd"); 135 daemonize("lockd");
136 set_freezable();
122 137
123 /* Process request with signals blocked, but allow SIGKILL. */ 138 /* Process request with signals blocked, but allow SIGKILL. */
124 allow_signal(SIGKILL); 139 allow_signal(SIGKILL);
125 140
126 /* kick rpciod */
127 rpciod_up();
128
129 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); 141 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
130 142
131 if (!nlm_timeout) 143 if (!nlm_timeout)
@@ -202,9 +214,6 @@ lockd(struct svc_rqst *rqstp)
202 /* Exit the RPC thread */ 214 /* Exit the RPC thread */
203 svc_exit_thread(rqstp); 215 svc_exit_thread(rqstp);
204 216
205 /* release rpciod */
206 rpciod_down();
207
208 /* Release module */ 217 /* Release module */
209 unlock_kernel(); 218 unlock_kernel();
210 module_put_and_exit(0); 219 module_put_and_exit(0);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index deeb9dc062..fbb1d02f87 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -100,7 +100,6 @@ struct mb_cache {
100static LIST_HEAD(mb_cache_list); 100static LIST_HEAD(mb_cache_list);
101static LIST_HEAD(mb_cache_lru_list); 101static LIST_HEAD(mb_cache_lru_list);
102static DEFINE_SPINLOCK(mb_cache_spinlock); 102static DEFINE_SPINLOCK(mb_cache_spinlock);
103static struct shrinker *mb_shrinker;
104 103
105static inline int 104static inline int
106mb_cache_indexes(struct mb_cache *cache) 105mb_cache_indexes(struct mb_cache *cache)
@@ -118,6 +117,10 @@ mb_cache_indexes(struct mb_cache *cache)
118 117
119static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); 118static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
120 119
120static struct shrinker mb_cache_shrinker = {
121 .shrink = mb_cache_shrink_fn,
122 .seeks = DEFAULT_SEEKS,
123};
121 124
122static inline int 125static inline int
123__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) 126__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
@@ -662,13 +665,13 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
662 665
663static int __init init_mbcache(void) 666static int __init init_mbcache(void)
664{ 667{
665 mb_shrinker = set_shrinker(DEFAULT_SEEKS, mb_cache_shrink_fn); 668 register_shrinker(&mb_cache_shrinker);
666 return 0; 669 return 0;
667} 670}
668 671
669static void __exit exit_mbcache(void) 672static void __exit exit_mbcache(void)
670{ 673{
671 remove_shrinker(mb_shrinker); 674 unregister_shrinker(&mb_cache_shrinker);
672} 675}
673 676
674module_init(init_mbcache) 677module_init(init_mbcache)
diff --git a/fs/minix/file.c b/fs/minix/file.c
index f92baa1d75..17765f697e 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -23,7 +23,7 @@ const struct file_operations minix_file_operations = {
23 .aio_write = generic_file_aio_write, 23 .aio_write = generic_file_aio_write,
24 .mmap = generic_file_mmap, 24 .mmap = generic_file_mmap,
25 .fsync = minix_sync_file, 25 .fsync = minix_sync_file,
26 .sendfile = generic_file_sendfile, 26 .splice_read = generic_file_splice_read,
27}; 27};
28 28
29const struct inode_operations minix_file_inode_operations = { 29const struct inode_operations minix_file_inode_operations = {
diff --git a/fs/namei.c b/fs/namei.c
index 5e2d98d10c..defaa47c11 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1576,7 +1576,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1576 1576
1577 /* O_NOATIME can only be set by the owner or superuser */ 1577 /* O_NOATIME can only be set by the owner or superuser */
1578 if (flag & O_NOATIME) 1578 if (flag & O_NOATIME)
1579 if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) 1579 if (!is_owner_or_cap(inode))
1580 return -EPERM; 1580 return -EPERM;
1581 1581
1582 /* 1582 /*
diff --git a/fs/namespace.c b/fs/namespace.c
index b696e3a0d1..4198003d7e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -28,6 +28,7 @@
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <asm/unistd.h> 29#include <asm/unistd.h>
30#include "pnode.h" 30#include "pnode.h"
31#include "internal.h"
31 32
32/* spinlock for vfsmount related operations, inplace of dcache_lock */ 33/* spinlock for vfsmount related operations, inplace of dcache_lock */
33__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); 34__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
@@ -320,22 +321,16 @@ EXPORT_SYMBOL(mnt_unpin);
320static void *m_start(struct seq_file *m, loff_t *pos) 321static void *m_start(struct seq_file *m, loff_t *pos)
321{ 322{
322 struct mnt_namespace *n = m->private; 323 struct mnt_namespace *n = m->private;
323 struct list_head *p;
324 loff_t l = *pos;
325 324
326 down_read(&namespace_sem); 325 down_read(&namespace_sem);
327 list_for_each(p, &n->list) 326 return seq_list_start(&n->list, *pos);
328 if (!l--)
329 return list_entry(p, struct vfsmount, mnt_list);
330 return NULL;
331} 327}
332 328
333static void *m_next(struct seq_file *m, void *v, loff_t *pos) 329static void *m_next(struct seq_file *m, void *v, loff_t *pos)
334{ 330{
335 struct mnt_namespace *n = m->private; 331 struct mnt_namespace *n = m->private;
336 struct list_head *p = ((struct vfsmount *)v)->mnt_list.next; 332
337 (*pos)++; 333 return seq_list_next(v, &n->list, pos);
338 return p == &n->list ? NULL : list_entry(p, struct vfsmount, mnt_list);
339} 334}
340 335
341static void m_stop(struct seq_file *m, void *v) 336static void m_stop(struct seq_file *m, void *v)
@@ -350,7 +345,7 @@ static inline void mangle(struct seq_file *m, const char *s)
350 345
351static int show_vfsmnt(struct seq_file *m, void *v) 346static int show_vfsmnt(struct seq_file *m, void *v)
352{ 347{
353 struct vfsmount *mnt = v; 348 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
354 int err = 0; 349 int err = 0;
355 static struct proc_fs_info { 350 static struct proc_fs_info {
356 int flag; 351 int flag;
@@ -405,7 +400,7 @@ struct seq_operations mounts_op = {
405 400
406static int show_vfsstat(struct seq_file *m, void *v) 401static int show_vfsstat(struct seq_file *m, void *v)
407{ 402{
408 struct vfsmount *mnt = v; 403 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
409 int err = 0; 404 int err = 0;
410 405
411 /* device */ 406 /* device */
@@ -1457,7 +1452,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
1457 1452
1458 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 1453 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
1459 if (!new_ns) 1454 if (!new_ns)
1460 return NULL; 1455 return ERR_PTR(-ENOMEM);
1461 1456
1462 atomic_set(&new_ns->count, 1); 1457 atomic_set(&new_ns->count, 1);
1463 INIT_LIST_HEAD(&new_ns->list); 1458 INIT_LIST_HEAD(&new_ns->list);
@@ -1471,7 +1466,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
1471 if (!new_ns->root) { 1466 if (!new_ns->root) {
1472 up_write(&namespace_sem); 1467 up_write(&namespace_sem);
1473 kfree(new_ns); 1468 kfree(new_ns);
1474 return NULL; 1469 return ERR_PTR(-ENOMEM);;
1475 } 1470 }
1476 spin_lock(&vfsmount_lock); 1471 spin_lock(&vfsmount_lock);
1477 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 1472 list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
@@ -1515,7 +1510,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
1515 return new_ns; 1510 return new_ns;
1516} 1511}
1517 1512
1518struct mnt_namespace *copy_mnt_ns(int flags, struct mnt_namespace *ns, 1513struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
1519 struct fs_struct *new_fs) 1514 struct fs_struct *new_fs)
1520{ 1515{
1521 struct mnt_namespace *new_ns; 1516 struct mnt_namespace *new_ns;
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index d3152f8d95..2b145de45b 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -203,7 +203,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
203 203
204 if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { 204 if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
205 if (pos >= MAX_NON_LFS) { 205 if (pos >= MAX_NON_LFS) {
206 send_sig(SIGXFSZ, current, 0);
207 return -EFBIG; 206 return -EFBIG;
208 } 207 }
209 if (count > MAX_NON_LFS - (u32)pos) { 208 if (count > MAX_NON_LFS - (u32)pos) {
@@ -212,7 +211,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
212 } 211 }
213 if (pos >= inode->i_sb->s_maxbytes) { 212 if (pos >= inode->i_sb->s_maxbytes) {
214 if (count || pos > inode->i_sb->s_maxbytes) { 213 if (count || pos > inode->i_sb->s_maxbytes) {
215 send_sig(SIGXFSZ, current, 0);
216 return -EFBIG; 214 return -EFBIG;
217 } 215 }
218 } 216 }
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index f4580b44ee..b55cb236cf 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ 7nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
8 pagelist.o proc.o read.o symlink.o unlink.o \ 8 pagelist.o proc.o read.o symlink.o unlink.o \
9 write.o namespace.o 9 write.o namespace.o mount_clnt.o
10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o 10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
11nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o 11nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
12nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o 12nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
13nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ 13nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 75f309c874..a796be5051 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,7 @@
14#include <linux/sunrpc/svcsock.h> 14#include <linux/sunrpc/svcsock.h>
15#include <linux/nfs_fs.h> 15#include <linux/nfs_fs.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/freezer.h>
17 18
18#include <net/inet_sock.h> 19#include <net/inet_sock.h>
19 20
@@ -67,6 +68,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
67 daemonize("nfsv4-svc"); 68 daemonize("nfsv4-svc");
68 /* Process request with signals blocked, but allow SIGKILL. */ 69 /* Process request with signals blocked, but allow SIGKILL. */
69 allow_signal(SIGKILL); 70 allow_signal(SIGKILL);
71 set_freezable();
70 72
71 complete(&nfs_callback_info.started); 73 complete(&nfs_callback_info.started);
72 74
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 881fa49009..a49f9feff7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -102,19 +102,10 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
102 int nfsversion) 102 int nfsversion)
103{ 103{
104 struct nfs_client *clp; 104 struct nfs_client *clp;
105 int error;
106 105
107 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) 106 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
108 goto error_0; 107 goto error_0;
109 108
110 error = rpciod_up();
111 if (error < 0) {
112 dprintk("%s: couldn't start rpciod! Error = %d\n",
113 __FUNCTION__, error);
114 goto error_1;
115 }
116 __set_bit(NFS_CS_RPCIOD, &clp->cl_res_state);
117
118 if (nfsversion == 4) { 109 if (nfsversion == 4) {
119 if (nfs_callback_up() < 0) 110 if (nfs_callback_up() < 0)
120 goto error_2; 111 goto error_2;
@@ -139,8 +130,6 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
139#ifdef CONFIG_NFS_V4 130#ifdef CONFIG_NFS_V4
140 init_rwsem(&clp->cl_sem); 131 init_rwsem(&clp->cl_sem);
141 INIT_LIST_HEAD(&clp->cl_delegations); 132 INIT_LIST_HEAD(&clp->cl_delegations);
142 INIT_LIST_HEAD(&clp->cl_state_owners);
143 INIT_LIST_HEAD(&clp->cl_unused);
144 spin_lock_init(&clp->cl_lock); 133 spin_lock_init(&clp->cl_lock);
145 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 134 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
146 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 135 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -154,9 +143,6 @@ error_3:
154 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) 143 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
155 nfs_callback_down(); 144 nfs_callback_down();
156error_2: 145error_2:
157 rpciod_down();
158 __clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state);
159error_1:
160 kfree(clp); 146 kfree(clp);
161error_0: 147error_0:
162 return NULL; 148 return NULL;
@@ -167,16 +153,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
167#ifdef CONFIG_NFS_V4 153#ifdef CONFIG_NFS_V4
168 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) 154 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
169 nfs4_kill_renewd(clp); 155 nfs4_kill_renewd(clp);
170 while (!list_empty(&clp->cl_unused)) { 156 BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
171 struct nfs4_state_owner *sp;
172
173 sp = list_entry(clp->cl_unused.next,
174 struct nfs4_state_owner,
175 so_list);
176 list_del(&sp->so_list);
177 kfree(sp);
178 }
179 BUG_ON(!list_empty(&clp->cl_state_owners));
180 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) 157 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
181 nfs_idmap_delete(clp); 158 nfs_idmap_delete(clp);
182#endif 159#endif
@@ -198,9 +175,6 @@ static void nfs_free_client(struct nfs_client *clp)
198 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) 175 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
199 nfs_callback_down(); 176 nfs_callback_down();
200 177
201 if (__test_and_clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state))
202 rpciod_down();
203
204 kfree(clp->cl_hostname); 178 kfree(clp->cl_hostname);
205 kfree(clp); 179 kfree(clp);
206 180
@@ -1232,23 +1206,9 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
1232 */ 1206 */
1233static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) 1207static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1234{ 1208{
1235 struct list_head *_p;
1236 loff_t pos = *_pos;
1237
1238 /* lock the list against modification */ 1209 /* lock the list against modification */
1239 spin_lock(&nfs_client_lock); 1210 spin_lock(&nfs_client_lock);
1240 1211 return seq_list_start_head(&nfs_client_list, *_pos);
1241 /* allow for the header line */
1242 if (!pos)
1243 return SEQ_START_TOKEN;
1244 pos--;
1245
1246 /* find the n'th element in the list */
1247 list_for_each(_p, &nfs_client_list)
1248 if (!pos--)
1249 break;
1250
1251 return _p != &nfs_client_list ? _p : NULL;
1252} 1212}
1253 1213
1254/* 1214/*
@@ -1256,14 +1216,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1256 */ 1216 */
1257static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) 1217static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1258{ 1218{
1259 struct list_head *_p; 1219 return seq_list_next(v, &nfs_client_list, pos);
1260
1261 (*pos)++;
1262
1263 _p = v;
1264 _p = (v == SEQ_START_TOKEN) ? nfs_client_list.next : _p->next;
1265
1266 return _p != &nfs_client_list ? _p : NULL;
1267} 1220}
1268 1221
1269/* 1222/*
@@ -1282,7 +1235,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
1282 struct nfs_client *clp; 1235 struct nfs_client *clp;
1283 1236
1284 /* display header on line 1 */ 1237 /* display header on line 1 */
1285 if (v == SEQ_START_TOKEN) { 1238 if (v == &nfs_client_list) {
1286 seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); 1239 seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
1287 return 0; 1240 return 0;
1288 } 1241 }
@@ -1323,23 +1276,9 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
1323 */ 1276 */
1324static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) 1277static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1325{ 1278{
1326 struct list_head *_p;
1327 loff_t pos = *_pos;
1328
1329 /* lock the list against modification */ 1279 /* lock the list against modification */
1330 spin_lock(&nfs_client_lock); 1280 spin_lock(&nfs_client_lock);
1331 1281 return seq_list_start_head(&nfs_volume_list, *_pos);
1332 /* allow for the header line */
1333 if (!pos)
1334 return SEQ_START_TOKEN;
1335 pos--;
1336
1337 /* find the n'th element in the list */
1338 list_for_each(_p, &nfs_volume_list)
1339 if (!pos--)
1340 break;
1341
1342 return _p != &nfs_volume_list ? _p : NULL;
1343} 1282}
1344 1283
1345/* 1284/*
@@ -1347,14 +1286,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1347 */ 1286 */
1348static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) 1287static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1349{ 1288{
1350 struct list_head *_p; 1289 return seq_list_next(v, &nfs_volume_list, pos);
1351
1352 (*pos)++;
1353
1354 _p = v;
1355 _p = (v == SEQ_START_TOKEN) ? nfs_volume_list.next : _p->next;
1356
1357 return _p != &nfs_volume_list ? _p : NULL;
1358} 1290}
1359 1291
1360/* 1292/*
@@ -1375,7 +1307,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1375 char dev[8], fsid[17]; 1307 char dev[8], fsid[17];
1376 1308
1377 /* display header on line 1 */ 1309 /* display header on line 1 */
1378 if (v == SEQ_START_TOKEN) { 1310 if (v == &nfs_volume_list) {
1379 seq_puts(m, "NV SERVER PORT DEV FSID\n"); 1311 seq_puts(m, "NV SERVER PORT DEV FSID\n");
1380 return 0; 1312 return 0;
1381 } 1313 }
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f37d1bea8..20ac403469 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -27,6 +27,13 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
27 kfree(delegation); 27 kfree(delegation);
28} 28}
29 29
30static void nfs_free_delegation_callback(struct rcu_head *head)
31{
32 struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu);
33
34 nfs_free_delegation(delegation);
35}
36
30static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) 37static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state)
31{ 38{
32 struct inode *inode = state->inode; 39 struct inode *inode = state->inode;
@@ -57,7 +64,7 @@ out_err:
57 return status; 64 return status;
58} 65}
59 66
60static void nfs_delegation_claim_opens(struct inode *inode) 67static void nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid)
61{ 68{
62 struct nfs_inode *nfsi = NFS_I(inode); 69 struct nfs_inode *nfsi = NFS_I(inode);
63 struct nfs_open_context *ctx; 70 struct nfs_open_context *ctx;
@@ -72,9 +79,11 @@ again:
72 continue; 79 continue;
73 if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) 80 if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
74 continue; 81 continue;
82 if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0)
83 continue;
75 get_nfs_open_context(ctx); 84 get_nfs_open_context(ctx);
76 spin_unlock(&inode->i_lock); 85 spin_unlock(&inode->i_lock);
77 err = nfs4_open_delegation_recall(ctx->dentry, state); 86 err = nfs4_open_delegation_recall(ctx, state, stateid);
78 if (err >= 0) 87 if (err >= 0)
79 err = nfs_delegation_claim_locks(ctx, state); 88 err = nfs_delegation_claim_locks(ctx, state);
80 put_nfs_open_context(ctx); 89 put_nfs_open_context(ctx);
@@ -115,10 +124,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
115 struct nfs_delegation *delegation; 124 struct nfs_delegation *delegation;
116 int status = 0; 125 int status = 0;
117 126
118 /* Ensure we first revalidate the attributes and page cache! */
119 if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR)))
120 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
121
122 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); 127 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
123 if (delegation == NULL) 128 if (delegation == NULL)
124 return -ENOMEM; 129 return -ENOMEM;
@@ -131,10 +136,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
131 delegation->inode = inode; 136 delegation->inode = inode;
132 137
133 spin_lock(&clp->cl_lock); 138 spin_lock(&clp->cl_lock);
134 if (nfsi->delegation == NULL) { 139 if (rcu_dereference(nfsi->delegation) == NULL) {
135 list_add(&delegation->super_list, &clp->cl_delegations); 140 list_add_rcu(&delegation->super_list, &clp->cl_delegations);
136 nfsi->delegation = delegation;
137 nfsi->delegation_state = delegation->type; 141 nfsi->delegation_state = delegation->type;
142 rcu_assign_pointer(nfsi->delegation, delegation);
138 delegation = NULL; 143 delegation = NULL;
139 } else { 144 } else {
140 if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, 145 if (memcmp(&delegation->stateid, &nfsi->delegation->stateid,
@@ -145,6 +150,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
145 status = -EIO; 150 status = -EIO;
146 } 151 }
147 } 152 }
153
154 /* Ensure we revalidate the attributes and page cache! */
155 spin_lock(&inode->i_lock);
156 nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
157 spin_unlock(&inode->i_lock);
158
148 spin_unlock(&clp->cl_lock); 159 spin_unlock(&clp->cl_lock);
149 kfree(delegation); 160 kfree(delegation);
150 return status; 161 return status;
@@ -155,7 +166,7 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
155 int res = 0; 166 int res = 0;
156 167
157 res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid); 168 res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
158 nfs_free_delegation(delegation); 169 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
159 return res; 170 return res;
160} 171}
161 172
@@ -170,33 +181,55 @@ static void nfs_msync_inode(struct inode *inode)
170/* 181/*
171 * Basic procedure for returning a delegation to the server 182 * Basic procedure for returning a delegation to the server
172 */ 183 */
173int __nfs_inode_return_delegation(struct inode *inode) 184static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
174{ 185{
175 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 186 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
176 struct nfs_inode *nfsi = NFS_I(inode); 187 struct nfs_inode *nfsi = NFS_I(inode);
177 struct nfs_delegation *delegation;
178 int res = 0;
179 188
180 nfs_msync_inode(inode); 189 nfs_msync_inode(inode);
181 down_read(&clp->cl_sem); 190 down_read(&clp->cl_sem);
182 /* Guard against new delegated open calls */ 191 /* Guard against new delegated open calls */
183 down_write(&nfsi->rwsem); 192 down_write(&nfsi->rwsem);
184 spin_lock(&clp->cl_lock); 193 nfs_delegation_claim_opens(inode, &delegation->stateid);
185 delegation = nfsi->delegation;
186 if (delegation != NULL) {
187 list_del_init(&delegation->super_list);
188 nfsi->delegation = NULL;
189 nfsi->delegation_state = 0;
190 }
191 spin_unlock(&clp->cl_lock);
192 nfs_delegation_claim_opens(inode);
193 up_write(&nfsi->rwsem); 194 up_write(&nfsi->rwsem);
194 up_read(&clp->cl_sem); 195 up_read(&clp->cl_sem);
195 nfs_msync_inode(inode); 196 nfs_msync_inode(inode);
196 197
197 if (delegation != NULL) 198 return nfs_do_return_delegation(inode, delegation);
198 res = nfs_do_return_delegation(inode, delegation); 199}
199 return res; 200
201static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
202{
203 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
204
205 if (delegation == NULL)
206 goto nomatch;
207 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
208 sizeof(delegation->stateid.data)) != 0)
209 goto nomatch;
210 list_del_rcu(&delegation->super_list);
211 nfsi->delegation_state = 0;
212 rcu_assign_pointer(nfsi->delegation, NULL);
213 return delegation;
214nomatch:
215 return NULL;
216}
217
218int nfs_inode_return_delegation(struct inode *inode)
219{
220 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
221 struct nfs_inode *nfsi = NFS_I(inode);
222 struct nfs_delegation *delegation;
223 int err = 0;
224
225 if (rcu_dereference(nfsi->delegation) != NULL) {
226 spin_lock(&clp->cl_lock);
227 delegation = nfs_detach_delegation_locked(nfsi, NULL);
228 spin_unlock(&clp->cl_lock);
229 if (delegation != NULL)
230 err = __nfs_inode_return_delegation(inode, delegation);
231 }
232 return err;
200} 233}
201 234
202/* 235/*
@@ -211,19 +244,23 @@ void nfs_return_all_delegations(struct super_block *sb)
211 if (clp == NULL) 244 if (clp == NULL)
212 return; 245 return;
213restart: 246restart:
214 spin_lock(&clp->cl_lock); 247 rcu_read_lock();
215 list_for_each_entry(delegation, &clp->cl_delegations, super_list) { 248 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
216 if (delegation->inode->i_sb != sb) 249 if (delegation->inode->i_sb != sb)
217 continue; 250 continue;
218 inode = igrab(delegation->inode); 251 inode = igrab(delegation->inode);
219 if (inode == NULL) 252 if (inode == NULL)
220 continue; 253 continue;
254 spin_lock(&clp->cl_lock);
255 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
221 spin_unlock(&clp->cl_lock); 256 spin_unlock(&clp->cl_lock);
222 nfs_inode_return_delegation(inode); 257 rcu_read_unlock();
258 if (delegation != NULL)
259 __nfs_inode_return_delegation(inode, delegation);
223 iput(inode); 260 iput(inode);
224 goto restart; 261 goto restart;
225 } 262 }
226 spin_unlock(&clp->cl_lock); 263 rcu_read_unlock();
227} 264}
228 265
229static int nfs_do_expire_all_delegations(void *ptr) 266static int nfs_do_expire_all_delegations(void *ptr)
@@ -234,22 +271,26 @@ static int nfs_do_expire_all_delegations(void *ptr)
234 271
235 allow_signal(SIGKILL); 272 allow_signal(SIGKILL);
236restart: 273restart:
237 spin_lock(&clp->cl_lock);
238 if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0) 274 if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0)
239 goto out; 275 goto out;
240 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) 276 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0)
241 goto out; 277 goto out;
242 list_for_each_entry(delegation, &clp->cl_delegations, super_list) { 278 rcu_read_lock();
279 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
243 inode = igrab(delegation->inode); 280 inode = igrab(delegation->inode);
244 if (inode == NULL) 281 if (inode == NULL)
245 continue; 282 continue;
283 spin_lock(&clp->cl_lock);
284 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
246 spin_unlock(&clp->cl_lock); 285 spin_unlock(&clp->cl_lock);
247 nfs_inode_return_delegation(inode); 286 rcu_read_unlock();
287 if (delegation)
288 __nfs_inode_return_delegation(inode, delegation);
248 iput(inode); 289 iput(inode);
249 goto restart; 290 goto restart;
250 } 291 }
292 rcu_read_unlock();
251out: 293out:
252 spin_unlock(&clp->cl_lock);
253 nfs_put_client(clp); 294 nfs_put_client(clp);
254 module_put_and_exit(0); 295 module_put_and_exit(0);
255} 296}
@@ -280,17 +321,21 @@ void nfs_handle_cb_pathdown(struct nfs_client *clp)
280 if (clp == NULL) 321 if (clp == NULL)
281 return; 322 return;
282restart: 323restart:
283 spin_lock(&clp->cl_lock); 324 rcu_read_lock();
284 list_for_each_entry(delegation, &clp->cl_delegations, super_list) { 325 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
285 inode = igrab(delegation->inode); 326 inode = igrab(delegation->inode);
286 if (inode == NULL) 327 if (inode == NULL)
287 continue; 328 continue;
329 spin_lock(&clp->cl_lock);
330 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
288 spin_unlock(&clp->cl_lock); 331 spin_unlock(&clp->cl_lock);
289 nfs_inode_return_delegation(inode); 332 rcu_read_unlock();
333 if (delegation != NULL)
334 __nfs_inode_return_delegation(inode, delegation);
290 iput(inode); 335 iput(inode);
291 goto restart; 336 goto restart;
292 } 337 }
293 spin_unlock(&clp->cl_lock); 338 rcu_read_unlock();
294} 339}
295 340
296struct recall_threadargs { 341struct recall_threadargs {
@@ -316,21 +361,14 @@ static int recall_thread(void *data)
316 down_read(&clp->cl_sem); 361 down_read(&clp->cl_sem);
317 down_write(&nfsi->rwsem); 362 down_write(&nfsi->rwsem);
318 spin_lock(&clp->cl_lock); 363 spin_lock(&clp->cl_lock);
319 delegation = nfsi->delegation; 364 delegation = nfs_detach_delegation_locked(nfsi, args->stateid);
320 if (delegation != NULL && memcmp(delegation->stateid.data, 365 if (delegation != NULL)
321 args->stateid->data,
322 sizeof(delegation->stateid.data)) == 0) {
323 list_del_init(&delegation->super_list);
324 nfsi->delegation = NULL;
325 nfsi->delegation_state = 0;
326 args->result = 0; 366 args->result = 0;
327 } else { 367 else
328 delegation = NULL;
329 args->result = -ENOENT; 368 args->result = -ENOENT;
330 }
331 spin_unlock(&clp->cl_lock); 369 spin_unlock(&clp->cl_lock);
332 complete(&args->started); 370 complete(&args->started);
333 nfs_delegation_claim_opens(inode); 371 nfs_delegation_claim_opens(inode, args->stateid);
334 up_write(&nfsi->rwsem); 372 up_write(&nfsi->rwsem);
335 up_read(&clp->cl_sem); 373 up_read(&clp->cl_sem);
336 nfs_msync_inode(inode); 374 nfs_msync_inode(inode);
@@ -371,14 +409,14 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
371{ 409{
372 struct nfs_delegation *delegation; 410 struct nfs_delegation *delegation;
373 struct inode *res = NULL; 411 struct inode *res = NULL;
374 spin_lock(&clp->cl_lock); 412 rcu_read_lock();
375 list_for_each_entry(delegation, &clp->cl_delegations, super_list) { 413 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
376 if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { 414 if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
377 res = igrab(delegation->inode); 415 res = igrab(delegation->inode);
378 break; 416 break;
379 } 417 }
380 } 418 }
381 spin_unlock(&clp->cl_lock); 419 rcu_read_unlock();
382 return res; 420 return res;
383} 421}
384 422
@@ -388,10 +426,10 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
388void nfs_delegation_mark_reclaim(struct nfs_client *clp) 426void nfs_delegation_mark_reclaim(struct nfs_client *clp)
389{ 427{
390 struct nfs_delegation *delegation; 428 struct nfs_delegation *delegation;
391 spin_lock(&clp->cl_lock); 429 rcu_read_lock();
392 list_for_each_entry(delegation, &clp->cl_delegations, super_list) 430 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
393 delegation->flags |= NFS_DELEGATION_NEED_RECLAIM; 431 delegation->flags |= NFS_DELEGATION_NEED_RECLAIM;
394 spin_unlock(&clp->cl_lock); 432 rcu_read_unlock();
395} 433}
396 434
397/* 435/*
@@ -399,39 +437,35 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
399 */ 437 */
400void nfs_delegation_reap_unclaimed(struct nfs_client *clp) 438void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
401{ 439{
402 struct nfs_delegation *delegation, *n; 440 struct nfs_delegation *delegation;
403 LIST_HEAD(head); 441restart:
404 spin_lock(&clp->cl_lock); 442 rcu_read_lock();
405 list_for_each_entry_safe(delegation, n, &clp->cl_delegations, super_list) { 443 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
406 if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) 444 if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0)
407 continue; 445 continue;
408 list_move(&delegation->super_list, &head); 446 spin_lock(&clp->cl_lock);
409 NFS_I(delegation->inode)->delegation = NULL; 447 delegation = nfs_detach_delegation_locked(NFS_I(delegation->inode), NULL);
410 NFS_I(delegation->inode)->delegation_state = 0; 448 spin_unlock(&clp->cl_lock);
411 } 449 rcu_read_unlock();
412 spin_unlock(&clp->cl_lock); 450 if (delegation != NULL)
413 while(!list_empty(&head)) { 451 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
414 delegation = list_entry(head.next, struct nfs_delegation, super_list); 452 goto restart;
415 list_del(&delegation->super_list);
416 nfs_free_delegation(delegation);
417 } 453 }
454 rcu_read_unlock();
418} 455}
419 456
420int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) 457int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
421{ 458{
422 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
423 struct nfs_inode *nfsi = NFS_I(inode); 459 struct nfs_inode *nfsi = NFS_I(inode);
424 struct nfs_delegation *delegation; 460 struct nfs_delegation *delegation;
425 int res = 0; 461 int ret = 0;
426 462
427 if (nfsi->delegation_state == 0) 463 rcu_read_lock();
428 return 0; 464 delegation = rcu_dereference(nfsi->delegation);
429 spin_lock(&clp->cl_lock);
430 delegation = nfsi->delegation;
431 if (delegation != NULL) { 465 if (delegation != NULL) {
432 memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); 466 memcpy(dst->data, delegation->stateid.data, sizeof(dst->data));
433 res = 1; 467 ret = 1;
434 } 468 }
435 spin_unlock(&clp->cl_lock); 469 rcu_read_unlock();
436 return res; 470 return ret;
437} 471}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2cfd4b24c7..5874ce7fdb 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -22,11 +22,12 @@ struct nfs_delegation {
22 long flags; 22 long flags;
23 loff_t maxsize; 23 loff_t maxsize;
24 __u64 change_attr; 24 __u64 change_attr;
25 struct rcu_head rcu;
25}; 26};
26 27
27int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 28int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
28void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 29void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
29int __nfs_inode_return_delegation(struct inode *inode); 30int nfs_inode_return_delegation(struct inode *inode);
30int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); 31int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
31 32
32struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); 33struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
@@ -39,27 +40,24 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
39 40
40/* NFSv4 delegation-related procedures */ 41/* NFSv4 delegation-related procedures */
41int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid); 42int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid);
42int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state); 43int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
43int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); 44int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
44int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); 45int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
45 46
46static inline int nfs_have_delegation(struct inode *inode, int flags) 47static inline int nfs_have_delegation(struct inode *inode, int flags)
47{ 48{
49 struct nfs_delegation *delegation;
50 int ret = 0;
51
48 flags &= FMODE_READ|FMODE_WRITE; 52 flags &= FMODE_READ|FMODE_WRITE;
49 smp_rmb(); 53 rcu_read_lock();
50 if ((NFS_I(inode)->delegation_state & flags) == flags) 54 delegation = rcu_dereference(NFS_I(inode)->delegation);
51 return 1; 55 if (delegation != NULL && (delegation->type & flags) == flags)
52 return 0; 56 ret = 1;
57 rcu_read_unlock();
58 return ret;
53} 59}
54 60
55static inline int nfs_inode_return_delegation(struct inode *inode)
56{
57 int err = 0;
58
59 if (NFS_I(inode)->delegation != NULL)
60 err = __nfs_inode_return_delegation(inode);
61 return err;
62}
63#else 61#else
64static inline int nfs_have_delegation(struct inode *inode, int flags) 62static inline int nfs_have_delegation(struct inode *inode, int flags)
65{ 63{
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c27258b5d3..322141f4ab 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -897,14 +897,13 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
897 return (nd->intent.open.flags & O_EXCL) != 0; 897 return (nd->intent.open.flags & O_EXCL) != 0;
898} 898}
899 899
900static inline int nfs_reval_fsid(struct vfsmount *mnt, struct inode *dir, 900static inline int nfs_reval_fsid(struct inode *dir, const struct nfs_fattr *fattr)
901 struct nfs_fh *fh, struct nfs_fattr *fattr)
902{ 901{
903 struct nfs_server *server = NFS_SERVER(dir); 902 struct nfs_server *server = NFS_SERVER(dir);
904 903
905 if (!nfs_fsid_equal(&server->fsid, &fattr->fsid)) 904 if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
906 /* Revalidate fsid on root dir */ 905 /* Revalidate fsid using the parent directory */
907 return __nfs_revalidate_inode(server, mnt->mnt_root->d_inode); 906 return __nfs_revalidate_inode(server, dir);
908 return 0; 907 return 0;
909} 908}
910 909
@@ -946,7 +945,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
946 res = ERR_PTR(error); 945 res = ERR_PTR(error);
947 goto out_unlock; 946 goto out_unlock;
948 } 947 }
949 error = nfs_reval_fsid(nd->mnt, dir, &fhandle, &fattr); 948 error = nfs_reval_fsid(dir, &fattr);
950 if (error < 0) { 949 if (error < 0) {
951 res = ERR_PTR(error); 950 res = ERR_PTR(error);
952 goto out_unlock; 951 goto out_unlock;
@@ -1244,7 +1243,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1244 attr.ia_mode = mode; 1243 attr.ia_mode = mode;
1245 attr.ia_valid = ATTR_MODE; 1244 attr.ia_valid = ATTR_MODE;
1246 1245
1247 if (nd && (nd->flags & LOOKUP_CREATE)) 1246 if ((nd->flags & LOOKUP_CREATE) != 0)
1248 open_flags = nd->intent.open.flags; 1247 open_flags = nd->intent.open.flags;
1249 1248
1250 lock_kernel(); 1249 lock_kernel();
@@ -1535,7 +1534,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1535 1534
1536 lock_kernel(); 1535 lock_kernel();
1537 1536
1538 page = alloc_page(GFP_KERNEL); 1537 page = alloc_page(GFP_HIGHUSER);
1539 if (!page) { 1538 if (!page) {
1540 unlock_kernel(); 1539 unlock_kernel();
1541 return -ENOMEM; 1540 return -ENOMEM;
@@ -1744,8 +1743,8 @@ int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
1744 struct nfs_inode *nfsi; 1743 struct nfs_inode *nfsi;
1745 struct nfs_access_entry *cache; 1744 struct nfs_access_entry *cache;
1746 1745
1747 spin_lock(&nfs_access_lru_lock);
1748restart: 1746restart:
1747 spin_lock(&nfs_access_lru_lock);
1749 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1748 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
1750 struct inode *inode; 1749 struct inode *inode;
1751 1750
@@ -1770,6 +1769,7 @@ remove_lru_entry:
1770 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1769 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1771 } 1770 }
1772 spin_unlock(&inode->i_lock); 1771 spin_unlock(&inode->i_lock);
1772 spin_unlock(&nfs_access_lru_lock);
1773 iput(inode); 1773 iput(inode);
1774 goto restart; 1774 goto restart;
1775 } 1775 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 00eee87510..a5c82b6f3b 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,7 +266,7 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
266static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos) 266static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
267{ 267{
268 struct nfs_open_context *ctx = dreq->ctx; 268 struct nfs_open_context *ctx = dreq->ctx;
269 struct inode *inode = ctx->dentry->d_inode; 269 struct inode *inode = ctx->path.dentry->d_inode;
270 size_t rsize = NFS_SERVER(inode)->rsize; 270 size_t rsize = NFS_SERVER(inode)->rsize;
271 unsigned int pgbase; 271 unsigned int pgbase;
272 int result; 272 int result;
@@ -295,9 +295,14 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
295 break; 295 break;
296 } 296 }
297 if ((unsigned)result < data->npages) { 297 if ((unsigned)result < data->npages) {
298 nfs_direct_release_pages(data->pagevec, result); 298 bytes = result * PAGE_SIZE;
299 nfs_readdata_release(data); 299 if (bytes <= pgbase) {
300 break; 300 nfs_direct_release_pages(data->pagevec, result);
301 nfs_readdata_release(data);
302 break;
303 }
304 bytes -= pgbase;
305 data->npages = result;
301 } 306 }
302 307
303 get_dreq(dreq); 308 get_dreq(dreq);
@@ -601,7 +606,7 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
601static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync) 606static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
602{ 607{
603 struct nfs_open_context *ctx = dreq->ctx; 608 struct nfs_open_context *ctx = dreq->ctx;
604 struct inode *inode = ctx->dentry->d_inode; 609 struct inode *inode = ctx->path.dentry->d_inode;
605 size_t wsize = NFS_SERVER(inode)->wsize; 610 size_t wsize = NFS_SERVER(inode)->wsize;
606 unsigned int pgbase; 611 unsigned int pgbase;
607 int result; 612 int result;
@@ -630,9 +635,14 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
630 break; 635 break;
631 } 636 }
632 if ((unsigned)result < data->npages) { 637 if ((unsigned)result < data->npages) {
633 nfs_direct_release_pages(data->pagevec, result); 638 bytes = result * PAGE_SIZE;
634 nfs_writedata_release(data); 639 if (bytes <= pgbase) {
635 break; 640 nfs_direct_release_pages(data->pagevec, result);
641 nfs_writedata_release(data);
642 break;
643 }
644 bytes -= pgbase;
645 data->npages = result;
636 } 646 }
637 647
638 get_dreq(dreq); 648 get_dreq(dreq);
@@ -763,10 +773,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
763 (unsigned long) count, (long long) pos); 773 (unsigned long) count, (long long) pos);
764 774
765 if (nr_segs != 1) 775 if (nr_segs != 1)
766 return -EINVAL;
767
768 if (count < 0)
769 goto out; 776 goto out;
777
770 retval = -EFAULT; 778 retval = -EFAULT;
771 if (!access_ok(VERIFY_WRITE, buf, count)) 779 if (!access_ok(VERIFY_WRITE, buf, count))
772 goto out; 780 goto out;
@@ -814,7 +822,7 @@ out:
814ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 822ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
815 unsigned long nr_segs, loff_t pos) 823 unsigned long nr_segs, loff_t pos)
816{ 824{
817 ssize_t retval; 825 ssize_t retval = -EINVAL;
818 struct file *file = iocb->ki_filp; 826 struct file *file = iocb->ki_filp;
819 struct address_space *mapping = file->f_mapping; 827 struct address_space *mapping = file->f_mapping;
820 /* XXX: temporary */ 828 /* XXX: temporary */
@@ -827,7 +835,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
827 (unsigned long) count, (long long) pos); 835 (unsigned long) count, (long long) pos);
828 836
829 if (nr_segs != 1) 837 if (nr_segs != 1)
830 return -EINVAL; 838 goto out;
831 839
832 retval = generic_write_checks(file, &pos, &count, 0); 840 retval = generic_write_checks(file, &pos, &count, 0);
833 if (retval) 841 if (retval)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 9eb8eb4e4a..8689b736fd 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -41,7 +41,9 @@ static int nfs_file_open(struct inode *, struct file *);
41static int nfs_file_release(struct inode *, struct file *); 41static int nfs_file_release(struct inode *, struct file *);
42static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); 42static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
43static int nfs_file_mmap(struct file *, struct vm_area_struct *); 43static int nfs_file_mmap(struct file *, struct vm_area_struct *);
44static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *); 44static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
45 struct pipe_inode_info *pipe,
46 size_t count, unsigned int flags);
45static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, 47static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
46 unsigned long nr_segs, loff_t pos); 48 unsigned long nr_segs, loff_t pos);
47static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 49static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
@@ -65,7 +67,7 @@ const struct file_operations nfs_file_operations = {
65 .fsync = nfs_fsync, 67 .fsync = nfs_fsync,
66 .lock = nfs_lock, 68 .lock = nfs_lock,
67 .flock = nfs_flock, 69 .flock = nfs_flock,
68 .sendfile = nfs_file_sendfile, 70 .splice_read = nfs_file_splice_read,
69 .check_flags = nfs_check_flags, 71 .check_flags = nfs_check_flags,
70}; 72};
71 73
@@ -224,20 +226,21 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
224} 226}
225 227
226static ssize_t 228static ssize_t
227nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count, 229nfs_file_splice_read(struct file *filp, loff_t *ppos,
228 read_actor_t actor, void *target) 230 struct pipe_inode_info *pipe, size_t count,
231 unsigned int flags)
229{ 232{
230 struct dentry *dentry = filp->f_path.dentry; 233 struct dentry *dentry = filp->f_path.dentry;
231 struct inode *inode = dentry->d_inode; 234 struct inode *inode = dentry->d_inode;
232 ssize_t res; 235 ssize_t res;
233 236
234 dfprintk(VFS, "nfs: sendfile(%s/%s, %lu@%Lu)\n", 237 dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n",
235 dentry->d_parent->d_name.name, dentry->d_name.name, 238 dentry->d_parent->d_name.name, dentry->d_name.name,
236 (unsigned long) count, (unsigned long long) *ppos); 239 (unsigned long) count, (unsigned long long) *ppos);
237 240
238 res = nfs_revalidate_mapping(inode, filp->f_mapping); 241 res = nfs_revalidate_mapping(inode, filp->f_mapping);
239 if (!res) 242 if (!res)
240 res = generic_file_sendfile(filp, ppos, count, actor, target); 243 res = generic_file_splice_read(filp, ppos, pipe, count, flags);
241 return res; 244 return res;
242} 245}
243 246
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd9f5a8365..3d9fccf4ef 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -461,14 +461,14 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
461 461
462 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 462 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
463 if (ctx != NULL) { 463 if (ctx != NULL) {
464 atomic_set(&ctx->count, 1); 464 ctx->path.dentry = dget(dentry);
465 ctx->dentry = dget(dentry); 465 ctx->path.mnt = mntget(mnt);
466 ctx->vfsmnt = mntget(mnt);
467 ctx->cred = get_rpccred(cred); 466 ctx->cred = get_rpccred(cred);
468 ctx->state = NULL; 467 ctx->state = NULL;
469 ctx->lockowner = current->files; 468 ctx->lockowner = current->files;
470 ctx->error = 0; 469 ctx->error = 0;
471 ctx->dir_cookie = 0; 470 ctx->dir_cookie = 0;
471 kref_init(&ctx->kref);
472 } 472 }
473 return ctx; 473 return ctx;
474} 474}
@@ -476,27 +476,33 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
476struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) 476struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
477{ 477{
478 if (ctx != NULL) 478 if (ctx != NULL)
479 atomic_inc(&ctx->count); 479 kref_get(&ctx->kref);
480 return ctx; 480 return ctx;
481} 481}
482 482
483void put_nfs_open_context(struct nfs_open_context *ctx) 483static void nfs_free_open_context(struct kref *kref)
484{ 484{
485 if (atomic_dec_and_test(&ctx->count)) { 485 struct nfs_open_context *ctx = container_of(kref,
486 if (!list_empty(&ctx->list)) { 486 struct nfs_open_context, kref);
487 struct inode *inode = ctx->dentry->d_inode; 487
488 spin_lock(&inode->i_lock); 488 if (!list_empty(&ctx->list)) {
489 list_del(&ctx->list); 489 struct inode *inode = ctx->path.dentry->d_inode;
490 spin_unlock(&inode->i_lock); 490 spin_lock(&inode->i_lock);
491 } 491 list_del(&ctx->list);
492 if (ctx->state != NULL) 492 spin_unlock(&inode->i_lock);
493 nfs4_close_state(ctx->state, ctx->mode);
494 if (ctx->cred != NULL)
495 put_rpccred(ctx->cred);
496 dput(ctx->dentry);
497 mntput(ctx->vfsmnt);
498 kfree(ctx);
499 } 493 }
494 if (ctx->state != NULL)
495 nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
496 if (ctx->cred != NULL)
497 put_rpccred(ctx->cred);
498 dput(ctx->path.dentry);
499 mntput(ctx->path.mnt);
500 kfree(ctx);
501}
502
503void put_nfs_open_context(struct nfs_open_context *ctx)
504{
505 kref_put(&ctx->kref, nfs_free_open_context);
500} 506}
501 507
502/* 508/*
@@ -961,8 +967,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
961 goto out_changed; 967 goto out_changed;
962 968
963 server = NFS_SERVER(inode); 969 server = NFS_SERVER(inode);
964 /* Update the fsid if and only if this is the root directory */ 970 /* Update the fsid? */
965 if (inode == inode->i_sb->s_root->d_inode 971 if (S_ISDIR(inode->i_mode)
966 && !nfs_fsid_equal(&server->fsid, &fattr->fsid)) 972 && !nfs_fsid_equal(&server->fsid, &fattr->fsid))
967 server->fsid = fattr->fsid; 973 server->fsid = fattr->fsid;
968 974
@@ -1066,8 +1072,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1066 invalid &= ~NFS_INO_INVALID_DATA; 1072 invalid &= ~NFS_INO_INVALID_DATA;
1067 if (data_stable) 1073 if (data_stable)
1068 invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE); 1074 invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
1069 if (!nfs_have_delegation(inode, FMODE_READ)) 1075 if (!nfs_have_delegation(inode, FMODE_READ) ||
1076 (nfsi->cache_validity & NFS_INO_REVAL_FORCED))
1070 nfsi->cache_validity |= invalid; 1077 nfsi->cache_validity |= invalid;
1078 nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
1071 1079
1072 return 0; 1080 return 0;
1073 out_changed: 1081 out_changed:
@@ -1103,27 +1111,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1103 */ 1111 */
1104void nfs4_clear_inode(struct inode *inode) 1112void nfs4_clear_inode(struct inode *inode)
1105{ 1113{
1106 struct nfs_inode *nfsi = NFS_I(inode);
1107
1108 /* If we are holding a delegation, return it! */ 1114 /* If we are holding a delegation, return it! */
1109 nfs_inode_return_delegation(inode); 1115 nfs_inode_return_delegation(inode);
1110 /* First call standard NFS clear_inode() code */ 1116 /* First call standard NFS clear_inode() code */
1111 nfs_clear_inode(inode); 1117 nfs_clear_inode(inode);
1112 /* Now clear out any remaining state */
1113 while (!list_empty(&nfsi->open_states)) {
1114 struct nfs4_state *state;
1115
1116 state = list_entry(nfsi->open_states.next,
1117 struct nfs4_state,
1118 inode_states);
1119 dprintk("%s(%s/%Ld): found unclaimed NFSv4 state %p\n",
1120 __FUNCTION__,
1121 inode->i_sb->s_id,
1122 (long long)NFS_FILEID(inode),
1123 state);
1124 BUG_ON(atomic_read(&state->count) != 1);
1125 nfs4_close_state(state, state->state);
1126 }
1127} 1118}
1128#endif 1119#endif
1129 1120
@@ -1165,15 +1156,11 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
1165 struct nfs_inode *nfsi = (struct nfs_inode *) foo; 1156 struct nfs_inode *nfsi = (struct nfs_inode *) foo;
1166 1157
1167 inode_init_once(&nfsi->vfs_inode); 1158 inode_init_once(&nfsi->vfs_inode);
1168 spin_lock_init(&nfsi->req_lock);
1169 INIT_LIST_HEAD(&nfsi->dirty);
1170 INIT_LIST_HEAD(&nfsi->commit);
1171 INIT_LIST_HEAD(&nfsi->open_files); 1159 INIT_LIST_HEAD(&nfsi->open_files);
1172 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 1160 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
1173 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1161 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1174 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1162 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1175 atomic_set(&nfsi->data_updates, 0); 1163 atomic_set(&nfsi->data_updates, 0);
1176 nfsi->ndirty = 0;
1177 nfsi->ncommit = 0; 1164 nfsi->ncommit = 0;
1178 nfsi->npages = 0; 1165 nfsi->npages = 0;
1179 nfs4_init_once(nfsi); 1166 nfs4_init_once(nfsi);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ad2b40db1e..76cf55d571 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -183,9 +183,9 @@ unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
183/* 183/*
184 * Calculate the number of 512byte blocks used. 184 * Calculate the number of 512byte blocks used.
185 */ 185 */
186static inline unsigned long nfs_calc_block_size(u64 tsize) 186static inline blkcnt_t nfs_calc_block_size(u64 tsize)
187{ 187{
188 loff_t used = (tsize + 511) >> 9; 188 blkcnt_t used = (tsize + 511) >> 9;
189 return (used > ULONG_MAX) ? ULONG_MAX : used; 189 return (used > ULONG_MAX) ? ULONG_MAX : used;
190} 190}
191 191
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index ca5a266a31..8afd9f7e7a 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -1,7 +1,5 @@
1/* 1/*
2 * linux/fs/nfs/mount_clnt.c 2 * In-kernel MOUNT protocol client
3 *
4 * MOUNT client to support NFSroot.
5 * 3 *
6 * Copyright (C) 1997, Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1997, Olaf Kirch <okir@monad.swb.de>
7 */ 5 */
@@ -18,33 +16,31 @@
18#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
19 17
20#ifdef RPC_DEBUG 18#ifdef RPC_DEBUG
21# define NFSDBG_FACILITY NFSDBG_ROOT 19# define NFSDBG_FACILITY NFSDBG_MOUNT
22#endif 20#endif
23 21
24/*
25#define MOUNT_PROGRAM 100005
26#define MOUNT_VERSION 1
27#define MOUNT_MNT 1
28#define MOUNT_UMNT 3
29 */
30
31static struct rpc_clnt * mnt_create(char *, struct sockaddr_in *,
32 int, int);
33static struct rpc_program mnt_program; 22static struct rpc_program mnt_program;
34 23
35struct mnt_fhstatus { 24struct mnt_fhstatus {
36 unsigned int status; 25 u32 status;
37 struct nfs_fh * fh; 26 struct nfs_fh *fh;
38}; 27};
39 28
40/* 29/**
41 * Obtain an NFS file handle for the given host and path 30 * nfs_mount - Obtain an NFS file handle for the given host and path
31 * @addr: pointer to server's address
32 * @len: size of server's address
33 * @hostname: name of server host, or NULL
34 * @path: pointer to string containing export path to mount
35 * @version: mount version to use for this request
36 * @protocol: transport protocol to use for thie request
37 * @fh: pointer to location to place returned file handle
38 *
39 * Uses default timeout parameters specified by underlying transport.
42 */ 40 */
43int 41int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path,
44nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh, 42 int version, int protocol, struct nfs_fh *fh)
45 int version, int protocol)
46{ 43{
47 struct rpc_clnt *mnt_clnt;
48 struct mnt_fhstatus result = { 44 struct mnt_fhstatus result = {
49 .fh = fh 45 .fh = fh
50 }; 46 };
@@ -52,16 +48,25 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh,
52 .rpc_argp = path, 48 .rpc_argp = path,
53 .rpc_resp = &result, 49 .rpc_resp = &result,
54 }; 50 };
55 char hostname[32]; 51 struct rpc_create_args args = {
52 .protocol = protocol,
53 .address = addr,
54 .addrsize = len,
55 .servername = hostname,
56 .program = &mnt_program,
57 .version = version,
58 .authflavor = RPC_AUTH_UNIX,
59 .flags = RPC_CLNT_CREATE_INTR,
60 };
61 struct rpc_clnt *mnt_clnt;
56 int status; 62 int status;
57 63
58 dprintk("NFS: nfs_mount(%08x:%s)\n", 64 dprintk("NFS: sending MNT request for %s:%s\n",
59 (unsigned)ntohl(addr->sin_addr.s_addr), path); 65 (hostname ? hostname : "server"), path);
60 66
61 sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr->sin_addr.s_addr)); 67 mnt_clnt = rpc_create(&args);
62 mnt_clnt = mnt_create(hostname, addr, version, protocol);
63 if (IS_ERR(mnt_clnt)) 68 if (IS_ERR(mnt_clnt))
64 return PTR_ERR(mnt_clnt); 69 goto out_clnt_err;
65 70
66 if (version == NFS_MNT3_VERSION) 71 if (version == NFS_MNT3_VERSION)
67 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; 72 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
@@ -69,33 +74,39 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh,
69 msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; 74 msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
70 75
71 status = rpc_call_sync(mnt_clnt, &msg, 0); 76 status = rpc_call_sync(mnt_clnt, &msg, 0);
72 return status < 0? status : (result.status? -EACCES : 0); 77 rpc_shutdown_client(mnt_clnt);
73}
74 78
75static struct rpc_clnt * 79 if (status < 0)
76mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version, 80 goto out_call_err;
77 int protocol) 81 if (result.status != 0)
78{ 82 goto out_mnt_err;
79 struct rpc_create_args args = { 83
80 .protocol = protocol, 84 dprintk("NFS: MNT request succeeded\n");
81 .address = (struct sockaddr *)srvaddr, 85 status = 0;
82 .addrsize = sizeof(*srvaddr), 86
83 .servername = hostname, 87out:
84 .program = &mnt_program, 88 return status;
85 .version = version, 89
86 .authflavor = RPC_AUTH_UNIX, 90out_clnt_err:
87 .flags = (RPC_CLNT_CREATE_ONESHOT | 91 status = PTR_ERR(mnt_clnt);
88 RPC_CLNT_CREATE_INTR), 92 dprintk("NFS: failed to create RPC client, status=%d\n", status);
89 }; 93 goto out;
94
95out_call_err:
96 dprintk("NFS: failed to start MNT request, status=%d\n", status);
97 goto out;
90 98
91 return rpc_create(&args); 99out_mnt_err:
100 dprintk("NFS: MNT server returned result %d\n", result.status);
101 status = -EACCES;
102 goto out;
92} 103}
93 104
94/* 105/*
95 * XDR encode/decode functions for MOUNT 106 * XDR encode/decode functions for MOUNT
96 */ 107 */
97static int 108static int xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p,
98xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, const char *path) 109 const char *path)
99{ 110{
100 p = xdr_encode_string(p, path); 111 p = xdr_encode_string(p, path);
101 112
@@ -103,8 +114,8 @@ xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, const char *path)
103 return 0; 114 return 0;
104} 115}
105 116
106static int 117static int xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p,
107xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) 118 struct mnt_fhstatus *res)
108{ 119{
109 struct nfs_fh *fh = res->fh; 120 struct nfs_fh *fh = res->fh;
110 121
@@ -115,8 +126,8 @@ xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
115 return 0; 126 return 0;
116} 127}
117 128
118static int 129static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p,
119xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) 130 struct mnt_fhstatus *res)
120{ 131{
121 struct nfs_fh *fh = res->fh; 132 struct nfs_fh *fh = res->fh;
122 133
@@ -135,53 +146,53 @@ xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
135#define MNT_fhstatus_sz (1 + 8) 146#define MNT_fhstatus_sz (1 + 8)
136#define MNT_fhstatus3_sz (1 + 16) 147#define MNT_fhstatus3_sz (1 + 16)
137 148
138static struct rpc_procinfo mnt_procedures[] = { 149static struct rpc_procinfo mnt_procedures[] = {
139[MNTPROC_MNT] = { 150 [MNTPROC_MNT] = {
140 .p_proc = MNTPROC_MNT, 151 .p_proc = MNTPROC_MNT,
141 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 152 .p_encode = (kxdrproc_t) xdr_encode_dirpath,
142 .p_decode = (kxdrproc_t) xdr_decode_fhstatus, 153 .p_decode = (kxdrproc_t) xdr_decode_fhstatus,
143 .p_arglen = MNT_dirpath_sz, 154 .p_arglen = MNT_dirpath_sz,
144 .p_replen = MNT_fhstatus_sz, 155 .p_replen = MNT_fhstatus_sz,
145 .p_statidx = MNTPROC_MNT, 156 .p_statidx = MNTPROC_MNT,
146 .p_name = "MOUNT", 157 .p_name = "MOUNT",
147 }, 158 },
148}; 159};
149 160
150static struct rpc_procinfo mnt3_procedures[] = { 161static struct rpc_procinfo mnt3_procedures[] = {
151[MOUNTPROC3_MNT] = { 162 [MOUNTPROC3_MNT] = {
152 .p_proc = MOUNTPROC3_MNT, 163 .p_proc = MOUNTPROC3_MNT,
153 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 164 .p_encode = (kxdrproc_t) xdr_encode_dirpath,
154 .p_decode = (kxdrproc_t) xdr_decode_fhstatus3, 165 .p_decode = (kxdrproc_t) xdr_decode_fhstatus3,
155 .p_arglen = MNT_dirpath_sz, 166 .p_arglen = MNT_dirpath_sz,
156 .p_replen = MNT_fhstatus3_sz, 167 .p_replen = MNT_fhstatus3_sz,
157 .p_statidx = MOUNTPROC3_MNT, 168 .p_statidx = MOUNTPROC3_MNT,
158 .p_name = "MOUNT", 169 .p_name = "MOUNT",
159 }, 170 },
160}; 171};
161 172
162 173
163static struct rpc_version mnt_version1 = { 174static struct rpc_version mnt_version1 = {
164 .number = 1, 175 .number = 1,
165 .nrprocs = 2, 176 .nrprocs = 2,
166 .procs = mnt_procedures 177 .procs = mnt_procedures,
167}; 178};
168 179
169static struct rpc_version mnt_version3 = { 180static struct rpc_version mnt_version3 = {
170 .number = 3, 181 .number = 3,
171 .nrprocs = 2, 182 .nrprocs = 2,
172 .procs = mnt3_procedures 183 .procs = mnt3_procedures,
173}; 184};
174 185
175static struct rpc_version * mnt_version[] = { 186static struct rpc_version *mnt_version[] = {
176 NULL, 187 NULL,
177 &mnt_version1, 188 &mnt_version1,
178 NULL, 189 NULL,
179 &mnt_version3, 190 &mnt_version3,
180}; 191};
181 192
182static struct rpc_stat mnt_stats; 193static struct rpc_stat mnt_stats;
183 194
184static struct rpc_program mnt_program = { 195static struct rpc_program mnt_program = {
185 .name = "mount", 196 .name = "mount",
186 .number = NFS_MNT_PROGRAM, 197 .number = NFS_MNT_PROGRAM,
187 .nrvers = ARRAY_SIZE(mnt_version), 198 .nrvers = ARRAY_SIZE(mnt_version),
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index cd3ca7b5d3..7fcc78f2aa 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -223,7 +223,7 @@ nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
223static int 223static int
224nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 224nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
225{ 225{
226 struct rpc_auth *auth = req->rq_task->tk_auth; 226 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
227 unsigned int replen; 227 unsigned int replen;
228 u32 offset = (u32)args->offset; 228 u32 offset = (u32)args->offset;
229 u32 count = args->count; 229 u32 count = args->count;
@@ -380,7 +380,7 @@ static int
380nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) 380nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
381{ 381{
382 struct rpc_task *task = req->rq_task; 382 struct rpc_task *task = req->rq_task;
383 struct rpc_auth *auth = task->tk_auth; 383 struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth;
384 unsigned int replen; 384 unsigned int replen;
385 u32 count = args->count; 385 u32 count = args->count;
386 386
@@ -541,7 +541,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
541static int 541static int
542nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) 542nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
543{ 543{
544 struct rpc_auth *auth = req->rq_task->tk_auth; 544 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
545 unsigned int replen; 545 unsigned int replen;
546 546
547 p = xdr_encode_fhandle(p, args->fh); 547 p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 45268d6def..814d886b6a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -335,9 +335,7 @@ again:
335 * not sure this buys us anything (and I'd have 335 * not sure this buys us anything (and I'd have
336 * to revamp the NFSv3 XDR code) */ 336 * to revamp the NFSv3 XDR code) */
337 status = nfs3_proc_setattr(dentry, &fattr, sattr); 337 status = nfs3_proc_setattr(dentry, &fattr, sattr);
338 if (status == 0) 338 nfs_post_op_update_inode(dentry->d_inode, &fattr);
339 nfs_setattr_update_inode(dentry->d_inode, sattr);
340 nfs_refresh_inode(dentry->d_inode, &fattr);
341 dprintk("NFS reply setattr (post-create): %d\n", status); 339 dprintk("NFS reply setattr (post-create): %d\n", status);
342 } 340 }
343 if (status != 0) 341 if (status != 0)
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index b51df8eb9f..b4647a22f3 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -319,7 +319,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg
319static int 319static int
320nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 320nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
321{ 321{
322 struct rpc_auth *auth = req->rq_task->tk_auth; 322 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
323 unsigned int replen; 323 unsigned int replen;
324 u32 count = args->count; 324 u32 count = args->count;
325 325
@@ -458,7 +458,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
458static int 458static int
459nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) 459nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
460{ 460{
461 struct rpc_auth *auth = req->rq_task->tk_auth; 461 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
462 unsigned int replen; 462 unsigned int replen;
463 u32 count = args->count; 463 u32 count = args->count;
464 464
@@ -643,7 +643,7 @@ static int
643nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, 643nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
644 struct nfs3_getaclargs *args) 644 struct nfs3_getaclargs *args)
645{ 645{
646 struct rpc_auth *auth = req->rq_task->tk_auth; 646 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
647 unsigned int replen; 647 unsigned int replen;
648 648
649 p = xdr_encode_fhandle(p, args->fh); 649 p = xdr_encode_fhandle(p, args->fh);
@@ -773,7 +773,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
773static int 773static int
774nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 774nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
775{ 775{
776 struct rpc_auth *auth = req->rq_task->tk_auth; 776 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
777 unsigned int replen; 777 unsigned int replen;
778 778
779 p = xdr_encode_fhandle(p, args->fh); 779 p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index cf3a17eb5c..6c028e734f 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -70,19 +70,26 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status
70 seqid->flags |= NFS_SEQID_CONFIRMED; 70 seqid->flags |= NFS_SEQID_CONFIRMED;
71} 71}
72 72
73struct nfs_unique_id {
74 struct rb_node rb_node;
75 __u64 id;
76};
77
73/* 78/*
74 * NFS4 state_owners and lock_owners are simply labels for ordered 79 * NFS4 state_owners and lock_owners are simply labels for ordered
75 * sequences of RPC calls. Their sole purpose is to provide once-only 80 * sequences of RPC calls. Their sole purpose is to provide once-only
76 * semantics by allowing the server to identify replayed requests. 81 * semantics by allowing the server to identify replayed requests.
77 */ 82 */
78struct nfs4_state_owner { 83struct nfs4_state_owner {
79 spinlock_t so_lock; 84 struct nfs_unique_id so_owner_id;
80 struct list_head so_list; /* per-clientid list of state_owners */
81 struct nfs_client *so_client; 85 struct nfs_client *so_client;
82 u32 so_id; /* 32-bit identifier, unique */ 86 struct nfs_server *so_server;
83 atomic_t so_count; 87 struct rb_node so_client_node;
84 88
85 struct rpc_cred *so_cred; /* Associated cred */ 89 struct rpc_cred *so_cred; /* Associated cred */
90
91 spinlock_t so_lock;
92 atomic_t so_count;
86 struct list_head so_states; 93 struct list_head so_states;
87 struct list_head so_delegations; 94 struct list_head so_delegations;
88 struct nfs_seqid_counter so_seqid; 95 struct nfs_seqid_counter so_seqid;
@@ -108,7 +115,7 @@ struct nfs4_lock_state {
108#define NFS_LOCK_INITIALIZED 1 115#define NFS_LOCK_INITIALIZED 1
109 int ls_flags; 116 int ls_flags;
110 struct nfs_seqid_counter ls_seqid; 117 struct nfs_seqid_counter ls_seqid;
111 u32 ls_id; 118 struct nfs_unique_id ls_id;
112 nfs4_stateid ls_stateid; 119 nfs4_stateid ls_stateid;
113 atomic_t ls_count; 120 atomic_t ls_count;
114}; 121};
@@ -116,7 +123,10 @@ struct nfs4_lock_state {
116/* bits for nfs4_state->flags */ 123/* bits for nfs4_state->flags */
117enum { 124enum {
118 LK_STATE_IN_USE, 125 LK_STATE_IN_USE,
119 NFS_DELEGATED_STATE, 126 NFS_DELEGATED_STATE, /* Current stateid is delegation */
127 NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */
128 NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */
129 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
120}; 130};
121 131
122struct nfs4_state { 132struct nfs4_state {
@@ -130,11 +140,14 @@ struct nfs4_state {
130 unsigned long flags; /* Do we hold any locks? */ 140 unsigned long flags; /* Do we hold any locks? */
131 spinlock_t state_lock; /* Protects the lock_states list */ 141 spinlock_t state_lock; /* Protects the lock_states list */
132 142
133 nfs4_stateid stateid; 143 seqlock_t seqlock; /* Protects the stateid/open_stateid */
144 nfs4_stateid stateid; /* Current stateid: may be delegation */
145 nfs4_stateid open_stateid; /* OPEN stateid */
134 146
135 unsigned int n_rdonly; 147 /* The following 3 fields are protected by owner->so_lock */
136 unsigned int n_wronly; 148 unsigned int n_rdonly; /* Number of read-only references */
137 unsigned int n_rdwr; 149 unsigned int n_wronly; /* Number of write-only references */
150 unsigned int n_rdwr; /* Number of read/write references */
138 int state; /* State on the server (R,W, or RW) */ 151 int state; /* State on the server (R,W, or RW) */
139 atomic_t count; 152 atomic_t count;
140}; 153};
@@ -165,7 +178,7 @@ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struc
165extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 178extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
166extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 179extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
167extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 180extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
168extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); 181extern int nfs4_do_close(struct path *path, struct nfs4_state *state);
169extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 182extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
170extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 183extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
171extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 184extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
@@ -189,14 +202,13 @@ extern void nfs4_renew_state(struct work_struct *);
189 202
190/* nfs4state.c */ 203/* nfs4state.c */
191struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp); 204struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
192extern u32 nfs4_alloc_lockowner_id(struct nfs_client *);
193 205
194extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 206extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
195extern void nfs4_put_state_owner(struct nfs4_state_owner *); 207extern void nfs4_put_state_owner(struct nfs4_state_owner *);
196extern void nfs4_drop_state_owner(struct nfs4_state_owner *); 208extern void nfs4_drop_state_owner(struct nfs4_state_owner *);
197extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); 209extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
198extern void nfs4_put_open_state(struct nfs4_state *); 210extern void nfs4_put_open_state(struct nfs4_state *);
199extern void nfs4_close_state(struct nfs4_state *, mode_t); 211extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t);
200extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t); 212extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t);
201extern void nfs4_schedule_state_recovery(struct nfs_client *); 213extern void nfs4_schedule_state_recovery(struct nfs_client *);
202extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 214extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
@@ -222,7 +234,7 @@ extern struct svc_version nfs4_callback_version1;
222 234
223#else 235#else
224 236
225#define nfs4_close_state(a, b) do { } while (0) 237#define nfs4_close_state(a, b, c) do { } while (0)
226 238
227#endif /* CONFIG_NFS_V4 */ 239#endif /* CONFIG_NFS_V4 */
228#endif /* __LINUX_FS_NFS_NFS4_FS.H */ 240#endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 648e0ac0f9..fee2da856c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -65,6 +65,7 @@ static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *)
65static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); 65static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); 66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp); 67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
68static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags);
68 69
69/* Prevent leaks of NFSv4 errors into userland */ 70/* Prevent leaks of NFSv4 errors into userland */
70int nfs4_map_errors(int err) 71int nfs4_map_errors(int err)
@@ -214,27 +215,39 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
214} 215}
215 216
216struct nfs4_opendata { 217struct nfs4_opendata {
217 atomic_t count; 218 struct kref kref;
218 struct nfs_openargs o_arg; 219 struct nfs_openargs o_arg;
219 struct nfs_openres o_res; 220 struct nfs_openres o_res;
220 struct nfs_open_confirmargs c_arg; 221 struct nfs_open_confirmargs c_arg;
221 struct nfs_open_confirmres c_res; 222 struct nfs_open_confirmres c_res;
222 struct nfs_fattr f_attr; 223 struct nfs_fattr f_attr;
223 struct nfs_fattr dir_attr; 224 struct nfs_fattr dir_attr;
224 struct dentry *dentry; 225 struct path path;
225 struct dentry *dir; 226 struct dentry *dir;
226 struct nfs4_state_owner *owner; 227 struct nfs4_state_owner *owner;
228 struct nfs4_state *state;
227 struct iattr attrs; 229 struct iattr attrs;
228 unsigned long timestamp; 230 unsigned long timestamp;
231 unsigned int rpc_done : 1;
229 int rpc_status; 232 int rpc_status;
230 int cancelled; 233 int cancelled;
231}; 234};
232 235
233static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, 236
237static void nfs4_init_opendata_res(struct nfs4_opendata *p)
238{
239 p->o_res.f_attr = &p->f_attr;
240 p->o_res.dir_attr = &p->dir_attr;
241 p->o_res.server = p->o_arg.server;
242 nfs_fattr_init(&p->f_attr);
243 nfs_fattr_init(&p->dir_attr);
244}
245
246static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
234 struct nfs4_state_owner *sp, int flags, 247 struct nfs4_state_owner *sp, int flags,
235 const struct iattr *attrs) 248 const struct iattr *attrs)
236{ 249{
237 struct dentry *parent = dget_parent(dentry); 250 struct dentry *parent = dget_parent(path->dentry);
238 struct inode *dir = parent->d_inode; 251 struct inode *dir = parent->d_inode;
239 struct nfs_server *server = NFS_SERVER(dir); 252 struct nfs_server *server = NFS_SERVER(dir);
240 struct nfs4_opendata *p; 253 struct nfs4_opendata *p;
@@ -245,24 +258,19 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
245 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 258 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
246 if (p->o_arg.seqid == NULL) 259 if (p->o_arg.seqid == NULL)
247 goto err_free; 260 goto err_free;
248 atomic_set(&p->count, 1); 261 p->path.mnt = mntget(path->mnt);
249 p->dentry = dget(dentry); 262 p->path.dentry = dget(path->dentry);
250 p->dir = parent; 263 p->dir = parent;
251 p->owner = sp; 264 p->owner = sp;
252 atomic_inc(&sp->so_count); 265 atomic_inc(&sp->so_count);
253 p->o_arg.fh = NFS_FH(dir); 266 p->o_arg.fh = NFS_FH(dir);
254 p->o_arg.open_flags = flags, 267 p->o_arg.open_flags = flags,
255 p->o_arg.clientid = server->nfs_client->cl_clientid; 268 p->o_arg.clientid = server->nfs_client->cl_clientid;
256 p->o_arg.id = sp->so_id; 269 p->o_arg.id = sp->so_owner_id.id;
257 p->o_arg.name = &dentry->d_name; 270 p->o_arg.name = &p->path.dentry->d_name;
258 p->o_arg.server = server; 271 p->o_arg.server = server;
259 p->o_arg.bitmask = server->attr_bitmask; 272 p->o_arg.bitmask = server->attr_bitmask;
260 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; 273 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
261 p->o_res.f_attr = &p->f_attr;
262 p->o_res.dir_attr = &p->dir_attr;
263 p->o_res.server = server;
264 nfs_fattr_init(&p->f_attr);
265 nfs_fattr_init(&p->dir_attr);
266 if (flags & O_EXCL) { 274 if (flags & O_EXCL) {
267 u32 *s = (u32 *) p->o_arg.u.verifier.data; 275 u32 *s = (u32 *) p->o_arg.u.verifier.data;
268 s[0] = jiffies; 276 s[0] = jiffies;
@@ -274,6 +282,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
274 p->c_arg.fh = &p->o_res.fh; 282 p->c_arg.fh = &p->o_res.fh;
275 p->c_arg.stateid = &p->o_res.stateid; 283 p->c_arg.stateid = &p->o_res.stateid;
276 p->c_arg.seqid = p->o_arg.seqid; 284 p->c_arg.seqid = p->o_arg.seqid;
285 nfs4_init_opendata_res(p);
286 kref_init(&p->kref);
277 return p; 287 return p;
278err_free: 288err_free:
279 kfree(p); 289 kfree(p);
@@ -282,27 +292,25 @@ err:
282 return NULL; 292 return NULL;
283} 293}
284 294
285static void nfs4_opendata_free(struct nfs4_opendata *p) 295static void nfs4_opendata_free(struct kref *kref)
286{ 296{
287 if (p != NULL && atomic_dec_and_test(&p->count)) { 297 struct nfs4_opendata *p = container_of(kref,
288 nfs_free_seqid(p->o_arg.seqid); 298 struct nfs4_opendata, kref);
289 nfs4_put_state_owner(p->owner); 299
290 dput(p->dir); 300 nfs_free_seqid(p->o_arg.seqid);
291 dput(p->dentry); 301 if (p->state != NULL)
292 kfree(p); 302 nfs4_put_open_state(p->state);
293 } 303 nfs4_put_state_owner(p->owner);
304 dput(p->dir);
305 dput(p->path.dentry);
306 mntput(p->path.mnt);
307 kfree(p);
294} 308}
295 309
296/* Helper for asynchronous RPC calls */ 310static void nfs4_opendata_put(struct nfs4_opendata *p)
297static int nfs4_call_async(struct rpc_clnt *clnt,
298 const struct rpc_call_ops *tk_ops, void *calldata)
299{ 311{
300 struct rpc_task *task; 312 if (p != NULL)
301 313 kref_put(&p->kref, nfs4_opendata_free);
302 if (!(task = rpc_new_task(clnt, RPC_TASK_ASYNC, tk_ops, calldata)))
303 return -ENOMEM;
304 rpc_execute(task);
305 return 0;
306} 314}
307 315
308static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) 316static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
@@ -316,7 +324,34 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
316 return ret; 324 return ret;
317} 325}
318 326
319static inline void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) 327static int can_open_cached(struct nfs4_state *state, int mode)
328{
329 int ret = 0;
330 switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) {
331 case FMODE_READ:
332 ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
333 ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
334 break;
335 case FMODE_WRITE:
336 ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0;
337 ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
338 break;
339 case FMODE_READ|FMODE_WRITE:
340 ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
341 }
342 return ret;
343}
344
345static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags)
346{
347 if ((delegation->type & open_flags) != open_flags)
348 return 0;
349 if (delegation->flags & NFS_DELEGATION_NEED_RECLAIM)
350 return 0;
351 return 1;
352}
353
354static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
320{ 355{
321 switch (open_flags) { 356 switch (open_flags) {
322 case FMODE_WRITE: 357 case FMODE_WRITE:
@@ -328,41 +363,176 @@ static inline void update_open_stateflags(struct nfs4_state *state, mode_t open_
328 case FMODE_READ|FMODE_WRITE: 363 case FMODE_READ|FMODE_WRITE:
329 state->n_rdwr++; 364 state->n_rdwr++;
330 } 365 }
366 nfs4_state_set_mode_locked(state, state->state | open_flags);
331} 367}
332 368
333static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) 369static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
334{ 370{
335 struct inode *inode = state->inode; 371 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
372 memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
373 memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
374 switch (open_flags) {
375 case FMODE_READ:
376 set_bit(NFS_O_RDONLY_STATE, &state->flags);
377 break;
378 case FMODE_WRITE:
379 set_bit(NFS_O_WRONLY_STATE, &state->flags);
380 break;
381 case FMODE_READ|FMODE_WRITE:
382 set_bit(NFS_O_RDWR_STATE, &state->flags);
383 }
384}
385
386static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
387{
388 write_seqlock(&state->seqlock);
389 nfs_set_open_stateid_locked(state, stateid, open_flags);
390 write_sequnlock(&state->seqlock);
391}
336 392
393static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags)
394{
337 open_flags &= (FMODE_READ|FMODE_WRITE); 395 open_flags &= (FMODE_READ|FMODE_WRITE);
338 /* Protect against nfs4_find_state_byowner() */ 396 /*
397 * Protect the call to nfs4_state_set_mode_locked and
398 * serialise the stateid update
399 */
400 write_seqlock(&state->seqlock);
401 if (deleg_stateid != NULL) {
402 memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data));
403 set_bit(NFS_DELEGATED_STATE, &state->flags);
404 }
405 if (open_stateid != NULL)
406 nfs_set_open_stateid_locked(state, open_stateid, open_flags);
407 write_sequnlock(&state->seqlock);
339 spin_lock(&state->owner->so_lock); 408 spin_lock(&state->owner->so_lock);
340 spin_lock(&inode->i_lock);
341 memcpy(&state->stateid, stateid, sizeof(state->stateid));
342 update_open_stateflags(state, open_flags); 409 update_open_stateflags(state, open_flags);
343 nfs4_state_set_mode_locked(state, state->state | open_flags);
344 spin_unlock(&inode->i_lock);
345 spin_unlock(&state->owner->so_lock); 410 spin_unlock(&state->owner->so_lock);
346} 411}
347 412
413static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags)
414{
415 struct nfs_delegation *delegation;
416
417 rcu_read_lock();
418 delegation = rcu_dereference(NFS_I(inode)->delegation);
419 if (delegation == NULL || (delegation->type & open_flags) == open_flags) {
420 rcu_read_unlock();
421 return;
422 }
423 rcu_read_unlock();
424 nfs_inode_return_delegation(inode);
425}
426
427static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
428{
429 struct nfs4_state *state = opendata->state;
430 struct nfs_inode *nfsi = NFS_I(state->inode);
431 struct nfs_delegation *delegation;
432 int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL);
433 nfs4_stateid stateid;
434 int ret = -EAGAIN;
435
436 rcu_read_lock();
437 delegation = rcu_dereference(nfsi->delegation);
438 for (;;) {
439 if (can_open_cached(state, open_mode)) {
440 spin_lock(&state->owner->so_lock);
441 if (can_open_cached(state, open_mode)) {
442 update_open_stateflags(state, open_mode);
443 spin_unlock(&state->owner->so_lock);
444 rcu_read_unlock();
445 goto out_return_state;
446 }
447 spin_unlock(&state->owner->so_lock);
448 }
449 if (delegation == NULL)
450 break;
451 if (!can_open_delegated(delegation, open_mode))
452 break;
453 /* Save the delegation */
454 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
455 rcu_read_unlock();
456 lock_kernel();
457 ret = _nfs4_do_access(state->inode, state->owner->so_cred, open_mode);
458 unlock_kernel();
459 if (ret != 0)
460 goto out;
461 ret = -EAGAIN;
462 rcu_read_lock();
463 delegation = rcu_dereference(nfsi->delegation);
464 /* If no delegation, try a cached open */
465 if (delegation == NULL)
466 continue;
467 /* Is the delegation still valid? */
468 if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0)
469 continue;
470 rcu_read_unlock();
471 update_open_stateid(state, NULL, &stateid, open_mode);
472 goto out_return_state;
473 }
474 rcu_read_unlock();
475out:
476 return ERR_PTR(ret);
477out_return_state:
478 atomic_inc(&state->count);
479 return state;
480}
481
348static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) 482static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
349{ 483{
350 struct inode *inode; 484 struct inode *inode;
351 struct nfs4_state *state = NULL; 485 struct nfs4_state *state = NULL;
486 struct nfs_delegation *delegation;
487 nfs4_stateid *deleg_stateid = NULL;
488 int ret;
352 489
353 if (!(data->f_attr.valid & NFS_ATTR_FATTR)) 490 if (!data->rpc_done) {
491 state = nfs4_try_open_cached(data);
354 goto out; 492 goto out;
493 }
494
495 ret = -EAGAIN;
496 if (!(data->f_attr.valid & NFS_ATTR_FATTR))
497 goto err;
355 inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); 498 inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
499 ret = PTR_ERR(inode);
356 if (IS_ERR(inode)) 500 if (IS_ERR(inode))
357 goto out; 501 goto err;
502 ret = -ENOMEM;
358 state = nfs4_get_open_state(inode, data->owner); 503 state = nfs4_get_open_state(inode, data->owner);
359 if (state == NULL) 504 if (state == NULL)
360 goto put_inode; 505 goto err_put_inode;
361 update_open_stateid(state, &data->o_res.stateid, data->o_arg.open_flags); 506 if (data->o_res.delegation_type != 0) {
362put_inode: 507 int delegation_flags = 0;
508
509 rcu_read_lock();
510 delegation = rcu_dereference(NFS_I(inode)->delegation);
511 if (delegation)
512 delegation_flags = delegation->flags;
513 rcu_read_unlock();
514 if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM))
515 nfs_inode_set_delegation(state->inode,
516 data->owner->so_cred,
517 &data->o_res);
518 else
519 nfs_inode_reclaim_delegation(state->inode,
520 data->owner->so_cred,
521 &data->o_res);
522 }
523 rcu_read_lock();
524 delegation = rcu_dereference(NFS_I(inode)->delegation);
525 if (delegation != NULL)
526 deleg_stateid = &delegation->stateid;
527 update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags);
528 rcu_read_unlock();
363 iput(inode); 529 iput(inode);
364out: 530out:
365 return state; 531 return state;
532err_put_inode:
533 iput(inode);
534err:
535 return ERR_PTR(ret);
366} 536}
367 537
368static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) 538static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
@@ -382,79 +552,66 @@ static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *
382 return ERR_PTR(-ENOENT); 552 return ERR_PTR(-ENOENT);
383} 553}
384 554
385static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, nfs4_stateid *stateid) 555static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res)
386{ 556{
557 struct nfs4_state *newstate;
387 int ret; 558 int ret;
388 559
389 opendata->o_arg.open_flags = openflags; 560 opendata->o_arg.open_flags = openflags;
561 memset(&opendata->o_res, 0, sizeof(opendata->o_res));
562 memset(&opendata->c_res, 0, sizeof(opendata->c_res));
563 nfs4_init_opendata_res(opendata);
390 ret = _nfs4_proc_open(opendata); 564 ret = _nfs4_proc_open(opendata);
391 if (ret != 0) 565 if (ret != 0)
392 return ret; 566 return ret;
393 memcpy(stateid->data, opendata->o_res.stateid.data, 567 newstate = nfs4_opendata_to_nfs4_state(opendata);
394 sizeof(stateid->data)); 568 if (IS_ERR(newstate))
569 return PTR_ERR(newstate);
570 nfs4_close_state(&opendata->path, newstate, openflags);
571 *res = newstate;
395 return 0; 572 return 0;
396} 573}
397 574
398static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state) 575static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state)
399{ 576{
400 nfs4_stateid stateid;
401 struct nfs4_state *newstate; 577 struct nfs4_state *newstate;
402 int mode = 0;
403 int delegation = 0;
404 int ret; 578 int ret;
405 579
406 /* memory barrier prior to reading state->n_* */ 580 /* memory barrier prior to reading state->n_* */
581 clear_bit(NFS_DELEGATED_STATE, &state->flags);
407 smp_rmb(); 582 smp_rmb();
408 if (state->n_rdwr != 0) { 583 if (state->n_rdwr != 0) {
409 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &stateid); 584 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
410 if (ret != 0) 585 if (ret != 0)
411 return ret; 586 return ret;
412 mode |= FMODE_READ|FMODE_WRITE; 587 if (newstate != state)
413 if (opendata->o_res.delegation_type != 0) 588 return -ESTALE;
414 delegation = opendata->o_res.delegation_type;
415 smp_rmb();
416 } 589 }
417 if (state->n_wronly != 0) { 590 if (state->n_wronly != 0) {
418 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &stateid); 591 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
419 if (ret != 0) 592 if (ret != 0)
420 return ret; 593 return ret;
421 mode |= FMODE_WRITE; 594 if (newstate != state)
422 if (opendata->o_res.delegation_type != 0) 595 return -ESTALE;
423 delegation = opendata->o_res.delegation_type;
424 smp_rmb();
425 } 596 }
426 if (state->n_rdonly != 0) { 597 if (state->n_rdonly != 0) {
427 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &stateid); 598 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
428 if (ret != 0) 599 if (ret != 0)
429 return ret; 600 return ret;
430 mode |= FMODE_READ; 601 if (newstate != state)
602 return -ESTALE;
431 } 603 }
432 clear_bit(NFS_DELEGATED_STATE, &state->flags); 604 /*
433 if (mode == 0) 605 * We may have performed cached opens for all three recoveries.
434 return 0; 606 * Check if we need to update the current stateid.
435 if (opendata->o_res.delegation_type == 0) 607 */
436 opendata->o_res.delegation_type = delegation; 608 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 &&
437 opendata->o_arg.open_flags |= mode; 609 memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) {
438 newstate = nfs4_opendata_to_nfs4_state(opendata); 610 write_seqlock(&state->seqlock);
439 if (newstate != NULL) { 611 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
440 if (opendata->o_res.delegation_type != 0) { 612 memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data));
441 struct nfs_inode *nfsi = NFS_I(newstate->inode); 613 write_sequnlock(&state->seqlock);
442 int delegation_flags = 0;
443 if (nfsi->delegation)
444 delegation_flags = nfsi->delegation->flags;
445 if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM))
446 nfs_inode_set_delegation(newstate->inode,
447 opendata->owner->so_cred,
448 &opendata->o_res);
449 else
450 nfs_inode_reclaim_delegation(newstate->inode,
451 opendata->owner->so_cred,
452 &opendata->o_res);
453 }
454 nfs4_close_state(newstate, opendata->o_arg.open_flags);
455 } 614 }
456 if (newstate != state)
457 return -ESTALE;
458 return 0; 615 return 0;
459} 616}
460 617
@@ -462,41 +619,37 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
462 * OPEN_RECLAIM: 619 * OPEN_RECLAIM:
463 * reclaim state on the server after a reboot. 620 * reclaim state on the server after a reboot.
464 */ 621 */
465static int _nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) 622static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state)
466{ 623{
467 struct nfs_delegation *delegation = NFS_I(state->inode)->delegation; 624 struct nfs_delegation *delegation;
468 struct nfs4_opendata *opendata; 625 struct nfs4_opendata *opendata;
469 int delegation_type = 0; 626 int delegation_type = 0;
470 int status; 627 int status;
471 628
472 if (delegation != NULL) { 629 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL);
473 if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) {
474 memcpy(&state->stateid, &delegation->stateid,
475 sizeof(state->stateid));
476 set_bit(NFS_DELEGATED_STATE, &state->flags);
477 return 0;
478 }
479 delegation_type = delegation->type;
480 }
481 opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL);
482 if (opendata == NULL) 630 if (opendata == NULL)
483 return -ENOMEM; 631 return -ENOMEM;
484 opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; 632 opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS;
485 opendata->o_arg.fh = NFS_FH(state->inode); 633 opendata->o_arg.fh = NFS_FH(state->inode);
486 nfs_copy_fh(&opendata->o_res.fh, opendata->o_arg.fh); 634 nfs_copy_fh(&opendata->o_res.fh, opendata->o_arg.fh);
635 rcu_read_lock();
636 delegation = rcu_dereference(NFS_I(state->inode)->delegation);
637 if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0)
638 delegation_type = delegation->flags;
639 rcu_read_unlock();
487 opendata->o_arg.u.delegation_type = delegation_type; 640 opendata->o_arg.u.delegation_type = delegation_type;
488 status = nfs4_open_recover(opendata, state); 641 status = nfs4_open_recover(opendata, state);
489 nfs4_opendata_free(opendata); 642 nfs4_opendata_put(opendata);
490 return status; 643 return status;
491} 644}
492 645
493static int nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) 646static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state)
494{ 647{
495 struct nfs_server *server = NFS_SERVER(state->inode); 648 struct nfs_server *server = NFS_SERVER(state->inode);
496 struct nfs4_exception exception = { }; 649 struct nfs4_exception exception = { };
497 int err; 650 int err;
498 do { 651 do {
499 err = _nfs4_do_open_reclaim(sp, state, dentry); 652 err = _nfs4_do_open_reclaim(ctx, state);
500 if (err != -NFS4ERR_DELAY) 653 if (err != -NFS4ERR_DELAY)
501 break; 654 break;
502 nfs4_handle_exception(server, err, &exception); 655 nfs4_handle_exception(server, err, &exception);
@@ -512,37 +665,35 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta
512 ctx = nfs4_state_find_open_context(state); 665 ctx = nfs4_state_find_open_context(state);
513 if (IS_ERR(ctx)) 666 if (IS_ERR(ctx))
514 return PTR_ERR(ctx); 667 return PTR_ERR(ctx);
515 ret = nfs4_do_open_reclaim(sp, state, ctx->dentry); 668 ret = nfs4_do_open_reclaim(ctx, state);
516 put_nfs_open_context(ctx); 669 put_nfs_open_context(ctx);
517 return ret; 670 return ret;
518} 671}
519 672
520static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) 673static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
521{ 674{
522 struct nfs4_state_owner *sp = state->owner; 675 struct nfs4_state_owner *sp = state->owner;
523 struct nfs4_opendata *opendata; 676 struct nfs4_opendata *opendata;
524 int ret; 677 int ret;
525 678
526 if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) 679 opendata = nfs4_opendata_alloc(&ctx->path, sp, 0, NULL);
527 return 0;
528 opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL);
529 if (opendata == NULL) 680 if (opendata == NULL)
530 return -ENOMEM; 681 return -ENOMEM;
531 opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; 682 opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR;
532 memcpy(opendata->o_arg.u.delegation.data, state->stateid.data, 683 memcpy(opendata->o_arg.u.delegation.data, stateid->data,
533 sizeof(opendata->o_arg.u.delegation.data)); 684 sizeof(opendata->o_arg.u.delegation.data));
534 ret = nfs4_open_recover(opendata, state); 685 ret = nfs4_open_recover(opendata, state);
535 nfs4_opendata_free(opendata); 686 nfs4_opendata_put(opendata);
536 return ret; 687 return ret;
537} 688}
538 689
539int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) 690int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
540{ 691{
541 struct nfs4_exception exception = { }; 692 struct nfs4_exception exception = { };
542 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 693 struct nfs_server *server = NFS_SERVER(state->inode);
543 int err; 694 int err;
544 do { 695 do {
545 err = _nfs4_open_delegation_recall(dentry, state); 696 err = _nfs4_open_delegation_recall(ctx, state, stateid);
546 switch (err) { 697 switch (err) {
547 case 0: 698 case 0:
548 return err; 699 return err;
@@ -582,9 +733,10 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
582 memcpy(data->o_res.stateid.data, data->c_res.stateid.data, 733 memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
583 sizeof(data->o_res.stateid.data)); 734 sizeof(data->o_res.stateid.data));
584 renew_lease(data->o_res.server, data->timestamp); 735 renew_lease(data->o_res.server, data->timestamp);
736 data->rpc_done = 1;
585 } 737 }
586 nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid);
587 nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status); 738 nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status);
739 nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid);
588} 740}
589 741
590static void nfs4_open_confirm_release(void *calldata) 742static void nfs4_open_confirm_release(void *calldata)
@@ -596,14 +748,14 @@ static void nfs4_open_confirm_release(void *calldata)
596 if (data->cancelled == 0) 748 if (data->cancelled == 0)
597 goto out_free; 749 goto out_free;
598 /* In case of error, no cleanup! */ 750 /* In case of error, no cleanup! */
599 if (data->rpc_status != 0) 751 if (!data->rpc_done)
600 goto out_free; 752 goto out_free;
601 nfs_confirm_seqid(&data->owner->so_seqid, 0); 753 nfs_confirm_seqid(&data->owner->so_seqid, 0);
602 state = nfs4_opendata_to_nfs4_state(data); 754 state = nfs4_opendata_to_nfs4_state(data);
603 if (state != NULL) 755 if (!IS_ERR(state))
604 nfs4_close_state(state, data->o_arg.open_flags); 756 nfs4_close_state(&data->path, state, data->o_arg.open_flags);
605out_free: 757out_free:
606 nfs4_opendata_free(data); 758 nfs4_opendata_put(data);
607} 759}
608 760
609static const struct rpc_call_ops nfs4_open_confirm_ops = { 761static const struct rpc_call_ops nfs4_open_confirm_ops = {
@@ -621,12 +773,9 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
621 struct rpc_task *task; 773 struct rpc_task *task;
622 int status; 774 int status;
623 775
624 atomic_inc(&data->count); 776 kref_get(&data->kref);
625 /* 777 data->rpc_done = 0;
626 * If rpc_run_task() ends up calling ->rpc_release(), we 778 data->rpc_status = 0;
627 * want to ensure that it takes the 'error' code path.
628 */
629 data->rpc_status = -ENOMEM;
630 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data); 779 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data);
631 if (IS_ERR(task)) 780 if (IS_ERR(task))
632 return PTR_ERR(task); 781 return PTR_ERR(task);
@@ -653,13 +802,35 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
653 802
654 if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) 803 if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
655 return; 804 return;
805 /*
806 * Check if we still need to send an OPEN call, or if we can use
807 * a delegation instead.
808 */
809 if (data->state != NULL) {
810 struct nfs_delegation *delegation;
811
812 if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL)))
813 goto out_no_action;
814 rcu_read_lock();
815 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
816 if (delegation != NULL &&
817 (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) {
818 rcu_read_unlock();
819 goto out_no_action;
820 }
821 rcu_read_unlock();
822 }
656 /* Update sequence id. */ 823 /* Update sequence id. */
657 data->o_arg.id = sp->so_id; 824 data->o_arg.id = sp->so_owner_id.id;
658 data->o_arg.clientid = sp->so_client->cl_clientid; 825 data->o_arg.clientid = sp->so_client->cl_clientid;
659 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) 826 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
660 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; 827 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
661 data->timestamp = jiffies; 828 data->timestamp = jiffies;
662 rpc_call_setup(task, &msg, 0); 829 rpc_call_setup(task, &msg, 0);
830 return;
831out_no_action:
832 task->tk_action = NULL;
833
663} 834}
664 835
665static void nfs4_open_done(struct rpc_task *task, void *calldata) 836static void nfs4_open_done(struct rpc_task *task, void *calldata)
@@ -683,8 +854,11 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
683 data->rpc_status = -ENOTDIR; 854 data->rpc_status = -ENOTDIR;
684 } 855 }
685 renew_lease(data->o_res.server, data->timestamp); 856 renew_lease(data->o_res.server, data->timestamp);
857 if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM))
858 nfs_confirm_seqid(&data->owner->so_seqid, 0);
686 } 859 }
687 nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid); 860 nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid);
861 data->rpc_done = 1;
688} 862}
689 863
690static void nfs4_open_release(void *calldata) 864static void nfs4_open_release(void *calldata)
@@ -696,17 +870,17 @@ static void nfs4_open_release(void *calldata)
696 if (data->cancelled == 0) 870 if (data->cancelled == 0)
697 goto out_free; 871 goto out_free;
698 /* In case of error, no cleanup! */ 872 /* In case of error, no cleanup! */
699 if (data->rpc_status != 0) 873 if (data->rpc_status != 0 || !data->rpc_done)
700 goto out_free; 874 goto out_free;
701 /* In case we need an open_confirm, no cleanup! */ 875 /* In case we need an open_confirm, no cleanup! */
702 if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) 876 if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)
703 goto out_free; 877 goto out_free;
704 nfs_confirm_seqid(&data->owner->so_seqid, 0); 878 nfs_confirm_seqid(&data->owner->so_seqid, 0);
705 state = nfs4_opendata_to_nfs4_state(data); 879 state = nfs4_opendata_to_nfs4_state(data);
706 if (state != NULL) 880 if (!IS_ERR(state))
707 nfs4_close_state(state, data->o_arg.open_flags); 881 nfs4_close_state(&data->path, state, data->o_arg.open_flags);
708out_free: 882out_free:
709 nfs4_opendata_free(data); 883 nfs4_opendata_put(data);
710} 884}
711 885
712static const struct rpc_call_ops nfs4_open_ops = { 886static const struct rpc_call_ops nfs4_open_ops = {
@@ -727,12 +901,10 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
727 struct rpc_task *task; 901 struct rpc_task *task;
728 int status; 902 int status;
729 903
730 atomic_inc(&data->count); 904 kref_get(&data->kref);
731 /* 905 data->rpc_done = 0;
732 * If rpc_run_task() ends up calling ->rpc_release(), we 906 data->rpc_status = 0;
733 * want to ensure that it takes the 'error' code path. 907 data->cancelled = 0;
734 */
735 data->rpc_status = -ENOMEM;
736 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data); 908 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data);
737 if (IS_ERR(task)) 909 if (IS_ERR(task))
738 return PTR_ERR(task); 910 return PTR_ERR(task);
@@ -743,7 +915,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
743 } else 915 } else
744 status = data->rpc_status; 916 status = data->rpc_status;
745 rpc_put_task(task); 917 rpc_put_task(task);
746 if (status != 0) 918 if (status != 0 || !data->rpc_done)
747 return status; 919 return status;
748 920
749 if (o_arg->open_flags & O_CREAT) { 921 if (o_arg->open_flags & O_CREAT) {
@@ -756,7 +928,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
756 if (status != 0) 928 if (status != 0)
757 return status; 929 return status;
758 } 930 }
759 nfs_confirm_seqid(&data->owner->so_seqid, 0);
760 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) 931 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
761 return server->nfs_client->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); 932 return server->nfs_client->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr);
762 return 0; 933 return 0;
@@ -772,6 +943,8 @@ static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openf
772 mask |= MAY_READ; 943 mask |= MAY_READ;
773 if (openflags & FMODE_WRITE) 944 if (openflags & FMODE_WRITE)
774 mask |= MAY_WRITE; 945 mask |= MAY_WRITE;
946 if (openflags & FMODE_EXEC)
947 mask |= MAY_EXEC;
775 status = nfs_access_get_cached(inode, cred, &cache); 948 status = nfs_access_get_cached(inode, cred, &cache);
776 if (status == 0) 949 if (status == 0)
777 goto out; 950 goto out;
@@ -811,43 +984,32 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
811 * reclaim state on the server after a network partition. 984 * reclaim state on the server after a network partition.
812 * Assumes caller holds the appropriate lock 985 * Assumes caller holds the appropriate lock
813 */ 986 */
814static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) 987static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
815{ 988{
816 struct inode *inode = state->inode;
817 struct nfs_delegation *delegation = NFS_I(inode)->delegation;
818 struct nfs4_opendata *opendata; 989 struct nfs4_opendata *opendata;
819 int openflags = state->state & (FMODE_READ|FMODE_WRITE);
820 int ret; 990 int ret;
821 991
822 if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { 992 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL);
823 ret = _nfs4_do_access(inode, sp->so_cred, openflags);
824 if (ret < 0)
825 return ret;
826 memcpy(&state->stateid, &delegation->stateid, sizeof(state->stateid));
827 set_bit(NFS_DELEGATED_STATE, &state->flags);
828 return 0;
829 }
830 opendata = nfs4_opendata_alloc(dentry, sp, openflags, NULL);
831 if (opendata == NULL) 993 if (opendata == NULL)
832 return -ENOMEM; 994 return -ENOMEM;
833 ret = nfs4_open_recover(opendata, state); 995 ret = nfs4_open_recover(opendata, state);
834 if (ret == -ESTALE) { 996 if (ret == -ESTALE) {
835 /* Invalidate the state owner so we don't ever use it again */ 997 /* Invalidate the state owner so we don't ever use it again */
836 nfs4_drop_state_owner(sp); 998 nfs4_drop_state_owner(state->owner);
837 d_drop(dentry); 999 d_drop(ctx->path.dentry);
838 } 1000 }
839 nfs4_opendata_free(opendata); 1001 nfs4_opendata_put(opendata);
840 return ret; 1002 return ret;
841} 1003}
842 1004
843static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) 1005static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
844{ 1006{
845 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 1007 struct nfs_server *server = NFS_SERVER(state->inode);
846 struct nfs4_exception exception = { }; 1008 struct nfs4_exception exception = { };
847 int err; 1009 int err;
848 1010
849 do { 1011 do {
850 err = _nfs4_open_expired(sp, state, dentry); 1012 err = _nfs4_open_expired(ctx, state);
851 if (err == -NFS4ERR_DELAY) 1013 if (err == -NFS4ERR_DELAY)
852 nfs4_handle_exception(server, err, &exception); 1014 nfs4_handle_exception(server, err, &exception);
853 } while (exception.retry); 1015 } while (exception.retry);
@@ -862,107 +1024,38 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
862 ctx = nfs4_state_find_open_context(state); 1024 ctx = nfs4_state_find_open_context(state);
863 if (IS_ERR(ctx)) 1025 if (IS_ERR(ctx))
864 return PTR_ERR(ctx); 1026 return PTR_ERR(ctx);
865 ret = nfs4_do_open_expired(sp, state, ctx->dentry); 1027 ret = nfs4_do_open_expired(ctx, state);
866 put_nfs_open_context(ctx); 1028 put_nfs_open_context(ctx);
867 return ret; 1029 return ret;
868} 1030}
869 1031
870/* 1032/*
871 * Returns a referenced nfs4_state if there is an open delegation on the file 1033 * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*
1034 * fields corresponding to attributes that were used to store the verifier.
1035 * Make sure we clobber those fields in the later setattr call
872 */ 1036 */
873static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred, struct nfs4_state **res) 1037static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
874{
875 struct nfs_delegation *delegation;
876 struct nfs_server *server = NFS_SERVER(inode);
877 struct nfs_client *clp = server->nfs_client;
878 struct nfs_inode *nfsi = NFS_I(inode);
879 struct nfs4_state_owner *sp = NULL;
880 struct nfs4_state *state = NULL;
881 int open_flags = flags & (FMODE_READ|FMODE_WRITE);
882 int err;
883
884 err = -ENOMEM;
885 if (!(sp = nfs4_get_state_owner(server, cred))) {
886 dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__);
887 return err;
888 }
889 err = nfs4_recover_expired_lease(server);
890 if (err != 0)
891 goto out_put_state_owner;
892 /* Protect against reboot recovery - NOTE ORDER! */
893 down_read(&clp->cl_sem);
894 /* Protect against delegation recall */
895 down_read(&nfsi->rwsem);
896 delegation = NFS_I(inode)->delegation;
897 err = -ENOENT;
898 if (delegation == NULL || (delegation->type & open_flags) != open_flags)
899 goto out_err;
900 err = -ENOMEM;
901 state = nfs4_get_open_state(inode, sp);
902 if (state == NULL)
903 goto out_err;
904
905 err = -ENOENT;
906 if ((state->state & open_flags) == open_flags) {
907 spin_lock(&inode->i_lock);
908 update_open_stateflags(state, open_flags);
909 spin_unlock(&inode->i_lock);
910 goto out_ok;
911 } else if (state->state != 0)
912 goto out_put_open_state;
913
914 lock_kernel();
915 err = _nfs4_do_access(inode, cred, open_flags);
916 unlock_kernel();
917 if (err != 0)
918 goto out_put_open_state;
919 set_bit(NFS_DELEGATED_STATE, &state->flags);
920 update_open_stateid(state, &delegation->stateid, open_flags);
921out_ok:
922 nfs4_put_state_owner(sp);
923 up_read(&nfsi->rwsem);
924 up_read(&clp->cl_sem);
925 *res = state;
926 return 0;
927out_put_open_state:
928 nfs4_put_open_state(state);
929out_err:
930 up_read(&nfsi->rwsem);
931 up_read(&clp->cl_sem);
932 if (err != -EACCES)
933 nfs_inode_return_delegation(inode);
934out_put_state_owner:
935 nfs4_put_state_owner(sp);
936 return err;
937}
938
939static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred)
940{ 1038{
941 struct nfs4_exception exception = { }; 1039 if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
942 struct nfs4_state *res = ERR_PTR(-EIO); 1040 !(sattr->ia_valid & ATTR_ATIME_SET))
943 int err; 1041 sattr->ia_valid |= ATTR_ATIME;
944 1042
945 do { 1043 if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
946 err = _nfs4_open_delegated(inode, flags, cred, &res); 1044 !(sattr->ia_valid & ATTR_MTIME_SET))
947 if (err == 0) 1045 sattr->ia_valid |= ATTR_MTIME;
948 break;
949 res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(inode),
950 err, &exception));
951 } while (exception.retry);
952 return res;
953} 1046}
954 1047
955/* 1048/*
956 * Returns a referenced nfs4_state 1049 * Returns a referenced nfs4_state
957 */ 1050 */
958static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) 1051static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
959{ 1052{
960 struct nfs4_state_owner *sp; 1053 struct nfs4_state_owner *sp;
961 struct nfs4_state *state = NULL; 1054 struct nfs4_state *state = NULL;
962 struct nfs_server *server = NFS_SERVER(dir); 1055 struct nfs_server *server = NFS_SERVER(dir);
963 struct nfs_client *clp = server->nfs_client; 1056 struct nfs_client *clp = server->nfs_client;
964 struct nfs4_opendata *opendata; 1057 struct nfs4_opendata *opendata;
965 int status; 1058 int status;
966 1059
967 /* Protect against reboot recovery conflicts */ 1060 /* Protect against reboot recovery conflicts */
968 status = -ENOMEM; 1061 status = -ENOMEM;
@@ -973,29 +1066,35 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
973 status = nfs4_recover_expired_lease(server); 1066 status = nfs4_recover_expired_lease(server);
974 if (status != 0) 1067 if (status != 0)
975 goto err_put_state_owner; 1068 goto err_put_state_owner;
1069 if (path->dentry->d_inode != NULL)
1070 nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE));
976 down_read(&clp->cl_sem); 1071 down_read(&clp->cl_sem);
977 status = -ENOMEM; 1072 status = -ENOMEM;
978 opendata = nfs4_opendata_alloc(dentry, sp, flags, sattr); 1073 opendata = nfs4_opendata_alloc(path, sp, flags, sattr);
979 if (opendata == NULL) 1074 if (opendata == NULL)
980 goto err_release_rwsem; 1075 goto err_release_rwsem;
981 1076
1077 if (path->dentry->d_inode != NULL)
1078 opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp);
1079
982 status = _nfs4_proc_open(opendata); 1080 status = _nfs4_proc_open(opendata);
983 if (status != 0) 1081 if (status != 0)
984 goto err_opendata_free; 1082 goto err_opendata_put;
1083
1084 if (opendata->o_arg.open_flags & O_EXCL)
1085 nfs4_exclusive_attrset(opendata, sattr);
985 1086
986 status = -ENOMEM;
987 state = nfs4_opendata_to_nfs4_state(opendata); 1087 state = nfs4_opendata_to_nfs4_state(opendata);
988 if (state == NULL) 1088 status = PTR_ERR(state);
989 goto err_opendata_free; 1089 if (IS_ERR(state))
990 if (opendata->o_res.delegation_type != 0) 1090 goto err_opendata_put;
991 nfs_inode_set_delegation(state->inode, cred, &opendata->o_res); 1091 nfs4_opendata_put(opendata);
992 nfs4_opendata_free(opendata);
993 nfs4_put_state_owner(sp); 1092 nfs4_put_state_owner(sp);
994 up_read(&clp->cl_sem); 1093 up_read(&clp->cl_sem);
995 *res = state; 1094 *res = state;
996 return 0; 1095 return 0;
997err_opendata_free: 1096err_opendata_put:
998 nfs4_opendata_free(opendata); 1097 nfs4_opendata_put(opendata);
999err_release_rwsem: 1098err_release_rwsem:
1000 up_read(&clp->cl_sem); 1099 up_read(&clp->cl_sem);
1001err_put_state_owner: 1100err_put_state_owner:
@@ -1006,14 +1105,14 @@ out_err:
1006} 1105}
1007 1106
1008 1107
1009static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred) 1108static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred)
1010{ 1109{
1011 struct nfs4_exception exception = { }; 1110 struct nfs4_exception exception = { };
1012 struct nfs4_state *res; 1111 struct nfs4_state *res;
1013 int status; 1112 int status;
1014 1113
1015 do { 1114 do {
1016 status = _nfs4_do_open(dir, dentry, flags, sattr, cred, &res); 1115 status = _nfs4_do_open(dir, path, flags, sattr, cred, &res);
1017 if (status == 0) 1116 if (status == 0)
1018 break; 1117 break;
1019 /* NOTE: BAD_SEQID means the server and client disagree about the 1118 /* NOTE: BAD_SEQID means the server and client disagree about the
@@ -1028,7 +1127,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,
1028 * the user though... 1127 * the user though...
1029 */ 1128 */
1030 if (status == -NFS4ERR_BAD_SEQID) { 1129 if (status == -NFS4ERR_BAD_SEQID) {
1031 printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n"); 1130 printk(KERN_WARNING "NFS: v4 server %s "
1131 " returned a bad sequence-id error!\n",
1132 NFS_SERVER(dir)->nfs_client->cl_hostname);
1032 exception.retry = 1; 1133 exception.retry = 1;
1033 continue; 1134 continue;
1034 } 1135 }
@@ -1042,6 +1143,11 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,
1042 exception.retry = 1; 1143 exception.retry = 1;
1043 continue; 1144 continue;
1044 } 1145 }
1146 if (status == -EAGAIN) {
1147 /* We must have found a delegation */
1148 exception.retry = 1;
1149 continue;
1150 }
1045 res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), 1151 res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir),
1046 status, &exception)); 1152 status, &exception));
1047 } while (exception.retry); 1153 } while (exception.retry);
@@ -1101,6 +1207,7 @@ static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
1101} 1207}
1102 1208
1103struct nfs4_closedata { 1209struct nfs4_closedata {
1210 struct path path;
1104 struct inode *inode; 1211 struct inode *inode;
1105 struct nfs4_state *state; 1212 struct nfs4_state *state;
1106 struct nfs_closeargs arg; 1213 struct nfs_closeargs arg;
@@ -1117,6 +1224,8 @@ static void nfs4_free_closedata(void *data)
1117 nfs4_put_open_state(calldata->state); 1224 nfs4_put_open_state(calldata->state);
1118 nfs_free_seqid(calldata->arg.seqid); 1225 nfs_free_seqid(calldata->arg.seqid);
1119 nfs4_put_state_owner(sp); 1226 nfs4_put_state_owner(sp);
1227 dput(calldata->path.dentry);
1228 mntput(calldata->path.mnt);
1120 kfree(calldata); 1229 kfree(calldata);
1121} 1230}
1122 1231
@@ -1134,8 +1243,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1134 nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid); 1243 nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid);
1135 switch (task->tk_status) { 1244 switch (task->tk_status) {
1136 case 0: 1245 case 0:
1137 memcpy(&state->stateid, &calldata->res.stateid, 1246 nfs_set_open_stateid(state, &calldata->res.stateid, calldata->arg.open_flags);
1138 sizeof(state->stateid));
1139 renew_lease(server, calldata->timestamp); 1247 renew_lease(server, calldata->timestamp);
1140 break; 1248 break;
1141 case -NFS4ERR_STALE_STATEID: 1249 case -NFS4ERR_STALE_STATEID:
@@ -1160,26 +1268,30 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1160 .rpc_resp = &calldata->res, 1268 .rpc_resp = &calldata->res,
1161 .rpc_cred = state->owner->so_cred, 1269 .rpc_cred = state->owner->so_cred,
1162 }; 1270 };
1163 int mode = 0, old_mode; 1271 int clear_rd, clear_wr, clear_rdwr;
1272 int mode;
1164 1273
1165 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) 1274 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
1166 return; 1275 return;
1167 /* Recalculate the new open mode in case someone reopened the file 1276
1168 * while we were waiting in line to be scheduled. 1277 mode = FMODE_READ|FMODE_WRITE;
1169 */ 1278 clear_rd = clear_wr = clear_rdwr = 0;
1170 spin_lock(&state->owner->so_lock); 1279 spin_lock(&state->owner->so_lock);
1171 spin_lock(&calldata->inode->i_lock); 1280 /* Calculate the change in open mode */
1172 mode = old_mode = state->state;
1173 if (state->n_rdwr == 0) { 1281 if (state->n_rdwr == 0) {
1174 if (state->n_rdonly == 0) 1282 if (state->n_rdonly == 0) {
1175 mode &= ~FMODE_READ; 1283 mode &= ~FMODE_READ;
1176 if (state->n_wronly == 0) 1284 clear_rd |= test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1285 clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
1286 }
1287 if (state->n_wronly == 0) {
1177 mode &= ~FMODE_WRITE; 1288 mode &= ~FMODE_WRITE;
1289 clear_wr |= test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1290 clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
1291 }
1178 } 1292 }
1179 nfs4_state_set_mode_locked(state, mode);
1180 spin_unlock(&calldata->inode->i_lock);
1181 spin_unlock(&state->owner->so_lock); 1293 spin_unlock(&state->owner->so_lock);
1182 if (mode == old_mode || test_bit(NFS_DELEGATED_STATE, &state->flags)) { 1294 if (!clear_rd && !clear_wr && !clear_rdwr) {
1183 /* Note: exit _without_ calling nfs4_close_done */ 1295 /* Note: exit _without_ calling nfs4_close_done */
1184 task->tk_action = NULL; 1296 task->tk_action = NULL;
1185 return; 1297 return;
@@ -1209,19 +1321,21 @@ static const struct rpc_call_ops nfs4_close_ops = {
1209 * 1321 *
1210 * NOTE: Caller must be holding the sp->so_owner semaphore! 1322 * NOTE: Caller must be holding the sp->so_owner semaphore!
1211 */ 1323 */
1212int nfs4_do_close(struct inode *inode, struct nfs4_state *state) 1324int nfs4_do_close(struct path *path, struct nfs4_state *state)
1213{ 1325{
1214 struct nfs_server *server = NFS_SERVER(inode); 1326 struct nfs_server *server = NFS_SERVER(state->inode);
1215 struct nfs4_closedata *calldata; 1327 struct nfs4_closedata *calldata;
1328 struct nfs4_state_owner *sp = state->owner;
1329 struct rpc_task *task;
1216 int status = -ENOMEM; 1330 int status = -ENOMEM;
1217 1331
1218 calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); 1332 calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
1219 if (calldata == NULL) 1333 if (calldata == NULL)
1220 goto out; 1334 goto out;
1221 calldata->inode = inode; 1335 calldata->inode = state->inode;
1222 calldata->state = state; 1336 calldata->state = state;
1223 calldata->arg.fh = NFS_FH(inode); 1337 calldata->arg.fh = NFS_FH(state->inode);
1224 calldata->arg.stateid = &state->stateid; 1338 calldata->arg.stateid = &state->open_stateid;
1225 /* Serialization for the sequence id */ 1339 /* Serialization for the sequence id */
1226 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1340 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
1227 if (calldata->arg.seqid == NULL) 1341 if (calldata->arg.seqid == NULL)
@@ -1229,36 +1343,55 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state)
1229 calldata->arg.bitmask = server->attr_bitmask; 1343 calldata->arg.bitmask = server->attr_bitmask;
1230 calldata->res.fattr = &calldata->fattr; 1344 calldata->res.fattr = &calldata->fattr;
1231 calldata->res.server = server; 1345 calldata->res.server = server;
1346 calldata->path.mnt = mntget(path->mnt);
1347 calldata->path.dentry = dget(path->dentry);
1232 1348
1233 status = nfs4_call_async(server->client, &nfs4_close_ops, calldata); 1349 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_close_ops, calldata);
1234 if (status == 0) 1350 if (IS_ERR(task))
1235 goto out; 1351 return PTR_ERR(task);
1236 1352 rpc_put_task(task);
1237 nfs_free_seqid(calldata->arg.seqid); 1353 return 0;
1238out_free_calldata: 1354out_free_calldata:
1239 kfree(calldata); 1355 kfree(calldata);
1240out: 1356out:
1357 nfs4_put_open_state(state);
1358 nfs4_put_state_owner(sp);
1241 return status; 1359 return status;
1242} 1360}
1243 1361
1244static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) 1362static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state)
1245{ 1363{
1246 struct file *filp; 1364 struct file *filp;
1365 int ret;
1247 1366
1248 filp = lookup_instantiate_filp(nd, dentry, NULL); 1367 /* If the open_intent is for execute, we have an extra check to make */
1368 if (nd->intent.open.flags & FMODE_EXEC) {
1369 ret = _nfs4_do_access(state->inode,
1370 state->owner->so_cred,
1371 nd->intent.open.flags);
1372 if (ret < 0)
1373 goto out_close;
1374 }
1375 filp = lookup_instantiate_filp(nd, path->dentry, NULL);
1249 if (!IS_ERR(filp)) { 1376 if (!IS_ERR(filp)) {
1250 struct nfs_open_context *ctx; 1377 struct nfs_open_context *ctx;
1251 ctx = (struct nfs_open_context *)filp->private_data; 1378 ctx = (struct nfs_open_context *)filp->private_data;
1252 ctx->state = state; 1379 ctx->state = state;
1253 return 0; 1380 return 0;
1254 } 1381 }
1255 nfs4_close_state(state, nd->intent.open.flags); 1382 ret = PTR_ERR(filp);
1256 return PTR_ERR(filp); 1383out_close:
1384 nfs4_close_state(path, state, nd->intent.open.flags);
1385 return ret;
1257} 1386}
1258 1387
1259struct dentry * 1388struct dentry *
1260nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 1389nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1261{ 1390{
1391 struct path path = {
1392 .mnt = nd->mnt,
1393 .dentry = dentry,
1394 };
1262 struct iattr attr; 1395 struct iattr attr;
1263 struct rpc_cred *cred; 1396 struct rpc_cred *cred;
1264 struct nfs4_state *state; 1397 struct nfs4_state *state;
@@ -1277,7 +1410,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1277 cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); 1410 cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
1278 if (IS_ERR(cred)) 1411 if (IS_ERR(cred))
1279 return (struct dentry *)cred; 1412 return (struct dentry *)cred;
1280 state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); 1413 state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred);
1281 put_rpccred(cred); 1414 put_rpccred(cred);
1282 if (IS_ERR(state)) { 1415 if (IS_ERR(state)) {
1283 if (PTR_ERR(state) == -ENOENT) 1416 if (PTR_ERR(state) == -ENOENT)
@@ -1287,22 +1420,24 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1287 res = d_add_unique(dentry, igrab(state->inode)); 1420 res = d_add_unique(dentry, igrab(state->inode));
1288 if (res != NULL) 1421 if (res != NULL)
1289 dentry = res; 1422 dentry = res;
1290 nfs4_intent_set_file(nd, dentry, state); 1423 nfs4_intent_set_file(nd, &path, state);
1291 return res; 1424 return res;
1292} 1425}
1293 1426
1294int 1427int
1295nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd) 1428nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
1296{ 1429{
1430 struct path path = {
1431 .mnt = nd->mnt,
1432 .dentry = dentry,
1433 };
1297 struct rpc_cred *cred; 1434 struct rpc_cred *cred;
1298 struct nfs4_state *state; 1435 struct nfs4_state *state;
1299 1436
1300 cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); 1437 cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
1301 if (IS_ERR(cred)) 1438 if (IS_ERR(cred))
1302 return PTR_ERR(cred); 1439 return PTR_ERR(cred);
1303 state = nfs4_open_delegated(dentry->d_inode, openflags, cred); 1440 state = nfs4_do_open(dir, &path, openflags, NULL, cred);
1304 if (IS_ERR(state))
1305 state = nfs4_do_open(dir, dentry, openflags, NULL, cred);
1306 put_rpccred(cred); 1441 put_rpccred(cred);
1307 if (IS_ERR(state)) { 1442 if (IS_ERR(state)) {
1308 switch (PTR_ERR(state)) { 1443 switch (PTR_ERR(state)) {
@@ -1318,10 +1453,10 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1318 } 1453 }
1319 } 1454 }
1320 if (state->inode == dentry->d_inode) { 1455 if (state->inode == dentry->d_inode) {
1321 nfs4_intent_set_file(nd, dentry, state); 1456 nfs4_intent_set_file(nd, &path, state);
1322 return 1; 1457 return 1;
1323 } 1458 }
1324 nfs4_close_state(state, openflags); 1459 nfs4_close_state(&path, state, openflags);
1325out_drop: 1460out_drop:
1326 d_drop(dentry); 1461 d_drop(dentry);
1327 return 0; 1462 return 0;
@@ -1559,8 +1694,6 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
1559 dprintk("NFS call lookupfh %s\n", name->name); 1694 dprintk("NFS call lookupfh %s\n", name->name);
1560 status = rpc_call_sync(server->client, &msg, 0); 1695 status = rpc_call_sync(server->client, &msg, 0);
1561 dprintk("NFS reply lookupfh: %d\n", status); 1696 dprintk("NFS reply lookupfh: %d\n", status);
1562 if (status == -NFS4ERR_MOVED)
1563 status = -EREMOTE;
1564 return status; 1697 return status;
1565} 1698}
1566 1699
@@ -1571,10 +1704,13 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
1571 struct nfs4_exception exception = { }; 1704 struct nfs4_exception exception = { };
1572 int err; 1705 int err;
1573 do { 1706 do {
1574 err = nfs4_handle_exception(server, 1707 err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr);
1575 _nfs4_proc_lookupfh(server, dirfh, name, 1708 /* FIXME: !!!! */
1576 fhandle, fattr), 1709 if (err == -NFS4ERR_MOVED) {
1577 &exception); 1710 err = -EREMOTE;
1711 break;
1712 }
1713 err = nfs4_handle_exception(server, err, &exception);
1578 } while (exception.retry); 1714 } while (exception.retry);
1579 return err; 1715 return err;
1580} 1716}
@@ -1582,28 +1718,10 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
1582static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name, 1718static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
1583 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 1719 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
1584{ 1720{
1585 int status; 1721 int status;
1586 struct nfs_server *server = NFS_SERVER(dir);
1587 struct nfs4_lookup_arg args = {
1588 .bitmask = server->attr_bitmask,
1589 .dir_fh = NFS_FH(dir),
1590 .name = name,
1591 };
1592 struct nfs4_lookup_res res = {
1593 .server = server,
1594 .fattr = fattr,
1595 .fh = fhandle,
1596 };
1597 struct rpc_message msg = {
1598 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP],
1599 .rpc_argp = &args,
1600 .rpc_resp = &res,
1601 };
1602
1603 nfs_fattr_init(fattr);
1604 1722
1605 dprintk("NFS call lookup %s\n", name->name); 1723 dprintk("NFS call lookup %s\n", name->name);
1606 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 1724 status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr);
1607 if (status == -NFS4ERR_MOVED) 1725 if (status == -NFS4ERR_MOVED)
1608 status = nfs4_get_referral(dir, name, fattr, fhandle); 1726 status = nfs4_get_referral(dir, name, fattr, fhandle);
1609 dprintk("NFS reply lookup: %d\n", status); 1727 dprintk("NFS reply lookup: %d\n", status);
@@ -1752,6 +1870,10 @@ static int
1752nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 1870nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1753 int flags, struct nameidata *nd) 1871 int flags, struct nameidata *nd)
1754{ 1872{
1873 struct path path = {
1874 .mnt = nd->mnt,
1875 .dentry = dentry,
1876 };
1755 struct nfs4_state *state; 1877 struct nfs4_state *state;
1756 struct rpc_cred *cred; 1878 struct rpc_cred *cred;
1757 int status = 0; 1879 int status = 0;
@@ -1761,7 +1883,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1761 status = PTR_ERR(cred); 1883 status = PTR_ERR(cred);
1762 goto out; 1884 goto out;
1763 } 1885 }
1764 state = nfs4_do_open(dir, dentry, flags, sattr, cred); 1886 state = nfs4_do_open(dir, &path, flags, sattr, cred);
1765 put_rpccred(cred); 1887 put_rpccred(cred);
1766 if (IS_ERR(state)) { 1888 if (IS_ERR(state)) {
1767 status = PTR_ERR(state); 1889 status = PTR_ERR(state);
@@ -1773,11 +1895,12 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1773 status = nfs4_do_setattr(state->inode, &fattr, sattr, state); 1895 status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
1774 if (status == 0) 1896 if (status == 0)
1775 nfs_setattr_update_inode(state->inode, sattr); 1897 nfs_setattr_update_inode(state->inode, sattr);
1898 nfs_post_op_update_inode(state->inode, &fattr);
1776 } 1899 }
1777 if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) 1900 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
1778 status = nfs4_intent_set_file(nd, dentry, state); 1901 status = nfs4_intent_set_file(nd, &path, state);
1779 else 1902 else
1780 nfs4_close_state(state, flags); 1903 nfs4_close_state(&path, state, flags);
1781out: 1904out:
1782 return status; 1905 return status;
1783} 1906}
@@ -3008,7 +3131,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3008 if (status != 0) 3131 if (status != 0)
3009 goto out; 3132 goto out;
3010 lsp = request->fl_u.nfs4_fl.owner; 3133 lsp = request->fl_u.nfs4_fl.owner;
3011 arg.lock_owner.id = lsp->ls_id; 3134 arg.lock_owner.id = lsp->ls_id.id;
3012 status = rpc_call_sync(server->client, &msg, 0); 3135 status = rpc_call_sync(server->client, &msg, 0);
3013 switch (status) { 3136 switch (status) {
3014 case 0: 3137 case 0:
@@ -3152,6 +3275,11 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3152{ 3275{
3153 struct nfs4_unlockdata *data; 3276 struct nfs4_unlockdata *data;
3154 3277
3278 /* Ensure this is an unlock - when canceling a lock, the
3279 * canceled lock is passed in, and it won't be an unlock.
3280 */
3281 fl->fl_type = F_UNLCK;
3282
3155 data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); 3283 data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
3156 if (data == NULL) { 3284 if (data == NULL) {
3157 nfs_free_seqid(seqid); 3285 nfs_free_seqid(seqid);
@@ -3222,7 +3350,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3222 goto out_free; 3350 goto out_free;
3223 p->arg.lock_stateid = &lsp->ls_stateid; 3351 p->arg.lock_stateid = &lsp->ls_stateid;
3224 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 3352 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
3225 p->arg.lock_owner.id = lsp->ls_id; 3353 p->arg.lock_owner.id = lsp->ls_id.id;
3226 p->lsp = lsp; 3354 p->lsp = lsp;
3227 atomic_inc(&lsp->ls_count); 3355 atomic_inc(&lsp->ls_count);
3228 p->ctx = get_nfs_open_context(ctx); 3356 p->ctx = get_nfs_open_context(ctx);
@@ -3285,7 +3413,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
3285 memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, 3413 memcpy(data->lsp->ls_stateid.data, data->res.stateid.data,
3286 sizeof(data->lsp->ls_stateid.data)); 3414 sizeof(data->lsp->ls_stateid.data));
3287 data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; 3415 data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
3288 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); 3416 renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp);
3289 } 3417 }
3290 nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid); 3418 nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid);
3291out: 3419out:
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 8ed79d5c54..e9662ba81d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -38,12 +38,14 @@
38 * subsequent patch. 38 * subsequent patch.
39 */ 39 */
40 40
41#include <linux/kernel.h>
41#include <linux/slab.h> 42#include <linux/slab.h>
42#include <linux/smp_lock.h> 43#include <linux/smp_lock.h>
43#include <linux/nfs_fs.h> 44#include <linux/nfs_fs.h>
44#include <linux/nfs_idmap.h> 45#include <linux/nfs_idmap.h>
45#include <linux/kthread.h> 46#include <linux/kthread.h>
46#include <linux/module.h> 47#include <linux/module.h>
48#include <linux/random.h>
47#include <linux/workqueue.h> 49#include <linux/workqueue.h>
48#include <linux/bitops.h> 50#include <linux/bitops.h>
49 51
@@ -69,33 +71,14 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
69 return status; 71 return status;
70} 72}
71 73
72u32
73nfs4_alloc_lockowner_id(struct nfs_client *clp)
74{
75 return clp->cl_lockowner_id ++;
76}
77
78static struct nfs4_state_owner *
79nfs4_client_grab_unused(struct nfs_client *clp, struct rpc_cred *cred)
80{
81 struct nfs4_state_owner *sp = NULL;
82
83 if (!list_empty(&clp->cl_unused)) {
84 sp = list_entry(clp->cl_unused.next, struct nfs4_state_owner, so_list);
85 atomic_inc(&sp->so_count);
86 sp->so_cred = cred;
87 list_move(&sp->so_list, &clp->cl_state_owners);
88 clp->cl_nunused--;
89 }
90 return sp;
91}
92
93struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) 74struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
94{ 75{
95 struct nfs4_state_owner *sp; 76 struct nfs4_state_owner *sp;
77 struct rb_node *pos;
96 struct rpc_cred *cred = NULL; 78 struct rpc_cred *cred = NULL;
97 79
98 list_for_each_entry(sp, &clp->cl_state_owners, so_list) { 80 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
81 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
99 if (list_empty(&sp->so_states)) 82 if (list_empty(&sp->so_states))
100 continue; 83 continue;
101 cred = get_rpccred(sp->so_cred); 84 cred = get_rpccred(sp->so_cred);
@@ -107,32 +90,146 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
107static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 90static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
108{ 91{
109 struct nfs4_state_owner *sp; 92 struct nfs4_state_owner *sp;
93 struct rb_node *pos;
110 94
111 if (!list_empty(&clp->cl_state_owners)) { 95 pos = rb_first(&clp->cl_state_owners);
112 sp = list_entry(clp->cl_state_owners.next, 96 if (pos != NULL) {
113 struct nfs4_state_owner, so_list); 97 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
114 return get_rpccred(sp->so_cred); 98 return get_rpccred(sp->so_cred);
115 } 99 }
116 return NULL; 100 return NULL;
117} 101}
118 102
103static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new,
104 __u64 minval, int maxbits)
105{
106 struct rb_node **p, *parent;
107 struct nfs_unique_id *pos;
108 __u64 mask = ~0ULL;
109
110 if (maxbits < 64)
111 mask = (1ULL << maxbits) - 1ULL;
112
113 /* Ensure distribution is more or less flat */
114 get_random_bytes(&new->id, sizeof(new->id));
115 new->id &= mask;
116 if (new->id < minval)
117 new->id += minval;
118retry:
119 p = &root->rb_node;
120 parent = NULL;
121
122 while (*p != NULL) {
123 parent = *p;
124 pos = rb_entry(parent, struct nfs_unique_id, rb_node);
125
126 if (new->id < pos->id)
127 p = &(*p)->rb_left;
128 else if (new->id > pos->id)
129 p = &(*p)->rb_right;
130 else
131 goto id_exists;
132 }
133 rb_link_node(&new->rb_node, parent, p);
134 rb_insert_color(&new->rb_node, root);
135 return;
136id_exists:
137 for (;;) {
138 new->id++;
139 if (new->id < minval || (new->id & mask) != new->id) {
140 new->id = minval;
141 break;
142 }
143 parent = rb_next(parent);
144 if (parent == NULL)
145 break;
146 pos = rb_entry(parent, struct nfs_unique_id, rb_node);
147 if (new->id < pos->id)
148 break;
149 }
150 goto retry;
151}
152
153static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
154{
155 rb_erase(&id->rb_node, root);
156}
157
119static struct nfs4_state_owner * 158static struct nfs4_state_owner *
120nfs4_find_state_owner(struct nfs_client *clp, struct rpc_cred *cred) 159nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred)
121{ 160{
161 struct nfs_client *clp = server->nfs_client;
162 struct rb_node **p = &clp->cl_state_owners.rb_node,
163 *parent = NULL;
122 struct nfs4_state_owner *sp, *res = NULL; 164 struct nfs4_state_owner *sp, *res = NULL;
123 165
124 list_for_each_entry(sp, &clp->cl_state_owners, so_list) { 166 while (*p != NULL) {
125 if (sp->so_cred != cred) 167 parent = *p;
168 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
169
170 if (server < sp->so_server) {
171 p = &parent->rb_left;
126 continue; 172 continue;
127 atomic_inc(&sp->so_count); 173 }
128 /* Move to the head of the list */ 174 if (server > sp->so_server) {
129 list_move(&sp->so_list, &clp->cl_state_owners); 175 p = &parent->rb_right;
130 res = sp; 176 continue;
131 break; 177 }
178 if (cred < sp->so_cred)
179 p = &parent->rb_left;
180 else if (cred > sp->so_cred)
181 p = &parent->rb_right;
182 else {
183 atomic_inc(&sp->so_count);
184 res = sp;
185 break;
186 }
132 } 187 }
133 return res; 188 return res;
134} 189}
135 190
191static struct nfs4_state_owner *
192nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
193{
194 struct rb_node **p = &clp->cl_state_owners.rb_node,
195 *parent = NULL;
196 struct nfs4_state_owner *sp;
197
198 while (*p != NULL) {
199 parent = *p;
200 sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
201
202 if (new->so_server < sp->so_server) {
203 p = &parent->rb_left;
204 continue;
205 }
206 if (new->so_server > sp->so_server) {
207 p = &parent->rb_right;
208 continue;
209 }
210 if (new->so_cred < sp->so_cred)
211 p = &parent->rb_left;
212 else if (new->so_cred > sp->so_cred)
213 p = &parent->rb_right;
214 else {
215 atomic_inc(&sp->so_count);
216 return sp;
217 }
218 }
219 nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64);
220 rb_link_node(&new->so_client_node, parent, p);
221 rb_insert_color(&new->so_client_node, &clp->cl_state_owners);
222 return new;
223}
224
225static void
226nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp)
227{
228 if (!RB_EMPTY_NODE(&sp->so_client_node))
229 rb_erase(&sp->so_client_node, &clp->cl_state_owners);
230 nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id);
231}
232
136/* 233/*
137 * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to 234 * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to
138 * create a new state_owner. 235 * create a new state_owner.
@@ -160,10 +257,14 @@ nfs4_alloc_state_owner(void)
160void 257void
161nfs4_drop_state_owner(struct nfs4_state_owner *sp) 258nfs4_drop_state_owner(struct nfs4_state_owner *sp)
162{ 259{
163 struct nfs_client *clp = sp->so_client; 260 if (!RB_EMPTY_NODE(&sp->so_client_node)) {
164 spin_lock(&clp->cl_lock); 261 struct nfs_client *clp = sp->so_client;
165 list_del_init(&sp->so_list); 262
166 spin_unlock(&clp->cl_lock); 263 spin_lock(&clp->cl_lock);
264 rb_erase(&sp->so_client_node, &clp->cl_state_owners);
265 RB_CLEAR_NODE(&sp->so_client_node);
266 spin_unlock(&clp->cl_lock);
267 }
167} 268}
168 269
169/* 270/*
@@ -175,26 +276,25 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
175 struct nfs_client *clp = server->nfs_client; 276 struct nfs_client *clp = server->nfs_client;
176 struct nfs4_state_owner *sp, *new; 277 struct nfs4_state_owner *sp, *new;
177 278
178 get_rpccred(cred);
179 new = nfs4_alloc_state_owner();
180 spin_lock(&clp->cl_lock); 279 spin_lock(&clp->cl_lock);
181 sp = nfs4_find_state_owner(clp, cred); 280 sp = nfs4_find_state_owner(server, cred);
182 if (sp == NULL)
183 sp = nfs4_client_grab_unused(clp, cred);
184 if (sp == NULL && new != NULL) {
185 list_add(&new->so_list, &clp->cl_state_owners);
186 new->so_client = clp;
187 new->so_id = nfs4_alloc_lockowner_id(clp);
188 new->so_cred = cred;
189 sp = new;
190 new = NULL;
191 }
192 spin_unlock(&clp->cl_lock); 281 spin_unlock(&clp->cl_lock);
193 kfree(new);
194 if (sp != NULL) 282 if (sp != NULL)
195 return sp; 283 return sp;
196 put_rpccred(cred); 284 new = nfs4_alloc_state_owner();
197 return NULL; 285 if (new == NULL)
286 return NULL;
287 new->so_client = clp;
288 new->so_server = server;
289 new->so_cred = cred;
290 spin_lock(&clp->cl_lock);
291 sp = nfs4_insert_state_owner(clp, new);
292 spin_unlock(&clp->cl_lock);
293 if (sp == new)
294 get_rpccred(cred);
295 else
296 kfree(new);
297 return sp;
198} 298}
199 299
200/* 300/*
@@ -208,18 +308,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
208 308
209 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 309 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
210 return; 310 return;
211 if (clp->cl_nunused >= OPENOWNER_POOL_SIZE) 311 nfs4_remove_state_owner(clp, sp);
212 goto out_free;
213 if (list_empty(&sp->so_list))
214 goto out_free;
215 list_move(&sp->so_list, &clp->cl_unused);
216 clp->cl_nunused++;
217 spin_unlock(&clp->cl_lock);
218 put_rpccred(cred);
219 cred = NULL;
220 return;
221out_free:
222 list_del(&sp->so_list);
223 spin_unlock(&clp->cl_lock); 312 spin_unlock(&clp->cl_lock);
224 put_rpccred(cred); 313 put_rpccred(cred);
225 kfree(sp); 314 kfree(sp);
@@ -236,6 +325,7 @@ nfs4_alloc_open_state(void)
236 atomic_set(&state->count, 1); 325 atomic_set(&state->count, 1);
237 INIT_LIST_HEAD(&state->lock_states); 326 INIT_LIST_HEAD(&state->lock_states);
238 spin_lock_init(&state->state_lock); 327 spin_lock_init(&state->state_lock);
328 seqlock_init(&state->seqlock);
239 return state; 329 return state;
240} 330}
241 331
@@ -263,13 +353,10 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
263 struct nfs4_state *state; 353 struct nfs4_state *state;
264 354
265 list_for_each_entry(state, &nfsi->open_states, inode_states) { 355 list_for_each_entry(state, &nfsi->open_states, inode_states) {
266 /* Is this in the process of being freed? */ 356 if (state->owner != owner)
267 if (state->state == 0)
268 continue; 357 continue;
269 if (state->owner == owner) { 358 if (atomic_inc_not_zero(&state->count))
270 atomic_inc(&state->count);
271 return state; 359 return state;
272 }
273 } 360 }
274 return NULL; 361 return NULL;
275} 362}
@@ -341,16 +428,15 @@ void nfs4_put_open_state(struct nfs4_state *state)
341/* 428/*
342 * Close the current file. 429 * Close the current file.
343 */ 430 */
344void nfs4_close_state(struct nfs4_state *state, mode_t mode) 431void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode)
345{ 432{
346 struct inode *inode = state->inode;
347 struct nfs4_state_owner *owner = state->owner; 433 struct nfs4_state_owner *owner = state->owner;
348 int oldstate, newstate = 0; 434 int call_close = 0;
435 int newstate;
349 436
350 atomic_inc(&owner->so_count); 437 atomic_inc(&owner->so_count);
351 /* Protect against nfs4_find_state() */ 438 /* Protect against nfs4_find_state() */
352 spin_lock(&owner->so_lock); 439 spin_lock(&owner->so_lock);
353 spin_lock(&inode->i_lock);
354 switch (mode & (FMODE_READ | FMODE_WRITE)) { 440 switch (mode & (FMODE_READ | FMODE_WRITE)) {
355 case FMODE_READ: 441 case FMODE_READ:
356 state->n_rdonly--; 442 state->n_rdonly--;
@@ -361,24 +447,29 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode)
361 case FMODE_READ|FMODE_WRITE: 447 case FMODE_READ|FMODE_WRITE:
362 state->n_rdwr--; 448 state->n_rdwr--;
363 } 449 }
364 oldstate = newstate = state->state; 450 newstate = FMODE_READ|FMODE_WRITE;
365 if (state->n_rdwr == 0) { 451 if (state->n_rdwr == 0) {
366 if (state->n_rdonly == 0) 452 if (state->n_rdonly == 0) {
367 newstate &= ~FMODE_READ; 453 newstate &= ~FMODE_READ;
368 if (state->n_wronly == 0) 454 call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
455 call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
456 }
457 if (state->n_wronly == 0) {
369 newstate &= ~FMODE_WRITE; 458 newstate &= ~FMODE_WRITE;
459 call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
460 call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
461 }
462 if (newstate == 0)
463 clear_bit(NFS_DELEGATED_STATE, &state->flags);
370 } 464 }
371 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { 465 nfs4_state_set_mode_locked(state, newstate);
372 nfs4_state_set_mode_locked(state, newstate);
373 oldstate = newstate;
374 }
375 spin_unlock(&inode->i_lock);
376 spin_unlock(&owner->so_lock); 466 spin_unlock(&owner->so_lock);
377 467
378 if (oldstate != newstate && nfs4_do_close(inode, state) == 0) 468 if (!call_close) {
379 return; 469 nfs4_put_open_state(state);
380 nfs4_put_open_state(state); 470 nfs4_put_state_owner(owner);
381 nfs4_put_state_owner(owner); 471 } else
472 nfs4_do_close(path, state);
382} 473}
383 474
384/* 475/*
@@ -415,12 +506,22 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
415 atomic_set(&lsp->ls_count, 1); 506 atomic_set(&lsp->ls_count, 1);
416 lsp->ls_owner = fl_owner; 507 lsp->ls_owner = fl_owner;
417 spin_lock(&clp->cl_lock); 508 spin_lock(&clp->cl_lock);
418 lsp->ls_id = nfs4_alloc_lockowner_id(clp); 509 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
419 spin_unlock(&clp->cl_lock); 510 spin_unlock(&clp->cl_lock);
420 INIT_LIST_HEAD(&lsp->ls_locks); 511 INIT_LIST_HEAD(&lsp->ls_locks);
421 return lsp; 512 return lsp;
422} 513}
423 514
515static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
516{
517 struct nfs_client *clp = lsp->ls_state->owner->so_client;
518
519 spin_lock(&clp->cl_lock);
520 nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
521 spin_unlock(&clp->cl_lock);
522 kfree(lsp);
523}
524
424/* 525/*
425 * Return a compatible lock_state. If no initialized lock_state structure 526 * Return a compatible lock_state. If no initialized lock_state structure
426 * exists, return an uninitialized one. 527 * exists, return an uninitialized one.
@@ -450,7 +551,8 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
450 return NULL; 551 return NULL;
451 } 552 }
452 spin_unlock(&state->state_lock); 553 spin_unlock(&state->state_lock);
453 kfree(new); 554 if (new != NULL)
555 nfs4_free_lock_state(new);
454 return lsp; 556 return lsp;
455} 557}
456 558
@@ -471,7 +573,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
471 if (list_empty(&state->lock_states)) 573 if (list_empty(&state->lock_states))
472 clear_bit(LK_STATE_IN_USE, &state->flags); 574 clear_bit(LK_STATE_IN_USE, &state->flags);
473 spin_unlock(&state->state_lock); 575 spin_unlock(&state->state_lock);
474 kfree(lsp); 576 nfs4_free_lock_state(lsp);
475} 577}
476 578
477static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 579static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -513,8 +615,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
513void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) 615void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
514{ 616{
515 struct nfs4_lock_state *lsp; 617 struct nfs4_lock_state *lsp;
618 int seq;
516 619
517 memcpy(dst, &state->stateid, sizeof(*dst)); 620 do {
621 seq = read_seqbegin(&state->seqlock);
622 memcpy(dst, &state->stateid, sizeof(*dst));
623 } while (read_seqretry(&state->seqlock, seq));
518 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) 624 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
519 return; 625 return;
520 626
@@ -557,12 +663,18 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
557 * failed with a seqid incrementing error - 663 * failed with a seqid incrementing error -
558 * see comments nfs_fs.h:seqid_mutating_error() 664 * see comments nfs_fs.h:seqid_mutating_error()
559 */ 665 */
560static inline void nfs_increment_seqid(int status, struct nfs_seqid *seqid) 666static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
561{ 667{
562 switch (status) { 668 switch (status) {
563 case 0: 669 case 0:
564 break; 670 break;
565 case -NFS4ERR_BAD_SEQID: 671 case -NFS4ERR_BAD_SEQID:
672 if (seqid->sequence->flags & NFS_SEQID_CONFIRMED)
673 return;
674 printk(KERN_WARNING "NFS: v4 server returned a bad"
675 "sequence-id error on an"
676 "unconfirmed sequence %p!\n",
677 seqid->sequence);
566 case -NFS4ERR_STALE_CLIENTID: 678 case -NFS4ERR_STALE_CLIENTID:
567 case -NFS4ERR_STALE_STATEID: 679 case -NFS4ERR_STALE_STATEID:
568 case -NFS4ERR_BAD_STATEID: 680 case -NFS4ERR_BAD_STATEID:
@@ -586,7 +698,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
586 struct nfs4_state_owner, so_seqid); 698 struct nfs4_state_owner, so_seqid);
587 nfs4_drop_state_owner(sp); 699 nfs4_drop_state_owner(sp);
588 } 700 }
589 return nfs_increment_seqid(status, seqid); 701 nfs_increment_seqid(status, seqid);
590} 702}
591 703
592/* 704/*
@@ -596,7 +708,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
596 */ 708 */
597void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) 709void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
598{ 710{
599 return nfs_increment_seqid(status, seqid); 711 nfs_increment_seqid(status, seqid);
600} 712}
601 713
602int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) 714int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
@@ -748,15 +860,21 @@ out_err:
748static void nfs4_state_mark_reclaim(struct nfs_client *clp) 860static void nfs4_state_mark_reclaim(struct nfs_client *clp)
749{ 861{
750 struct nfs4_state_owner *sp; 862 struct nfs4_state_owner *sp;
863 struct rb_node *pos;
751 struct nfs4_state *state; 864 struct nfs4_state *state;
752 struct nfs4_lock_state *lock; 865 struct nfs4_lock_state *lock;
753 866
754 /* Reset all sequence ids to zero */ 867 /* Reset all sequence ids to zero */
755 list_for_each_entry(sp, &clp->cl_state_owners, so_list) { 868 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
869 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
756 sp->so_seqid.counter = 0; 870 sp->so_seqid.counter = 0;
757 sp->so_seqid.flags = 0; 871 sp->so_seqid.flags = 0;
758 spin_lock(&sp->so_lock); 872 spin_lock(&sp->so_lock);
759 list_for_each_entry(state, &sp->so_states, open_states) { 873 list_for_each_entry(state, &sp->so_states, open_states) {
874 clear_bit(NFS_DELEGATED_STATE, &state->flags);
875 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
876 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
877 clear_bit(NFS_O_RDWR_STATE, &state->flags);
760 list_for_each_entry(lock, &state->lock_states, ls_locks) { 878 list_for_each_entry(lock, &state->lock_states, ls_locks) {
761 lock->ls_seqid.counter = 0; 879 lock->ls_seqid.counter = 0;
762 lock->ls_seqid.flags = 0; 880 lock->ls_seqid.flags = 0;
@@ -771,6 +889,7 @@ static int reclaimer(void *ptr)
771{ 889{
772 struct nfs_client *clp = ptr; 890 struct nfs_client *clp = ptr;
773 struct nfs4_state_owner *sp; 891 struct nfs4_state_owner *sp;
892 struct rb_node *pos;
774 struct nfs4_state_recovery_ops *ops; 893 struct nfs4_state_recovery_ops *ops;
775 struct rpc_cred *cred; 894 struct rpc_cred *cred;
776 int status = 0; 895 int status = 0;
@@ -816,7 +935,8 @@ restart_loop:
816 /* Mark all delegations for reclaim */ 935 /* Mark all delegations for reclaim */
817 nfs_delegation_mark_reclaim(clp); 936 nfs_delegation_mark_reclaim(clp);
818 /* Note: list is protected by exclusive lock on cl->cl_sem */ 937 /* Note: list is protected by exclusive lock on cl->cl_sem */
819 list_for_each_entry(sp, &clp->cl_state_owners, so_list) { 938 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
939 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
820 status = nfs4_reclaim_open_state(ops, sp); 940 status = nfs4_reclaim_open_state(ops, sp);
821 if (status < 0) { 941 if (status < 0) {
822 if (status == -NFS4ERR_NO_GRACE) { 942 if (status == -NFS4ERR_NO_GRACE) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8003c91ccb..c08738441f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -68,9 +68,10 @@ static int nfs4_stat_to_errno(int);
68#endif 68#endif
69 69
70/* lock,open owner id: 70/* lock,open owner id:
71 * we currently use size 1 (u32) out of (NFS4_OPAQUE_LIMIT >> 2) 71 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
72 */ 72 */
73#define owner_id_maxsz (1 + 1) 73#define open_owner_id_maxsz (1 + 4)
74#define lock_owner_id_maxsz (1 + 4)
74#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 75#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
75#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) 76#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
76#define op_encode_hdr_maxsz (1) 77#define op_encode_hdr_maxsz (1)
@@ -87,9 +88,11 @@ static int nfs4_stat_to_errno(int);
87#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) 88#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
88#define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) 89#define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
89#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) 90#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
91#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
92#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
90/* This is based on getfattr, which uses the most attributes: */ 93/* This is based on getfattr, which uses the most attributes: */
91#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ 94#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
92 3 + 3 + 3 + 2 * nfs4_name_maxsz)) 95 3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz))
93#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ 96#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
94 nfs4_fattr_value_maxsz) 97 nfs4_fattr_value_maxsz)
95#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) 98#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -116,8 +119,27 @@ static int nfs4_stat_to_errno(int);
116 3 + (NFS4_VERIFIER_SIZE >> 2)) 119 3 + (NFS4_VERIFIER_SIZE >> 2))
117#define decode_setclientid_confirm_maxsz \ 120#define decode_setclientid_confirm_maxsz \
118 (op_decode_hdr_maxsz) 121 (op_decode_hdr_maxsz)
119#define encode_lookup_maxsz (op_encode_hdr_maxsz + \ 122#define encode_lookup_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
120 1 + ((3 + NFS4_FHSIZE) >> 2)) 123#define decode_lookup_maxsz (op_decode_hdr_maxsz)
124#define encode_share_access_maxsz \
125 (2)
126#define encode_createmode_maxsz (1 + nfs4_fattr_maxsz)
127#define encode_opentype_maxsz (1 + encode_createmode_maxsz)
128#define encode_claim_null_maxsz (1 + nfs4_name_maxsz)
129#define encode_open_maxsz (op_encode_hdr_maxsz + \
130 2 + encode_share_access_maxsz + 2 + \
131 open_owner_id_maxsz + \
132 encode_opentype_maxsz + \
133 encode_claim_null_maxsz)
134#define decode_ace_maxsz (3 + nfs4_owner_maxsz)
135#define decode_delegation_maxsz (1 + XDR_QUADLEN(NFS4_STATEID_SIZE) + 1 + \
136 decode_ace_maxsz)
137#define decode_change_info_maxsz (5)
138#define decode_open_maxsz (op_decode_hdr_maxsz + \
139 XDR_QUADLEN(NFS4_STATEID_SIZE) + \
140 decode_change_info_maxsz + 1 + \
141 nfs4_fattr_bitmap_maxsz + \
142 decode_delegation_maxsz)
121#define encode_remove_maxsz (op_encode_hdr_maxsz + \ 143#define encode_remove_maxsz (op_encode_hdr_maxsz + \
122 nfs4_name_maxsz) 144 nfs4_name_maxsz)
123#define encode_rename_maxsz (op_encode_hdr_maxsz + \ 145#define encode_rename_maxsz (op_encode_hdr_maxsz + \
@@ -134,9 +156,15 @@ static int nfs4_stat_to_errno(int);
134#define encode_create_maxsz (op_encode_hdr_maxsz + \ 156#define encode_create_maxsz (op_encode_hdr_maxsz + \
135 2 + nfs4_name_maxsz + \ 157 2 + nfs4_name_maxsz + \
136 nfs4_fattr_maxsz) 158 nfs4_fattr_maxsz)
137#define decode_create_maxsz (op_decode_hdr_maxsz + 8) 159#define decode_create_maxsz (op_decode_hdr_maxsz + \
160 decode_change_info_maxsz + \
161 nfs4_fattr_bitmap_maxsz)
138#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) 162#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4)
139#define decode_delegreturn_maxsz (op_decode_hdr_maxsz) 163#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
164#define encode_fs_locations_maxsz \
165 (encode_getattr_maxsz)
166#define decode_fs_locations_maxsz \
167 (0)
140#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ 168#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */
141#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */ 169#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */
142#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ 170#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \
@@ -174,16 +202,21 @@ static int nfs4_stat_to_errno(int);
174 op_decode_hdr_maxsz + 2 + \ 202 op_decode_hdr_maxsz + 2 + \
175 decode_getattr_maxsz) 203 decode_getattr_maxsz)
176#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ 204#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \
177 encode_putfh_maxsz + \ 205 encode_putfh_maxsz + \
178 op_encode_hdr_maxsz + \ 206 encode_savefh_maxsz + \
179 13 + 3 + 2 + 64 + \ 207 encode_open_maxsz + \
180 encode_getattr_maxsz + \ 208 encode_getfh_maxsz + \
181 encode_getfh_maxsz) 209 encode_getattr_maxsz + \
210 encode_restorefh_maxsz + \
211 encode_getattr_maxsz)
182#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ 212#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \
183 decode_putfh_maxsz + \ 213 decode_putfh_maxsz + \
184 op_decode_hdr_maxsz + 4 + 5 + 2 + 3 + \ 214 decode_savefh_maxsz + \
185 decode_getattr_maxsz + \ 215 decode_open_maxsz + \
186 decode_getfh_maxsz) 216 decode_getfh_maxsz + \
217 decode_getattr_maxsz + \
218 decode_restorefh_maxsz + \
219 decode_getattr_maxsz)
187#define NFS4_enc_open_confirm_sz \ 220#define NFS4_enc_open_confirm_sz \
188 (compound_encode_hdr_maxsz + \ 221 (compound_encode_hdr_maxsz + \
189 encode_putfh_maxsz + \ 222 encode_putfh_maxsz + \
@@ -193,12 +226,12 @@ static int nfs4_stat_to_errno(int);
193 op_decode_hdr_maxsz + 4) 226 op_decode_hdr_maxsz + 4)
194#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \ 227#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \
195 encode_putfh_maxsz + \ 228 encode_putfh_maxsz + \
196 op_encode_hdr_maxsz + \ 229 encode_open_maxsz + \
197 11) 230 encode_getattr_maxsz)
198#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ 231#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
199 decode_putfh_maxsz + \ 232 decode_putfh_maxsz + \
200 op_decode_hdr_maxsz + \ 233 decode_open_maxsz + \
201 4 + 5 + 2 + 3) 234 decode_getattr_maxsz)
202#define NFS4_enc_open_downgrade_sz \ 235#define NFS4_enc_open_downgrade_sz \
203 (compound_encode_hdr_maxsz + \ 236 (compound_encode_hdr_maxsz + \
204 encode_putfh_maxsz + \ 237 encode_putfh_maxsz + \
@@ -256,19 +289,19 @@ static int nfs4_stat_to_errno(int);
256 op_encode_hdr_maxsz + \ 289 op_encode_hdr_maxsz + \
257 1 + 1 + 2 + 2 + \ 290 1 + 1 + 2 + 2 + \
258 1 + 4 + 1 + 2 + \ 291 1 + 4 + 1 + 2 + \
259 owner_id_maxsz) 292 lock_owner_id_maxsz)
260#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \ 293#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \
261 decode_putfh_maxsz + \ 294 decode_putfh_maxsz + \
262 decode_getattr_maxsz + \ 295 decode_getattr_maxsz + \
263 op_decode_hdr_maxsz + \ 296 op_decode_hdr_maxsz + \
264 2 + 2 + 1 + 2 + \ 297 2 + 2 + 1 + 2 + \
265 owner_id_maxsz) 298 lock_owner_id_maxsz)
266#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \ 299#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \
267 encode_putfh_maxsz + \ 300 encode_putfh_maxsz + \
268 encode_getattr_maxsz + \ 301 encode_getattr_maxsz + \
269 op_encode_hdr_maxsz + \ 302 op_encode_hdr_maxsz + \
270 1 + 2 + 2 + 2 + \ 303 1 + 2 + 2 + 2 + \
271 owner_id_maxsz) 304 lock_owner_id_maxsz)
272#define NFS4_dec_lockt_sz (NFS4_dec_lock_sz) 305#define NFS4_dec_lockt_sz (NFS4_dec_lock_sz)
273#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \ 306#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \
274 encode_putfh_maxsz + \ 307 encode_putfh_maxsz + \
@@ -298,7 +331,7 @@ static int nfs4_stat_to_errno(int);
298 encode_getfh_maxsz) 331 encode_getfh_maxsz)
299#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \ 332#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \
300 decode_putfh_maxsz + \ 333 decode_putfh_maxsz + \
301 op_decode_hdr_maxsz + \ 334 decode_lookup_maxsz + \
302 decode_getattr_maxsz + \ 335 decode_getattr_maxsz + \
303 decode_getfh_maxsz) 336 decode_getfh_maxsz)
304#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \ 337#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \
@@ -417,12 +450,13 @@ static int nfs4_stat_to_errno(int);
417#define NFS4_enc_fs_locations_sz \ 450#define NFS4_enc_fs_locations_sz \
418 (compound_encode_hdr_maxsz + \ 451 (compound_encode_hdr_maxsz + \
419 encode_putfh_maxsz + \ 452 encode_putfh_maxsz + \
420 encode_getattr_maxsz) 453 encode_lookup_maxsz + \
454 encode_fs_locations_maxsz)
421#define NFS4_dec_fs_locations_sz \ 455#define NFS4_dec_fs_locations_sz \
422 (compound_decode_hdr_maxsz + \ 456 (compound_decode_hdr_maxsz + \
423 decode_putfh_maxsz + \ 457 decode_putfh_maxsz + \
424 op_decode_hdr_maxsz + \ 458 decode_lookup_maxsz + \
425 nfs4_fattr_bitmap_maxsz) 459 decode_fs_locations_maxsz)
426 460
427static struct { 461static struct {
428 unsigned int mode; 462 unsigned int mode;
@@ -793,13 +827,14 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
793 WRITE64(nfs4_lock_length(args->fl)); 827 WRITE64(nfs4_lock_length(args->fl));
794 WRITE32(args->new_lock_owner); 828 WRITE32(args->new_lock_owner);
795 if (args->new_lock_owner){ 829 if (args->new_lock_owner){
796 RESERVE_SPACE(4+NFS4_STATEID_SIZE+20); 830 RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
797 WRITE32(args->open_seqid->sequence->counter); 831 WRITE32(args->open_seqid->sequence->counter);
798 WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); 832 WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
799 WRITE32(args->lock_seqid->sequence->counter); 833 WRITE32(args->lock_seqid->sequence->counter);
800 WRITE64(args->lock_owner.clientid); 834 WRITE64(args->lock_owner.clientid);
801 WRITE32(4); 835 WRITE32(16);
802 WRITE32(args->lock_owner.id); 836 WRITEMEM("lock id:", 8);
837 WRITE64(args->lock_owner.id);
803 } 838 }
804 else { 839 else {
805 RESERVE_SPACE(NFS4_STATEID_SIZE+4); 840 RESERVE_SPACE(NFS4_STATEID_SIZE+4);
@@ -814,14 +849,15 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
814{ 849{
815 __be32 *p; 850 __be32 *p;
816 851
817 RESERVE_SPACE(40); 852 RESERVE_SPACE(52);
818 WRITE32(OP_LOCKT); 853 WRITE32(OP_LOCKT);
819 WRITE32(nfs4_lock_type(args->fl, 0)); 854 WRITE32(nfs4_lock_type(args->fl, 0));
820 WRITE64(args->fl->fl_start); 855 WRITE64(args->fl->fl_start);
821 WRITE64(nfs4_lock_length(args->fl)); 856 WRITE64(nfs4_lock_length(args->fl));
822 WRITE64(args->lock_owner.clientid); 857 WRITE64(args->lock_owner.clientid);
823 WRITE32(4); 858 WRITE32(16);
824 WRITE32(args->lock_owner.id); 859 WRITEMEM("lock id:", 8);
860 WRITE64(args->lock_owner.id);
825 861
826 return 0; 862 return 0;
827} 863}
@@ -886,10 +922,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
886 WRITE32(OP_OPEN); 922 WRITE32(OP_OPEN);
887 WRITE32(arg->seqid->sequence->counter); 923 WRITE32(arg->seqid->sequence->counter);
888 encode_share_access(xdr, arg->open_flags); 924 encode_share_access(xdr, arg->open_flags);
889 RESERVE_SPACE(16); 925 RESERVE_SPACE(28);
890 WRITE64(arg->clientid); 926 WRITE64(arg->clientid);
891 WRITE32(4); 927 WRITE32(16);
892 WRITE32(arg->id); 928 WRITEMEM("open id:", 8);
929 WRITE64(arg->id);
893} 930}
894 931
895static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) 932static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1071,7 +1108,7 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
1071 1108
1072static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req) 1109static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req)
1073{ 1110{
1074 struct rpc_auth *auth = req->rq_task->tk_auth; 1111 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1075 uint32_t attrs[2] = { 1112 uint32_t attrs[2] = {
1076 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, 1113 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
1077 FATTR4_WORD1_MOUNTED_ON_FILEID, 1114 FATTR4_WORD1_MOUNTED_ON_FILEID,
@@ -1117,7 +1154,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1117 1154
1118static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req) 1155static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req)
1119{ 1156{
1120 struct rpc_auth *auth = req->rq_task->tk_auth; 1157 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1121 unsigned int replen; 1158 unsigned int replen;
1122 __be32 *p; 1159 __be32 *p;
1123 1160
@@ -1735,7 +1772,7 @@ out:
1735 */ 1772 */
1736static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 1773static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
1737{ 1774{
1738 struct rpc_auth *auth = req->rq_task->tk_auth; 1775 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1739 struct xdr_stream xdr; 1776 struct xdr_stream xdr;
1740 struct compound_hdr hdr = { 1777 struct compound_hdr hdr = {
1741 .nops = 2, 1778 .nops = 2,
@@ -1795,7 +1832,7 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
1795 struct nfs_getaclargs *args) 1832 struct nfs_getaclargs *args)
1796{ 1833{
1797 struct xdr_stream xdr; 1834 struct xdr_stream xdr;
1798 struct rpc_auth *auth = req->rq_task->tk_auth; 1835 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1799 struct compound_hdr hdr = { 1836 struct compound_hdr hdr = {
1800 .nops = 2, 1837 .nops = 2,
1801 }; 1838 };
@@ -2030,7 +2067,7 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
2030 struct compound_hdr hdr = { 2067 struct compound_hdr hdr = {
2031 .nops = 3, 2068 .nops = 3,
2032 }; 2069 };
2033 struct rpc_auth *auth = req->rq_task->tk_auth; 2070 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
2034 int replen; 2071 int replen;
2035 int status; 2072 int status;
2036 2073
@@ -3269,7 +3306,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3269static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 3306static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3270{ 3307{
3271 __be32 *p; 3308 __be32 *p;
3272 uint32_t bmlen; 3309 uint32_t savewords, bmlen, i;
3273 int status; 3310 int status;
3274 3311
3275 status = decode_op_hdr(xdr, OP_OPEN); 3312 status = decode_op_hdr(xdr, OP_OPEN);
@@ -3287,7 +3324,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3287 goto xdr_error; 3324 goto xdr_error;
3288 3325
3289 READ_BUF(bmlen << 2); 3326 READ_BUF(bmlen << 2);
3290 p += bmlen; 3327 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
3328 for (i = 0; i < savewords; ++i)
3329 READ32(res->attrset[i]);
3330 for (; i < NFS4_BITMAP_SIZE; i++)
3331 res->attrset[i] = 0;
3332
3291 return decode_delegation(xdr, res); 3333 return decode_delegation(xdr, res);
3292xdr_error: 3334xdr_error:
3293 dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen); 3335 dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 49d1008ce1..3490322d11 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -428,7 +428,7 @@ static int __init root_nfs_getport(int program, int version, int proto)
428 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n", 428 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n",
429 program, version, NIPQUAD(servaddr)); 429 program, version, NIPQUAD(servaddr));
430 set_sockaddr(&sin, servaddr, 0); 430 set_sockaddr(&sin, servaddr, 0);
431 return rpcb_getport_external(&sin, program, version, proto); 431 return rpcb_getport_sync(&sin, program, version, proto);
432} 432}
433 433
434 434
@@ -496,7 +496,8 @@ static int __init root_nfs_get_handle(void)
496 NFS_MNT3_VERSION : NFS_MNT_VERSION; 496 NFS_MNT3_VERSION : NFS_MNT_VERSION;
497 497
498 set_sockaddr(&sin, servaddr, htons(mount_port)); 498 set_sockaddr(&sin, servaddr, htons(mount_port));
499 status = nfsroot_mount(&sin, nfs_path, &fh, version, protocol); 499 status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL,
500 nfs_path, version, protocol, &fh);
500 if (status < 0) 501 if (status < 0)
501 printk(KERN_ERR "Root-NFS: Server returned error %d " 502 printk(KERN_ERR "Root-NFS: Server returned error %d "
502 "while mounting %s\n", status, nfs_path); 503 "while mounting %s\n", status, nfs_path);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c5bb51a29e..f56dae5216 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -85,9 +85,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
85 req->wb_offset = offset; 85 req->wb_offset = offset;
86 req->wb_pgbase = offset; 86 req->wb_pgbase = offset;
87 req->wb_bytes = count; 87 req->wb_bytes = count;
88 atomic_set(&req->wb_count, 1);
89 req->wb_context = get_nfs_open_context(ctx); 88 req->wb_context = get_nfs_open_context(ctx);
90 89 kref_init(&req->wb_kref);
91 return req; 90 return req;
92} 91}
93 92
@@ -109,30 +108,31 @@ void nfs_unlock_request(struct nfs_page *req)
109} 108}
110 109
111/** 110/**
112 * nfs_set_page_writeback_locked - Lock a request for writeback 111 * nfs_set_page_tag_locked - Tag a request as locked
113 * @req: 112 * @req:
114 */ 113 */
115int nfs_set_page_writeback_locked(struct nfs_page *req) 114static int nfs_set_page_tag_locked(struct nfs_page *req)
116{ 115{
117 struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); 116 struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
118 117
119 if (!nfs_lock_request(req)) 118 if (!nfs_lock_request(req))
120 return 0; 119 return 0;
121 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); 120 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
122 return 1; 121 return 1;
123} 122}
124 123
125/** 124/**
126 * nfs_clear_page_writeback - Unlock request and wake up sleepers 125 * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers
127 */ 126 */
128void nfs_clear_page_writeback(struct nfs_page *req) 127void nfs_clear_page_tag_locked(struct nfs_page *req)
129{ 128{
130 struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); 129 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode);
131 131
132 if (req->wb_page != NULL) { 132 if (req->wb_page != NULL) {
133 spin_lock(&nfsi->req_lock); 133 spin_lock(&inode->i_lock);
134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); 134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
135 spin_unlock(&nfsi->req_lock); 135 spin_unlock(&inode->i_lock);
136 } 136 }
137 nfs_unlock_request(req); 137 nfs_unlock_request(req);
138} 138}
@@ -160,11 +160,9 @@ void nfs_clear_request(struct nfs_page *req)
160 * 160 *
161 * Note: Should never be called with the spinlock held! 161 * Note: Should never be called with the spinlock held!
162 */ 162 */
163void 163static void nfs_free_request(struct kref *kref)
164nfs_release_request(struct nfs_page *req)
165{ 164{
166 if (!atomic_dec_and_test(&req->wb_count)) 165 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
167 return;
168 166
169 /* Release struct file or cached credential */ 167 /* Release struct file or cached credential */
170 nfs_clear_request(req); 168 nfs_clear_request(req);
@@ -172,6 +170,11 @@ nfs_release_request(struct nfs_page *req)
172 nfs_page_free(req); 170 nfs_page_free(req);
173} 171}
174 172
173void nfs_release_request(struct nfs_page *req)
174{
175 kref_put(&req->wb_kref, nfs_free_request);
176}
177
175static int nfs_wait_bit_interruptible(void *word) 178static int nfs_wait_bit_interruptible(void *word)
176{ 179{
177 int ret = 0; 180 int ret = 0;
@@ -193,7 +196,7 @@ static int nfs_wait_bit_interruptible(void *word)
193int 196int
194nfs_wait_on_request(struct nfs_page *req) 197nfs_wait_on_request(struct nfs_page *req)
195{ 198{
196 struct rpc_clnt *clnt = NFS_CLIENT(req->wb_context->dentry->d_inode); 199 struct rpc_clnt *clnt = NFS_CLIENT(req->wb_context->path.dentry->d_inode);
197 sigset_t oldmask; 200 sigset_t oldmask;
198 int ret = 0; 201 int ret = 0;
199 202
@@ -379,20 +382,20 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
379/** 382/**
380 * nfs_scan_list - Scan a list for matching requests 383 * nfs_scan_list - Scan a list for matching requests
381 * @nfsi: NFS inode 384 * @nfsi: NFS inode
382 * @head: One of the NFS inode request lists
383 * @dst: Destination list 385 * @dst: Destination list
384 * @idx_start: lower bound of page->index to scan 386 * @idx_start: lower bound of page->index to scan
385 * @npages: idx_start + npages sets the upper bound to scan. 387 * @npages: idx_start + npages sets the upper bound to scan.
388 * @tag: tag to scan for
386 * 389 *
387 * Moves elements from one of the inode request lists. 390 * Moves elements from one of the inode request lists.
388 * If the number of requests is set to 0, the entire address_space 391 * If the number of requests is set to 0, the entire address_space
389 * starting at index idx_start, is scanned. 392 * starting at index idx_start, is scanned.
390 * The requests are *not* checked to ensure that they form a contiguous set. 393 * The requests are *not* checked to ensure that they form a contiguous set.
391 * You must be holding the inode's req_lock when calling this function 394 * You must be holding the inode's i_lock when calling this function
392 */ 395 */
393int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, 396int nfs_scan_list(struct nfs_inode *nfsi,
394 struct list_head *dst, pgoff_t idx_start, 397 struct list_head *dst, pgoff_t idx_start,
395 unsigned int npages) 398 unsigned int npages, int tag)
396{ 399{
397 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; 400 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
398 struct nfs_page *req; 401 struct nfs_page *req;
@@ -407,9 +410,9 @@ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
407 idx_end = idx_start + npages - 1; 410 idx_end = idx_start + npages - 1;
408 411
409 for (;;) { 412 for (;;) {
410 found = radix_tree_gang_lookup(&nfsi->nfs_page_tree, 413 found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
411 (void **)&pgvec[0], idx_start, 414 (void **)&pgvec[0], idx_start,
412 NFS_SCAN_MAXENTRIES); 415 NFS_SCAN_MAXENTRIES, tag);
413 if (found <= 0) 416 if (found <= 0)
414 break; 417 break;
415 for (i = 0; i < found; i++) { 418 for (i = 0; i < found; i++) {
@@ -417,15 +420,18 @@ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
417 if (req->wb_index > idx_end) 420 if (req->wb_index > idx_end)
418 goto out; 421 goto out;
419 idx_start = req->wb_index + 1; 422 idx_start = req->wb_index + 1;
420 if (req->wb_list_head != head) 423 if (nfs_set_page_tag_locked(req)) {
421 continue;
422 if (nfs_set_page_writeback_locked(req)) {
423 nfs_list_remove_request(req); 424 nfs_list_remove_request(req);
425 radix_tree_tag_clear(&nfsi->nfs_page_tree,
426 req->wb_index, tag);
424 nfs_list_add_request(req, dst); 427 nfs_list_add_request(req, dst);
425 res++; 428 res++;
429 if (res == INT_MAX)
430 goto out;
426 } 431 }
427 } 432 }
428 433 /* for latency reduction */
434 cond_resched_lock(&nfsi->vfs_inode.i_lock);
429 } 435 }
430out: 436out:
431 return res; 437 return res;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 7bd7cb95c0..6ae2e58ed0 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -145,8 +145,8 @@ static void nfs_readpage_release(struct nfs_page *req)
145 unlock_page(req->wb_page); 145 unlock_page(req->wb_page);
146 146
147 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", 147 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
148 req->wb_context->dentry->d_inode->i_sb->s_id, 148 req->wb_context->path.dentry->d_inode->i_sb->s_id,
149 (long long)NFS_FILEID(req->wb_context->dentry->d_inode), 149 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
150 req->wb_bytes, 150 req->wb_bytes,
151 (long long)req_offset(req)); 151 (long long)req_offset(req));
152 nfs_clear_request(req); 152 nfs_clear_request(req);
@@ -164,7 +164,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
164 int flags; 164 int flags;
165 165
166 data->req = req; 166 data->req = req;
167 data->inode = inode = req->wb_context->dentry->d_inode; 167 data->inode = inode = req->wb_context->path.dentry->d_inode;
168 data->cred = req->wb_context->cred; 168 data->cred = req->wb_context->cred;
169 169
170 data->args.fh = NFS_FH(inode); 170 data->args.fh = NFS_FH(inode);
@@ -483,17 +483,19 @@ int nfs_readpage(struct file *file, struct page *page)
483 */ 483 */
484 error = nfs_wb_page(inode, page); 484 error = nfs_wb_page(inode, page);
485 if (error) 485 if (error)
486 goto out_error; 486 goto out_unlock;
487 if (PageUptodate(page))
488 goto out_unlock;
487 489
488 error = -ESTALE; 490 error = -ESTALE;
489 if (NFS_STALE(inode)) 491 if (NFS_STALE(inode))
490 goto out_error; 492 goto out_unlock;
491 493
492 if (file == NULL) { 494 if (file == NULL) {
493 error = -EBADF; 495 error = -EBADF;
494 ctx = nfs_find_open_context(inode, NULL, FMODE_READ); 496 ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
495 if (ctx == NULL) 497 if (ctx == NULL)
496 goto out_error; 498 goto out_unlock;
497 } else 499 } else
498 ctx = get_nfs_open_context((struct nfs_open_context *) 500 ctx = get_nfs_open_context((struct nfs_open_context *)
499 file->private_data); 501 file->private_data);
@@ -502,8 +504,7 @@ int nfs_readpage(struct file *file, struct page *page)
502 504
503 put_nfs_open_context(ctx); 505 put_nfs_open_context(ctx);
504 return error; 506 return error;
505 507out_unlock:
506out_error:
507 unlock_page(page); 508 unlock_page(page);
508 return error; 509 return error;
509} 510}
@@ -520,21 +521,32 @@ readpage_async_filler(void *data, struct page *page)
520 struct inode *inode = page->mapping->host; 521 struct inode *inode = page->mapping->host;
521 struct nfs_page *new; 522 struct nfs_page *new;
522 unsigned int len; 523 unsigned int len;
524 int error;
525
526 error = nfs_wb_page(inode, page);
527 if (error)
528 goto out_unlock;
529 if (PageUptodate(page))
530 goto out_unlock;
523 531
524 nfs_wb_page(inode, page);
525 len = nfs_page_length(page); 532 len = nfs_page_length(page);
526 if (len == 0) 533 if (len == 0)
527 return nfs_return_empty_page(page); 534 return nfs_return_empty_page(page);
535
528 new = nfs_create_request(desc->ctx, inode, page, 0, len); 536 new = nfs_create_request(desc->ctx, inode, page, 0, len);
529 if (IS_ERR(new)) { 537 if (IS_ERR(new))
530 SetPageError(page); 538 goto out_error;
531 unlock_page(page); 539
532 return PTR_ERR(new);
533 }
534 if (len < PAGE_CACHE_SIZE) 540 if (len < PAGE_CACHE_SIZE)
535 zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0); 541 zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0);
536 nfs_pageio_add_request(desc->pgio, new); 542 nfs_pageio_add_request(desc->pgio, new);
537 return 0; 543 return 0;
544out_error:
545 error = PTR_ERR(new);
546 SetPageError(page);
547out_unlock:
548 unlock_page(page);
549 return error;
538} 550}
539 551
540int nfs_readpages(struct file *filp, struct address_space *mapping, 552int nfs_readpages(struct file *filp, struct address_space *mapping,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ca20d3cc26..adffe1615c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -45,6 +45,7 @@
45#include <linux/inet.h> 45#include <linux/inet.h>
46#include <linux/nfs_xdr.h> 46#include <linux/nfs_xdr.h>
47#include <linux/magic.h> 47#include <linux/magic.h>
48#include <linux/parser.h>
48 49
49#include <asm/system.h> 50#include <asm/system.h>
50#include <asm/uaccess.h> 51#include <asm/uaccess.h>
@@ -57,6 +58,167 @@
57 58
58#define NFSDBG_FACILITY NFSDBG_VFS 59#define NFSDBG_FACILITY NFSDBG_VFS
59 60
61
62struct nfs_parsed_mount_data {
63 int flags;
64 int rsize, wsize;
65 int timeo, retrans;
66 int acregmin, acregmax,
67 acdirmin, acdirmax;
68 int namlen;
69 unsigned int bsize;
70 unsigned int auth_flavor_len;
71 rpc_authflavor_t auth_flavors[1];
72 char *client_address;
73
74 struct {
75 struct sockaddr_in address;
76 unsigned int program;
77 unsigned int version;
78 unsigned short port;
79 int protocol;
80 } mount_server;
81
82 struct {
83 struct sockaddr_in address;
84 char *hostname;
85 char *export_path;
86 unsigned int program;
87 int protocol;
88 } nfs_server;
89};
90
91enum {
92 /* Mount options that take no arguments */
93 Opt_soft, Opt_hard,
94 Opt_intr, Opt_nointr,
95 Opt_posix, Opt_noposix,
96 Opt_cto, Opt_nocto,
97 Opt_ac, Opt_noac,
98 Opt_lock, Opt_nolock,
99 Opt_v2, Opt_v3,
100 Opt_udp, Opt_tcp,
101 Opt_acl, Opt_noacl,
102 Opt_rdirplus, Opt_nordirplus,
103 Opt_sharecache, Opt_nosharecache,
104
105 /* Mount options that take integer arguments */
106 Opt_port,
107 Opt_rsize, Opt_wsize, Opt_bsize,
108 Opt_timeo, Opt_retrans,
109 Opt_acregmin, Opt_acregmax,
110 Opt_acdirmin, Opt_acdirmax,
111 Opt_actimeo,
112 Opt_namelen,
113 Opt_mountport,
114 Opt_mountprog, Opt_mountvers,
115 Opt_nfsprog, Opt_nfsvers,
116
117 /* Mount options that take string arguments */
118 Opt_sec, Opt_proto, Opt_mountproto,
119 Opt_addr, Opt_mounthost, Opt_clientaddr,
120
121 /* Mount options that are ignored */
122 Opt_userspace, Opt_deprecated,
123
124 Opt_err
125};
126
127static match_table_t nfs_mount_option_tokens = {
128 { Opt_userspace, "bg" },
129 { Opt_userspace, "fg" },
130 { Opt_soft, "soft" },
131 { Opt_hard, "hard" },
132 { Opt_intr, "intr" },
133 { Opt_nointr, "nointr" },
134 { Opt_posix, "posix" },
135 { Opt_noposix, "noposix" },
136 { Opt_cto, "cto" },
137 { Opt_nocto, "nocto" },
138 { Opt_ac, "ac" },
139 { Opt_noac, "noac" },
140 { Opt_lock, "lock" },
141 { Opt_nolock, "nolock" },
142 { Opt_v2, "v2" },
143 { Opt_v3, "v3" },
144 { Opt_udp, "udp" },
145 { Opt_tcp, "tcp" },
146 { Opt_acl, "acl" },
147 { Opt_noacl, "noacl" },
148 { Opt_rdirplus, "rdirplus" },
149 { Opt_nordirplus, "nordirplus" },
150 { Opt_sharecache, "sharecache" },
151 { Opt_nosharecache, "nosharecache" },
152
153 { Opt_port, "port=%u" },
154 { Opt_rsize, "rsize=%u" },
155 { Opt_wsize, "wsize=%u" },
156 { Opt_bsize, "bsize=%u" },
157 { Opt_timeo, "timeo=%u" },
158 { Opt_retrans, "retrans=%u" },
159 { Opt_acregmin, "acregmin=%u" },
160 { Opt_acregmax, "acregmax=%u" },
161 { Opt_acdirmin, "acdirmin=%u" },
162 { Opt_acdirmax, "acdirmax=%u" },
163 { Opt_actimeo, "actimeo=%u" },
164 { Opt_userspace, "retry=%u" },
165 { Opt_namelen, "namlen=%u" },
166 { Opt_mountport, "mountport=%u" },
167 { Opt_mountprog, "mountprog=%u" },
168 { Opt_mountvers, "mountvers=%u" },
169 { Opt_nfsprog, "nfsprog=%u" },
170 { Opt_nfsvers, "nfsvers=%u" },
171 { Opt_nfsvers, "vers=%u" },
172
173 { Opt_sec, "sec=%s" },
174 { Opt_proto, "proto=%s" },
175 { Opt_mountproto, "mountproto=%s" },
176 { Opt_addr, "addr=%s" },
177 { Opt_clientaddr, "clientaddr=%s" },
178 { Opt_mounthost, "mounthost=%s" },
179
180 { Opt_err, NULL }
181};
182
183enum {
184 Opt_xprt_udp, Opt_xprt_tcp,
185
186 Opt_xprt_err
187};
188
189static match_table_t nfs_xprt_protocol_tokens = {
190 { Opt_xprt_udp, "udp" },
191 { Opt_xprt_tcp, "tcp" },
192
193 { Opt_xprt_err, NULL }
194};
195
196enum {
197 Opt_sec_none, Opt_sec_sys,
198 Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p,
199 Opt_sec_lkey, Opt_sec_lkeyi, Opt_sec_lkeyp,
200 Opt_sec_spkm, Opt_sec_spkmi, Opt_sec_spkmp,
201
202 Opt_sec_err
203};
204
205static match_table_t nfs_secflavor_tokens = {
206 { Opt_sec_none, "none" },
207 { Opt_sec_none, "null" },
208 { Opt_sec_sys, "sys" },
209
210 { Opt_sec_krb5, "krb5" },
211 { Opt_sec_krb5i, "krb5i" },
212 { Opt_sec_krb5p, "krb5p" },
213
214 { Opt_sec_lkey, "lkey" },
215 { Opt_sec_lkeyi, "lkeyi" },
216 { Opt_sec_lkeyp, "lkeyp" },
217
218 { Opt_sec_err, NULL }
219};
220
221
60static void nfs_umount_begin(struct vfsmount *, int); 222static void nfs_umount_begin(struct vfsmount *, int);
61static int nfs_statfs(struct dentry *, struct kstatfs *); 223static int nfs_statfs(struct dentry *, struct kstatfs *);
62static int nfs_show_options(struct seq_file *, struct vfsmount *); 224static int nfs_show_options(struct seq_file *, struct vfsmount *);
@@ -138,7 +300,10 @@ static const struct super_operations nfs4_sops = {
138}; 300};
139#endif 301#endif
140 302
141static struct shrinker *acl_shrinker; 303static struct shrinker acl_shrinker = {
304 .shrink = nfs_access_cache_shrinker,
305 .seeks = DEFAULT_SEEKS,
306};
142 307
143/* 308/*
144 * Register the NFS filesystems 309 * Register the NFS filesystems
@@ -159,7 +324,7 @@ int __init register_nfs_fs(void)
159 if (ret < 0) 324 if (ret < 0)
160 goto error_2; 325 goto error_2;
161#endif 326#endif
162 acl_shrinker = set_shrinker(DEFAULT_SEEKS, nfs_access_cache_shrinker); 327 register_shrinker(&acl_shrinker);
163 return 0; 328 return 0;
164 329
165#ifdef CONFIG_NFS_V4 330#ifdef CONFIG_NFS_V4
@@ -177,8 +342,7 @@ error_0:
177 */ 342 */
178void __exit unregister_nfs_fs(void) 343void __exit unregister_nfs_fs(void)
179{ 344{
180 if (acl_shrinker != NULL) 345 unregister_shrinker(&acl_shrinker);
181 remove_shrinker(acl_shrinker);
182#ifdef CONFIG_NFS_V4 346#ifdef CONFIG_NFS_V4
183 unregister_filesystem(&nfs4_fs_type); 347 unregister_filesystem(&nfs4_fs_type);
184 nfs_unregister_sysctl(); 348 nfs_unregister_sysctl();
@@ -263,11 +427,11 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
263 { RPC_AUTH_GSS_SPKM, "spkm" }, 427 { RPC_AUTH_GSS_SPKM, "spkm" },
264 { RPC_AUTH_GSS_SPKMI, "spkmi" }, 428 { RPC_AUTH_GSS_SPKMI, "spkmi" },
265 { RPC_AUTH_GSS_SPKMP, "spkmp" }, 429 { RPC_AUTH_GSS_SPKMP, "spkmp" },
266 { -1, "unknown" } 430 { UINT_MAX, "unknown" }
267 }; 431 };
268 int i; 432 int i;
269 433
270 for (i=0; sec_flavours[i].flavour != -1; i++) { 434 for (i = 0; sec_flavours[i].flavour != UINT_MAX; i++) {
271 if (sec_flavours[i].flavour == flavour) 435 if (sec_flavours[i].flavour == flavour)
272 break; 436 break;
273 } 437 }
@@ -291,6 +455,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
291 { NFS_MOUNT_NONLM, ",nolock", "" }, 455 { NFS_MOUNT_NONLM, ",nolock", "" },
292 { NFS_MOUNT_NOACL, ",noacl", "" }, 456 { NFS_MOUNT_NOACL, ",noacl", "" },
293 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, 457 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
458 { NFS_MOUNT_UNSHARED, ",nosharecache", ""},
294 { 0, NULL, NULL } 459 { 0, NULL, NULL }
295 }; 460 };
296 const struct proc_nfs_info *nfs_infop; 461 const struct proc_nfs_info *nfs_infop;
@@ -430,87 +595,641 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
430 */ 595 */
431static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags) 596static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
432{ 597{
598 struct nfs_server *server = NFS_SB(vfsmnt->mnt_sb);
599 struct rpc_clnt *rpc;
600
433 shrink_submounts(vfsmnt, &nfs_automount_list); 601 shrink_submounts(vfsmnt, &nfs_automount_list);
602
603 if (!(flags & MNT_FORCE))
604 return;
605 /* -EIO all pending I/O */
606 rpc = server->client_acl;
607 if (!IS_ERR(rpc))
608 rpc_killall_tasks(rpc);
609 rpc = server->client;
610 if (!IS_ERR(rpc))
611 rpc_killall_tasks(rpc);
434} 612}
435 613
436/* 614/*
437 * Validate the NFS2/NFS3 mount data 615 * Sanity-check a server address provided by the mount command
438 * - fills in the mount root filehandle
439 */ 616 */
440static int nfs_validate_mount_data(struct nfs_mount_data *data, 617static int nfs_verify_server_address(struct sockaddr *addr)
441 struct nfs_fh *mntfh)
442{ 618{
443 if (data == NULL) { 619 switch (addr->sa_family) {
444 dprintk("%s: missing data argument\n", __FUNCTION__); 620 case AF_INET: {
445 return -EINVAL; 621 struct sockaddr_in *sa = (struct sockaddr_in *) addr;
622 if (sa->sin_addr.s_addr != INADDR_ANY)
623 return 1;
624 break;
625 }
446 } 626 }
447 627
448 if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) { 628 return 0;
449 dprintk("%s: bad mount version\n", __FUNCTION__); 629}
450 return -EINVAL; 630
631/*
632 * Error-check and convert a string of mount options from user space into
633 * a data structure
634 */
635static int nfs_parse_mount_options(char *raw,
636 struct nfs_parsed_mount_data *mnt)
637{
638 char *p, *string;
639
640 if (!raw) {
641 dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
642 return 1;
451 } 643 }
644 dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw);
452 645
453 switch (data->version) { 646 while ((p = strsep(&raw, ",")) != NULL) {
454 case 1: 647 substring_t args[MAX_OPT_ARGS];
455 data->namlen = 0; 648 int option, token;
456 case 2: 649
457 data->bsize = 0; 650 if (!*p)
458 case 3: 651 continue;
459 if (data->flags & NFS_MOUNT_VER3) { 652
460 dprintk("%s: mount structure version %d does not support NFSv3\n", 653 dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", p);
461 __FUNCTION__, 654
462 data->version); 655 token = match_token(p, nfs_mount_option_tokens, args);
463 return -EINVAL; 656 switch (token) {
657 case Opt_soft:
658 mnt->flags |= NFS_MOUNT_SOFT;
659 break;
660 case Opt_hard:
661 mnt->flags &= ~NFS_MOUNT_SOFT;
662 break;
663 case Opt_intr:
664 mnt->flags |= NFS_MOUNT_INTR;
665 break;
666 case Opt_nointr:
667 mnt->flags &= ~NFS_MOUNT_INTR;
668 break;
669 case Opt_posix:
670 mnt->flags |= NFS_MOUNT_POSIX;
671 break;
672 case Opt_noposix:
673 mnt->flags &= ~NFS_MOUNT_POSIX;
674 break;
675 case Opt_cto:
676 mnt->flags &= ~NFS_MOUNT_NOCTO;
677 break;
678 case Opt_nocto:
679 mnt->flags |= NFS_MOUNT_NOCTO;
680 break;
681 case Opt_ac:
682 mnt->flags &= ~NFS_MOUNT_NOAC;
683 break;
684 case Opt_noac:
685 mnt->flags |= NFS_MOUNT_NOAC;
686 break;
687 case Opt_lock:
688 mnt->flags &= ~NFS_MOUNT_NONLM;
689 break;
690 case Opt_nolock:
691 mnt->flags |= NFS_MOUNT_NONLM;
692 break;
693 case Opt_v2:
694 mnt->flags &= ~NFS_MOUNT_VER3;
695 break;
696 case Opt_v3:
697 mnt->flags |= NFS_MOUNT_VER3;
698 break;
699 case Opt_udp:
700 mnt->flags &= ~NFS_MOUNT_TCP;
701 mnt->nfs_server.protocol = IPPROTO_UDP;
702 mnt->timeo = 7;
703 mnt->retrans = 5;
704 break;
705 case Opt_tcp:
706 mnt->flags |= NFS_MOUNT_TCP;
707 mnt->nfs_server.protocol = IPPROTO_TCP;
708 mnt->timeo = 600;
709 mnt->retrans = 2;
710 break;
711 case Opt_acl:
712 mnt->flags &= ~NFS_MOUNT_NOACL;
713 break;
714 case Opt_noacl:
715 mnt->flags |= NFS_MOUNT_NOACL;
716 break;
717 case Opt_rdirplus:
718 mnt->flags &= ~NFS_MOUNT_NORDIRPLUS;
719 break;
720 case Opt_nordirplus:
721 mnt->flags |= NFS_MOUNT_NORDIRPLUS;
722 break;
723 case Opt_sharecache:
724 mnt->flags &= ~NFS_MOUNT_UNSHARED;
725 break;
726 case Opt_nosharecache:
727 mnt->flags |= NFS_MOUNT_UNSHARED;
728 break;
729
730 case Opt_port:
731 if (match_int(args, &option))
732 return 0;
733 if (option < 0 || option > 65535)
734 return 0;
735 mnt->nfs_server.address.sin_port = htonl(option);
736 break;
737 case Opt_rsize:
738 if (match_int(args, &mnt->rsize))
739 return 0;
740 break;
741 case Opt_wsize:
742 if (match_int(args, &mnt->wsize))
743 return 0;
744 break;
745 case Opt_bsize:
746 if (match_int(args, &option))
747 return 0;
748 if (option < 0)
749 return 0;
750 mnt->bsize = option;
751 break;
752 case Opt_timeo:
753 if (match_int(args, &mnt->timeo))
754 return 0;
755 break;
756 case Opt_retrans:
757 if (match_int(args, &mnt->retrans))
758 return 0;
759 break;
760 case Opt_acregmin:
761 if (match_int(args, &mnt->acregmin))
762 return 0;
763 break;
764 case Opt_acregmax:
765 if (match_int(args, &mnt->acregmax))
766 return 0;
767 break;
768 case Opt_acdirmin:
769 if (match_int(args, &mnt->acdirmin))
770 return 0;
771 break;
772 case Opt_acdirmax:
773 if (match_int(args, &mnt->acdirmax))
774 return 0;
775 break;
776 case Opt_actimeo:
777 if (match_int(args, &option))
778 return 0;
779 if (option < 0)
780 return 0;
781 mnt->acregmin =
782 mnt->acregmax =
783 mnt->acdirmin =
784 mnt->acdirmax = option;
785 break;
786 case Opt_namelen:
787 if (match_int(args, &mnt->namlen))
788 return 0;
789 break;
790 case Opt_mountport:
791 if (match_int(args, &option))
792 return 0;
793 if (option < 0 || option > 65535)
794 return 0;
795 mnt->mount_server.port = option;
796 break;
797 case Opt_mountprog:
798 if (match_int(args, &option))
799 return 0;
800 if (option < 0)
801 return 0;
802 mnt->mount_server.program = option;
803 break;
804 case Opt_mountvers:
805 if (match_int(args, &option))
806 return 0;
807 if (option < 0)
808 return 0;
809 mnt->mount_server.version = option;
810 break;
811 case Opt_nfsprog:
812 if (match_int(args, &option))
813 return 0;
814 if (option < 0)
815 return 0;
816 mnt->nfs_server.program = option;
817 break;
818 case Opt_nfsvers:
819 if (match_int(args, &option))
820 return 0;
821 switch (option) {
822 case 2:
823 mnt->flags &= ~NFS_MOUNT_VER3;
824 break;
825 case 3:
826 mnt->flags |= NFS_MOUNT_VER3;
827 break;
828 default:
829 goto out_unrec_vers;
464 } 830 }
465 data->root.size = NFS2_FHSIZE; 831 break;
466 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); 832
467 case 4: 833 case Opt_sec:
468 if (data->flags & NFS_MOUNT_SECFLAVOUR) { 834 string = match_strdup(args);
469 dprintk("%s: mount structure version %d does not support strong security\n", 835 if (string == NULL)
470 __FUNCTION__, 836 goto out_nomem;
471 data->version); 837 token = match_token(string, nfs_secflavor_tokens, args);
472 return -EINVAL; 838 kfree(string);
839
840 /*
841 * The flags setting is for v2/v3. The flavor_len
842 * setting is for v4. v2/v3 also need to know the
843 * difference between NULL and UNIX.
844 */
845 switch (token) {
846 case Opt_sec_none:
847 mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
848 mnt->auth_flavor_len = 0;
849 mnt->auth_flavors[0] = RPC_AUTH_NULL;
850 break;
851 case Opt_sec_sys:
852 mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
853 mnt->auth_flavor_len = 0;
854 mnt->auth_flavors[0] = RPC_AUTH_UNIX;
855 break;
856 case Opt_sec_krb5:
857 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
858 mnt->auth_flavor_len = 1;
859 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
860 break;
861 case Opt_sec_krb5i:
862 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
863 mnt->auth_flavor_len = 1;
864 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
865 break;
866 case Opt_sec_krb5p:
867 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
868 mnt->auth_flavor_len = 1;
869 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
870 break;
871 case Opt_sec_lkey:
872 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
873 mnt->auth_flavor_len = 1;
874 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
875 break;
876 case Opt_sec_lkeyi:
877 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
878 mnt->auth_flavor_len = 1;
879 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
880 break;
881 case Opt_sec_lkeyp:
882 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
883 mnt->auth_flavor_len = 1;
884 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
885 break;
886 case Opt_sec_spkm:
887 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
888 mnt->auth_flavor_len = 1;
889 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
890 break;
891 case Opt_sec_spkmi:
892 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
893 mnt->auth_flavor_len = 1;
894 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
895 break;
896 case Opt_sec_spkmp:
897 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
898 mnt->auth_flavor_len = 1;
899 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
900 break;
901 default:
902 goto out_unrec_sec;
473 } 903 }
474 case 5: 904 break;
475 memset(data->context, 0, sizeof(data->context)); 905 case Opt_proto:
476 } 906 string = match_strdup(args);
907 if (string == NULL)
908 goto out_nomem;
909 token = match_token(string,
910 nfs_xprt_protocol_tokens, args);
911 kfree(string);
912
913 switch (token) {
914 case Opt_udp:
915 mnt->flags &= ~NFS_MOUNT_TCP;
916 mnt->nfs_server.protocol = IPPROTO_UDP;
917 mnt->timeo = 7;
918 mnt->retrans = 5;
919 break;
920 case Opt_tcp:
921 mnt->flags |= NFS_MOUNT_TCP;
922 mnt->nfs_server.protocol = IPPROTO_TCP;
923 mnt->timeo = 600;
924 mnt->retrans = 2;
925 break;
926 default:
927 goto out_unrec_xprt;
928 }
929 break;
930 case Opt_mountproto:
931 string = match_strdup(args);
932 if (string == NULL)
933 goto out_nomem;
934 token = match_token(string,
935 nfs_xprt_protocol_tokens, args);
936 kfree(string);
937
938 switch (token) {
939 case Opt_udp:
940 mnt->mount_server.protocol = IPPROTO_UDP;
941 break;
942 case Opt_tcp:
943 mnt->mount_server.protocol = IPPROTO_TCP;
944 break;
945 default:
946 goto out_unrec_xprt;
947 }
948 break;
949 case Opt_addr:
950 string = match_strdup(args);
951 if (string == NULL)
952 goto out_nomem;
953 mnt->nfs_server.address.sin_family = AF_INET;
954 mnt->nfs_server.address.sin_addr.s_addr =
955 in_aton(string);
956 kfree(string);
957 break;
958 case Opt_clientaddr:
959 string = match_strdup(args);
960 if (string == NULL)
961 goto out_nomem;
962 mnt->client_address = string;
963 break;
964 case Opt_mounthost:
965 string = match_strdup(args);
966 if (string == NULL)
967 goto out_nomem;
968 mnt->mount_server.address.sin_family = AF_INET;
969 mnt->mount_server.address.sin_addr.s_addr =
970 in_aton(string);
971 kfree(string);
972 break;
477 973
478 /* Set the pseudoflavor */ 974 case Opt_userspace:
479 if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) 975 case Opt_deprecated:
480 data->pseudoflavor = RPC_AUTH_UNIX; 976 break;
481 977
482#ifndef CONFIG_NFS_V3 978 default:
483 /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */ 979 goto out_unknown;
484 if (data->flags & NFS_MOUNT_VER3) { 980 }
485 dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
486 return -EPROTONOSUPPORT;
487 } 981 }
488#endif /* CONFIG_NFS_V3 */
489 982
490 /* We now require that the mount process passes the remote address */ 983 return 1;
491 if (data->addr.sin_addr.s_addr == INADDR_ANY) { 984
492 dprintk("%s: mount program didn't pass remote address!\n", 985out_nomem:
493 __FUNCTION__); 986 printk(KERN_INFO "NFS: not enough memory to parse option\n");
494 return -EINVAL; 987 return 0;
988
989out_unrec_vers:
990 printk(KERN_INFO "NFS: unrecognized NFS version number\n");
991 return 0;
992
993out_unrec_xprt:
994 printk(KERN_INFO "NFS: unrecognized transport protocol\n");
995 return 0;
996
997out_unrec_sec:
998 printk(KERN_INFO "NFS: unrecognized security flavor\n");
999 return 0;
1000
1001out_unknown:
1002 printk(KERN_INFO "NFS: unknown mount option: %s\n", p);
1003 return 0;
1004}
1005
1006/*
1007 * Use the remote server's MOUNT service to request the NFS file handle
1008 * corresponding to the provided path.
1009 */
1010static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1011 struct nfs_fh *root_fh)
1012{
1013 struct sockaddr_in sin;
1014 int status;
1015
1016 if (args->mount_server.version == 0) {
1017 if (args->flags & NFS_MOUNT_VER3)
1018 args->mount_server.version = NFS_MNT3_VERSION;
1019 else
1020 args->mount_server.version = NFS_MNT_VERSION;
495 } 1021 }
496 1022
497 /* Prepare the root filehandle */ 1023 /*
498 if (data->flags & NFS_MOUNT_VER3) 1024 * Construct the mount server's address.
499 mntfh->size = data->root.size; 1025 */
1026 if (args->mount_server.address.sin_addr.s_addr != INADDR_ANY)
1027 sin = args->mount_server.address;
500 else 1028 else
501 mntfh->size = NFS2_FHSIZE; 1029 sin = args->nfs_server.address;
1030 if (args->mount_server.port == 0) {
1031 status = rpcb_getport_sync(&sin,
1032 args->mount_server.program,
1033 args->mount_server.version,
1034 args->mount_server.protocol);
1035 if (status < 0)
1036 goto out_err;
1037 sin.sin_port = htons(status);
1038 } else
1039 sin.sin_port = htons(args->mount_server.port);
1040
1041 /*
1042 * Now ask the mount server to map our export path
1043 * to a file handle.
1044 */
1045 status = nfs_mount((struct sockaddr *) &sin,
1046 sizeof(sin),
1047 args->nfs_server.hostname,
1048 args->nfs_server.export_path,
1049 args->mount_server.version,
1050 args->mount_server.protocol,
1051 root_fh);
1052 if (status < 0)
1053 goto out_err;
1054
1055 return status;
502 1056
503 if (mntfh->size > sizeof(mntfh->data)) { 1057out_err:
504 dprintk("%s: invalid root filehandle\n", __FUNCTION__); 1058 dfprintk(MOUNT, "NFS: unable to contact server on host "
505 return -EINVAL; 1059 NIPQUAD_FMT "\n", NIPQUAD(sin.sin_addr.s_addr));
1060 return status;
1061}
1062
1063/*
1064 * Validate the NFS2/NFS3 mount data
1065 * - fills in the mount root filehandle
1066 *
1067 * For option strings, user space handles the following behaviors:
1068 *
1069 * + DNS: mapping server host name to IP address ("addr=" option)
1070 *
1071 * + failure mode: how to behave if a mount request can't be handled
1072 * immediately ("fg/bg" option)
1073 *
1074 * + retry: how often to retry a mount request ("retry=" option)
1075 *
1076 * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
1077 * mountproto=tcp after mountproto=udp, and so on
1078 *
1079 * XXX: as far as I can tell, changing the NFS program number is not
1080 * supported in the NFS client.
1081 */
1082static int nfs_validate_mount_data(struct nfs_mount_data **options,
1083 struct nfs_fh *mntfh,
1084 const char *dev_name)
1085{
1086 struct nfs_mount_data *data = *options;
1087
1088 if (data == NULL)
1089 goto out_no_data;
1090
1091 switch (data->version) {
1092 case 1:
1093 data->namlen = 0;
1094 case 2:
1095 data->bsize = 0;
1096 case 3:
1097 if (data->flags & NFS_MOUNT_VER3)
1098 goto out_no_v3;
1099 data->root.size = NFS2_FHSIZE;
1100 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
1101 case 4:
1102 if (data->flags & NFS_MOUNT_SECFLAVOUR)
1103 goto out_no_sec;
1104 case 5:
1105 memset(data->context, 0, sizeof(data->context));
1106 case 6:
1107 if (data->flags & NFS_MOUNT_VER3)
1108 mntfh->size = data->root.size;
1109 else
1110 mntfh->size = NFS2_FHSIZE;
1111
1112 if (mntfh->size > sizeof(mntfh->data))
1113 goto out_invalid_fh;
1114
1115 memcpy(mntfh->data, data->root.data, mntfh->size);
1116 if (mntfh->size < sizeof(mntfh->data))
1117 memset(mntfh->data + mntfh->size, 0,
1118 sizeof(mntfh->data) - mntfh->size);
1119 break;
1120 default: {
1121 unsigned int len;
1122 char *c;
1123 int status;
1124 struct nfs_parsed_mount_data args = {
1125 .flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP),
1126 .rsize = NFS_MAX_FILE_IO_SIZE,
1127 .wsize = NFS_MAX_FILE_IO_SIZE,
1128 .timeo = 600,
1129 .retrans = 2,
1130 .acregmin = 3,
1131 .acregmax = 60,
1132 .acdirmin = 30,
1133 .acdirmax = 60,
1134 .mount_server.protocol = IPPROTO_UDP,
1135 .mount_server.program = NFS_MNT_PROGRAM,
1136 .nfs_server.protocol = IPPROTO_TCP,
1137 .nfs_server.program = NFS_PROGRAM,
1138 };
1139
1140 if (nfs_parse_mount_options((char *) *options, &args) == 0)
1141 return -EINVAL;
1142
1143 data = kzalloc(sizeof(*data), GFP_KERNEL);
1144 if (data == NULL)
1145 return -ENOMEM;
1146
1147 /*
1148 * NB: after this point, caller will free "data"
1149 * if we return an error
1150 */
1151 *options = data;
1152
1153 c = strchr(dev_name, ':');
1154 if (c == NULL)
1155 return -EINVAL;
1156 len = c - dev_name - 1;
1157 if (len > sizeof(data->hostname))
1158 return -EINVAL;
1159 strncpy(data->hostname, dev_name, len);
1160 args.nfs_server.hostname = data->hostname;
1161
1162 c++;
1163 if (strlen(c) > NFS_MAXPATHLEN)
1164 return -EINVAL;
1165 args.nfs_server.export_path = c;
1166
1167 status = nfs_try_mount(&args, mntfh);
1168 if (status)
1169 return -EINVAL;
1170
1171 /*
1172 * Translate to nfs_mount_data, which nfs_fill_super
1173 * can deal with.
1174 */
1175 data->version = 6;
1176 data->flags = args.flags;
1177 data->rsize = args.rsize;
1178 data->wsize = args.wsize;
1179 data->timeo = args.timeo;
1180 data->retrans = args.retrans;
1181 data->acregmin = args.acregmin;
1182 data->acregmax = args.acregmax;
1183 data->acdirmin = args.acdirmin;
1184 data->acdirmax = args.acdirmax;
1185 data->addr = args.nfs_server.address;
1186 data->namlen = args.namlen;
1187 data->bsize = args.bsize;
1188 data->pseudoflavor = args.auth_flavors[0];
1189
1190 break;
1191 }
506 } 1192 }
507 1193
508 memcpy(mntfh->data, data->root.data, mntfh->size); 1194 if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
509 if (mntfh->size < sizeof(mntfh->data)) 1195 data->pseudoflavor = RPC_AUTH_UNIX;
510 memset(mntfh->data + mntfh->size, 0, 1196
511 sizeof(mntfh->data) - mntfh->size); 1197#ifndef CONFIG_NFS_V3
1198 if (data->flags & NFS_MOUNT_VER3)
1199 goto out_v3_not_compiled;
1200#endif /* !CONFIG_NFS_V3 */
1201
1202 if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
1203 goto out_no_address;
512 1204
513 return 0; 1205 return 0;
1206
1207out_no_data:
1208 dfprintk(MOUNT, "NFS: mount program didn't pass any mount data\n");
1209 return -EINVAL;
1210
1211out_no_v3:
1212 dfprintk(MOUNT, "NFS: nfs_mount_data version %d does not support v3\n",
1213 data->version);
1214 return -EINVAL;
1215
1216out_no_sec:
1217 dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
1218 return -EINVAL;
1219
1220#ifndef CONFIG_NFS_V3
1221out_v3_not_compiled:
1222 dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
1223 return -EPROTONOSUPPORT;
1224#endif /* !CONFIG_NFS_V3 */
1225
1226out_no_address:
1227 dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
1228 return -EINVAL;
1229
1230out_invalid_fh:
1231 dfprintk(MOUNT, "NFS: invalid root filehandle\n");
1232 return -EINVAL;
514} 1233}
515 1234
516/* 1235/*
@@ -600,13 +1319,51 @@ static int nfs_compare_super(struct super_block *sb, void *data)
600{ 1319{
601 struct nfs_server *server = data, *old = NFS_SB(sb); 1320 struct nfs_server *server = data, *old = NFS_SB(sb);
602 1321
603 if (old->nfs_client != server->nfs_client) 1322 if (memcmp(&old->nfs_client->cl_addr,
1323 &server->nfs_client->cl_addr,
1324 sizeof(old->nfs_client->cl_addr)) != 0)
1325 return 0;
1326 /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
1327 if (old->flags & NFS_MOUNT_UNSHARED)
604 return 0; 1328 return 0;
605 if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0) 1329 if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0)
606 return 0; 1330 return 0;
607 return 1; 1331 return 1;
608} 1332}
609 1333
1334#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
1335
1336static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
1337{
1338 const struct nfs_server *a = s->s_fs_info;
1339 const struct rpc_clnt *clnt_a = a->client;
1340 const struct rpc_clnt *clnt_b = b->client;
1341
1342 if ((s->s_flags & NFS_MS_MASK) != (flags & NFS_MS_MASK))
1343 goto Ebusy;
1344 if (a->nfs_client != b->nfs_client)
1345 goto Ebusy;
1346 if (a->flags != b->flags)
1347 goto Ebusy;
1348 if (a->wsize != b->wsize)
1349 goto Ebusy;
1350 if (a->rsize != b->rsize)
1351 goto Ebusy;
1352 if (a->acregmin != b->acregmin)
1353 goto Ebusy;
1354 if (a->acregmax != b->acregmax)
1355 goto Ebusy;
1356 if (a->acdirmin != b->acdirmin)
1357 goto Ebusy;
1358 if (a->acdirmax != b->acdirmax)
1359 goto Ebusy;
1360 if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
1361 goto Ebusy;
1362 return 0;
1363Ebusy:
1364 return -EBUSY;
1365}
1366
610static int nfs_get_sb(struct file_system_type *fs_type, 1367static int nfs_get_sb(struct file_system_type *fs_type,
611 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 1368 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
612{ 1369{
@@ -615,30 +1372,37 @@ static int nfs_get_sb(struct file_system_type *fs_type,
615 struct nfs_fh mntfh; 1372 struct nfs_fh mntfh;
616 struct nfs_mount_data *data = raw_data; 1373 struct nfs_mount_data *data = raw_data;
617 struct dentry *mntroot; 1374 struct dentry *mntroot;
1375 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
618 int error; 1376 int error;
619 1377
620 /* Validate the mount data */ 1378 /* Validate the mount data */
621 error = nfs_validate_mount_data(data, &mntfh); 1379 error = nfs_validate_mount_data(&data, &mntfh, dev_name);
622 if (error < 0) 1380 if (error < 0)
623 return error; 1381 goto out;
624 1382
625 /* Get a volume representation */ 1383 /* Get a volume representation */
626 server = nfs_create_server(data, &mntfh); 1384 server = nfs_create_server(data, &mntfh);
627 if (IS_ERR(server)) { 1385 if (IS_ERR(server)) {
628 error = PTR_ERR(server); 1386 error = PTR_ERR(server);
629 goto out_err_noserver; 1387 goto out;
630 } 1388 }
631 1389
1390 if (server->flags & NFS_MOUNT_UNSHARED)
1391 compare_super = NULL;
1392
632 /* Get a superblock - note that we may end up sharing one that already exists */ 1393 /* Get a superblock - note that we may end up sharing one that already exists */
633 s = sget(fs_type, nfs_compare_super, nfs_set_super, server); 1394 s = sget(fs_type, compare_super, nfs_set_super, server);
634 if (IS_ERR(s)) { 1395 if (IS_ERR(s)) {
635 error = PTR_ERR(s); 1396 error = PTR_ERR(s);
636 goto out_err_nosb; 1397 goto out_err_nosb;
637 } 1398 }
638 1399
639 if (s->s_fs_info != server) { 1400 if (s->s_fs_info != server) {
1401 error = nfs_compare_mount_options(s, server, flags);
640 nfs_free_server(server); 1402 nfs_free_server(server);
641 server = NULL; 1403 server = NULL;
1404 if (error < 0)
1405 goto error_splat_super;
642 } 1406 }
643 1407
644 if (!s->s_root) { 1408 if (!s->s_root) {
@@ -656,17 +1420,21 @@ static int nfs_get_sb(struct file_system_type *fs_type,
656 s->s_flags |= MS_ACTIVE; 1420 s->s_flags |= MS_ACTIVE;
657 mnt->mnt_sb = s; 1421 mnt->mnt_sb = s;
658 mnt->mnt_root = mntroot; 1422 mnt->mnt_root = mntroot;
659 return 0; 1423 error = 0;
1424
1425out:
1426 if (data != raw_data)
1427 kfree(data);
1428 return error;
660 1429
661out_err_nosb: 1430out_err_nosb:
662 nfs_free_server(server); 1431 nfs_free_server(server);
663out_err_noserver: 1432 goto out;
664 return error;
665 1433
666error_splat_super: 1434error_splat_super:
667 up_write(&s->s_umount); 1435 up_write(&s->s_umount);
668 deactivate_super(s); 1436 deactivate_super(s);
669 return error; 1437 goto out;
670} 1438}
671 1439
672/* 1440/*
@@ -691,6 +1459,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
691 struct super_block *s; 1459 struct super_block *s;
692 struct nfs_server *server; 1460 struct nfs_server *server;
693 struct dentry *mntroot; 1461 struct dentry *mntroot;
1462 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
694 int error; 1463 int error;
695 1464
696 dprintk("--> nfs_xdev_get_sb()\n"); 1465 dprintk("--> nfs_xdev_get_sb()\n");
@@ -702,16 +1471,22 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
702 goto out_err_noserver; 1471 goto out_err_noserver;
703 } 1472 }
704 1473
1474 if (server->flags & NFS_MOUNT_UNSHARED)
1475 compare_super = NULL;
1476
705 /* Get a superblock - note that we may end up sharing one that already exists */ 1477 /* Get a superblock - note that we may end up sharing one that already exists */
706 s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); 1478 s = sget(&nfs_fs_type, compare_super, nfs_set_super, server);
707 if (IS_ERR(s)) { 1479 if (IS_ERR(s)) {
708 error = PTR_ERR(s); 1480 error = PTR_ERR(s);
709 goto out_err_nosb; 1481 goto out_err_nosb;
710 } 1482 }
711 1483
712 if (s->s_fs_info != server) { 1484 if (s->s_fs_info != server) {
1485 error = nfs_compare_mount_options(s, server, flags);
713 nfs_free_server(server); 1486 nfs_free_server(server);
714 server = NULL; 1487 server = NULL;
1488 if (error < 0)
1489 goto error_splat_super;
715 } 1490 }
716 1491
717 if (!s->s_root) { 1492 if (!s->s_root) {
@@ -772,25 +1547,164 @@ static void nfs4_fill_super(struct super_block *sb)
772 nfs_initialise_sb(sb); 1547 nfs_initialise_sb(sb);
773} 1548}
774 1549
775static void *nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen) 1550/*
1551 * Validate NFSv4 mount options
1552 */
1553static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
1554 const char *dev_name,
1555 struct sockaddr_in *addr,
1556 rpc_authflavor_t *authflavour,
1557 char **hostname,
1558 char **mntpath,
1559 char **ip_addr)
776{ 1560{
777 void *p = NULL; 1561 struct nfs4_mount_data *data = *options;
778 1562 char *c;
779 if (!src->len) 1563
780 return ERR_PTR(-EINVAL); 1564 if (data == NULL)
781 if (src->len < maxlen) 1565 goto out_no_data;
782 maxlen = src->len; 1566
783 if (dst == NULL) { 1567 switch (data->version) {
784 p = dst = kmalloc(maxlen + 1, GFP_KERNEL); 1568 case 1:
785 if (p == NULL) 1569 if (data->host_addrlen != sizeof(*addr))
786 return ERR_PTR(-ENOMEM); 1570 goto out_no_address;
787 } 1571 if (copy_from_user(addr, data->host_addr, sizeof(*addr)))
788 if (copy_from_user(dst, src->data, maxlen)) { 1572 return -EFAULT;
789 kfree(p); 1573 if (addr->sin_port == 0)
790 return ERR_PTR(-EFAULT); 1574 addr->sin_port = htons(NFS_PORT);
1575 if (!nfs_verify_server_address((struct sockaddr *) addr))
1576 goto out_no_address;
1577
1578 switch (data->auth_flavourlen) {
1579 case 0:
1580 *authflavour = RPC_AUTH_UNIX;
1581 break;
1582 case 1:
1583 if (copy_from_user(authflavour, data->auth_flavours,
1584 sizeof(*authflavour)))
1585 return -EFAULT;
1586 break;
1587 default:
1588 goto out_inval_auth;
1589 }
1590
1591 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
1592 if (IS_ERR(c))
1593 return PTR_ERR(c);
1594 *hostname = c;
1595
1596 c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
1597 if (IS_ERR(c))
1598 return PTR_ERR(c);
1599 *mntpath = c;
1600 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *mntpath);
1601
1602 c = strndup_user(data->client_addr.data, 16);
1603 if (IS_ERR(c))
1604 return PTR_ERR(c);
1605 *ip_addr = c;
1606
1607 break;
1608 default: {
1609 unsigned int len;
1610 struct nfs_parsed_mount_data args = {
1611 .rsize = NFS_MAX_FILE_IO_SIZE,
1612 .wsize = NFS_MAX_FILE_IO_SIZE,
1613 .timeo = 600,
1614 .retrans = 2,
1615 .acregmin = 3,
1616 .acregmax = 60,
1617 .acdirmin = 30,
1618 .acdirmax = 60,
1619 .nfs_server.protocol = IPPROTO_TCP,
1620 };
1621
1622 if (nfs_parse_mount_options((char *) *options, &args) == 0)
1623 return -EINVAL;
1624
1625 if (!nfs_verify_server_address((struct sockaddr *)
1626 &args.nfs_server.address))
1627 return -EINVAL;
1628 *addr = args.nfs_server.address;
1629
1630 switch (args.auth_flavor_len) {
1631 case 0:
1632 *authflavour = RPC_AUTH_UNIX;
1633 break;
1634 case 1:
1635 *authflavour = (rpc_authflavor_t) args.auth_flavors[0];
1636 break;
1637 default:
1638 goto out_inval_auth;
1639 }
1640
1641 /*
1642 * Translate to nfs4_mount_data, which nfs4_fill_super
1643 * can deal with.
1644 */
1645 data = kzalloc(sizeof(*data), GFP_KERNEL);
1646 if (data == NULL)
1647 return -ENOMEM;
1648 *options = data;
1649
1650 data->version = 1;
1651 data->flags = args.flags & NFS4_MOUNT_FLAGMASK;
1652 data->rsize = args.rsize;
1653 data->wsize = args.wsize;
1654 data->timeo = args.timeo;
1655 data->retrans = args.retrans;
1656 data->acregmin = args.acregmin;
1657 data->acregmax = args.acregmax;
1658 data->acdirmin = args.acdirmin;
1659 data->acdirmax = args.acdirmax;
1660 data->proto = args.nfs_server.protocol;
1661
1662 /*
1663 * Split "dev_name" into "hostname:mntpath".
1664 */
1665 c = strchr(dev_name, ':');
1666 if (c == NULL)
1667 return -EINVAL;
1668 /* while calculating len, pretend ':' is '\0' */
1669 len = c - dev_name;
1670 if (len > NFS4_MAXNAMLEN)
1671 return -EINVAL;
1672 *hostname = kzalloc(len, GFP_KERNEL);
1673 if (*hostname == NULL)
1674 return -ENOMEM;
1675 strncpy(*hostname, dev_name, len - 1);
1676
1677 c++; /* step over the ':' */
1678 len = strlen(c);
1679 if (len > NFS4_MAXPATHLEN)
1680 return -EINVAL;
1681 *mntpath = kzalloc(len + 1, GFP_KERNEL);
1682 if (*mntpath == NULL)
1683 return -ENOMEM;
1684 strncpy(*mntpath, c, len);
1685
1686 dprintk("MNTPATH: %s\n", *mntpath);
1687
1688 *ip_addr = args.client_address;
1689
1690 break;
1691 }
791 } 1692 }
792 dst[maxlen] = '\0'; 1693
793 return dst; 1694 return 0;
1695
1696out_no_data:
1697 dfprintk(MOUNT, "NFS4: mount program didn't pass any mount data\n");
1698 return -EINVAL;
1699
1700out_inval_auth:
1701 dfprintk(MOUNT, "NFS4: Invalid number of RPC auth flavours %d\n",
1702 data->auth_flavourlen);
1703 return -EINVAL;
1704
1705out_no_address:
1706 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
1707 return -EINVAL;
794} 1708}
795 1709
796/* 1710/*
@@ -806,81 +1720,29 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
806 rpc_authflavor_t authflavour; 1720 rpc_authflavor_t authflavour;
807 struct nfs_fh mntfh; 1721 struct nfs_fh mntfh;
808 struct dentry *mntroot; 1722 struct dentry *mntroot;
809 char *mntpath = NULL, *hostname = NULL, ip_addr[16]; 1723 char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL;
810 void *p; 1724 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
811 int error; 1725 int error;
812 1726
813 if (data == NULL) { 1727 /* Validate the mount data */
814 dprintk("%s: missing data argument\n", __FUNCTION__); 1728 error = nfs4_validate_mount_data(&data, dev_name, &addr, &authflavour,
815 return -EINVAL; 1729 &hostname, &mntpath, &ip_addr);
816 } 1730 if (error < 0)
817 if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) { 1731 goto out;
818 dprintk("%s: bad mount version\n", __FUNCTION__);
819 return -EINVAL;
820 }
821
822 /* We now require that the mount process passes the remote address */
823 if (data->host_addrlen != sizeof(addr))
824 return -EINVAL;
825
826 if (copy_from_user(&addr, data->host_addr, sizeof(addr)))
827 return -EFAULT;
828
829 if (addr.sin_family != AF_INET ||
830 addr.sin_addr.s_addr == INADDR_ANY
831 ) {
832 dprintk("%s: mount program didn't pass remote IP address!\n",
833 __FUNCTION__);
834 return -EINVAL;
835 }
836 /* RFC3530: The default port for NFS is 2049 */
837 if (addr.sin_port == 0)
838 addr.sin_port = htons(NFS_PORT);
839
840 /* Grab the authentication type */
841 authflavour = RPC_AUTH_UNIX;
842 if (data->auth_flavourlen != 0) {
843 if (data->auth_flavourlen != 1) {
844 dprintk("%s: Invalid number of RPC auth flavours %d.\n",
845 __FUNCTION__, data->auth_flavourlen);
846 error = -EINVAL;
847 goto out_err_noserver;
848 }
849
850 if (copy_from_user(&authflavour, data->auth_flavours,
851 sizeof(authflavour))) {
852 error = -EFAULT;
853 goto out_err_noserver;
854 }
855 }
856
857 p = nfs_copy_user_string(NULL, &data->hostname, 256);
858 if (IS_ERR(p))
859 goto out_err;
860 hostname = p;
861
862 p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
863 if (IS_ERR(p))
864 goto out_err;
865 mntpath = p;
866
867 dprintk("MNTPATH: %s\n", mntpath);
868
869 p = nfs_copy_user_string(ip_addr, &data->client_addr,
870 sizeof(ip_addr) - 1);
871 if (IS_ERR(p))
872 goto out_err;
873 1732
874 /* Get a volume representation */ 1733 /* Get a volume representation */
875 server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr, 1734 server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr,
876 authflavour, &mntfh); 1735 authflavour, &mntfh);
877 if (IS_ERR(server)) { 1736 if (IS_ERR(server)) {
878 error = PTR_ERR(server); 1737 error = PTR_ERR(server);
879 goto out_err_noserver; 1738 goto out;
880 } 1739 }
881 1740
1741 if (server->flags & NFS4_MOUNT_UNSHARED)
1742 compare_super = NULL;
1743
882 /* Get a superblock - note that we may end up sharing one that already exists */ 1744 /* Get a superblock - note that we may end up sharing one that already exists */
883 s = sget(fs_type, nfs_compare_super, nfs_set_super, server); 1745 s = sget(fs_type, compare_super, nfs_set_super, server);
884 if (IS_ERR(s)) { 1746 if (IS_ERR(s)) {
885 error = PTR_ERR(s); 1747 error = PTR_ERR(s);
886 goto out_free; 1748 goto out_free;
@@ -906,25 +1768,22 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
906 s->s_flags |= MS_ACTIVE; 1768 s->s_flags |= MS_ACTIVE;
907 mnt->mnt_sb = s; 1769 mnt->mnt_sb = s;
908 mnt->mnt_root = mntroot; 1770 mnt->mnt_root = mntroot;
1771 error = 0;
1772
1773out:
1774 kfree(ip_addr);
909 kfree(mntpath); 1775 kfree(mntpath);
910 kfree(hostname); 1776 kfree(hostname);
911 return 0; 1777 return error;
912
913out_err:
914 error = PTR_ERR(p);
915 goto out_err_noserver;
916 1778
917out_free: 1779out_free:
918 nfs_free_server(server); 1780 nfs_free_server(server);
919out_err_noserver: 1781 goto out;
920 kfree(mntpath);
921 kfree(hostname);
922 return error;
923 1782
924error_splat_super: 1783error_splat_super:
925 up_write(&s->s_umount); 1784 up_write(&s->s_umount);
926 deactivate_super(s); 1785 deactivate_super(s);
927 goto out_err_noserver; 1786 goto out;
928} 1787}
929 1788
930static void nfs4_kill_super(struct super_block *sb) 1789static void nfs4_kill_super(struct super_block *sb)
@@ -949,6 +1808,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
949 struct super_block *s; 1808 struct super_block *s;
950 struct nfs_server *server; 1809 struct nfs_server *server;
951 struct dentry *mntroot; 1810 struct dentry *mntroot;
1811 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
952 int error; 1812 int error;
953 1813
954 dprintk("--> nfs4_xdev_get_sb()\n"); 1814 dprintk("--> nfs4_xdev_get_sb()\n");
@@ -960,8 +1820,11 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
960 goto out_err_noserver; 1820 goto out_err_noserver;
961 } 1821 }
962 1822
1823 if (server->flags & NFS4_MOUNT_UNSHARED)
1824 compare_super = NULL;
1825
963 /* Get a superblock - note that we may end up sharing one that already exists */ 1826 /* Get a superblock - note that we may end up sharing one that already exists */
964 s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); 1827 s = sget(&nfs_fs_type, compare_super, nfs_set_super, server);
965 if (IS_ERR(s)) { 1828 if (IS_ERR(s)) {
966 error = PTR_ERR(s); 1829 error = PTR_ERR(s);
967 goto out_err_nosb; 1830 goto out_err_nosb;
@@ -1016,6 +1879,7 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
1016 struct nfs_server *server; 1879 struct nfs_server *server;
1017 struct dentry *mntroot; 1880 struct dentry *mntroot;
1018 struct nfs_fh mntfh; 1881 struct nfs_fh mntfh;
1882 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
1019 int error; 1883 int error;
1020 1884
1021 dprintk("--> nfs4_referral_get_sb()\n"); 1885 dprintk("--> nfs4_referral_get_sb()\n");
@@ -1027,8 +1891,11 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
1027 goto out_err_noserver; 1891 goto out_err_noserver;
1028 } 1892 }
1029 1893
1894 if (server->flags & NFS4_MOUNT_UNSHARED)
1895 compare_super = NULL;
1896
1030 /* Get a superblock - note that we may end up sharing one that already exists */ 1897 /* Get a superblock - note that we may end up sharing one that already exists */
1031 s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); 1898 s = sget(&nfs_fs_type, compare_super, nfs_set_super, server);
1032 if (IS_ERR(s)) { 1899 if (IS_ERR(s)) {
1033 error = PTR_ERR(s); 1900 error = PTR_ERR(s);
1034 goto out_err_nosb; 1901 goto out_err_nosb;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index af344a158e..73ac992ece 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -117,19 +117,19 @@ static struct nfs_page *nfs_page_find_request_locked(struct page *page)
117 if (PagePrivate(page)) { 117 if (PagePrivate(page)) {
118 req = (struct nfs_page *)page_private(page); 118 req = (struct nfs_page *)page_private(page);
119 if (req != NULL) 119 if (req != NULL)
120 atomic_inc(&req->wb_count); 120 kref_get(&req->wb_kref);
121 } 121 }
122 return req; 122 return req;
123} 123}
124 124
125static struct nfs_page *nfs_page_find_request(struct page *page) 125static struct nfs_page *nfs_page_find_request(struct page *page)
126{ 126{
127 struct inode *inode = page->mapping->host;
127 struct nfs_page *req = NULL; 128 struct nfs_page *req = NULL;
128 spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
129 129
130 spin_lock(req_lock); 130 spin_lock(&inode->i_lock);
131 req = nfs_page_find_request_locked(page); 131 req = nfs_page_find_request_locked(page);
132 spin_unlock(req_lock); 132 spin_unlock(&inode->i_lock);
133 return req; 133 return req;
134} 134}
135 135
@@ -191,8 +191,6 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
191 } 191 }
192 /* Update file length */ 192 /* Update file length */
193 nfs_grow_file(page, offset, count); 193 nfs_grow_file(page, offset, count);
194 /* Set the PG_uptodate flag? */
195 nfs_mark_uptodate(page, offset, count);
196 nfs_unlock_request(req); 194 nfs_unlock_request(req);
197 return 0; 195 return 0;
198} 196}
@@ -253,16 +251,16 @@ static void nfs_end_page_writeback(struct page *page)
253static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, 251static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
254 struct page *page) 252 struct page *page)
255{ 253{
254 struct inode *inode = page->mapping->host;
255 struct nfs_inode *nfsi = NFS_I(inode);
256 struct nfs_page *req; 256 struct nfs_page *req;
257 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
258 spinlock_t *req_lock = &nfsi->req_lock;
259 int ret; 257 int ret;
260 258
261 spin_lock(req_lock); 259 spin_lock(&inode->i_lock);
262 for(;;) { 260 for(;;) {
263 req = nfs_page_find_request_locked(page); 261 req = nfs_page_find_request_locked(page);
264 if (req == NULL) { 262 if (req == NULL) {
265 spin_unlock(req_lock); 263 spin_unlock(&inode->i_lock);
266 return 1; 264 return 1;
267 } 265 }
268 if (nfs_lock_request_dontget(req)) 266 if (nfs_lock_request_dontget(req))
@@ -272,28 +270,28 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
272 * succeed provided that someone hasn't already marked the 270 * succeed provided that someone hasn't already marked the
273 * request as dirty (in which case we don't care). 271 * request as dirty (in which case we don't care).
274 */ 272 */
275 spin_unlock(req_lock); 273 spin_unlock(&inode->i_lock);
276 ret = nfs_wait_on_request(req); 274 ret = nfs_wait_on_request(req);
277 nfs_release_request(req); 275 nfs_release_request(req);
278 if (ret != 0) 276 if (ret != 0)
279 return ret; 277 return ret;
280 spin_lock(req_lock); 278 spin_lock(&inode->i_lock);
281 } 279 }
282 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { 280 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
283 /* This request is marked for commit */ 281 /* This request is marked for commit */
284 spin_unlock(req_lock); 282 spin_unlock(&inode->i_lock);
285 nfs_unlock_request(req); 283 nfs_unlock_request(req);
286 nfs_pageio_complete(pgio); 284 nfs_pageio_complete(pgio);
287 return 1; 285 return 1;
288 } 286 }
289 if (nfs_set_page_writeback(page) != 0) { 287 if (nfs_set_page_writeback(page) != 0) {
290 spin_unlock(req_lock); 288 spin_unlock(&inode->i_lock);
291 BUG(); 289 BUG();
292 } 290 }
293 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, 291 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
294 NFS_PAGE_TAG_WRITEBACK); 292 NFS_PAGE_TAG_LOCKED);
295 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); 293 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
296 spin_unlock(req_lock); 294 spin_unlock(&inode->i_lock);
297 nfs_pageio_add_request(pgio, req); 295 nfs_pageio_add_request(pgio, req);
298 return ret; 296 return ret;
299} 297}
@@ -400,7 +398,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
400 if (PageDirty(req->wb_page)) 398 if (PageDirty(req->wb_page))
401 set_bit(PG_NEED_FLUSH, &req->wb_flags); 399 set_bit(PG_NEED_FLUSH, &req->wb_flags);
402 nfsi->npages++; 400 nfsi->npages++;
403 atomic_inc(&req->wb_count); 401 kref_get(&req->wb_kref);
404 return 0; 402 return 0;
405} 403}
406 404
@@ -409,12 +407,12 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
409 */ 407 */
410static void nfs_inode_remove_request(struct nfs_page *req) 408static void nfs_inode_remove_request(struct nfs_page *req)
411{ 409{
412 struct inode *inode = req->wb_context->dentry->d_inode; 410 struct inode *inode = req->wb_context->path.dentry->d_inode;
413 struct nfs_inode *nfsi = NFS_I(inode); 411 struct nfs_inode *nfsi = NFS_I(inode);
414 412
415 BUG_ON (!NFS_WBACK_BUSY(req)); 413 BUG_ON (!NFS_WBACK_BUSY(req));
416 414
417 spin_lock(&nfsi->req_lock); 415 spin_lock(&inode->i_lock);
418 set_page_private(req->wb_page, 0); 416 set_page_private(req->wb_page, 0);
419 ClearPagePrivate(req->wb_page); 417 ClearPagePrivate(req->wb_page);
420 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 418 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
@@ -422,11 +420,11 @@ static void nfs_inode_remove_request(struct nfs_page *req)
422 __set_page_dirty_nobuffers(req->wb_page); 420 __set_page_dirty_nobuffers(req->wb_page);
423 nfsi->npages--; 421 nfsi->npages--;
424 if (!nfsi->npages) { 422 if (!nfsi->npages) {
425 spin_unlock(&nfsi->req_lock); 423 spin_unlock(&inode->i_lock);
426 nfs_end_data_update(inode); 424 nfs_end_data_update(inode);
427 iput(inode); 425 iput(inode);
428 } else 426 } else
429 spin_unlock(&nfsi->req_lock); 427 spin_unlock(&inode->i_lock);
430 nfs_clear_request(req); 428 nfs_clear_request(req);
431 nfs_release_request(req); 429 nfs_release_request(req);
432} 430}
@@ -457,14 +455,16 @@ nfs_dirty_request(struct nfs_page *req)
457static void 455static void
458nfs_mark_request_commit(struct nfs_page *req) 456nfs_mark_request_commit(struct nfs_page *req)
459{ 457{
460 struct inode *inode = req->wb_context->dentry->d_inode; 458 struct inode *inode = req->wb_context->path.dentry->d_inode;
461 struct nfs_inode *nfsi = NFS_I(inode); 459 struct nfs_inode *nfsi = NFS_I(inode);
462 460
463 spin_lock(&nfsi->req_lock); 461 spin_lock(&inode->i_lock);
464 nfs_list_add_request(req, &nfsi->commit);
465 nfsi->ncommit++; 462 nfsi->ncommit++;
466 set_bit(PG_NEED_COMMIT, &(req)->wb_flags); 463 set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
467 spin_unlock(&nfsi->req_lock); 464 radix_tree_tag_set(&nfsi->nfs_page_tree,
465 req->wb_index,
466 NFS_PAGE_TAG_COMMIT);
467 spin_unlock(&inode->i_lock);
468 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 468 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
469 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 469 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
470} 470}
@@ -526,18 +526,18 @@ static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, u
526 idx_end = idx_start + npages - 1; 526 idx_end = idx_start + npages - 1;
527 527
528 next = idx_start; 528 next = idx_start;
529 while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_WRITEBACK)) { 529 while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
530 if (req->wb_index > idx_end) 530 if (req->wb_index > idx_end)
531 break; 531 break;
532 532
533 next = req->wb_index + 1; 533 next = req->wb_index + 1;
534 BUG_ON(!NFS_WBACK_BUSY(req)); 534 BUG_ON(!NFS_WBACK_BUSY(req));
535 535
536 atomic_inc(&req->wb_count); 536 kref_get(&req->wb_kref);
537 spin_unlock(&nfsi->req_lock); 537 spin_unlock(&inode->i_lock);
538 error = nfs_wait_on_request(req); 538 error = nfs_wait_on_request(req);
539 nfs_release_request(req); 539 nfs_release_request(req);
540 spin_lock(&nfsi->req_lock); 540 spin_lock(&inode->i_lock);
541 if (error < 0) 541 if (error < 0)
542 return error; 542 return error;
543 res++; 543 res++;
@@ -577,10 +577,9 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u
577 int res = 0; 577 int res = 0;
578 578
579 if (nfsi->ncommit != 0) { 579 if (nfsi->ncommit != 0) {
580 res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages); 580 res = nfs_scan_list(nfsi, dst, idx_start, npages,
581 NFS_PAGE_TAG_COMMIT);
581 nfsi->ncommit -= res; 582 nfsi->ncommit -= res;
582 if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
583 printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
584 } 583 }
585 return res; 584 return res;
586} 585}
@@ -603,7 +602,6 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
603{ 602{
604 struct address_space *mapping = page->mapping; 603 struct address_space *mapping = page->mapping;
605 struct inode *inode = mapping->host; 604 struct inode *inode = mapping->host;
606 struct nfs_inode *nfsi = NFS_I(inode);
607 struct nfs_page *req, *new = NULL; 605 struct nfs_page *req, *new = NULL;
608 pgoff_t rqend, end; 606 pgoff_t rqend, end;
609 607
@@ -613,13 +611,13 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
613 /* Loop over all inode entries and see if we find 611 /* Loop over all inode entries and see if we find
614 * A request for the page we wish to update 612 * A request for the page we wish to update
615 */ 613 */
616 spin_lock(&nfsi->req_lock); 614 spin_lock(&inode->i_lock);
617 req = nfs_page_find_request_locked(page); 615 req = nfs_page_find_request_locked(page);
618 if (req) { 616 if (req) {
619 if (!nfs_lock_request_dontget(req)) { 617 if (!nfs_lock_request_dontget(req)) {
620 int error; 618 int error;
621 619
622 spin_unlock(&nfsi->req_lock); 620 spin_unlock(&inode->i_lock);
623 error = nfs_wait_on_request(req); 621 error = nfs_wait_on_request(req);
624 nfs_release_request(req); 622 nfs_release_request(req);
625 if (error < 0) { 623 if (error < 0) {
@@ -629,7 +627,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
629 } 627 }
630 continue; 628 continue;
631 } 629 }
632 spin_unlock(&nfsi->req_lock); 630 spin_unlock(&inode->i_lock);
633 if (new) 631 if (new)
634 nfs_release_request(new); 632 nfs_release_request(new);
635 break; 633 break;
@@ -640,14 +638,14 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
640 nfs_lock_request_dontget(new); 638 nfs_lock_request_dontget(new);
641 error = nfs_inode_add_request(inode, new); 639 error = nfs_inode_add_request(inode, new);
642 if (error) { 640 if (error) {
643 spin_unlock(&nfsi->req_lock); 641 spin_unlock(&inode->i_lock);
644 nfs_unlock_request(new); 642 nfs_unlock_request(new);
645 return ERR_PTR(error); 643 return ERR_PTR(error);
646 } 644 }
647 spin_unlock(&nfsi->req_lock); 645 spin_unlock(&inode->i_lock);
648 return new; 646 return new;
649 } 647 }
650 spin_unlock(&nfsi->req_lock); 648 spin_unlock(&inode->i_lock);
651 649
652 new = nfs_create_request(ctx, inode, page, offset, bytes); 650 new = nfs_create_request(ctx, inode, page, offset, bytes);
653 if (IS_ERR(new)) 651 if (IS_ERR(new))
@@ -751,12 +749,17 @@ int nfs_updatepage(struct file *file, struct page *page,
751static void nfs_writepage_release(struct nfs_page *req) 749static void nfs_writepage_release(struct nfs_page *req)
752{ 750{
753 751
754 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { 752 if (PageError(req->wb_page)) {
753 nfs_end_page_writeback(req->wb_page);
754 nfs_inode_remove_request(req);
755 } else if (!nfs_reschedule_unstable_write(req)) {
756 /* Set the PG_uptodate flag */
757 nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes);
755 nfs_end_page_writeback(req->wb_page); 758 nfs_end_page_writeback(req->wb_page);
756 nfs_inode_remove_request(req); 759 nfs_inode_remove_request(req);
757 } else 760 } else
758 nfs_end_page_writeback(req->wb_page); 761 nfs_end_page_writeback(req->wb_page);
759 nfs_clear_page_writeback(req); 762 nfs_clear_page_tag_locked(req);
760} 763}
761 764
762static inline int flush_task_priority(int how) 765static inline int flush_task_priority(int how)
@@ -786,7 +789,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
786 * NB: take care not to mess about with data->commit et al. */ 789 * NB: take care not to mess about with data->commit et al. */
787 790
788 data->req = req; 791 data->req = req;
789 data->inode = inode = req->wb_context->dentry->d_inode; 792 data->inode = inode = req->wb_context->path.dentry->d_inode;
790 data->cred = req->wb_context->cred; 793 data->cred = req->wb_context->cred;
791 794
792 data->args.fh = NFS_FH(inode); 795 data->args.fh = NFS_FH(inode);
@@ -885,7 +888,7 @@ out_bad:
885 } 888 }
886 nfs_redirty_request(req); 889 nfs_redirty_request(req);
887 nfs_end_page_writeback(req->wb_page); 890 nfs_end_page_writeback(req->wb_page);
888 nfs_clear_page_writeback(req); 891 nfs_clear_page_tag_locked(req);
889 return -ENOMEM; 892 return -ENOMEM;
890} 893}
891 894
@@ -928,7 +931,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
928 nfs_list_remove_request(req); 931 nfs_list_remove_request(req);
929 nfs_redirty_request(req); 932 nfs_redirty_request(req);
930 nfs_end_page_writeback(req->wb_page); 933 nfs_end_page_writeback(req->wb_page);
931 nfs_clear_page_writeback(req); 934 nfs_clear_page_tag_locked(req);
932 } 935 }
933 return -ENOMEM; 936 return -ENOMEM;
934} 937}
@@ -954,8 +957,8 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
954 struct page *page = req->wb_page; 957 struct page *page = req->wb_page;
955 958
956 dprintk("NFS: write (%s/%Ld %d@%Ld)", 959 dprintk("NFS: write (%s/%Ld %d@%Ld)",
957 req->wb_context->dentry->d_inode->i_sb->s_id, 960 req->wb_context->path.dentry->d_inode->i_sb->s_id,
958 (long long)NFS_FILEID(req->wb_context->dentry->d_inode), 961 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
959 req->wb_bytes, 962 req->wb_bytes,
960 (long long)req_offset(req)); 963 (long long)req_offset(req));
961 964
@@ -970,9 +973,9 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
970 } 973 }
971 974
972 if (nfs_write_need_commit(data)) { 975 if (nfs_write_need_commit(data)) {
973 spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock; 976 struct inode *inode = page->mapping->host;
974 977
975 spin_lock(req_lock); 978 spin_lock(&inode->i_lock);
976 if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { 979 if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
977 /* Do nothing we need to resend the writes */ 980 /* Do nothing we need to resend the writes */
978 } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { 981 } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) {
@@ -983,7 +986,7 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
983 clear_bit(PG_NEED_COMMIT, &req->wb_flags); 986 clear_bit(PG_NEED_COMMIT, &req->wb_flags);
984 dprintk(" server reboot detected\n"); 987 dprintk(" server reboot detected\n");
985 } 988 }
986 spin_unlock(req_lock); 989 spin_unlock(&inode->i_lock);
987 } else 990 } else
988 dprintk(" OK\n"); 991 dprintk(" OK\n");
989 992
@@ -1020,8 +1023,8 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1020 page = req->wb_page; 1023 page = req->wb_page;
1021 1024
1022 dprintk("NFS: write (%s/%Ld %d@%Ld)", 1025 dprintk("NFS: write (%s/%Ld %d@%Ld)",
1023 req->wb_context->dentry->d_inode->i_sb->s_id, 1026 req->wb_context->path.dentry->d_inode->i_sb->s_id,
1024 (long long)NFS_FILEID(req->wb_context->dentry->d_inode), 1027 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
1025 req->wb_bytes, 1028 req->wb_bytes,
1026 (long long)req_offset(req)); 1029 (long long)req_offset(req));
1027 1030
@@ -1039,12 +1042,14 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1039 dprintk(" marked for commit\n"); 1042 dprintk(" marked for commit\n");
1040 goto next; 1043 goto next;
1041 } 1044 }
1045 /* Set the PG_uptodate flag? */
1046 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
1042 dprintk(" OK\n"); 1047 dprintk(" OK\n");
1043remove_request: 1048remove_request:
1044 nfs_end_page_writeback(page); 1049 nfs_end_page_writeback(page);
1045 nfs_inode_remove_request(req); 1050 nfs_inode_remove_request(req);
1046 next: 1051 next:
1047 nfs_clear_page_writeback(req); 1052 nfs_clear_page_tag_locked(req);
1048 } 1053 }
1049} 1054}
1050 1055
@@ -1157,7 +1162,7 @@ static void nfs_commit_rpcsetup(struct list_head *head,
1157 1162
1158 list_splice_init(head, &data->pages); 1163 list_splice_init(head, &data->pages);
1159 first = nfs_list_entry(data->pages.next); 1164 first = nfs_list_entry(data->pages.next);
1160 inode = first->wb_context->dentry->d_inode; 1165 inode = first->wb_context->path.dentry->d_inode;
1161 1166
1162 data->inode = inode; 1167 data->inode = inode;
1163 data->cred = first->wb_context->cred; 1168 data->cred = first->wb_context->cred;
@@ -1207,7 +1212,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1207 nfs_list_remove_request(req); 1212 nfs_list_remove_request(req);
1208 nfs_mark_request_commit(req); 1213 nfs_mark_request_commit(req);
1209 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1214 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1210 nfs_clear_page_writeback(req); 1215 nfs_clear_page_tag_locked(req);
1211 } 1216 }
1212 return -ENOMEM; 1217 return -ENOMEM;
1213} 1218}
@@ -1234,8 +1239,8 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
1234 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1239 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1235 1240
1236 dprintk("NFS: commit (%s/%Ld %d@%Ld)", 1241 dprintk("NFS: commit (%s/%Ld %d@%Ld)",
1237 req->wb_context->dentry->d_inode->i_sb->s_id, 1242 req->wb_context->path.dentry->d_inode->i_sb->s_id,
1238 (long long)NFS_FILEID(req->wb_context->dentry->d_inode), 1243 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
1239 req->wb_bytes, 1244 req->wb_bytes,
1240 (long long)req_offset(req)); 1245 (long long)req_offset(req));
1241 if (task->tk_status < 0) { 1246 if (task->tk_status < 0) {
@@ -1249,6 +1254,9 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
1249 * returned by the server against all stored verfs. */ 1254 * returned by the server against all stored verfs. */
1250 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { 1255 if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
1251 /* We have a match */ 1256 /* We have a match */
1257 /* Set the PG_uptodate flag */
1258 nfs_mark_uptodate(req->wb_page, req->wb_pgbase,
1259 req->wb_bytes);
1252 nfs_inode_remove_request(req); 1260 nfs_inode_remove_request(req);
1253 dprintk(" OK\n"); 1261 dprintk(" OK\n");
1254 goto next; 1262 goto next;
@@ -1257,7 +1265,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
1257 dprintk(" mismatch\n"); 1265 dprintk(" mismatch\n");
1258 nfs_redirty_request(req); 1266 nfs_redirty_request(req);
1259 next: 1267 next:
1260 nfs_clear_page_writeback(req); 1268 nfs_clear_page_tag_locked(req);
1261 } 1269 }
1262} 1270}
1263 1271
@@ -1268,13 +1276,12 @@ static const struct rpc_call_ops nfs_commit_ops = {
1268 1276
1269int nfs_commit_inode(struct inode *inode, int how) 1277int nfs_commit_inode(struct inode *inode, int how)
1270{ 1278{
1271 struct nfs_inode *nfsi = NFS_I(inode);
1272 LIST_HEAD(head); 1279 LIST_HEAD(head);
1273 int res; 1280 int res;
1274 1281
1275 spin_lock(&nfsi->req_lock); 1282 spin_lock(&inode->i_lock);
1276 res = nfs_scan_commit(inode, &head, 0, 0); 1283 res = nfs_scan_commit(inode, &head, 0, 0);
1277 spin_unlock(&nfsi->req_lock); 1284 spin_unlock(&inode->i_lock);
1278 if (res) { 1285 if (res) {
1279 int error = nfs_commit_list(inode, &head, how); 1286 int error = nfs_commit_list(inode, &head, how);
1280 if (error < 0) 1287 if (error < 0)
@@ -1292,7 +1299,6 @@ static inline int nfs_commit_list(struct inode *inode, struct list_head *head, i
1292long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) 1299long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
1293{ 1300{
1294 struct inode *inode = mapping->host; 1301 struct inode *inode = mapping->host;
1295 struct nfs_inode *nfsi = NFS_I(inode);
1296 pgoff_t idx_start, idx_end; 1302 pgoff_t idx_start, idx_end;
1297 unsigned int npages = 0; 1303 unsigned int npages = 0;
1298 LIST_HEAD(head); 1304 LIST_HEAD(head);
@@ -1314,7 +1320,7 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
1314 } 1320 }
1315 } 1321 }
1316 how &= ~FLUSH_NOCOMMIT; 1322 how &= ~FLUSH_NOCOMMIT;
1317 spin_lock(&nfsi->req_lock); 1323 spin_lock(&inode->i_lock);
1318 do { 1324 do {
1319 ret = nfs_wait_on_requests_locked(inode, idx_start, npages); 1325 ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
1320 if (ret != 0) 1326 if (ret != 0)
@@ -1325,18 +1331,19 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
1325 if (pages == 0) 1331 if (pages == 0)
1326 break; 1332 break;
1327 if (how & FLUSH_INVALIDATE) { 1333 if (how & FLUSH_INVALIDATE) {
1328 spin_unlock(&nfsi->req_lock); 1334 spin_unlock(&inode->i_lock);
1329 nfs_cancel_commit_list(&head); 1335 nfs_cancel_commit_list(&head);
1330 ret = pages; 1336 ret = pages;
1331 spin_lock(&nfsi->req_lock); 1337 spin_lock(&inode->i_lock);
1332 continue; 1338 continue;
1333 } 1339 }
1334 pages += nfs_scan_commit(inode, &head, 0, 0); 1340 pages += nfs_scan_commit(inode, &head, 0, 0);
1335 spin_unlock(&nfsi->req_lock); 1341 spin_unlock(&inode->i_lock);
1336 ret = nfs_commit_list(inode, &head, how); 1342 ret = nfs_commit_list(inode, &head, how);
1337 spin_lock(&nfsi->req_lock); 1343 spin_lock(&inode->i_lock);
1344
1338 } while (ret >= 0); 1345 } while (ret >= 0);
1339 spin_unlock(&nfsi->req_lock); 1346 spin_unlock(&inode->i_lock);
1340 return ret; 1347 return ret;
1341} 1348}
1342 1349
@@ -1430,7 +1437,6 @@ int nfs_set_page_dirty(struct page *page)
1430{ 1437{
1431 struct address_space *mapping = page->mapping; 1438 struct address_space *mapping = page->mapping;
1432 struct inode *inode; 1439 struct inode *inode;
1433 spinlock_t *req_lock;
1434 struct nfs_page *req; 1440 struct nfs_page *req;
1435 int ret; 1441 int ret;
1436 1442
@@ -1439,18 +1445,17 @@ int nfs_set_page_dirty(struct page *page)
1439 inode = mapping->host; 1445 inode = mapping->host;
1440 if (!inode) 1446 if (!inode)
1441 goto out_raced; 1447 goto out_raced;
1442 req_lock = &NFS_I(inode)->req_lock; 1448 spin_lock(&inode->i_lock);
1443 spin_lock(req_lock);
1444 req = nfs_page_find_request_locked(page); 1449 req = nfs_page_find_request_locked(page);
1445 if (req != NULL) { 1450 if (req != NULL) {
1446 /* Mark any existing write requests for flushing */ 1451 /* Mark any existing write requests for flushing */
1447 ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags); 1452 ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags);
1448 spin_unlock(req_lock); 1453 spin_unlock(&inode->i_lock);
1449 nfs_release_request(req); 1454 nfs_release_request(req);
1450 return ret; 1455 return ret;
1451 } 1456 }
1452 ret = __set_page_dirty_nobuffers(page); 1457 ret = __set_page_dirty_nobuffers(page);
1453 spin_unlock(req_lock); 1458 spin_unlock(&inode->i_lock);
1454 return ret; 1459 return ret;
1455out_raced: 1460out_raced:
1456 return !TestSetPageDirty(page); 1461 return !TestSetPageDirty(page);
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 6e92b0fe53..cf61dc8ae9 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -12,17 +12,31 @@
12 12
13#define CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE)) 13#define CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE))
14 14
15static int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
16{
17 struct exp_flavor_info *f;
18 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
19
20 for (f = exp->ex_flavors; f < end; f++) {
21 if (f->pseudoflavor == rqstp->rq_flavor)
22 return f->flags;
23 }
24 return exp->ex_flags;
25
26}
27
15int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) 28int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
16{ 29{
17 struct svc_cred cred = rqstp->rq_cred; 30 struct svc_cred cred = rqstp->rq_cred;
18 int i; 31 int i;
32 int flags = nfsexp_flags(rqstp, exp);
19 int ret; 33 int ret;
20 34
21 if (exp->ex_flags & NFSEXP_ALLSQUASH) { 35 if (flags & NFSEXP_ALLSQUASH) {
22 cred.cr_uid = exp->ex_anon_uid; 36 cred.cr_uid = exp->ex_anon_uid;
23 cred.cr_gid = exp->ex_anon_gid; 37 cred.cr_gid = exp->ex_anon_gid;
24 cred.cr_group_info = groups_alloc(0); 38 cred.cr_group_info = groups_alloc(0);
25 } else if (exp->ex_flags & NFSEXP_ROOTSQUASH) { 39 } else if (flags & NFSEXP_ROOTSQUASH) {
26 struct group_info *gi; 40 struct group_info *gi;
27 if (!cred.cr_uid) 41 if (!cred.cr_uid)
28 cred.cr_uid = exp->ex_anon_uid; 42 cred.cr_uid = exp->ex_anon_uid;
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 79bd03b8bb..c7bbf460b0 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -26,12 +26,15 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/hash.h> 27#include <linux/hash.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/exportfs.h>
29 30
30#include <linux/sunrpc/svc.h> 31#include <linux/sunrpc/svc.h>
31#include <linux/nfsd/nfsd.h> 32#include <linux/nfsd/nfsd.h>
32#include <linux/nfsd/nfsfh.h> 33#include <linux/nfsd/nfsfh.h>
33#include <linux/nfsd/syscall.h> 34#include <linux/nfsd/syscall.h>
34#include <linux/lockd/bind.h> 35#include <linux/lockd/bind.h>
36#include <linux/sunrpc/msg_prot.h>
37#include <linux/sunrpc/gss_api.h>
35 38
36#define NFSDDBG_FACILITY NFSDDBG_EXPORT 39#define NFSDDBG_FACILITY NFSDDBG_EXPORT
37 40
@@ -451,8 +454,48 @@ out_free_all:
451 return err; 454 return err;
452} 455}
453 456
457static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
458{
459 int listsize, err;
460 struct exp_flavor_info *f;
461
462 err = get_int(mesg, &listsize);
463 if (err)
464 return err;
465 if (listsize < 0 || listsize > MAX_SECINFO_LIST)
466 return -EINVAL;
467
468 for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) {
469 err = get_int(mesg, &f->pseudoflavor);
470 if (err)
471 return err;
472 /*
473 * Just a quick sanity check; we could also try to check
474 * whether this pseudoflavor is supported, but at worst
475 * an unsupported pseudoflavor on the export would just
476 * be a pseudoflavor that won't match the flavor of any
477 * authenticated request. The administrator will
478 * probably discover the problem when someone fails to
479 * authenticate.
480 */
481 if (f->pseudoflavor < 0)
482 return -EINVAL;
483 err = get_int(mesg, &f->flags);
484 if (err)
485 return err;
486 /* Only some flags are allowed to differ between flavors: */
487 if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags))
488 return -EINVAL;
489 }
490 exp->ex_nflavors = listsize;
491 return 0;
492}
493
454#else /* CONFIG_NFSD_V4 */ 494#else /* CONFIG_NFSD_V4 */
455static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; } 495static inline int
496fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;}
497static inline int
498secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
456#endif 499#endif
457 500
458static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) 501static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
@@ -476,6 +519,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
476 519
477 exp.ex_uuid = NULL; 520 exp.ex_uuid = NULL;
478 521
522 /* secinfo */
523 exp.ex_nflavors = 0;
524
479 if (mesg[mlen-1] != '\n') 525 if (mesg[mlen-1] != '\n')
480 return -EINVAL; 526 return -EINVAL;
481 mesg[mlen-1] = 0; 527 mesg[mlen-1] = 0;
@@ -553,7 +599,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
553 if (exp.ex_uuid == NULL) 599 if (exp.ex_uuid == NULL)
554 err = -ENOMEM; 600 err = -ENOMEM;
555 } 601 }
556 } else 602 } else if (strcmp(buf, "secinfo") == 0)
603 err = secinfo_parse(&mesg, buf, &exp);
604 else
557 /* quietly ignore unknown words and anything 605 /* quietly ignore unknown words and anything
558 * following. Newer user-space can try to set 606 * following. Newer user-space can try to set
559 * new values, then see what the result was. 607 * new values, then see what the result was.
@@ -593,6 +641,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
593 641
594static void exp_flags(struct seq_file *m, int flag, int fsid, 642static void exp_flags(struct seq_file *m, int flag, int fsid,
595 uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs); 643 uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs);
644static void show_secinfo(struct seq_file *m, struct svc_export *exp);
596 645
597static int svc_export_show(struct seq_file *m, 646static int svc_export_show(struct seq_file *m,
598 struct cache_detail *cd, 647 struct cache_detail *cd,
@@ -622,6 +671,7 @@ static int svc_export_show(struct seq_file *m,
622 seq_printf(m, "%02x", exp->ex_uuid[i]); 671 seq_printf(m, "%02x", exp->ex_uuid[i]);
623 } 672 }
624 } 673 }
674 show_secinfo(m, exp);
625 } 675 }
626 seq_puts(m, ")\n"); 676 seq_puts(m, ")\n");
627 return 0; 677 return 0;
@@ -654,6 +704,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
654{ 704{
655 struct svc_export *new = container_of(cnew, struct svc_export, h); 705 struct svc_export *new = container_of(cnew, struct svc_export, h);
656 struct svc_export *item = container_of(citem, struct svc_export, h); 706 struct svc_export *item = container_of(citem, struct svc_export, h);
707 int i;
657 708
658 new->ex_flags = item->ex_flags; 709 new->ex_flags = item->ex_flags;
659 new->ex_anon_uid = item->ex_anon_uid; 710 new->ex_anon_uid = item->ex_anon_uid;
@@ -669,6 +720,10 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
669 item->ex_fslocs.locations_count = 0; 720 item->ex_fslocs.locations_count = 0;
670 new->ex_fslocs.migrated = item->ex_fslocs.migrated; 721 new->ex_fslocs.migrated = item->ex_fslocs.migrated;
671 item->ex_fslocs.migrated = 0; 722 item->ex_fslocs.migrated = 0;
723 new->ex_nflavors = item->ex_nflavors;
724 for (i = 0; i < MAX_SECINFO_LIST; i++) {
725 new->ex_flavors[i] = item->ex_flavors[i];
726 }
672} 727}
673 728
674static struct cache_head *svc_export_alloc(void) 729static struct cache_head *svc_export_alloc(void)
@@ -738,16 +793,18 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
738 int err; 793 int err;
739 794
740 if (!clp) 795 if (!clp)
741 return NULL; 796 return ERR_PTR(-ENOENT);
742 797
743 key.ek_client = clp; 798 key.ek_client = clp;
744 key.ek_fsidtype = fsid_type; 799 key.ek_fsidtype = fsid_type;
745 memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); 800 memcpy(key.ek_fsid, fsidv, key_len(fsid_type));
746 801
747 ek = svc_expkey_lookup(&key); 802 ek = svc_expkey_lookup(&key);
748 if (ek != NULL) 803 if (ek == NULL)
749 if ((err = cache_check(&svc_expkey_cache, &ek->h, reqp))) 804 return ERR_PTR(-ENOMEM);
750 ek = ERR_PTR(err); 805 err = cache_check(&svc_expkey_cache, &ek->h, reqp);
806 if (err)
807 return ERR_PTR(err);
751 return ek; 808 return ek;
752} 809}
753 810
@@ -808,30 +865,21 @@ exp_get_by_name(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry,
808 struct cache_req *reqp) 865 struct cache_req *reqp)
809{ 866{
810 struct svc_export *exp, key; 867 struct svc_export *exp, key;
868 int err;
811 869
812 if (!clp) 870 if (!clp)
813 return NULL; 871 return ERR_PTR(-ENOENT);
814 872
815 key.ex_client = clp; 873 key.ex_client = clp;
816 key.ex_mnt = mnt; 874 key.ex_mnt = mnt;
817 key.ex_dentry = dentry; 875 key.ex_dentry = dentry;
818 876
819 exp = svc_export_lookup(&key); 877 exp = svc_export_lookup(&key);
820 if (exp != NULL) { 878 if (exp == NULL)
821 int err; 879 return ERR_PTR(-ENOMEM);
822 880 err = cache_check(&svc_export_cache, &exp->h, reqp);
823 err = cache_check(&svc_export_cache, &exp->h, reqp); 881 if (err)
824 switch (err) { 882 return ERR_PTR(err);
825 case 0: break;
826 case -EAGAIN:
827 case -ETIMEDOUT:
828 exp = ERR_PTR(err);
829 break;
830 default:
831 exp = NULL;
832 }
833 }
834
835 return exp; 883 return exp;
836} 884}
837 885
@@ -847,7 +895,7 @@ exp_parent(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry,
847 dget(dentry); 895 dget(dentry);
848 exp = exp_get_by_name(clp, mnt, dentry, reqp); 896 exp = exp_get_by_name(clp, mnt, dentry, reqp);
849 897
850 while (exp == NULL && !IS_ROOT(dentry)) { 898 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
851 struct dentry *parent; 899 struct dentry *parent;
852 900
853 parent = dget_parent(dentry); 901 parent = dget_parent(dentry);
@@ -900,7 +948,7 @@ static void exp_fsid_unhash(struct svc_export *exp)
900 return; 948 return;
901 949
902 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); 950 ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
903 if (ek && !IS_ERR(ek)) { 951 if (!IS_ERR(ek)) {
904 ek->h.expiry_time = get_seconds()-1; 952 ek->h.expiry_time = get_seconds()-1;
905 cache_put(&ek->h, &svc_expkey_cache); 953 cache_put(&ek->h, &svc_expkey_cache);
906 } 954 }
@@ -938,7 +986,7 @@ static void exp_unhash(struct svc_export *exp)
938 struct inode *inode = exp->ex_dentry->d_inode; 986 struct inode *inode = exp->ex_dentry->d_inode;
939 987
940 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); 988 ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
941 if (ek && !IS_ERR(ek)) { 989 if (!IS_ERR(ek)) {
942 ek->h.expiry_time = get_seconds()-1; 990 ek->h.expiry_time = get_seconds()-1;
943 cache_put(&ek->h, &svc_expkey_cache); 991 cache_put(&ek->h, &svc_expkey_cache);
944 } 992 }
@@ -989,13 +1037,12 @@ exp_export(struct nfsctl_export *nxp)
989 1037
990 /* must make sure there won't be an ex_fsid clash */ 1038 /* must make sure there won't be an ex_fsid clash */
991 if ((nxp->ex_flags & NFSEXP_FSID) && 1039 if ((nxp->ex_flags & NFSEXP_FSID) &&
992 (fsid_key = exp_get_fsid_key(clp, nxp->ex_dev)) && 1040 (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) &&
993 !IS_ERR(fsid_key) &&
994 fsid_key->ek_mnt && 1041 fsid_key->ek_mnt &&
995 (fsid_key->ek_mnt != nd.mnt || fsid_key->ek_dentry != nd.dentry) ) 1042 (fsid_key->ek_mnt != nd.mnt || fsid_key->ek_dentry != nd.dentry) )
996 goto finish; 1043 goto finish;
997 1044
998 if (exp) { 1045 if (!IS_ERR(exp)) {
999 /* just a flags/id/fsid update */ 1046 /* just a flags/id/fsid update */
1000 1047
1001 exp_fsid_unhash(exp); 1048 exp_fsid_unhash(exp);
@@ -1104,7 +1151,7 @@ exp_unexport(struct nfsctl_export *nxp)
1104 err = -EINVAL; 1151 err = -EINVAL;
1105 exp = exp_get_by_name(dom, nd.mnt, nd.dentry, NULL); 1152 exp = exp_get_by_name(dom, nd.mnt, nd.dentry, NULL);
1106 path_release(&nd); 1153 path_release(&nd);
1107 if (!exp) 1154 if (IS_ERR(exp))
1108 goto out_domain; 1155 goto out_domain;
1109 1156
1110 exp_do_unexport(exp); 1157 exp_do_unexport(exp);
@@ -1149,10 +1196,6 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize)
1149 err = PTR_ERR(exp); 1196 err = PTR_ERR(exp);
1150 goto out; 1197 goto out;
1151 } 1198 }
1152 if (!exp) {
1153 dprintk("nfsd: exp_rootfh export not found.\n");
1154 goto out;
1155 }
1156 1199
1157 /* 1200 /*
1158 * fh must be initialized before calling fh_compose 1201 * fh must be initialized before calling fh_compose
@@ -1176,17 +1219,130 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
1176{ 1219{
1177 struct svc_export *exp; 1220 struct svc_export *exp;
1178 struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp); 1221 struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp);
1179 if (!ek || IS_ERR(ek)) 1222 if (IS_ERR(ek))
1180 return ERR_PTR(PTR_ERR(ek)); 1223 return ERR_PTR(PTR_ERR(ek));
1181 1224
1182 exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp); 1225 exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp);
1183 cache_put(&ek->h, &svc_expkey_cache); 1226 cache_put(&ek->h, &svc_expkey_cache);
1184 1227
1185 if (!exp || IS_ERR(exp)) 1228 if (IS_ERR(exp))
1186 return ERR_PTR(PTR_ERR(exp)); 1229 return ERR_PTR(PTR_ERR(exp));
1187 return exp; 1230 return exp;
1188} 1231}
1189 1232
1233__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
1234{
1235 struct exp_flavor_info *f;
1236 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
1237
1238 /* legacy gss-only clients are always OK: */
1239 if (exp->ex_client == rqstp->rq_gssclient)
1240 return 0;
1241 /* ip-address based client; check sec= export option: */
1242 for (f = exp->ex_flavors; f < end; f++) {
1243 if (f->pseudoflavor == rqstp->rq_flavor)
1244 return 0;
1245 }
1246 /* defaults in absence of sec= options: */
1247 if (exp->ex_nflavors == 0) {
1248 if (rqstp->rq_flavor == RPC_AUTH_NULL ||
1249 rqstp->rq_flavor == RPC_AUTH_UNIX)
1250 return 0;
1251 }
1252 return nfserr_wrongsec;
1253}
1254
1255/*
1256 * Uses rq_client and rq_gssclient to find an export; uses rq_client (an
1257 * auth_unix client) if it's available and has secinfo information;
1258 * otherwise, will try to use rq_gssclient.
1259 *
1260 * Called from functions that handle requests; functions that do work on
1261 * behalf of mountd are passed a single client name to use, and should
1262 * use exp_get_by_name() or exp_find().
1263 */
1264struct svc_export *
1265rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
1266 struct dentry *dentry)
1267{
1268 struct svc_export *gssexp, *exp = NULL;
1269
1270 if (rqstp->rq_client == NULL)
1271 goto gss;
1272
1273 /* First try the auth_unix client: */
1274 exp = exp_get_by_name(rqstp->rq_client, mnt, dentry,
1275 &rqstp->rq_chandle);
1276 if (PTR_ERR(exp) == -ENOENT)
1277 goto gss;
1278 if (IS_ERR(exp))
1279 return exp;
1280 /* If it has secinfo, assume there are no gss/... clients */
1281 if (exp->ex_nflavors > 0)
1282 return exp;
1283gss:
1284 /* Otherwise, try falling back on gss client */
1285 if (rqstp->rq_gssclient == NULL)
1286 return exp;
1287 gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry,
1288 &rqstp->rq_chandle);
1289 if (PTR_ERR(gssexp) == -ENOENT)
1290 return exp;
1291 if (exp && !IS_ERR(exp))
1292 exp_put(exp);
1293 return gssexp;
1294}
1295
1296struct svc_export *
1297rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
1298{
1299 struct svc_export *gssexp, *exp = NULL;
1300
1301 if (rqstp->rq_client == NULL)
1302 goto gss;
1303
1304 /* First try the auth_unix client: */
1305 exp = exp_find(rqstp->rq_client, fsid_type, fsidv, &rqstp->rq_chandle);
1306 if (PTR_ERR(exp) == -ENOENT)
1307 goto gss;
1308 if (IS_ERR(exp))
1309 return exp;
1310 /* If it has secinfo, assume there are no gss/... clients */
1311 if (exp->ex_nflavors > 0)
1312 return exp;
1313gss:
1314 /* Otherwise, try falling back on gss client */
1315 if (rqstp->rq_gssclient == NULL)
1316 return exp;
1317 gssexp = exp_find(rqstp->rq_gssclient, fsid_type, fsidv,
1318 &rqstp->rq_chandle);
1319 if (PTR_ERR(gssexp) == -ENOENT)
1320 return exp;
1321 if (exp && !IS_ERR(exp))
1322 exp_put(exp);
1323 return gssexp;
1324}
1325
1326struct svc_export *
1327rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
1328 struct dentry *dentry)
1329{
1330 struct svc_export *exp;
1331
1332 dget(dentry);
1333 exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
1334
1335 while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
1336 struct dentry *parent;
1337
1338 parent = dget_parent(dentry);
1339 dput(dentry);
1340 dentry = parent;
1341 exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
1342 }
1343 dput(dentry);
1344 return exp;
1345}
1190 1346
1191/* 1347/*
1192 * Called when we need the filehandle for the root of the pseudofs, 1348 * Called when we need the filehandle for the root of the pseudofs,
@@ -1194,8 +1350,7 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
1194 * export point with fsid==0 1350 * export point with fsid==0
1195 */ 1351 */
1196__be32 1352__be32
1197exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, 1353exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
1198 struct cache_req *creq)
1199{ 1354{
1200 struct svc_export *exp; 1355 struct svc_export *exp;
1201 __be32 rv; 1356 __be32 rv;
@@ -1203,12 +1358,16 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
1203 1358
1204 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); 1359 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
1205 1360
1206 exp = exp_find(clp, FSID_NUM, fsidv, creq); 1361 exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
1362 if (PTR_ERR(exp) == -ENOENT)
1363 return nfserr_perm;
1207 if (IS_ERR(exp)) 1364 if (IS_ERR(exp))
1208 return nfserrno(PTR_ERR(exp)); 1365 return nfserrno(PTR_ERR(exp));
1209 if (exp == NULL)
1210 return nfserr_perm;
1211 rv = fh_compose(fhp, exp, exp->ex_dentry, NULL); 1366 rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
1367 if (rv)
1368 goto out;
1369 rv = check_nfsd_access(exp, rqstp);
1370out:
1212 exp_put(exp); 1371 exp_put(exp);
1213 return rv; 1372 return rv;
1214} 1373}
@@ -1296,28 +1455,62 @@ static struct flags {
1296 { 0, {"", ""}} 1455 { 0, {"", ""}}
1297}; 1456};
1298 1457
1299static void exp_flags(struct seq_file *m, int flag, int fsid, 1458static void show_expflags(struct seq_file *m, int flags, int mask)
1300 uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
1301{ 1459{
1302 int first = 0;
1303 struct flags *flg; 1460 struct flags *flg;
1461 int state, first = 0;
1304 1462
1305 for (flg = expflags; flg->flag; flg++) { 1463 for (flg = expflags; flg->flag; flg++) {
1306 int state = (flg->flag & flag)?0:1; 1464 if (flg->flag & ~mask)
1465 continue;
1466 state = (flg->flag & flags) ? 0 : 1;
1307 if (*flg->name[state]) 1467 if (*flg->name[state])
1308 seq_printf(m, "%s%s", first++?",":"", flg->name[state]); 1468 seq_printf(m, "%s%s", first++?",":"", flg->name[state]);
1309 } 1469 }
1470}
1471
1472static void show_secinfo_flags(struct seq_file *m, int flags)
1473{
1474 seq_printf(m, ",");
1475 show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
1476}
1477
1478static void show_secinfo(struct seq_file *m, struct svc_export *exp)
1479{
1480 struct exp_flavor_info *f;
1481 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
1482 int lastflags = 0, first = 0;
1483
1484 if (exp->ex_nflavors == 0)
1485 return;
1486 for (f = exp->ex_flavors; f < end; f++) {
1487 if (first || f->flags != lastflags) {
1488 if (!first)
1489 show_secinfo_flags(m, lastflags);
1490 seq_printf(m, ",sec=%d", f->pseudoflavor);
1491 lastflags = f->flags;
1492 } else {
1493 seq_printf(m, ":%d", f->pseudoflavor);
1494 }
1495 }
1496 show_secinfo_flags(m, lastflags);
1497}
1498
1499static void exp_flags(struct seq_file *m, int flag, int fsid,
1500 uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
1501{
1502 show_expflags(m, flag, NFSEXP_ALLFLAGS);
1310 if (flag & NFSEXP_FSID) 1503 if (flag & NFSEXP_FSID)
1311 seq_printf(m, "%sfsid=%d", first++?",":"", fsid); 1504 seq_printf(m, ",fsid=%d", fsid);
1312 if (anonu != (uid_t)-2 && anonu != (0x10000-2)) 1505 if (anonu != (uid_t)-2 && anonu != (0x10000-2))
1313 seq_printf(m, "%sanonuid=%d", first++?",":"", anonu); 1506 seq_printf(m, ",sanonuid=%d", anonu);
1314 if (anong != (gid_t)-2 && anong != (0x10000-2)) 1507 if (anong != (gid_t)-2 && anong != (0x10000-2))
1315 seq_printf(m, "%sanongid=%d", first++?",":"", anong); 1508 seq_printf(m, ",sanongid=%d", anong);
1316 if (fsloc && fsloc->locations_count > 0) { 1509 if (fsloc && fsloc->locations_count > 0) {
1317 char *loctype = (fsloc->migrated) ? "refer" : "replicas"; 1510 char *loctype = (fsloc->migrated) ? "refer" : "replicas";
1318 int i; 1511 int i;
1319 1512
1320 seq_printf(m, "%s%s=", first++?",":"", loctype); 1513 seq_printf(m, ",%s=", loctype);
1321 seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); 1514 seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\");
1322 seq_putc(m, '@'); 1515 seq_putc(m, '@');
1323 seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); 1516 seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\");
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 221acd1f11..9e4a568a50 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -65,6 +65,7 @@ nlm_fclose(struct file *filp)
65static struct nlmsvc_binding nfsd_nlm_ops = { 65static struct nlmsvc_binding nfsd_nlm_ops = {
66 .fopen = nlm_fopen, /* open file for locking */ 66 .fopen = nlm_fopen, /* open file for locking */
67 .fclose = nlm_fclose, /* close file */ 67 .fclose = nlm_fclose, /* close file */
68 .get_grace_period = get_nfs4_grace_period,
68}; 69};
69 70
70void 71void
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index cc3b7badd4..b6ed38380a 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -183,8 +183,13 @@ static void
183summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) 183summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas)
184{ 184{
185 struct posix_acl_entry *pa, *pe; 185 struct posix_acl_entry *pa, *pe;
186 pas->users = 0; 186
187 pas->groups = 0; 187 /*
188 * Only pas.users and pas.groups need initialization; previous
189 * posix_acl_valid() calls ensure that the other fields will be
190 * initialized in the following loop. But, just to placate gcc:
191 */
192 memset(pas, 0, sizeof(*pas));
188 pas->mask = 07; 193 pas->mask = 07;
189 194
190 pe = acl->a_entries + acl->a_count; 195 pe = acl->a_entries + acl->a_count;
@@ -732,13 +737,16 @@ int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
732 *pacl = posix_state_to_acl(&effective_acl_state, flags); 737 *pacl = posix_state_to_acl(&effective_acl_state, flags);
733 if (IS_ERR(*pacl)) { 738 if (IS_ERR(*pacl)) {
734 ret = PTR_ERR(*pacl); 739 ret = PTR_ERR(*pacl);
740 *pacl = NULL;
735 goto out_dstate; 741 goto out_dstate;
736 } 742 }
737 *dpacl = posix_state_to_acl(&default_acl_state, 743 *dpacl = posix_state_to_acl(&default_acl_state,
738 flags | NFS4_ACL_TYPE_DEFAULT); 744 flags | NFS4_ACL_TYPE_DEFAULT);
739 if (IS_ERR(*dpacl)) { 745 if (IS_ERR(*dpacl)) {
740 ret = PTR_ERR(*dpacl); 746 ret = PTR_ERR(*dpacl);
747 *dpacl = NULL;
741 posix_acl_release(*pacl); 748 posix_acl_release(*pacl);
749 *pacl = NULL;
742 goto out_dstate; 750 goto out_dstate;
743 } 751 }
744 sort_pacl(*pacl); 752 sort_pacl(*pacl);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 864090edc2..31d6633c7f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -75,7 +75,7 @@ enum nfs_cb_opnum4 {
75#define op_enc_sz 1 75#define op_enc_sz 1
76#define op_dec_sz 2 76#define op_dec_sz 2
77#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) 77#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
78#define enc_stateid_sz 16 78#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2)
79#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ 79#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
80 1 + enc_stateid_sz + \ 80 1 + enc_stateid_sz + \
81 enc_nfs4_fh_sz) 81 enc_nfs4_fh_sz)
@@ -394,7 +394,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
394 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 394 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
395 .rpc_argp = clp, 395 .rpc_argp = clp,
396 }; 396 };
397 char clientname[16];
398 int status; 397 int status;
399 398
400 if (atomic_read(&cb->cb_set)) 399 if (atomic_read(&cb->cb_set))
@@ -417,11 +416,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
417 memset(program->stats, 0, sizeof(cb->cb_stat)); 416 memset(program->stats, 0, sizeof(cb->cb_stat));
418 program->stats->program = program; 417 program->stats->program = program;
419 418
420 /* Just here to make some printk's more useful: */
421 snprintf(clientname, sizeof(clientname),
422 "%u.%u.%u.%u", NIPQUAD(addr.sin_addr));
423 args.servername = clientname;
424
425 /* Create RPC client */ 419 /* Create RPC client */
426 cb->cb_client = rpc_create(&args); 420 cb->cb_client = rpc_create(&args);
427 if (IS_ERR(cb->cb_client)) { 421 if (IS_ERR(cb->cb_client)) {
@@ -429,29 +423,23 @@ nfsd4_probe_callback(struct nfs4_client *clp)
429 goto out_err; 423 goto out_err;
430 } 424 }
431 425
432 /* Kick rpciod, put the call on the wire. */
433 if (rpciod_up() != 0)
434 goto out_clnt;
435
436 /* the task holds a reference to the nfs4_client struct */ 426 /* the task holds a reference to the nfs4_client struct */
437 atomic_inc(&clp->cl_count); 427 atomic_inc(&clp->cl_count);
438 428
439 msg.rpc_cred = nfsd4_lookupcred(clp,0); 429 msg.rpc_cred = nfsd4_lookupcred(clp,0);
440 if (IS_ERR(msg.rpc_cred)) 430 if (IS_ERR(msg.rpc_cred))
441 goto out_rpciod; 431 goto out_release_clp;
442 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL); 432 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL);
443 put_rpccred(msg.rpc_cred); 433 put_rpccred(msg.rpc_cred);
444 434
445 if (status != 0) { 435 if (status != 0) {
446 dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n"); 436 dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n");
447 goto out_rpciod; 437 goto out_release_clp;
448 } 438 }
449 return; 439 return;
450 440
451out_rpciod: 441out_release_clp:
452 atomic_dec(&clp->cl_count); 442 atomic_dec(&clp->cl_count);
453 rpciod_down();
454out_clnt:
455 rpc_shutdown_client(cb->cb_client); 443 rpc_shutdown_client(cb->cb_client);
456out_err: 444out_err:
457 cb->cb_client = NULL; 445 cb->cb_client = NULL;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 45aa21ce67..2cf9a9a2d8 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -587,6 +587,15 @@ idmap_lookup(struct svc_rqst *rqstp,
587 return ret; 587 return ret;
588} 588}
589 589
590static char *
591rqst_authname(struct svc_rqst *rqstp)
592{
593 struct auth_domain *clp;
594
595 clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client;
596 return clp->name;
597}
598
590static int 599static int
591idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, 600idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
592 uid_t *id) 601 uid_t *id)
@@ -600,7 +609,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
600 return -EINVAL; 609 return -EINVAL;
601 memcpy(key.name, name, namelen); 610 memcpy(key.name, name, namelen);
602 key.name[namelen] = '\0'; 611 key.name[namelen] = '\0';
603 strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname)); 612 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
604 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); 613 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
605 if (ret == -ENOENT) 614 if (ret == -ENOENT)
606 ret = -ESRCH; /* nfserr_badname */ 615 ret = -ESRCH; /* nfserr_badname */
@@ -620,7 +629,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
620 }; 629 };
621 int ret; 630 int ret;
622 631
623 strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname)); 632 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
624 ret = idmap_lookup(rqstp, idtoname_lookup, &key, &idtoname_cache, &item); 633 ret = idmap_lookup(rqstp, idtoname_lookup, &key, &idtoname_cache, &item);
625 if (ret == -ENOENT) 634 if (ret == -ENOENT)
626 return sprintf(name, "%u", id); 635 return sprintf(name, "%u", id);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8522729830..3c627128e2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -47,6 +47,7 @@
47#include <linux/nfsd/state.h> 47#include <linux/nfsd/state.h>
48#include <linux/nfsd/xdr4.h> 48#include <linux/nfsd/xdr4.h>
49#include <linux/nfs4_acl.h> 49#include <linux/nfs4_acl.h>
50#include <linux/sunrpc/gss_api.h>
50 51
51#define NFSDDBG_FACILITY NFSDDBG_PROC 52#define NFSDDBG_FACILITY NFSDDBG_PROC
52 53
@@ -286,8 +287,7 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
286 __be32 status; 287 __be32 status;
287 288
288 fh_put(&cstate->current_fh); 289 fh_put(&cstate->current_fh);
289 status = exp_pseudoroot(rqstp->rq_client, &cstate->current_fh, 290 status = exp_pseudoroot(rqstp, &cstate->current_fh);
290 &rqstp->rq_chandle);
291 return status; 291 return status;
292} 292}
293 293
@@ -474,8 +474,8 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
474 __be32 ret; 474 __be32 ret;
475 475
476 fh_init(&tmp_fh, NFS4_FHSIZE); 476 fh_init(&tmp_fh, NFS4_FHSIZE);
477 if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh, 477 ret = exp_pseudoroot(rqstp, &tmp_fh);
478 &rqstp->rq_chandle)) != 0) 478 if (ret)
479 return ret; 479 return ret;
480 if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { 480 if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) {
481 fh_put(&tmp_fh); 481 fh_put(&tmp_fh);
@@ -611,6 +611,30 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
611} 611}
612 612
613static __be32 613static __be32
614nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
615 struct nfsd4_secinfo *secinfo)
616{
617 struct svc_fh resfh;
618 struct svc_export *exp;
619 struct dentry *dentry;
620 __be32 err;
621
622 fh_init(&resfh, NFS4_FHSIZE);
623 err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
624 secinfo->si_name, secinfo->si_namelen,
625 &exp, &dentry);
626 if (err)
627 return err;
628 if (dentry->d_inode == NULL) {
629 exp_put(exp);
630 err = nfserr_noent;
631 } else
632 secinfo->si_exp = exp;
633 dput(dentry);
634 return err;
635}
636
637static __be32
614nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 638nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
615 struct nfsd4_setattr *setattr) 639 struct nfsd4_setattr *setattr)
616{ 640{
@@ -1009,6 +1033,9 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1009 [OP_SAVEFH] = { 1033 [OP_SAVEFH] = {
1010 .op_func = (nfsd4op_func)nfsd4_savefh, 1034 .op_func = (nfsd4op_func)nfsd4_savefh,
1011 }, 1035 },
1036 [OP_SECINFO] = {
1037 .op_func = (nfsd4op_func)nfsd4_secinfo,
1038 },
1012 [OP_SETATTR] = { 1039 [OP_SETATTR] = {
1013 .op_func = (nfsd4op_func)nfsd4_setattr, 1040 .op_func = (nfsd4op_func)nfsd4_setattr,
1014 }, 1041 },
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3cc8ce422a..e4a4c87ec8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,8 +49,10 @@
49#include <linux/nfsd/state.h> 49#include <linux/nfsd/state.h>
50#include <linux/nfsd/xdr4.h> 50#include <linux/nfsd/xdr4.h>
51#include <linux/namei.h> 51#include <linux/namei.h>
52#include <linux/swap.h>
52#include <linux/mutex.h> 53#include <linux/mutex.h>
53#include <linux/lockd/bind.h> 54#include <linux/lockd/bind.h>
55#include <linux/module.h>
54 56
55#define NFSDDBG_FACILITY NFSDDBG_PROC 57#define NFSDDBG_FACILITY NFSDDBG_PROC
56 58
@@ -149,6 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)
149} 151}
150 152
151static int num_delegations; 153static int num_delegations;
154unsigned int max_delegations;
152 155
153/* 156/*
154 * Open owner state (share locks) 157 * Open owner state (share locks)
@@ -192,7 +195,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
192 struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; 195 struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback;
193 196
194 dprintk("NFSD alloc_init_deleg\n"); 197 dprintk("NFSD alloc_init_deleg\n");
195 if (num_delegations > STATEID_HASH_SIZE * 4) 198 if (fp->fi_had_conflict)
199 return NULL;
200 if (num_delegations > max_delegations)
196 return NULL; 201 return NULL;
197 dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); 202 dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
198 if (dp == NULL) 203 if (dp == NULL)
@@ -378,7 +383,6 @@ shutdown_callback_client(struct nfs4_client *clp)
378 if (clnt) { 383 if (clnt) {
379 clp->cl_callback.cb_client = NULL; 384 clp->cl_callback.cb_client = NULL;
380 rpc_shutdown_client(clnt); 385 rpc_shutdown_client(clnt);
381 rpciod_down();
382 } 386 }
383} 387}
384 388
@@ -1000,6 +1004,7 @@ alloc_init_file(struct inode *ino)
1000 list_add(&fp->fi_hash, &file_hashtbl[hashval]); 1004 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1001 fp->fi_inode = igrab(ino); 1005 fp->fi_inode = igrab(ino);
1002 fp->fi_id = current_fileid++; 1006 fp->fi_id = current_fileid++;
1007 fp->fi_had_conflict = false;
1003 return fp; 1008 return fp;
1004 } 1009 }
1005 return NULL; 1010 return NULL;
@@ -1326,6 +1331,7 @@ do_recall(void *__dp)
1326{ 1331{
1327 struct nfs4_delegation *dp = __dp; 1332 struct nfs4_delegation *dp = __dp;
1328 1333
1334 dp->dl_file->fi_had_conflict = true;
1329 nfsd4_cb_recall(dp); 1335 nfsd4_cb_recall(dp);
1330 return 0; 1336 return 0;
1331} 1337}
@@ -3191,20 +3197,49 @@ nfsd4_load_reboot_recovery_data(void)
3191 printk("NFSD: Failure reading reboot recovery data\n"); 3197 printk("NFSD: Failure reading reboot recovery data\n");
3192} 3198}
3193 3199
3200unsigned long
3201get_nfs4_grace_period(void)
3202{
3203 return max(user_lease_time, lease_time) * HZ;
3204}
3205
3206/*
3207 * Since the lifetime of a delegation isn't limited to that of an open, a
3208 * client may quite reasonably hang on to a delegation as long as it has
3209 * the inode cached. This becomes an obvious problem the first time a
3210 * client's inode cache approaches the size of the server's total memory.
3211 *
3212 * For now we avoid this problem by imposing a hard limit on the number
3213 * of delegations, which varies according to the server's memory size.
3214 */
3215static void
3216set_max_delegations(void)
3217{
3218 /*
3219 * Allow at most 4 delegations per megabyte of RAM. Quick
3220 * estimates suggest that in the worst case (where every delegation
3221 * is for a different inode), a delegation could take about 1.5K,
3222 * giving a worst case usage of about 6% of memory.
3223 */
3224 max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
3225}
3226
3194/* initialization to perform when the nfsd service is started: */ 3227/* initialization to perform when the nfsd service is started: */
3195 3228
3196static void 3229static void
3197__nfs4_state_start(void) 3230__nfs4_state_start(void)
3198{ 3231{
3199 time_t grace_time; 3232 unsigned long grace_time;
3200 3233
3201 boot_time = get_seconds(); 3234 boot_time = get_seconds();
3202 grace_time = max(user_lease_time, lease_time); 3235 grace_time = get_nfs_grace_period();
3203 lease_time = user_lease_time; 3236 lease_time = user_lease_time;
3204 in_grace = 1; 3237 in_grace = 1;
3205 printk("NFSD: starting %ld-second grace period\n", grace_time); 3238 printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
3239 grace_time/HZ);
3206 laundry_wq = create_singlethread_workqueue("nfsd4"); 3240 laundry_wq = create_singlethread_workqueue("nfsd4");
3207 queue_delayed_work(laundry_wq, &laundromat_work, grace_time*HZ); 3241 queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
3242 set_max_delegations();
3208} 3243}
3209 3244
3210int 3245int
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15809dfd88..b3d55c6747 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -56,6 +56,8 @@
56#include <linux/nfsd_idmap.h> 56#include <linux/nfsd_idmap.h>
57#include <linux/nfs4.h> 57#include <linux/nfs4.h>
58#include <linux/nfs4_acl.h> 58#include <linux/nfs4_acl.h>
59#include <linux/sunrpc/gss_api.h>
60#include <linux/sunrpc/svcauth_gss.h>
59 61
60#define NFSDDBG_FACILITY NFSDDBG_XDR 62#define NFSDDBG_FACILITY NFSDDBG_XDR
61 63
@@ -819,6 +821,23 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
819} 821}
820 822
821static __be32 823static __be32
824nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
825 struct nfsd4_secinfo *secinfo)
826{
827 DECODE_HEAD;
828
829 READ_BUF(4);
830 READ32(secinfo->si_namelen);
831 READ_BUF(secinfo->si_namelen);
832 SAVEMEM(secinfo->si_name, secinfo->si_namelen);
833 status = check_filename(secinfo->si_name, secinfo->si_namelen,
834 nfserr_noent);
835 if (status)
836 return status;
837 DECODE_TAIL;
838}
839
840static __be32
822nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 841nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
823{ 842{
824 DECODE_HEAD; 843 DECODE_HEAD;
@@ -1131,6 +1150,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1131 case OP_SAVEFH: 1150 case OP_SAVEFH:
1132 op->status = nfs_ok; 1151 op->status = nfs_ok;
1133 break; 1152 break;
1153 case OP_SECINFO:
1154 op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo);
1155 break;
1134 case OP_SETATTR: 1156 case OP_SETATTR:
1135 op->status = nfsd4_decode_setattr(argp, &op->u.setattr); 1157 op->status = nfsd4_decode_setattr(argp, &op->u.setattr);
1136 break; 1158 break;
@@ -1296,7 +1318,7 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
1296 char *path, *rootpath; 1318 char *path, *rootpath;
1297 1319
1298 fh_init(&tmp_fh, NFS4_FHSIZE); 1320 fh_init(&tmp_fh, NFS4_FHSIZE);
1299 *stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle); 1321 *stat = exp_pseudoroot(rqstp, &tmp_fh);
1300 if (*stat) 1322 if (*stat)
1301 return NULL; 1323 return NULL;
1302 rootpath = tmp_fh.fh_export->ex_path; 1324 rootpath = tmp_fh.fh_export->ex_path;
@@ -1847,11 +1869,19 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1847 if (d_mountpoint(dentry)) { 1869 if (d_mountpoint(dentry)) {
1848 int err; 1870 int err;
1849 1871
1872 /*
1873 * Why the heck aren't we just using nfsd_lookup??
1874 * Different "."/".." handling? Something else?
1875 * At least, add a comment here to explain....
1876 */
1850 err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp); 1877 err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp);
1851 if (err) { 1878 if (err) {
1852 nfserr = nfserrno(err); 1879 nfserr = nfserrno(err);
1853 goto out_put; 1880 goto out_put;
1854 } 1881 }
1882 nfserr = check_nfsd_access(exp, cd->rd_rqstp);
1883 if (nfserr)
1884 goto out_put;
1855 1885
1856 } 1886 }
1857 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, 1887 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
@@ -2419,6 +2449,72 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2419 } 2449 }
2420} 2450}
2421 2451
2452static void
2453nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, int nfserr,
2454 struct nfsd4_secinfo *secinfo)
2455{
2456 int i = 0;
2457 struct svc_export *exp = secinfo->si_exp;
2458 u32 nflavs;
2459 struct exp_flavor_info *flavs;
2460 struct exp_flavor_info def_flavs[2];
2461 ENCODE_HEAD;
2462
2463 if (nfserr)
2464 goto out;
2465 if (exp->ex_nflavors) {
2466 flavs = exp->ex_flavors;
2467 nflavs = exp->ex_nflavors;
2468 } else { /* Handling of some defaults in absence of real secinfo: */
2469 flavs = def_flavs;
2470 if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) {
2471 nflavs = 2;
2472 flavs[0].pseudoflavor = RPC_AUTH_UNIX;
2473 flavs[1].pseudoflavor = RPC_AUTH_NULL;
2474 } else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) {
2475 nflavs = 1;
2476 flavs[0].pseudoflavor
2477 = svcauth_gss_flavor(exp->ex_client);
2478 } else {
2479 nflavs = 1;
2480 flavs[0].pseudoflavor
2481 = exp->ex_client->flavour->flavour;
2482 }
2483 }
2484
2485 RESERVE_SPACE(4);
2486 WRITE32(nflavs);
2487 ADJUST_ARGS();
2488 for (i = 0; i < nflavs; i++) {
2489 u32 flav = flavs[i].pseudoflavor;
2490 struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav);
2491
2492 if (gm) {
2493 RESERVE_SPACE(4);
2494 WRITE32(RPC_AUTH_GSS);
2495 ADJUST_ARGS();
2496 RESERVE_SPACE(4 + gm->gm_oid.len);
2497 WRITE32(gm->gm_oid.len);
2498 WRITEMEM(gm->gm_oid.data, gm->gm_oid.len);
2499 ADJUST_ARGS();
2500 RESERVE_SPACE(4);
2501 WRITE32(0); /* qop */
2502 ADJUST_ARGS();
2503 RESERVE_SPACE(4);
2504 WRITE32(gss_pseudoflavor_to_service(gm, flav));
2505 ADJUST_ARGS();
2506 gss_mech_put(gm);
2507 } else {
2508 RESERVE_SPACE(4);
2509 WRITE32(flav);
2510 ADJUST_ARGS();
2511 }
2512 }
2513out:
2514 if (exp)
2515 exp_put(exp);
2516}
2517
2422/* 2518/*
2423 * The SETATTR encode routine is special -- it always encodes a bitmap, 2519 * The SETATTR encode routine is special -- it always encodes a bitmap,
2424 * regardless of the error status. 2520 * regardless of the error status.
@@ -2559,6 +2655,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2559 break; 2655 break;
2560 case OP_SAVEFH: 2656 case OP_SAVEFH:
2561 break; 2657 break;
2658 case OP_SECINFO:
2659 nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo);
2660 break;
2562 case OP_SETATTR: 2661 case OP_SETATTR:
2563 nfsd4_encode_setattr(resp, op->status, &op->u.setattr); 2662 nfsd4_encode_setattr(resp, op->status, &op->u.setattr);
2564 break; 2663 break;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 71c686dc72..baac89d917 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -35,7 +35,6 @@
35#include <linux/nfsd/cache.h> 35#include <linux/nfsd/cache.h>
36#include <linux/nfsd/xdr.h> 36#include <linux/nfsd/xdr.h>
37#include <linux/nfsd/syscall.h> 37#include <linux/nfsd/syscall.h>
38#include <linux/nfsd/interface.h>
39 38
40#include <asm/uaccess.h> 39#include <asm/uaccess.h>
41 40
@@ -245,7 +244,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
245 } 244 }
246 exp_readunlock(); 245 exp_readunlock();
247 if (err == 0) 246 if (err == 0)
248 err = res->fh_size + (int)&((struct knfsd_fh*)0)->fh_base; 247 err = res->fh_size + offsetof(struct knfsd_fh, fh_base);
249 out: 248 out:
250 return err; 249 return err;
251} 250}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 6ca2d24fc2..0eb464a39a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -15,10 +15,12 @@
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/dcache.h> 17#include <linux/dcache.h>
18#include <linux/exportfs.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19 20
20#include <linux/sunrpc/clnt.h> 21#include <linux/sunrpc/clnt.h>
21#include <linux/sunrpc/svc.h> 22#include <linux/sunrpc/svc.h>
23#include <linux/sunrpc/svcauth_gss.h>
22#include <linux/nfsd/nfsd.h> 24#include <linux/nfsd/nfsd.h>
23 25
24#define NFSDDBG_FACILITY NFSDDBG_FH 26#define NFSDDBG_FACILITY NFSDDBG_FH
@@ -27,10 +29,6 @@
27static int nfsd_nr_verified; 29static int nfsd_nr_verified;
28static int nfsd_nr_put; 30static int nfsd_nr_put;
29 31
30extern struct export_operations export_op_default;
31
32#define CALL(ops,fun) ((ops->fun)?(ops->fun):export_op_default.fun)
33
34/* 32/*
35 * our acceptability function. 33 * our acceptability function.
36 * if NOSUBTREECHECK, accept anything 34 * if NOSUBTREECHECK, accept anything
@@ -123,8 +121,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
123 int data_left = fh->fh_size/4; 121 int data_left = fh->fh_size/4;
124 122
125 error = nfserr_stale; 123 error = nfserr_stale;
126 if (rqstp->rq_client == NULL)
127 goto out;
128 if (rqstp->rq_vers > 2) 124 if (rqstp->rq_vers > 2)
129 error = nfserr_badhandle; 125 error = nfserr_badhandle;
130 if (rqstp->rq_vers == 4 && fh->fh_size == 0) 126 if (rqstp->rq_vers == 4 && fh->fh_size == 0)
@@ -148,7 +144,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
148 fh->fh_fsid[1] = fh->fh_fsid[2]; 144 fh->fh_fsid[1] = fh->fh_fsid[2];
149 } 145 }
150 if ((data_left -= len)<0) goto out; 146 if ((data_left -= len)<0) goto out;
151 exp = exp_find(rqstp->rq_client, fh->fh_fsid_type, datap, &rqstp->rq_chandle); 147 exp = rqst_exp_find(rqstp, fh->fh_fsid_type, datap);
152 datap += len; 148 datap += len;
153 } else { 149 } else {
154 dev_t xdev; 150 dev_t xdev;
@@ -159,19 +155,17 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
159 xdev = old_decode_dev(fh->ofh_xdev); 155 xdev = old_decode_dev(fh->ofh_xdev);
160 xino = u32_to_ino_t(fh->ofh_xino); 156 xino = u32_to_ino_t(fh->ofh_xino);
161 mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); 157 mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
162 exp = exp_find(rqstp->rq_client, FSID_DEV, tfh, 158 exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
163 &rqstp->rq_chandle);
164 } 159 }
165 160
166 if (IS_ERR(exp) && (PTR_ERR(exp) == -EAGAIN 161 error = nfserr_stale;
167 || PTR_ERR(exp) == -ETIMEDOUT)) { 162 if (PTR_ERR(exp) == -ENOENT)
168 error = nfserrno(PTR_ERR(exp));
169 goto out; 163 goto out;
170 }
171 164
172 error = nfserr_stale; 165 if (IS_ERR(exp)) {
173 if (!exp || IS_ERR(exp)) 166 error = nfserrno(PTR_ERR(exp));
174 goto out; 167 goto out;
168 }
175 169
176 /* Check if the request originated from a secure port. */ 170 /* Check if the request originated from a secure port. */
177 error = nfserr_perm; 171 error = nfserr_perm;
@@ -211,11 +205,9 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
211 if (fileid_type == 0) 205 if (fileid_type == 0)
212 dentry = dget(exp->ex_dentry); 206 dentry = dget(exp->ex_dentry);
213 else { 207 else {
214 struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op; 208 dentry = exportfs_decode_fh(exp->ex_mnt, datap,
215 dentry = CALL(nop,decode_fh)(exp->ex_mnt->mnt_sb, 209 data_left, fileid_type,
216 datap, data_left, 210 nfsd_acceptable, exp);
217 fileid_type,
218 nfsd_acceptable, exp);
219 } 211 }
220 if (dentry == NULL) 212 if (dentry == NULL)
221 goto out; 213 goto out;
@@ -257,8 +249,19 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
257 if (error) 249 if (error)
258 goto out; 250 goto out;
259 251
252 if (!(access & MAY_LOCK)) {
253 /*
254 * pseudoflavor restrictions are not enforced on NLM,
255 * which clients virtually always use auth_sys for,
256 * even while using RPCSEC_GSS for NFS.
257 */
258 error = check_nfsd_access(exp, rqstp);
259 if (error)
260 goto out;
261 }
262
260 /* Finally, check access permissions. */ 263 /* Finally, check access permissions. */
261 error = nfsd_permission(exp, dentry, access); 264 error = nfsd_permission(rqstp, exp, dentry, access);
262 265
263 if (error) { 266 if (error) {
264 dprintk("fh_verify: %s/%s permission failure, " 267 dprintk("fh_verify: %s/%s permission failure, "
@@ -286,15 +289,13 @@ out:
286static inline int _fh_update(struct dentry *dentry, struct svc_export *exp, 289static inline int _fh_update(struct dentry *dentry, struct svc_export *exp,
287 __u32 *datap, int *maxsize) 290 __u32 *datap, int *maxsize)
288{ 291{
289 struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op;
290
291 if (dentry == exp->ex_dentry) { 292 if (dentry == exp->ex_dentry) {
292 *maxsize = 0; 293 *maxsize = 0;
293 return 0; 294 return 0;
294 } 295 }
295 296
296 return CALL(nop,encode_fh)(dentry, datap, maxsize, 297 return exportfs_encode_fh(dentry, datap, maxsize,
297 !(exp->ex_flags&NFSEXP_NOSUBTREECHECK)); 298 !(exp->ex_flags & NFSEXP_NOSUBTREECHECK));
298} 299}
299 300
300/* 301/*
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index b2c7147aa9..977a71f64e 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -278,7 +278,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
278 * echo thing > device-special-file-or-pipe 278 * echo thing > device-special-file-or-pipe
279 * by doing a CREATE with type==0 279 * by doing a CREATE with type==0
280 */ 280 */
281 nfserr = nfsd_permission(newfhp->fh_export, 281 nfserr = nfsd_permission(rqstp,
282 newfhp->fh_export,
282 newfhp->fh_dentry, 283 newfhp->fh_dentry,
283 MAY_WRITE|MAY_LOCAL_ACCESS); 284 MAY_WRITE|MAY_LOCAL_ACCESS);
284 if (nfserr && nfserr != nfserr_rofs) 285 if (nfserr && nfserr != nfserr_rofs)
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ff55950efb..a8c89ae4c7 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -19,6 +19,7 @@
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
21#include <linux/smp_lock.h> 21#include <linux/smp_lock.h>
22#include <linux/freezer.h>
22#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
23 24
24#include <linux/sunrpc/types.h> 25#include <linux/sunrpc/types.h>
@@ -432,6 +433,7 @@ nfsd(struct svc_rqst *rqstp)
432 * dirty pages. 433 * dirty pages.
433 */ 434 */
434 current->flags |= PF_LESS_THROTTLE; 435 current->flags |= PF_LESS_THROTTLE;
436 set_freezable();
435 437
436 /* 438 /*
437 * The main request loop 439 * The main request loop
@@ -492,6 +494,15 @@ out:
492 module_put_and_exit(0); 494 module_put_and_exit(0);
493} 495}
494 496
497static __be32 map_new_errors(u32 vers, __be32 nfserr)
498{
499 if (nfserr == nfserr_jukebox && vers == 2)
500 return nfserr_dropit;
501 if (nfserr == nfserr_wrongsec && vers < 4)
502 return nfserr_acces;
503 return nfserr;
504}
505
495int 506int
496nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) 507nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
497{ 508{
@@ -534,6 +545,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
534 545
535 /* Now call the procedure handler, and encode NFS status. */ 546 /* Now call the procedure handler, and encode NFS status. */
536 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 547 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
548 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
537 if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2) 549 if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2)
538 nfserr = nfserr_dropit; 550 nfserr = nfserr_dropit;
539 if (nfserr == nfserr_dropit) { 551 if (nfserr == nfserr_dropit) {
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7e6aa245b5..e90f4a8a1d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -23,7 +23,7 @@
23#include <linux/file.h> 23#include <linux/file.h>
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/major.h> 25#include <linux/major.h>
26#include <linux/ext2_fs.h> 26#include <linux/splice.h>
27#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/fcntl.h> 29#include <linux/fcntl.h>
@@ -113,7 +113,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
113 113
114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); 114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
115 115
116 exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle); 116 exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
117 if (IS_ERR(exp2)) { 117 if (IS_ERR(exp2)) {
118 err = PTR_ERR(exp2); 118 err = PTR_ERR(exp2);
119 dput(mounts); 119 dput(mounts);
@@ -135,21 +135,10 @@ out:
135 return err; 135 return err;
136} 136}
137 137
138/*
139 * Look up one component of a pathname.
140 * N.B. After this call _both_ fhp and resfh need an fh_put
141 *
142 * If the lookup would cross a mountpoint, and the mounted filesystem
143 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
144 * accepted as it stands and the mounted directory is
145 * returned. Otherwise the covered directory is returned.
146 * NOTE: this mountpoint crossing is not supported properly by all
147 * clients and is explicitly disallowed for NFSv3
148 * NeilBrown <neilb@cse.unsw.edu.au>
149 */
150__be32 138__be32
151nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, 139nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
152 int len, struct svc_fh *resfh) 140 const char *name, int len,
141 struct svc_export **exp_ret, struct dentry **dentry_ret)
153{ 142{
154 struct svc_export *exp; 143 struct svc_export *exp;
155 struct dentry *dparent; 144 struct dentry *dparent;
@@ -168,8 +157,6 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
168 exp = fhp->fh_export; 157 exp = fhp->fh_export;
169 exp_get(exp); 158 exp_get(exp);
170 159
171 err = nfserr_acces;
172
173 /* Lookup the name, but don't follow links */ 160 /* Lookup the name, but don't follow links */
174 if (isdotent(name, len)) { 161 if (isdotent(name, len)) {
175 if (len==1) 162 if (len==1)
@@ -190,17 +177,15 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
190 dput(dentry); 177 dput(dentry);
191 dentry = dp; 178 dentry = dp;
192 179
193 exp2 = exp_parent(exp->ex_client, mnt, dentry, 180 exp2 = rqst_exp_parent(rqstp, mnt, dentry);
194 &rqstp->rq_chandle); 181 if (PTR_ERR(exp2) == -ENOENT) {
195 if (IS_ERR(exp2)) { 182 dput(dentry);
183 dentry = dget(dparent);
184 } else if (IS_ERR(exp2)) {
196 host_err = PTR_ERR(exp2); 185 host_err = PTR_ERR(exp2);
197 dput(dentry); 186 dput(dentry);
198 mntput(mnt); 187 mntput(mnt);
199 goto out_nfserr; 188 goto out_nfserr;
200 }
201 if (!exp2) {
202 dput(dentry);
203 dentry = dget(dparent);
204 } else { 189 } else {
205 exp_put(exp); 190 exp_put(exp);
206 exp = exp2; 191 exp = exp2;
@@ -223,6 +208,41 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
223 } 208 }
224 } 209 }
225 } 210 }
211 *dentry_ret = dentry;
212 *exp_ret = exp;
213 return 0;
214
215out_nfserr:
216 exp_put(exp);
217 return nfserrno(host_err);
218}
219
220/*
221 * Look up one component of a pathname.
222 * N.B. After this call _both_ fhp and resfh need an fh_put
223 *
224 * If the lookup would cross a mountpoint, and the mounted filesystem
225 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
226 * accepted as it stands and the mounted directory is
227 * returned. Otherwise the covered directory is returned.
228 * NOTE: this mountpoint crossing is not supported properly by all
229 * clients and is explicitly disallowed for NFSv3
230 * NeilBrown <neilb@cse.unsw.edu.au>
231 */
232__be32
233nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
234 int len, struct svc_fh *resfh)
235{
236 struct svc_export *exp;
237 struct dentry *dentry;
238 __be32 err;
239
240 err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
241 if (err)
242 return err;
243 err = check_nfsd_access(exp, rqstp);
244 if (err)
245 goto out;
226 /* 246 /*
227 * Note: we compose the file handle now, but as the 247 * Note: we compose the file handle now, but as the
228 * dentry may be negative, it may need to be updated. 248 * dentry may be negative, it may need to be updated.
@@ -230,16 +250,13 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
230 err = fh_compose(resfh, exp, dentry, fhp); 250 err = fh_compose(resfh, exp, dentry, fhp);
231 if (!err && !dentry->d_inode) 251 if (!err && !dentry->d_inode)
232 err = nfserr_noent; 252 err = nfserr_noent;
233 dput(dentry);
234out: 253out:
254 dput(dentry);
235 exp_put(exp); 255 exp_put(exp);
236 return err; 256 return err;
237
238out_nfserr:
239 err = nfserrno(host_err);
240 goto out;
241} 257}
242 258
259
243/* 260/*
244 * Set various file attributes. 261 * Set various file attributes.
245 * N.B. After this call fhp needs an fh_put 262 * N.B. After this call fhp needs an fh_put
@@ -311,7 +328,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
311 /* The size case is special. It changes the file as well as the attributes. */ 328 /* The size case is special. It changes the file as well as the attributes. */
312 if (iap->ia_valid & ATTR_SIZE) { 329 if (iap->ia_valid & ATTR_SIZE) {
313 if (iap->ia_size < inode->i_size) { 330 if (iap->ia_size < inode->i_size) {
314 err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); 331 err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
315 if (err) 332 if (err)
316 goto out; 333 goto out;
317 } 334 }
@@ -435,7 +452,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
435 /* Get inode */ 452 /* Get inode */
436 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); 453 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
437 if (error) 454 if (error)
438 goto out; 455 return error;
439 456
440 dentry = fhp->fh_dentry; 457 dentry = fhp->fh_dentry;
441 inode = dentry->d_inode; 458 inode = dentry->d_inode;
@@ -444,33 +461,25 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
444 461
445 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); 462 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
446 if (host_error == -EINVAL) { 463 if (host_error == -EINVAL) {
447 error = nfserr_attrnotsupp; 464 return nfserr_attrnotsupp;
448 goto out;
449 } else if (host_error < 0) 465 } else if (host_error < 0)
450 goto out_nfserr; 466 goto out_nfserr;
451 467
452 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); 468 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
453 if (host_error < 0) 469 if (host_error < 0)
454 goto out_nfserr; 470 goto out_release;
455 471
456 if (S_ISDIR(inode->i_mode)) { 472 if (S_ISDIR(inode->i_mode))
457 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); 473 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
458 if (host_error < 0)
459 goto out_nfserr;
460 }
461 474
462 error = nfs_ok; 475out_release:
463
464out:
465 posix_acl_release(pacl); 476 posix_acl_release(pacl);
466 posix_acl_release(dpacl); 477 posix_acl_release(dpacl);
467 return (error);
468out_nfserr: 478out_nfserr:
469 if (host_error == -EOPNOTSUPP) 479 if (host_error == -EOPNOTSUPP)
470 error = nfserr_attrnotsupp; 480 return nfserr_attrnotsupp;
471 else 481 else
472 error = nfserrno(host_error); 482 return nfserrno(host_error);
473 goto out;
474} 483}
475 484
476static struct posix_acl * 485static struct posix_acl *
@@ -607,7 +616,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
607 616
608 sresult |= map->access; 617 sresult |= map->access;
609 618
610 err2 = nfsd_permission(export, dentry, map->how); 619 err2 = nfsd_permission(rqstp, export, dentry, map->how);
611 switch (err2) { 620 switch (err2) {
612 case nfs_ok: 621 case nfs_ok:
613 result |= map->access; 622 result |= map->access;
@@ -801,26 +810,32 @@ found:
801} 810}
802 811
803/* 812/*
804 * Grab and keep cached pages assosiated with a file in the svc_rqst 813 * Grab and keep cached pages associated with a file in the svc_rqst
805 * so that they can be passed to the netowork sendmsg/sendpage routines 814 * so that they can be passed to the network sendmsg/sendpage routines
806 * directrly. They will be released after the sending has completed. 815 * directly. They will be released after the sending has completed.
807 */ 816 */
808static int 817static int
809nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset , unsigned long size) 818nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
819 struct splice_desc *sd)
810{ 820{
811 unsigned long count = desc->count; 821 struct svc_rqst *rqstp = sd->u.data;
812 struct svc_rqst *rqstp = desc->arg.data;
813 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 822 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
823 struct page *page = buf->page;
824 size_t size;
825 int ret;
826
827 ret = buf->ops->confirm(pipe, buf);
828 if (unlikely(ret))
829 return ret;
814 830
815 if (size > count) 831 size = sd->len;
816 size = count;
817 832
818 if (rqstp->rq_res.page_len == 0) { 833 if (rqstp->rq_res.page_len == 0) {
819 get_page(page); 834 get_page(page);
820 put_page(*pp); 835 put_page(*pp);
821 *pp = page; 836 *pp = page;
822 rqstp->rq_resused++; 837 rqstp->rq_resused++;
823 rqstp->rq_res.page_base = offset; 838 rqstp->rq_res.page_base = buf->offset;
824 rqstp->rq_res.page_len = size; 839 rqstp->rq_res.page_len = size;
825 } else if (page != pp[-1]) { 840 } else if (page != pp[-1]) {
826 get_page(page); 841 get_page(page);
@@ -832,11 +847,15 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset
832 } else 847 } else
833 rqstp->rq_res.page_len += size; 848 rqstp->rq_res.page_len += size;
834 849
835 desc->count = count - size;
836 desc->written += size;
837 return size; 850 return size;
838} 851}
839 852
853static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
854 struct splice_desc *sd)
855{
856 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
857}
858
840static __be32 859static __be32
841nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 860nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
842 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 861 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -861,10 +880,16 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
861 if (ra && ra->p_set) 880 if (ra && ra->p_set)
862 file->f_ra = ra->p_ra; 881 file->f_ra = ra->p_ra;
863 882
864 if (file->f_op->sendfile && rqstp->rq_sendfile_ok) { 883 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
884 struct splice_desc sd = {
885 .len = 0,
886 .total_len = *count,
887 .pos = offset,
888 .u.data = rqstp,
889 };
890
865 rqstp->rq_resused = 1; 891 rqstp->rq_resused = 1;
866 host_err = file->f_op->sendfile(file, &offset, *count, 892 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
867 nfsd_read_actor, rqstp);
868 } else { 893 } else {
869 oldfs = get_fs(); 894 oldfs = get_fs();
870 set_fs(KERNEL_DS); 895 set_fs(KERNEL_DS);
@@ -1018,7 +1043,7 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1018 __be32 err; 1043 __be32 err;
1019 1044
1020 if (file) { 1045 if (file) {
1021 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1046 err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
1022 MAY_READ|MAY_OWNER_OVERRIDE); 1047 MAY_READ|MAY_OWNER_OVERRIDE);
1023 if (err) 1048 if (err)
1024 goto out; 1049 goto out;
@@ -1047,7 +1072,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1047 __be32 err = 0; 1072 __be32 err = 0;
1048 1073
1049 if (file) { 1074 if (file) {
1050 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1075 err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
1051 MAY_WRITE|MAY_OWNER_OVERRIDE); 1076 MAY_WRITE|MAY_OWNER_OVERRIDE);
1052 if (err) 1077 if (err)
1053 goto out; 1078 goto out;
@@ -1776,7 +1801,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1776 * Check for a user's access permissions to this inode. 1801 * Check for a user's access permissions to this inode.
1777 */ 1802 */
1778__be32 1803__be32
1779nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) 1804nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
1805 struct dentry *dentry, int acc)
1780{ 1806{
1781 struct inode *inode = dentry->d_inode; 1807 struct inode *inode = dentry->d_inode;
1782 int err; 1808 int err;
@@ -1807,7 +1833,7 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1807 */ 1833 */
1808 if (!(acc & MAY_LOCAL_ACCESS)) 1834 if (!(acc & MAY_LOCAL_ACCESS))
1809 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { 1835 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
1810 if (EX_RDONLY(exp) || IS_RDONLY(inode)) 1836 if (EX_RDONLY(exp, rqstp) || IS_RDONLY(inode))
1811 return nfserr_rofs; 1837 return nfserr_rofs;
1812 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) 1838 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
1813 return nfserr_perm; 1839 return nfserr_perm;
diff --git a/fs/nls/Makefile b/fs/nls/Makefile
index a7ade138d6..f499dd7c39 100644
--- a/fs/nls/Makefile
+++ b/fs/nls/Makefile
@@ -36,11 +36,9 @@ obj-$(CONFIG_NLS_ISO8859_6) += nls_iso8859-6.o
36obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o 36obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o
37obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o 37obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o
38obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o 38obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o
39obj-$(CONFIG_NLS_ISO8859_10) += nls_iso8859-10.o
40obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o 39obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o
41obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o 40obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o
42obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o 41obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o
43obj-$(CONFIG_NLS_KOI8_R) += nls_koi8-r.o 42obj-$(CONFIG_NLS_KOI8_R) += nls_koi8-r.o
44obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o 43obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o
45obj-$(CONFIG_NLS_ABC) += nls_abc.o
46obj-$(CONFIG_NLS_UTF8) += nls_utf8.o 44obj-$(CONFIG_NLS_UTF8) += nls_utf8.o
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7ed56390b5..ffcc504a16 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2276,7 +2276,7 @@ const struct file_operations ntfs_file_ops = {
2276 mounted filesystem. */ 2276 mounted filesystem. */
2277 .mmap = generic_file_mmap, /* Mmap file. */ 2277 .mmap = generic_file_mmap, /* Mmap file. */
2278 .open = ntfs_file_open, /* Open file. */ 2278 .open = ntfs_file_open, /* Open file. */
2279 .sendfile = generic_file_sendfile, /* Zero-copy data send with 2279 .splice_read = generic_file_splice_read /* Zero-copy data send with
2280 the data source being on 2280 the data source being on
2281 the ntfs partition. We do 2281 the ntfs partition. We do
2282 not need to care about the 2282 not need to care about the
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index bff01a5467..e93c6142b2 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h>
24#include <linux/security.h> 25#include <linux/security.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7d14..f5e11f4fa9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb);
53 55
54/* 56/*
55 * Structures which describe a path through a btree, and functions to 57 * Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
117} 119}
118 120
119/* 121/*
122 * All the elements of src into dest. After this call, src could be freed
123 * without affecting dest.
124 *
125 * Both paths should have the same root. Any non-root elements of dest
126 * will be freed.
127 */
128static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129{
130 int i;
131
132 BUG_ON(path_root_bh(dest) != path_root_bh(src));
133 BUG_ON(path_root_el(dest) != path_root_el(src));
134
135 ocfs2_reinit_path(dest, 1);
136
137 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138 dest->p_node[i].bh = src->p_node[i].bh;
139 dest->p_node[i].el = src->p_node[i].el;
140
141 if (dest->p_node[i].bh)
142 get_bh(dest->p_node[i].bh);
143 }
144}
145
146/*
120 * Make the *dest path the same as src and re-initialize src path to 147 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only. 148 * have a root only.
122 */ 149 */
@@ -212,10 +239,41 @@ out:
212 return ret; 239 return ret;
213} 240}
214 241
242/*
243 * Return the index of the extent record which contains cluster #v_cluster.
244 * -1 is returned if it was not found.
245 *
246 * Should work fine on interior and exterior nodes.
247 */
248int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249{
250 int ret = -1;
251 int i;
252 struct ocfs2_extent_rec *rec;
253 u32 rec_end, rec_start, clusters;
254
255 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256 rec = &el->l_recs[i];
257
258 rec_start = le32_to_cpu(rec->e_cpos);
259 clusters = ocfs2_rec_clusters(el, rec);
260
261 rec_end = rec_start + clusters;
262
263 if (v_cluster >= rec_start && v_cluster < rec_end) {
264 ret = i;
265 break;
266 }
267 }
268
269 return ret;
270}
271
215enum ocfs2_contig_type { 272enum ocfs2_contig_type {
216 CONTIG_NONE = 0, 273 CONTIG_NONE = 0,
217 CONTIG_LEFT, 274 CONTIG_LEFT,
218 CONTIG_RIGHT 275 CONTIG_RIGHT,
276 CONTIG_LEFTRIGHT,
219}; 277};
220 278
221 279
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
253{ 311{
254 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 312 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
255 313
314 /*
315 * Refuse to coalesce extent records with different flag
316 * fields - we don't want to mix unwritten extents with user
317 * data.
318 */
319 if (ext->e_flags != insert_rec->e_flags)
320 return CONTIG_NONE;
321
256 if (ocfs2_extents_adjacent(ext, insert_rec) && 322 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 323 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT; 324 return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
277 APPEND_TAIL, 343 APPEND_TAIL,
278}; 344};
279 345
346enum ocfs2_split_type {
347 SPLIT_NONE = 0,
348 SPLIT_LEFT,
349 SPLIT_RIGHT,
350};
351
280struct ocfs2_insert_type { 352struct ocfs2_insert_type {
353 enum ocfs2_split_type ins_split;
281 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index; 356 int ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
285 int ins_tree_depth; 358 int ins_tree_depth;
286}; 359};
287 360
361struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent;
364 int c_split_covers_rec;
365 int c_used_tail_recs;
366};
367
288/* 368/*
289 * How many free extents have we got before we need more meta data? 369 * How many free extents have we got before we need more meta data?
290 */ 370 */
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
384 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 464 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
385 eb->h_blkno = cpu_to_le64(first_blkno); 465 eb->h_blkno = cpu_to_le64(first_blkno);
386 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 466 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
387
388#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
389 /* we always use slot zero's suballocator */
390 eb->h_suballoc_slot = 0;
391#else
392 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 467 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
393#endif
394 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 468 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
395 eb->h_list.l_count = 469 eb->h_list.l_count =
396 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 470 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
461 struct inode *inode, 535 struct inode *inode,
462 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
463 struct buffer_head *eb_bh, 537 struct buffer_head *eb_bh,
464 struct buffer_head *last_eb_bh, 538 struct buffer_head **last_eb_bh,
465 struct ocfs2_alloc_context *meta_ac) 539 struct ocfs2_alloc_context *meta_ac)
466{ 540{
467 int status, new_blocks, i; 541 int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
476 550
477 mlog_entry_void(); 551 mlog_entry_void();
478 552
479 BUG_ON(!last_eb_bh); 553 BUG_ON(!last_eb_bh || !*last_eb_bh);
480 554
481 fe = (struct ocfs2_dinode *) fe_bh->b_data; 555 fe = (struct ocfs2_dinode *) fe_bh->b_data;
482 556
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
507 goto bail; 581 goto bail;
508 } 582 }
509 583
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 584 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 585 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512 586
513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 587 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
568 * journal_dirty erroring as it won't unless we've aborted the 642 * journal_dirty erroring as it won't unless we've aborted the
569 * handle (in which case we would never be here) so reserving 643 * handle (in which case we would never be here) so reserving
570 * the write with journal_access is all we need to do. */ 644 * the write with journal_access is all we need to do. */
571 status = ocfs2_journal_access(handle, inode, last_eb_bh, 645 status = ocfs2_journal_access(handle, inode, *last_eb_bh,
572 OCFS2_JOURNAL_ACCESS_WRITE); 646 OCFS2_JOURNAL_ACCESS_WRITE);
573 if (status < 0) { 647 if (status < 0) {
574 mlog_errno(status); 648 mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
601 * next_leaf on the previously last-extent-block. */ 675 * next_leaf on the previously last-extent-block. */
602 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 676 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
603 677
604 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 678 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
605 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 679 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
606 680
607 status = ocfs2_journal_dirty(handle, last_eb_bh); 681 status = ocfs2_journal_dirty(handle, *last_eb_bh);
608 if (status < 0) 682 if (status < 0)
609 mlog_errno(status); 683 mlog_errno(status);
610 status = ocfs2_journal_dirty(handle, fe_bh); 684 status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
616 mlog_errno(status); 690 mlog_errno(status);
617 } 691 }
618 692
693 /*
694 * Some callers want to track the rightmost leaf so pass it
695 * back here.
696 */
697 brelse(*last_eb_bh);
698 get_bh(new_eb_bhs[0]);
699 *last_eb_bh = new_eb_bhs[0];
700
619 status = 0; 701 status = 0;
620bail: 702bail:
621 if (new_eb_bhs) { 703 if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
829} 911}
830 912
831/* 913/*
914 * Grow a b-tree so that it has more records.
915 *
916 * We might shift the tree depth in which case existing paths should
917 * be considered invalid.
918 *
919 * Tree depth after the grow is returned via *final_depth.
920 *
921 * *last_eb_bh will be updated by ocfs2_add_branch().
922 */
923static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
924 struct buffer_head *di_bh, int *final_depth,
925 struct buffer_head **last_eb_bh,
926 struct ocfs2_alloc_context *meta_ac)
927{
928 int ret, shift;
929 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
930 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
931 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
932 struct buffer_head *bh = NULL;
933
934 BUG_ON(meta_ac == NULL);
935
936 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
937 if (shift < 0) {
938 ret = shift;
939 mlog_errno(ret);
940 goto out;
941 }
942
943 /* We traveled all the way to the bottom of the allocation tree
944 * and didn't find room for any more extents - we need to add
945 * another tree level */
946 if (shift) {
947 BUG_ON(bh);
948 mlog(0, "need to shift tree depth (current = %d)\n", depth);
949
950 /* ocfs2_shift_tree_depth will return us a buffer with
951 * the new extent block (so we can pass that to
952 * ocfs2_add_branch). */
953 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
954 meta_ac, &bh);
955 if (ret < 0) {
956 mlog_errno(ret);
957 goto out;
958 }
959 depth++;
960 if (depth == 1) {
961 /*
962 * Special case: we have room now if we shifted from
963 * tree_depth 0, so no more work needs to be done.
964 *
965 * We won't be calling add_branch, so pass
966 * back *last_eb_bh as the new leaf. At depth
967 * zero, it should always be null so there's
968 * no reason to brelse.
969 */
970 BUG_ON(*last_eb_bh);
971 get_bh(bh);
972 *last_eb_bh = bh;
973 goto out;
974 }
975 }
976
977 /* call ocfs2_add_branch to add the final part of the tree with
978 * the new data. */
979 mlog(0, "add branch. bh = %p\n", bh);
980 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
981 meta_ac);
982 if (ret < 0) {
983 mlog_errno(ret);
984 goto out;
985 }
986
987out:
988 if (final_depth)
989 *final_depth = depth;
990 brelse(bh);
991 return ret;
992}
993
994/*
832 * This is only valid for leaf nodes, which are the only ones that can 995 * This is only valid for leaf nodes, which are the only ones that can
833 * have empty extents anyway. 996 * have empty extents anyway.
834 */ 997 */
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
934 1097
935} 1098}
936 1099
1100static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1101{
1102 int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1103
1104 BUG_ON(num_recs == 0);
1105
1106 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1107 num_recs--;
1108 size = num_recs * sizeof(struct ocfs2_extent_rec);
1109 memmove(&el->l_recs[0], &el->l_recs[1], size);
1110 memset(&el->l_recs[num_recs], 0,
1111 sizeof(struct ocfs2_extent_rec));
1112 el->l_next_free_rec = cpu_to_le16(num_recs);
1113 }
1114}
1115
937/* 1116/*
938 * Create an empty extent record . 1117 * Create an empty extent record .
939 * 1118 *
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1211 * immediately to their right. 1390 * immediately to their right.
1212 */ 1391 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1392 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1393 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1394 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1395 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1396 }
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1397 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1398 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216 1399
@@ -1531,10 +1714,16 @@ out:
1531 return ret; 1714 return ret;
1532} 1715}
1533 1716
1717/*
1718 * Extend the transaction by enough credits to complete the rotation,
1719 * and still leave at least the original number of credits allocated
1720 * to this transaction.
1721 */
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1722static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1723 int op_credits,
1535 struct ocfs2_path *path) 1724 struct ocfs2_path *path)
1536{ 1725{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1726 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1538 1727
1539 if (handle->h_buffer_credits < credits) 1728 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits); 1729 return ocfs2_extend_trans(handle, credits);
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1568 return 0; 1757 return 0;
1569} 1758}
1570 1759
1760static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1761{
1762 int next_free = le16_to_cpu(el->l_next_free_rec);
1763 unsigned int range;
1764 struct ocfs2_extent_rec *rec;
1765
1766 if (next_free == 0)
1767 return 0;
1768
1769 rec = &el->l_recs[0];
1770 if (ocfs2_is_empty_extent(rec)) {
1771 /* Empty list. */
1772 if (next_free == 1)
1773 return 0;
1774 rec = &el->l_recs[1];
1775 }
1776
1777 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1778 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1779 return 1;
1780 return 0;
1781}
1782
1571/* 1783/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos. 1784 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 * 1785 *
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1586 */ 1798 */
1587static int ocfs2_rotate_tree_right(struct inode *inode, 1799static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle, 1800 handle_t *handle,
1801 enum ocfs2_split_type split,
1589 u32 insert_cpos, 1802 u32 insert_cpos,
1590 struct ocfs2_path *right_path, 1803 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path) 1804 struct ocfs2_path **ret_left_path)
1592{ 1805{
1593 int ret, start; 1806 int ret, start, orig_credits = handle->h_buffer_credits;
1594 u32 cpos; 1807 u32 cpos;
1595 struct ocfs2_path *left_path = NULL; 1808 struct ocfs2_path *left_path = NULL;
1596 1809
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1657 (unsigned long long) 1870 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr); 1871 path_leaf_bh(left_path)->b_blocknr);
1659 1872
1660 if (ocfs2_rotate_requires_path_adjustment(left_path, 1873 if (split == SPLIT_NONE &&
1874 ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) { 1875 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663 1876
1664 /* 1877 /*
1665 * We've rotated the tree as much as we 1878 * We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1687 right_path->p_tree_depth); 1900 right_path->p_tree_depth);
1688 1901
1689 ret = ocfs2_extend_rotate_transaction(handle, start, 1902 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path); 1903 orig_credits, right_path);
1691 if (ret) { 1904 if (ret) {
1692 mlog_errno(ret); 1905 mlog_errno(ret);
1693 goto out; 1906 goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1700 goto out; 1913 goto out;
1701 } 1914 }
1702 1915
1916 if (split != SPLIT_NONE &&
1917 ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1918 insert_cpos)) {
1919 /*
1920 * A rotate moves the rightmost left leaf
1921 * record over to the leftmost right leaf
1922 * slot. If we're doing an extent split
1923 * instead of a real insert, then we have to
1924 * check that the extent to be split wasn't
1925 * just moved over. If it was, then we can
1926 * exit here, passing left_path back -
1927 * ocfs2_split_extent() is smart enough to
1928 * search both leaves.
1929 */
1930 *ret_left_path = left_path;
1931 goto out_ret_path;
1932 }
1933
1703 /* 1934 /*
1704 * There is no need to re-read the next right path 1935 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left 1936 * as we know that it'll be our current left
@@ -1722,6 +1953,1031 @@ out_ret_path:
1722 return ret; 1953 return ret;
1723} 1954}
1724 1955
1956static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1957 struct ocfs2_path *path)
1958{
1959 int i, idx;
1960 struct ocfs2_extent_rec *rec;
1961 struct ocfs2_extent_list *el;
1962 struct ocfs2_extent_block *eb;
1963 u32 range;
1964
1965 /* Path should always be rightmost. */
1966 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1967 BUG_ON(eb->h_next_leaf_blk != 0ULL);
1968
1969 el = &eb->h_list;
1970 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1971 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1972 rec = &el->l_recs[idx];
1973 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1974
1975 for (i = 0; i < path->p_tree_depth; i++) {
1976 el = path->p_node[i].el;
1977 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1978 rec = &el->l_recs[idx];
1979
1980 rec->e_int_clusters = cpu_to_le32(range);
1981 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1982
1983 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1984 }
1985}
1986
1987static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1988 struct ocfs2_cached_dealloc_ctxt *dealloc,
1989 struct ocfs2_path *path, int unlink_start)
1990{
1991 int ret, i;
1992 struct ocfs2_extent_block *eb;
1993 struct ocfs2_extent_list *el;
1994 struct buffer_head *bh;
1995
1996 for(i = unlink_start; i < path_num_items(path); i++) {
1997 bh = path->p_node[i].bh;
1998
1999 eb = (struct ocfs2_extent_block *)bh->b_data;
2000 /*
2001 * Not all nodes might have had their final count
2002 * decremented by the caller - handle this here.
2003 */
2004 el = &eb->h_list;
2005 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2006 mlog(ML_ERROR,
2007 "Inode %llu, attempted to remove extent block "
2008 "%llu with %u records\n",
2009 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2010 (unsigned long long)le64_to_cpu(eb->h_blkno),
2011 le16_to_cpu(el->l_next_free_rec));
2012
2013 ocfs2_journal_dirty(handle, bh);
2014 ocfs2_remove_from_cache(inode, bh);
2015 continue;
2016 }
2017
2018 el->l_next_free_rec = 0;
2019 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2020
2021 ocfs2_journal_dirty(handle, bh);
2022
2023 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2024 if (ret)
2025 mlog_errno(ret);
2026
2027 ocfs2_remove_from_cache(inode, bh);
2028 }
2029}
2030
2031static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2032 struct ocfs2_path *left_path,
2033 struct ocfs2_path *right_path,
2034 int subtree_index,
2035 struct ocfs2_cached_dealloc_ctxt *dealloc)
2036{
2037 int i;
2038 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2039 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2040 struct ocfs2_extent_list *el;
2041 struct ocfs2_extent_block *eb;
2042
2043 el = path_leaf_el(left_path);
2044
2045 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2046
2047 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2048 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2049 break;
2050
2051 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2052
2053 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2054 le16_add_cpu(&root_el->l_next_free_rec, -1);
2055
2056 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2057 eb->h_next_leaf_blk = 0;
2058
2059 ocfs2_journal_dirty(handle, root_bh);
2060 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2061
2062 ocfs2_unlink_path(inode, handle, dealloc, right_path,
2063 subtree_index + 1);
2064}
2065
2066static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *left_path,
2068 struct ocfs2_path *right_path,
2069 int subtree_index,
2070 struct ocfs2_cached_dealloc_ctxt *dealloc,
2071 int *deleted)
2072{
2073 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2074 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2075 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2076 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2077 struct ocfs2_extent_block *eb;
2078
2079 *deleted = 0;
2080
2081 right_leaf_el = path_leaf_el(right_path);
2082 left_leaf_el = path_leaf_el(left_path);
2083 root_bh = left_path->p_node[subtree_index].bh;
2084 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2085
2086 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2087 return 0;
2088
2089 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2090 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2091 /*
2092 * It's legal for us to proceed if the right leaf is
2093 * the rightmost one and it has an empty extent. There
2094 * are two cases to handle - whether the leaf will be
2095 * empty after removal or not. If the leaf isn't empty
2096 * then just remove the empty extent up front. The
2097 * next block will handle empty leaves by flagging
2098 * them for unlink.
2099 *
2100 * Non rightmost leaves will throw -EAGAIN and the
2101 * caller can manually move the subtree and retry.
2102 */
2103
2104 if (eb->h_next_leaf_blk != 0ULL)
2105 return -EAGAIN;
2106
2107 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2108 ret = ocfs2_journal_access(handle, inode,
2109 path_leaf_bh(right_path),
2110 OCFS2_JOURNAL_ACCESS_WRITE);
2111 if (ret) {
2112 mlog_errno(ret);
2113 goto out;
2114 }
2115
2116 ocfs2_remove_empty_extent(right_leaf_el);
2117 } else
2118 right_has_empty = 1;
2119 }
2120
2121 if (eb->h_next_leaf_blk == 0ULL &&
2122 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2123 /*
2124 * We have to update i_last_eb_blk during the meta
2125 * data delete.
2126 */
2127 ret = ocfs2_journal_access(handle, inode, di_bh,
2128 OCFS2_JOURNAL_ACCESS_WRITE);
2129 if (ret) {
2130 mlog_errno(ret);
2131 goto out;
2132 }
2133
2134 del_right_subtree = 1;
2135 }
2136
2137 /*
2138 * Getting here with an empty extent in the right path implies
2139 * that it's the rightmost path and will be deleted.
2140 */
2141 BUG_ON(right_has_empty && !del_right_subtree);
2142
2143 ret = ocfs2_journal_access(handle, inode, root_bh,
2144 OCFS2_JOURNAL_ACCESS_WRITE);
2145 if (ret) {
2146 mlog_errno(ret);
2147 goto out;
2148 }
2149
2150 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2151 ret = ocfs2_journal_access(handle, inode,
2152 right_path->p_node[i].bh,
2153 OCFS2_JOURNAL_ACCESS_WRITE);
2154 if (ret) {
2155 mlog_errno(ret);
2156 goto out;
2157 }
2158
2159 ret = ocfs2_journal_access(handle, inode,
2160 left_path->p_node[i].bh,
2161 OCFS2_JOURNAL_ACCESS_WRITE);
2162 if (ret) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166 }
2167
2168 if (!right_has_empty) {
2169 /*
2170 * Only do this if we're moving a real
2171 * record. Otherwise, the action is delayed until
2172 * after removal of the right path in which case we
2173 * can do a simple shift to remove the empty extent.
2174 */
2175 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2176 memset(&right_leaf_el->l_recs[0], 0,
2177 sizeof(struct ocfs2_extent_rec));
2178 }
2179 if (eb->h_next_leaf_blk == 0ULL) {
2180 /*
2181 * Move recs over to get rid of empty extent, decrease
2182 * next_free. This is allowed to remove the last
2183 * extent in our leaf (setting l_next_free_rec to
2184 * zero) - the delete code below won't care.
2185 */
2186 ocfs2_remove_empty_extent(right_leaf_el);
2187 }
2188
2189 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2190 if (ret)
2191 mlog_errno(ret);
2192 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2193 if (ret)
2194 mlog_errno(ret);
2195
2196 if (del_right_subtree) {
2197 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2198 subtree_index, dealloc);
2199 ocfs2_update_edge_lengths(inode, handle, left_path);
2200
2201 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2202 di->i_last_eb_blk = eb->h_blkno;
2203
2204 /*
2205 * Removal of the extent in the left leaf was skipped
2206 * above so we could delete the right path
2207 * 1st.
2208 */
2209 if (right_has_empty)
2210 ocfs2_remove_empty_extent(left_leaf_el);
2211
2212 ret = ocfs2_journal_dirty(handle, di_bh);
2213 if (ret)
2214 mlog_errno(ret);
2215
2216 *deleted = 1;
2217 } else
2218 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2219 subtree_index);
2220
2221out:
2222 return ret;
2223}
2224
2225/*
2226 * Given a full path, determine what cpos value would return us a path
2227 * containing the leaf immediately to the right of the current one.
2228 *
2229 * Will return zero if the path passed in is already the rightmost path.
2230 *
2231 * This looks similar, but is subtly different to
2232 * ocfs2_find_cpos_for_left_leaf().
2233 */
2234static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2235 struct ocfs2_path *path, u32 *cpos)
2236{
2237 int i, j, ret = 0;
2238 u64 blkno;
2239 struct ocfs2_extent_list *el;
2240
2241 *cpos = 0;
2242
2243 if (path->p_tree_depth == 0)
2244 return 0;
2245
2246 blkno = path_leaf_bh(path)->b_blocknr;
2247
2248 /* Start at the tree node just above the leaf and work our way up. */
2249 i = path->p_tree_depth - 1;
2250 while (i >= 0) {
2251 int next_free;
2252
2253 el = path->p_node[i].el;
2254
2255 /*
2256 * Find the extent record just after the one in our
2257 * path.
2258 */
2259 next_free = le16_to_cpu(el->l_next_free_rec);
2260 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2261 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2262 if (j == (next_free - 1)) {
2263 if (i == 0) {
2264 /*
2265 * We've determined that the
2266 * path specified is already
2267 * the rightmost one - return a
2268 * cpos of zero.
2269 */
2270 goto out;
2271 }
2272 /*
2273 * The rightmost record points to our
2274 * leaf - we need to travel up the
2275 * tree one level.
2276 */
2277 goto next_node;
2278 }
2279
2280 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2281 goto out;
2282 }
2283 }
2284
2285 /*
2286 * If we got here, we never found a valid node where
2287 * the tree indicated one should be.
2288 */
2289 ocfs2_error(sb,
2290 "Invalid extent tree at extent block %llu\n",
2291 (unsigned long long)blkno);
2292 ret = -EROFS;
2293 goto out;
2294
2295next_node:
2296 blkno = path->p_node[i].bh->b_blocknr;
2297 i--;
2298 }
2299
2300out:
2301 return ret;
2302}
2303
2304static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2305 handle_t *handle,
2306 struct buffer_head *bh,
2307 struct ocfs2_extent_list *el)
2308{
2309 int ret;
2310
2311 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2312 return 0;
2313
2314 ret = ocfs2_journal_access(handle, inode, bh,
2315 OCFS2_JOURNAL_ACCESS_WRITE);
2316 if (ret) {
2317 mlog_errno(ret);
2318 goto out;
2319 }
2320
2321 ocfs2_remove_empty_extent(el);
2322
2323 ret = ocfs2_journal_dirty(handle, bh);
2324 if (ret)
2325 mlog_errno(ret);
2326
2327out:
2328 return ret;
2329}
2330
2331static int __ocfs2_rotate_tree_left(struct inode *inode,
2332 handle_t *handle, int orig_credits,
2333 struct ocfs2_path *path,
2334 struct ocfs2_cached_dealloc_ctxt *dealloc,
2335 struct ocfs2_path **empty_extent_path)
2336{
2337 int ret, subtree_root, deleted;
2338 u32 right_cpos;
2339 struct ocfs2_path *left_path = NULL;
2340 struct ocfs2_path *right_path = NULL;
2341
2342 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2343
2344 *empty_extent_path = NULL;
2345
2346 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2347 &right_cpos);
2348 if (ret) {
2349 mlog_errno(ret);
2350 goto out;
2351 }
2352
2353 left_path = ocfs2_new_path(path_root_bh(path),
2354 path_root_el(path));
2355 if (!left_path) {
2356 ret = -ENOMEM;
2357 mlog_errno(ret);
2358 goto out;
2359 }
2360
2361 ocfs2_cp_path(left_path, path);
2362
2363 right_path = ocfs2_new_path(path_root_bh(path),
2364 path_root_el(path));
2365 if (!right_path) {
2366 ret = -ENOMEM;
2367 mlog_errno(ret);
2368 goto out;
2369 }
2370
2371 while (right_cpos) {
2372 ret = ocfs2_find_path(inode, right_path, right_cpos);
2373 if (ret) {
2374 mlog_errno(ret);
2375 goto out;
2376 }
2377
2378 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2379 right_path);
2380
2381 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2382 subtree_root,
2383 (unsigned long long)
2384 right_path->p_node[subtree_root].bh->b_blocknr,
2385 right_path->p_tree_depth);
2386
2387 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2388 orig_credits, left_path);
2389 if (ret) {
2390 mlog_errno(ret);
2391 goto out;
2392 }
2393
2394 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2395 right_path, subtree_root,
2396 dealloc, &deleted);
2397 if (ret == -EAGAIN) {
2398 /*
2399 * The rotation has to temporarily stop due to
2400 * the right subtree having an empty
2401 * extent. Pass it back to the caller for a
2402 * fixup.
2403 */
2404 *empty_extent_path = right_path;
2405 right_path = NULL;
2406 goto out;
2407 }
2408 if (ret) {
2409 mlog_errno(ret);
2410 goto out;
2411 }
2412
2413 /*
2414 * The subtree rotate might have removed records on
2415 * the rightmost edge. If so, then rotation is
2416 * complete.
2417 */
2418 if (deleted)
2419 break;
2420
2421 ocfs2_mv_path(left_path, right_path);
2422
2423 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2424 &right_cpos);
2425 if (ret) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429 }
2430
2431out:
2432 ocfs2_free_path(right_path);
2433 ocfs2_free_path(left_path);
2434
2435 return ret;
2436}
2437
2438static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2439 struct ocfs2_path *path,
2440 struct ocfs2_cached_dealloc_ctxt *dealloc)
2441{
2442 int ret, subtree_index;
2443 u32 cpos;
2444 struct ocfs2_path *left_path = NULL;
2445 struct ocfs2_dinode *di;
2446 struct ocfs2_extent_block *eb;
2447 struct ocfs2_extent_list *el;
2448
2449 /*
2450 * XXX: This code assumes that the root is an inode, which is
2451 * true for now but may change as tree code gets generic.
2452 */
2453 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2454 if (!OCFS2_IS_VALID_DINODE(di)) {
2455 ret = -EIO;
2456 ocfs2_error(inode->i_sb,
2457 "Inode %llu has invalid path root",
2458 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2459 goto out;
2460 }
2461
2462 /*
2463 * There's two ways we handle this depending on
2464 * whether path is the only existing one.
2465 */
2466 ret = ocfs2_extend_rotate_transaction(handle, 0,
2467 handle->h_buffer_credits,
2468 path);
2469 if (ret) {
2470 mlog_errno(ret);
2471 goto out;
2472 }
2473
2474 ret = ocfs2_journal_access_path(inode, handle, path);
2475 if (ret) {
2476 mlog_errno(ret);
2477 goto out;
2478 }
2479
2480 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485
2486 if (cpos) {
2487 /*
2488 * We have a path to the left of this one - it needs
2489 * an update too.
2490 */
2491 left_path = ocfs2_new_path(path_root_bh(path),
2492 path_root_el(path));
2493 if (!left_path) {
2494 ret = -ENOMEM;
2495 mlog_errno(ret);
2496 goto out;
2497 }
2498
2499 ret = ocfs2_find_path(inode, left_path, cpos);
2500 if (ret) {
2501 mlog_errno(ret);
2502 goto out;
2503 }
2504
2505 ret = ocfs2_journal_access_path(inode, handle, left_path);
2506 if (ret) {
2507 mlog_errno(ret);
2508 goto out;
2509 }
2510
2511 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2512
2513 ocfs2_unlink_subtree(inode, handle, left_path, path,
2514 subtree_index, dealloc);
2515 ocfs2_update_edge_lengths(inode, handle, left_path);
2516
2517 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2518 di->i_last_eb_blk = eb->h_blkno;
2519 } else {
2520 /*
2521 * 'path' is also the leftmost path which
2522 * means it must be the only one. This gets
2523 * handled differently because we want to
2524 * revert the inode back to having extents
2525 * in-line.
2526 */
2527 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2528
2529 el = &di->id2.i_list;
2530 el->l_tree_depth = 0;
2531 el->l_next_free_rec = 0;
2532 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2533
2534 di->i_last_eb_blk = 0;
2535 }
2536
2537 ocfs2_journal_dirty(handle, path_root_bh(path));
2538
2539out:
2540 ocfs2_free_path(left_path);
2541 return ret;
2542}
2543
2544/*
2545 * Left rotation of btree records.
2546 *
2547 * In many ways, this is (unsurprisingly) the opposite of right
2548 * rotation. We start at some non-rightmost path containing an empty
2549 * extent in the leaf block. The code works its way to the rightmost
2550 * path by rotating records to the left in every subtree.
2551 *
2552 * This is used by any code which reduces the number of extent records
2553 * in a leaf. After removal, an empty record should be placed in the
2554 * leftmost list position.
2555 *
2556 * This won't handle a length update of the rightmost path records if
2557 * the rightmost tree leaf record is removed so the caller is
2558 * responsible for detecting and correcting that.
2559 */
2560static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2561 struct ocfs2_path *path,
2562 struct ocfs2_cached_dealloc_ctxt *dealloc)
2563{
2564 int ret, orig_credits = handle->h_buffer_credits;
2565 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2566 struct ocfs2_extent_block *eb;
2567 struct ocfs2_extent_list *el;
2568
2569 el = path_leaf_el(path);
2570 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2571 return 0;
2572
2573 if (path->p_tree_depth == 0) {
2574rightmost_no_delete:
2575 /*
2576 * In-inode extents. This is trivially handled, so do
2577 * it up front.
2578 */
2579 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2580 path_leaf_bh(path),
2581 path_leaf_el(path));
2582 if (ret)
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 /*
2588 * Handle rightmost branch now. There's several cases:
2589 * 1) simple rotation leaving records in there. That's trivial.
2590 * 2) rotation requiring a branch delete - there's no more
2591 * records left. Two cases of this:
2592 * a) There are branches to the left.
2593 * b) This is also the leftmost (the only) branch.
2594 *
2595 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
2596 * 2a) we need the left branch so that we can update it with the unlink
2597 * 2b) we need to bring the inode back to inline extents.
2598 */
2599
2600 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2601 el = &eb->h_list;
2602 if (eb->h_next_leaf_blk == 0) {
2603 /*
2604 * This gets a bit tricky if we're going to delete the
2605 * rightmost path. Get the other cases out of the way
2606 * 1st.
2607 */
2608 if (le16_to_cpu(el->l_next_free_rec) > 1)
2609 goto rightmost_no_delete;
2610
2611 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2612 ret = -EIO;
2613 ocfs2_error(inode->i_sb,
2614 "Inode %llu has empty extent block at %llu",
2615 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2616 (unsigned long long)le64_to_cpu(eb->h_blkno));
2617 goto out;
2618 }
2619
2620 /*
2621 * XXX: The caller can not trust "path" any more after
2622 * this as it will have been deleted. What do we do?
2623 *
2624 * In theory the rotate-for-merge code will never get
2625 * here because it'll always ask for a rotate in a
2626 * nonempty list.
2627 */
2628
2629 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2630 dealloc);
2631 if (ret)
2632 mlog_errno(ret);
2633 goto out;
2634 }
2635
2636 /*
2637 * Now we can loop, remembering the path we get from -EAGAIN
2638 * and restarting from there.
2639 */
2640try_rotate:
2641 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2642 dealloc, &restart_path);
2643 if (ret && ret != -EAGAIN) {
2644 mlog_errno(ret);
2645 goto out;
2646 }
2647
2648 while (ret == -EAGAIN) {
2649 tmp_path = restart_path;
2650 restart_path = NULL;
2651
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2653 tmp_path, dealloc,
2654 &restart_path);
2655 if (ret && ret != -EAGAIN) {
2656 mlog_errno(ret);
2657 goto out;
2658 }
2659
2660 ocfs2_free_path(tmp_path);
2661 tmp_path = NULL;
2662
2663 if (ret == 0)
2664 goto try_rotate;
2665 }
2666
2667out:
2668 ocfs2_free_path(tmp_path);
2669 ocfs2_free_path(restart_path);
2670 return ret;
2671}
2672
2673static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2674 int index)
2675{
2676 struct ocfs2_extent_rec *rec = &el->l_recs[index];
2677 unsigned int size;
2678
2679 if (rec->e_leaf_clusters == 0) {
2680 /*
2681 * We consumed all of the merged-from record. An empty
2682 * extent cannot exist anywhere but the 1st array
2683 * position, so move things over if the merged-from
2684 * record doesn't occupy that position.
2685 *
2686 * This creates a new empty extent so the caller
2687 * should be smart enough to have removed any existing
2688 * ones.
2689 */
2690 if (index > 0) {
2691 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2692 size = index * sizeof(struct ocfs2_extent_rec);
2693 memmove(&el->l_recs[1], &el->l_recs[0], size);
2694 }
2695
2696 /*
2697 * Always memset - the caller doesn't check whether it
2698 * created an empty extent, so there could be junk in
2699 * the other fields.
2700 */
2701 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2702 }
2703}
2704
2705/*
2706 * Remove split_rec clusters from the record at index and merge them
2707 * onto the beginning of the record at index + 1.
2708 */
2709static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2710 handle_t *handle,
2711 struct ocfs2_extent_rec *split_rec,
2712 struct ocfs2_extent_list *el, int index)
2713{
2714 int ret;
2715 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2716 struct ocfs2_extent_rec *left_rec;
2717 struct ocfs2_extent_rec *right_rec;
2718
2719 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2720
2721 left_rec = &el->l_recs[index];
2722 right_rec = &el->l_recs[index + 1];
2723
2724 ret = ocfs2_journal_access(handle, inode, bh,
2725 OCFS2_JOURNAL_ACCESS_WRITE);
2726 if (ret) {
2727 mlog_errno(ret);
2728 goto out;
2729 }
2730
2731 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2732
2733 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2734 le64_add_cpu(&right_rec->e_blkno,
2735 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2736 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2737
2738 ocfs2_cleanup_merge(el, index);
2739
2740 ret = ocfs2_journal_dirty(handle, bh);
2741 if (ret)
2742 mlog_errno(ret);
2743
2744out:
2745 return ret;
2746}
2747
2748/*
2749 * Remove split_rec clusters from the record at index and merge them
2750 * onto the tail of the record at index - 1.
2751 */
2752static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2753 handle_t *handle,
2754 struct ocfs2_extent_rec *split_rec,
2755 struct ocfs2_extent_list *el, int index)
2756{
2757 int ret, has_empty_extent = 0;
2758 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2759 struct ocfs2_extent_rec *left_rec;
2760 struct ocfs2_extent_rec *right_rec;
2761
2762 BUG_ON(index <= 0);
2763
2764 left_rec = &el->l_recs[index - 1];
2765 right_rec = &el->l_recs[index];
2766 if (ocfs2_is_empty_extent(&el->l_recs[0]))
2767 has_empty_extent = 1;
2768
2769 ret = ocfs2_journal_access(handle, inode, bh,
2770 OCFS2_JOURNAL_ACCESS_WRITE);
2771 if (ret) {
2772 mlog_errno(ret);
2773 goto out;
2774 }
2775
2776 if (has_empty_extent && index == 1) {
2777 /*
2778 * The easy case - we can just plop the record right in.
2779 */
2780 *left_rec = *split_rec;
2781
2782 has_empty_extent = 0;
2783 } else {
2784 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2785 }
2786
2787 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2788 le64_add_cpu(&right_rec->e_blkno,
2789 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2790 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
2791
2792 ocfs2_cleanup_merge(el, index);
2793
2794 ret = ocfs2_journal_dirty(handle, bh);
2795 if (ret)
2796 mlog_errno(ret);
2797
2798out:
2799 return ret;
2800}
2801
2802static int ocfs2_try_to_merge_extent(struct inode *inode,
2803 handle_t *handle,
2804 struct ocfs2_path *left_path,
2805 int split_index,
2806 struct ocfs2_extent_rec *split_rec,
2807 struct ocfs2_cached_dealloc_ctxt *dealloc,
2808 struct ocfs2_merge_ctxt *ctxt)
2809
2810{
2811 int ret = 0, delete_tail_recs = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816
2817 if (ctxt->c_split_covers_rec) {
2818 delete_tail_recs++;
2819
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
2821 ctxt->c_has_empty_extent)
2822 delete_tail_recs++;
2823
2824 if (ctxt->c_has_empty_extent) {
2825 /*
2826 * The merge code will need to create an empty
2827 * extent to take the place of the newly
2828 * emptied slot. Remove any pre-existing empty
2829 * extents - having more than one in a leaf is
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 }
2841 }
2842
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
2844 /*
2845 * Left-right contig implies this.
2846 */
2847 BUG_ON(!ctxt->c_split_covers_rec);
2848 BUG_ON(split_index == 0);
2849
2850 /*
2851 * Since the leftright insert always covers the entire
2852 * extent, this call will delete the insert record
2853 * entirely, resulting in an empty extent record added to
2854 * the extent block.
2855 *
2856 * Since the adding of an empty extent shifts
2857 * everything back to the right, there's no need to
2858 * update split_index here.
2859 */
2860 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
2861 handle, split_rec, el, split_index);
2862 if (ret) {
2863 mlog_errno(ret);
2864 goto out;
2865 }
2866
2867 /*
2868 * We can only get this from logic error above.
2869 */
2870 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2871
2872 /*
2873 * The left merge left us with an empty extent, remove
2874 * it.
2875 */
2876 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2877 if (ret) {
2878 mlog_errno(ret);
2879 goto out;
2880 }
2881 split_index--;
2882 rec = &el->l_recs[split_index];
2883
2884 /*
2885 * Note that we don't pass split_rec here on purpose -
2886 * we've merged it into the left side.
2887 */
2888 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
2889 handle, rec, el, split_index);
2890 if (ret) {
2891 mlog_errno(ret);
2892 goto out;
2893 }
2894
2895 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2896
2897 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2898 dealloc);
2899 /*
2900 * Error from this last rotate is not critical, so
2901 * print but don't bubble it up.
2902 */
2903 if (ret)
2904 mlog_errno(ret);
2905 ret = 0;
2906 } else {
2907 /*
2908 * Merge a record to the left or right.
2909 *
2910 * 'contig_type' is relative to the existing record,
2911 * so for example, if we're "right contig", it's to
2912 * the record on the left (hence the left merge).
2913 */
2914 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2915 ret = ocfs2_merge_rec_left(inode,
2916 path_leaf_bh(left_path),
2917 handle, split_rec, el,
2918 split_index);
2919 if (ret) {
2920 mlog_errno(ret);
2921 goto out;
2922 }
2923 } else {
2924 ret = ocfs2_merge_rec_right(inode,
2925 path_leaf_bh(left_path),
2926 handle, split_rec, el,
2927 split_index);
2928 if (ret) {
2929 mlog_errno(ret);
2930 goto out;
2931 }
2932 }
2933
2934 if (ctxt->c_split_covers_rec) {
2935 /*
2936 * The merge may have left an empty extent in
2937 * our leaf. Try to rotate it away.
2938 */
2939 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2940 dealloc);
2941 if (ret)
2942 mlog_errno(ret);
2943 ret = 0;
2944 }
2945 }
2946
2947out:
2948 return ret;
2949}
2950
2951static void ocfs2_subtract_from_rec(struct super_block *sb,
2952 enum ocfs2_split_type split,
2953 struct ocfs2_extent_rec *rec,
2954 struct ocfs2_extent_rec *split_rec)
2955{
2956 u64 len_blocks;
2957
2958 len_blocks = ocfs2_clusters_to_blocks(sb,
2959 le16_to_cpu(split_rec->e_leaf_clusters));
2960
2961 if (split == SPLIT_LEFT) {
2962 /*
2963 * Region is on the left edge of the existing
2964 * record.
2965 */
2966 le32_add_cpu(&rec->e_cpos,
2967 le16_to_cpu(split_rec->e_leaf_clusters));
2968 le64_add_cpu(&rec->e_blkno, len_blocks);
2969 le16_add_cpu(&rec->e_leaf_clusters,
2970 -le16_to_cpu(split_rec->e_leaf_clusters));
2971 } else {
2972 /*
2973 * Region is on the right edge of the existing
2974 * record.
2975 */
2976 le16_add_cpu(&rec->e_leaf_clusters,
2977 -le16_to_cpu(split_rec->e_leaf_clusters));
2978 }
2979}
2980
1725/* 2981/*
1726 * Do the final bits of extent record insertion at the target leaf 2982 * Do the final bits of extent record insertion at the target leaf
1727 * list. If this leaf is part of an allocation tree, it is assumed 2983 * list. If this leaf is part of an allocation tree, it is assumed
@@ -1738,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1738 2994
1739 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2995 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740 2996
2997 if (insert->ins_split != SPLIT_NONE) {
2998 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
2999 BUG_ON(i == -1);
3000 rec = &el->l_recs[i];
3001 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3002 insert_rec);
3003 goto rotate;
3004 }
3005
1741 /* 3006 /*
1742 * Contiguous insert - either left or right. 3007 * Contiguous insert - either left or right.
1743 */ 3008 */
@@ -1792,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1792 return; 3057 return;
1793 } 3058 }
1794 3059
3060rotate:
1795 /* 3061 /*
1796 * Ok, we have to rotate. 3062 * Ok, we have to rotate.
1797 * 3063 *
@@ -1815,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1815 spin_unlock(&OCFS2_I(inode)->ip_lock); 3081 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816} 3082}
1817 3083
/*
 * Walk every interior node of the rightmost path and extend its last
 * record to cover @insert_rec, which is being appended at the leaf.
 * The leaf itself (depth index p_tree_depth) is deliberately not
 * touched - the caller updates it.
 *
 * NOTE(review): this returns void, so the -EIO set on a corrupt
 * extent list (and any ocfs2_journal_dirty failure) is only logged,
 * never reported to the caller - 'ret' is effectively dead on the
 * error path. Confirm callers are fine with the silent failure.
 */
static void ocfs2_adjust_rightmost_records(struct inode *inode,
					   handle_t *handle,
					   struct ocfs2_path *path,
					   struct ocfs2_extent_rec *insert_rec)
{
	int ret, i, next_free;
	struct buffer_head *bh;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	/*
	 * Update everything except the leaf block.
	 */
	for (i = 0; i < path->p_tree_depth; i++) {
		bh = path->p_node[i].bh;
		el = path->p_node[i].el;

		/* An interior node with zero records is corruption. */
		next_free = le16_to_cpu(el->l_next_free_rec);
		if (next_free == 0) {
			ocfs2_error(inode->i_sb,
				    "Dinode %llu has a bad extent list",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
			ret = -EIO;
			return;
		}

		rec = &el->l_recs[next_free - 1];

		/*
		 * Stretch the last interior record to end at
		 * insert_rec's end: e_int_clusters =
		 * (insert cpos + insert len) - rec cpos.
		 */
		rec->e_int_clusters = insert_rec->e_cpos;
		le32_add_cpu(&rec->e_int_clusters,
			     le16_to_cpu(insert_rec->e_leaf_clusters));
		le32_add_cpu(&rec->e_int_clusters,
			     -le32_to_cpu(rec->e_cpos));

		ret = ocfs2_journal_dirty(handle, bh);
		if (ret)
			mlog_errno(ret);

	}
}
3124
1818static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 3125static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 struct ocfs2_extent_rec *insert_rec, 3126 struct ocfs2_extent_rec *insert_rec,
1820 struct ocfs2_path *right_path, 3127 struct ocfs2_path *right_path,
1821 struct ocfs2_path **ret_left_path) 3128 struct ocfs2_path **ret_left_path)
1822{ 3129{
1823 int ret, i, next_free; 3130 int ret, next_free;
1824 struct buffer_head *bh;
1825 struct ocfs2_extent_list *el; 3131 struct ocfs2_extent_list *el;
1826 struct ocfs2_path *left_path = NULL; 3132 struct ocfs2_path *left_path = NULL;
1827 3133
@@ -1887,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1887 goto out; 3193 goto out;
1888 } 3194 }
1889 3195
1890 el = path_root_el(right_path); 3196 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924 3197
1925 *ret_left_path = left_path; 3198 *ret_left_path = left_path;
1926 ret = 0; 3199 ret = 0;
@@ -1931,6 +3204,83 @@ out:
1931 return ret; 3204 return ret;
1932} 3205}
1933 3206
3207static void ocfs2_split_record(struct inode *inode,
3208 struct ocfs2_path *left_path,
3209 struct ocfs2_path *right_path,
3210 struct ocfs2_extent_rec *split_rec,
3211 enum ocfs2_split_type split)
3212{
3213 int index;
3214 u32 cpos = le32_to_cpu(split_rec->e_cpos);
3215 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3216 struct ocfs2_extent_rec *rec, *tmprec;
3217
3218 right_el = path_leaf_el(right_path);;
3219 if (left_path)
3220 left_el = path_leaf_el(left_path);
3221
3222 el = right_el;
3223 insert_el = right_el;
3224 index = ocfs2_search_extent_list(el, cpos);
3225 if (index != -1) {
3226 if (index == 0 && left_path) {
3227 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3228
3229 /*
3230 * This typically means that the record
3231 * started in the left path but moved to the
3232 * right as a result of rotation. We either
3233 * move the existing record to the left, or we
3234 * do the later insert there.
3235 *
3236 * In this case, the left path should always
3237 * exist as the rotate code will have passed
3238 * it back for a post-insert update.
3239 */
3240
3241 if (split == SPLIT_LEFT) {
3242 /*
3243 * It's a left split. Since we know
3244 * that the rotate code gave us an
3245 * empty extent in the left path, we
3246 * can just do the insert there.
3247 */
3248 insert_el = left_el;
3249 } else {
3250 /*
3251 * Right split - we have to move the
3252 * existing record over to the left
3253 * leaf. The insert will be into the
3254 * newly created empty extent in the
3255 * right leaf.
3256 */
3257 tmprec = &right_el->l_recs[index];
3258 ocfs2_rotate_leaf(left_el, tmprec);
3259 el = left_el;
3260
3261 memset(tmprec, 0, sizeof(*tmprec));
3262 index = ocfs2_search_extent_list(left_el, cpos);
3263 BUG_ON(index == -1);
3264 }
3265 }
3266 } else {
3267 BUG_ON(!left_path);
3268 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3269 /*
3270 * Left path is easy - we can just allow the insert to
3271 * happen.
3272 */
3273 el = left_el;
3274 insert_el = left_el;
3275 index = ocfs2_search_extent_list(el, cpos);
3276 BUG_ON(index == -1);
3277 }
3278
3279 rec = &el->l_recs[index];
3280 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3281 ocfs2_rotate_leaf(insert_el, split_rec);
3282}
3283
1934/* 3284/*
1935 * This function only does inserts on an allocation b-tree. For dinode 3285 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly. 3286 * lists, ocfs2_insert_at_leaf() is called directly.
@@ -1948,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
1948{ 3298{
1949 int ret, subtree_index; 3299 int ret, subtree_index;
1950 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3300 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952 3301
1953 /* 3302 /*
1954 * Pass both paths to the journal. The majority of inserts 3303 * Pass both paths to the journal. The majority of inserts
@@ -1984,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
1984 } 3333 }
1985 } 3334 }
1986 3335
1987 el = path_leaf_el(right_path); 3336 if (insert->ins_split != SPLIT_NONE) {
3337 /*
3338 * We could call ocfs2_insert_at_leaf() for some types
3339 * of splits, but it's easier to just let one seperate
3340 * function sort it all out.
3341 */
3342 ocfs2_split_record(inode, left_path, right_path,
3343 insert_rec, insert->ins_split);
3344 } else
3345 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3346 insert, inode);
1988 3347
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 ret = ocfs2_journal_dirty(handle, leaf_bh); 3348 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 if (ret) 3349 if (ret)
1992 mlog_errno(ret); 3350 mlog_errno(ret);
@@ -2075,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2075 * can wind up skipping both of these two special cases... 3433 * can wind up skipping both of these two special cases...
2076 */ 3434 */
2077 if (rotate) { 3435 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle, 3436 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
2079 le32_to_cpu(insert_rec->e_cpos), 3437 le32_to_cpu(insert_rec->e_cpos),
2080 right_path, &left_path); 3438 right_path, &left_path);
2081 if (ret) { 3439 if (ret) {
@@ -2100,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2100 } 3458 }
2101 3459
2102out_update_clusters: 3460out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di, 3461 if (type->ins_split == SPLIT_NONE)
2104 le16_to_cpu(insert_rec->e_leaf_clusters)); 3462 ocfs2_update_dinode_clusters(inode, di,
3463 le16_to_cpu(insert_rec->e_leaf_clusters));
2105 3464
2106 ret = ocfs2_journal_dirty(handle, di_bh); 3465 ret = ocfs2_journal_dirty(handle, di_bh);
2107 if (ret) 3466 if (ret)
@@ -2114,6 +3473,44 @@ out:
2114 return ret; 3473 return ret;
2115} 3474}
2116 3475
3476static enum ocfs2_contig_type
3477ocfs2_figure_merge_contig_type(struct inode *inode,
3478 struct ocfs2_extent_list *el, int index,
3479 struct ocfs2_extent_rec *split_rec)
3480{
3481 struct ocfs2_extent_rec *rec;
3482 enum ocfs2_contig_type ret = CONTIG_NONE;
3483
3484 /*
3485 * We're careful to check for an empty extent record here -
3486 * the merge code will know what to do if it sees one.
3487 */
3488
3489 if (index > 0) {
3490 rec = &el->l_recs[index - 1];
3491 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3492 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3493 ret = CONTIG_RIGHT;
3494 } else {
3495 ret = ocfs2_extent_contig(inode, rec, split_rec);
3496 }
3497 }
3498
3499 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
3500 enum ocfs2_contig_type contig_type;
3501
3502 rec = &el->l_recs[index + 1];
3503 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3504
3505 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3506 ret = CONTIG_LEFTRIGHT;
3507 else if (ret == CONTIG_NONE)
3508 ret = contig_type;
3509 }
3510
3511 return ret;
3512}
3513
2117static void ocfs2_figure_contig_type(struct inode *inode, 3514static void ocfs2_figure_contig_type(struct inode *inode,
2118 struct ocfs2_insert_type *insert, 3515 struct ocfs2_insert_type *insert,
2119 struct ocfs2_extent_list *el, 3516 struct ocfs2_extent_list *el,
@@ -2205,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
2205 struct ocfs2_path *path = NULL; 3602 struct ocfs2_path *path = NULL;
2206 struct buffer_head *bh = NULL; 3603 struct buffer_head *bh = NULL;
2207 3604
3605 insert->ins_split = SPLIT_NONE;
3606
2208 el = &di->id2.i_list; 3607 el = &di->id2.i_list;
2209 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 3608 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210 3609
@@ -2327,9 +3726,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2327 u32 cpos, 3726 u32 cpos,
2328 u64 start_blk, 3727 u64 start_blk,
2329 u32 new_clusters, 3728 u32 new_clusters,
3729 u8 flags,
2330 struct ocfs2_alloc_context *meta_ac) 3730 struct ocfs2_alloc_context *meta_ac)
2331{ 3731{
2332 int status, shift; 3732 int status;
2333 struct buffer_head *last_eb_bh = NULL; 3733 struct buffer_head *last_eb_bh = NULL;
2334 struct buffer_head *bh = NULL; 3734 struct buffer_head *bh = NULL;
2335 struct ocfs2_insert_type insert = {0, }; 3735 struct ocfs2_insert_type insert = {0, };
@@ -2350,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2350 rec.e_cpos = cpu_to_le32(cpos); 3750 rec.e_cpos = cpu_to_le32(cpos);
2351 rec.e_blkno = cpu_to_le64(start_blk); 3751 rec.e_blkno = cpu_to_le64(start_blk);
2352 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 3752 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
3753 rec.e_flags = flags;
2353 3754
2354 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 3755 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 &insert); 3756 &insert);
@@ -2364,55 +3765,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2364 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 3765 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365 insert.ins_free_records, insert.ins_tree_depth); 3766 insert.ins_free_records, insert.ins_tree_depth);
2366 3767
2367 /* 3768 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
2368 * Avoid growing the tree unless we're out of records and the 3769 status = ocfs2_grow_tree(inode, handle, fe_bh,
2369 * insert type requres one. 3770 &insert.ins_tree_depth, &last_eb_bh,
2370 */ 3771 meta_ac);
2371 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) 3772 if (status) {
2372 goto out_add;
2373
2374 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2375 if (shift < 0) {
2376 status = shift;
2377 mlog_errno(status);
2378 goto bail;
2379 }
2380
2381 /* We traveled all the way to the bottom of the allocation tree
2382 * and didn't find room for any more extents - we need to add
2383 * another tree level */
2384 if (shift) {
2385 BUG_ON(bh);
2386 mlog(0, "need to shift tree depth "
2387 "(current = %d)\n", insert.ins_tree_depth);
2388
2389 /* ocfs2_shift_tree_depth will return us a buffer with
2390 * the new extent block (so we can pass that to
2391 * ocfs2_add_branch). */
2392 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2393 meta_ac, &bh);
2394 if (status < 0) {
2395 mlog_errno(status); 3773 mlog_errno(status);
2396 goto bail; 3774 goto bail;
2397 } 3775 }
2398 insert.ins_tree_depth++;
2399 /* Special case: we have room now if we shifted from
2400 * tree_depth 0 */
2401 if (insert.ins_tree_depth == 1)
2402 goto out_add;
2403 }
2404
2405 /* call ocfs2_add_branch to add the final part of the tree with
2406 * the new data. */
2407 mlog(0, "add branch. bh = %p\n", bh);
2408 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2409 meta_ac);
2410 if (status < 0) {
2411 mlog_errno(status);
2412 goto bail;
2413 } 3776 }
2414 3777
2415out_add:
2416 /* Finally, we can add clusters. This might rotate the tree for us. */ 3778 /* Finally, we can add clusters. This might rotate the tree for us. */
2417 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 3779 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2418 if (status < 0) 3780 if (status < 0)
@@ -2431,7 +3793,720 @@ bail:
2431 return status; 3793 return status;
2432} 3794}
2433 3795
2434static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 3796static void ocfs2_make_right_split_rec(struct super_block *sb,
3797 struct ocfs2_extent_rec *split_rec,
3798 u32 cpos,
3799 struct ocfs2_extent_rec *rec)
3800{
3801 u32 rec_cpos = le32_to_cpu(rec->e_cpos);
3802 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
3803
3804 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
3805
3806 split_rec->e_cpos = cpu_to_le32(cpos);
3807 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
3808
3809 split_rec->e_blkno = rec->e_blkno;
3810 le64_add_cpu(&split_rec->e_blkno,
3811 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
3812
3813 split_rec->e_flags = rec->e_flags;
3814}
3815
/*
 * Split the record at @split_index so that the range described by
 * @orig_split_rec becomes its own record, inserting the new piece(s)
 * via ocfs2_do_insert_extent().
 *
 * Three shapes are possible: the split region touches the record's
 * left edge (SPLIT_LEFT), its right edge (SPLIT_RIGHT), or sits in
 * the middle. The middle case is handled as two passes - a right
 * split of the tail first, then a second trip through the 'leftright'
 * label for the left split - so the loop runs at most twice
 * (enforced by BUG_ON(do_leftright)).
 *
 * May grow the tree if the rightmost list is full, which is why
 * @last_eb_bh is passed by reference and re-read after growth.
 *
 * Returns 0 on success, negative error code on failure.
 */
static int ocfs2_split_and_insert(struct inode *inode,
				  handle_t *handle,
				  struct ocfs2_path *path,
				  struct buffer_head *di_bh,
				  struct buffer_head **last_eb_bh,
				  int split_index,
				  struct ocfs2_extent_rec *orig_split_rec,
				  struct ocfs2_alloc_context *meta_ac)
{
	int ret = 0, depth;
	/* do_leftright: 0 = single pass, 1 = first (right) pass of a
	 * middle split, 2 = second (left) pass done. */
	unsigned int insert_range, rec_range, do_leftright = 0;
	struct ocfs2_extent_rec tmprec;
	struct ocfs2_extent_list *rightmost_el;
	struct ocfs2_extent_rec rec;
	struct ocfs2_extent_rec split_rec = *orig_split_rec;
	struct ocfs2_insert_type insert;
	struct ocfs2_extent_block *eb;
	struct ocfs2_dinode *di;

leftright:
	/*
	 * Store a copy of the record on the stack - it might move
	 * around as the tree is manipulated below.
	 */
	rec = path_leaf_el(path)->l_recs[split_index];

	di = (struct ocfs2_dinode *)di_bh->b_data;
	rightmost_el = &di->id2.i_list;

	depth = le16_to_cpu(rightmost_el->l_tree_depth);
	if (depth) {
		BUG_ON(!(*last_eb_bh));
		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
		rightmost_el = &eb->h_list;
	}

	/* No free records in the rightmost list - grow the tree. */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		int old_depth = depth;

		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Growth added a level; re-derive the rightmost list
		 * from the (possibly new) last extent block. */
		if (old_depth != depth) {
			eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
			rightmost_el = &eb->h_list;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
		- le16_to_cpu(rightmost_el->l_next_free_rec);
	insert.ins_tree_depth = depth;

	/* Cluster offsets one past the end of each range. */
	insert_range = le32_to_cpu(split_rec.e_cpos) +
		le16_to_cpu(split_rec.e_leaf_clusters);
	rec_range = le32_to_cpu(rec.e_cpos) +
		le16_to_cpu(rec.e_leaf_clusters);

	if (split_rec.e_cpos == rec.e_cpos) {
		insert.ins_split = SPLIT_LEFT;
	} else if (insert_range == rec_range) {
		insert.ins_split = SPLIT_RIGHT;
	} else {
		/*
		 * Left/right split. We fake this as a right split
		 * first and then make a second pass as a left split.
		 */
		insert.ins_split = SPLIT_RIGHT;

		ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
					   &rec);

		split_rec = tmprec;

		BUG_ON(do_leftright);
		do_leftright = 1;
	}

	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
				     &insert);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (do_leftright == 1) {
		u32 cpos;
		struct ocfs2_extent_list *el;

		do_leftright++;
		split_rec = *orig_split_rec;

		/* The insert may have reshaped the tree - find the
		 * target record's leaf again from scratch. */
		ocfs2_reinit_path(path, 1);

		cpos = le32_to_cpu(split_rec.e_cpos);
		ret = ocfs2_find_path(inode, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		split_index = ocfs2_search_extent_list(el, cpos);
		goto leftright;
	}
out:

	return ret;
}
3932
3933/*
3934 * Mark part or all of the extent record at split_index in the leaf
3935 * pointed to by path as written. This removes the unwritten
3936 * extent flag.
3937 *
3938 * Care is taken to handle contiguousness so as to not grow the tree.
3939 *
3940 * meta_ac is not strictly necessary - we only truly need it if growth
3941 * of the tree is required. All other cases will degrade into a less
3942 * optimal tree layout.
3943 *
3944 * last_eb_bh should be the rightmost leaf block for any inode with a
3945 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
3946 *
3947 * This code is optimized for readability - several passes might be
3948 * made over certain portions of the tree. All of those blocks will
3949 * have been brought into cache (and pinned via the journal), so the
3950 * extra overhead is not expressed in terms of disk reads.
3951 */
3952static int __ocfs2_mark_extent_written(struct inode *inode,
3953 struct buffer_head *di_bh,
3954 handle_t *handle,
3955 struct ocfs2_path *path,
3956 int split_index,
3957 struct ocfs2_extent_rec *split_rec,
3958 struct ocfs2_alloc_context *meta_ac,
3959 struct ocfs2_cached_dealloc_ctxt *dealloc)
3960{
3961 int ret = 0;
3962 struct ocfs2_extent_list *el = path_leaf_el(path);
3963 struct buffer_head *eb_bh, *last_eb_bh = NULL;
3964 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3965 struct ocfs2_merge_ctxt ctxt;
3966 struct ocfs2_extent_list *rightmost_el;
3967
3968 if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
3969 ret = -EIO;
3970 mlog_errno(ret);
3971 goto out;
3972 }
3973
3974 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
3975 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
3976 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
3977 ret = -EIO;
3978 mlog_errno(ret);
3979 goto out;
3980 }
3981
3982 eb_bh = path_leaf_bh(path);
3983 ret = ocfs2_journal_access(handle, inode, eb_bh,
3984 OCFS2_JOURNAL_ACCESS_WRITE);
3985 if (ret) {
3986 mlog_errno(ret);
3987 goto out;
3988 }
3989
3990 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3991 split_index,
3992 split_rec);
3993
3994 /*
3995 * The core merge / split code wants to know how much room is
3996 * left in this inodes allocation tree, so we pass the
3997 * rightmost extent list.
3998 */
3999 if (path->p_tree_depth) {
4000 struct ocfs2_extent_block *eb;
4001 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4002
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4004 le64_to_cpu(di->i_last_eb_blk),
4005 &last_eb_bh, OCFS2_BH_CACHED, inode);
4006 if (ret) {
4007 mlog_exit(ret);
4008 goto out;
4009 }
4010
4011 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4012 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4013 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4014 ret = -EROFS;
4015 goto out;
4016 }
4017
4018 rightmost_el = &eb->h_list;
4019 } else
4020 rightmost_el = path_root_el(path);
4021
4022 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4023 if (ctxt.c_used_tail_recs > 0 &&
4024 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4025 ctxt.c_used_tail_recs--;
4026
4027 if (rec->e_cpos == split_rec->e_cpos &&
4028 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4029 ctxt.c_split_covers_rec = 1;
4030 else
4031 ctxt.c_split_covers_rec = 0;
4032
4033 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4034
4035 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
4036 "has_empty: %u, split_covers: %u\n", split_index,
4037 ctxt.c_contig_type, ctxt.c_used_tail_recs,
4038 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4039
4040 if (ctxt.c_contig_type == CONTIG_NONE) {
4041 if (ctxt.c_split_covers_rec)
4042 el->l_recs[split_index] = *split_rec;
4043 else
4044 ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4045 &last_eb_bh, split_index,
4046 split_rec, meta_ac);
4047 if (ret)
4048 mlog_errno(ret);
4049 } else {
4050 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4051 split_index, split_rec,
4052 dealloc, &ctxt);
4053 if (ret)
4054 mlog_errno(ret);
4055 }
4056
4057 ocfs2_journal_dirty(handle, eb_bh);
4058
4059out:
4060 brelse(last_eb_bh);
4061 return ret;
4062}
4063
4064/*
4065 * Mark the already-existing extent at cpos as written for len clusters.
4066 *
4067 * If the existing extent is larger than the request, initiate a
4068 * split. An attempt will be made at merging with adjacent extents.
4069 *
4070 * The caller is responsible for passing down meta_ac if we'll need it.
4071 */
4072int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4073 handle_t *handle, u32 cpos, u32 len, u32 phys,
4074 struct ocfs2_alloc_context *meta_ac,
4075 struct ocfs2_cached_dealloc_ctxt *dealloc)
4076{
4077 int ret, index;
4078 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4079 struct ocfs2_extent_rec split_rec;
4080 struct ocfs2_path *left_path = NULL;
4081 struct ocfs2_extent_list *el;
4082
4083 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4084 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4085
4086 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4087 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4088 "that are being written to, but the feature bit "
4089 "is not set in the super block.",
4090 (unsigned long long)OCFS2_I(inode)->ip_blkno);
4091 ret = -EROFS;
4092 goto out;
4093 }
4094
4095 /*
4096 * XXX: This should be fixed up so that we just re-insert the
4097 * next extent records.
4098 */
4099 ocfs2_extent_map_trunc(inode, 0);
4100
4101 left_path = ocfs2_new_inode_path(di_bh);
4102 if (!left_path) {
4103 ret = -ENOMEM;
4104 mlog_errno(ret);
4105 goto out;
4106 }
4107
4108 ret = ocfs2_find_path(inode, left_path, cpos);
4109 if (ret) {
4110 mlog_errno(ret);
4111 goto out;
4112 }
4113 el = path_leaf_el(left_path);
4114
4115 index = ocfs2_search_extent_list(el, cpos);
4116 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4117 ocfs2_error(inode->i_sb,
4118 "Inode %llu has an extent at cpos %u which can no "
4119 "longer be found.\n",
4120 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4121 ret = -EROFS;
4122 goto out;
4123 }
4124
4125 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4126 split_rec.e_cpos = cpu_to_le32(cpos);
4127 split_rec.e_leaf_clusters = cpu_to_le16(len);
4128 split_rec.e_blkno = cpu_to_le64(start_blkno);
4129 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4130 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4131
4132 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4133 index, &split_rec, meta_ac, dealloc);
4134 if (ret)
4135 mlog_errno(ret);
4136
4137out:
4138 ocfs2_free_path(left_path);
4139 return ret;
4140}
4141
/*
 * Split the extent record at 'index' in the leaf at the bottom of
 * 'path' so that a separate record begins at cluster offset
 * 'new_range', then re-insert the right-hand piece with a SPLIT_RIGHT
 * insert. The insert may require growing the tree, which is why
 * meta_ac is needed.
 */
static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
			    handle_t *handle, struct ocfs2_path *path,
			    int index, u32 new_range,
			    struct ocfs2_alloc_context *meta_ac)
{
	int ret, depth, credits = handle->h_buffer_credits;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct buffer_head *last_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *rightmost_el, *el;
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_insert_type insert;

	/*
	 * Setup the record to split before we grow the tree.
	 */
	el = path_leaf_el(path);
	rec = &el->l_recs[index];
	ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);

	/* Find the rightmost extent list - the insert code sizes its
	 * work against the amount of free room there. */
	depth = path->p_tree_depth;
	if (depth > 0) {
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				       le64_to_cpu(di->i_last_eb_blk),
				       &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		rightmost_el = &eb->h_list;
	} else
		rightmost_el = path_leaf_el(path);

	/* Extend the transaction to cover a full-depth update plus
	 * whatever metadata a tree extend might dirty. */
	credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
	ret = ocfs2_extend_trans(handle, credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Rightmost list is full - add a level/branch before inserting. */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		int old_depth = depth;

		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Growth changed the rightmost block; re-derive the
		 * rightmost list from the (updated) last_eb_bh. */
		if (old_depth != depth) {
			eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
			rightmost_el = &eb->h_list;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_split = SPLIT_RIGHT;
	insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
		- le16_to_cpu(rightmost_el->l_next_free_rec);
	insert.ins_tree_depth = depth;

	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
	if (ret)
		mlog_errno(ret);

out:
	brelse(last_eb_bh);
	return ret;
}
4218
/*
 * Remove the range (cpos, len) from the extent record at 'index' in
 * the leaf at the bottom of 'path'. The range must share at least one
 * edge with the record - a middle-of-record truncate hits the BUG()
 * below; the caller is expected to have split such a record first.
 *
 * Finishes with a left rotation to fill any hole this creates.
 */
static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
			      struct ocfs2_path *path, int index,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      u32 cpos, u32 len)
{
	int ret;
	u32 left_cpos, rec_range, trunc_range;
	/* NOTE(review): wants_rotate is assigned below but never read
	 * in this function. */
	int wants_rotate = 0, is_rightmost_tree_rec = 0;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;

	/* Rotate a leading empty extent out of the way first; that
	 * shifts our target record down by one. */
	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		index--;
	}

	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
	    path->p_tree_depth) {
		/*
		 * Check whether this is the rightmost tree record. If
		 * we remove all of this record or part of its right
		 * edge then an update of the record lengths above it
		 * will be required.
		 */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
		if (eb->h_next_leaf_blk == 0)
			is_rightmost_tree_rec = 1;
	}

	rec = &el->l_recs[index];
	if (index == 0 && path->p_tree_depth &&
	    le32_to_cpu(rec->e_cpos) == cpos) {
		/*
		 * Changing the leftmost offset (via partial or whole
		 * record truncate) of an interior (or rightmost) path
		 * means we have to update the subtree that is formed
		 * by this leaf and the one to it's left.
		 *
		 * There are two cases we can skip:
		 * 1) Path is the leftmost one in our inode tree.
		 * 2) The leaf is rightmost and will be empty after
		 *    we remove the extent record - the rotate code
		 *    knows how to update the newly formed edge.
		 */

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
			left_path = ocfs2_new_path(path_root_bh(path),
						   path_root_el(path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	}

	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(inode, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* left_path may still be NULL here - presumably
	 * ocfs2_journal_access_path() tolerates that; TODO confirm. */
	ret = ocfs2_journal_access_path(inode, handle, left_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
		/* The truncate covers the whole record - empty it. */
		int next_free;

		memset(rec, 0, sizeof(*rec));
		ocfs2_cleanup_merge(el, index);
		wants_rotate = 1;

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (is_rightmost_tree_rec && next_free > 1) {
			/*
			 * We skip the edge update if this path will
			 * be deleted by the rotate code.
			 */
			rec = &el->l_recs[next_free - 1];
			ocfs2_adjust_rightmost_records(inode, handle, path,
						       rec);
		}
	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
		/* Remove leftmost portion of the record. */
		le32_add_cpu(&rec->e_cpos, len);
		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
		le16_add_cpu(&rec->e_leaf_clusters, -len);
	} else if (rec_range == trunc_range) {
		/* Remove rightmost portion of the record */
		le16_add_cpu(&rec->e_leaf_clusters, -len);
		if (is_rightmost_tree_rec)
			ocfs2_adjust_rightmost_records(inode, handle, path, rec);
	} else {
		/* Caller should have trapped this. */
		mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
		     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
		     le32_to_cpu(rec->e_cpos),
		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
		BUG();
	}

	if (left_path) {
		int subtree_index;

		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
		ocfs2_complete_edge_insert(inode, handle, left_path, path,
					   subtree_index);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(path));

	/* Rotate left again to fill the hole left by the truncate. */
	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out:
	ocfs2_free_path(left_path);
	return ret;
}
4375
/*
 * Remove the clusters in the range (cpos, len) from the extent tree
 * rooted at di_bh. The range must lie entirely within a single extent
 * record (enforced by the BUG_ON below); a middle-of-record removal is
 * first converted into an edge removal via a right split.
 */
int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
			u32 cpos, u32 len, handle_t *handle,
			struct ocfs2_alloc_context *meta_ac,
			struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, index;
	u32 rec_range, trunc_range;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *path;

	/* Invalidate the cached extent map for the whole inode. */
	ocfs2_extent_map_trunc(inode, 0);

	path = ocfs2_new_inode_path(di_bh);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(inode, path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el = path_leaf_el(path);
	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has an extent at cpos %u which can no "
			    "longer be found.\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
		ret = -EROFS;
		goto out;
	}

	/*
	 * We have 3 cases of extent removal:
	 *   1) Range covers the entire extent rec
	 *   2) Range begins or ends on one edge of the extent rec
	 *   3) Range is in the middle of the extent rec (no shared edges)
	 *
	 * For case 1 we remove the extent rec and left rotate to
	 * fill the hole.
	 *
	 * For case 2 we just shrink the existing extent rec, with a
	 * tree update if the shrinking edge is also the edge of an
	 * extent block.
	 *
	 * For case 3 we do a right split to turn the extent rec into
	 * something case 2 can handle.
	 */
	rec = &el->l_recs[index];
	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);

	mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
	     "(cpos %u, len %u)\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));

	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
		/* Cases 1 and 2: shared edge, truncate directly. */
		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
					 cpos, len);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else {
		/* Case 3: split first so the range ends on an edge. */
		ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
				       trunc_range, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The split could have manipulated the tree enough to
		 * move the record location, so we have to look for it again.
		 */
		ocfs2_reinit_path(path, 1);

		ret = ocfs2_find_path(inode, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		index = ocfs2_search_extent_list(el, cpos);
		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu: split at cpos %u lost record.",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    cpos);
			ret = -EROFS;
			goto out;
		}

		/*
		 * Double check our values here. If anything is fishy,
		 * it's easier to catch it at the top level.
		 */
		rec = &el->l_recs[index];
		rec_range = le32_to_cpu(rec->e_cpos) +
			ocfs2_rec_clusters(el, rec);
		if (rec_range != trunc_range) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu: error after split at cpos %u"
				    "trunc len %u, existing record is (%u,%u)",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    cpos, len, le32_to_cpu(rec->e_cpos),
				    ocfs2_rec_clusters(el, rec));
			ret = -EROFS;
			goto out;
		}

		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
					 cpos, len);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(path);
	return ret;
}
4508
4509int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2435{ 4510{
2436 struct buffer_head *tl_bh = osb->osb_tl_bh; 4511 struct buffer_head *tl_bh = osb->osb_tl_bh;
2437 struct ocfs2_dinode *di; 4512 struct ocfs2_dinode *di;
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2464 return current_tail == new_start; 4539 return current_tail == new_start;
2465} 4540}
2466 4541
2467static int ocfs2_truncate_log_append(struct ocfs2_super *osb, 4542int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2468 handle_t *handle, 4543 handle_t *handle,
2469 u64 start_blk, 4544 u64 start_blk,
2470 unsigned int num_clusters) 4545 unsigned int num_clusters)
2471{ 4546{
2472 int status, index; 4547 int status, index;
2473 unsigned int start_cluster, tl_count; 4548 unsigned int start_cluster, tl_count;
@@ -2623,7 +4698,7 @@ bail:
2623} 4698}
2624 4699
2625/* Expects you to already be holding tl_inode->i_mutex */ 4700/* Expects you to already be holding tl_inode->i_mutex */
2626static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 4701int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2627{ 4702{
2628 int status; 4703 int status;
2629 unsigned int num_to_flush; 4704 unsigned int num_to_flush;
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2957 return status; 5032 return status;
2958} 5033}
2959 5034
5035/*
5036 * Delayed de-allocation of suballocator blocks.
5037 *
5038 * Some sets of block de-allocations might involve multiple suballocator inodes.
5039 *
5040 * The locking for this can get extremely complicated, especially when
5041 * the suballocator inodes to delete from aren't known until deep
5042 * within an unrelated codepath.
5043 *
5044 * ocfs2_extent_block structures are a good example of this - an inode
5045 * btree could have been grown by any number of nodes each allocating
5046 * out of their own suballoc inode.
5047 *
5048 * These structures allow the delay of block de-allocation until a
5049 * later time, when locking of multiple cluster inodes won't cause
5050 * deadlock.
5051 */
5052
5053/*
5054 * Describes a single block free from a suballocator
5055 */
struct ocfs2_cached_block_free {
	struct ocfs2_cached_block_free *free_next;	/* singly-linked list */
	u64 free_blk;			/* physical block number to free */
	unsigned int free_bit;		/* bit within its suballoc group */
};
5061
/* One list of cached frees per (system file type, slot) pair; both
 * values are later fed to ocfs2_get_system_file_inode(). */
struct ocfs2_per_slot_free_list {
	struct ocfs2_per_slot_free_list *f_next_suballocator;	/* next list */
	int f_inode_type;	/* suballocator system file type */
	int f_slot;		/* node slot owning that system file */
	struct ocfs2_cached_block_free *f_first;	/* head of cached frees */
};
5068
/*
 * Free every cached suballocator bit on the 'head' list against the
 * system file inode identified by (sysfile_type, slot), consuming the
 * list items as we go. All frees happen under a single meta lock,
 * inside one transaction which is extended between items.
 */
static int ocfs2_free_cached_items(struct ocfs2_super *osb,
				   int sysfile_type,
				   int slot,
				   struct ocfs2_cached_block_free *head)
{
	int ret;
	u64 bg_blkno;
	handle_t *handle;
	struct inode *inode;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_cached_block_free *tmp;

	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto out;
	}

	mutex_lock(&inode->i_mutex);

	ret = ocfs2_meta_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	while (head) {
		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
						      head->free_bit);
		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
		     head->free_bit, (unsigned long long)head->free_blk);

		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
					       head->free_bit, bg_blkno, 1);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		/* Re-up the transaction credits for the next item. */
		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		/* Item fully processed - unlink and free it. */
		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

out_journal:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	ocfs2_meta_unlock(inode, 1);
	brelse(di_bh);
out_mutex:
	mutex_unlock(&inode->i_mutex);
	iput(inode);
out:
	while(head) {
		/* Premature exit may have left some dangling items. */
		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

	return ret;
}
5146
5147int ocfs2_run_deallocs(struct ocfs2_super *osb,
5148 struct ocfs2_cached_dealloc_ctxt *ctxt)
5149{
5150 int ret = 0, ret2;
5151 struct ocfs2_per_slot_free_list *fl;
5152
5153 if (!ctxt)
5154 return 0;
5155
5156 while (ctxt->c_first_suballocator) {
5157 fl = ctxt->c_first_suballocator;
5158
5159 if (fl->f_first) {
5160 mlog(0, "Free items: (type %u, slot %d)\n",
5161 fl->f_inode_type, fl->f_slot);
5162 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5163 fl->f_slot, fl->f_first);
5164 if (ret2)
5165 mlog_errno(ret2);
5166 if (!ret)
5167 ret = ret2;
5168 }
5169
5170 ctxt->c_first_suballocator = fl->f_next_suballocator;
5171 kfree(fl);
5172 }
5173
5174 return ret;
5175}
5176
5177static struct ocfs2_per_slot_free_list *
5178ocfs2_find_per_slot_free_list(int type,
5179 int slot,
5180 struct ocfs2_cached_dealloc_ctxt *ctxt)
5181{
5182 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5183
5184 while (fl) {
5185 if (fl->f_inode_type == type && fl->f_slot == slot)
5186 return fl;
5187
5188 fl = fl->f_next_suballocator;
5189 }
5190
5191 fl = kmalloc(sizeof(*fl), GFP_NOFS);
5192 if (fl) {
5193 fl->f_inode_type = type;
5194 fl->f_slot = slot;
5195 fl->f_first = NULL;
5196 fl->f_next_suballocator = ctxt->c_first_suballocator;
5197
5198 ctxt->c_first_suballocator = fl;
5199 }
5200 return fl;
5201}
5202
5203static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5204 int type, int slot, u64 blkno,
5205 unsigned int bit)
5206{
5207 int ret;
5208 struct ocfs2_per_slot_free_list *fl;
5209 struct ocfs2_cached_block_free *item;
5210
5211 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5212 if (fl == NULL) {
5213 ret = -ENOMEM;
5214 mlog_errno(ret);
5215 goto out;
5216 }
5217
5218 item = kmalloc(sizeof(*item), GFP_NOFS);
5219 if (item == NULL) {
5220 ret = -ENOMEM;
5221 mlog_errno(ret);
5222 goto out;
5223 }
5224
5225 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5226 type, slot, bit, (unsigned long long)blkno);
5227
5228 item->free_blk = blkno;
5229 item->free_bit = bit;
5230 item->free_next = fl->f_first;
5231
5232 fl->f_first = item;
5233
5234 ret = 0;
5235out:
5236 return ret;
5237}
5238
5239static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5240 struct ocfs2_extent_block *eb)
5241{
5242 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5243 le16_to_cpu(eb->h_suballoc_slot),
5244 le64_to_cpu(eb->h_blkno),
5245 le16_to_cpu(eb->h_suballoc_bit));
5246}
5247
2960/* This function will figure out whether the currently last extent 5248/* This function will figure out whether the currently last extent
2961 * block will be deleted, and if it will, what the new last extent 5249 * block will be deleted, and if it will, what the new last extent
2962 * block will be so we can update his h_next_leaf_blk field, as well 5250 * block will be so we can update his h_next_leaf_blk field, as well
@@ -3238,27 +5526,10 @@ delete:
3238 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 5526 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 5527 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240 5528
3241 if (le16_to_cpu(eb->h_suballoc_slot) == 0) { 5529 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
3242 /* 5530 /* An error here is not fatal. */
3243 * This code only understands how to 5531 if (ret < 0)
3244 * lock the suballocator in slot 0, 5532 mlog_errno(ret);
3245 * which is fine because allocation is
3246 * only ever done out of that
3247 * suballocator too. A future version
3248 * might change that however, so avoid
3249 * a free if we don't know how to
3250 * handle it. This way an fs incompat
3251 * bit will not be necessary.
3252 */
3253 ret = ocfs2_free_extent_block(handle,
3254 tc->tc_ext_alloc_inode,
3255 tc->tc_ext_alloc_bh,
3256 eb);
3257
3258 /* An error here is not fatal. */
3259 if (ret < 0)
3260 mlog_errno(ret);
3261 }
3262 } else { 5533 } else {
3263 deleted_eb = 0; 5534 deleted_eb = 0;
3264 } 5535 }
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3397 return ocfs2_journal_dirty_data(handle, bh); 5668 return ocfs2_journal_dirty_data(handle, bh);
3398} 5669}
3399 5670
3400static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, 5671static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
3401 struct page **pages, int numpages, 5672 loff_t end, struct page **pages,
3402 u64 phys, handle_t *handle) 5673 int numpages, u64 phys, handle_t *handle)
3403{ 5674{
3404 int i, ret, partial = 0; 5675 int i, ret, partial = 0;
3405 void *kaddr; 5676 void *kaddr;
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3412 if (numpages == 0) 5683 if (numpages == 0)
3413 goto out; 5684 goto out;
3414 5685
3415 from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ 5686 to = PAGE_CACHE_SIZE;
3416 if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417 /*
3418 * Since 'from' has been capped to a value below page
3419 * size, this calculation won't be able to overflow
3420 * 'to'
3421 */
3422 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424 /*
3425 * The truncate tail in this case should never contain
3426 * more than one page at maximum. The loop below also
3427 * assumes this.
3428 */
3429 BUG_ON(numpages != 1);
3430 }
3431
3432 for(i = 0; i < numpages; i++) { 5687 for(i = 0; i < numpages; i++) {
3433 page = pages[i]; 5688 page = pages[i];
3434 5689
5690 from = start & (PAGE_CACHE_SIZE - 1);
5691 if ((end >> PAGE_CACHE_SHIFT) == page->index)
5692 to = end & (PAGE_CACHE_SIZE - 1);
5693
3435 BUG_ON(from > PAGE_CACHE_SIZE); 5694 BUG_ON(from > PAGE_CACHE_SIZE);
3436 BUG_ON(to > PAGE_CACHE_SIZE); 5695 BUG_ON(to > PAGE_CACHE_SIZE);
3437 5696
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3468 5727
3469 flush_dcache_page(page); 5728 flush_dcache_page(page);
3470 5729
3471 /* 5730 start = (page->index + 1) << PAGE_CACHE_SHIFT;
3472 * Every page after the 1st one should be completely zero'd.
3473 */
3474 from = 0;
3475 } 5731 }
3476out: 5732out:
3477 if (pages) { 5733 if (pages) {
@@ -3484,24 +5740,26 @@ out:
3484 } 5740 }
3485} 5741}
3486 5742
3487static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, 5743static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
3488 int *num, u64 *phys) 5744 struct page **pages, int *num, u64 *phys)
3489{ 5745{
3490 int i, numpages = 0, ret = 0; 5746 int i, numpages = 0, ret = 0;
3491 unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492 unsigned int ext_flags; 5747 unsigned int ext_flags;
3493 struct super_block *sb = inode->i_sb; 5748 struct super_block *sb = inode->i_sb;
3494 struct address_space *mapping = inode->i_mapping; 5749 struct address_space *mapping = inode->i_mapping;
3495 unsigned long index; 5750 unsigned long index;
3496 u64 next_cluster_bytes; 5751 loff_t last_page_bytes;
3497 5752
3498 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 5753 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
5754 BUG_ON(start > end);
3499 5755
3500 /* Cluster boundary, so we don't need to grab any pages. */ 5756 if (start == end)
3501 if ((isize & (csize - 1)) == 0)
3502 goto out; 5757 goto out;
3503 5758
3504 ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 5759 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
5760 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
5761
5762 ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
3505 phys, NULL, &ext_flags); 5763 phys, NULL, &ext_flags);
3506 if (ret) { 5764 if (ret) {
3507 mlog_errno(ret); 5765 mlog_errno(ret);
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3517 if (ext_flags & OCFS2_EXT_UNWRITTEN) 5775 if (ext_flags & OCFS2_EXT_UNWRITTEN)
3518 goto out; 5776 goto out;
3519 5777
3520 next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 5778 last_page_bytes = PAGE_ALIGN(end);
3521 index = isize >> PAGE_CACHE_SHIFT; 5779 index = start >> PAGE_CACHE_SHIFT;
3522 do { 5780 do {
3523 pages[numpages] = grab_cache_page(mapping, index); 5781 pages[numpages] = grab_cache_page(mapping, index);
3524 if (!pages[numpages]) { 5782 if (!pages[numpages]) {
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3529 5787
3530 numpages++; 5788 numpages++;
3531 index++; 5789 index++;
3532 } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 5790 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
3533 5791
3534out: 5792out:
3535 if (ret != 0) { 5793 if (ret != 0) {
@@ -3558,11 +5816,10 @@ out:
3558 * otherwise block_write_full_page() will skip writeout of pages past 5816 * otherwise block_write_full_page() will skip writeout of pages past
3559 * i_size. The new_i_size parameter is passed for this reason. 5817 * i_size. The new_i_size parameter is passed for this reason.
3560 */ 5818 */
3561int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 5819int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
3562 u64 new_i_size) 5820 u64 range_start, u64 range_end)
3563{ 5821{
3564 int ret, numpages; 5822 int ret, numpages;
3565 loff_t endbyte;
3566 struct page **pages = NULL; 5823 struct page **pages = NULL;
3567 u64 phys; 5824 u64 phys;
3568 5825
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3581 goto out; 5838 goto out;
3582 } 5839 }
3583 5840
3584 ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 5841 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
5842 &numpages, &phys);
3585 if (ret) { 5843 if (ret) {
3586 mlog_errno(ret); 5844 mlog_errno(ret);
3587 goto out; 5845 goto out;
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3590 if (numpages == 0) 5848 if (numpages == 0)
3591 goto out; 5849 goto out;
3592 5850
3593 ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 5851 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
3594 handle); 5852 numpages, phys, handle);
3595 5853
3596 /* 5854 /*
3597 * Initiate writeout of the pages we zero'd here. We don't 5855 * Initiate writeout of the pages we zero'd here. We don't
3598 * wait on them - the truncate_inode_pages() call later will 5856 * wait on them - the truncate_inode_pages() call later will
3599 * do that for us. 5857 * do that for us.
3600 */ 5858 */
3601 endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 5859 ret = do_sync_mapping_range(inode->i_mapping, range_start,
3602 ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 5860 range_end - 1, SYNC_FILE_RANGE_WRITE);
3603 endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604 if (ret) 5861 if (ret)
3605 mlog_errno(ret); 5862 mlog_errno(ret);
3606 5863
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
3631 5888
3632 mlog_entry_void(); 5889 mlog_entry_void();
3633 5890
3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
3635
3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 5891 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3637 i_size_read(inode)); 5892 i_size_read(inode));
3638 5893
@@ -3754,7 +6009,6 @@ start:
3754 goto start; 6009 goto start;
3755 6010
3756bail: 6011bail:
3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
3758 6012
3759 ocfs2_schedule_truncate_log_flush(osb, 1); 6013 ocfs2_schedule_truncate_log_flush(osb, 1);
3760 6014
@@ -3764,6 +6018,8 @@ bail:
3764 if (handle) 6018 if (handle)
3765 ocfs2_commit_trans(osb, handle); 6019 ocfs2_commit_trans(osb, handle);
3766 6020
6021 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6022
3767 ocfs2_free_path(path); 6023 ocfs2_free_path(path);
3768 6024
3769 /* This will drop the ext_alloc cluster lock for us */ 6025 /* This will drop the ext_alloc cluster lock for us */
@@ -3774,23 +6030,18 @@ bail:
3774} 6030}
3775 6031
3776/* 6032/*
3777 * Expects the inode to already be locked. This will figure out which 6033 * Expects the inode to already be locked.
3778 * inodes need to be locked and will put them on the returned truncate
3779 * context.
3780 */ 6034 */
3781int ocfs2_prepare_truncate(struct ocfs2_super *osb, 6035int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3782 struct inode *inode, 6036 struct inode *inode,
3783 struct buffer_head *fe_bh, 6037 struct buffer_head *fe_bh,
3784 struct ocfs2_truncate_context **tc) 6038 struct ocfs2_truncate_context **tc)
3785{ 6039{
3786 int status, metadata_delete, i; 6040 int status;
3787 unsigned int new_i_clusters; 6041 unsigned int new_i_clusters;
3788 struct ocfs2_dinode *fe; 6042 struct ocfs2_dinode *fe;
3789 struct ocfs2_extent_block *eb; 6043 struct ocfs2_extent_block *eb;
3790 struct ocfs2_extent_list *el;
3791 struct buffer_head *last_eb_bh = NULL; 6044 struct buffer_head *last_eb_bh = NULL;
3792 struct inode *ext_alloc_inode = NULL;
3793 struct buffer_head *ext_alloc_bh = NULL;
3794 6045
3795 mlog_entry_void(); 6046 mlog_entry_void();
3796 6047
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3810 mlog_errno(status); 6061 mlog_errno(status);
3811 goto bail; 6062 goto bail;
3812 } 6063 }
6064 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
3813 6065
3814 metadata_delete = 0;
3815 if (fe->id2.i_list.l_tree_depth) { 6066 if (fe->id2.i_list.l_tree_depth) {
3816 /* If we have a tree, then the truncate may result in
3817 * metadata deletes. Figure this out from the
3818 * rightmost leaf block.*/
3819 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6067 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3820 &last_eb_bh, OCFS2_BH_CACHED, inode); 6068 &last_eb_bh, OCFS2_BH_CACHED, inode);
3821 if (status < 0) { 6069 if (status < 0) {
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3830 status = -EIO; 6078 status = -EIO;
3831 goto bail; 6079 goto bail;
3832 } 6080 }
3833 el = &(eb->h_list);
3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3843 metadata_delete = 1;
3844 } 6081 }
3845 6082
3846 (*tc)->tc_last_eb_bh = last_eb_bh; 6083 (*tc)->tc_last_eb_bh = last_eb_bh;
3847 6084
3848 if (metadata_delete) {
3849 mlog(0, "Will have to delete metadata for this trunc. "
3850 "locking allocator.\n");
3851 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3852 if (!ext_alloc_inode) {
3853 status = -ENOMEM;
3854 mlog_errno(status);
3855 goto bail;
3856 }
3857
3858 mutex_lock(&ext_alloc_inode->i_mutex);
3859 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3860
3861 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3862 if (status < 0) {
3863 mlog_errno(status);
3864 goto bail;
3865 }
3866 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3867 (*tc)->tc_ext_alloc_locked = 1;
3868 }
3869
3870 status = 0; 6085 status = 0;
3871bail: 6086bail:
3872 if (status < 0) { 6087 if (status < 0) {
@@ -3880,16 +6095,13 @@ bail:
3880 6095
3881static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 6096static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3882{ 6097{
3883 if (tc->tc_ext_alloc_inode) { 6098 /*
3884 if (tc->tc_ext_alloc_locked) 6099 * The caller is responsible for completing deallocation
3885 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); 6100 * before freeing the context.
3886 6101 */
3887 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); 6102 if (tc->tc_dealloc.c_first_suballocator != NULL)
3888 iput(tc->tc_ext_alloc_inode); 6103 mlog(ML_NOTICE,
3889 } 6104 "Truncate completion has non-empty dealloc context\n");
3890
3891 if (tc->tc_ext_alloc_bh)
3892 brelse(tc->tc_ext_alloc_bh);
3893 6105
3894 if (tc->tc_last_eb_bh) 6106 if (tc->tc_last_eb_bh)
3895 brelse(tc->tc_last_eb_bh); 6107 brelse(tc->tc_last_eb_bh);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fbcb5934a0..990df48ae8 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -34,7 +34,17 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
34 u32 cpos, 34 u32 cpos,
35 u64 start_blk, 35 u64 start_blk,
36 u32 new_clusters, 36 u32 new_clusters,
37 u8 flags,
37 struct ocfs2_alloc_context *meta_ac); 38 struct ocfs2_alloc_context *meta_ac);
39struct ocfs2_cached_dealloc_ctxt;
40int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
41 handle_t *handle, u32 cpos, u32 len, u32 phys,
42 struct ocfs2_alloc_context *meta_ac,
43 struct ocfs2_cached_dealloc_ctxt *dealloc);
44int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
45 u32 cpos, u32 len, handle_t *handle,
46 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc);
38int ocfs2_num_free_extents(struct ocfs2_super *osb, 48int ocfs2_num_free_extents(struct ocfs2_super *osb,
39 struct inode *inode, 49 struct inode *inode,
40 struct ocfs2_dinode *fe); 50 struct ocfs2_dinode *fe);
@@ -62,17 +72,41 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
62 struct ocfs2_dinode **tl_copy); 72 struct ocfs2_dinode **tl_copy);
63int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 73int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
64 struct ocfs2_dinode *tl_copy); 74 struct ocfs2_dinode *tl_copy);
75int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
76int ocfs2_truncate_log_append(struct ocfs2_super *osb,
77 handle_t *handle,
78 u64 start_blk,
79 unsigned int num_clusters);
80int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
81
82/*
83 * Process local structure which describes the block unlinks done
84 * during an operation. This is populated via
85 * ocfs2_cache_block_dealloc().
86 *
87 * ocfs2_run_deallocs() should be called after the potentially
88 * de-allocating routines. No journal handles should be open, and most
89 * locks should have been dropped.
90 */
91struct ocfs2_cached_dealloc_ctxt {
92 struct ocfs2_per_slot_free_list *c_first_suballocator;
93};
94static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
95{
96 c->c_first_suballocator = NULL;
97}
98int ocfs2_run_deallocs(struct ocfs2_super *osb,
99 struct ocfs2_cached_dealloc_ctxt *ctxt);
65 100
66struct ocfs2_truncate_context { 101struct ocfs2_truncate_context {
67 struct inode *tc_ext_alloc_inode; 102 struct ocfs2_cached_dealloc_ctxt tc_dealloc;
68 struct buffer_head *tc_ext_alloc_bh;
69 int tc_ext_alloc_locked; /* is it cluster locked? */ 103 int tc_ext_alloc_locked; /* is it cluster locked? */
70 /* these get destroyed once it's passed to ocfs2_commit_truncate. */ 104 /* these get destroyed once it's passed to ocfs2_commit_truncate. */
71 struct buffer_head *tc_last_eb_bh; 105 struct buffer_head *tc_last_eb_bh;
72}; 106};
73 107
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 108int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size); 109 u64 range_start, u64 range_end);
76int ocfs2_prepare_truncate(struct ocfs2_super *osb, 110int ocfs2_prepare_truncate(struct ocfs2_super *osb,
77 struct inode *inode, 111 struct inode *inode,
78 struct buffer_head *fe_bh, 112 struct buffer_head *fe_bh,
@@ -84,6 +118,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
84 118
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 119int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh); 120 u32 cpos, struct buffer_head **leaf_bh);
121int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
87 122
88/* 123/*
89 * Helper function to look at the # of clusters in an extent record. 124 * Helper function to look at the # of clusters in an extent record.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a480b09c79..84bf6e79de 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
684 bh = bh->b_this_page, block_start += bsize) { 684 bh = bh->b_this_page, block_start += bsize) {
685 block_end = block_start + bsize; 685 block_end = block_start + bsize;
686 686
687 clear_buffer_new(bh);
688
687 /* 689 /*
688 * Ignore blocks outside of our i/o range - 690 * Ignore blocks outside of our i/o range -
689 * they may belong to unallocated clusters. 691 * they may belong to unallocated clusters.
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
698 * For an allocating write with cluster size >= page 700 * For an allocating write with cluster size >= page
699 * size, we always write the entire page. 701 * size, we always write the entire page.
700 */ 702 */
701 703 if (new)
702 if (buffer_new(bh)) 704 set_buffer_new(bh);
703 clear_buffer_new(bh);
704 705
705 if (!buffer_mapped(bh)) { 706 if (!buffer_mapped(bh)) {
706 map_bh(bh, inode->i_sb, *p_blkno); 707 map_bh(bh, inode->i_sb, *p_blkno);
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
711 if (!buffer_uptodate(bh)) 712 if (!buffer_uptodate(bh))
712 set_buffer_uptodate(bh); 713 set_buffer_uptodate(bh);
713 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 714 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
714 (block_start < from || block_end > to)) { 715 !buffer_new(bh) &&
716 (block_start < from || block_end > to)) {
715 ll_rw_block(READ, 1, &bh); 717 ll_rw_block(READ, 1, &bh);
716 *wait_bh++=bh; 718 *wait_bh++=bh;
717 } 719 }
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
738 bh = head; 740 bh = head;
739 block_start = 0; 741 block_start = 0;
740 do { 742 do {
741 void *kaddr;
742
743 block_end = block_start + bsize; 743 block_end = block_start + bsize;
744 if (block_end <= from) 744 if (block_end <= from)
745 goto next_bh; 745 goto next_bh;
746 if (block_start >= to) 746 if (block_start >= to)
747 break; 747 break;
748 748
749 kaddr = kmap_atomic(page, KM_USER0); 749 zero_user_page(page, block_start, bh->b_size, KM_USER0);
750 memset(kaddr+block_start, 0, bh->b_size);
751 flush_dcache_page(page);
752 kunmap_atomic(kaddr, KM_USER0);
753 set_buffer_uptodate(bh); 750 set_buffer_uptodate(bh);
754 mark_buffer_dirty(bh); 751 mark_buffer_dirty(bh);
755 752
@@ -761,217 +758,240 @@ next_bh:
761 return ret; 758 return ret;
762} 759}
763 760
761#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
762#define OCFS2_MAX_CTXT_PAGES 1
763#else
764#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
765#endif
766
767#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
768
764/* 769/*
765 * This will copy user data from the buffer page in the splice 770 * Describe the state of a single cluster to be written to.
766 * context.
767 *
768 * For now, we ignore SPLICE_F_MOVE as that would require some extra
769 * communication out all the way to ocfs2_write().
770 */ 771 */
771int ocfs2_map_and_write_splice_data(struct inode *inode, 772struct ocfs2_write_cluster_desc {
772 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 773 u32 c_cpos;
773 unsigned int *ret_from, unsigned int *ret_to) 774 u32 c_phys;
775 /*
776 * Give this a unique field because c_phys eventually gets
777 * filled.
778 */
779 unsigned c_new;
780 unsigned c_unwritten;
781};
782
783static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
774{ 784{
775 int ret; 785 return d->c_new || d->c_unwritten;
776 unsigned int to, from, cluster_start, cluster_end; 786}
777 char *src, *dst;
778 struct ocfs2_splice_write_priv *sp = wc->w_private;
779 struct pipe_buffer *buf = sp->s_buf;
780 unsigned long bytes, src_from;
781 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
782 787
783 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 788struct ocfs2_write_ctxt {
784 &cluster_end); 789 /* Logical cluster position / len of write */
790 u32 w_cpos;
791 u32 w_clen;
785 792
786 from = sp->s_offset; 793 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
787 src_from = sp->s_buf_offset;
788 bytes = wc->w_count;
789 794
790 if (wc->w_large_pages) { 795 /*
791 /* 796 * This is true if page_size > cluster_size.
792 * For cluster size < page size, we have to 797 *
793 * calculate pos within the cluster and obey 798 * It triggers a set of special cases during write which might
794 * the rightmost boundary. 799 * have to deal with allocating writes to partial pages.
795 */ 800 */
796 bytes = min(bytes, (unsigned long)(osb->s_clustersize 801 unsigned int w_large_pages;
797 - (wc->w_pos & (osb->s_clustersize - 1)))); 802
798 } 803 /*
799 to = from + bytes; 804 * Pages involved in this write.
805 *
806 * w_target_page is the page being written to by the user.
807 *
808 * w_pages is an array of pages which always contains
809 * w_target_page, and in the case of an allocating write with
810 * page_size < cluster size, it will contain zero'd and mapped
811 * pages adjacent to w_target_page which need to be written
812 * out in so that future reads from that region will get
813 * zero's.
814 */
815 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
816 unsigned int w_num_pages;
817 struct page *w_target_page;
800 818
801 BUG_ON(from > PAGE_CACHE_SIZE); 819 /*
802 BUG_ON(to > PAGE_CACHE_SIZE); 820 * ocfs2_write_end() uses this to know what the real range to
803 BUG_ON(from < cluster_start); 821 * write in the target should be.
804 BUG_ON(to > cluster_end); 822 */
823 unsigned int w_target_from;
824 unsigned int w_target_to;
805 825
806 if (wc->w_this_page_new) 826 /*
807 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 827 * We could use journal_current_handle() but this is cleaner,
808 cluster_start, cluster_end, 1); 828 * IMHO -Mark
809 else 829 */
810 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 830 handle_t *w_handle;
811 from, to, 0); 831
812 if (ret) { 832 struct buffer_head *w_di_bh;
813 mlog_errno(ret); 833
814 goto out; 834 struct ocfs2_cached_dealloc_ctxt w_dealloc;
835};
836
837static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
838{
839 int i;
840
841 for(i = 0; i < wc->w_num_pages; i++) {
842 if (wc->w_pages[i] == NULL)
843 continue;
844
845 unlock_page(wc->w_pages[i]);
846 mark_page_accessed(wc->w_pages[i]);
847 page_cache_release(wc->w_pages[i]);
815 } 848 }
816 849
817 src = buf->ops->map(sp->s_pipe, buf, 1); 850 brelse(wc->w_di_bh);
818 dst = kmap_atomic(wc->w_this_page, KM_USER1); 851 kfree(wc);
819 memcpy(dst + from, src + src_from, bytes); 852}
820 kunmap_atomic(wc->w_this_page, KM_USER1); 853
821 buf->ops->unmap(sp->s_pipe, buf, src); 854static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
855 struct ocfs2_super *osb, loff_t pos,
856 unsigned len, struct buffer_head *di_bh)
857{
858 struct ocfs2_write_ctxt *wc;
859
860 wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
861 if (!wc)
862 return -ENOMEM;
822 863
823 wc->w_finished_copy = 1; 864 wc->w_cpos = pos >> osb->s_clustersize_bits;
865 wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
866 get_bh(di_bh);
867 wc->w_di_bh = di_bh;
824 868
825 *ret_from = from; 869 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
826 *ret_to = to; 870 wc->w_large_pages = 1;
827out: 871 else
872 wc->w_large_pages = 0;
873
874 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
875
876 *wcp = wc;
828 877
829 return bytes ? (unsigned int)bytes : ret; 878 return 0;
830} 879}
831 880
832/* 881/*
833 * This will copy user data from the iovec in the buffered write 882 * If a page has any new buffers, zero them out here, and mark them uptodate
834 * context. 883 * and dirty so they'll be written out (in order to prevent uninitialised
884 * block data from leaking). And clear the new bit.
835 */ 885 */
836int ocfs2_map_and_write_user_data(struct inode *inode, 886static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
837 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
838 unsigned int *ret_from, unsigned int *ret_to)
839{ 887{
840 int ret; 888 unsigned int block_start, block_end;
841 unsigned int to, from, cluster_start, cluster_end; 889 struct buffer_head *head, *bh;
842 unsigned long bytes, src_from;
843 char *dst;
844 struct ocfs2_buffered_write_priv *bp = wc->w_private;
845 const struct iovec *cur_iov = bp->b_cur_iov;
846 char __user *buf;
847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
848 890
849 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 891 BUG_ON(!PageLocked(page));
850 &cluster_end); 892 if (!page_has_buffers(page))
893 return;
851 894
852 buf = cur_iov->iov_base + bp->b_cur_off; 895 bh = head = page_buffers(page);
853 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; 896 block_start = 0;
897 do {
898 block_end = block_start + bh->b_size;
854 899
855 from = wc->w_pos & (PAGE_CACHE_SIZE - 1); 900 if (buffer_new(bh)) {
901 if (block_end > from && block_start < to) {
902 if (!PageUptodate(page)) {
903 unsigned start, end;
856 904
857 /* 905 start = max(from, block_start);
858 * This is a lot of comparisons, but it reads quite 906 end = min(to, block_end);
859 * easily, which is important here.
860 */
861 /* Stay within the src page */
862 bytes = PAGE_SIZE - src_from;
863 /* Stay within the vector */
864 bytes = min(bytes,
865 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
866 /* Stay within count */
867 bytes = min(bytes, (unsigned long)wc->w_count);
868 /*
869 * For clustersize > page size, just stay within
870 * target page, otherwise we have to calculate pos
871 * within the cluster and obey the rightmost
872 * boundary.
873 */
874 if (wc->w_large_pages) {
875 /*
876 * For cluster size < page size, we have to
877 * calculate pos within the cluster and obey
878 * the rightmost boundary.
879 */
880 bytes = min(bytes, (unsigned long)(osb->s_clustersize
881 - (wc->w_pos & (osb->s_clustersize - 1))));
882 } else {
883 /*
884 * cluster size > page size is the most common
885 * case - we just stay within the target page
886 * boundary.
887 */
888 bytes = min(bytes, PAGE_CACHE_SIZE - from);
889 }
890 907
891 to = from + bytes; 908 zero_user_page(page, start, end - start, KM_USER0);
909 set_buffer_uptodate(bh);
910 }
892 911
893 BUG_ON(from > PAGE_CACHE_SIZE); 912 clear_buffer_new(bh);
894 BUG_ON(to > PAGE_CACHE_SIZE); 913 mark_buffer_dirty(bh);
895 BUG_ON(from < cluster_start); 914 }
896 BUG_ON(to > cluster_end); 915 }
897 916
898 if (wc->w_this_page_new) 917 block_start = block_end;
899 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 918 bh = bh->b_this_page;
900 cluster_start, cluster_end, 1); 919 } while (bh != head);
901 else 920}
902 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
903 from, to, 0);
904 if (ret) {
905 mlog_errno(ret);
906 goto out;
907 }
908 921
909 dst = kmap(wc->w_this_page); 922/*
910 memcpy(dst + from, bp->b_src_buf + src_from, bytes); 923 * Only called when we have a failure during allocating write to write
911 kunmap(wc->w_this_page); 924 * zero's to the newly allocated region.
925 */
926static void ocfs2_write_failure(struct inode *inode,
927 struct ocfs2_write_ctxt *wc,
928 loff_t user_pos, unsigned user_len)
929{
930 int i;
931 unsigned from, to;
932 struct page *tmppage;
912 933
913 /* 934 ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
914 * XXX: This is slow, but simple. The caller of
915 * ocfs2_buffered_write_cluster() is responsible for
916 * passing through the iovecs, so it's difficult to
917 * predict what our next step is in here after our
918 * initial write. A future version should be pushing
919 * that iovec manipulation further down.
920 *
921 * By setting this, we indicate that a copy from user
922 * data was done, and subsequent calls for this
923 * cluster will skip copying more data.
924 */
925 wc->w_finished_copy = 1;
926 935
927 *ret_from = from; 936 if (wc->w_large_pages) {
928 *ret_to = to; 937 from = wc->w_target_from;
929out: 938 to = wc->w_target_to;
939 } else {
940 from = 0;
941 to = PAGE_CACHE_SIZE;
942 }
943
944 for(i = 0; i < wc->w_num_pages; i++) {
945 tmppage = wc->w_pages[i];
930 946
931 return bytes ? (unsigned int)bytes : ret; 947 if (ocfs2_should_order_data(inode))
948 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
949 from, to, NULL,
950 ocfs2_journal_dirty_data);
951
952 block_commit_write(tmppage, from, to);
953 }
932} 954}
933 955
934/* 956static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
935 * Map, fill and write a page to disk. 957 struct ocfs2_write_ctxt *wc,
936 * 958 struct page *page, u32 cpos,
937 * The work of copying data is done via callback. Newly allocated 959 loff_t user_pos, unsigned user_len,
938 * pages which don't take user data will be zero'd (set 'new' to 960 int new)
939 * indicate an allocating write)
940 *
941 * Returns a negative error code or the number of bytes copied into
942 * the page.
943 */
944static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
945 u64 *p_blkno, struct page *page,
946 struct ocfs2_write_ctxt *wc, int new)
947{ 961{
948 int ret, copied = 0; 962 int ret;
949 unsigned int from = 0, to = 0; 963 unsigned int map_from = 0, map_to = 0;
950 unsigned int cluster_start, cluster_end; 964 unsigned int cluster_start, cluster_end;
951 unsigned int zero_from = 0, zero_to = 0; 965 unsigned int user_data_from = 0, user_data_to = 0;
952 966
953 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, 967 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
954 &cluster_start, &cluster_end); 968 &cluster_start, &cluster_end);
955 969
956 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index 970 if (page == wc->w_target_page) {
957 && !wc->w_finished_copy) { 971 map_from = user_pos & (PAGE_CACHE_SIZE - 1);
958 972 map_to = map_from + user_len;
959 wc->w_this_page = page; 973
960 wc->w_this_page_new = new; 974 if (new)
961 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); 975 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
962 if (ret < 0) { 976 cluster_start, cluster_end,
977 new);
978 else
979 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
980 map_from, map_to, new);
981 if (ret) {
963 mlog_errno(ret); 982 mlog_errno(ret);
964 goto out; 983 goto out;
965 } 984 }
966 985
967 copied = ret; 986 user_data_from = map_from;
968 987 user_data_to = map_to;
969 zero_from = from;
970 zero_to = to;
971 if (new) { 988 if (new) {
972 from = cluster_start; 989 map_from = cluster_start;
973 to = cluster_end; 990 map_to = cluster_end;
974 } 991 }
992
993 wc->w_target_from = map_from;
994 wc->w_target_to = map_to;
975 } else { 995 } else {
976 /* 996 /*
977 * If we haven't allocated the new page yet, we 997 * If we haven't allocated the new page yet, we
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
980 */ 1000 */
981 BUG_ON(!new); 1001 BUG_ON(!new);
982 1002
983 from = cluster_start; 1003 map_from = cluster_start;
984 to = cluster_end; 1004 map_to = cluster_end;
985 1005
986 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1006 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
987 cluster_start, cluster_end, 1); 1007 cluster_start, cluster_end, new);
988 if (ret) { 1008 if (ret) {
989 mlog_errno(ret); 1009 mlog_errno(ret);
990 goto out; 1010 goto out;
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
1003 */ 1023 */
1004 if (new && !PageUptodate(page)) 1024 if (new && !PageUptodate(page))
1005 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1025 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1006 wc->w_cpos, zero_from, zero_to); 1026 cpos, user_data_from, user_data_to);
1007 1027
1008 flush_dcache_page(page); 1028 flush_dcache_page(page);
1009 1029
1010 if (ocfs2_should_order_data(inode)) {
1011 ret = walk_page_buffers(handle,
1012 page_buffers(page),
1013 from, to, NULL,
1014 ocfs2_journal_dirty_data);
1015 if (ret < 0)
1016 mlog_errno(ret);
1017 }
1018
1019 /*
1020 * We don't use generic_commit_write() because we need to
1021 * handle our own i_size update.
1022 */
1023 ret = block_commit_write(page, from, to);
1024 if (ret)
1025 mlog_errno(ret);
1026out: 1030out:
1027 1031 return ret;
1028 return copied ? copied : ret;
1029} 1032}
1030 1033
1031/* 1034/*
1032 * Do the actual write of some data into an inode. Optionally allocate 1035 * This function will only grab one clusters worth of pages.
1033 * in order to fulfill the write.
1034 *
1035 * cpos is the logical cluster offset within the file to write at
1036 *
1037 * 'phys' is the physical mapping of that offset. a 'phys' value of
1038 * zero indicates that allocation is required. In this case, data_ac
1039 * and meta_ac should be valid (meta_ac can be null if metadata
1040 * allocation isn't required).
1041 */ 1036 */
1042static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, 1037static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1043 struct buffer_head *di_bh, 1038 struct ocfs2_write_ctxt *wc,
1044 struct ocfs2_alloc_context *data_ac, 1039 u32 cpos, loff_t user_pos, int new,
1045 struct ocfs2_alloc_context *meta_ac, 1040 struct page *mmap_page)
1046 struct ocfs2_write_ctxt *wc)
1047{ 1041{
1048 int ret, i, numpages = 1, new; 1042 int ret = 0, i;
1049 unsigned int copied = 0; 1043 unsigned long start, target_index, index;
1050 u32 tmp_pos;
1051 u64 v_blkno, p_blkno;
1052 struct address_space *mapping = file->f_mapping;
1053 struct inode *inode = mapping->host; 1044 struct inode *inode = mapping->host;
1054 unsigned long index, start;
1055 struct page **cpages;
1056 1045
1057 new = phys == 0 ? 1 : 0; 1046 target_index = user_pos >> PAGE_CACHE_SHIFT;
1058 1047
1059 /* 1048 /*
1060 * Figure out how many pages we'll be manipulating here. For 1049 * Figure out how many pages we'll be manipulating here. For
1061 * non allocating write, we just change the one 1050 * non allocating write, we just change the one
1062 * page. Otherwise, we'll need a whole clusters worth. 1051 * page. Otherwise, we'll need a whole clusters worth.
1063 */ 1052 */
1064 if (new)
1065 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1066
1067 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1068 if (!cpages) {
1069 ret = -ENOMEM;
1070 mlog_errno(ret);
1071 return ret;
1072 }
1073
1074 /*
1075 * Fill our page array first. That way we've grabbed enough so
1076 * that we can zero and flush if we error after adding the
1077 * extent.
1078 */
1079 if (new) { 1053 if (new) {
1080 start = ocfs2_align_clusters_to_page_index(inode->i_sb, 1054 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1081 wc->w_cpos); 1055 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1082 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1083 } else { 1056 } else {
1084 start = wc->w_pos >> PAGE_CACHE_SHIFT; 1057 wc->w_num_pages = 1;
1085 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; 1058 start = target_index;
1086 } 1059 }
1087 1060
1088 for(i = 0; i < numpages; i++) { 1061 for(i = 0; i < wc->w_num_pages; i++) {
1089 index = start + i; 1062 index = start + i;
1090 1063
1091 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); 1064 if (index == target_index && mmap_page) {
1092 if (!cpages[i]) { 1065 /*
1093 ret = -ENOMEM; 1066 * ocfs2_pagemkwrite() is a little different
1094 mlog_errno(ret); 1067 * and wants us to directly use the page
1095 goto out; 1068 * passed in.
1069 */
1070 lock_page(mmap_page);
1071
1072 if (mmap_page->mapping != mapping) {
1073 unlock_page(mmap_page);
1074 /*
1075 * Sanity check - the locking in
1076 * ocfs2_pagemkwrite() should ensure
1077 * that this code doesn't trigger.
1078 */
1079 ret = -EINVAL;
1080 mlog_errno(ret);
1081 goto out;
1082 }
1083
1084 page_cache_get(mmap_page);
1085 wc->w_pages[i] = mmap_page;
1086 } else {
1087 wc->w_pages[i] = find_or_create_page(mapping, index,
1088 GFP_NOFS);
1089 if (!wc->w_pages[i]) {
1090 ret = -ENOMEM;
1091 mlog_errno(ret);
1092 goto out;
1093 }
1096 } 1094 }
1095
1096 if (index == target_index)
1097 wc->w_target_page = wc->w_pages[i];
1097 } 1098 }
1099out:
1100 return ret;
1101}
1102
1103/*
1104 * Prepare a single cluster for write one cluster into the file.
1105 */
1106static int ocfs2_write_cluster(struct address_space *mapping,
1107 u32 phys, unsigned int unwritten,
1108 struct ocfs2_alloc_context *data_ac,
1109 struct ocfs2_alloc_context *meta_ac,
1110 struct ocfs2_write_ctxt *wc, u32 cpos,
1111 loff_t user_pos, unsigned user_len)
1112{
1113 int ret, i, new, should_zero = 0;
1114 u64 v_blkno, p_blkno;
1115 struct inode *inode = mapping->host;
1116
1117 new = phys == 0 ? 1 : 0;
1118 if (new || unwritten)
1119 should_zero = 1;
1098 1120
1099 if (new) { 1121 if (new) {
1122 u32 tmp_pos;
1123
1100 /* 1124 /*
1101 * This is safe to call with the page locks - it won't take 1125 * This is safe to call with the page locks - it won't take
1102 * any additional semaphores or cluster locks. 1126 * any additional semaphores or cluster locks.
1103 */ 1127 */
1104 tmp_pos = wc->w_cpos; 1128 tmp_pos = cpos;
1105 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1129 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1106 &tmp_pos, 1, di_bh, handle, 1130 &tmp_pos, 1, 0, wc->w_di_bh,
1107 data_ac, meta_ac, NULL); 1131 wc->w_handle, data_ac,
1132 meta_ac, NULL);
1108 /* 1133 /*
1109 * This shouldn't happen because we must have already 1134 * This shouldn't happen because we must have already
1110 * calculated the correct meta data allocation required. The 1135 * calculated the correct meta data allocation required. The
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1121 mlog_errno(ret); 1146 mlog_errno(ret);
1122 goto out; 1147 goto out;
1123 } 1148 }
1149 } else if (unwritten) {
1150 ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
1151 wc->w_handle, cpos, 1, phys,
1152 meta_ac, &wc->w_dealloc);
1153 if (ret < 0) {
1154 mlog_errno(ret);
1155 goto out;
1156 }
1124 } 1157 }
1125 1158
1159 if (should_zero)
1160 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1161 else
1162 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1163
1164 /*
1165 * The only reason this should fail is due to an inability to
1166 * find the extent added.
1167 */
1126 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1168 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1127 NULL); 1169 NULL);
1128 if (ret < 0) { 1170 if (ret < 0) {
1129 1171 ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
1130 /* 1172 "at logical block %llu",
1131 * XXX: Should we go readonly here? 1173 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1132 */ 1174 (unsigned long long)v_blkno);
1133
1134 mlog_errno(ret);
1135 goto out; 1175 goto out;
1136 } 1176 }
1137 1177
1138 BUG_ON(p_blkno == 0); 1178 BUG_ON(p_blkno == 0);
1139 1179
1140 for(i = 0; i < numpages; i++) { 1180 for(i = 0; i < wc->w_num_pages; i++) {
1141 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], 1181 int tmpret;
1142 wc, new); 1182
1143 if (ret < 0) { 1183 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1144 mlog_errno(ret); 1184 wc->w_pages[i], cpos,
1145 goto out; 1185 user_pos, user_len,
1186 should_zero);
1187 if (tmpret) {
1188 mlog_errno(tmpret);
1189 if (ret == 0)
1190 tmpret = ret;
1146 } 1191 }
1147
1148 copied += ret;
1149 } 1192 }
1150 1193
1194 /*
1195 * We only have cleanup to do in case of allocating write.
1196 */
1197 if (ret && new)
1198 ocfs2_write_failure(inode, wc, user_pos, user_len);
1199
1151out: 1200out:
1152 for(i = 0; i < numpages; i++) { 1201
1153 unlock_page(cpages[i]); 1202 return ret;
1154 mark_page_accessed(cpages[i]); 1203}
1155 page_cache_release(cpages[i]); 1204
1205static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1206 struct ocfs2_alloc_context *data_ac,
1207 struct ocfs2_alloc_context *meta_ac,
1208 struct ocfs2_write_ctxt *wc,
1209 loff_t pos, unsigned len)
1210{
1211 int ret, i;
1212 struct ocfs2_write_cluster_desc *desc;
1213
1214 for (i = 0; i < wc->w_clen; i++) {
1215 desc = &wc->w_desc[i];
1216
1217 ret = ocfs2_write_cluster(mapping, desc->c_phys,
1218 desc->c_unwritten, data_ac, meta_ac,
1219 wc, desc->c_cpos, pos, len);
1220 if (ret) {
1221 mlog_errno(ret);
1222 goto out;
1223 }
1156 } 1224 }
1157 kfree(cpages);
1158 1225
1159 return copied ? copied : ret; 1226 ret = 0;
1227out:
1228 return ret;
1160} 1229}
1161 1230
1162static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, 1231/*
1163 struct ocfs2_super *osb, loff_t pos, 1232 * ocfs2_write_end() wants to know which parts of the target page it
1164 size_t count, ocfs2_page_writer *cb, 1233 * should complete the write on. It's easiest to compute them ahead of
1165 void *cb_priv) 1234 * time when a more complete view of the write is available.
1235 */
1236static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1237 struct ocfs2_write_ctxt *wc,
1238 loff_t pos, unsigned len, int alloc)
1166{ 1239{
1167 wc->w_count = count; 1240 struct ocfs2_write_cluster_desc *desc;
1168 wc->w_pos = pos;
1169 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1170 wc->w_finished_copy = 0;
1171 1241
1172 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 1242 wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1173 wc->w_large_pages = 1; 1243 wc->w_target_to = wc->w_target_from + len;
1174 else
1175 wc->w_large_pages = 0;
1176 1244
1177 wc->w_write_data_page = cb; 1245 if (alloc == 0)
1178 wc->w_private = cb_priv; 1246 return;
1247
1248 /*
1249 * Allocating write - we may have different boundaries based
1250 * on page size and cluster size.
1251 *
1252 * NOTE: We can no longer compute one value from the other as
1253 * the actual write length and user provided length may be
1254 * different.
1255 */
1256
1257 if (wc->w_large_pages) {
1258 /*
1259 * We only care about the 1st and last cluster within
1260 * our range and whether they should be zero'd or not. Either
1261 * value may be extended out to the start/end of a
1262 * newly allocated cluster.
1263 */
1264 desc = &wc->w_desc[0];
1265 if (ocfs2_should_zero_cluster(desc))
1266 ocfs2_figure_cluster_boundaries(osb,
1267 desc->c_cpos,
1268 &wc->w_target_from,
1269 NULL);
1270
1271 desc = &wc->w_desc[wc->w_clen - 1];
1272 if (ocfs2_should_zero_cluster(desc))
1273 ocfs2_figure_cluster_boundaries(osb,
1274 desc->c_cpos,
1275 NULL,
1276 &wc->w_target_to);
1277 } else {
1278 wc->w_target_from = 0;
1279 wc->w_target_to = PAGE_CACHE_SIZE;
1280 }
1179} 1281}
1180 1282
1181/* 1283/*
1182 * Write a cluster to an inode. The cluster may not be allocated yet, 1284 * Populate each single-cluster write descriptor in the write context
1183 * in which case it will be. This only exists for buffered writes - 1285 * with information about the i/o to be done.
1184 * O_DIRECT takes a more "traditional" path through the kernel.
1185 *
1186 * The caller is responsible for incrementing pos, written counts, etc
1187 * 1286 *
1188 * For file systems that don't support sparse files, pre-allocation 1287 * Returns the number of clusters that will have to be allocated, as
1189 * and page zeroing up until cpos should be done prior to this 1288 * well as a worst case estimate of the number of extent records that
1190 * function call. 1289 * would have to be created during a write to an unwritten region.
1191 *
1192 * Callers should be holding i_sem, and the rw cluster lock.
1193 *
1194 * Returns the number of user bytes written, or less than zero for
1195 * error.
1196 */ 1290 */
1197ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 1291static int ocfs2_populate_write_desc(struct inode *inode,
1198 size_t count, ocfs2_page_writer *actor, 1292 struct ocfs2_write_ctxt *wc,
1199 void *priv) 1293 unsigned int *clusters_to_alloc,
1294 unsigned int *extents_to_split)
1295{
1296 int ret;
1297 struct ocfs2_write_cluster_desc *desc;
1298 unsigned int num_clusters = 0;
1299 unsigned int ext_flags = 0;
1300 u32 phys = 0;
1301 int i;
1302
1303 *clusters_to_alloc = 0;
1304 *extents_to_split = 0;
1305
1306 for (i = 0; i < wc->w_clen; i++) {
1307 desc = &wc->w_desc[i];
1308 desc->c_cpos = wc->w_cpos + i;
1309
1310 if (num_clusters == 0) {
1311 /*
1312 * Need to look up the next extent record.
1313 */
1314 ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1315 &num_clusters, &ext_flags);
1316 if (ret) {
1317 mlog_errno(ret);
1318 goto out;
1319 }
1320
1321 /*
1322 * Assume worst case - that we're writing in
1323 * the middle of the extent.
1324 *
1325 * We can assume that the write proceeds from
1326 * left to right, in which case the extent
1327 * insert code is smart enough to coalesce the
1328 * next splits into the previous records created.
1329 */
1330 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1331 *extents_to_split = *extents_to_split + 2;
1332 } else if (phys) {
1333 /*
1334 * Only increment phys if it doesn't describe
1335 * a hole.
1336 */
1337 phys++;
1338 }
1339
1340 desc->c_phys = phys;
1341 if (phys == 0) {
1342 desc->c_new = 1;
1343 *clusters_to_alloc = *clusters_to_alloc + 1;
1344 }
1345 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1346 desc->c_unwritten = 1;
1347
1348 num_clusters--;
1349 }
1350
1351 ret = 0;
1352out:
1353 return ret;
1354}
1355
1356int ocfs2_write_begin_nolock(struct address_space *mapping,
1357 loff_t pos, unsigned len, unsigned flags,
1358 struct page **pagep, void **fsdata,
1359 struct buffer_head *di_bh, struct page *mmap_page)
1200{ 1360{
1201 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1361 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1202 ssize_t written = 0; 1362 unsigned int clusters_to_alloc, extents_to_split;
1203 u32 phys; 1363 struct ocfs2_write_ctxt *wc;
1204 struct inode *inode = file->f_mapping->host; 1364 struct inode *inode = mapping->host;
1205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1206 struct buffer_head *di_bh = NULL;
1207 struct ocfs2_dinode *di; 1366 struct ocfs2_dinode *di;
1208 struct ocfs2_alloc_context *data_ac = NULL; 1367 struct ocfs2_alloc_context *data_ac = NULL;
1209 struct ocfs2_alloc_context *meta_ac = NULL; 1368 struct ocfs2_alloc_context *meta_ac = NULL;
1210 handle_t *handle; 1369 handle_t *handle;
1211 struct ocfs2_write_ctxt wc;
1212
1213 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1214 1370
1215 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1371 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1216 if (ret) { 1372 if (ret) {
1217 mlog_errno(ret); 1373 mlog_errno(ret);
1218 goto out; 1374 return ret;
1219 } 1375 }
1220 di = (struct ocfs2_dinode *)di_bh->b_data;
1221
1222 /*
1223 * Take alloc sem here to prevent concurrent lookups. That way
1224 * the mapping, zeroing and tree manipulation within
1225 * ocfs2_write() will be safe against ->readpage(). This
1226 * should also serve to lock out allocation from a shared
1227 * writeable region.
1228 */
1229 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1230 1376
1231 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); 1377 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1378 &extents_to_split);
1232 if (ret) { 1379 if (ret) {
1233 mlog_errno(ret); 1380 mlog_errno(ret);
1234 goto out_meta; 1381 goto out;
1235 } 1382 }
1236 1383
1237 /* phys == 0 means that allocation is required. */ 1384 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1238 if (phys == 0) { 1385
1239 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); 1386 /*
1387 * We set w_target_from, w_target_to here so that
1388 * ocfs2_write_end() knows which range in the target page to
1389 * write out. An allocation requires that we write the entire
1390 * cluster range.
1391 */
1392 if (clusters_to_alloc || extents_to_split) {
1393 /*
1394 * XXX: We are stretching the limits of
1395 * ocfs2_lock_allocators(). It greatly over-estimates
1396 * the work to be done.
1397 */
1398 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
1399 extents_to_split, &data_ac, &meta_ac);
1240 if (ret) { 1400 if (ret) {
1241 mlog_errno(ret); 1401 mlog_errno(ret);
1242 goto out_meta; 1402 goto out;
1243 } 1403 }
1244 1404
1245 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); 1405 credits = ocfs2_calc_extend_credits(inode->i_sb, di,
1246 } 1406 clusters_to_alloc);
1247 1407
1248 ret = ocfs2_data_lock(inode, 1);
1249 if (ret) {
1250 mlog_errno(ret);
1251 goto out_meta;
1252 } 1408 }
1253 1409
1410 ocfs2_set_target_boundaries(osb, wc, pos, len,
1411 clusters_to_alloc + extents_to_split);
1412
1254 handle = ocfs2_start_trans(osb, credits); 1413 handle = ocfs2_start_trans(osb, credits);
1255 if (IS_ERR(handle)) { 1414 if (IS_ERR(handle)) {
1256 ret = PTR_ERR(handle); 1415 ret = PTR_ERR(handle);
1257 mlog_errno(ret); 1416 mlog_errno(ret);
1258 goto out_data; 1417 goto out;
1259 } 1418 }
1260 1419
1261 written = ocfs2_write(file, phys, handle, di_bh, data_ac, 1420 wc->w_handle = handle;
1262 meta_ac, &wc); 1421
1263 if (written < 0) { 1422 /*
1264 ret = written; 1423 * We don't want this to fail in ocfs2_write_end(), so do it
1424 * here.
1425 */
1426 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1427 OCFS2_JOURNAL_ACCESS_WRITE);
1428 if (ret) {
1265 mlog_errno(ret); 1429 mlog_errno(ret);
1266 goto out_commit; 1430 goto out_commit;
1267 } 1431 }
1268 1432
1269 ret = ocfs2_journal_access(handle, inode, di_bh, 1433 /*
1270 OCFS2_JOURNAL_ACCESS_WRITE); 1434 * Fill our page array first. That way we've grabbed enough so
1435 * that we can zero and flush if we error after adding the
1436 * extent.
1437 */
1438 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
1439 clusters_to_alloc + extents_to_split,
1440 mmap_page);
1271 if (ret) { 1441 if (ret) {
1272 mlog_errno(ret); 1442 mlog_errno(ret);
1273 goto out_commit; 1443 goto out_commit;
1274 } 1444 }
1275 1445
1276 pos += written; 1446 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1447 len);
1448 if (ret) {
1449 mlog_errno(ret);
1450 goto out_commit;
1451 }
1452
1453 if (data_ac)
1454 ocfs2_free_alloc_context(data_ac);
1455 if (meta_ac)
1456 ocfs2_free_alloc_context(meta_ac);
1457
1458 *pagep = wc->w_target_page;
1459 *fsdata = wc;
1460 return 0;
1461out_commit:
1462 ocfs2_commit_trans(osb, handle);
1463
1464out:
1465 ocfs2_free_write_ctxt(wc);
1466
1467 if (data_ac)
1468 ocfs2_free_alloc_context(data_ac);
1469 if (meta_ac)
1470 ocfs2_free_alloc_context(meta_ac);
1471 return ret;
1472}
1473
1474int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1475 loff_t pos, unsigned len, unsigned flags,
1476 struct page **pagep, void **fsdata)
1477{
1478 int ret;
1479 struct buffer_head *di_bh = NULL;
1480 struct inode *inode = mapping->host;
1481
1482 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1483 if (ret) {
1484 mlog_errno(ret);
1485 return ret;
1486 }
1487
1488 /*
1489 * Take alloc sem here to prevent concurrent lookups. That way
1490 * the mapping, zeroing and tree manipulation within
1491 * ocfs2_write() will be safe against ->readpage(). This
1492 * should also serve to lock out allocation from a shared
1493 * writeable region.
1494 */
1495 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1496
1497 ret = ocfs2_data_lock(inode, 1);
1498 if (ret) {
1499 mlog_errno(ret);
1500 goto out_fail;
1501 }
1502
1503 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1504 fsdata, di_bh, NULL);
1505 if (ret) {
1506 mlog_errno(ret);
1507 goto out_fail_data;
1508 }
1509
1510 brelse(di_bh);
1511
1512 return 0;
1513
1514out_fail_data:
1515 ocfs2_data_unlock(inode, 1);
1516out_fail:
1517 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1518
1519 brelse(di_bh);
1520 ocfs2_meta_unlock(inode, 1);
1521
1522 return ret;
1523}
1524
1525int ocfs2_write_end_nolock(struct address_space *mapping,
1526 loff_t pos, unsigned len, unsigned copied,
1527 struct page *page, void *fsdata)
1528{
1529 int i;
1530 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1531 struct inode *inode = mapping->host;
1532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1533 struct ocfs2_write_ctxt *wc = fsdata;
1534 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1535 handle_t *handle = wc->w_handle;
1536 struct page *tmppage;
1537
1538 if (unlikely(copied < len)) {
1539 if (!PageUptodate(wc->w_target_page))
1540 copied = 0;
1541
1542 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1543 start+len);
1544 }
1545 flush_dcache_page(wc->w_target_page);
1546
1547 for(i = 0; i < wc->w_num_pages; i++) {
1548 tmppage = wc->w_pages[i];
1549
1550 if (tmppage == wc->w_target_page) {
1551 from = wc->w_target_from;
1552 to = wc->w_target_to;
1553
1554 BUG_ON(from > PAGE_CACHE_SIZE ||
1555 to > PAGE_CACHE_SIZE ||
1556 to < from);
1557 } else {
1558 /*
1559 * Pages adjacent to the target (if any) imply
1560 * a hole-filling write in which case we want
1561 * to flush their entire range.
1562 */
1563 from = 0;
1564 to = PAGE_CACHE_SIZE;
1565 }
1566
1567 if (ocfs2_should_order_data(inode))
1568 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
1569 from, to, NULL,
1570 ocfs2_journal_dirty_data);
1571
1572 block_commit_write(tmppage, from, to);
1573 }
1574
1575 pos += copied;
1277 if (pos > inode->i_size) { 1576 if (pos > inode->i_size) {
1278 i_size_write(inode, pos); 1577 i_size_write(inode, pos);
1279 mark_inode_dirty(inode); 1578 mark_inode_dirty(inode);
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1283 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1582 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1284 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1583 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1285 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1584 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1585 ocfs2_journal_dirty(handle, wc->w_di_bh);
1286 1586
1287 ret = ocfs2_journal_dirty(handle, di_bh);
1288 if (ret)
1289 mlog_errno(ret);
1290
1291out_commit:
1292 ocfs2_commit_trans(osb, handle); 1587 ocfs2_commit_trans(osb, handle);
1293 1588
1294out_data: 1589 ocfs2_run_deallocs(osb, &wc->w_dealloc);
1295 ocfs2_data_unlock(inode, 1); 1590
1591 ocfs2_free_write_ctxt(wc);
1592
1593 return copied;
1594}
1595
1596int ocfs2_write_end(struct file *file, struct address_space *mapping,
1597 loff_t pos, unsigned len, unsigned copied,
1598 struct page *page, void *fsdata)
1599{
1600 int ret;
1601 struct inode *inode = mapping->host;
1296 1602
1297out_meta: 1603 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1604
1605 ocfs2_data_unlock(inode, 1);
1298 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1606 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1299 ocfs2_meta_unlock(inode, 1); 1607 ocfs2_meta_unlock(inode, 1);
1300 1608
1301out: 1609 return ret;
1302 brelse(di_bh);
1303 if (data_ac)
1304 ocfs2_free_alloc_context(data_ac);
1305 if (meta_ac)
1306 ocfs2_free_alloc_context(meta_ac);
1307
1308 return written ? written : ret;
1309} 1610}
1310 1611
1311const struct address_space_operations ocfs2_aops = { 1612const struct address_space_operations ocfs2_aops = {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 45821d479b..389579bd64 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -42,57 +42,22 @@ int walk_page_buffers( handle_t *handle,
42 int (*fn)( handle_t *handle, 42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh)); 43 struct buffer_head *bh));
44 44
45struct ocfs2_write_ctxt; 45int ocfs2_write_begin(struct file *file, struct address_space *mapping,
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, 46 loff_t pos, unsigned len, unsigned flags,
47 u64 *, unsigned int *, unsigned int *); 47 struct page **pagep, void **fsdata);
48 48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 49int ocfs2_write_end(struct file *file, struct address_space *mapping,
50 size_t count, ocfs2_page_writer *actor, 50 loff_t pos, unsigned len, unsigned copied,
51 void *priv); 51 struct page *page, void *fsdata);
52 52
53struct ocfs2_write_ctxt { 53int ocfs2_write_end_nolock(struct address_space *mapping,
54 size_t w_count; 54 loff_t pos, unsigned len, unsigned copied,
55 loff_t w_pos; 55 struct page *page, void *fsdata);
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58 56
59 /* This is true if page_size > cluster_size */ 57int ocfs2_write_begin_nolock(struct address_space *mapping,
60 unsigned int w_large_pages; 58 loff_t pos, unsigned len, unsigned flags,
61 59 struct page **pagep, void **fsdata,
62 /* Filler callback and private data */ 60 struct buffer_head *di_bh, struct page *mmap_page);
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96 61
97/* all ocfs2_dio_end_io()'s fault */ 62/* all ocfs2_dio_end_io()'s fault */
98#define ocfs2_iocb_is_rw_locked(iocb) \ 63#define ocfs2_iocb_is_rw_locked(iocb) \
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 979113479c..2bd7f788cf 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1335 ret = wait_event_interruptible(o2hb_steady_queue, 1335 ret = wait_event_interruptible(o2hb_steady_queue,
1336 atomic_read(&reg->hr_steady_iterations) == 0); 1336 atomic_read(&reg->hr_steady_iterations) == 0);
1337 if (ret) { 1337 if (ret) {
1338 /* We got interrupted (hello ptrace!). Clean up */
1338 spin_lock(&o2hb_live_lock); 1339 spin_lock(&o2hb_live_lock);
1339 hb_task = reg->hr_task; 1340 hb_task = reg->hr_task;
1340 reg->hr_task = NULL; 1341 reg->hr_task = NULL;
@@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1345 goto out; 1346 goto out;
1346 } 1347 }
1347 1348
1348 ret = count; 1349 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1350 spin_lock(&o2hb_live_lock);
1351 hb_task = reg->hr_task;
1352 spin_unlock(&o2hb_live_lock);
1353
1354 if (hb_task)
1355 ret = count;
1356 else
1357 ret = -EIO;
1358
1349out: 1359out:
1350 if (filp) 1360 if (filp)
1351 fput(filp); 1361 fput(filp);
@@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1523 if (hb_task) 1533 if (hb_task)
1524 kthread_stop(hb_task); 1534 kthread_stop(hb_task);
1525 1535
1536 /*
1537 * If we're racing a dev_write(), we need to wake them. They will
1538 * check reg->hr_task
1539 */
1540 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1541 atomic_set(&reg->hr_steady_iterations, 0);
1542 wake_up(&o2hb_steady_queue);
1543 }
1544
1526 config_item_put(item); 1545 config_item_put(item);
1527} 1546}
1528 1547
@@ -1665,7 +1684,67 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
1665} 1684}
1666EXPORT_SYMBOL_GPL(o2hb_setup_callback); 1685EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1667 1686
1668int o2hb_register_callback(struct o2hb_callback_func *hc) 1687static struct o2hb_region *o2hb_find_region(const char *region_uuid)
1688{
1689 struct o2hb_region *p, *reg = NULL;
1690
1691 assert_spin_locked(&o2hb_live_lock);
1692
1693 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
1694 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
1695 reg = p;
1696 break;
1697 }
1698 }
1699
1700 return reg;
1701}
1702
1703static int o2hb_region_get(const char *region_uuid)
1704{
1705 int ret = 0;
1706 struct o2hb_region *reg;
1707
1708 spin_lock(&o2hb_live_lock);
1709
1710 reg = o2hb_find_region(region_uuid);
1711 if (!reg)
1712 ret = -ENOENT;
1713 spin_unlock(&o2hb_live_lock);
1714
1715 if (ret)
1716 goto out;
1717
1718 ret = o2nm_depend_this_node();
1719 if (ret)
1720 goto out;
1721
1722 ret = o2nm_depend_item(&reg->hr_item);
1723 if (ret)
1724 o2nm_undepend_this_node();
1725
1726out:
1727 return ret;
1728}
1729
1730static void o2hb_region_put(const char *region_uuid)
1731{
1732 struct o2hb_region *reg;
1733
1734 spin_lock(&o2hb_live_lock);
1735
1736 reg = o2hb_find_region(region_uuid);
1737
1738 spin_unlock(&o2hb_live_lock);
1739
1740 if (reg) {
1741 o2nm_undepend_item(&reg->hr_item);
1742 o2nm_undepend_this_node();
1743 }
1744}
1745
1746int o2hb_register_callback(const char *region_uuid,
1747 struct o2hb_callback_func *hc)
1669{ 1748{
1670 struct o2hb_callback_func *tmp; 1749 struct o2hb_callback_func *tmp;
1671 struct list_head *iter; 1750 struct list_head *iter;
@@ -1681,6 +1760,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
1681 goto out; 1760 goto out;
1682 } 1761 }
1683 1762
1763 if (region_uuid) {
1764 ret = o2hb_region_get(region_uuid);
1765 if (ret)
1766 goto out;
1767 }
1768
1684 down_write(&o2hb_callback_sem); 1769 down_write(&o2hb_callback_sem);
1685 1770
1686 list_for_each(iter, &hbcall->list) { 1771 list_for_each(iter, &hbcall->list) {
@@ -1702,16 +1787,21 @@ out:
1702} 1787}
1703EXPORT_SYMBOL_GPL(o2hb_register_callback); 1788EXPORT_SYMBOL_GPL(o2hb_register_callback);
1704 1789
1705void o2hb_unregister_callback(struct o2hb_callback_func *hc) 1790void o2hb_unregister_callback(const char *region_uuid,
1791 struct o2hb_callback_func *hc)
1706{ 1792{
1707 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 1793 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1708 1794
1709 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 1795 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
1710 __builtin_return_address(0), hc); 1796 __builtin_return_address(0), hc);
1711 1797
1798 /* XXX Can this happen _with_ a region reference? */
1712 if (list_empty(&hc->hc_item)) 1799 if (list_empty(&hc->hc_item))
1713 return; 1800 return;
1714 1801
1802 if (region_uuid)
1803 o2hb_region_put(region_uuid);
1804
1715 down_write(&o2hb_callback_sem); 1805 down_write(&o2hb_callback_sem);
1716 1806
1717 list_del_init(&hc->hc_item); 1807 list_del_init(&hc->hc_item);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index cc6d40b397..35397dd5ec 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
69 o2hb_cb_func *func, 69 o2hb_cb_func *func,
70 void *data, 70 void *data,
71 int priority); 71 int priority);
72int o2hb_register_callback(struct o2hb_callback_func *hc); 72int o2hb_register_callback(const char *region_uuid,
73void o2hb_unregister_callback(struct o2hb_callback_func *hc); 73 struct o2hb_callback_func *hc);
74void o2hb_unregister_callback(const char *region_uuid,
75 struct o2hb_callback_func *hc);
74void o2hb_fill_node_map(unsigned long *map, 76void o2hb_fill_node_map(unsigned long *map,
75 unsigned bytes); 77 unsigned bytes);
76void o2hb_init(void); 78void o2hb_init(void);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 2b205f5d57..e9e042b93d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -74,7 +74,6 @@ struct mlog_attribute {
74#define define_mask(_name) { \ 74#define define_mask(_name) { \
75 .attr = { \ 75 .attr = { \
76 .name = #_name, \ 76 .name = #_name, \
77 .owner = THIS_MODULE, \
78 .mode = S_IRUGO | S_IWUSR, \ 77 .mode = S_IRUGO | S_IWUSR, \
79 }, \ 78 }, \
80 .mask = ML_##_name, \ 79 .mask = ML_##_name, \
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 9f5ad0f01c..af2070da30 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -900,6 +900,46 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
900 }, 900 },
901}; 901};
902 902
903int o2nm_depend_item(struct config_item *item)
904{
905 return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
906}
907
908void o2nm_undepend_item(struct config_item *item)
909{
910 configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
911}
912
913int o2nm_depend_this_node(void)
914{
915 int ret = 0;
916 struct o2nm_node *local_node;
917
918 local_node = o2nm_get_node_by_num(o2nm_this_node());
919 if (!local_node) {
920 ret = -EINVAL;
921 goto out;
922 }
923
924 ret = o2nm_depend_item(&local_node->nd_item);
925 o2nm_node_put(local_node);
926
927out:
928 return ret;
929}
930
931void o2nm_undepend_this_node(void)
932{
933 struct o2nm_node *local_node;
934
935 local_node = o2nm_get_node_by_num(o2nm_this_node());
936 BUG_ON(!local_node);
937
938 o2nm_undepend_item(&local_node->nd_item);
939 o2nm_node_put(local_node);
940}
941
942
903static void __exit exit_o2nm(void) 943static void __exit exit_o2nm(void)
904{ 944{
905 if (ocfs2_table_header) 945 if (ocfs2_table_header)
@@ -934,7 +974,7 @@ static int __init init_o2nm(void)
934 goto out_sysctl; 974 goto out_sysctl;
935 975
936 config_group_init(&o2nm_cluster_group.cs_subsys.su_group); 976 config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
937 init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem); 977 mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
938 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); 978 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
939 if (ret) { 979 if (ret) {
940 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); 980 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 070522138a..7c860361b8 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -77,4 +77,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
77void o2nm_node_get(struct o2nm_node *node); 77void o2nm_node_get(struct o2nm_node *node);
78void o2nm_node_put(struct o2nm_node *node); 78void o2nm_node_put(struct o2nm_node *node);
79 79
80int o2nm_depend_item(struct config_item *item);
81void o2nm_undepend_item(struct config_item *item);
82int o2nm_depend_this_node(void);
83void o2nm_undepend_this_node(void);
84
80#endif /* O2CLUSTER_NODEMANAGER_H */ 85#endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0b229a9c79..f0bdfd944c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -261,14 +261,12 @@ out:
261 261
262static void o2net_complete_nodes_nsw(struct o2net_node *nn) 262static void o2net_complete_nodes_nsw(struct o2net_node *nn)
263{ 263{
264 struct list_head *iter, *tmp; 264 struct o2net_status_wait *nsw, *tmp;
265 unsigned int num_kills = 0; 265 unsigned int num_kills = 0;
266 struct o2net_status_wait *nsw;
267 266
268 assert_spin_locked(&nn->nn_lock); 267 assert_spin_locked(&nn->nn_lock);
269 268
270 list_for_each_safe(iter, tmp, &nn->nn_status_list) { 269 list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
271 nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
272 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); 270 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
273 num_kills++; 271 num_kills++;
274 } 272 }
@@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler);
764 762
765void o2net_unregister_handler_list(struct list_head *list) 763void o2net_unregister_handler_list(struct list_head *list)
766{ 764{
767 struct list_head *pos, *n; 765 struct o2net_msg_handler *nmh, *n;
768 struct o2net_msg_handler *nmh;
769 766
770 write_lock(&o2net_handler_lock); 767 write_lock(&o2net_handler_lock);
771 list_for_each_safe(pos, n, list) { 768 list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
772 nmh = list_entry(pos, struct o2net_msg_handler,
773 nh_unregister_item);
774 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", 769 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
775 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); 770 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
776 rb_erase(&nmh->nh_node, &o2net_handler_tree); 771 rb_erase(&nmh->nh_node, &o2net_handler_tree);
@@ -1638,8 +1633,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1638 1633
1639void o2net_unregister_hb_callbacks(void) 1634void o2net_unregister_hb_callbacks(void)
1640{ 1635{
1641 o2hb_unregister_callback(&o2net_hb_up); 1636 o2hb_unregister_callback(NULL, &o2net_hb_up);
1642 o2hb_unregister_callback(&o2net_hb_down); 1637 o2hb_unregister_callback(NULL, &o2net_hb_down);
1643} 1638}
1644 1639
1645int o2net_register_hb_callbacks(void) 1640int o2net_register_hb_callbacks(void)
@@ -1651,9 +1646,9 @@ int o2net_register_hb_callbacks(void)
1651 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, 1646 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
1652 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); 1647 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
1653 1648
1654 ret = o2hb_register_callback(&o2net_hb_up); 1649 ret = o2hb_register_callback(NULL, &o2net_hb_up);
1655 if (ret == 0) 1650 if (ret == 0)
1656 ret = o2hb_register_callback(&o2net_hb_down); 1651 ret = o2hb_register_callback(NULL, &o2net_hb_down);
1657 1652
1658 if (ret) 1653 if (ret)
1659 o2net_unregister_hb_callbacks(); 1654 o2net_unregister_hb_callbacks();
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c441ef1f2b..0d5fdde959 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
368 u32 offset = OCFS2_I(dir)->ip_clusters; 368 u32 offset = OCFS2_I(dir)->ip_clusters;
369 369
370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, 370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
371 1, parent_fe_bh, handle, 371 1, 0, parent_fe_bh, handle,
372 data_ac, meta_ac, NULL); 372 data_ac, meta_ac, NULL);
373 BUG_ON(status == -EAGAIN); 373 BUG_ON(status == -EAGAIN);
374 if (status < 0) { 374 if (status < 0) {
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d836b98dd9..6954565b8c 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1128,8 +1128,8 @@ bail:
1128 1128
1129static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1129static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1130{ 1130{
1131 o2hb_unregister_callback(&dlm->dlm_hb_up); 1131 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
1132 o2hb_unregister_callback(&dlm->dlm_hb_down); 1132 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
1133 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1133 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1134} 1134}
1135 1135
@@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1141 1141
1142 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1142 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1143 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1143 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1144 status = o2hb_register_callback(&dlm->dlm_hb_down); 1144 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
1145 if (status) 1145 if (status)
1146 goto bail; 1146 goto bail;
1147 1147
1148 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1148 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1149 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1149 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1150 status = o2hb_register_callback(&dlm->dlm_hb_up); 1150 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
1151 if (status) 1151 if (status)
1152 goto bail; 1152 goto bail;
1153 1153
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 6edffca99d..65b2b9b926 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
192static void dlm_dump_mles(struct dlm_ctxt *dlm) 192static void dlm_dump_mles(struct dlm_ctxt *dlm)
193{ 193{
194 struct dlm_master_list_entry *mle; 194 struct dlm_master_list_entry *mle;
195 struct list_head *iter;
196 195
197 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); 196 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
198 spin_lock(&dlm->master_lock); 197 spin_lock(&dlm->master_lock);
199 list_for_each(iter, &dlm->master_list) { 198 list_for_each_entry(mle, &dlm->master_list, list)
200 mle = list_entry(iter, struct dlm_master_list_entry, list);
201 dlm_print_one_mle(mle); 199 dlm_print_one_mle(mle);
202 }
203 spin_unlock(&dlm->master_lock); 200 spin_unlock(&dlm->master_lock);
204} 201}
205 202
206int dlm_dump_all_mles(const char __user *data, unsigned int len) 203int dlm_dump_all_mles(const char __user *data, unsigned int len)
207{ 204{
208 struct list_head *iter;
209 struct dlm_ctxt *dlm; 205 struct dlm_ctxt *dlm;
210 206
211 spin_lock(&dlm_domain_lock); 207 spin_lock(&dlm_domain_lock);
212 list_for_each(iter, &dlm_domains) { 208 list_for_each_entry(dlm, &dlm_domains, list) {
213 dlm = list_entry (iter, struct dlm_ctxt, list);
214 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); 209 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
215 dlm_dump_mles(dlm); 210 dlm_dump_mles(dlm);
216 } 211 }
@@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
454 char *name, unsigned int namelen) 449 char *name, unsigned int namelen)
455{ 450{
456 struct dlm_master_list_entry *tmpmle; 451 struct dlm_master_list_entry *tmpmle;
457 struct list_head *iter;
458 452
459 assert_spin_locked(&dlm->master_lock); 453 assert_spin_locked(&dlm->master_lock);
460 454
461 list_for_each(iter, &dlm->master_list) { 455 list_for_each_entry(tmpmle, &dlm->master_list, list) {
462 tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
463 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 456 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
464 continue; 457 continue;
465 dlm_get_mle(tmpmle); 458 dlm_get_mle(tmpmle);
@@ -472,13 +465,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
472void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) 465void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
473{ 466{
474 struct dlm_master_list_entry *mle; 467 struct dlm_master_list_entry *mle;
475 struct list_head *iter;
476 468
477 assert_spin_locked(&dlm->spinlock); 469 assert_spin_locked(&dlm->spinlock);
478 470
479 list_for_each(iter, &dlm->mle_hb_events) { 471 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
480 mle = list_entry(iter, struct dlm_master_list_entry,
481 hb_events);
482 if (node_up) 472 if (node_up)
483 dlm_mle_node_up(dlm, mle, NULL, idx); 473 dlm_mle_node_up(dlm, mle, NULL, idx);
484 else 474 else
@@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2434 int ret; 2424 int ret;
2435 int i; 2425 int i;
2436 int count = 0; 2426 int count = 0;
2437 struct list_head *queue, *iter; 2427 struct list_head *queue;
2438 struct dlm_lock *lock; 2428 struct dlm_lock *lock;
2439 2429
2440 assert_spin_locked(&res->spinlock); 2430 assert_spin_locked(&res->spinlock);
@@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2453 ret = 0; 2443 ret = 0;
2454 queue = &res->granted; 2444 queue = &res->granted;
2455 for (i = 0; i < 3; i++) { 2445 for (i = 0; i < 3; i++) {
2456 list_for_each(iter, queue) { 2446 list_for_each_entry(lock, queue, list) {
2457 lock = list_entry(iter, struct dlm_lock, list);
2458 ++count; 2447 ++count;
2459 if (lock->ml.node == dlm->node_num) { 2448 if (lock->ml.node == dlm->node_num) {
2460 mlog(0, "found a lock owned by this node still " 2449 mlog(0, "found a lock owned by this node still "
@@ -2923,18 +2912,16 @@ again:
2923static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 2912static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2924 struct dlm_lock_resource *res) 2913 struct dlm_lock_resource *res)
2925{ 2914{
2926 struct list_head *iter, *iter2;
2927 struct list_head *queue = &res->granted; 2915 struct list_head *queue = &res->granted;
2928 int i, bit; 2916 int i, bit;
2929 struct dlm_lock *lock; 2917 struct dlm_lock *lock, *next;
2930 2918
2931 assert_spin_locked(&res->spinlock); 2919 assert_spin_locked(&res->spinlock);
2932 2920
2933 BUG_ON(res->owner == dlm->node_num); 2921 BUG_ON(res->owner == dlm->node_num);
2934 2922
2935 for (i=0; i<3; i++) { 2923 for (i=0; i<3; i++) {
2936 list_for_each_safe(iter, iter2, queue) { 2924 list_for_each_entry_safe(lock, next, queue, list) {
2937 lock = list_entry (iter, struct dlm_lock, list);
2938 if (lock->ml.node != dlm->node_num) { 2925 if (lock->ml.node != dlm->node_num) {
2939 mlog(0, "putting lock for node %u\n", 2926 mlog(0, "putting lock for node %u\n",
2940 lock->ml.node); 2927 lock->ml.node);
@@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2976{ 2963{
2977 int i; 2964 int i;
2978 struct list_head *queue = &res->granted; 2965 struct list_head *queue = &res->granted;
2979 struct list_head *iter;
2980 struct dlm_lock *lock; 2966 struct dlm_lock *lock;
2981 int nodenum; 2967 int nodenum;
2982 2968
@@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2984 2970
2985 spin_lock(&res->spinlock); 2971 spin_lock(&res->spinlock);
2986 for (i=0; i<3; i++) { 2972 for (i=0; i<3; i++) {
2987 list_for_each(iter, queue) { 2973 list_for_each_entry(lock, queue, list) {
2988 /* up to the caller to make sure this node 2974 /* up to the caller to make sure this node
2989 * is alive */ 2975 * is alive */
2990 lock = list_entry (iter, struct dlm_lock, list);
2991 if (lock->ml.node != dlm->node_num) { 2976 if (lock->ml.node != dlm->node_num) {
2992 spin_unlock(&res->spinlock); 2977 spin_unlock(&res->spinlock);
2993 return lock->ml.node; 2978 return lock->ml.node;
@@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3234 3219
3235void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3220void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3236{ 3221{
3237 struct list_head *iter, *iter2; 3222 struct dlm_master_list_entry *mle, *next;
3238 struct dlm_master_list_entry *mle;
3239 struct dlm_lock_resource *res; 3223 struct dlm_lock_resource *res;
3240 unsigned int hash; 3224 unsigned int hash;
3241 3225
@@ -3245,9 +3229,7 @@ top:
3245 3229
3246 /* clean the master list */ 3230 /* clean the master list */
3247 spin_lock(&dlm->master_lock); 3231 spin_lock(&dlm->master_lock);
3248 list_for_each_safe(iter, iter2, &dlm->master_list) { 3232 list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
3249 mle = list_entry(iter, struct dlm_master_list_entry, list);
3250
3251 BUG_ON(mle->type != DLM_MLE_BLOCK && 3233 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3252 mle->type != DLM_MLE_MASTER && 3234 mle->type != DLM_MLE_MASTER &&
3253 mle->type != DLM_MLE_MIGRATION); 3235 mle->type != DLM_MLE_MIGRATION);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 671c4ed58e..a2c33160bf 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work)
158 struct dlm_ctxt *dlm = 158 struct dlm_ctxt *dlm =
159 container_of(work, struct dlm_ctxt, dispatched_work); 159 container_of(work, struct dlm_ctxt, dispatched_work);
160 LIST_HEAD(tmp_list); 160 LIST_HEAD(tmp_list);
161 struct list_head *iter, *iter2; 161 struct dlm_work_item *item, *next;
162 struct dlm_work_item *item;
163 dlm_workfunc_t *workfunc; 162 dlm_workfunc_t *workfunc;
164 int tot=0; 163 int tot=0;
165 164
@@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work)
167 list_splice_init(&dlm->work_list, &tmp_list); 166 list_splice_init(&dlm->work_list, &tmp_list);
168 spin_unlock(&dlm->work_lock); 167 spin_unlock(&dlm->work_lock);
169 168
170 list_for_each_safe(iter, iter2, &tmp_list) { 169 list_for_each_entry(item, &tmp_list, list) {
171 tot++; 170 tot++;
172 } 171 }
173 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); 172 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
174 173
175 list_for_each_safe(iter, iter2, &tmp_list) { 174 list_for_each_entry_safe(item, next, &tmp_list, list) {
176 item = list_entry(iter, struct dlm_work_item, list);
177 workfunc = item->func; 175 workfunc = item->func;
178 list_del_init(&item->list); 176 list_del_init(&item->list);
179 177
@@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
549{ 547{
550 int status = 0; 548 int status = 0;
551 struct dlm_reco_node_data *ndata; 549 struct dlm_reco_node_data *ndata;
552 struct list_head *iter;
553 int all_nodes_done; 550 int all_nodes_done;
554 int destroy = 0; 551 int destroy = 0;
555 int pass = 0; 552 int pass = 0;
@@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
567 564
568 /* safe to access the node data list without a lock, since this 565 /* safe to access the node data list without a lock, since this
569 * process is the only one to change the list */ 566 * process is the only one to change the list */
570 list_for_each(iter, &dlm->reco.node_data) { 567 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
571 ndata = list_entry (iter, struct dlm_reco_node_data, list);
572 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 568 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
573 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 569 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
574 570
@@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
655 * done, or if anyone died */ 651 * done, or if anyone died */
656 all_nodes_done = 1; 652 all_nodes_done = 1;
657 spin_lock(&dlm_reco_state_lock); 653 spin_lock(&dlm_reco_state_lock);
658 list_for_each(iter, &dlm->reco.node_data) { 654 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
659 ndata = list_entry (iter, struct dlm_reco_node_data, list);
660
661 mlog(0, "checking recovery state of node %u\n", 655 mlog(0, "checking recovery state of node %u\n",
662 ndata->node_num); 656 ndata->node_num);
663 switch (ndata->state) { 657 switch (ndata->state) {
@@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
774 768
775static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) 769static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
776{ 770{
777 struct list_head *iter, *iter2; 771 struct dlm_reco_node_data *ndata, *next;
778 struct dlm_reco_node_data *ndata;
779 LIST_HEAD(tmplist); 772 LIST_HEAD(tmplist);
780 773
781 spin_lock(&dlm_reco_state_lock); 774 spin_lock(&dlm_reco_state_lock);
782 list_splice_init(&dlm->reco.node_data, &tmplist); 775 list_splice_init(&dlm->reco.node_data, &tmplist);
783 spin_unlock(&dlm_reco_state_lock); 776 spin_unlock(&dlm_reco_state_lock);
784 777
785 list_for_each_safe(iter, iter2, &tmplist) { 778 list_for_each_entry_safe(ndata, next, &tmplist, list) {
786 ndata = list_entry (iter, struct dlm_reco_node_data, list);
787 list_del_init(&ndata->list); 779 list_del_init(&ndata->list);
788 kfree(ndata); 780 kfree(ndata);
789 } 781 }
@@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
876 struct dlm_lock_resource *res; 868 struct dlm_lock_resource *res;
877 struct dlm_ctxt *dlm; 869 struct dlm_ctxt *dlm;
878 LIST_HEAD(resources); 870 LIST_HEAD(resources);
879 struct list_head *iter;
880 int ret; 871 int ret;
881 u8 dead_node, reco_master; 872 u8 dead_node, reco_master;
882 int skip_all_done = 0; 873 int skip_all_done = 0;
@@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
920 911
921 /* any errors returned will be due to the new_master dying, 912 /* any errors returned will be due to the new_master dying,
922 * the dlm_reco_thread should detect this */ 913 * the dlm_reco_thread should detect this */
923 list_for_each(iter, &resources) { 914 list_for_each_entry(res, &resources, recovering) {
924 res = list_entry (iter, struct dlm_lock_resource, recovering);
925 ret = dlm_send_one_lockres(dlm, res, mres, reco_master, 915 ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
926 DLM_MRES_RECOVERY); 916 DLM_MRES_RECOVERY);
927 if (ret < 0) { 917 if (ret < 0) {
@@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
983{ 973{
984 struct dlm_ctxt *dlm = data; 974 struct dlm_ctxt *dlm = data;
985 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; 975 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
986 struct list_head *iter;
987 struct dlm_reco_node_data *ndata = NULL; 976 struct dlm_reco_node_data *ndata = NULL;
988 int ret = -EINVAL; 977 int ret = -EINVAL;
989 978
@@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
1000 dlm->reco.dead_node, done->node_idx, dlm->node_num); 989 dlm->reco.dead_node, done->node_idx, dlm->node_num);
1001 990
1002 spin_lock(&dlm_reco_state_lock); 991 spin_lock(&dlm_reco_state_lock);
1003 list_for_each(iter, &dlm->reco.node_data) { 992 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
1004 ndata = list_entry (iter, struct dlm_reco_node_data, list);
1005 if (ndata->node_num != done->node_idx) 993 if (ndata->node_num != done->node_idx)
1006 continue; 994 continue;
1007 995
@@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1049 struct list_head *list, 1037 struct list_head *list,
1050 u8 dead_node) 1038 u8 dead_node)
1051{ 1039{
1052 struct dlm_lock_resource *res; 1040 struct dlm_lock_resource *res, *next;
1053 struct list_head *iter, *iter2;
1054 struct dlm_lock *lock; 1041 struct dlm_lock *lock;
1055 1042
1056 spin_lock(&dlm->spinlock); 1043 spin_lock(&dlm->spinlock);
1057 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 1044 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
1058 res = list_entry (iter, struct dlm_lock_resource, recovering);
1059 /* always prune any $RECOVERY entries for dead nodes, 1045 /* always prune any $RECOVERY entries for dead nodes,
1060 * otherwise hangs can occur during later recovery */ 1046 * otherwise hangs can occur during later recovery */
1061 if (dlm_is_recovery_lock(res->lockname.name, 1047 if (dlm_is_recovery_lock(res->lockname.name,
@@ -1169,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
1169 u8 flags, u8 master) 1155 u8 flags, u8 master)
1170{ 1156{
1171 /* mres here is one full page */ 1157 /* mres here is one full page */
1172 memset(mres, 0, PAGE_SIZE); 1158 clear_page(mres);
1173 mres->lockname_len = namelen; 1159 mres->lockname_len = namelen;
1174 memcpy(mres->lockname, lockname, namelen); 1160 memcpy(mres->lockname, lockname, namelen);
1175 mres->num_locks = 0; 1161 mres->num_locks = 0;
@@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1252 struct dlm_migratable_lockres *mres, 1238 struct dlm_migratable_lockres *mres,
1253 u8 send_to, u8 flags) 1239 u8 send_to, u8 flags)
1254{ 1240{
1255 struct list_head *queue, *iter; 1241 struct list_head *queue;
1256 int total_locks, i; 1242 int total_locks, i;
1257 u64 mig_cookie = 0; 1243 u64 mig_cookie = 0;
1258 struct dlm_lock *lock; 1244 struct dlm_lock *lock;
@@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1278 total_locks = 0; 1264 total_locks = 0;
1279 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { 1265 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
1280 queue = dlm_list_idx_to_ptr(res, i); 1266 queue = dlm_list_idx_to_ptr(res, i);
1281 list_for_each(iter, queue) { 1267 list_for_each_entry(lock, queue, list) {
1282 lock = list_entry (iter, struct dlm_lock, list);
1283
1284 /* add another lock. */ 1268 /* add another lock. */
1285 total_locks++; 1269 total_locks++;
1286 if (!dlm_add_lock_to_array(lock, mres, i)) 1270 if (!dlm_add_lock_to_array(lock, mres, i))
@@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1717 struct dlm_lockstatus *lksb = NULL; 1701 struct dlm_lockstatus *lksb = NULL;
1718 int ret = 0; 1702 int ret = 0;
1719 int i, j, bad; 1703 int i, j, bad;
1720 struct list_head *iter;
1721 struct dlm_lock *lock = NULL; 1704 struct dlm_lock *lock = NULL;
1722 u8 from = O2NM_MAX_NODES; 1705 u8 from = O2NM_MAX_NODES;
1723 unsigned int added = 0; 1706 unsigned int added = 0;
@@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1755 spin_lock(&res->spinlock); 1738 spin_lock(&res->spinlock);
1756 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { 1739 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1757 tmpq = dlm_list_idx_to_ptr(res, j); 1740 tmpq = dlm_list_idx_to_ptr(res, j);
1758 list_for_each(iter, tmpq) { 1741 list_for_each_entry(lock, tmpq, list) {
1759 lock = list_entry (iter, struct dlm_lock, list);
1760 if (lock->ml.cookie != ml->cookie) 1742 if (lock->ml.cookie != ml->cookie)
1761 lock = NULL; 1743 lock = NULL;
1762 else 1744 else
@@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1930 struct dlm_lock_resource *res) 1912 struct dlm_lock_resource *res)
1931{ 1913{
1932 int i; 1914 int i;
1933 struct list_head *queue, *iter, *iter2; 1915 struct list_head *queue;
1934 struct dlm_lock *lock; 1916 struct dlm_lock *lock, *next;
1935 1917
1936 res->state |= DLM_LOCK_RES_RECOVERING; 1918 res->state |= DLM_LOCK_RES_RECOVERING;
1937 if (!list_empty(&res->recovering)) { 1919 if (!list_empty(&res->recovering)) {
@@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1947 /* find any pending locks and put them back on proper list */ 1929 /* find any pending locks and put them back on proper list */
1948 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { 1930 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
1949 queue = dlm_list_idx_to_ptr(res, i); 1931 queue = dlm_list_idx_to_ptr(res, i);
1950 list_for_each_safe(iter, iter2, queue) { 1932 list_for_each_entry_safe(lock, next, queue, list) {
1951 lock = list_entry (iter, struct dlm_lock, list);
1952 dlm_lock_get(lock); 1933 dlm_lock_get(lock);
1953 if (lock->convert_pending) { 1934 if (lock->convert_pending) {
1954 /* move converting lock back to granted */ 1935 /* move converting lock back to granted */
@@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2013 u8 dead_node, u8 new_master) 1994 u8 dead_node, u8 new_master)
2014{ 1995{
2015 int i; 1996 int i;
2016 struct list_head *iter, *iter2;
2017 struct hlist_node *hash_iter; 1997 struct hlist_node *hash_iter;
2018 struct hlist_head *bucket; 1998 struct hlist_head *bucket;
2019 1999 struct dlm_lock_resource *res, *next;
2020 struct dlm_lock_resource *res;
2021 2000
2022 mlog_entry_void(); 2001 mlog_entry_void();
2023 2002
2024 assert_spin_locked(&dlm->spinlock); 2003 assert_spin_locked(&dlm->spinlock);
2025 2004
2026 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 2005 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
2027 res = list_entry (iter, struct dlm_lock_resource, recovering);
2028 if (res->owner == dead_node) { 2006 if (res->owner == dead_node) {
2029 list_del_init(&res->recovering); 2007 list_del_init(&res->recovering);
2030 spin_lock(&res->spinlock); 2008 spin_lock(&res->spinlock);
@@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
2099static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, 2077static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2100 struct dlm_lock_resource *res, u8 dead_node) 2078 struct dlm_lock_resource *res, u8 dead_node)
2101{ 2079{
2102 struct list_head *iter, *queue; 2080 struct list_head *queue;
2103 struct dlm_lock *lock; 2081 struct dlm_lock *lock;
2104 int blank_lvb = 0, local = 0; 2082 int blank_lvb = 0, local = 0;
2105 int i; 2083 int i;
@@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2121 2099
2122 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { 2100 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
2123 queue = dlm_list_idx_to_ptr(res, i); 2101 queue = dlm_list_idx_to_ptr(res, i);
2124 list_for_each(iter, queue) { 2102 list_for_each_entry(lock, queue, list) {
2125 lock = list_entry (iter, struct dlm_lock, list);
2126 if (lock->ml.node == search_node) { 2103 if (lock->ml.node == search_node) {
2127 if (dlm_lvb_needs_invalidation(lock, local)) { 2104 if (dlm_lvb_needs_invalidation(lock, local)) {
2128 /* zero the lksb lvb and lockres lvb */ 2105 /* zero the lksb lvb and lockres lvb */
@@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2143static void dlm_free_dead_locks(struct dlm_ctxt *dlm, 2120static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2144 struct dlm_lock_resource *res, u8 dead_node) 2121 struct dlm_lock_resource *res, u8 dead_node)
2145{ 2122{
2146 struct list_head *iter, *tmpiter; 2123 struct dlm_lock *lock, *next;
2147 struct dlm_lock *lock;
2148 unsigned int freed = 0; 2124 unsigned int freed = 0;
2149 2125
2150 /* this node is the lockres master: 2126 /* this node is the lockres master:
@@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2155 assert_spin_locked(&res->spinlock); 2131 assert_spin_locked(&res->spinlock);
2156 2132
2157 /* TODO: check pending_asts, pending_basts here */ 2133 /* TODO: check pending_asts, pending_basts here */
2158 list_for_each_safe(iter, tmpiter, &res->granted) { 2134 list_for_each_entry_safe(lock, next, &res->granted, list) {
2159 lock = list_entry (iter, struct dlm_lock, list);
2160 if (lock->ml.node == dead_node) { 2135 if (lock->ml.node == dead_node) {
2161 list_del_init(&lock->list); 2136 list_del_init(&lock->list);
2162 dlm_lock_put(lock); 2137 dlm_lock_put(lock);
2163 freed++; 2138 freed++;
2164 } 2139 }
2165 } 2140 }
2166 list_for_each_safe(iter, tmpiter, &res->converting) { 2141 list_for_each_entry_safe(lock, next, &res->converting, list) {
2167 lock = list_entry (iter, struct dlm_lock, list);
2168 if (lock->ml.node == dead_node) { 2142 if (lock->ml.node == dead_node) {
2169 list_del_init(&lock->list); 2143 list_del_init(&lock->list);
2170 dlm_lock_put(lock); 2144 dlm_lock_put(lock);
2171 freed++; 2145 freed++;
2172 } 2146 }
2173 } 2147 }
2174 list_for_each_safe(iter, tmpiter, &res->blocked) { 2148 list_for_each_entry_safe(lock, next, &res->blocked, list) {
2175 lock = list_entry (iter, struct dlm_lock, list);
2176 if (lock->ml.node == dead_node) { 2149 if (lock->ml.node == dead_node) {
2177 list_del_init(&lock->list); 2150 list_del_init(&lock->list);
2178 dlm_lock_put(lock); 2151 dlm_lock_put(lock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index d1bd305ef0..f71250ed16 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_lock_level(int level)
600static void lockres_set_flags(struct ocfs2_lock_res *lockres, 600static void lockres_set_flags(struct ocfs2_lock_res *lockres,
601 unsigned long newflags) 601 unsigned long newflags)
602{ 602{
603 struct list_head *pos, *tmp; 603 struct ocfs2_mask_waiter *mw, *tmp;
604 struct ocfs2_mask_waiter *mw;
605 604
606 assert_spin_locked(&lockres->l_lock); 605 assert_spin_locked(&lockres->l_lock);
607 606
608 lockres->l_flags = newflags; 607 lockres->l_flags = newflags;
609 608
610 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { 609 list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
611 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
612 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) 610 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
613 continue; 611 continue;
614 612
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index f226b22076..ff257628af 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val)
32 *var = cpu_to_le32(le32_to_cpu(*var) + val); 32 *var = cpu_to_le32(le32_to_cpu(*var) + val);
33} 33}
34 34
35static inline void le64_add_cpu(__le64 *var, u64 val)
36{
37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38}
39
35static inline void le32_and_cpu(__le32 *var, u32 val) 40static inline void le32_and_cpu(__le32 *var, u32 val)
36{ 41{
37 *var = cpu_to_le32(le32_to_cpu(*var) & val); 42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index 5b77ee7866..e08bed9e45 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -26,6 +26,8 @@
26#ifndef OCFS2_EXPORT_H 26#ifndef OCFS2_EXPORT_H
27#define OCFS2_EXPORT_H 27#define OCFS2_EXPORT_H
28 28
29#include <linux/exportfs.h>
30
29extern struct export_operations ocfs2_export_ops; 31extern struct export_operations ocfs2_export_ops;
30 32
31#endif /* OCFS2_EXPORT_H */ 33#endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index ba2b2ab1c6..03c1d365c7 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
109 */ 109 */
110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) 110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
111{ 111{
112 struct list_head *p, *n; 112 struct ocfs2_extent_map_item *emi, *n;
113 struct ocfs2_extent_map_item *emi;
114 struct ocfs2_inode_info *oi = OCFS2_I(inode); 113 struct ocfs2_inode_info *oi = OCFS2_I(inode);
115 struct ocfs2_extent_map *em = &oi->ip_extent_map; 114 struct ocfs2_extent_map *em = &oi->ip_extent_map;
116 LIST_HEAD(tmp_list); 115 LIST_HEAD(tmp_list);
117 unsigned int range; 116 unsigned int range;
118 117
119 spin_lock(&oi->ip_lock); 118 spin_lock(&oi->ip_lock);
120 list_for_each_safe(p, n, &em->em_list) { 119 list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
121 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
122
123 if (emi->ei_cpos >= cpos) { 120 if (emi->ei_cpos >= cpos) {
124 /* Full truncate of this record. */ 121 /* Full truncate of this record. */
125 list_move(&emi->ei_list, &tmp_list); 122 list_move(&emi->ei_list, &tmp_list);
@@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
136 } 133 }
137 spin_unlock(&oi->ip_lock); 134 spin_unlock(&oi->ip_lock);
138 135
139 list_for_each_safe(p, n, &tmp_list) { 136 list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
140 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
141 list_del(&emi->ei_list); 137 list_del(&emi->ei_list);
142 kfree(emi); 138 kfree(emi);
143 } 139 }
@@ -377,37 +373,6 @@ out:
377 return ret; 373 return ret;
378} 374}
379 375
380/*
381 * Return the index of the extent record which contains cluster #v_cluster.
382 * -1 is returned if it was not found.
383 *
384 * Should work fine on interior and exterior nodes.
385 */
386static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
387 u32 v_cluster)
388{
389 int ret = -1;
390 int i;
391 struct ocfs2_extent_rec *rec;
392 u32 rec_end, rec_start, clusters;
393
394 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
395 rec = &el->l_recs[i];
396
397 rec_start = le32_to_cpu(rec->e_cpos);
398 clusters = ocfs2_rec_clusters(el, rec);
399
400 rec_end = rec_start + clusters;
401
402 if (v_cluster >= rec_start && v_cluster < rec_end) {
403 ret = i;
404 break;
405 }
406 }
407
408 return ret;
409}
410
411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
412 u32 *p_cluster, u32 *num_clusters, 377 u32 *p_cluster, u32 *num_clusters,
413 unsigned int *extent_flags) 378 unsigned int *extent_flags)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ac6c96431b..004c2abbc7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -31,7 +31,7 @@
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/uio.h> 32#include <linux/uio.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/pipe_fs_i.h> 34#include <linux/splice.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37 37
@@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
263 int status; 263 int status;
264 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di; 265 struct ocfs2_dinode *di;
266 u64 cluster_bytes;
266 267
267 mlog_entry_void(); 268 mlog_entry_void();
268 269
@@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
286 /* 287 /*
287 * Do this before setting i_size. 288 * Do this before setting i_size.
288 */ 289 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); 290 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
291 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
292 cluster_bytes);
290 if (status) { 293 if (status) {
291 mlog_errno(status); 294 mlog_errno(status);
292 goto out_commit; 295 goto out_commit;
@@ -326,9 +329,6 @@ static int ocfs2_truncate_file(struct inode *inode,
326 (unsigned long long)OCFS2_I(inode)->ip_blkno, 329 (unsigned long long)OCFS2_I(inode)->ip_blkno,
327 (unsigned long long)new_i_size); 330 (unsigned long long)new_i_size);
328 331
329 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
330 truncate_inode_pages(inode->i_mapping, new_i_size);
331
332 fe = (struct ocfs2_dinode *) di_bh->b_data; 332 fe = (struct ocfs2_dinode *) di_bh->b_data;
333 if (!OCFS2_IS_VALID_DINODE(fe)) { 333 if (!OCFS2_IS_VALID_DINODE(fe)) {
334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
@@ -363,16 +363,23 @@ static int ocfs2_truncate_file(struct inode *inode,
363 if (new_i_size == le64_to_cpu(fe->i_size)) 363 if (new_i_size == le64_to_cpu(fe->i_size))
364 goto bail; 364 goto bail;
365 365
366 down_write(&OCFS2_I(inode)->ip_alloc_sem);
367
366 /* This forces other nodes to sync and drop their pages. Do 368 /* This forces other nodes to sync and drop their pages. Do
367 * this even if we have a truncate without allocation change - 369 * this even if we have a truncate without allocation change -
368 * ocfs2 cluster sizes can be much greater than page size, so 370 * ocfs2 cluster sizes can be much greater than page size, so
369 * we have to truncate them anyway. */ 371 * we have to truncate them anyway. */
370 status = ocfs2_data_lock(inode, 1); 372 status = ocfs2_data_lock(inode, 1);
371 if (status < 0) { 373 if (status < 0) {
374 up_write(&OCFS2_I(inode)->ip_alloc_sem);
375
372 mlog_errno(status); 376 mlog_errno(status);
373 goto bail; 377 goto bail;
374 } 378 }
375 379
380 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
381 truncate_inode_pages(inode->i_mapping, new_i_size);
382
376 /* alright, we're going to need to do a full blown alloc size 383 /* alright, we're going to need to do a full blown alloc size
377 * change. Orphan the inode so that recovery can complete the 384 * change. Orphan the inode so that recovery can complete the
378 * truncate if necessary. This does the task of marking 385 * truncate if necessary. This does the task of marking
@@ -399,6 +406,8 @@ static int ocfs2_truncate_file(struct inode *inode,
399bail_unlock_data: 406bail_unlock_data:
400 ocfs2_data_unlock(inode, 1); 407 ocfs2_data_unlock(inode, 1);
401 408
409 up_write(&OCFS2_I(inode)->ip_alloc_sem);
410
402bail: 411bail:
403 412
404 mlog_exit(status); 413 mlog_exit(status);
@@ -419,6 +428,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
419 struct inode *inode, 428 struct inode *inode,
420 u32 *logical_offset, 429 u32 *logical_offset,
421 u32 clusters_to_add, 430 u32 clusters_to_add,
431 int mark_unwritten,
422 struct buffer_head *fe_bh, 432 struct buffer_head *fe_bh,
423 handle_t *handle, 433 handle_t *handle,
424 struct ocfs2_alloc_context *data_ac, 434 struct ocfs2_alloc_context *data_ac,
@@ -431,9 +441,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
431 enum ocfs2_alloc_restarted reason = RESTART_NONE; 441 enum ocfs2_alloc_restarted reason = RESTART_NONE;
432 u32 bit_off, num_bits; 442 u32 bit_off, num_bits;
433 u64 block; 443 u64 block;
444 u8 flags = 0;
434 445
435 BUG_ON(!clusters_to_add); 446 BUG_ON(!clusters_to_add);
436 447
448 if (mark_unwritten)
449 flags = OCFS2_EXT_UNWRITTEN;
450
437 free_extents = ocfs2_num_free_extents(osb, inode, fe); 451 free_extents = ocfs2_num_free_extents(osb, inode, fe);
438 if (free_extents < 0) { 452 if (free_extents < 0) {
439 status = free_extents; 453 status = free_extents;
@@ -483,7 +497,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
483 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 497 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
484 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, 498 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
485 *logical_offset, block, num_bits, 499 *logical_offset, block, num_bits,
486 meta_ac); 500 flags, meta_ac);
487 if (status < 0) { 501 if (status < 0) {
488 mlog_errno(status); 502 mlog_errno(status);
489 goto leave; 503 goto leave;
@@ -516,25 +530,31 @@ leave:
516 * For a given allocation, determine which allocators will need to be 530 * For a given allocation, determine which allocators will need to be
517 * accessed, and lock them, reserving the appropriate number of bits. 531 * accessed, and lock them, reserving the appropriate number of bits.
518 * 532 *
519 * Called from ocfs2_extend_allocation() for file systems which don't 533 * Sparse file systems call this from ocfs2_write_begin_nolock()
520 * support holes, and from ocfs2_write() for file systems which 534 * and ocfs2_allocate_unwritten_extents().
521 * understand sparse inodes. 535 *
536 * File systems which don't support holes call this from
537 * ocfs2_extend_allocation().
522 */ 538 */
523int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 539int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
524 u32 clusters_to_add, 540 u32 clusters_to_add, u32 extents_to_split,
525 struct ocfs2_alloc_context **data_ac, 541 struct ocfs2_alloc_context **data_ac,
526 struct ocfs2_alloc_context **meta_ac) 542 struct ocfs2_alloc_context **meta_ac)
527{ 543{
528 int ret, num_free_extents; 544 int ret = 0, num_free_extents;
545 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
529 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 546 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
530 547
531 *meta_ac = NULL; 548 *meta_ac = NULL;
532 *data_ac = NULL; 549 if (data_ac)
550 *data_ac = NULL;
551
552 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
533 553
534 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 554 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
535 "clusters_to_add = %u\n", 555 "clusters_to_add = %u, extents_to_split = %u\n",
536 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 556 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
537 le32_to_cpu(di->i_clusters), clusters_to_add); 557 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
538 558
539 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 559 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
540 if (num_free_extents < 0) { 560 if (num_free_extents < 0) {
@@ -552,9 +572,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
552 * 572 *
553 * Most of the time we'll only be seeing this 1 cluster at a time 573 * Most of the time we'll only be seeing this 1 cluster at a time
554 * anyway. 574 * anyway.
575 *
576 * Always lock for any unwritten extents - we might want to
577 * add blocks during a split.
555 */ 578 */
556 if (!num_free_extents || 579 if (!num_free_extents ||
557 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { 580 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
558 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); 581 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
559 if (ret < 0) { 582 if (ret < 0) {
560 if (ret != -ENOSPC) 583 if (ret != -ENOSPC)
@@ -563,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
563 } 586 }
564 } 587 }
565 588
589 if (clusters_to_add == 0)
590 goto out;
591
566 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 592 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
567 if (ret < 0) { 593 if (ret < 0) {
568 if (ret != -ENOSPC) 594 if (ret != -ENOSPC)
@@ -585,14 +611,13 @@ out:
585 return ret; 611 return ret;
586} 612}
587 613
588static int ocfs2_extend_allocation(struct inode *inode, 614static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
589 u32 clusters_to_add) 615 u32 clusters_to_add, int mark_unwritten)
590{ 616{
591 int status = 0; 617 int status = 0;
592 int restart_func = 0; 618 int restart_func = 0;
593 int drop_alloc_sem = 0;
594 int credits; 619 int credits;
595 u32 prev_clusters, logical_start; 620 u32 prev_clusters;
596 struct buffer_head *bh = NULL; 621 struct buffer_head *bh = NULL;
597 struct ocfs2_dinode *fe = NULL; 622 struct ocfs2_dinode *fe = NULL;
598 handle_t *handle = NULL; 623 handle_t *handle = NULL;
@@ -607,7 +632,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
607 * This function only exists for file systems which don't 632 * This function only exists for file systems which don't
608 * support holes. 633 * support holes.
609 */ 634 */
610 BUG_ON(ocfs2_sparse_alloc(osb)); 635 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
611 636
612 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 637 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
613 OCFS2_BH_CACHED, inode); 638 OCFS2_BH_CACHED, inode);
@@ -623,19 +648,10 @@ static int ocfs2_extend_allocation(struct inode *inode,
623 goto leave; 648 goto leave;
624 } 649 }
625 650
626 logical_start = OCFS2_I(inode)->ip_clusters;
627
628restart_all: 651restart_all:
629 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 652 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
630 653
631 /* blocks peope in read/write from reading our allocation 654 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
632 * until we're done changing it. We depend on i_mutex to block
633 * other extend/truncate calls while we're here. Ordering wrt
634 * start_trans is important here -- always do it before! */
635 down_write(&OCFS2_I(inode)->ip_alloc_sem);
636 drop_alloc_sem = 1;
637
638 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
639 &meta_ac); 655 &meta_ac);
640 if (status) { 656 if (status) {
641 mlog_errno(status); 657 mlog_errno(status);
@@ -668,6 +684,7 @@ restarted_transaction:
668 inode, 684 inode,
669 &logical_start, 685 &logical_start,
670 clusters_to_add, 686 clusters_to_add,
687 mark_unwritten,
671 bh, 688 bh,
672 handle, 689 handle,
673 data_ac, 690 data_ac,
@@ -720,10 +737,6 @@ restarted_transaction:
720 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 737 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
721 738
722leave: 739leave:
723 if (drop_alloc_sem) {
724 up_write(&OCFS2_I(inode)->ip_alloc_sem);
725 drop_alloc_sem = 0;
726 }
727 if (handle) { 740 if (handle) {
728 ocfs2_commit_trans(osb, handle); 741 ocfs2_commit_trans(osb, handle);
729 handle = NULL; 742 handle = NULL;
@@ -749,6 +762,25 @@ leave:
749 return status; 762 return status;
750} 763}
751 764
765static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
766 u32 clusters_to_add, int mark_unwritten)
767{
768 int ret;
769
770 /*
771 * The alloc sem blocks peope in read/write from reading our
772 * allocation until we're done changing it. We depend on
773 * i_mutex to block other extend/truncate calls while we're
774 * here.
775 */
776 down_write(&OCFS2_I(inode)->ip_alloc_sem);
777 ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
778 mark_unwritten);
779 up_write(&OCFS2_I(inode)->ip_alloc_sem);
780
781 return ret;
782}
783
752/* Some parts of this taken from generic_cont_expand, which turned out 784/* Some parts of this taken from generic_cont_expand, which turned out
753 * to be too fragile to do exactly what we need without us having to 785 * to be too fragile to do exactly what we need without us having to
754 * worry about recursive locking in ->prepare_write() and 786 * worry about recursive locking in ->prepare_write() and
@@ -890,7 +922,9 @@ static int ocfs2_extend_file(struct inode *inode,
890 } 922 }
891 923
892 if (clusters_to_add) { 924 if (clusters_to_add) {
893 ret = ocfs2_extend_allocation(inode, clusters_to_add); 925 ret = ocfs2_extend_allocation(inode,
926 OCFS2_I(inode)->ip_clusters,
927 clusters_to_add, 0);
894 if (ret < 0) { 928 if (ret < 0) {
895 mlog_errno(ret); 929 mlog_errno(ret);
896 goto out_unlock; 930 goto out_unlock;
@@ -995,6 +1029,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
995 goto bail_unlock; 1029 goto bail_unlock;
996 } 1030 }
997 1031
1032 /*
1033 * This will intentionally not wind up calling vmtruncate(),
1034 * since all the work for a size change has been done above.
1035 * Otherwise, we could get into problems with truncate as
1036 * ip_alloc_sem is used there to protect against i_size
1037 * changes.
1038 */
998 status = inode_setattr(inode, attr); 1039 status = inode_setattr(inode, attr);
999 if (status < 0) { 1040 if (status < 0) {
1000 mlog_errno(status); 1041 mlog_errno(status);
@@ -1070,17 +1111,16 @@ out:
1070 return ret; 1111 return ret;
1071} 1112}
1072 1113
1073static int ocfs2_write_remove_suid(struct inode *inode) 1114static int __ocfs2_write_remove_suid(struct inode *inode,
1115 struct buffer_head *bh)
1074{ 1116{
1075 int ret; 1117 int ret;
1076 struct buffer_head *bh = NULL;
1077 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1078 handle_t *handle; 1118 handle_t *handle;
1079 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1119 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1080 struct ocfs2_dinode *di; 1120 struct ocfs2_dinode *di;
1081 1121
1082 mlog_entry("(Inode %llu, mode 0%o)\n", 1122 mlog_entry("(Inode %llu, mode 0%o)\n",
1083 (unsigned long long)oi->ip_blkno, inode->i_mode); 1123 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
1084 1124
1085 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1125 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1086 if (handle == NULL) { 1126 if (handle == NULL) {
@@ -1089,17 +1129,11 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1089 goto out; 1129 goto out;
1090 } 1130 }
1091 1131
1092 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1093 if (ret < 0) {
1094 mlog_errno(ret);
1095 goto out_trans;
1096 }
1097
1098 ret = ocfs2_journal_access(handle, inode, bh, 1132 ret = ocfs2_journal_access(handle, inode, bh,
1099 OCFS2_JOURNAL_ACCESS_WRITE); 1133 OCFS2_JOURNAL_ACCESS_WRITE);
1100 if (ret < 0) { 1134 if (ret < 0) {
1101 mlog_errno(ret); 1135 mlog_errno(ret);
1102 goto out_bh; 1136 goto out_trans;
1103 } 1137 }
1104 1138
1105 inode->i_mode &= ~S_ISUID; 1139 inode->i_mode &= ~S_ISUID;
@@ -1112,8 +1146,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1112 ret = ocfs2_journal_dirty(handle, bh); 1146 ret = ocfs2_journal_dirty(handle, bh);
1113 if (ret < 0) 1147 if (ret < 0)
1114 mlog_errno(ret); 1148 mlog_errno(ret);
1115out_bh: 1149
1116 brelse(bh);
1117out_trans: 1150out_trans:
1118 ocfs2_commit_trans(osb, handle); 1151 ocfs2_commit_trans(osb, handle);
1119out: 1152out:
@@ -1159,6 +1192,460 @@ out:
1159 return ret; 1192 return ret;
1160} 1193}
1161 1194
1195static int ocfs2_write_remove_suid(struct inode *inode)
1196{
1197 int ret;
1198 struct buffer_head *bh = NULL;
1199 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1200
1201 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1202 oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1203 if (ret < 0) {
1204 mlog_errno(ret);
1205 goto out;
1206 }
1207
1208 ret = __ocfs2_write_remove_suid(inode, bh);
1209out:
1210 brelse(bh);
1211 return ret;
1212}
1213
1214/*
1215 * Allocate enough extents to cover the region starting at byte offset
1216 * start for len bytes. Existing extents are skipped, any extents
1217 * added are marked as "unwritten".
1218 */
1219static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1220 u64 start, u64 len)
1221{
1222 int ret;
1223 u32 cpos, phys_cpos, clusters, alloc_size;
1224
1225 /*
1226 * We consider both start and len to be inclusive.
1227 */
1228 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1229 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1230 clusters -= cpos;
1231
1232 while (clusters) {
1233 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1234 &alloc_size, NULL);
1235 if (ret) {
1236 mlog_errno(ret);
1237 goto out;
1238 }
1239
1240 /*
1241 * Hole or existing extent len can be arbitrary, so
1242 * cap it to our own allocation request.
1243 */
1244 if (alloc_size > clusters)
1245 alloc_size = clusters;
1246
1247 if (phys_cpos) {
1248 /*
1249 * We already have an allocation at this
1250 * region so we can safely skip it.
1251 */
1252 goto next;
1253 }
1254
1255 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1256 if (ret) {
1257 if (ret != -ENOSPC)
1258 mlog_errno(ret);
1259 goto out;
1260 }
1261
1262next:
1263 cpos += alloc_size;
1264 clusters -= alloc_size;
1265 }
1266
1267 ret = 0;
1268out:
1269 return ret;
1270}
1271
1272static int __ocfs2_remove_inode_range(struct inode *inode,
1273 struct buffer_head *di_bh,
1274 u32 cpos, u32 phys_cpos, u32 len,
1275 struct ocfs2_cached_dealloc_ctxt *dealloc)
1276{
1277 int ret;
1278 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
1279 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1280 struct inode *tl_inode = osb->osb_tl_inode;
1281 handle_t *handle;
1282 struct ocfs2_alloc_context *meta_ac = NULL;
1283 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1284
1285 ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
1286 if (ret) {
1287 mlog_errno(ret);
1288 return ret;
1289 }
1290
1291 mutex_lock(&tl_inode->i_mutex);
1292
1293 if (ocfs2_truncate_log_needs_flush(osb)) {
1294 ret = __ocfs2_flush_truncate_log(osb);
1295 if (ret < 0) {
1296 mlog_errno(ret);
1297 goto out;
1298 }
1299 }
1300
1301 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
1302 if (handle == NULL) {
1303 ret = -ENOMEM;
1304 mlog_errno(ret);
1305 goto out;
1306 }
1307
1308 ret = ocfs2_journal_access(handle, inode, di_bh,
1309 OCFS2_JOURNAL_ACCESS_WRITE);
1310 if (ret) {
1311 mlog_errno(ret);
1312 goto out;
1313 }
1314
1315 ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
1316 dealloc);
1317 if (ret) {
1318 mlog_errno(ret);
1319 goto out_commit;
1320 }
1321
1322 OCFS2_I(inode)->ip_clusters -= len;
1323 di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1324
1325 ret = ocfs2_journal_dirty(handle, di_bh);
1326 if (ret) {
1327 mlog_errno(ret);
1328 goto out_commit;
1329 }
1330
1331 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
1332 if (ret)
1333 mlog_errno(ret);
1334
1335out_commit:
1336 ocfs2_commit_trans(osb, handle);
1337out:
1338 mutex_unlock(&tl_inode->i_mutex);
1339
1340 if (meta_ac)
1341 ocfs2_free_alloc_context(meta_ac);
1342
1343 return ret;
1344}
1345
1346/*
1347 * Truncate a byte range, avoiding pages within partial clusters. This
1348 * preserves those pages for the zeroing code to write to.
1349 */
1350static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1351 u64 byte_len)
1352{
1353 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1354 loff_t start, end;
1355 struct address_space *mapping = inode->i_mapping;
1356
1357 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1358 end = byte_start + byte_len;
1359 end = end & ~(osb->s_clustersize - 1);
1360
1361 if (start < end) {
1362 unmap_mapping_range(mapping, start, end - start, 0);
1363 truncate_inode_pages_range(mapping, start, end - 1);
1364 }
1365}
1366
1367static int ocfs2_zero_partial_clusters(struct inode *inode,
1368 u64 start, u64 len)
1369{
1370 int ret = 0;
1371 u64 tmpend, end = start + len;
1372 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1373 unsigned int csize = osb->s_clustersize;
1374 handle_t *handle;
1375
1376 /*
1377 * The "start" and "end" values are NOT necessarily part of
1378 * the range whose allocation is being deleted. Rather, this
1379 * is what the user passed in with the request. We must zero
1380 * partial clusters here. There's no need to worry about
1381 * physical allocation - the zeroing code knows to skip holes.
1382 */
1383 mlog(0, "byte start: %llu, end: %llu\n",
1384 (unsigned long long)start, (unsigned long long)end);
1385
1386 /*
1387 * If both edges are on a cluster boundary then there's no
1388 * zeroing required as the region is part of the allocation to
1389 * be truncated.
1390 */
1391 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1392 goto out;
1393
1394 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1395 if (handle == NULL) {
1396 ret = -ENOMEM;
1397 mlog_errno(ret);
1398 goto out;
1399 }
1400
1401 /*
1402 * We want to get the byte offset of the end of the 1st cluster.
1403 */
1404 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1405 if (tmpend > end)
1406 tmpend = end;
1407
1408 mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1409 (unsigned long long)start, (unsigned long long)tmpend);
1410
1411 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1412 if (ret)
1413 mlog_errno(ret);
1414
1415 if (tmpend < end) {
1416 /*
1417 * This may make start and end equal, but the zeroing
1418 * code will skip any work in that case so there's no
1419 * need to catch it up here.
1420 */
1421 start = end & ~(osb->s_clustersize - 1);
1422
1423 mlog(0, "2nd range: start: %llu, end: %llu\n",
1424 (unsigned long long)start, (unsigned long long)end);
1425
1426 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1427 if (ret)
1428 mlog_errno(ret);
1429 }
1430
1431 ocfs2_commit_trans(osb, handle);
1432out:
1433 return ret;
1434}
1435
1436static int ocfs2_remove_inode_range(struct inode *inode,
1437 struct buffer_head *di_bh, u64 byte_start,
1438 u64 byte_len)
1439{
1440 int ret = 0;
1441 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
1442 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1443 struct ocfs2_cached_dealloc_ctxt dealloc;
1444
1445 ocfs2_init_dealloc_ctxt(&dealloc);
1446
1447 if (byte_len == 0)
1448 return 0;
1449
1450 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1451 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
1452 if (trunc_len >= trunc_start)
1453 trunc_len -= trunc_start;
1454 else
1455 trunc_len = 0;
1456
1457 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
1458 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1459 (unsigned long long)byte_start,
1460 (unsigned long long)byte_len, trunc_start, trunc_len);
1461
1462 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1463 if (ret) {
1464 mlog_errno(ret);
1465 goto out;
1466 }
1467
1468 cpos = trunc_start;
1469 while (trunc_len) {
1470 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1471 &alloc_size, NULL);
1472 if (ret) {
1473 mlog_errno(ret);
1474 goto out;
1475 }
1476
1477 if (alloc_size > trunc_len)
1478 alloc_size = trunc_len;
1479
1480 /* Only do work for non-holes */
1481 if (phys_cpos != 0) {
1482 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
1483 phys_cpos, alloc_size,
1484 &dealloc);
1485 if (ret) {
1486 mlog_errno(ret);
1487 goto out;
1488 }
1489 }
1490
1491 cpos += alloc_size;
1492 trunc_len -= alloc_size;
1493 }
1494
1495 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1496
1497out:
1498 ocfs2_schedule_truncate_log_flush(osb, 1);
1499 ocfs2_run_deallocs(osb, &dealloc);
1500
1501 return ret;
1502}
1503
1504/*
1505 * Parts of this function taken from xfs_change_file_space()
1506 */
1507int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1508 struct ocfs2_space_resv *sr)
1509{
1510 int ret;
1511 s64 llen;
1512 struct inode *inode = file->f_path.dentry->d_inode;
1513 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1514 struct buffer_head *di_bh = NULL;
1515 handle_t *handle;
1516 unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits);
1517
1518 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1519 !ocfs2_writes_unwritten_extents(osb))
1520 return -ENOTTY;
1521 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1522 !ocfs2_sparse_alloc(osb))
1523 return -ENOTTY;
1524
1525 if (!S_ISREG(inode->i_mode))
1526 return -EINVAL;
1527
1528 if (!(file->f_mode & FMODE_WRITE))
1529 return -EBADF;
1530
1531 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1532 return -EROFS;
1533
1534 mutex_lock(&inode->i_mutex);
1535
1536 /*
1537 * This prevents concurrent writes on other nodes
1538 */
1539 ret = ocfs2_rw_lock(inode, 1);
1540 if (ret) {
1541 mlog_errno(ret);
1542 goto out;
1543 }
1544
1545 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1546 if (ret) {
1547 mlog_errno(ret);
1548 goto out_rw_unlock;
1549 }
1550
1551 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1552 ret = -EPERM;
1553 goto out_meta_unlock;
1554 }
1555
1556 switch (sr->l_whence) {
1557 case 0: /*SEEK_SET*/
1558 break;
1559 case 1: /*SEEK_CUR*/
1560 sr->l_start += file->f_pos;
1561 break;
1562 case 2: /*SEEK_END*/
1563 sr->l_start += i_size_read(inode);
1564 break;
1565 default:
1566 ret = -EINVAL;
1567 goto out_meta_unlock;
1568 }
1569 sr->l_whence = 0;
1570
1571 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1572
1573 if (sr->l_start < 0
1574 || sr->l_start > max_off
1575 || (sr->l_start + llen) < 0
1576 || (sr->l_start + llen) > max_off) {
1577 ret = -EINVAL;
1578 goto out_meta_unlock;
1579 }
1580
1581 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1582 if (sr->l_len <= 0) {
1583 ret = -EINVAL;
1584 goto out_meta_unlock;
1585 }
1586 }
1587
1588 if (should_remove_suid(file->f_path.dentry)) {
1589 ret = __ocfs2_write_remove_suid(inode, di_bh);
1590 if (ret) {
1591 mlog_errno(ret);
1592 goto out_meta_unlock;
1593 }
1594 }
1595
1596 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1597 switch (cmd) {
1598 case OCFS2_IOC_RESVSP:
1599 case OCFS2_IOC_RESVSP64:
1600 /*
1601 * This takes unsigned offsets, but the signed ones we
1602 * pass have been checked against overflow above.
1603 */
1604 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1605 sr->l_len);
1606 break;
1607 case OCFS2_IOC_UNRESVSP:
1608 case OCFS2_IOC_UNRESVSP64:
1609 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1610 sr->l_len);
1611 break;
1612 default:
1613 ret = -EINVAL;
1614 }
1615 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1616 if (ret) {
1617 mlog_errno(ret);
1618 goto out_meta_unlock;
1619 }
1620
1621 /*
1622 * We update c/mtime for these changes
1623 */
1624 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1625 if (IS_ERR(handle)) {
1626 ret = PTR_ERR(handle);
1627 mlog_errno(ret);
1628 goto out_meta_unlock;
1629 }
1630
1631 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1632 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1633 if (ret < 0)
1634 mlog_errno(ret);
1635
1636 ocfs2_commit_trans(osb, handle);
1637
1638out_meta_unlock:
1639 brelse(di_bh);
1640 ocfs2_meta_unlock(inode, 1);
1641out_rw_unlock:
1642 ocfs2_rw_unlock(inode, 1);
1643
1644 mutex_unlock(&inode->i_mutex);
1645out:
1646 return ret;
1647}
1648
1162static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1649static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1163 loff_t *ppos, 1650 loff_t *ppos,
1164 size_t count, 1651 size_t count,
@@ -1329,15 +1816,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1329 *basep = base; 1816 *basep = base;
1330} 1817}
1331 1818
1332static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, 1819static struct page * ocfs2_get_write_source(char **ret_src_buf,
1333 const struct iovec *cur_iov, 1820 const struct iovec *cur_iov,
1334 size_t iov_offset) 1821 size_t iov_offset)
1335{ 1822{
1336 int ret; 1823 int ret;
1337 char *buf; 1824 char *buf = cur_iov->iov_base + iov_offset;
1338 struct page *src_page = NULL; 1825 struct page *src_page = NULL;
1826 unsigned long off;
1339 1827
1340 buf = cur_iov->iov_base + iov_offset; 1828 off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
1341 1829
1342 if (!segment_eq(get_fs(), KERNEL_DS)) { 1830 if (!segment_eq(get_fs(), KERNEL_DS)) {
1343 /* 1831 /*
@@ -1349,18 +1837,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp
1349 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1837 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1350 0, 0, &src_page, NULL); 1838 0, 0, &src_page, NULL);
1351 if (ret == 1) 1839 if (ret == 1)
1352 bp->b_src_buf = kmap(src_page); 1840 *ret_src_buf = kmap(src_page) + off;
1353 else 1841 else
1354 src_page = ERR_PTR(-EFAULT); 1842 src_page = ERR_PTR(-EFAULT);
1355 } else { 1843 } else {
1356 bp->b_src_buf = buf; 1844 *ret_src_buf = buf;
1357 } 1845 }
1358 1846
1359 return src_page; 1847 return src_page;
1360} 1848}
1361 1849
1362static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, 1850static void ocfs2_put_write_source(struct page *page)
1363 struct page *page)
1364{ 1851{
1365 if (page) { 1852 if (page) {
1366 kunmap(page); 1853 kunmap(page);
@@ -1376,10 +1863,13 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1376{ 1863{
1377 int ret = 0; 1864 int ret = 0;
1378 ssize_t copied, total = 0; 1865 ssize_t copied, total = 0;
1379 size_t iov_offset = 0; 1866 size_t iov_offset = 0, bytes;
1867 loff_t pos;
1380 const struct iovec *cur_iov = iov; 1868 const struct iovec *cur_iov = iov;
1381 struct ocfs2_buffered_write_priv bp; 1869 struct page *user_page, *page;
1382 struct page *page; 1870 char * uninitialized_var(buf);
1871 char *dst;
1872 void *fsdata;
1383 1873
1384 /* 1874 /*
1385 * handle partial DIO write. Adjust cur_iov if needed. 1875 * handle partial DIO write. Adjust cur_iov if needed.
@@ -1387,21 +1877,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1877 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1388 1878
1389 do { 1879 do {
1390 bp.b_cur_off = iov_offset; 1880 pos = *ppos;
1391 bp.b_cur_iov = cur_iov;
1392 1881
1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); 1882 user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
1394 if (IS_ERR(page)) { 1883 if (IS_ERR(user_page)) {
1395 ret = PTR_ERR(page); 1884 ret = PTR_ERR(user_page);
1396 goto out; 1885 goto out;
1397 } 1886 }
1398 1887
1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count, 1888 /* Stay within our page boundaries */
1400 ocfs2_map_and_write_user_data, 1889 bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
1401 &bp); 1890 (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
1891 /* Stay within the vector boundary */
1892 bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
1893 /* Stay within count */
1894 bytes = min(bytes, count);
1895
1896 page = NULL;
1897 ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
1898 &page, &fsdata);
1899 if (ret) {
1900 mlog_errno(ret);
1901 goto out;
1902 }
1402 1903
1403 ocfs2_put_write_source(&bp, page); 1904 dst = kmap_atomic(page, KM_USER0);
1905 memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
1906 kunmap_atomic(dst, KM_USER0);
1907 flush_dcache_page(page);
1908 ocfs2_put_write_source(user_page);
1404 1909
1910 copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
1911 bytes, page, fsdata);
1405 if (copied < 0) { 1912 if (copied < 0) {
1406 mlog_errno(copied); 1913 mlog_errno(copied);
1407 ret = copied; 1914 ret = copied;
@@ -1409,7 +1916,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1409 } 1916 }
1410 1917
1411 total += copied; 1918 total += copied;
1412 *ppos = *ppos + copied; 1919 *ppos = pos + copied;
1413 count -= copied; 1920 count -= copied;
1414 1921
1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1922 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
@@ -1579,52 +2086,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1579 struct pipe_buffer *buf, 2086 struct pipe_buffer *buf,
1580 struct splice_desc *sd) 2087 struct splice_desc *sd)
1581{ 2088{
1582 int ret, count, total = 0; 2089 int ret, count;
1583 ssize_t copied = 0; 2090 ssize_t copied = 0;
1584 struct ocfs2_splice_write_priv sp; 2091 struct file *file = sd->u.file;
2092 unsigned int offset;
2093 struct page *page = NULL;
2094 void *fsdata;
2095 char *src, *dst;
1585 2096
1586 ret = buf->ops->pin(pipe, buf); 2097 ret = buf->ops->confirm(pipe, buf);
1587 if (ret) 2098 if (ret)
1588 goto out; 2099 goto out;
1589 2100
1590 sp.s_sd = sd; 2101 offset = sd->pos & ~PAGE_CACHE_MASK;
1591 sp.s_buf = buf;
1592 sp.s_pipe = pipe;
1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1594 sp.s_buf_offset = buf->offset;
1595
1596 count = sd->len; 2102 count = sd->len;
1597 if (count + sp.s_offset > PAGE_CACHE_SIZE) 2103 if (count + offset > PAGE_CACHE_SIZE)
1598 count = PAGE_CACHE_SIZE - sp.s_offset; 2104 count = PAGE_CACHE_SIZE - offset;
1599 2105
1600 do { 2106 ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
1601 /* 2107 &page, &fsdata);
1602 * splice wants us to copy up to one page at a 2108 if (ret) {
1603 * time. For pagesize > cluster size, this means we 2109 mlog_errno(ret);
1604 * might enter ocfs2_buffered_write_cluster() more 2110 goto out;
1605 * than once, so keep track of our progress here. 2111 }
1606 */
1607 copied = ocfs2_buffered_write_cluster(sd->file,
1608 (loff_t)sd->pos + total,
1609 count,
1610 ocfs2_map_and_write_splice_data,
1611 &sp);
1612 if (copied < 0) {
1613 mlog_errno(copied);
1614 ret = copied;
1615 goto out;
1616 }
1617 2112
1618 count -= copied; 2113 src = buf->ops->map(pipe, buf, 1);
1619 sp.s_offset += copied; 2114 dst = kmap_atomic(page, KM_USER1);
1620 sp.s_buf_offset += copied; 2115 memcpy(dst + offset, src + buf->offset, count);
1621 total += copied; 2116 kunmap_atomic(page, KM_USER1);
1622 } while (count); 2117 buf->ops->unmap(pipe, buf, src);
1623 2118
1624 ret = 0; 2119 copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
2120 page, fsdata);
2121 if (copied < 0) {
2122 mlog_errno(copied);
2123 ret = copied;
2124 goto out;
2125 }
1625out: 2126out:
1626 2127
1627 return total ? total : ret; 2128 return copied ? copied : ret;
1628} 2129}
1629 2130
1630static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2131static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
@@ -1636,9 +2137,14 @@ static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1636 int ret, err; 2137 int ret, err;
1637 struct address_space *mapping = out->f_mapping; 2138 struct address_space *mapping = out->f_mapping;
1638 struct inode *inode = mapping->host; 2139 struct inode *inode = mapping->host;
1639 2140 struct splice_desc sd = {
1640 ret = __splice_from_pipe(pipe, out, ppos, len, flags, 2141 .total_len = len,
1641 ocfs2_splice_write_actor); 2142 .flags = flags,
2143 .pos = *ppos,
2144 .u.file = out,
2145 };
2146
2147 ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor);
1642 if (ret > 0) { 2148 if (ret > 0) {
1643 *ppos += ret; 2149 *ppos += ret;
1644 2150
@@ -1817,7 +2323,6 @@ const struct inode_operations ocfs2_special_file_iops = {
1817const struct file_operations ocfs2_fops = { 2323const struct file_operations ocfs2_fops = {
1818 .read = do_sync_read, 2324 .read = do_sync_read,
1819 .write = do_sync_write, 2325 .write = do_sync_write,
1820 .sendfile = generic_file_sendfile,
1821 .mmap = ocfs2_mmap, 2326 .mmap = ocfs2_mmap,
1822 .fsync = ocfs2_sync_file, 2327 .fsync = ocfs2_sync_file,
1823 .release = ocfs2_file_release, 2328 .release = ocfs2_file_release,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a4dd1fa182..36fe27f268 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,15 +39,16 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start, 42 u32 *logical_offset,
43 u32 clusters_to_add, 43 u32 clusters_to_add,
44 int mark_unwritten,
44 struct buffer_head *fe_bh, 45 struct buffer_head *fe_bh,
45 handle_t *handle, 46 handle_t *handle,
46 struct ocfs2_alloc_context *data_ac, 47 struct ocfs2_alloc_context *data_ac,
47 struct ocfs2_alloc_context *meta_ac, 48 struct ocfs2_alloc_context *meta_ac,
48 enum ocfs2_alloc_restarted *reason); 49 enum ocfs2_alloc_restarted *reason_ret);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 50int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add, 51 u32 clusters_to_add, u32 extents_to_split,
51 struct ocfs2_alloc_context **data_ac, 52 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac); 53 struct ocfs2_alloc_context **meta_ac);
53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 54int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
@@ -61,4 +62,7 @@ int ocfs2_should_update_atime(struct inode *inode,
61int ocfs2_update_inode_atime(struct inode *inode, 62int ocfs2_update_inode_atime(struct inode *inode,
62 struct buffer_head *bh); 63 struct buffer_head *bh);
63 64
65int ocfs2_change_file_space(struct file *file, unsigned int cmd,
66 struct ocfs2_space_resv *sr);
67
64#endif /* OCFS2_FILE_H */ 68#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index b25ef63781..c4c3617124 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -157,16 +157,16 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
157 if (ocfs2_mount_local(osb)) 157 if (ocfs2_mount_local(osb))
158 return 0; 158 return 0;
159 159
160 status = o2hb_register_callback(&osb->osb_hb_down); 160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) { 161 if (status < 0) {
162 mlog_errno(status); 162 mlog_errno(status);
163 goto bail; 163 goto bail;
164 } 164 }
165 165
166 status = o2hb_register_callback(&osb->osb_hb_up); 166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) { 167 if (status < 0) {
168 mlog_errno(status); 168 mlog_errno(status);
169 o2hb_unregister_callback(&osb->osb_hb_down); 169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 } 170 }
171 171
172bail: 172bail:
@@ -178,8 +178,8 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
178 if (ocfs2_mount_local(osb)) 178 if (ocfs2_mount_local(osb))
179 return; 179 return;
180 180
181 o2hb_unregister_callback(&osb->osb_hb_down); 181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(&osb->osb_hb_up); 182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183} 183}
184 184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 185void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
@@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
209 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 209 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
210 envp[2] = NULL; 210 envp[2] = NULL;
211 211
212 ret = call_usermodehelper(argv[0], argv, envp, 1); 212 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
213 if (ret < 0) 213 if (ret < 0)
214 mlog_errno(ret); 214 mlog_errno(ret);
215} 215}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f3ad21ad9a..87dcece7e1 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -14,6 +14,7 @@
14#include "ocfs2.h" 14#include "ocfs2.h"
15#include "alloc.h" 15#include "alloc.h"
16#include "dlmglue.h" 16#include "dlmglue.h"
17#include "file.h"
17#include "inode.h" 18#include "inode.h"
18#include "journal.h" 19#include "journal.h"
19 20
@@ -62,7 +63,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
62 goto bail_unlock; 63 goto bail_unlock;
63 64
64 status = -EACCES; 65 status = -EACCES;
65 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 66 if (!is_owner_or_cap(inode))
66 goto bail_unlock; 67 goto bail_unlock;
67 68
68 if (!S_ISDIR(inode->i_mode)) 69 if (!S_ISDIR(inode->i_mode))
@@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115{ 116{
116 unsigned int flags; 117 unsigned int flags;
117 int status; 118 int status;
119 struct ocfs2_space_resv sr;
118 120
119 switch (cmd) { 121 switch (cmd) {
120 case OCFS2_IOC_GETFLAGS: 122 case OCFS2_IOC_GETFLAGS:
@@ -130,6 +132,14 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
130 132
131 return ocfs2_set_inode_attr(inode, flags, 133 return ocfs2_set_inode_attr(inode, flags,
132 OCFS2_FL_MODIFIABLE); 134 OCFS2_FL_MODIFIABLE);
135 case OCFS2_IOC_RESVSP:
136 case OCFS2_IOC_RESVSP64:
137 case OCFS2_IOC_UNRESVSP:
138 case OCFS2_IOC_UNRESVSP64:
139 if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
140 return -EFAULT;
141
142 return ocfs2_change_file_space(filp, cmd, &sr);
133 default: 143 default:
134 return -ENOTTY; 144 return -ENOTTY;
135 } 145 }
@@ -148,6 +158,11 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
148 case OCFS2_IOC32_SETFLAGS: 158 case OCFS2_IOC32_SETFLAGS:
149 cmd = OCFS2_IOC_SETFLAGS; 159 cmd = OCFS2_IOC_SETFLAGS;
150 break; 160 break;
161 case OCFS2_IOC_RESVSP:
162 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64:
165 break;
151 default: 166 default:
152 return -ENOIOCTLCMD; 167 return -ENOIOCTLCMD;
153 } 168 }
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dc11880817..dbfb20bb27 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
722 container_of(work, struct ocfs2_journal, j_recovery_work); 722 container_of(work, struct ocfs2_journal, j_recovery_work);
723 struct ocfs2_super *osb = journal->j_osb; 723 struct ocfs2_super *osb = journal->j_osb;
724 struct ocfs2_dinode *la_dinode, *tl_dinode; 724 struct ocfs2_dinode *la_dinode, *tl_dinode;
725 struct ocfs2_la_recovery_item *item; 725 struct ocfs2_la_recovery_item *item, *n;
726 struct list_head *p, *n;
727 LIST_HEAD(tmp_la_list); 726 LIST_HEAD(tmp_la_list);
728 727
729 mlog_entry_void(); 728 mlog_entry_void();
@@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
734 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 733 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
735 spin_unlock(&journal->j_lock); 734 spin_unlock(&journal->j_lock);
736 735
737 list_for_each_safe(p, n, &tmp_la_list) { 736 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
738 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
739 list_del_init(&item->lri_list); 737 list_del_init(&item->lri_list);
740 738
741 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 739 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3db5de4506..ce60aab013 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -289,6 +289,8 @@ int ocfs2_journal_dirty_data(handle_t *handle,
289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
290 + OCFS2_TRUNCATE_LOG_UPDATE) 290 + OCFS2_TRUNCATE_LOG_UPDATE)
291 291
292#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
293
292/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 294/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
293 * bitmap block for the new bit) */ 295 * bitmap block for the new bit) */
294#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 296#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af01158b39..d79aa12137 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -37,11 +37,29 @@
37 37
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "aops.h"
40#include "dlmglue.h" 41#include "dlmglue.h"
41#include "file.h" 42#include "file.h"
42#include "inode.h" 43#include "inode.h"
43#include "mmap.h" 44#include "mmap.h"
44 45
46static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
47{
48 /* The best way to deal with signals in the vm path is
49 * to block them upfront, rather than allowing the
50 * locking paths to return -ERESTARTSYS. */
51 sigfillset(blocked);
52
53 /* We should technically never get a bad return value
54 * from sigprocmask */
55 return sigprocmask(SIG_BLOCK, blocked, oldset);
56}
57
58static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
59{
60 return sigprocmask(SIG_SETMASK, oldset, NULL);
61}
62
45static struct page *ocfs2_nopage(struct vm_area_struct * area, 63static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address, 64 unsigned long address,
47 int *type) 65 int *type)
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
53 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, 71 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
54 type); 72 type);
55 73
56 /* The best way to deal with signals in this path is 74 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) { 75 if (ret < 0) {
65 mlog_errno(ret); 76 mlog_errno(ret);
66 goto out; 77 goto out;
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
68 79
69 page = filemap_nopage(area, address, type); 80 page = filemap_nopage(area, address, type);
70 81
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL); 82 ret = ocfs2_vm_op_unblock_sigs(&oldset);
72 if (ret < 0) 83 if (ret < 0)
73 mlog_errno(ret); 84 mlog_errno(ret);
74out: 85out:
@@ -76,28 +87,136 @@ out:
76 return page; 87 return page;
77} 88}
78 89
79static struct vm_operations_struct ocfs2_file_vm_ops = { 90static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
80 .nopage = ocfs2_nopage, 91 struct page *page)
81}; 92{
93 int ret;
94 struct address_space *mapping = inode->i_mapping;
95 loff_t pos = page->index << PAGE_CACHE_SHIFT;
96 unsigned int len = PAGE_CACHE_SIZE;
97 pgoff_t last_index;
98 struct page *locked_page = NULL;
99 void *fsdata;
100 loff_t size = i_size_read(inode);
82 101
83int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 102 /*
103 * Another node might have truncated while we were waiting on
104 * cluster locks.
105 */
106 last_index = size >> PAGE_CACHE_SHIFT;
107 if (page->index > last_index) {
108 ret = -EINVAL;
109 goto out;
110 }
111
112 /*
113 * The i_size check above doesn't catch the case where nodes
114 * truncated and then re-extended the file. We'll re-check the
115 * page mapping after taking the page lock inside of
116 * ocfs2_write_begin_nolock().
117 */
118 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
119 ret = -EINVAL;
120 goto out;
121 }
122
123 /*
124 * Call ocfs2_write_begin() and ocfs2_write_end() to take
125 * advantage of the allocation code there. We pass a write
126 * length of the whole page (chopped to i_size) to make sure
127 * the whole thing is allocated.
128 *
129 * Since we know the page is up to date, we don't have to
130 * worry about ocfs2_write_begin() skipping some buffer reads
131 * because the "write" would invalidate their data.
132 */
133 if (page->index == last_index)
134 len = size & ~PAGE_CACHE_MASK;
135
136 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
137 &fsdata, di_bh, page);
138 if (ret) {
139 if (ret != -ENOSPC)
140 mlog_errno(ret);
141 goto out;
142 }
143
144 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
145 fsdata);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out;
149 }
150 BUG_ON(ret != len);
151 ret = 0;
152out:
153 return ret;
154}
155
156static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
84{ 157{
85 int ret = 0, lock_level = 0; 158 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 159 struct buffer_head *di_bh = NULL;
160 sigset_t blocked, oldset;
161 int ret, ret2;
162
163 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
164 if (ret < 0) {
165 mlog_errno(ret);
166 return ret;
167 }
168
169 /*
170 * The cluster locks taken will block a truncate from another
171 * node. Taking the data lock will also ensure that we don't
172 * attempt page truncation as part of a downconvert.
173 */
174 ret = ocfs2_meta_lock(inode, &di_bh, 1);
175 if (ret < 0) {
176 mlog_errno(ret);
177 goto out;
178 }
87 179
88 /* 180 /*
89 * Only support shared writeable mmap for local mounts which 181 * The alloc sem should be enough to serialize with
90 * don't know about holes. 182 * ocfs2_truncate_file() changing i_size as well as any thread
183 * modifying the inode btree.
91 */ 184 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && 185 down_write(&OCFS2_I(inode)->ip_alloc_sem);
93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 186
94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 187 ret = ocfs2_data_lock(inode, 1);
95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 188 if (ret < 0) {
96 /* This is -EINVAL because generic_file_readonly_mmap 189 mlog_errno(ret);
97 * returns it in a similar situation. */ 190 goto out_meta_unlock;
98 return -EINVAL;
99 } 191 }
100 192
193 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
194
195 ocfs2_data_unlock(inode, 1);
196
197out_meta_unlock:
198 up_write(&OCFS2_I(inode)->ip_alloc_sem);
199
200 brelse(di_bh);
201 ocfs2_meta_unlock(inode, 1);
202
203out:
204 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
205 if (ret2 < 0)
206 mlog_errno(ret2);
207
208 return ret;
209}
210
211static struct vm_operations_struct ocfs2_file_vm_ops = {
212 .nopage = ocfs2_nopage,
213 .page_mkwrite = ocfs2_page_mkwrite,
214};
215
216int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
217{
218 int ret = 0, lock_level = 0;
219
101 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 220 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
102 file->f_vfsmnt, &lock_level); 221 file->f_vfsmnt, &lock_level);
103 if (ret < 0) { 222 if (ret < 0) {
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 36289e6295..d430fdab16 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir,
1674 u32 offset = 0; 1674 u32 offset = 0;
1675 1675
1676 inode->i_op = &ocfs2_symlink_inode_operations; 1676 inode->i_op = &ocfs2_symlink_inode_operations;
1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
1678 new_fe_bh, 1678 new_fe_bh,
1679 handle, data_ac, NULL, 1679 handle, data_ac, NULL,
1680 NULL); 1680 NULL);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a860633e83..5cc90a40b3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -219,6 +219,7 @@ struct ocfs2_super
219 u16 max_slots; 219 u16 max_slots;
220 s16 node_num; 220 s16 node_num;
221 s16 slot_num; 221 s16 slot_num;
222 s16 preferred_slot;
222 int s_sectsize_bits; 223 int s_sectsize_bits;
223 int s_clustersize; 224 int s_clustersize;
224 int s_clustersize_bits; 225 int s_clustersize_bits;
@@ -305,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
305 return 0; 306 return 0;
306} 307}
307 308
309static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
310{
311 /*
312 * Support for sparse files is a pre-requisite
313 */
314 if (!ocfs2_sparse_alloc(osb))
315 return 0;
316
317 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
318 return 1;
319 return 0;
320}
321
308/* set / clear functions because cluster events can make these happen 322/* set / clear functions because cluster events can make these happen
309 * in parallel so we want the transitions to be atomic. this also 323 * in parallel so we want the transitions to be atomic. this also
310 * means that any future flags osb_flags must be protected by spinlock 324 * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0d9eb0854..82f8a75b20 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,7 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
92 92
93/* 93/*
94 * Heartbeat-only devices are missing journals and other files. The 94 * Heartbeat-only devices are missing journals and other files. The
@@ -116,6 +116,11 @@
116 */ 116 */
117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
118 118
119/*
120 * Unwritten extents support.
121 */
122#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
123
119/* The byte offset of the first backup block will be 1G. 124/* The byte offset of the first backup block will be 1G.
120 * The following will be 4G, 16G, 64G, 256G and 1T. 125 * The following will be 4G, 16G, 64G, 256G and 1T.
121 */ 126 */
@@ -170,6 +175,32 @@
170#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 175#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
171 176
172/* 177/*
178 * Space reservation / allocation / free ioctls and argument structure
179 * are designed to be compatible with XFS.
180 *
181 * ALLOCSP* and FREESP* are not and will never be supported, but are
182 * included here for completeness.
183 */
184struct ocfs2_space_resv {
185 __s16 l_type;
186 __s16 l_whence;
187 __s64 l_start;
188 __s64 l_len; /* len == 0 means until end of file */
189 __s32 l_sysid;
190 __u32 l_pid;
191 __s32 l_pad[4]; /* reserve area */
192};
193
194#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
195#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
196#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
197#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
198#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
199#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
200#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
201#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
202
203/*
173 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 204 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
174 */ 205 */
175#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 206#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d8b79067dc..af4882b62c 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -121,17 +121,25 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
121 return ret; 121 return ret;
122} 122}
123 123
124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) 124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
125{ 125{
126 int i; 126 int i;
127 s16 ret = OCFS2_INVALID_SLOT; 127 s16 ret = OCFS2_INVALID_SLOT;
128 128
129 if (preferred >= 0 && preferred < si->si_num_slots) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
131 ret = preferred;
132 goto out;
133 }
134 }
135
129 for(i = 0; i < si->si_num_slots; i++) { 136 for(i = 0; i < si->si_num_slots; i++) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { 137 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
131 ret = (s16) i; 138 ret = (s16) i;
132 break; 139 break;
133 } 140 }
134 } 141 }
142out:
135 return ret; 143 return ret;
136} 144}
137 145
@@ -248,7 +256,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
248 if (slot == OCFS2_INVALID_SLOT) { 256 if (slot == OCFS2_INVALID_SLOT) {
249 /* if no slot yet, then just take 1st available 257 /* if no slot yet, then just take 1st available
250 * one. */ 258 * one. */
251 slot = __ocfs2_find_empty_slot(si); 259 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
252 if (slot == OCFS2_INVALID_SLOT) { 260 if (slot == OCFS2_INVALID_SLOT) {
253 spin_unlock(&si->si_lock); 261 spin_unlock(&si->si_lock);
254 mlog(ML_ERROR, "no free slots available!\n"); 262 mlog(ML_ERROR, "no free slots available!\n");
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index e3437626d1..d9c5c9fcb3 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -98,14 +98,6 @@ static int ocfs2_relink_block_group(handle_t *handle,
98 u16 chain); 98 u16 chain);
99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
100 u32 wanted); 100 u32 wanted);
101static int ocfs2_free_suballoc_bits(handle_t *handle,
102 struct inode *alloc_inode,
103 struct buffer_head *alloc_bh,
104 unsigned int start_bit,
105 u64 bg_blkno,
106 unsigned int count);
107static inline u64 ocfs2_which_suballoc_group(u64 block,
108 unsigned int bit);
109static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
110 u64 bg_blkno, 102 u64 bg_blkno,
111 u16 bg_bit_off); 103 u16 bg_bit_off);
@@ -496,13 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
496 488
497 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 489 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
498 (*ac)->ac_which = OCFS2_AC_USE_META; 490 (*ac)->ac_which = OCFS2_AC_USE_META;
499
500#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
501 slot = 0;
502#else
503 slot = osb->slot_num; 491 slot = osb->slot_num;
504#endif
505
506 (*ac)->ac_group_search = ocfs2_block_group_search; 492 (*ac)->ac_group_search = ocfs2_block_group_search;
507 493
508 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 494 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
@@ -1626,12 +1612,12 @@ bail:
1626/* 1612/*
1627 * expects the suballoc inode to already be locked. 1613 * expects the suballoc inode to already be locked.
1628 */ 1614 */
1629static int ocfs2_free_suballoc_bits(handle_t *handle, 1615int ocfs2_free_suballoc_bits(handle_t *handle,
1630 struct inode *alloc_inode, 1616 struct inode *alloc_inode,
1631 struct buffer_head *alloc_bh, 1617 struct buffer_head *alloc_bh,
1632 unsigned int start_bit, 1618 unsigned int start_bit,
1633 u64 bg_blkno, 1619 u64 bg_blkno,
1634 unsigned int count) 1620 unsigned int count)
1635{ 1621{
1636 int status = 0; 1622 int status = 0;
1637 u32 tmp_used; 1623 u32 tmp_used;
@@ -1703,13 +1689,6 @@ bail:
1703 return status; 1689 return status;
1704} 1690}
1705 1691
1706static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1707{
1708 u64 group = block - (u64) bit;
1709
1710 return group;
1711}
1712
1713int ocfs2_free_dinode(handle_t *handle, 1692int ocfs2_free_dinode(handle_t *handle,
1714 struct inode *inode_alloc_inode, 1693 struct inode *inode_alloc_inode,
1715 struct buffer_head *inode_alloc_bh, 1694 struct buffer_head *inode_alloc_bh,
@@ -1723,19 +1702,6 @@ int ocfs2_free_dinode(handle_t *handle,
1723 inode_alloc_bh, bit, bg_blkno, 1); 1702 inode_alloc_bh, bit, bg_blkno, 1);
1724} 1703}
1725 1704
1726int ocfs2_free_extent_block(handle_t *handle,
1727 struct inode *eb_alloc_inode,
1728 struct buffer_head *eb_alloc_bh,
1729 struct ocfs2_extent_block *eb)
1730{
1731 u64 blk = le64_to_cpu(eb->h_blkno);
1732 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1733 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1734
1735 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1736 bit, bg_blkno, 1);
1737}
1738
1739int ocfs2_free_clusters(handle_t *handle, 1705int ocfs2_free_clusters(handle_t *handle,
1740 struct inode *bitmap_inode, 1706 struct inode *bitmap_inode,
1741 struct buffer_head *bitmap_bh, 1707 struct buffer_head *bitmap_bh,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 1a3c94cb92..f212dc01a8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,20 +86,29 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
86 u32 *cluster_start, 86 u32 *cluster_start,
87 u32 *num_clusters); 87 u32 *num_clusters);
88 88
89int ocfs2_free_suballoc_bits(handle_t *handle,
90 struct inode *alloc_inode,
91 struct buffer_head *alloc_bh,
92 unsigned int start_bit,
93 u64 bg_blkno,
94 unsigned int count);
89int ocfs2_free_dinode(handle_t *handle, 95int ocfs2_free_dinode(handle_t *handle,
90 struct inode *inode_alloc_inode, 96 struct inode *inode_alloc_inode,
91 struct buffer_head *inode_alloc_bh, 97 struct buffer_head *inode_alloc_bh,
92 struct ocfs2_dinode *di); 98 struct ocfs2_dinode *di);
93int ocfs2_free_extent_block(handle_t *handle,
94 struct inode *eb_alloc_inode,
95 struct buffer_head *eb_alloc_bh,
96 struct ocfs2_extent_block *eb);
97int ocfs2_free_clusters(handle_t *handle, 99int ocfs2_free_clusters(handle_t *handle,
98 struct inode *bitmap_inode, 100 struct inode *bitmap_inode,
99 struct buffer_head *bitmap_bh, 101 struct buffer_head *bitmap_bh,
100 u64 start_blk, 102 u64 start_blk,
101 unsigned int num_clusters); 103 unsigned int num_clusters);
102 104
105static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
106{
107 u64 group = block - (u64) bit;
108
109 return group;
110}
111
103static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, 112static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
104 u64 bg_blkno) 113 u64 bg_blkno)
105{ 114{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 86b559c7dc..3a5a1ed09a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -82,7 +82,8 @@ MODULE_AUTHOR("Oracle");
82MODULE_LICENSE("GPL"); 82MODULE_LICENSE("GPL");
83 83
84static int ocfs2_parse_options(struct super_block *sb, char *options, 84static int ocfs2_parse_options(struct super_block *sb, char *options,
85 unsigned long *mount_opt, int is_remount); 85 unsigned long *mount_opt, s16 *slot,
86 int is_remount);
86static void ocfs2_put_super(struct super_block *sb); 87static void ocfs2_put_super(struct super_block *sb);
87static int ocfs2_mount_volume(struct super_block *sb); 88static int ocfs2_mount_volume(struct super_block *sb);
88static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 89static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -114,8 +115,6 @@ static void ocfs2_write_super(struct super_block *sb);
114static struct inode *ocfs2_alloc_inode(struct super_block *sb); 115static struct inode *ocfs2_alloc_inode(struct super_block *sb);
115static void ocfs2_destroy_inode(struct inode *inode); 116static void ocfs2_destroy_inode(struct inode *inode);
116 117
117static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
118
119static const struct super_operations ocfs2_sops = { 118static const struct super_operations ocfs2_sops = {
120 .statfs = ocfs2_statfs, 119 .statfs = ocfs2_statfs,
121 .alloc_inode = ocfs2_alloc_inode, 120 .alloc_inode = ocfs2_alloc_inode,
@@ -140,6 +139,7 @@ enum {
140 Opt_data_ordered, 139 Opt_data_ordered,
141 Opt_data_writeback, 140 Opt_data_writeback,
142 Opt_atime_quantum, 141 Opt_atime_quantum,
142 Opt_slot,
143 Opt_err, 143 Opt_err,
144}; 144};
145 145
@@ -154,6 +154,7 @@ static match_table_t tokens = {
154 {Opt_data_ordered, "data=ordered"}, 154 {Opt_data_ordered, "data=ordered"},
155 {Opt_data_writeback, "data=writeback"}, 155 {Opt_data_writeback, "data=writeback"},
156 {Opt_atime_quantum, "atime_quantum=%u"}, 156 {Opt_atime_quantum, "atime_quantum=%u"},
157 {Opt_slot, "preferred_slot=%u"},
157 {Opt_err, NULL} 158 {Opt_err, NULL}
158}; 159};
159 160
@@ -318,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode)
318/* From xfs_super.c:xfs_max_file_offset 319/* From xfs_super.c:xfs_max_file_offset
319 * Copyright (c) 2000-2004 Silicon Graphics, Inc. 320 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
320 */ 321 */
321static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) 322unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
322{ 323{
323 unsigned int pagefactor = 1; 324 unsigned int pagefactor = 1;
324 unsigned int bitshift = BITS_PER_LONG - 1; 325 unsigned int bitshift = BITS_PER_LONG - 1;
@@ -355,9 +356,10 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
355 int incompat_features; 356 int incompat_features;
356 int ret = 0; 357 int ret = 0;
357 unsigned long parsed_options; 358 unsigned long parsed_options;
359 s16 slot;
358 struct ocfs2_super *osb = OCFS2_SB(sb); 360 struct ocfs2_super *osb = OCFS2_SB(sb);
359 361
360 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 362 if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) {
361 ret = -EINVAL; 363 ret = -EINVAL;
362 goto out; 364 goto out;
363 } 365 }
@@ -534,6 +536,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
534 struct dentry *root; 536 struct dentry *root;
535 int status, sector_size; 537 int status, sector_size;
536 unsigned long parsed_opt; 538 unsigned long parsed_opt;
539 s16 slot;
537 struct inode *inode = NULL; 540 struct inode *inode = NULL;
538 struct ocfs2_super *osb = NULL; 541 struct ocfs2_super *osb = NULL;
539 struct buffer_head *bh = NULL; 542 struct buffer_head *bh = NULL;
@@ -541,7 +544,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
541 544
542 mlog_entry("%p, %p, %i", sb, data, silent); 545 mlog_entry("%p, %p, %i", sb, data, silent);
543 546
544 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { 547 if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) {
545 status = -EINVAL; 548 status = -EINVAL;
546 goto read_super_error; 549 goto read_super_error;
547 } 550 }
@@ -571,6 +574,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
571 brelse(bh); 574 brelse(bh);
572 bh = NULL; 575 bh = NULL;
573 osb->s_mount_opt = parsed_opt; 576 osb->s_mount_opt = parsed_opt;
577 osb->preferred_slot = slot;
574 578
575 sb->s_magic = OCFS2_SUPER_MAGIC; 579 sb->s_magic = OCFS2_SUPER_MAGIC;
576 580
@@ -713,6 +717,7 @@ static struct file_system_type ocfs2_fs_type = {
713static int ocfs2_parse_options(struct super_block *sb, 717static int ocfs2_parse_options(struct super_block *sb,
714 char *options, 718 char *options,
715 unsigned long *mount_opt, 719 unsigned long *mount_opt,
720 s16 *slot,
716 int is_remount) 721 int is_remount)
717{ 722{
718 int status; 723 int status;
@@ -722,6 +727,7 @@ static int ocfs2_parse_options(struct super_block *sb,
722 options ? options : "(none)"); 727 options ? options : "(none)");
723 728
724 *mount_opt = 0; 729 *mount_opt = 0;
730 *slot = OCFS2_INVALID_SLOT;
725 731
726 if (!options) { 732 if (!options) {
727 status = 1; 733 status = 1;
@@ -782,6 +788,15 @@ static int ocfs2_parse_options(struct super_block *sb,
782 else 788 else
783 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 789 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
784 break; 790 break;
791 case Opt_slot:
792 option = 0;
793 if (match_int(&args[0], &option)) {
794 status = 0;
795 goto bail;
796 }
797 if (option)
798 *slot = (s16)option;
799 break;
785 default: 800 default:
786 mlog(ML_ERROR, 801 mlog(ML_ERROR,
787 "Unrecognized mount option \"%s\" " 802 "Unrecognized mount option \"%s\" "
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2..3b9cb3d0b0 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,6 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
49
48#endif /* OCFS2_SUPER_H */ 50#endif /* OCFS2_SUPER_H */
diff --git a/fs/open.c b/fs/open.c
index 0d515d1619..a6b054edac 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -26,6 +26,7 @@
26#include <linux/syscalls.h> 26#include <linux/syscalls.h>
27#include <linux/rcupdate.h> 27#include <linux/rcupdate.h>
28#include <linux/audit.h> 28#include <linux/audit.h>
29#include <linux/falloc.h>
29 30
30int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) 31int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
31{ 32{
@@ -352,6 +353,64 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
352} 353}
353#endif 354#endif
354 355
356asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
357{
358 struct file *file;
359 struct inode *inode;
360 long ret = -EINVAL;
361
362 if (offset < 0 || len <= 0)
363 goto out;
364
365 /* Return error if mode is not supported */
366 ret = -EOPNOTSUPP;
367 if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
368 goto out;
369
370 ret = -EBADF;
371 file = fget(fd);
372 if (!file)
373 goto out;
374 if (!(file->f_mode & FMODE_WRITE))
375 goto out_fput;
376 /*
377 * Revalidate the write permissions, in case security policy has
378 * changed since the files were opened.
379 */
380 ret = security_file_permission(file, MAY_WRITE);
381 if (ret)
382 goto out_fput;
383
384 inode = file->f_path.dentry->d_inode;
385
386 ret = -ESPIPE;
387 if (S_ISFIFO(inode->i_mode))
388 goto out_fput;
389
390 ret = -ENODEV;
391 /*
392 * Let individual file system decide if it supports preallocation
393 * for directories or not.
394 */
395 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
396 goto out_fput;
397
398 ret = -EFBIG;
399 /* Check for wrap through zero too */
400 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
401 goto out_fput;
402
403 if (inode->i_op && inode->i_op->fallocate)
404 ret = inode->i_op->fallocate(inode, mode, offset, len);
405 else
406 ret = -ENOSYS;
407
408out_fput:
409 fput(file);
410out:
411 return ret;
412}
413
355/* 414/*
356 * access() needs to use the real uid/gid, not the effective uid/gid. 415 * access() needs to use the real uid/gid, not the effective uid/gid.
357 * We do this by temporarily clearing all FS-related capabilities and 416 * We do this by temporarily clearing all FS-related capabilities and
@@ -855,7 +914,7 @@ EXPORT_SYMBOL(dentry_open);
855/* 914/*
856 * Find an empty file descriptor entry, and mark it busy. 915 * Find an empty file descriptor entry, and mark it busy.
857 */ 916 */
858int get_unused_fd(void) 917int get_unused_fd_flags(int flags)
859{ 918{
860 struct files_struct * files = current->files; 919 struct files_struct * files = current->files;
861 int fd, error; 920 int fd, error;
@@ -891,7 +950,10 @@ repeat:
891 } 950 }
892 951
893 FD_SET(fd, fdt->open_fds); 952 FD_SET(fd, fdt->open_fds);
894 FD_CLR(fd, fdt->close_on_exec); 953 if (flags & O_CLOEXEC)
954 FD_SET(fd, fdt->close_on_exec);
955 else
956 FD_CLR(fd, fdt->close_on_exec);
895 files->next_fd = fd + 1; 957 files->next_fd = fd + 1;
896#if 1 958#if 1
897 /* Sanity check */ 959 /* Sanity check */
@@ -907,6 +969,11 @@ out:
907 return error; 969 return error;
908} 970}
909 971
972int get_unused_fd(void)
973{
974 return get_unused_fd_flags(0);
975}
976
910EXPORT_SYMBOL(get_unused_fd); 977EXPORT_SYMBOL(get_unused_fd);
911 978
912static void __put_unused_fd(struct files_struct *files, unsigned int fd) 979static void __put_unused_fd(struct files_struct *files, unsigned int fd)
@@ -959,7 +1026,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
959 int fd = PTR_ERR(tmp); 1026 int fd = PTR_ERR(tmp);
960 1027
961 if (!IS_ERR(tmp)) { 1028 if (!IS_ERR(tmp)) {
962 fd = get_unused_fd(); 1029 fd = get_unused_fd_flags(flags);
963 if (fd >= 0) { 1030 if (fd >= 0) {
964 struct file *f = do_filp_open(dfd, tmp, flags, mode); 1031 struct file *f = do_filp_open(dfd, tmp, flags, mode);
965 if (IS_ERR(f)) { 1032 if (IS_ERR(f)) {
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index e349132859..3d3e166314 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -25,6 +25,8 @@
25#define PARTITION_RISCIX_SCSI 2 25#define PARTITION_RISCIX_SCSI 2
26#define PARTITION_LINUX 9 26#define PARTITION_LINUX 9
27 27
28#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
29 defined(CONFIG_ACORN_PARTITION_ADFS)
28static struct adfs_discrecord * 30static struct adfs_discrecord *
29adfs_partition(struct parsed_partitions *state, char *name, char *data, 31adfs_partition(struct parsed_partitions *state, char *name, char *data,
30 unsigned long first_sector, int slot) 32 unsigned long first_sector, int slot)
@@ -48,6 +50,7 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
48 put_partition(state, slot, first_sector, nr_sects); 50 put_partition(state, slot, first_sector, nr_sects);
49 return dr; 51 return dr;
50} 52}
53#endif
51 54
52#ifdef CONFIG_ACORN_PARTITION_RISCIX 55#ifdef CONFIG_ACORN_PARTITION_RISCIX
53 56
@@ -65,6 +68,8 @@ struct riscix_record {
65 struct riscix_part part[8]; 68 struct riscix_part part[8];
66}; 69};
67 70
71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
72 defined(CONFIG_ACORN_PARTITION_ADFS)
68static int 73static int
69riscix_partition(struct parsed_partitions *state, struct block_device *bdev, 74riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
70 unsigned long first_sect, int slot, unsigned long nr_sects) 75 unsigned long first_sect, int slot, unsigned long nr_sects)
@@ -105,6 +110,7 @@ riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
105 return slot; 110 return slot;
106} 111}
107#endif 112#endif
113#endif
108 114
109#define LINUX_NATIVE_MAGIC 0xdeafa1de 115#define LINUX_NATIVE_MAGIC 0xdeafa1de
110#define LINUX_SWAP_MAGIC 0xdeafab1e 116#define LINUX_SWAP_MAGIC 0xdeafab1e
@@ -115,6 +121,8 @@ struct linux_part {
115 __le32 nr_sects; 121 __le32 nr_sects;
116}; 122};
117 123
124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
125 defined(CONFIG_ACORN_PARTITION_ADFS)
118static int 126static int
119linux_partition(struct parsed_partitions *state, struct block_device *bdev, 127linux_partition(struct parsed_partitions *state, struct block_device *bdev,
120 unsigned long first_sect, int slot, unsigned long nr_sects) 128 unsigned long first_sect, int slot, unsigned long nr_sects)
@@ -146,6 +154,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
146 put_dev_sector(sect); 154 put_dev_sector(sect);
147 return slot; 155 return slot;
148} 156}
157#endif
149 158
150#ifdef CONFIG_ACORN_PARTITION_CUMANA 159#ifdef CONFIG_ACORN_PARTITION_CUMANA
151int 160int
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9a3a058f35..98e0b85a9b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -397,7 +397,6 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
397 static struct attribute addpartattr = { 397 static struct attribute addpartattr = {
398 .name = "whole_disk", 398 .name = "whole_disk",
399 .mode = S_IRUSR | S_IRGRP | S_IROTH, 399 .mode = S_IRUSR | S_IRGRP | S_IROTH,
400 .owner = THIS_MODULE,
401 }; 400 };
402 401
403 sysfs_create_file(&p->kobj, &addpartattr); 402 sysfs_create_file(&p->kobj, &addpartattr);
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 9f7ad4244f..1e064c4a4f 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -45,7 +45,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
45{ 45{
46 int blocksize, offset, size,res; 46 int blocksize, offset, size,res;
47 loff_t i_size; 47 loff_t i_size;
48 dasd_information_t *info; 48 dasd_information2_t *info;
49 struct hd_geometry *geo; 49 struct hd_geometry *geo;
50 char type[5] = {0,}; 50 char type[5] = {0,};
51 char name[7] = {0,}; 51 char name[7] = {0,};
@@ -64,14 +64,17 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
64 if (i_size == 0) 64 if (i_size == 0)
65 goto out_exit; 65 goto out_exit;
66 66
67 if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL) 67 info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
68 if (info == NULL)
68 goto out_exit; 69 goto out_exit;
69 if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL) 70 geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL);
71 if (geo == NULL)
70 goto out_nogeo; 72 goto out_nogeo;
71 if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL) 73 label = kmalloc(sizeof(union label_t), GFP_KERNEL);
74 if (label == NULL)
72 goto out_nolab; 75 goto out_nolab;
73 76
74 if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 || 77 if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0 ||
75 ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) 78 ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
76 goto out_freeall; 79 goto out_freeall;
77 80
@@ -96,84 +99,108 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
96 res = 1; 99 res = 1;
97 100
98 /* 101 /*
99 * Three different types: CMS1, VOL1 and LNX1/unlabeled 102 * Three different formats: LDL, CDL and unformated disk
103 *
104 * identified by info->format
105 *
106 * unformated disks we do not have to care about
100 */ 107 */
101 if (strncmp(type, "CMS1", 4) == 0) { 108 if (info->format == DASD_FORMAT_LDL) {
102 /* 109 if (strncmp(type, "CMS1", 4) == 0) {
103 * VM style CMS1 labeled disk 110 /*
104 */ 111 * VM style CMS1 labeled disk
105 if (label->cms.disk_offset != 0) { 112 */
106 printk("CMS1/%8s(MDSK):", name); 113 if (label->cms.disk_offset != 0) {
107 /* disk is reserved minidisk */ 114 printk("CMS1/%8s(MDSK):", name);
108 blocksize = label->cms.block_size; 115 /* disk is reserved minidisk */
109 offset = label->cms.disk_offset; 116 blocksize = label->cms.block_size;
110 size = (label->cms.block_count - 1) * (blocksize >> 9); 117 offset = label->cms.disk_offset;
118 size = (label->cms.block_count - 1)
119 * (blocksize >> 9);
120 } else {
121 printk("CMS1/%8s:", name);
122 offset = (info->label_block + 1);
123 size = i_size >> 9;
124 }
111 } else { 125 } else {
112 printk("CMS1/%8s:", name); 126 /*
127 * Old style LNX1 or unlabeled disk
128 */
129 if (strncmp(type, "LNX1", 4) == 0)
130 printk ("LNX1/%8s:", name);
131 else
132 printk("(nonl)");
113 offset = (info->label_block + 1); 133 offset = (info->label_block + 1);
114 size = i_size >> 9; 134 size = i_size >> 9;
115 } 135 }
116 put_partition(state, 1, offset*(blocksize >> 9), 136 put_partition(state, 1, offset*(blocksize >> 9),
117 size-offset*(blocksize >> 9)); 137 size-offset*(blocksize >> 9));
118 } else if ((strncmp(type, "VOL1", 4) == 0) && 138 } else if (info->format == DASD_FORMAT_CDL) {
119 (!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) {
120 /* 139 /*
121 * New style VOL1 labeled disk 140 * New style CDL formatted disk
122 */ 141 */
123 unsigned int blk; 142 unsigned int blk;
124 int counter; 143 int counter;
125 144
126 printk("VOL1/%8s:", name);
127
128 /* get block number and read then go through format1 labels */
129 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
130 counter = 0;
131 while ((data = read_dev_sector(bdev, blk*(blocksize/512),
132 &sect)) != NULL) {
133 struct vtoc_format1_label f1;
134
135 memcpy(&f1, data, sizeof(struct vtoc_format1_label));
136 put_dev_sector(sect);
137
138 /* skip FMT4 / FMT5 / FMT7 labels */
139 if (f1.DS1FMTID == _ascebc['4']
140 || f1.DS1FMTID == _ascebc['5']
141 || f1.DS1FMTID == _ascebc['7']) {
142 blk++;
143 continue;
144 }
145
146 /* only FMT1 valid at this point */
147 if (f1.DS1FMTID != _ascebc['1'])
148 break;
149
150 /* OK, we got valid partition data */
151 offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
152 size = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
153 offset + geo->sectors;
154 if (counter >= state->limit)
155 break;
156 put_partition(state, counter + 1,
157 offset * (blocksize >> 9),
158 size * (blocksize >> 9));
159 counter++;
160 blk++;
161 }
162 if (!data)
163 /* Are we not supposed to report this ? */
164 goto out_readerr;
165 } else {
166 /* 145 /*
167 * Old style LNX1 or unlabeled disk 146 * check if VOL1 label is available
147 * if not, something is wrong, skipping partition detection
168 */ 148 */
169 if (strncmp(type, "LNX1", 4) == 0) 149 if (strncmp(type, "VOL1", 4) == 0) {
170 printk ("LNX1/%8s:", name); 150 printk("VOL1/%8s:", name);
171 else 151 /*
172 printk("(nonl)/%8s:", name); 152 * get block number and read then go through format1
173 offset = (info->label_block + 1); 153 * labels
174 size = i_size >> 9; 154 */
175 put_partition(state, 1, offset*(blocksize >> 9), 155 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
176 size-offset*(blocksize >> 9)); 156 counter = 0;
157 data = read_dev_sector(bdev, blk * (blocksize/512),
158 &sect);
159 while (data != NULL) {
160 struct vtoc_format1_label f1;
161
162 memcpy(&f1, data,
163 sizeof(struct vtoc_format1_label));
164 put_dev_sector(sect);
165
166 /* skip FMT4 / FMT5 / FMT7 labels */
167 if (f1.DS1FMTID == _ascebc['4']
168 || f1.DS1FMTID == _ascebc['5']
169 || f1.DS1FMTID == _ascebc['7']) {
170 blk++;
171 data = read_dev_sector(bdev, blk *
172 (blocksize/512),
173 &sect);
174 continue;
175 }
176
177 /* only FMT1 valid at this point */
178 if (f1.DS1FMTID != _ascebc['1'])
179 break;
180
181 /* OK, we got valid partition data */
182 offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
183 size = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
184 offset + geo->sectors;
185 if (counter >= state->limit)
186 break;
187 put_partition(state, counter + 1,
188 offset * (blocksize >> 9),
189 size * (blocksize >> 9));
190 counter++;
191 blk++;
192 data = read_dev_sector(bdev,
193 blk * (blocksize/512),
194 &sect);
195 }
196
197 if (!data)
198 /* Are we not supposed to report this ? */
199 goto out_readerr;
200 } else
201 printk(KERN_WARNING "Warning, expected Label VOL1 not "
202 "found, treating as CDL formated Disk");
203
177 } 204 }
178 205
179 printk("\n"); 206 printk("\n");
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 99873a2b4c..e7dd1d4e34 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -677,15 +677,24 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
677 * Return: -1 Error, the calculated offset exceeded the size of the buffer 677 * Return: -1 Error, the calculated offset exceeded the size of the buffer
678 * n OK, a range-checked offset into buffer 678 * n OK, a range-checked offset into buffer
679 */ 679 */
680static int ldm_relative (const u8 *buffer, int buflen, int base, int offset) 680static int ldm_relative(const u8 *buffer, int buflen, int base, int offset)
681{ 681{
682 682
683 base += offset; 683 base += offset;
684 if ((!buffer) || (offset < 0) || (base > buflen)) 684 if (!buffer || offset < 0 || base > buflen) {
685 if (!buffer)
686 ldm_error("!buffer");
687 if (offset < 0)
688 ldm_error("offset (%d) < 0", offset);
689 if (base > buflen)
690 ldm_error("base (%d) > buflen (%d)", base, buflen);
685 return -1; 691 return -1;
686 if ((base + buffer[base]) >= buflen) 692 }
693 if (base + buffer[base] >= buflen) {
694 ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base,
695 buffer[base], buflen);
687 return -1; 696 return -1;
688 697 }
689 return buffer[base] + offset + 1; 698 return buffer[base] + offset + 1;
690} 699}
691 700
@@ -1054,60 +1063,98 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
1054 * Return: 'true' @vb contains a Volume VBLK 1063 * Return: 'true' @vb contains a Volume VBLK
1055 * 'false' @vb contents are not defined 1064 * 'false' @vb contents are not defined
1056 */ 1065 */
1057static bool ldm_parse_vol5 (const u8 *buffer, int buflen, struct vblk *vb) 1066static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb)
1058{ 1067{
1059 int r_objid, r_name, r_vtype, r_child, r_size, r_id1, r_id2, r_size2; 1068 int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size;
1060 int r_drive, len; 1069 int r_id1, r_id2, r_size2, r_drive, len;
1061 struct vblk_volu *volu; 1070 struct vblk_volu *volu;
1062 1071
1063 BUG_ON (!buffer || !vb); 1072 BUG_ON(!buffer || !vb);
1064 1073 r_objid = ldm_relative(buffer, buflen, 0x18, 0);
1065 r_objid = ldm_relative (buffer, buflen, 0x18, 0); 1074 if (r_objid < 0) {
1066 r_name = ldm_relative (buffer, buflen, 0x18, r_objid); 1075 ldm_error("r_objid %d < 0", r_objid);
1067 r_vtype = ldm_relative (buffer, buflen, 0x18, r_name); 1076 return false;
1068 r_child = ldm_relative (buffer, buflen, 0x2E, r_vtype); 1077 }
1069 r_size = ldm_relative (buffer, buflen, 0x3E, r_child); 1078 r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
1070 1079 if (r_name < 0) {
1071 if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) 1080 ldm_error("r_name %d < 0", r_name);
1072 r_id1 = ldm_relative (buffer, buflen, 0x53, r_size); 1081 return false;
1073 else 1082 }
1083 r_vtype = ldm_relative(buffer, buflen, 0x18, r_name);
1084 if (r_vtype < 0) {
1085 ldm_error("r_vtype %d < 0", r_vtype);
1086 return false;
1087 }
1088 r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype);
1089 if (r_disable_drive_letter < 0) {
1090 ldm_error("r_disable_drive_letter %d < 0",
1091 r_disable_drive_letter);
1092 return false;
1093 }
1094 r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter);
1095 if (r_child < 0) {
1096 ldm_error("r_child %d < 0", r_child);
1097 return false;
1098 }
1099 r_size = ldm_relative(buffer, buflen, 0x3D, r_child);
1100 if (r_size < 0) {
1101 ldm_error("r_size %d < 0", r_size);
1102 return false;
1103 }
1104 if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) {
1105 r_id1 = ldm_relative(buffer, buflen, 0x52, r_size);
1106 if (r_id1 < 0) {
1107 ldm_error("r_id1 %d < 0", r_id1);
1108 return false;
1109 }
1110 } else
1074 r_id1 = r_size; 1111 r_id1 = r_size;
1075 1112 if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) {
1076 if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) 1113 r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1);
1077 r_id2 = ldm_relative (buffer, buflen, 0x53, r_id1); 1114 if (r_id2 < 0) {
1078 else 1115 ldm_error("r_id2 %d < 0", r_id2);
1116 return false;
1117 }
1118 } else
1079 r_id2 = r_id1; 1119 r_id2 = r_id1;
1080 1120 if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) {
1081 if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) 1121 r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2);
1082 r_size2 = ldm_relative (buffer, buflen, 0x53, r_id2); 1122 if (r_size2 < 0) {
1083 else 1123 ldm_error("r_size2 %d < 0", r_size2);
1124 return false;
1125 }
1126 } else
1084 r_size2 = r_id2; 1127 r_size2 = r_id2;
1085 1128 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1086 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) 1129 r_drive = ldm_relative(buffer, buflen, 0x52, r_size2);
1087 r_drive = ldm_relative (buffer, buflen, 0x53, r_size2); 1130 if (r_drive < 0) {
1088 else 1131 ldm_error("r_drive %d < 0", r_drive);
1132 return false;
1133 }
1134 } else
1089 r_drive = r_size2; 1135 r_drive = r_size2;
1090
1091 len = r_drive; 1136 len = r_drive;
1092 if (len < 0) 1137 if (len < 0) {
1138 ldm_error("len %d < 0", len);
1093 return false; 1139 return false;
1094 1140 }
1095 len += VBLK_SIZE_VOL5; 1141 len += VBLK_SIZE_VOL5;
1096 if (len != BE32 (buffer + 0x14)) 1142 if (len > BE32(buffer + 0x14)) {
1143 ldm_error("len %d > BE32(buffer + 0x14) %d", len,
1144 BE32(buffer + 0x14));
1097 return false; 1145 return false;
1098 1146 }
1099 volu = &vb->vblk.volu; 1147 volu = &vb->vblk.volu;
1100 1148 ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type,
1101 ldm_get_vstr (buffer + 0x18 + r_name, volu->volume_type, 1149 sizeof(volu->volume_type));
1102 sizeof (volu->volume_type)); 1150 memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter,
1103 memcpy (volu->volume_state, buffer + 0x19 + r_vtype, 1151 sizeof(volu->volume_state));
1104 sizeof (volu->volume_state)); 1152 volu->size = ldm_get_vnum(buffer + 0x3D + r_child);
1105 volu->size = ldm_get_vnum (buffer + 0x3E + r_child); 1153 volu->partition_type = buffer[0x41 + r_size];
1106 volu->partition_type = buffer[0x42 + r_size]; 1154 memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid));
1107 memcpy (volu->guid, buffer + 0x43 + r_size, sizeof (volu->guid));
1108 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { 1155 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1109 ldm_get_vstr (buffer + 0x53 + r_size, volu->drive_hint, 1156 ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint,
1110 sizeof (volu->drive_hint)); 1157 sizeof(volu->drive_hint));
1111 } 1158 }
1112 return true; 1159 return true;
1113} 1160}
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d2e6a30469..80f63b5fdd 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -68,7 +68,7 @@ struct parsed_partitions;
68#define VBLK_SIZE_DSK3 12 68#define VBLK_SIZE_DSK3 12
69#define VBLK_SIZE_DSK4 45 69#define VBLK_SIZE_DSK4 45
70#define VBLK_SIZE_PRT3 28 70#define VBLK_SIZE_PRT3 28
71#define VBLK_SIZE_VOL5 59 71#define VBLK_SIZE_VOL5 58
72 72
73/* component types */ 73/* component types */
74#define COMP_STRIPE 0x01 /* Stripe-set */ 74#define COMP_STRIPE 0x01 /* Stripe-set */
diff --git a/fs/pipe.c b/fs/pipe.c
index 3a89592bdf..d007830d9c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -164,6 +164,20 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
164 page_cache_release(page); 164 page_cache_release(page);
165} 165}
166 166
167/**
168 * generic_pipe_buf_map - virtually map a pipe buffer
169 * @pipe: the pipe that the buffer belongs to
170 * @buf: the buffer that should be mapped
171 * @atomic: whether to use an atomic map
172 *
173 * Description:
174 * This function returns a kernel virtual address mapping for the
175 * passed in @pipe_buffer. If @atomic is set, an atomic map is provided
176 * and the caller has to be careful not to fault before calling
177 * the unmap function.
178 *
179 * Note that this function occupies KM_USER0 if @atomic != 0.
180 */
167void *generic_pipe_buf_map(struct pipe_inode_info *pipe, 181void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
168 struct pipe_buffer *buf, int atomic) 182 struct pipe_buffer *buf, int atomic)
169{ 183{
@@ -175,6 +189,15 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
175 return kmap(buf->page); 189 return kmap(buf->page);
176} 190}
177 191
192/**
193 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
194 * @pipe: the pipe that the buffer belongs to
195 * @buf: the buffer that should be unmapped
196 * @map_data: the data that the mapping function returned
197 *
198 * Description:
199 * This function undoes the mapping that ->map() provided.
200 */
178void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, 201void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
179 struct pipe_buffer *buf, void *map_data) 202 struct pipe_buffer *buf, void *map_data)
180{ 203{
@@ -185,11 +208,28 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
185 kunmap(buf->page); 208 kunmap(buf->page);
186} 209}
187 210
211/**
212 * generic_pipe_buf_steal - attempt to take ownership of a @pipe_buffer
213 * @pipe: the pipe that the buffer belongs to
214 * @buf: the buffer to attempt to steal
215 *
216 * Description:
217 * This function attempts to steal the @struct page attached to
218 * @buf. If successful, this function returns 0 and returns with
219 * the page locked. The caller may then reuse the page for whatever
220 * he wishes, the typical use is insertion into a different file
221 * page cache.
222 */
188int generic_pipe_buf_steal(struct pipe_inode_info *pipe, 223int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
189 struct pipe_buffer *buf) 224 struct pipe_buffer *buf)
190{ 225{
191 struct page *page = buf->page; 226 struct page *page = buf->page;
192 227
228 /*
229 * A reference of one is golden, that means that the owner of this
230 * page is the only one holding a reference to it. lock the page
231 * and return OK.
232 */
193 if (page_count(page) == 1) { 233 if (page_count(page) == 1) {
194 lock_page(page); 234 lock_page(page);
195 return 0; 235 return 0;
@@ -198,12 +238,32 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
198 return 1; 238 return 1;
199} 239}
200 240
201void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) 241/**
242 * generic_pipe_buf_get - get a reference to a @struct pipe_buffer
243 * @pipe: the pipe that the buffer belongs to
244 * @buf: the buffer to get a reference to
245 *
246 * Description:
247 * This function grabs an extra reference to @buf. It's used in
248 * in the tee() system call, when we duplicate the buffers in one
249 * pipe into another.
250 */
251void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
202{ 252{
203 page_cache_get(buf->page); 253 page_cache_get(buf->page);
204} 254}
205 255
206int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) 256/**
257 * generic_pipe_buf_confirm - verify contents of the pipe buffer
258 * @pipe: the pipe that the buffer belongs to
259 * @buf: the buffer to confirm
260 *
261 * Description:
262 * This function does nothing, because the generic pipe code uses
263 * pages that are always good when inserted into the pipe.
264 */
265int generic_pipe_buf_confirm(struct pipe_inode_info *info,
266 struct pipe_buffer *buf)
207{ 267{
208 return 0; 268 return 0;
209} 269}
@@ -212,7 +272,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
212 .can_merge = 1, 272 .can_merge = 1,
213 .map = generic_pipe_buf_map, 273 .map = generic_pipe_buf_map,
214 .unmap = generic_pipe_buf_unmap, 274 .unmap = generic_pipe_buf_unmap,
215 .pin = generic_pipe_buf_pin, 275 .confirm = generic_pipe_buf_confirm,
216 .release = anon_pipe_buf_release, 276 .release = anon_pipe_buf_release,
217 .steal = generic_pipe_buf_steal, 277 .steal = generic_pipe_buf_steal,
218 .get = generic_pipe_buf_get, 278 .get = generic_pipe_buf_get,
@@ -252,7 +312,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
252 if (chars > total_len) 312 if (chars > total_len)
253 chars = total_len; 313 chars = total_len;
254 314
255 error = ops->pin(pipe, buf); 315 error = ops->confirm(pipe, buf);
256 if (error) { 316 if (error) {
257 if (!ret) 317 if (!ret)
258 error = ret; 318 error = ret;
@@ -373,7 +433,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
373 int error, atomic = 1; 433 int error, atomic = 1;
374 void *addr; 434 void *addr;
375 435
376 error = ops->pin(pipe, buf); 436 error = ops->confirm(pipe, buf);
377 if (error) 437 if (error)
378 goto out; 438 goto out;
379 439
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 74f30e0c03..965625a097 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -62,6 +62,8 @@
62#include <linux/mman.h> 62#include <linux/mman.h>
63#include <linux/proc_fs.h> 63#include <linux/proc_fs.h>
64#include <linux/ioport.h> 64#include <linux/ioport.h>
65#include <linux/uaccess.h>
66#include <linux/io.h>
65#include <linux/mm.h> 67#include <linux/mm.h>
66#include <linux/hugetlb.h> 68#include <linux/hugetlb.h>
67#include <linux/pagemap.h> 69#include <linux/pagemap.h>
@@ -76,9 +78,7 @@
76#include <linux/rcupdate.h> 78#include <linux/rcupdate.h>
77#include <linux/delayacct.h> 79#include <linux/delayacct.h>
78 80
79#include <asm/uaccess.h>
80#include <asm/pgtable.h> 81#include <asm/pgtable.h>
81#include <asm/io.h>
82#include <asm/processor.h> 82#include <asm/processor.h>
83#include "internal.h" 83#include "internal.h"
84 84
@@ -87,10 +87,10 @@
87do { memcpy(buffer, string, strlen(string)); \ 87do { memcpy(buffer, string, strlen(string)); \
88 buffer += strlen(string); } while (0) 88 buffer += strlen(string); } while (0)
89 89
90static inline char * task_name(struct task_struct *p, char * buf) 90static inline char *task_name(struct task_struct *p, char *buf)
91{ 91{
92 int i; 92 int i;
93 char * name; 93 char *name;
94 char tcomm[sizeof(p->comm)]; 94 char tcomm[sizeof(p->comm)];
95 95
96 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
@@ -138,7 +138,7 @@ static const char *task_state_array[] = {
138 "X (dead)" /* 32 */ 138 "X (dead)" /* 32 */
139}; 139};
140 140
141static inline const char * get_task_state(struct task_struct *tsk) 141static inline const char *get_task_state(struct task_struct *tsk)
142{ 142{
143 unsigned int state = (tsk->state & (TASK_RUNNING | 143 unsigned int state = (tsk->state & (TASK_RUNNING |
144 TASK_INTERRUPTIBLE | 144 TASK_INTERRUPTIBLE |
@@ -156,7 +156,7 @@ static inline const char * get_task_state(struct task_struct *tsk)
156 return *p; 156 return *p;
157} 157}
158 158
159static inline char * task_state(struct task_struct *p, char *buffer) 159static inline char *task_state(struct task_struct *p, char *buffer)
160{ 160{
161 struct group_info *group_info; 161 struct group_info *group_info;
162 int g; 162 int g;
@@ -165,7 +165,6 @@ static inline char * task_state(struct task_struct *p, char *buffer)
165 rcu_read_lock(); 165 rcu_read_lock();
166 buffer += sprintf(buffer, 166 buffer += sprintf(buffer,
167 "State:\t%s\n" 167 "State:\t%s\n"
168 "SleepAVG:\t%lu%%\n"
169 "Tgid:\t%d\n" 168 "Tgid:\t%d\n"
170 "Pid:\t%d\n" 169 "Pid:\t%d\n"
171 "PPid:\t%d\n" 170 "PPid:\t%d\n"
@@ -173,9 +172,8 @@ static inline char * task_state(struct task_struct *p, char *buffer)
173 "Uid:\t%d\t%d\t%d\t%d\n" 172 "Uid:\t%d\t%d\t%d\t%d\n"
174 "Gid:\t%d\t%d\t%d\t%d\n", 173 "Gid:\t%d\t%d\t%d\t%d\n",
175 get_task_state(p), 174 get_task_state(p),
176 (p->sleep_avg/1024)*100/(1020000000/1024), 175 p->tgid, p->pid,
177 p->tgid, p->pid, 176 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
178 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
179 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, 177 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
180 p->uid, p->euid, p->suid, p->fsuid, 178 p->uid, p->euid, p->suid, p->fsuid,
181 p->gid, p->egid, p->sgid, p->fsgid); 179 p->gid, p->egid, p->sgid, p->fsgid);
@@ -193,15 +191,15 @@ static inline char * task_state(struct task_struct *p, char *buffer)
193 get_group_info(group_info); 191 get_group_info(group_info);
194 task_unlock(p); 192 task_unlock(p);
195 193
196 for (g = 0; g < min(group_info->ngroups,NGROUPS_SMALL); g++) 194 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
197 buffer += sprintf(buffer, "%d ", GROUP_AT(group_info,g)); 195 buffer += sprintf(buffer, "%d ", GROUP_AT(group_info, g));
198 put_group_info(group_info); 196 put_group_info(group_info);
199 197
200 buffer += sprintf(buffer, "\n"); 198 buffer += sprintf(buffer, "\n");
201 return buffer; 199 return buffer;
202} 200}
203 201
204static char * render_sigset_t(const char *header, sigset_t *set, char *buffer) 202static char *render_sigset_t(const char *header, sigset_t *set, char *buffer)
205{ 203{
206 int i, len; 204 int i, len;
207 205
@@ -241,7 +239,7 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
241 } 239 }
242} 240}
243 241
244static inline char * task_sig(struct task_struct *p, char *buffer) 242static inline char *task_sig(struct task_struct *p, char *buffer)
245{ 243{
246 unsigned long flags; 244 unsigned long flags;
247 sigset_t pending, shpending, blocked, ignored, caught; 245 sigset_t pending, shpending, blocked, ignored, caught;
@@ -291,14 +289,23 @@ static inline char *task_cap(struct task_struct *p, char *buffer)
291 cap_t(p->cap_effective)); 289 cap_t(p->cap_effective));
292} 290}
293 291
294int proc_pid_status(struct task_struct *task, char * buffer) 292static inline char *task_context_switch_counts(struct task_struct *p,
293 char *buffer)
295{ 294{
296 char * orig = buffer; 295 return buffer + sprintf(buffer, "voluntary_ctxt_switches:\t%lu\n"
296 "nonvoluntary_ctxt_switches:\t%lu\n",
297 p->nvcsw,
298 p->nivcsw);
299}
300
301int proc_pid_status(struct task_struct *task, char *buffer)
302{
303 char *orig = buffer;
297 struct mm_struct *mm = get_task_mm(task); 304 struct mm_struct *mm = get_task_mm(task);
298 305
299 buffer = task_name(task, buffer); 306 buffer = task_name(task, buffer);
300 buffer = task_state(task, buffer); 307 buffer = task_state(task, buffer);
301 308
302 if (mm) { 309 if (mm) {
303 buffer = task_mem(mm, buffer); 310 buffer = task_mem(mm, buffer);
304 mmput(mm); 311 mmput(mm);
@@ -309,10 +316,45 @@ int proc_pid_status(struct task_struct *task, char * buffer)
309#if defined(CONFIG_S390) 316#if defined(CONFIG_S390)
310 buffer = task_show_regs(task, buffer); 317 buffer = task_show_regs(task, buffer);
311#endif 318#endif
319 buffer = task_context_switch_counts(task, buffer);
312 return buffer - orig; 320 return buffer - orig;
313} 321}
314 322
315static int do_task_stat(struct task_struct *task, char * buffer, int whole) 323static clock_t task_utime(struct task_struct *p)
324{
325 clock_t utime = cputime_to_clock_t(p->utime),
326 total = utime + cputime_to_clock_t(p->stime);
327 u64 temp;
328
329 /*
330 * Use CFS's precise accounting:
331 */
332 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
333
334 if (total) {
335 temp *= utime;
336 do_div(temp, total);
337 }
338 utime = (clock_t)temp;
339
340 return utime;
341}
342
343static clock_t task_stime(struct task_struct *p)
344{
345 clock_t stime;
346
347 /*
348 * Use CFS's precise accounting. (we subtract utime from
349 * the total, to make sure the total observed by userspace
350 * grows monotonically - apps rely on that):
351 */
352 stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
353
354 return stime;
355}
356
357static int do_task_stat(struct task_struct *task, char *buffer, int whole)
316{ 358{
317 unsigned long vsize, eip, esp, wchan = ~0UL; 359 unsigned long vsize, eip, esp, wchan = ~0UL;
318 long priority, nice; 360 long priority, nice;
@@ -320,13 +362,14 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
320 sigset_t sigign, sigcatch; 362 sigset_t sigign, sigcatch;
321 char state; 363 char state;
322 int res; 364 int res;
323 pid_t ppid = 0, pgid = -1, sid = -1; 365 pid_t ppid = 0, pgid = -1, sid = -1;
324 int num_threads = 0; 366 int num_threads = 0;
325 struct mm_struct *mm; 367 struct mm_struct *mm;
326 unsigned long long start_time; 368 unsigned long long start_time;
327 unsigned long cmin_flt = 0, cmaj_flt = 0; 369 unsigned long cmin_flt = 0, cmaj_flt = 0;
328 unsigned long min_flt = 0, maj_flt = 0; 370 unsigned long min_flt = 0, maj_flt = 0;
329 cputime_t cutime, cstime, utime, stime; 371 cputime_t cutime, cstime;
372 clock_t utime, stime;
330 unsigned long rsslim = 0; 373 unsigned long rsslim = 0;
331 char tcomm[sizeof(task->comm)]; 374 char tcomm[sizeof(task->comm)];
332 unsigned long flags; 375 unsigned long flags;
@@ -344,7 +387,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
344 387
345 sigemptyset(&sigign); 388 sigemptyset(&sigign);
346 sigemptyset(&sigcatch); 389 sigemptyset(&sigcatch);
347 cutime = cstime = utime = stime = cputime_zero; 390 cutime = cstime = cputime_zero;
391 utime = stime = 0;
348 392
349 rcu_read_lock(); 393 rcu_read_lock();
350 if (lock_task_sighand(task, &flags)) { 394 if (lock_task_sighand(task, &flags)) {
@@ -370,15 +414,15 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
370 do { 414 do {
371 min_flt += t->min_flt; 415 min_flt += t->min_flt;
372 maj_flt += t->maj_flt; 416 maj_flt += t->maj_flt;
373 utime = cputime_add(utime, t->utime); 417 utime += task_utime(t);
374 stime = cputime_add(stime, t->stime); 418 stime += task_stime(t);
375 t = next_thread(t); 419 t = next_thread(t);
376 } while (t != task); 420 } while (t != task);
377 421
378 min_flt += sig->min_flt; 422 min_flt += sig->min_flt;
379 maj_flt += sig->maj_flt; 423 maj_flt += sig->maj_flt;
380 utime = cputime_add(utime, sig->utime); 424 utime += cputime_to_clock_t(sig->utime);
381 stime = cputime_add(stime, sig->stime); 425 stime += cputime_to_clock_t(sig->stime);
382 } 426 }
383 427
384 sid = signal_session(sig); 428 sid = signal_session(sig);
@@ -389,13 +433,13 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
389 } 433 }
390 rcu_read_unlock(); 434 rcu_read_unlock();
391 435
392 if (!whole || num_threads<2) 436 if (!whole || num_threads < 2)
393 wchan = get_wchan(task); 437 wchan = get_wchan(task);
394 if (!whole) { 438 if (!whole) {
395 min_flt = task->min_flt; 439 min_flt = task->min_flt;
396 maj_flt = task->maj_flt; 440 maj_flt = task->maj_flt;
397 utime = task->utime; 441 utime = task_utime(task);
398 stime = task->stime; 442 stime = task_stime(task);
399 } 443 }
400 444
401 /* scale priority and nice values from timeslices to -20..20 */ 445 /* scale priority and nice values from timeslices to -20..20 */
@@ -405,12 +449,13 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
405 449
406 /* Temporary variable needed for gcc-2.96 */ 450 /* Temporary variable needed for gcc-2.96 */
407 /* convert timespec -> nsec*/ 451 /* convert timespec -> nsec*/
408 start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 452 start_time =
409 + task->start_time.tv_nsec; 453 (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
454 + task->real_start_time.tv_nsec;
410 /* convert nsec -> ticks */ 455 /* convert nsec -> ticks */
411 start_time = nsec_to_clock_t(start_time); 456 start_time = nsec_to_clock_t(start_time);
412 457
413 res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %u %lu \ 458 res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
414%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 459%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
415%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", 460%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
416 task->pid, 461 task->pid,
@@ -426,8 +471,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
426 cmin_flt, 471 cmin_flt,
427 maj_flt, 472 maj_flt,
428 cmaj_flt, 473 cmaj_flt,
429 cputime_to_clock_t(utime), 474 utime,
430 cputime_to_clock_t(stime), 475 stime,
431 cputime_to_clock_t(cutime), 476 cputime_to_clock_t(cutime),
432 cputime_to_clock_t(cstime), 477 cputime_to_clock_t(cstime),
433 priority, 478 priority,
@@ -436,7 +481,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
436 start_time, 481 start_time,
437 vsize, 482 vsize,
438 mm ? get_mm_rss(mm) : 0, 483 mm ? get_mm_rss(mm) : 0,
439 rsslim, 484 rsslim,
440 mm ? mm->start_code : 0, 485 mm ? mm->start_code : 0,
441 mm ? mm->end_code : 0, 486 mm ? mm->end_code : 0,
442 mm ? mm->start_stack : 0, 487 mm ? mm->start_stack : 0,
@@ -458,17 +503,17 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
458 task->rt_priority, 503 task->rt_priority,
459 task->policy, 504 task->policy,
460 (unsigned long long)delayacct_blkio_ticks(task)); 505 (unsigned long long)delayacct_blkio_ticks(task));
461 if(mm) 506 if (mm)
462 mmput(mm); 507 mmput(mm);
463 return res; 508 return res;
464} 509}
465 510
466int proc_tid_stat(struct task_struct *task, char * buffer) 511int proc_tid_stat(struct task_struct *task, char *buffer)
467{ 512{
468 return do_task_stat(task, buffer, 0); 513 return do_task_stat(task, buffer, 0);
469} 514}
470 515
471int proc_tgid_stat(struct task_struct *task, char * buffer) 516int proc_tgid_stat(struct task_struct *task, char *buffer)
472{ 517{
473 return do_task_stat(task, buffer, 1); 518 return do_task_stat(task, buffer, 1);
474} 519}
@@ -477,12 +522,12 @@ int proc_pid_statm(struct task_struct *task, char *buffer)
477{ 522{
478 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; 523 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
479 struct mm_struct *mm = get_task_mm(task); 524 struct mm_struct *mm = get_task_mm(task);
480 525
481 if (mm) { 526 if (mm) {
482 size = task_statm(mm, &shared, &text, &data, &resident); 527 size = task_statm(mm, &shared, &text, &data, &resident);
483 mmput(mm); 528 mmput(mm);
484 } 529 }
485 530
486 return sprintf(buffer,"%d %d %d %d %d %d %d\n", 531 return sprintf(buffer, "%d %d %d %d %d %d %d\n",
487 size, resident, shared, text, lib, data, 0); 532 size, resident, shared, text, lib, data, 0);
488} 533}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a5fa1fdafc..42cb4f5613 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -67,7 +67,6 @@
67#include <linux/mount.h> 67#include <linux/mount.h>
68#include <linux/security.h> 68#include <linux/security.h>
69#include <linux/ptrace.h> 69#include <linux/ptrace.h>
70#include <linux/seccomp.h>
71#include <linux/cpuset.h> 70#include <linux/cpuset.h>
72#include <linux/audit.h> 71#include <linux/audit.h>
73#include <linux/poll.h> 72#include <linux/poll.h>
@@ -204,12 +203,17 @@ static int proc_pid_environ(struct task_struct *task, char * buffer)
204 int res = 0; 203 int res = 0;
205 struct mm_struct *mm = get_task_mm(task); 204 struct mm_struct *mm = get_task_mm(task);
206 if (mm) { 205 if (mm) {
207 unsigned int len = mm->env_end - mm->env_start; 206 unsigned int len;
207
208 res = -ESRCH;
209 if (!ptrace_may_attach(task))
210 goto out;
211
212 len = mm->env_end - mm->env_start;
208 if (len > PAGE_SIZE) 213 if (len > PAGE_SIZE)
209 len = PAGE_SIZE; 214 len = PAGE_SIZE;
210 res = access_process_vm(task, mm->env_start, buffer, len, 0); 215 res = access_process_vm(task, mm->env_start, buffer, len, 0);
211 if (!ptrace_may_attach(task)) 216out:
212 res = -ESRCH;
213 mmput(mm); 217 mmput(mm);
214 } 218 }
215 return res; 219 return res;
@@ -279,7 +283,7 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
279static int proc_pid_wchan(struct task_struct *task, char *buffer) 283static int proc_pid_wchan(struct task_struct *task, char *buffer)
280{ 284{
281 unsigned long wchan; 285 unsigned long wchan;
282 char symname[KSYM_NAME_LEN+1]; 286 char symname[KSYM_NAME_LEN];
283 287
284 wchan = get_wchan(task); 288 wchan = get_wchan(task);
285 289
@@ -296,7 +300,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
296 */ 300 */
297static int proc_pid_schedstat(struct task_struct *task, char *buffer) 301static int proc_pid_schedstat(struct task_struct *task, char *buffer)
298{ 302{
299 return sprintf(buffer, "%lu %lu %lu\n", 303 return sprintf(buffer, "%llu %llu %lu\n",
300 task->sched_info.cpu_time, 304 task->sched_info.cpu_time,
301 task->sched_info.run_delay, 305 task->sched_info.run_delay,
302 task->sched_info.pcnt); 306 task->sched_info.pcnt);
@@ -812,71 +816,6 @@ static const struct file_operations proc_loginuid_operations = {
812}; 816};
813#endif 817#endif
814 818
815#ifdef CONFIG_SECCOMP
816static ssize_t seccomp_read(struct file *file, char __user *buf,
817 size_t count, loff_t *ppos)
818{
819 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
820 char __buf[20];
821 size_t len;
822
823 if (!tsk)
824 return -ESRCH;
825 /* no need to print the trailing zero, so use only len */
826 len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
827 put_task_struct(tsk);
828
829 return simple_read_from_buffer(buf, count, ppos, __buf, len);
830}
831
832static ssize_t seccomp_write(struct file *file, const char __user *buf,
833 size_t count, loff_t *ppos)
834{
835 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
836 char __buf[20], *end;
837 unsigned int seccomp_mode;
838 ssize_t result;
839
840 result = -ESRCH;
841 if (!tsk)
842 goto out_no_task;
843
844 /* can set it only once to be even more secure */
845 result = -EPERM;
846 if (unlikely(tsk->seccomp.mode))
847 goto out;
848
849 result = -EFAULT;
850 memset(__buf, 0, sizeof(__buf));
851 count = min(count, sizeof(__buf) - 1);
852 if (copy_from_user(__buf, buf, count))
853 goto out;
854
855 seccomp_mode = simple_strtoul(__buf, &end, 0);
856 if (*end == '\n')
857 end++;
858 result = -EINVAL;
859 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
860 tsk->seccomp.mode = seccomp_mode;
861 set_tsk_thread_flag(tsk, TIF_SECCOMP);
862 } else
863 goto out;
864 result = -EIO;
865 if (unlikely(!(end - __buf)))
866 goto out;
867 result = end - __buf;
868out:
869 put_task_struct(tsk);
870out_no_task:
871 return result;
872}
873
874static const struct file_operations proc_seccomp_operations = {
875 .read = seccomp_read,
876 .write = seccomp_write,
877};
878#endif /* CONFIG_SECCOMP */
879
880#ifdef CONFIG_FAULT_INJECTION 819#ifdef CONFIG_FAULT_INJECTION
881static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, 820static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
882 size_t count, loff_t *ppos) 821 size_t count, loff_t *ppos)
@@ -929,6 +868,69 @@ static const struct file_operations proc_fault_inject_operations = {
929}; 868};
930#endif 869#endif
931 870
871#ifdef CONFIG_SCHED_DEBUG
872/*
873 * Print out various scheduling related per-task fields:
874 */
875static int sched_show(struct seq_file *m, void *v)
876{
877 struct inode *inode = m->private;
878 struct task_struct *p;
879
880 WARN_ON(!inode);
881
882 p = get_proc_task(inode);
883 if (!p)
884 return -ESRCH;
885 proc_sched_show_task(p, m);
886
887 put_task_struct(p);
888
889 return 0;
890}
891
892static ssize_t
893sched_write(struct file *file, const char __user *buf,
894 size_t count, loff_t *offset)
895{
896 struct inode *inode = file->f_path.dentry->d_inode;
897 struct task_struct *p;
898
899 WARN_ON(!inode);
900
901 p = get_proc_task(inode);
902 if (!p)
903 return -ESRCH;
904 proc_sched_set_task(p);
905
906 put_task_struct(p);
907
908 return count;
909}
910
911static int sched_open(struct inode *inode, struct file *filp)
912{
913 int ret;
914
915 ret = single_open(filp, sched_show, NULL);
916 if (!ret) {
917 struct seq_file *m = filp->private_data;
918
919 m->private = inode;
920 }
921 return ret;
922}
923
924static const struct file_operations proc_pid_sched_operations = {
925 .open = sched_open,
926 .read = seq_read,
927 .write = sched_write,
928 .llseek = seq_lseek,
929 .release = seq_release,
930};
931
932#endif
933
932static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) 934static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
933{ 935{
934 struct inode *inode = dentry->d_inode; 936 struct inode *inode = dentry->d_inode;
@@ -1963,6 +1965,9 @@ static const struct pid_entry tgid_base_stuff[] = {
1963 INF("environ", S_IRUSR, pid_environ), 1965 INF("environ", S_IRUSR, pid_environ),
1964 INF("auxv", S_IRUSR, pid_auxv), 1966 INF("auxv", S_IRUSR, pid_auxv),
1965 INF("status", S_IRUGO, pid_status), 1967 INF("status", S_IRUGO, pid_status),
1968#ifdef CONFIG_SCHED_DEBUG
1969 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
1970#endif
1966 INF("cmdline", S_IRUGO, pid_cmdline), 1971 INF("cmdline", S_IRUGO, pid_cmdline),
1967 INF("stat", S_IRUGO, tgid_stat), 1972 INF("stat", S_IRUGO, tgid_stat),
1968 INF("statm", S_IRUGO, pid_statm), 1973 INF("statm", S_IRUGO, pid_statm),
@@ -1971,9 +1976,6 @@ static const struct pid_entry tgid_base_stuff[] = {
1971 REG("numa_maps", S_IRUGO, numa_maps), 1976 REG("numa_maps", S_IRUGO, numa_maps),
1972#endif 1977#endif
1973 REG("mem", S_IRUSR|S_IWUSR, mem), 1978 REG("mem", S_IRUSR|S_IWUSR, mem),
1974#ifdef CONFIG_SECCOMP
1975 REG("seccomp", S_IRUSR|S_IWUSR, seccomp),
1976#endif
1977 LNK("cwd", cwd), 1979 LNK("cwd", cwd),
1978 LNK("root", root), 1980 LNK("root", root),
1979 LNK("exe", exe), 1981 LNK("exe", exe),
@@ -2247,6 +2249,9 @@ static const struct pid_entry tid_base_stuff[] = {
2247 INF("environ", S_IRUSR, pid_environ), 2249 INF("environ", S_IRUSR, pid_environ),
2248 INF("auxv", S_IRUSR, pid_auxv), 2250 INF("auxv", S_IRUSR, pid_auxv),
2249 INF("status", S_IRUGO, pid_status), 2251 INF("status", S_IRUGO, pid_status),
2252#ifdef CONFIG_SCHED_DEBUG
2253 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
2254#endif
2250 INF("cmdline", S_IRUGO, pid_cmdline), 2255 INF("cmdline", S_IRUGO, pid_cmdline),
2251 INF("stat", S_IRUGO, tid_stat), 2256 INF("stat", S_IRUGO, tid_stat),
2252 INF("statm", S_IRUGO, pid_statm), 2257 INF("statm", S_IRUGO, pid_statm),
@@ -2255,9 +2260,6 @@ static const struct pid_entry tid_base_stuff[] = {
2255 REG("numa_maps", S_IRUGO, numa_maps), 2260 REG("numa_maps", S_IRUGO, numa_maps),
2256#endif 2261#endif
2257 REG("mem", S_IRUSR|S_IWUSR, mem), 2262 REG("mem", S_IRUSR|S_IWUSR, mem),
2258#ifdef CONFIG_SECCOMP
2259 REG("seccomp", S_IRUSR|S_IWUSR, seccomp),
2260#endif
2261 LNK("cwd", cwd), 2263 LNK("cwd", cwd),
2262 LNK("root", root), 2264 LNK("root", root),
2263 LNK("exe", exe), 2265 LNK("exe", exe),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8a40e15f5e..b5e7155d30 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -20,6 +20,7 @@
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/bitops.h> 21#include <linux/bitops.h>
22#include <linux/spinlock.h> 22#include <linux/spinlock.h>
23#include <linux/completion.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24 25
25#include "internal.h" 26#include "internal.h"
@@ -529,12 +530,6 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
529 return -EAGAIN; 530 return -EAGAIN;
530 dp->low_ino = i; 531 dp->low_ino = i;
531 532
532 spin_lock(&proc_subdir_lock);
533 dp->next = dir->subdir;
534 dp->parent = dir;
535 dir->subdir = dp;
536 spin_unlock(&proc_subdir_lock);
537
538 if (S_ISDIR(dp->mode)) { 533 if (S_ISDIR(dp->mode)) {
539 if (dp->proc_iops == NULL) { 534 if (dp->proc_iops == NULL) {
540 dp->proc_fops = &proc_dir_operations; 535 dp->proc_fops = &proc_dir_operations;
@@ -550,6 +545,13 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
550 if (dp->proc_iops == NULL) 545 if (dp->proc_iops == NULL)
551 dp->proc_iops = &proc_file_inode_operations; 546 dp->proc_iops = &proc_file_inode_operations;
552 } 547 }
548
549 spin_lock(&proc_subdir_lock);
550 dp->next = dir->subdir;
551 dp->parent = dir;
552 dir->subdir = dp;
553 spin_unlock(&proc_subdir_lock);
554
553 return 0; 555 return 0;
554} 556}
555 557
@@ -613,6 +615,9 @@ static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
613 ent->namelen = len; 615 ent->namelen = len;
614 ent->mode = mode; 616 ent->mode = mode;
615 ent->nlink = nlink; 617 ent->nlink = nlink;
618 ent->pde_users = 0;
619 spin_lock_init(&ent->pde_unload_lock);
620 ent->pde_unload_completion = NULL;
616 out: 621 out:
617 return ent; 622 return ent;
618} 623}
@@ -649,9 +654,6 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
649 654
650 ent = proc_create(&parent, name, S_IFDIR | mode, 2); 655 ent = proc_create(&parent, name, S_IFDIR | mode, 2);
651 if (ent) { 656 if (ent) {
652 ent->proc_fops = &proc_dir_operations;
653 ent->proc_iops = &proc_dir_inode_operations;
654
655 if (proc_register(parent, ent) < 0) { 657 if (proc_register(parent, ent) < 0) {
656 kfree(ent); 658 kfree(ent);
657 ent = NULL; 659 ent = NULL;
@@ -686,10 +688,6 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
686 688
687 ent = proc_create(&parent,name,mode,nlink); 689 ent = proc_create(&parent,name,mode,nlink);
688 if (ent) { 690 if (ent) {
689 if (S_ISDIR(mode)) {
690 ent->proc_fops = &proc_dir_operations;
691 ent->proc_iops = &proc_dir_inode_operations;
692 }
693 if (proc_register(parent, ent) < 0) { 691 if (proc_register(parent, ent) < 0) {
694 kfree(ent); 692 kfree(ent);
695 ent = NULL; 693 ent = NULL;
@@ -734,9 +732,35 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
734 de = *p; 732 de = *p;
735 *p = de->next; 733 *p = de->next;
736 de->next = NULL; 734 de->next = NULL;
735
736 spin_lock(&de->pde_unload_lock);
737 /*
738 * Stop accepting new callers into module. If you're
739 * dynamically allocating ->proc_fops, save a pointer somewhere.
740 */
741 de->proc_fops = NULL;
742 /* Wait until all existing callers into module are done. */
743 if (de->pde_users > 0) {
744 DECLARE_COMPLETION_ONSTACK(c);
745
746 if (!de->pde_unload_completion)
747 de->pde_unload_completion = &c;
748
749 spin_unlock(&de->pde_unload_lock);
750 spin_unlock(&proc_subdir_lock);
751
752 wait_for_completion(de->pde_unload_completion);
753
754 spin_lock(&proc_subdir_lock);
755 goto continue_removing;
756 }
757 spin_unlock(&de->pde_unload_lock);
758
759continue_removing:
737 if (S_ISDIR(de->mode)) 760 if (S_ISDIR(de->mode))
738 parent->nlink--; 761 parent->nlink--;
739 proc_kill_inodes(de); 762 if (!S_ISREG(de->mode))
763 proc_kill_inodes(de);
740 de->nlink = 0; 764 de->nlink = 0;
741 WARN_ON(de->subdir); 765 WARN_ON(de->subdir);
742 if (!atomic_read(&de->count)) 766 if (!atomic_read(&de->count))
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d5ce65c68d..dd28e86ab4 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/stat.h> 12#include <linux/stat.h>
13#include <linux/completion.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/limits.h> 15#include <linux/limits.h>
15#include <linux/init.h> 16#include <linux/init.h>
@@ -140,6 +141,251 @@ static const struct super_operations proc_sops = {
140 .remount_fs = proc_remount, 141 .remount_fs = proc_remount,
141}; 142};
142 143
144static void pde_users_dec(struct proc_dir_entry *pde)
145{
146 spin_lock(&pde->pde_unload_lock);
147 pde->pde_users--;
148 if (pde->pde_unload_completion && pde->pde_users == 0)
149 complete(pde->pde_unload_completion);
150 spin_unlock(&pde->pde_unload_lock);
151}
152
153static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
154{
155 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
156 loff_t rv = -EINVAL;
157 loff_t (*llseek)(struct file *, loff_t, int);
158
159 spin_lock(&pde->pde_unload_lock);
160 /*
161 * remove_proc_entry() is going to delete PDE (as part of module
162 * cleanup sequence). No new callers into module allowed.
163 */
164 if (!pde->proc_fops) {
165 spin_unlock(&pde->pde_unload_lock);
166 return rv;
167 }
168 /*
169 * Bump refcount so that remove_proc_entry will wail for ->llseek to
170 * complete.
171 */
172 pde->pde_users++;
173 /*
174 * Save function pointer under lock, to protect against ->proc_fops
175 * NULL'ifying right after ->pde_unload_lock is dropped.
176 */
177 llseek = pde->proc_fops->llseek;
178 spin_unlock(&pde->pde_unload_lock);
179
180 if (!llseek)
181 llseek = default_llseek;
182 rv = llseek(file, offset, whence);
183
184 pde_users_dec(pde);
185 return rv;
186}
187
188static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
189{
190 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
191 ssize_t rv = -EIO;
192 ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
193
194 spin_lock(&pde->pde_unload_lock);
195 if (!pde->proc_fops) {
196 spin_unlock(&pde->pde_unload_lock);
197 return rv;
198 }
199 pde->pde_users++;
200 read = pde->proc_fops->read;
201 spin_unlock(&pde->pde_unload_lock);
202
203 if (read)
204 rv = read(file, buf, count, ppos);
205
206 pde_users_dec(pde);
207 return rv;
208}
209
210static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
211{
212 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
213 ssize_t rv = -EIO;
214 ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
215
216 spin_lock(&pde->pde_unload_lock);
217 if (!pde->proc_fops) {
218 spin_unlock(&pde->pde_unload_lock);
219 return rv;
220 }
221 pde->pde_users++;
222 write = pde->proc_fops->write;
223 spin_unlock(&pde->pde_unload_lock);
224
225 if (write)
226 rv = write(file, buf, count, ppos);
227
228 pde_users_dec(pde);
229 return rv;
230}
231
232static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
233{
234 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
235 unsigned int rv = 0;
236 unsigned int (*poll)(struct file *, struct poll_table_struct *);
237
238 spin_lock(&pde->pde_unload_lock);
239 if (!pde->proc_fops) {
240 spin_unlock(&pde->pde_unload_lock);
241 return rv;
242 }
243 pde->pde_users++;
244 poll = pde->proc_fops->poll;
245 spin_unlock(&pde->pde_unload_lock);
246
247 if (poll)
248 rv = poll(file, pts);
249
250 pde_users_dec(pde);
251 return rv;
252}
253
254static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
255{
256 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
257 long rv = -ENOTTY;
258 long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
259 int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
260
261 spin_lock(&pde->pde_unload_lock);
262 if (!pde->proc_fops) {
263 spin_unlock(&pde->pde_unload_lock);
264 return rv;
265 }
266 pde->pde_users++;
267 unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
268 ioctl = pde->proc_fops->ioctl;
269 spin_unlock(&pde->pde_unload_lock);
270
271 if (unlocked_ioctl) {
272 rv = unlocked_ioctl(file, cmd, arg);
273 if (rv == -ENOIOCTLCMD)
274 rv = -EINVAL;
275 } else if (ioctl) {
276 lock_kernel();
277 rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
278 unlock_kernel();
279 }
280
281 pde_users_dec(pde);
282 return rv;
283}
284
285#ifdef CONFIG_COMPAT
286static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
287{
288 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
289 long rv = -ENOTTY;
290 long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
291
292 spin_lock(&pde->pde_unload_lock);
293 if (!pde->proc_fops) {
294 spin_unlock(&pde->pde_unload_lock);
295 return rv;
296 }
297 pde->pde_users++;
298 compat_ioctl = pde->proc_fops->compat_ioctl;
299 spin_unlock(&pde->pde_unload_lock);
300
301 if (compat_ioctl)
302 rv = compat_ioctl(file, cmd, arg);
303
304 pde_users_dec(pde);
305 return rv;
306}
307#endif
308
309static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
310{
311 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
312 int rv = -EIO;
313 int (*mmap)(struct file *, struct vm_area_struct *);
314
315 spin_lock(&pde->pde_unload_lock);
316 if (!pde->proc_fops) {
317 spin_unlock(&pde->pde_unload_lock);
318 return rv;
319 }
320 pde->pde_users++;
321 mmap = pde->proc_fops->mmap;
322 spin_unlock(&pde->pde_unload_lock);
323
324 if (mmap)
325 rv = mmap(file, vma);
326
327 pde_users_dec(pde);
328 return rv;
329}
330
331static int proc_reg_open(struct inode *inode, struct file *file)
332{
333 struct proc_dir_entry *pde = PDE(inode);
334 int rv = 0;
335 int (*open)(struct inode *, struct file *);
336
337 spin_lock(&pde->pde_unload_lock);
338 if (!pde->proc_fops) {
339 spin_unlock(&pde->pde_unload_lock);
340 return rv;
341 }
342 pde->pde_users++;
343 open = pde->proc_fops->open;
344 spin_unlock(&pde->pde_unload_lock);
345
346 if (open)
347 rv = open(inode, file);
348
349 pde_users_dec(pde);
350 return rv;
351}
352
353static int proc_reg_release(struct inode *inode, struct file *file)
354{
355 struct proc_dir_entry *pde = PDE(inode);
356 int rv = 0;
357 int (*release)(struct inode *, struct file *);
358
359 spin_lock(&pde->pde_unload_lock);
360 if (!pde->proc_fops) {
361 spin_unlock(&pde->pde_unload_lock);
362 return rv;
363 }
364 pde->pde_users++;
365 release = pde->proc_fops->release;
366 spin_unlock(&pde->pde_unload_lock);
367
368 if (release)
369 rv = release(inode, file);
370
371 pde_users_dec(pde);
372 return rv;
373}
374
375static const struct file_operations proc_reg_file_ops = {
376 .llseek = proc_reg_llseek,
377 .read = proc_reg_read,
378 .write = proc_reg_write,
379 .poll = proc_reg_poll,
380 .unlocked_ioctl = proc_reg_unlocked_ioctl,
381#ifdef CONFIG_COMPAT
382 .compat_ioctl = proc_reg_compat_ioctl,
383#endif
384 .mmap = proc_reg_mmap,
385 .open = proc_reg_open,
386 .release = proc_reg_release,
387};
388
143struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, 389struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
144 struct proc_dir_entry *de) 390 struct proc_dir_entry *de)
145{ 391{
@@ -166,8 +412,12 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
166 inode->i_nlink = de->nlink; 412 inode->i_nlink = de->nlink;
167 if (de->proc_iops) 413 if (de->proc_iops)
168 inode->i_op = de->proc_iops; 414 inode->i_op = de->proc_iops;
169 if (de->proc_fops) 415 if (de->proc_fops) {
170 inode->i_fop = de->proc_fops; 416 if (S_ISREG(inode->i_mode))
417 inode->i_fop = &proc_reg_file_ops;
418 else
419 inode->i_fop = de->proc_fops;
420 }
171 } 421 }
172 422
173 return inode; 423 return inode;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5fd49e47f8..d24b8d4605 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -105,6 +105,7 @@ static int uptime_read_proc(char *page, char **start, off_t off,
105 cputime_t idletime = cputime_add(init_task.utime, init_task.stime); 105 cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
106 106
107 do_posix_clock_monotonic_gettime(&uptime); 107 do_posix_clock_monotonic_gettime(&uptime);
108 monotonic_to_bootbased(&uptime);
108 cputime_to_timespec(idletime, &idle); 109 cputime_to_timespec(idletime, &idle);
109 len = sprintf(page,"%lu.%02lu %lu.%02lu\n", 110 len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
110 (unsigned long) uptime.tv_sec, 111 (unsigned long) uptime.tv_sec,
@@ -443,12 +444,12 @@ static int show_stat(struct seq_file *p, void *v)
443 unsigned long jif; 444 unsigned long jif;
444 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 445 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
445 u64 sum = 0; 446 u64 sum = 0;
447 struct timespec boottime;
446 448
447 user = nice = system = idle = iowait = 449 user = nice = system = idle = iowait =
448 irq = softirq = steal = cputime64_zero; 450 irq = softirq = steal = cputime64_zero;
449 jif = - wall_to_monotonic.tv_sec; 451 getboottime(&boottime);
450 if (wall_to_monotonic.tv_nsec) 452 jif = boottime.tv_sec;
451 --jif;
452 453
453 for_each_possible_cpu(i) { 454 for_each_possible_cpu(i) {
454 int j; 455 int j;
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index b3a473b0a1..22846225ac 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -69,7 +69,7 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
69 69
70static int show_tty_driver(struct seq_file *m, void *v) 70static int show_tty_driver(struct seq_file *m, void *v)
71{ 71{
72 struct tty_driver *p = v; 72 struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers);
73 dev_t from = MKDEV(p->major, p->minor_start); 73 dev_t from = MKDEV(p->major, p->minor_start);
74 dev_t to = from + p->num; 74 dev_t to = from + p->num;
75 75
@@ -106,22 +106,13 @@ static int show_tty_driver(struct seq_file *m, void *v)
106/* iterator */ 106/* iterator */
107static void *t_start(struct seq_file *m, loff_t *pos) 107static void *t_start(struct seq_file *m, loff_t *pos)
108{ 108{
109 struct list_head *p;
110 loff_t l = *pos;
111
112 mutex_lock(&tty_mutex); 109 mutex_lock(&tty_mutex);
113 list_for_each(p, &tty_drivers) 110 return seq_list_start(&tty_drivers, *pos);
114 if (!l--)
115 return list_entry(p, struct tty_driver, tty_drivers);
116 return NULL;
117} 111}
118 112
119static void *t_next(struct seq_file *m, void *v, loff_t *pos) 113static void *t_next(struct seq_file *m, void *v, loff_t *pos)
120{ 114{
121 struct list_head *p = ((struct tty_driver *)v)->tty_drivers.next; 115 return seq_list_next(v, &tty_drivers, pos);
122 (*pos)++;
123 return p==&tty_drivers ? NULL :
124 list_entry(p, struct tty_driver, tty_drivers);
125} 116}
126 117
127static void t_stop(struct seq_file *m, void *v) 118static void t_stop(struct seq_file *m, void *v)
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 44649981bb..867f42b020 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -25,7 +25,7 @@ const struct file_operations qnx4_file_operations =
25 .read = do_sync_read, 25 .read = do_sync_read,
26 .aio_read = generic_file_aio_read, 26 .aio_read = generic_file_aio_read,
27 .mmap = generic_file_mmap, 27 .mmap = generic_file_mmap,
28 .sendfile = generic_file_sendfile, 28 .splice_read = generic_file_splice_read,
29#ifdef CONFIG_QNX4FS_RW 29#ifdef CONFIG_QNX4FS_RW
30 .write = do_sync_write, 30 .write = do_sync_write,
31 .aio_write = generic_file_aio_write, 31 .aio_write = generic_file_aio_write,
diff --git a/fs/quota.c b/fs/quota.c
index 9f237d6182..e6577ac15a 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -10,12 +10,14 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <asm/current.h> 11#include <asm/current.h>
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13#include <linux/compat.h>
13#include <linux/kernel.h> 14#include <linux/kernel.h>
14#include <linux/security.h> 15#include <linux/security.h>
15#include <linux/syscalls.h> 16#include <linux/syscalls.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
17#include <linux/capability.h> 18#include <linux/capability.h>
18#include <linux/quotaops.h> 19#include <linux/quotaops.h>
20#include <linux/types.h>
19 21
20/* Check validity of generic quotactl commands */ 22/* Check validity of generic quotactl commands */
21static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) 23static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
@@ -384,3 +386,119 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t
384 386
385 return ret; 387 return ret;
386} 388}
389
390#if defined(CONFIG_X86_64) || defined(CONFIG_IA64)
391/*
392 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
393 * and is necessary due to alignment problems.
394 */
/*
 * 32-bit layout of struct if_dqblk (Q_GETQUOTA / Q_SETQUOTA payload).
 * compat_u64 fields match what 32-bit quota tools hand in; the native
 * structure's 64-bit alignment differs on x86_64/ia64 (see the comment
 * above), so sys32_quotactl() below translates between the two.
 */
395struct compat_if_dqblk {
396 compat_u64 dqb_bhardlimit;
397 compat_u64 dqb_bsoftlimit;
398 compat_u64 dqb_curspace;
399 compat_u64 dqb_ihardlimit;
400 compat_u64 dqb_isoftlimit;
401 compat_u64 dqb_curinodes;
402 compat_u64 dqb_btime;
403 compat_u64 dqb_itime;
404 compat_uint_t dqb_valid;
405};
406
407/* XFS structures */
/*
 * 32-bit layout of struct fs_qfilestat, embedded twice in
 * compat_fs_quota_stat below.
 * NOTE(review): the first member's name looks copy-pasted from
 * if_dqblk; in the native fs_qfilestat this slot is the inode number —
 * verify against <linux/dqblk_xfs.h>.  Only layout matters here, since
 * the struct is moved with copy_in_user().
 */
408struct compat_fs_qfilestat {
409 compat_u64 dqb_bhardlimit;
410 compat_u64 qfs_nblks;
411 compat_uint_t qfs_nextents;
412};
413
/*
 * 32-bit layout of struct fs_quota_stat (Q_XGETQSTAT payload).
 * sys32_quotactl() below copies this piecewise — header, qs_uquota,
 * qs_gquota, then the trailing scalars — because the embedded u64
 * members pad differently in the native 64-bit structure.
 */
414struct compat_fs_quota_stat {
415 __s8 qs_version;
416 __u16 qs_flags;
417 __s8 qs_pad;
418 struct compat_fs_qfilestat qs_uquota;
419 struct compat_fs_qfilestat qs_gquota;
420 compat_uint_t qs_incoredqs;
421 compat_int_t qs_btimelimit;
422 compat_int_t qs_itimelimit;
423 compat_int_t qs_rtbtimelimit;
424 __u16 qs_bwarnlimit;
425 __u16 qs_iwarnlimit;
426};
427
/*
 * 32-bit compat entry point for quotactl() on 64-bit kernels.
 * Translates the Q_GETQUOTA/Q_SETQUOTA (if_dqblk) and Q_XGETQSTAT
 * (fs_quota_stat) payloads between the packed compat layouts above and
 * the native structures, then forwards to sys_quotactl().  All staging
 * is done in user space via compat_alloc_user_space(), so only
 * user-copy primitives (copy_in_user/get_user/put_user) are needed.
 * Subcommands whose payload layout is ABI-independent fall through to
 * sys_quotactl() unchanged.
 */
428asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
429 qid_t id, void __user *addr)
430{
431 unsigned int cmds;
432 struct if_dqblk __user *dqblk;
433 struct compat_if_dqblk __user *compat_dqblk;
434 struct fs_quota_stat __user *fsqstat;
435 struct compat_fs_quota_stat __user *compat_fsqstat;
436 compat_uint_t data;
437 u16 xdata;
438 long ret;
439
440 cmds = cmd >> SUBCMDSHIFT;
441
442 switch (cmds) {
443 case Q_GETQUOTA:
	/*
	 * Run the native syscall into a scratch native buffer, then
	 * repack into the caller's compat buffer.  dqb_valid is also
	 * moved with an explicit get_user/put_user pair — presumably
	 * because trailing padding of the two layouts differs; see the
	 * struct definitions above.
	 */
444 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
445 compat_dqblk = addr;
446 ret = sys_quotactl(cmd, special, id, dqblk);
447 if (ret)
448 break;
449 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
450 get_user(data, &dqblk->dqb_valid) ||
451 put_user(data, &compat_dqblk->dqb_valid))
452 ret = -EFAULT;
453 break;
454 case Q_SETQUOTA:
	/* Reverse direction: unpack compat input into a native scratch
	 * buffer before invoking the native syscall. */
455 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
456 compat_dqblk = addr;
457 ret = -EFAULT;
458 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
459 get_user(data, &compat_dqblk->dqb_valid) ||
460 put_user(data, &dqblk->dqb_valid))
461 break;
462 ret = sys_quotactl(cmd, special, id, dqblk);
463 break;
464 case Q_XGETQSTAT:
	/*
	 * fs_quota_stat is copied in four pieces because the embedded
	 * fs_qfilestat members contain u64s that pad differently on the
	 * 64-bit side; each piece's trailing 32-bit field is patched up
	 * individually.
	 */
465 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
466 compat_fsqstat = addr;
467 ret = sys_quotactl(cmd, special, id, fsqstat);
468 if (ret)
469 break;
470 ret = -EFAULT;
471 /* Copying qs_version, qs_flags, qs_pad */
472 if (copy_in_user(compat_fsqstat, fsqstat,
473 offsetof(struct compat_fs_quota_stat, qs_uquota)))
474 break;
475 /* Copying qs_uquota */
476 if (copy_in_user(&compat_fsqstat->qs_uquota,
477 &fsqstat->qs_uquota,
478 sizeof(compat_fsqstat->qs_uquota)) ||
479 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
480 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
481 break;
482 /* Copying qs_gquota */
483 if (copy_in_user(&compat_fsqstat->qs_gquota,
484 &fsqstat->qs_gquota,
485 sizeof(compat_fsqstat->qs_gquota)) ||
486 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
487 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
488 break;
489 /* Copying the rest */
490 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
491 &fsqstat->qs_incoredqs,
492 sizeof(struct compat_fs_quota_stat) -
493 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
494 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
495 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
496 break;
497 ret = 0;
498 break;
499 default:
	/* Layout-compatible subcommands: pass the buffer straight through. */
500 ret = sys_quotactl(cmd, special, id, addr);
501 }
502 return ret;
503}
504#endif
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 2f14774a12..97bdc0b2f9 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -41,7 +41,7 @@ const struct file_operations ramfs_file_operations = {
41 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
42 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
43 .fsync = simple_sync_file, 43 .fsync = simple_sync_file,
44 .sendfile = generic_file_sendfile, 44 .splice_read = generic_file_splice_read,
45 .llseek = generic_file_llseek, 45 .llseek = generic_file_llseek,
46}; 46};
47 47
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5d258c40a2..cad2b7ace6 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
42 .write = do_sync_write, 42 .write = do_sync_write,
43 .aio_write = generic_file_aio_write, 43 .aio_write = generic_file_aio_write,
44 .fsync = simple_sync_file, 44 .fsync = simple_sync_file,
45 .sendfile = generic_file_sendfile, 45 .splice_read = generic_file_splice_read,
46 .llseek = generic_file_llseek, 46 .llseek = generic_file_llseek,
47}; 47};
48 48
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d40d22b347..ef2b46d099 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -60,6 +60,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
60 inode->i_blocks = 0; 60 inode->i_blocks = 0;
61 inode->i_mapping->a_ops = &ramfs_aops; 61 inode->i_mapping->a_ops = &ramfs_aops;
62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
63 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
63 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 64 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
64 switch (mode & S_IFMT) { 65 switch (mode & S_IFMT) {
65 default: 66 default:
diff --git a/fs/read_write.c b/fs/read_write.c
index 4d03008f01..507ddff48a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/splice.h>
18#include "read_write.h" 19#include "read_write.h"
19 20
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
@@ -25,7 +26,7 @@ const struct file_operations generic_ro_fops = {
25 .read = do_sync_read, 26 .read = do_sync_read,
26 .aio_read = generic_file_aio_read, 27 .aio_read = generic_file_aio_read,
27 .mmap = generic_file_readonly_mmap, 28 .mmap = generic_file_readonly_mmap,
28 .sendfile = generic_file_sendfile, 29 .splice_read = generic_file_splice_read,
29}; 30};
30 31
31EXPORT_SYMBOL(generic_ro_fops); 32EXPORT_SYMBOL(generic_ro_fops);
@@ -708,7 +709,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
708 struct inode * in_inode, * out_inode; 709 struct inode * in_inode, * out_inode;
709 loff_t pos; 710 loff_t pos;
710 ssize_t retval; 711 ssize_t retval;
711 int fput_needed_in, fput_needed_out; 712 int fput_needed_in, fput_needed_out, fl;
712 713
713 /* 714 /*
714 * Get input file, and verify that it is ok.. 715 * Get input file, and verify that it is ok..
@@ -723,7 +724,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
723 in_inode = in_file->f_path.dentry->d_inode; 724 in_inode = in_file->f_path.dentry->d_inode;
724 if (!in_inode) 725 if (!in_inode)
725 goto fput_in; 726 goto fput_in;
726 if (!in_file->f_op || !in_file->f_op->sendfile) 727 if (!in_file->f_op || !in_file->f_op->splice_read)
727 goto fput_in; 728 goto fput_in;
728 retval = -ESPIPE; 729 retval = -ESPIPE;
729 if (!ppos) 730 if (!ppos)
@@ -776,7 +777,18 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
776 count = max - pos; 777 count = max - pos;
777 } 778 }
778 779
779 retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file); 780 fl = 0;
781#if 0
782 /*
783 * We need to debate whether we can enable this or not. The
784 * man page documents EAGAIN return for the output at least,
785 * and the application is arguably buggy if it doesn't expect
786 * EAGAIN on a non-blocking file descriptor.
787 */
788 if (in_file->f_flags & O_NONBLOCK)
789 fl = SPLICE_F_NONBLOCK;
790#endif
791 retval = do_splice_direct(in_file, ppos, out_file, count, fl);
780 792
781 if (retval > 0) { 793 if (retval > 0) {
782 add_rchar(current, retval); 794 add_rchar(current, retval);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9e451a6858..2070aeee2a 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1305,7 +1305,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1305 if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && 1305 if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
1306 *ppos + count > MAX_NON_LFS) { 1306 *ppos + count > MAX_NON_LFS) {
1307 if (*ppos >= MAX_NON_LFS) { 1307 if (*ppos >= MAX_NON_LFS) {
1308 send_sig(SIGXFSZ, current, 0);
1309 return -EFBIG; 1308 return -EFBIG;
1310 } 1309 }
1311 if (count > MAX_NON_LFS - (unsigned long)*ppos) 1310 if (count > MAX_NON_LFS - (unsigned long)*ppos)
@@ -1531,7 +1530,6 @@ const struct file_operations reiserfs_file_operations = {
1531 .open = generic_file_open, 1530 .open = generic_file_open,
1532 .release = reiserfs_file_release, 1531 .release = reiserfs_file_release,
1533 .fsync = reiserfs_sync_file, 1532 .fsync = reiserfs_sync_file,
1534 .sendfile = generic_file_sendfile,
1535 .aio_read = generic_file_aio_read, 1533 .aio_read = generic_file_aio_read,
1536 .aio_write = generic_file_aio_write, 1534 .aio_write = generic_file_aio_write,
1537 .splice_read = generic_file_splice_read, 1535 .splice_read = generic_file_splice_read,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 1272d11399..ddde489f1c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -7,6 +7,7 @@
7#include <linux/reiserfs_fs.h> 7#include <linux/reiserfs_fs.h>
8#include <linux/reiserfs_acl.h> 8#include <linux/reiserfs_acl.h>
9#include <linux/reiserfs_xattr.h> 9#include <linux/reiserfs_xattr.h>
10#include <linux/exportfs.h>
10#include <linux/smp_lock.h> 11#include <linux/smp_lock.h>
11#include <linux/pagemap.h> 12#include <linux/pagemap.h>
12#include <linux/highmem.h> 13#include <linux/highmem.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index b484d2913c..11a0fcc2d4 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -51,8 +51,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
51 if (IS_RDONLY(inode)) 51 if (IS_RDONLY(inode))
52 return -EROFS; 52 return -EROFS;
53 53
54 if ((current->fsuid != inode->i_uid) 54 if (!is_owner_or_cap(inode))
55 && !capable(CAP_FOWNER))
56 return -EPERM; 55 return -EPERM;
57 56
58 if (get_user(flags, (int __user *)arg)) 57 if (get_user(flags, (int __user *)arg))
@@ -81,7 +80,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
81 case REISERFS_IOC_GETVERSION: 80 case REISERFS_IOC_GETVERSION:
82 return put_user(inode->i_generation, (int __user *)arg); 81 return put_user(inode->i_generation, (int __user *)arg);
83 case REISERFS_IOC_SETVERSION: 82 case REISERFS_IOC_SETVERSION:
84 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 83 if (!is_owner_or_cap(inode))
85 return -EPERM; 84 return -EPERM;
86 if (IS_RDONLY(inode)) 85 if (IS_RDONLY(inode))
87 return -EROFS; 86 return -EROFS;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b4ac911920..5a93cfe1a0 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/exportfs.h>
24#include <linux/vfs.h> 25#include <linux/vfs.h>
25#include <linux/mnt_namespace.h> 26#include <linux/mnt_namespace.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 5296a29cc5..b7e4fa4539 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -21,7 +21,7 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
21 21
22 if (!reiserfs_posixacl(inode->i_sb)) 22 if (!reiserfs_posixacl(inode->i_sb))
23 return -EOPNOTSUPP; 23 return -EOPNOTSUPP;
24 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) 24 if (!is_owner_or_cap(inode))
25 return -EPERM; 25 return -EPERM;
26 26
27 if (value) { 27 if (value) {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 0ac22af7af..bbb19be260 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -177,21 +177,23 @@ EXPORT_SYMBOL(seq_read);
177 177
178static int traverse(struct seq_file *m, loff_t offset) 178static int traverse(struct seq_file *m, loff_t offset)
179{ 179{
180 loff_t pos = 0; 180 loff_t pos = 0, index;
181 int error = 0; 181 int error = 0;
182 void *p; 182 void *p;
183 183
184 m->version = 0; 184 m->version = 0;
185 m->index = 0; 185 index = 0;
186 m->count = m->from = 0; 186 m->count = m->from = 0;
187 if (!offset) 187 if (!offset) {
188 m->index = index;
188 return 0; 189 return 0;
190 }
189 if (!m->buf) { 191 if (!m->buf) {
190 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 192 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
191 if (!m->buf) 193 if (!m->buf)
192 return -ENOMEM; 194 return -ENOMEM;
193 } 195 }
194 p = m->op->start(m, &m->index); 196 p = m->op->start(m, &index);
195 while (p) { 197 while (p) {
196 error = PTR_ERR(p); 198 error = PTR_ERR(p);
197 if (IS_ERR(p)) 199 if (IS_ERR(p))
@@ -204,15 +206,17 @@ static int traverse(struct seq_file *m, loff_t offset)
204 if (pos + m->count > offset) { 206 if (pos + m->count > offset) {
205 m->from = offset - pos; 207 m->from = offset - pos;
206 m->count -= m->from; 208 m->count -= m->from;
209 m->index = index;
207 break; 210 break;
208 } 211 }
209 pos += m->count; 212 pos += m->count;
210 m->count = 0; 213 m->count = 0;
211 if (pos == offset) { 214 if (pos == offset) {
212 m->index++; 215 index++;
216 m->index = index;
213 break; 217 break;
214 } 218 }
215 p = m->op->next(m, p, &m->index); 219 p = m->op->next(m, p, &index);
216 } 220 }
217 m->op->stop(m, p); 221 m->op->stop(m, p);
218 return error; 222 return error;
@@ -260,8 +264,8 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
260 } 264 }
261 } 265 }
262 } 266 }
263 mutex_unlock(&m->lock);
264 file->f_version = m->version; 267 file->f_version = m->version;
268 mutex_unlock(&m->lock);
265 return retval; 269 return retval;
266} 270}
267EXPORT_SYMBOL(seq_lseek); 271EXPORT_SYMBOL(seq_lseek);
@@ -447,3 +451,37 @@ int seq_puts(struct seq_file *m, const char *s)
447 return -1; 451 return -1;
448} 452}
449EXPORT_SYMBOL(seq_puts); 453EXPORT_SYMBOL(seq_puts);
454
455struct list_head *seq_list_start(struct list_head *head, loff_t pos)
456{
457 struct list_head *lh;
458
459 list_for_each(lh, head)
460 if (pos-- == 0)
461 return lh;
462
463 return NULL;
464}
465
466EXPORT_SYMBOL(seq_list_start);
467
468struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
469{
470 if (!pos)
471 return head;
472
473 return seq_list_start(head, pos - 1);
474}
475
476EXPORT_SYMBOL(seq_list_start_head);
477
478struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
479{
480 struct list_head *lh;
481
482 lh = ((struct list_head *)v)->next;
483 ++*ppos;
484 return lh == head ? NULL : lh;
485}
486
487EXPORT_SYMBOL(seq_list_next);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index f1da89203a..3b07f26d98 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -133,7 +133,8 @@ static unsigned int signalfd_poll(struct file *file, poll_table *wait)
133 * the peer disconnects. 133 * the peer disconnects.
134 */ 134 */
135 if (signalfd_lock(ctx, &lk)) { 135 if (signalfd_lock(ctx, &lk)) {
136 if (next_signal(&lk.tsk->pending, &ctx->sigmask) > 0 || 136 if ((lk.tsk == current &&
137 next_signal(&lk.tsk->pending, &ctx->sigmask) > 0) ||
137 next_signal(&lk.tsk->signal->shared_pending, 138 next_signal(&lk.tsk->signal->shared_pending,
138 &ctx->sigmask) > 0) 139 &ctx->sigmask) > 0)
139 events |= POLLIN; 140 events |= POLLIN;
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index aea3f8aa54..c5d78a7e49 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -262,8 +262,9 @@ out:
262} 262}
263 263
264static ssize_t 264static ssize_t
265smb_file_sendfile(struct file *file, loff_t *ppos, 265smb_file_splice_read(struct file *file, loff_t *ppos,
266 size_t count, read_actor_t actor, void *target) 266 struct pipe_inode_info *pipe, size_t count,
267 unsigned int flags)
267{ 268{
268 struct dentry *dentry = file->f_path.dentry; 269 struct dentry *dentry = file->f_path.dentry;
269 ssize_t status; 270 ssize_t status;
@@ -277,7 +278,7 @@ smb_file_sendfile(struct file *file, loff_t *ppos,
277 DENTRY_PATH(dentry), status); 278 DENTRY_PATH(dentry), status);
278 goto out; 279 goto out;
279 } 280 }
280 status = generic_file_sendfile(file, ppos, count, actor, target); 281 status = generic_file_splice_read(file, ppos, pipe, count, flags);
281out: 282out:
282 return status; 283 return status;
283} 284}
@@ -416,7 +417,7 @@ const struct file_operations smb_file_operations =
416 .open = smb_file_open, 417 .open = smb_file_open,
417 .release = smb_file_release, 418 .release = smb_file_release,
418 .fsync = smb_fsync, 419 .fsync = smb_fsync,
419 .sendfile = smb_file_sendfile, 420 .splice_read = smb_file_splice_read,
420}; 421};
421 422
422const struct inode_operations smb_file_inode_operations = 423const struct inode_operations smb_file_inode_operations =
diff --git a/fs/splice.c b/fs/splice.c
index e7d7080de2..53fc2082a4 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -20,7 +20,7 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/pipe_fs_i.h> 23#include <linux/splice.h>
24#include <linux/mm_inline.h> 24#include <linux/mm_inline.h>
25#include <linux/swap.h> 25#include <linux/swap.h>
26#include <linux/writeback.h> 26#include <linux/writeback.h>
@@ -28,22 +28,7 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/uio.h> 30#include <linux/uio.h>
31 31#include <linux/security.h>
32struct partial_page {
33 unsigned int offset;
34 unsigned int len;
35};
36
37/*
38 * Passed to splice_to_pipe
39 */
40struct splice_pipe_desc {
41 struct page **pages; /* page map */
42 struct partial_page *partial; /* pages[] may not be contig */
43 int nr_pages; /* number of pages in map */
44 unsigned int flags; /* splice flags */
45 const struct pipe_buf_operations *ops;/* ops associated with output pipe */
46};
47 32
48/* 33/*
49 * Attempt to steal a page from a pipe buffer. This should perhaps go into 34 * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -101,8 +86,12 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
101 buf->flags &= ~PIPE_BUF_FLAG_LRU; 86 buf->flags &= ~PIPE_BUF_FLAG_LRU;
102} 87}
103 88
104static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, 89/*
105 struct pipe_buffer *buf) 90 * Check whether the contents of buf is OK to access. Since the content
91 * is a page cache page, IO may be in flight.
92 */
93static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
94 struct pipe_buffer *buf)
106{ 95{
107 struct page *page = buf->page; 96 struct page *page = buf->page;
108 int err; 97 int err;
@@ -143,7 +132,7 @@ static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
143 .can_merge = 0, 132 .can_merge = 0,
144 .map = generic_pipe_buf_map, 133 .map = generic_pipe_buf_map,
145 .unmap = generic_pipe_buf_unmap, 134 .unmap = generic_pipe_buf_unmap,
146 .pin = page_cache_pipe_buf_pin, 135 .confirm = page_cache_pipe_buf_confirm,
147 .release = page_cache_pipe_buf_release, 136 .release = page_cache_pipe_buf_release,
148 .steal = page_cache_pipe_buf_steal, 137 .steal = page_cache_pipe_buf_steal,
149 .get = generic_pipe_buf_get, 138 .get = generic_pipe_buf_get,
@@ -163,18 +152,25 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
163 .can_merge = 0, 152 .can_merge = 0,
164 .map = generic_pipe_buf_map, 153 .map = generic_pipe_buf_map,
165 .unmap = generic_pipe_buf_unmap, 154 .unmap = generic_pipe_buf_unmap,
166 .pin = generic_pipe_buf_pin, 155 .confirm = generic_pipe_buf_confirm,
167 .release = page_cache_pipe_buf_release, 156 .release = page_cache_pipe_buf_release,
168 .steal = user_page_pipe_buf_steal, 157 .steal = user_page_pipe_buf_steal,
169 .get = generic_pipe_buf_get, 158 .get = generic_pipe_buf_get,
170}; 159};
171 160
172/* 161/**
173 * Pipe output worker. This sets up our pipe format with the page cache 162 * splice_to_pipe - fill passed data into a pipe
174 * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 163 * @pipe: pipe to fill
164 * @spd: data to fill
165 *
166 * Description:
167 * @spd contains a map of pages and len/offset tupples, a long with
168 * the struct pipe_buf_operations associated with these pages. This
169 * function will link that data to the pipe.
170 *
175 */ 171 */
176static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 172ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
177 struct splice_pipe_desc *spd) 173 struct splice_pipe_desc *spd)
178{ 174{
179 unsigned int spd_pages = spd->nr_pages; 175 unsigned int spd_pages = spd->nr_pages;
180 int ret, do_wakeup, page_nr; 176 int ret, do_wakeup, page_nr;
@@ -201,6 +197,7 @@ static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
201 buf->page = spd->pages[page_nr]; 197 buf->page = spd->pages[page_nr];
202 buf->offset = spd->partial[page_nr].offset; 198 buf->offset = spd->partial[page_nr].offset;
203 buf->len = spd->partial[page_nr].len; 199 buf->len = spd->partial[page_nr].len;
200 buf->private = spd->partial[page_nr].private;
204 buf->ops = spd->ops; 201 buf->ops = spd->ops;
205 if (spd->flags & SPLICE_F_GIFT) 202 if (spd->flags & SPLICE_F_GIFT)
206 buf->flags |= PIPE_BUF_FLAG_GIFT; 203 buf->flags |= PIPE_BUF_FLAG_GIFT;
@@ -296,19 +293,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
296 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); 293 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
297 294
298 /* 295 /*
299 * Now fill in the holes:
300 */
301 error = 0;
302
303 /*
304 * Lookup the (hopefully) full range of pages we need. 296 * Lookup the (hopefully) full range of pages we need.
305 */ 297 */
306 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); 298 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
307 299
308 /* 300 /*
309 * If find_get_pages_contig() returned fewer pages than we needed, 301 * If find_get_pages_contig() returned fewer pages than we needed,
310 * allocate the rest. 302 * allocate the rest and fill in the holes.
311 */ 303 */
304 error = 0;
312 index += spd.nr_pages; 305 index += spd.nr_pages;
313 while (spd.nr_pages < nr_pages) { 306 while (spd.nr_pages < nr_pages) {
314 /* 307 /*
@@ -470,11 +463,16 @@ fill_it:
470/** 463/**
471 * generic_file_splice_read - splice data from file to a pipe 464 * generic_file_splice_read - splice data from file to a pipe
472 * @in: file to splice from 465 * @in: file to splice from
466 * @ppos: position in @in
473 * @pipe: pipe to splice to 467 * @pipe: pipe to splice to
474 * @len: number of bytes to splice 468 * @len: number of bytes to splice
475 * @flags: splice modifier flags 469 * @flags: splice modifier flags
476 * 470 *
477 * Will read pages from given file and fill them into a pipe. 471 * Description:
472 * Will read pages from given file and fill them into a pipe. Can be
473 * used as long as the address_space operations for the source implements
474 * a readpage() hook.
475 *
478 */ 476 */
479ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, 477ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
480 struct pipe_inode_info *pipe, size_t len, 478 struct pipe_inode_info *pipe, size_t len,
@@ -494,7 +492,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
494 492
495 ret = 0; 493 ret = 0;
496 spliced = 0; 494 spliced = 0;
497 while (len) { 495 while (len && !spliced) {
498 ret = __generic_file_splice_read(in, ppos, pipe, len, flags); 496 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
499 497
500 if (ret < 0) 498 if (ret < 0)
@@ -528,11 +526,11 @@ EXPORT_SYMBOL(generic_file_splice_read);
528static int pipe_to_sendpage(struct pipe_inode_info *pipe, 526static int pipe_to_sendpage(struct pipe_inode_info *pipe,
529 struct pipe_buffer *buf, struct splice_desc *sd) 527 struct pipe_buffer *buf, struct splice_desc *sd)
530{ 528{
531 struct file *file = sd->file; 529 struct file *file = sd->u.file;
532 loff_t pos = sd->pos; 530 loff_t pos = sd->pos;
533 int ret, more; 531 int ret, more;
534 532
535 ret = buf->ops->pin(pipe, buf); 533 ret = buf->ops->confirm(pipe, buf);
536 if (!ret) { 534 if (!ret) {
537 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 535 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
538 536
@@ -566,7 +564,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
566static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 564static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
567 struct splice_desc *sd) 565 struct splice_desc *sd)
568{ 566{
569 struct file *file = sd->file; 567 struct file *file = sd->u.file;
570 struct address_space *mapping = file->f_mapping; 568 struct address_space *mapping = file->f_mapping;
571 unsigned int offset, this_len; 569 unsigned int offset, this_len;
572 struct page *page; 570 struct page *page;
@@ -576,7 +574,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
576 /* 574 /*
577 * make sure the data in this buffer is uptodate 575 * make sure the data in this buffer is uptodate
578 */ 576 */
579 ret = buf->ops->pin(pipe, buf); 577 ret = buf->ops->confirm(pipe, buf);
580 if (unlikely(ret)) 578 if (unlikely(ret))
581 return ret; 579 return ret;
582 580
@@ -663,36 +661,37 @@ out_ret:
663 return ret; 661 return ret;
664} 662}
665 663
666/* 664/**
667 * Pipe input worker. Most of this logic works like a regular pipe, the 665 * __splice_from_pipe - splice data from a pipe to given actor
668 * key here is the 'actor' worker passed in that actually moves the data 666 * @pipe: pipe to splice from
669 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 667 * @sd: information to @actor
668 * @actor: handler that splices the data
669 *
670 * Description:
671 * This function does little more than loop over the pipe and call
672 * @actor to do the actual moving of a single struct pipe_buffer to
673 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
674 * pipe_to_user.
675 *
670 */ 676 */
671ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, 677ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
672 struct file *out, loff_t *ppos, size_t len, 678 splice_actor *actor)
673 unsigned int flags, splice_actor *actor)
674{ 679{
675 int ret, do_wakeup, err; 680 int ret, do_wakeup, err;
676 struct splice_desc sd;
677 681
678 ret = 0; 682 ret = 0;
679 do_wakeup = 0; 683 do_wakeup = 0;
680 684
681 sd.total_len = len;
682 sd.flags = flags;
683 sd.file = out;
684 sd.pos = *ppos;
685
686 for (;;) { 685 for (;;) {
687 if (pipe->nrbufs) { 686 if (pipe->nrbufs) {
688 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; 687 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
689 const struct pipe_buf_operations *ops = buf->ops; 688 const struct pipe_buf_operations *ops = buf->ops;
690 689
691 sd.len = buf->len; 690 sd->len = buf->len;
692 if (sd.len > sd.total_len) 691 if (sd->len > sd->total_len)
693 sd.len = sd.total_len; 692 sd->len = sd->total_len;
694 693
695 err = actor(pipe, buf, &sd); 694 err = actor(pipe, buf, sd);
696 if (err <= 0) { 695 if (err <= 0) {
697 if (!ret && err != -ENODATA) 696 if (!ret && err != -ENODATA)
698 ret = err; 697 ret = err;
@@ -704,10 +703,10 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
704 buf->offset += err; 703 buf->offset += err;
705 buf->len -= err; 704 buf->len -= err;
706 705
707 sd.len -= err; 706 sd->len -= err;
708 sd.pos += err; 707 sd->pos += err;
709 sd.total_len -= err; 708 sd->total_len -= err;
710 if (sd.len) 709 if (sd->len)
711 continue; 710 continue;
712 711
713 if (!buf->len) { 712 if (!buf->len) {
@@ -719,7 +718,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
719 do_wakeup = 1; 718 do_wakeup = 1;
720 } 719 }
721 720
722 if (!sd.total_len) 721 if (!sd->total_len)
723 break; 722 break;
724 } 723 }
725 724
@@ -732,7 +731,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
732 break; 731 break;
733 } 732 }
734 733
735 if (flags & SPLICE_F_NONBLOCK) { 734 if (sd->flags & SPLICE_F_NONBLOCK) {
736 if (!ret) 735 if (!ret)
737 ret = -EAGAIN; 736 ret = -EAGAIN;
738 break; 737 break;
@@ -766,12 +765,32 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
766} 765}
767EXPORT_SYMBOL(__splice_from_pipe); 766EXPORT_SYMBOL(__splice_from_pipe);
768 767
768/**
769 * splice_from_pipe - splice data from a pipe to a file
770 * @pipe: pipe to splice from
771 * @out: file to splice to
772 * @ppos: position in @out
773 * @len: how many bytes to splice
774 * @flags: splice modifier flags
775 * @actor: handler that splices the data
776 *
777 * Description:
778 * See __splice_from_pipe. This function locks the input and output inodes,
779 * otherwise it's identical to __splice_from_pipe().
780 *
781 */
769ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 782ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
770 loff_t *ppos, size_t len, unsigned int flags, 783 loff_t *ppos, size_t len, unsigned int flags,
771 splice_actor *actor) 784 splice_actor *actor)
772{ 785{
773 ssize_t ret; 786 ssize_t ret;
774 struct inode *inode = out->f_mapping->host; 787 struct inode *inode = out->f_mapping->host;
788 struct splice_desc sd = {
789 .total_len = len,
790 .flags = flags,
791 .pos = *ppos,
792 .u.file = out,
793 };
775 794
776 /* 795 /*
777 * The actor worker might be calling ->prepare_write and 796 * The actor worker might be calling ->prepare_write and
@@ -780,7 +799,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
780 * pipe->inode, we have to order lock acquiry here. 799 * pipe->inode, we have to order lock acquiry here.
781 */ 800 */
782 inode_double_lock(inode, pipe->inode); 801 inode_double_lock(inode, pipe->inode);
783 ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor); 802 ret = __splice_from_pipe(pipe, &sd, actor);
784 inode_double_unlock(inode, pipe->inode); 803 inode_double_unlock(inode, pipe->inode);
785 804
786 return ret; 805 return ret;
@@ -790,12 +809,14 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
790 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes 809 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
791 * @pipe: pipe info 810 * @pipe: pipe info
792 * @out: file to write to 811 * @out: file to write to
812 * @ppos: position in @out
793 * @len: number of bytes to splice 813 * @len: number of bytes to splice
794 * @flags: splice modifier flags 814 * @flags: splice modifier flags
795 * 815 *
796 * Will either move or copy pages (determined by @flags options) from 816 * Description:
797 * the given pipe inode to the given file. The caller is responsible 817 * Will either move or copy pages (determined by @flags options) from
798 * for acquiring i_mutex on both inodes. 818 * the given pipe inode to the given file. The caller is responsible
819 * for acquiring i_mutex on both inodes.
799 * 820 *
800 */ 821 */
801ssize_t 822ssize_t
@@ -804,6 +825,12 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
804{ 825{
805 struct address_space *mapping = out->f_mapping; 826 struct address_space *mapping = out->f_mapping;
806 struct inode *inode = mapping->host; 827 struct inode *inode = mapping->host;
828 struct splice_desc sd = {
829 .total_len = len,
830 .flags = flags,
831 .pos = *ppos,
832 .u.file = out,
833 };
807 ssize_t ret; 834 ssize_t ret;
808 int err; 835 int err;
809 836
@@ -811,7 +838,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
811 if (unlikely(err)) 838 if (unlikely(err))
812 return err; 839 return err;
813 840
814 ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); 841 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
815 if (ret > 0) { 842 if (ret > 0) {
816 unsigned long nr_pages; 843 unsigned long nr_pages;
817 844
@@ -841,11 +868,13 @@ EXPORT_SYMBOL(generic_file_splice_write_nolock);
841 * generic_file_splice_write - splice data from a pipe to a file 868 * generic_file_splice_write - splice data from a pipe to a file
842 * @pipe: pipe info 869 * @pipe: pipe info
843 * @out: file to write to 870 * @out: file to write to
871 * @ppos: position in @out
844 * @len: number of bytes to splice 872 * @len: number of bytes to splice
845 * @flags: splice modifier flags 873 * @flags: splice modifier flags
846 * 874 *
847 * Will either move or copy pages (determined by @flags options) from 875 * Description:
848 * the given pipe inode to the given file. 876 * Will either move or copy pages (determined by @flags options) from
877 * the given pipe inode to the given file.
849 * 878 *
850 */ 879 */
851ssize_t 880ssize_t
@@ -896,13 +925,15 @@ EXPORT_SYMBOL(generic_file_splice_write);
896 925
897/** 926/**
898 * generic_splice_sendpage - splice data from a pipe to a socket 927 * generic_splice_sendpage - splice data from a pipe to a socket
899 * @inode: pipe inode 928 * @pipe: pipe to splice from
900 * @out: socket to write to 929 * @out: socket to write to
930 * @ppos: position in @out
901 * @len: number of bytes to splice 931 * @len: number of bytes to splice
902 * @flags: splice modifier flags 932 * @flags: splice modifier flags
903 * 933 *
904 * Will send @len bytes from the pipe to a network socket. No data copying 934 * Description:
905 * is involved. 935 * Will send @len bytes from the pipe to a network socket. No data copying
936 * is involved.
906 * 937 *
907 */ 938 */
908ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 939ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
@@ -931,6 +962,10 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
931 if (unlikely(ret < 0)) 962 if (unlikely(ret < 0))
932 return ret; 963 return ret;
933 964
965 ret = security_file_permission(out, MAY_WRITE);
966 if (unlikely(ret < 0))
967 return ret;
968
934 return out->f_op->splice_write(pipe, out, ppos, len, flags); 969 return out->f_op->splice_write(pipe, out, ppos, len, flags);
935} 970}
936 971
@@ -953,17 +988,34 @@ static long do_splice_to(struct file *in, loff_t *ppos,
953 if (unlikely(ret < 0)) 988 if (unlikely(ret < 0))
954 return ret; 989 return ret;
955 990
991 ret = security_file_permission(in, MAY_READ);
992 if (unlikely(ret < 0))
993 return ret;
994
956 return in->f_op->splice_read(in, ppos, pipe, len, flags); 995 return in->f_op->splice_read(in, ppos, pipe, len, flags);
957} 996}
958 997
959long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 998/**
960 size_t len, unsigned int flags) 999 * splice_direct_to_actor - splices data directly between two non-pipes
1000 * @in: file to splice from
1001 * @sd: actor information on where to splice to
1002 * @actor: handles the data splicing
1003 *
1004 * Description:
1005 * This is a special case helper to splice directly between two
1006 * points, without requiring an explicit pipe. Internally an allocated
1007 * pipe is cached in the process, and reused during the life time of
1008 * that process.
1009 *
1010 */
1011ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1012 splice_direct_actor *actor)
961{ 1013{
962 struct pipe_inode_info *pipe; 1014 struct pipe_inode_info *pipe;
963 long ret, bytes; 1015 long ret, bytes;
964 loff_t out_off;
965 umode_t i_mode; 1016 umode_t i_mode;
966 int i; 1017 size_t len;
1018 int i, flags;
967 1019
968 /* 1020 /*
969 * We require the input being a regular file, as we don't want to 1021 * We require the input being a regular file, as we don't want to
@@ -999,49 +1051,43 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
999 */ 1051 */
1000 ret = 0; 1052 ret = 0;
1001 bytes = 0; 1053 bytes = 0;
1002 out_off = 0; 1054 len = sd->total_len;
1055 flags = sd->flags;
1003 1056
1004 while (len) { 1057 /*
1005 size_t read_len, max_read_len; 1058 * Don't block on output, we have to drain the direct pipe.
1059 */
1060 sd->flags &= ~SPLICE_F_NONBLOCK;
1006 1061
1007 /* 1062 while (len) {
1008 * Do at most PIPE_BUFFERS pages worth of transfer: 1063 size_t read_len;
1009 */ 1064 loff_t pos = sd->pos;
1010 max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
1011 1065
1012 ret = do_splice_to(in, ppos, pipe, max_read_len, flags); 1066 ret = do_splice_to(in, &pos, pipe, len, flags);
1013 if (unlikely(ret < 0)) 1067 if (unlikely(ret <= 0))
1014 goto out_release; 1068 goto out_release;
1015 1069
1016 read_len = ret; 1070 read_len = ret;
1071 sd->total_len = read_len;
1017 1072
1018 /* 1073 /*
1019 * NOTE: nonblocking mode only applies to the input. We 1074 * NOTE: nonblocking mode only applies to the input. We
1020 * must not do the output in nonblocking mode as then we 1075 * must not do the output in nonblocking mode as then we
1021 * could get stuck data in the internal pipe: 1076 * could get stuck data in the internal pipe:
1022 */ 1077 */
1023 ret = do_splice_from(pipe, out, &out_off, read_len, 1078 ret = actor(pipe, sd);
1024 flags & ~SPLICE_F_NONBLOCK); 1079 if (unlikely(ret <= 0))
1025 if (unlikely(ret < 0))
1026 goto out_release; 1080 goto out_release;
1027 1081
1028 bytes += ret; 1082 bytes += ret;
1029 len -= ret; 1083 len -= ret;
1084 sd->pos = pos;
1030 1085
1031 /* 1086 if (ret < read_len)
1032 * In nonblocking mode, if we got back a short read then 1087 goto out_release;
1033 * that was due to either an IO error or due to the
1034 * pagecache entry not being there. In the IO error case
1035 * the _next_ splice attempt will produce a clean IO error
1036 * return value (not a short read), so in both cases it's
1037 * correct to break out of the loop here:
1038 */
1039 if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
1040 break;
1041 } 1088 }
1042 1089
1043 pipe->nrbufs = pipe->curbuf = 0; 1090 pipe->nrbufs = pipe->curbuf = 0;
1044
1045 return bytes; 1091 return bytes;
1046 1092
1047out_release: 1093out_release:
@@ -1066,6 +1112,50 @@ out_release:
1066 return bytes; 1112 return bytes;
1067 1113
1068 return ret; 1114 return ret;
1115
1116}
1117EXPORT_SYMBOL(splice_direct_to_actor);
1118
1119static int direct_splice_actor(struct pipe_inode_info *pipe,
1120 struct splice_desc *sd)
1121{
1122 struct file *file = sd->u.file;
1123
1124 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1125}
1126
1127/**
1128 * do_splice_direct - splices data directly between two files
1129 * @in: file to splice from
1130 * @ppos: input file offset
1131 * @out: file to splice to
1132 * @len: number of bytes to splice
1133 * @flags: splice modifier flags
1134 *
1135 * Description:
1136 * For use by do_sendfile(). splice can easily emulate sendfile, but
1137 * doing it in the application would incur an extra system call
1138 * (splice in + splice out, as compared to just sendfile()). So this helper
1139 * can splice directly through a process-private pipe.
1140 *
1141 */
1142long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1143 size_t len, unsigned int flags)
1144{
1145 struct splice_desc sd = {
1146 .len = len,
1147 .total_len = len,
1148 .flags = flags,
1149 .pos = *ppos,
1150 .u.file = out,
1151 };
1152 long ret;
1153
1154 ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1155 if (ret > 0)
1156 *ppos += ret;
1157
1158 return ret;
1069} 1159}
1070 1160
1071/* 1161/*
@@ -1248,28 +1338,131 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1248 return error; 1338 return error;
1249} 1339}
1250 1340
1341static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1342 struct splice_desc *sd)
1343{
1344 char *src;
1345 int ret;
1346
1347 ret = buf->ops->confirm(pipe, buf);
1348 if (unlikely(ret))
1349 return ret;
1350
1351 /*
1352 * See if we can use the atomic maps, by prefaulting in the
1353 * pages and doing an atomic copy
1354 */
1355 if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1356 src = buf->ops->map(pipe, buf, 1);
1357 ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1358 sd->len);
1359 buf->ops->unmap(pipe, buf, src);
1360 if (!ret) {
1361 ret = sd->len;
1362 goto out;
1363 }
1364 }
1365
1366 /*
1367 * No dice, use slow non-atomic map and copy
1368 */
1369 src = buf->ops->map(pipe, buf, 0);
1370
1371 ret = sd->len;
1372 if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1373 ret = -EFAULT;
1374
1375out:
1376 if (ret > 0)
1377 sd->u.userptr += ret;
1378 buf->ops->unmap(pipe, buf, src);
1379 return ret;
1380}
1381
1382/*
1383 * For lack of a better implementation, implement vmsplice() to userspace
1384 * as a simple copy of the pipes pages to the user iov.
1385 */
1386static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1387 unsigned long nr_segs, unsigned int flags)
1388{
1389 struct pipe_inode_info *pipe;
1390 struct splice_desc sd;
1391 ssize_t size;
1392 int error;
1393 long ret;
1394
1395 pipe = pipe_info(file->f_path.dentry->d_inode);
1396 if (!pipe)
1397 return -EBADF;
1398
1399 if (pipe->inode)
1400 mutex_lock(&pipe->inode->i_mutex);
1401
1402 error = ret = 0;
1403 while (nr_segs) {
1404 void __user *base;
1405 size_t len;
1406
1407 /*
1408 * Get user address base and length for this iovec.
1409 */
1410 error = get_user(base, &iov->iov_base);
1411 if (unlikely(error))
1412 break;
1413 error = get_user(len, &iov->iov_len);
1414 if (unlikely(error))
1415 break;
1416
1417 /*
1418 * Sanity check this iovec. 0 read succeeds.
1419 */
1420 if (unlikely(!len))
1421 break;
1422 if (unlikely(!base)) {
1423 error = -EFAULT;
1424 break;
1425 }
1426
1427 sd.len = 0;
1428 sd.total_len = len;
1429 sd.flags = flags;
1430 sd.u.userptr = base;
1431 sd.pos = 0;
1432
1433 size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1434 if (size < 0) {
1435 if (!ret)
1436 ret = size;
1437
1438 break;
1439 }
1440
1441 ret += size;
1442
1443 if (size < len)
1444 break;
1445
1446 nr_segs--;
1447 iov++;
1448 }
1449
1450 if (pipe->inode)
1451 mutex_unlock(&pipe->inode->i_mutex);
1452
1453 if (!ret)
1454 ret = error;
1455
1456 return ret;
1457}
1458
1251/* 1459/*
1252 * vmsplice splices a user address range into a pipe. It can be thought of 1460 * vmsplice splices a user address range into a pipe. It can be thought of
1253 * as splice-from-memory, where the regular splice is splice-from-file (or 1461 * as splice-from-memory, where the regular splice is splice-from-file (or
1254 * to file). In both cases the output is a pipe, naturally. 1462 * to file). In both cases the output is a pipe, naturally.
1255 *
1256 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
1257 * not the other way around. Splicing from user memory is a simple operation
1258 * that can be supported without any funky alignment restrictions or nasty
1259 * vm tricks. We simply map in the user memory and fill them into a pipe.
1260 * The reverse isn't quite as easy, though. There are two possible solutions
1261 * for that:
1262 *
1263 * - memcpy() the data internally, at which point we might as well just
1264 * do a regular read() on the buffer anyway.
1265 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1266 * has restriction limitations on both ends of the pipe).
1267 *
1268 * Alas, it isn't here.
1269 *
1270 */ 1463 */
1271static long do_vmsplice(struct file *file, const struct iovec __user *iov, 1464static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1272 unsigned long nr_segs, unsigned int flags) 1465 unsigned long nr_segs, unsigned int flags)
1273{ 1466{
1274 struct pipe_inode_info *pipe; 1467 struct pipe_inode_info *pipe;
1275 struct page *pages[PIPE_BUFFERS]; 1468 struct page *pages[PIPE_BUFFERS];
@@ -1284,10 +1477,6 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov,
1284 pipe = pipe_info(file->f_path.dentry->d_inode); 1477 pipe = pipe_info(file->f_path.dentry->d_inode);
1285 if (!pipe) 1478 if (!pipe)
1286 return -EBADF; 1479 return -EBADF;
1287 if (unlikely(nr_segs > UIO_MAXIOV))
1288 return -EINVAL;
1289 else if (unlikely(!nr_segs))
1290 return 0;
1291 1480
1292 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, 1481 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1293 flags & SPLICE_F_GIFT); 1482 flags & SPLICE_F_GIFT);
@@ -1297,6 +1486,22 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov,
1297 return splice_to_pipe(pipe, &spd); 1486 return splice_to_pipe(pipe, &spd);
1298} 1487}
1299 1488
1489/*
1490 * Note that vmsplice only really supports true splicing _from_ user memory
1491 * to a pipe, not the other way around. Splicing from user memory is a simple
1492 * operation that can be supported without any funky alignment restrictions
1493 * or nasty vm tricks. We simply map in the user memory and fill them into
1494 * a pipe. The reverse isn't quite as easy, though. There are two possible
1495 * solutions for that:
1496 *
1497 * - memcpy() the data internally, at which point we might as well just
1498 * do a regular read() on the buffer anyway.
1499 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1500 * has restriction limitations on both ends of the pipe).
1501 *
1502 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1503 *
1504 */
1300asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, 1505asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1301 unsigned long nr_segs, unsigned int flags) 1506 unsigned long nr_segs, unsigned int flags)
1302{ 1507{
@@ -1304,11 +1509,18 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1304 long error; 1509 long error;
1305 int fput; 1510 int fput;
1306 1511
1512 if (unlikely(nr_segs > UIO_MAXIOV))
1513 return -EINVAL;
1514 else if (unlikely(!nr_segs))
1515 return 0;
1516
1307 error = -EBADF; 1517 error = -EBADF;
1308 file = fget_light(fd, &fput); 1518 file = fget_light(fd, &fput);
1309 if (file) { 1519 if (file) {
1310 if (file->f_mode & FMODE_WRITE) 1520 if (file->f_mode & FMODE_WRITE)
1311 error = do_vmsplice(file, iov, nr_segs, flags); 1521 error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1522 else if (file->f_mode & FMODE_READ)
1523 error = vmsplice_to_user(file, iov, nr_segs, flags);
1312 1524
1313 fput_light(file, fput); 1525 fput_light(file, fput);
1314 } 1526 }
diff --git a/fs/super.c b/fs/super.c
index 5260d620c5..fc8ebedc6b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -884,6 +884,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
884 error = type->get_sb(type, flags, name, data, mnt); 884 error = type->get_sb(type, flags, name, data, mnt);
885 if (error < 0) 885 if (error < 0)
886 goto out_free_secdata; 886 goto out_free_secdata;
887 BUG_ON(!mnt->mnt_sb);
887 888
888 error = security_sb_kern_mount(mnt->mnt_sb, secdata); 889 error = security_sb_kern_mount(mnt->mnt_sb, secdata);
889 if (error) 890 if (error)
diff --git a/fs/sync.c b/fs/sync.c
index 2f97576355..7cd005ea76 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -236,6 +236,14 @@ out:
236 return ret; 236 return ret;
237} 237}
238 238
239/* It would be nice if people remember that not all the world's an i386
240 when they introduce new system calls */
241asmlinkage long sys_sync_file_range2(int fd, unsigned int flags,
242 loff_t offset, loff_t nbytes)
243{
244 return sys_sync_file_range(fd, offset, nbytes, flags);
245}
246
239/* 247/*
240 * `endbyte' is inclusive 248 * `endbyte' is inclusive
241 */ 249 */
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index d3b9f5f07d..135353f8a2 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -20,29 +20,41 @@
20 20
21#include "sysfs.h" 21#include "sysfs.h"
22 22
23struct bin_buffer {
24 struct mutex mutex;
25 void *buffer;
26 int mmapped;
27};
28
23static int 29static int
24fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) 30fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
25{ 31{
26 struct bin_attribute * attr = to_bin_attr(dentry); 32 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
27 struct kobject * kobj = to_kobj(dentry->d_parent); 33 struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr;
34 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
35 int rc;
36
37 /* need attr_sd for attr, its parent for kobj */
38 if (!sysfs_get_active_two(attr_sd))
39 return -ENODEV;
28 40
29 if (!attr->read) 41 rc = -EIO;
30 return -EIO; 42 if (attr->read)
43 rc = attr->read(kobj, attr, buffer, off, count);
31 44
32 return attr->read(kobj, buffer, off, count); 45 sysfs_put_active_two(attr_sd);
46
47 return rc;
33} 48}
34 49
35static ssize_t 50static ssize_t
36read(struct file * file, char __user * userbuf, size_t count, loff_t * off) 51read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
37{ 52{
38 char *buffer = file->private_data; 53 struct bin_buffer *bb = file->private_data;
39 struct dentry *dentry = file->f_path.dentry; 54 struct dentry *dentry = file->f_path.dentry;
40 int size = dentry->d_inode->i_size; 55 int size = dentry->d_inode->i_size;
41 loff_t offs = *off; 56 loff_t offs = *off;
42 int ret; 57 int count = min_t(size_t, bytes, PAGE_SIZE);
43
44 if (count > PAGE_SIZE)
45 count = PAGE_SIZE;
46 58
47 if (size) { 59 if (size) {
48 if (offs > size) 60 if (offs > size)
@@ -51,43 +63,56 @@ read(struct file * file, char __user * userbuf, size_t count, loff_t * off)
51 count = size - offs; 63 count = size - offs;
52 } 64 }
53 65
54 ret = fill_read(dentry, buffer, offs, count); 66 mutex_lock(&bb->mutex);
55 if (ret < 0) 67
56 return ret; 68 count = fill_read(dentry, bb->buffer, offs, count);
57 count = ret; 69 if (count < 0)
70 goto out_unlock;
58 71
59 if (copy_to_user(userbuf, buffer, count)) 72 if (copy_to_user(userbuf, bb->buffer, count)) {
60 return -EFAULT; 73 count = -EFAULT;
74 goto out_unlock;
75 }
61 76
62 pr_debug("offs = %lld, *off = %lld, count = %zd\n", offs, *off, count); 77 pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
63 78
64 *off = offs + count; 79 *off = offs + count;
65 80
81 out_unlock:
82 mutex_unlock(&bb->mutex);
66 return count; 83 return count;
67} 84}
68 85
69static int 86static int
70flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) 87flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
71{ 88{
72 struct bin_attribute *attr = to_bin_attr(dentry); 89 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
73 struct kobject *kobj = to_kobj(dentry->d_parent); 90 struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr;
91 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
92 int rc;
93
94 /* need attr_sd for attr, its parent for kobj */
95 if (!sysfs_get_active_two(attr_sd))
96 return -ENODEV;
97
98 rc = -EIO;
99 if (attr->write)
100 rc = attr->write(kobj, attr, buffer, offset, count);
74 101
75 if (!attr->write) 102 sysfs_put_active_two(attr_sd);
76 return -EIO;
77 103
78 return attr->write(kobj, buffer, offset, count); 104 return rc;
79} 105}
80 106
81static ssize_t write(struct file * file, const char __user * userbuf, 107static ssize_t write(struct file *file, const char __user *userbuf,
82 size_t count, loff_t * off) 108 size_t bytes, loff_t *off)
83{ 109{
84 char *buffer = file->private_data; 110 struct bin_buffer *bb = file->private_data;
85 struct dentry *dentry = file->f_path.dentry; 111 struct dentry *dentry = file->f_path.dentry;
86 int size = dentry->d_inode->i_size; 112 int size = dentry->d_inode->i_size;
87 loff_t offs = *off; 113 loff_t offs = *off;
114 int count = min_t(size_t, bytes, PAGE_SIZE);
88 115
89 if (count > PAGE_SIZE)
90 count = PAGE_SIZE;
91 if (size) { 116 if (size) {
92 if (offs > size) 117 if (offs > size)
93 return 0; 118 return 0;
@@ -95,72 +120,100 @@ static ssize_t write(struct file * file, const char __user * userbuf,
95 count = size - offs; 120 count = size - offs;
96 } 121 }
97 122
98 if (copy_from_user(buffer, userbuf, count)) 123 mutex_lock(&bb->mutex);
99 return -EFAULT;
100 124
101 count = flush_write(dentry, buffer, offs, count); 125 if (copy_from_user(bb->buffer, userbuf, count)) {
126 count = -EFAULT;
127 goto out_unlock;
128 }
129
130 count = flush_write(dentry, bb->buffer, offs, count);
102 if (count > 0) 131 if (count > 0)
103 *off = offs + count; 132 *off = offs + count;
133
134 out_unlock:
135 mutex_unlock(&bb->mutex);
104 return count; 136 return count;
105} 137}
106 138
107static int mmap(struct file *file, struct vm_area_struct *vma) 139static int mmap(struct file *file, struct vm_area_struct *vma)
108{ 140{
109 struct dentry *dentry = file->f_path.dentry; 141 struct bin_buffer *bb = file->private_data;
110 struct bin_attribute *attr = to_bin_attr(dentry); 142 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
111 struct kobject *kobj = to_kobj(dentry->d_parent); 143 struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr;
144 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
145 int rc;
146
147 mutex_lock(&bb->mutex);
148
149 /* need attr_sd for attr, its parent for kobj */
150 if (!sysfs_get_active_two(attr_sd))
151 return -ENODEV;
112 152
113 if (!attr->mmap) 153 rc = -EINVAL;
114 return -EINVAL; 154 if (attr->mmap)
155 rc = attr->mmap(kobj, attr, vma);
115 156
116 return attr->mmap(kobj, attr, vma); 157 if (rc == 0 && !bb->mmapped)
158 bb->mmapped = 1;
159 else
160 sysfs_put_active_two(attr_sd);
161
162 mutex_unlock(&bb->mutex);
163
164 return rc;
117} 165}
118 166
119static int open(struct inode * inode, struct file * file) 167static int open(struct inode * inode, struct file * file)
120{ 168{
121 struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); 169 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
122 struct bin_attribute * attr = to_bin_attr(file->f_path.dentry); 170 struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr;
123 int error = -EINVAL; 171 struct bin_buffer *bb = NULL;
124 172 int error;
125 if (!kobj || !attr)
126 goto Done;
127 173
128 /* Grab the module reference for this attribute if we have one */ 174 /* need attr_sd for attr */
129 error = -ENODEV; 175 if (!sysfs_get_active(attr_sd))
130 if (!try_module_get(attr->attr.owner)) 176 return -ENODEV;
131 goto Done;
132 177
133 error = -EACCES; 178 error = -EACCES;
134 if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap)) 179 if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap))
135 goto Error; 180 goto err_out;
136 if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap)) 181 if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap))
137 goto Error; 182 goto err_out;
138 183
139 error = -ENOMEM; 184 error = -ENOMEM;
140 file->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); 185 bb = kzalloc(sizeof(*bb), GFP_KERNEL);
141 if (!file->private_data) 186 if (!bb)
142 goto Error; 187 goto err_out;
143 188
144 error = 0; 189 bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
145 goto Done; 190 if (!bb->buffer)
146 191 goto err_out;
147 Error: 192
148 module_put(attr->attr.owner); 193 mutex_init(&bb->mutex);
149 Done: 194 file->private_data = bb;
150 if (error) 195
151 kobject_put(kobj); 196 /* open succeeded, put active reference and pin attr_sd */
197 sysfs_put_active(attr_sd);
198 sysfs_get(attr_sd);
199 return 0;
200
201 err_out:
202 sysfs_put_active(attr_sd);
203 kfree(bb);
152 return error; 204 return error;
153} 205}
154 206
155static int release(struct inode * inode, struct file * file) 207static int release(struct inode * inode, struct file * file)
156{ 208{
157 struct kobject * kobj = to_kobj(file->f_path.dentry->d_parent); 209 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
158 struct bin_attribute * attr = to_bin_attr(file->f_path.dentry); 210 struct bin_buffer *bb = file->private_data;
159 u8 * buffer = file->private_data; 211
160 212 if (bb->mmapped)
161 kobject_put(kobj); 213 sysfs_put_active_two(attr_sd);
162 module_put(attr->attr.owner); 214 sysfs_put(attr_sd);
163 kfree(buffer); 215 kfree(bb->buffer);
216 kfree(bb);
164 return 0; 217 return 0;
165} 218}
166 219
@@ -181,9 +234,9 @@ const struct file_operations bin_fops = {
181 234
182int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) 235int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
183{ 236{
184 BUG_ON(!kobj || !kobj->dentry || !attr); 237 BUG_ON(!kobj || !kobj->sd || !attr);
185 238
186 return sysfs_add_file(kobj->dentry, &attr->attr, SYSFS_KOBJ_BIN_ATTR); 239 return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
187} 240}
188 241
189 242
@@ -195,7 +248,7 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
195 248
196void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) 249void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
197{ 250{
198 if (sysfs_hash_and_remove(kobj->dentry, attr->attr.name) < 0) { 251 if (sysfs_hash_and_remove(kobj->sd, attr->attr.name) < 0) {
199 printk(KERN_ERR "%s: " 252 printk(KERN_ERR "%s: "
200 "bad dentry or inode or no such file: \"%s\"\n", 253 "bad dentry or inode or no such file: \"%s\"\n",
201 __FUNCTION__, attr->attr.name); 254 __FUNCTION__, attr->attr.name);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index c4342a0199..aee966c44a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -9,21 +9,337 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/namei.h> 11#include <linux/namei.h>
12#include <linux/idr.h>
13#include <linux/completion.h>
12#include <asm/semaphore.h> 14#include <asm/semaphore.h>
13#include "sysfs.h" 15#include "sysfs.h"
14 16
15DECLARE_RWSEM(sysfs_rename_sem); 17DEFINE_MUTEX(sysfs_mutex);
16spinlock_t sysfs_lock = SPIN_LOCK_UNLOCKED; 18spinlock_t sysfs_assoc_lock = SPIN_LOCK_UNLOCKED;
19
20static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED;
21static DEFINE_IDA(sysfs_ino_ida);
22
23/**
24 * sysfs_link_sibling - link sysfs_dirent into sibling list
25 * @sd: sysfs_dirent of interest
26 *
27 * Link @sd into its sibling list which starts from
28 * sd->s_parent->s_children.
29 *
30 * Locking:
31 * mutex_lock(sysfs_mutex)
32 */
33void sysfs_link_sibling(struct sysfs_dirent *sd)
34{
35 struct sysfs_dirent *parent_sd = sd->s_parent;
36
37 BUG_ON(sd->s_sibling);
38 sd->s_sibling = parent_sd->s_children;
39 parent_sd->s_children = sd;
40}
41
42/**
43 * sysfs_unlink_sibling - unlink sysfs_dirent from sibling list
44 * @sd: sysfs_dirent of interest
45 *
46 * Unlink @sd from its sibling list which starts from
47 * sd->s_parent->s_children.
48 *
49 * Locking:
50 * mutex_lock(sysfs_mutex)
51 */
52void sysfs_unlink_sibling(struct sysfs_dirent *sd)
53{
54 struct sysfs_dirent **pos;
55
56 for (pos = &sd->s_parent->s_children; *pos; pos = &(*pos)->s_sibling) {
57 if (*pos == sd) {
58 *pos = sd->s_sibling;
59 sd->s_sibling = NULL;
60 break;
61 }
62 }
63}
64
65/**
66 * sysfs_get_dentry - get dentry for the given sysfs_dirent
67 * @sd: sysfs_dirent of interest
68 *
69 * Get dentry for @sd. Dentry is looked up if currently not
70 * present. This function climbs sysfs_dirent tree till it
71 * reaches a sysfs_dirent with valid dentry attached and descends
72 * down from there looking up dentry for each step.
73 *
74 * LOCKING:
75 * Kernel thread context (may sleep)
76 *
77 * RETURNS:
78 * Pointer to found dentry on success, ERR_PTR() value on error.
79 */
80struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
81{
82 struct sysfs_dirent *cur;
83 struct dentry *parent_dentry, *dentry;
84 int i, depth;
85
86 /* Find the first parent which has valid s_dentry and get the
87 * dentry.
88 */
89 mutex_lock(&sysfs_mutex);
90 restart0:
91 spin_lock(&sysfs_assoc_lock);
92 restart1:
93 spin_lock(&dcache_lock);
94
95 dentry = NULL;
96 depth = 0;
97 cur = sd;
98 while (!cur->s_dentry || !cur->s_dentry->d_inode) {
99 if (cur->s_flags & SYSFS_FLAG_REMOVED) {
100 dentry = ERR_PTR(-ENOENT);
101 depth = 0;
102 break;
103 }
104 cur = cur->s_parent;
105 depth++;
106 }
107 if (!IS_ERR(dentry))
108 dentry = dget_locked(cur->s_dentry);
109
110 spin_unlock(&dcache_lock);
111 spin_unlock(&sysfs_assoc_lock);
112
113 /* from the found dentry, look up depth times */
114 while (depth--) {
115 /* find and get depth'th ancestor */
116 for (cur = sd, i = 0; cur && i < depth; i++)
117 cur = cur->s_parent;
118
119 /* This can happen if tree structure was modified due
120 * to move/rename. Restart.
121 */
122 if (i != depth) {
123 dput(dentry);
124 goto restart0;
125 }
126
127 sysfs_get(cur);
128
129 mutex_unlock(&sysfs_mutex);
130
131 /* look it up */
132 parent_dentry = dentry;
133 dentry = lookup_one_len_kern(cur->s_name, parent_dentry,
134 strlen(cur->s_name));
135 dput(parent_dentry);
136
137 if (IS_ERR(dentry)) {
138 sysfs_put(cur);
139 return dentry;
140 }
141
142 mutex_lock(&sysfs_mutex);
143 spin_lock(&sysfs_assoc_lock);
144
145 /* This, again, can happen if tree structure has
146 * changed and we looked up the wrong thing. Restart.
147 */
148 if (cur->s_dentry != dentry) {
149 dput(dentry);
150 sysfs_put(cur);
151 goto restart1;
152 }
153
154 spin_unlock(&sysfs_assoc_lock);
155
156 sysfs_put(cur);
157 }
158
159 mutex_unlock(&sysfs_mutex);
160 return dentry;
161}
162
163/**
164 * sysfs_get_active - get an active reference to sysfs_dirent
165 * @sd: sysfs_dirent to get an active reference to
166 *
167 * Get an active reference of @sd. This function is noop if @sd
168 * is NULL.
169 *
170 * RETURNS:
171 * Pointer to @sd on success, NULL on failure.
172 */
173struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
174{
175 if (unlikely(!sd))
176 return NULL;
177
178 while (1) {
179 int v, t;
180
181 v = atomic_read(&sd->s_active);
182 if (unlikely(v < 0))
183 return NULL;
184
185 t = atomic_cmpxchg(&sd->s_active, v, v + 1);
186 if (likely(t == v))
187 return sd;
188 if (t < 0)
189 return NULL;
190
191 cpu_relax();
192 }
193}
194
195/**
196 * sysfs_put_active - put an active reference to sysfs_dirent
197 * @sd: sysfs_dirent to put an active reference to
198 *
199 * Put an active reference to @sd. This function is noop if @sd
200 * is NULL.
201 */
202void sysfs_put_active(struct sysfs_dirent *sd)
203{
204 struct completion *cmpl;
205 int v;
206
207 if (unlikely(!sd))
208 return;
209
210 v = atomic_dec_return(&sd->s_active);
211 if (likely(v != SD_DEACTIVATED_BIAS))
212 return;
213
214 /* atomic_dec_return() is a mb(), we'll always see the updated
215 * sd->s_sibling.
216 */
217 cmpl = (void *)sd->s_sibling;
218 complete(cmpl);
219}
220
221/**
222 * sysfs_get_active_two - get active references to sysfs_dirent and parent
223 * @sd: sysfs_dirent of interest
224 *
225 * Get active reference to @sd and its parent. Parent's active
226 * reference is grabbed first. This function is noop if @sd is
227 * NULL.
228 *
229 * RETURNS:
230 * Pointer to @sd on success, NULL on failure.
231 */
232struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd)
233{
234 if (sd) {
235 if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent)))
236 return NULL;
237 if (unlikely(!sysfs_get_active(sd))) {
238 sysfs_put_active(sd->s_parent);
239 return NULL;
240 }
241 }
242 return sd;
243}
244
245/**
246 * sysfs_put_active_two - put active references to sysfs_dirent and parent
247 * @sd: sysfs_dirent of interest
248 *
249 * Put active references to @sd and its parent. This function is
250 * noop if @sd is NULL.
251 */
252void sysfs_put_active_two(struct sysfs_dirent *sd)
253{
254 if (sd) {
255 sysfs_put_active(sd);
256 sysfs_put_active(sd->s_parent);
257 }
258}
259
260/**
261 * sysfs_deactivate - deactivate sysfs_dirent
262 * @sd: sysfs_dirent to deactivate
263 *
264 * Deny new active references and drain existing ones.
265 */
266static void sysfs_deactivate(struct sysfs_dirent *sd)
267{
268 DECLARE_COMPLETION_ONSTACK(wait);
269 int v;
270
271 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
272 sd->s_sibling = (void *)&wait;
273
274 /* atomic_add_return() is a mb(), put_active() will always see
275 * the updated sd->s_sibling.
276 */
277 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
278
279 if (v != SD_DEACTIVATED_BIAS)
280 wait_for_completion(&wait);
281
282 sd->s_sibling = NULL;
283}
284
285static int sysfs_alloc_ino(ino_t *pino)
286{
287 int ino, rc;
288
289 retry:
290 spin_lock(&sysfs_ino_lock);
291 rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
292 spin_unlock(&sysfs_ino_lock);
293
294 if (rc == -EAGAIN) {
295 if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
296 goto retry;
297 rc = -ENOMEM;
298 }
299
300 *pino = ino;
301 return rc;
302}
303
304static void sysfs_free_ino(ino_t ino)
305{
306 spin_lock(&sysfs_ino_lock);
307 ida_remove(&sysfs_ino_ida, ino);
308 spin_unlock(&sysfs_ino_lock);
309}
310
311void release_sysfs_dirent(struct sysfs_dirent * sd)
312{
313 struct sysfs_dirent *parent_sd;
314
315 repeat:
316 /* Moving/renaming is always done while holding reference.
317 * sd->s_parent won't change beneath us.
318 */
319 parent_sd = sd->s_parent;
320
321 if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
322 sysfs_put(sd->s_elem.symlink.target_sd);
323 if (sysfs_type(sd) & SYSFS_COPY_NAME)
324 kfree(sd->s_name);
325 kfree(sd->s_iattr);
326 sysfs_free_ino(sd->s_ino);
327 kmem_cache_free(sysfs_dir_cachep, sd);
328
329 sd = parent_sd;
330 if (sd && atomic_dec_and_test(&sd->s_count))
331 goto repeat;
332}
17 333
18static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) 334static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
19{ 335{
20 struct sysfs_dirent * sd = dentry->d_fsdata; 336 struct sysfs_dirent * sd = dentry->d_fsdata;
21 337
22 if (sd) { 338 if (sd) {
23 /* sd->s_dentry is protected with sysfs_lock. This 339 /* sd->s_dentry is protected with sysfs_assoc_lock.
24 * allows sysfs_drop_dentry() to dereference it. 340 * This allows sysfs_drop_dentry() to dereference it.
25 */ 341 */
26 spin_lock(&sysfs_lock); 342 spin_lock(&sysfs_assoc_lock);
27 343
28 /* The dentry might have been deleted or another 344 /* The dentry might have been deleted or another
29 * lookup could have happened updating sd->s_dentry to 345 * lookup could have happened updating sd->s_dentry to
@@ -32,7 +348,7 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
32 */ 348 */
33 if (sd->s_dentry == dentry) 349 if (sd->s_dentry == dentry)
34 sd->s_dentry = NULL; 350 sd->s_dentry = NULL;
35 spin_unlock(&sysfs_lock); 351 spin_unlock(&sysfs_assoc_lock);
36 sysfs_put(sd); 352 sysfs_put(sd);
37 } 353 }
38 iput(inode); 354 iput(inode);
@@ -42,260 +358,402 @@ static struct dentry_operations sysfs_dentry_ops = {
42 .d_iput = sysfs_d_iput, 358 .d_iput = sysfs_d_iput,
43}; 359};
44 360
45static unsigned int sysfs_inode_counter; 361struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
46ino_t sysfs_get_inum(void)
47{ 362{
48 if (unlikely(sysfs_inode_counter < 3)) 363 char *dup_name = NULL;
49 sysfs_inode_counter = 3; 364 struct sysfs_dirent *sd = NULL;
50 return sysfs_inode_counter++;
51}
52 365
53/* 366 if (type & SYSFS_COPY_NAME) {
54 * Allocates a new sysfs_dirent and links it to the parent sysfs_dirent 367 name = dup_name = kstrdup(name, GFP_KERNEL);
55 */ 368 if (!name)
56static struct sysfs_dirent * __sysfs_new_dirent(void * element) 369 goto err_out;
57{ 370 }
58 struct sysfs_dirent * sd;
59 371
60 sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); 372 sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
61 if (!sd) 373 if (!sd)
62 return NULL; 374 goto err_out;
375
376 if (sysfs_alloc_ino(&sd->s_ino))
377 goto err_out;
63 378
64 sd->s_ino = sysfs_get_inum();
65 atomic_set(&sd->s_count, 1); 379 atomic_set(&sd->s_count, 1);
380 atomic_set(&sd->s_active, 0);
66 atomic_set(&sd->s_event, 1); 381 atomic_set(&sd->s_event, 1);
67 INIT_LIST_HEAD(&sd->s_children); 382
68 INIT_LIST_HEAD(&sd->s_sibling); 383 sd->s_name = name;
69 sd->s_element = element; 384 sd->s_mode = mode;
385 sd->s_flags = type;
70 386
71 return sd; 387 return sd;
388
389 err_out:
390 kfree(dup_name);
391 kmem_cache_free(sysfs_dir_cachep, sd);
392 return NULL;
72} 393}
73 394
74static void __sysfs_list_dirent(struct sysfs_dirent *parent_sd, 395/**
75 struct sysfs_dirent *sd) 396 * sysfs_attach_dentry - associate sysfs_dirent with dentry
397 * @sd: target sysfs_dirent
398 * @dentry: dentry to associate
399 *
400 * Associate @sd with @dentry. This is protected by
401 * sysfs_assoc_lock to avoid race with sysfs_d_iput().
402 *
403 * LOCKING:
404 * mutex_lock(sysfs_mutex)
405 */
406static void sysfs_attach_dentry(struct sysfs_dirent *sd, struct dentry *dentry)
76{ 407{
77 if (sd) 408 dentry->d_op = &sysfs_dentry_ops;
78 list_add(&sd->s_sibling, &parent_sd->s_children); 409 dentry->d_fsdata = sysfs_get(sd);
410
411 /* protect sd->s_dentry against sysfs_d_iput */
412 spin_lock(&sysfs_assoc_lock);
413 sd->s_dentry = dentry;
414 spin_unlock(&sysfs_assoc_lock);
415
416 d_rehash(dentry);
79} 417}
80 418
81static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent *parent_sd, 419static int sysfs_ilookup_test(struct inode *inode, void *arg)
82 void * element)
83{ 420{
84 struct sysfs_dirent *sd; 421 struct sysfs_dirent *sd = arg;
85 sd = __sysfs_new_dirent(element); 422 return inode->i_ino == sd->s_ino;
86 __sysfs_list_dirent(parent_sd, sd);
87 return sd;
88} 423}
89 424
90/* 425/**
426 * sysfs_addrm_start - prepare for sysfs_dirent add/remove
427 * @acxt: pointer to sysfs_addrm_cxt to be used
428 * @parent_sd: parent sysfs_dirent
91 * 429 *
92 * Return -EEXIST if there is already a sysfs element with the same name for 430 * This function is called when the caller is about to add or
93 * the same parent. 431 * remove sysfs_dirent under @parent_sd. This function acquires
432 * sysfs_mutex, grabs inode for @parent_sd if available and lock
433 * i_mutex of it. @acxt is used to keep and pass context to
434 * other addrm functions.
94 * 435 *
95 * called with parent inode's i_mutex held 436 * LOCKING:
437 * Kernel thread context (may sleep). sysfs_mutex is locked on
438 * return. i_mutex of parent inode is locked on return if
439 * available.
96 */ 440 */
97int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, 441void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
98 const unsigned char *new) 442 struct sysfs_dirent *parent_sd)
99{ 443{
100 struct sysfs_dirent * sd; 444 struct inode *inode;
101 445
102 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 446 memset(acxt, 0, sizeof(*acxt));
103 if (sd->s_element) { 447 acxt->parent_sd = parent_sd;
104 const unsigned char *existing = sysfs_get_name(sd);
105 if (strcmp(existing, new))
106 continue;
107 else
108 return -EEXIST;
109 }
110 }
111 448
112 return 0; 449 /* Lookup parent inode. inode initialization and I_NEW
450 * clearing are protected by sysfs_mutex. By grabbing it and
451 * looking up with _nowait variant, inode state can be
452 * determined reliably.
453 */
454 mutex_lock(&sysfs_mutex);
455
456 inode = ilookup5_nowait(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
457 parent_sd);
458
459 if (inode && !(inode->i_state & I_NEW)) {
460 /* parent inode available */
461 acxt->parent_inode = inode;
462
463 /* sysfs_mutex is below i_mutex in lock hierarchy.
464 * First, trylock i_mutex. If fails, unlock
465 * sysfs_mutex and lock them in order.
466 */
467 if (!mutex_trylock(&inode->i_mutex)) {
468 mutex_unlock(&sysfs_mutex);
469 mutex_lock(&inode->i_mutex);
470 mutex_lock(&sysfs_mutex);
471 }
472 } else
473 iput(inode);
113} 474}
114 475
476/**
477 * sysfs_add_one - add sysfs_dirent to parent
478 * @acxt: addrm context to use
479 * @sd: sysfs_dirent to be added
480 *
481 * Get @acxt->parent_sd and set sd->s_parent to it and increment
482 * nlink of parent inode if @sd is a directory. @sd is NOT
483 * linked into the children list of the parent. The caller
484 * should invoke sysfs_link_sibling() after this function
485 * completes if @sd needs to be on the children list.
486 *
487 * This function should be called between calls to
488 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
489 * passed the same @acxt as passed to sysfs_addrm_start().
490 *
491 * LOCKING:
492 * Determined by sysfs_addrm_start().
493 */
494void sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
495{
496 sd->s_parent = sysfs_get(acxt->parent_sd);
497
498 if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
499 inc_nlink(acxt->parent_inode);
500
501 acxt->cnt++;
502}
115 503
116static struct sysfs_dirent * 504/**
117__sysfs_make_dirent(struct dentry *dentry, void *element, mode_t mode, int type) 505 * sysfs_remove_one - remove sysfs_dirent from parent
506 * @acxt: addrm context to use
507 * @sd: sysfs_dirent to be added
508 *
509 * Mark @sd removed and drop nlink of parent inode if @sd is a
510 * directory. @sd is NOT unlinked from the children list of the
511 * parent. The caller is repsonsible for removing @sd from the
512 * children list before calling this function.
513 *
514 * This function should be called between calls to
515 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
516 * passed the same @acxt as passed to sysfs_addrm_start().
517 *
518 * LOCKING:
519 * Determined by sysfs_addrm_start().
520 */
521void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
118{ 522{
119 struct sysfs_dirent * sd; 523 BUG_ON(sd->s_sibling || (sd->s_flags & SYSFS_FLAG_REMOVED));
120 524
121 sd = __sysfs_new_dirent(element); 525 sd->s_flags |= SYSFS_FLAG_REMOVED;
122 if (!sd) 526 sd->s_sibling = acxt->removed;
123 goto out; 527 acxt->removed = sd;
124 528
125 sd->s_mode = mode; 529 if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
126 sd->s_type = type; 530 drop_nlink(acxt->parent_inode);
127 sd->s_dentry = dentry;
128 if (dentry) {
129 dentry->d_fsdata = sysfs_get(sd);
130 dentry->d_op = &sysfs_dentry_ops;
131 }
132 531
133out: 532 acxt->cnt++;
134 return sd;
135} 533}
136 534
137int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry, 535/**
138 void * element, umode_t mode, int type) 536 * sysfs_drop_dentry - drop dentry for the specified sysfs_dirent
537 * @sd: target sysfs_dirent
538 *
539 * Drop dentry for @sd. @sd must have been unlinked from its
540 * parent on entry to this function such that it can't be looked
541 * up anymore.
542 *
543 * @sd->s_dentry which is protected with sysfs_assoc_lock points
544 * to the currently associated dentry but we're not holding a
545 * reference to it and racing with dput(). Grab dcache_lock and
546 * verify dentry before dropping it. If @sd->s_dentry is NULL or
547 * dput() beats us, no need to bother.
548 */
549static void sysfs_drop_dentry(struct sysfs_dirent *sd)
139{ 550{
140 struct sysfs_dirent *sd; 551 struct dentry *dentry = NULL;
552 struct inode *inode;
553
554 /* We're not holding a reference to ->s_dentry dentry but the
555 * field will stay valid as long as sysfs_assoc_lock is held.
556 */
557 spin_lock(&sysfs_assoc_lock);
558 spin_lock(&dcache_lock);
559
560 /* drop dentry if it's there and dput() didn't kill it yet */
561 if (sd->s_dentry && sd->s_dentry->d_inode) {
562 dentry = dget_locked(sd->s_dentry);
563 spin_lock(&dentry->d_lock);
564 __d_drop(dentry);
565 spin_unlock(&dentry->d_lock);
566 }
141 567
142 sd = __sysfs_make_dirent(dentry, element, mode, type); 568 spin_unlock(&dcache_lock);
143 __sysfs_list_dirent(parent_sd, sd); 569 spin_unlock(&sysfs_assoc_lock);
144 570
145 return sd ? 0 : -ENOMEM; 571 /* dentries for shadowed inodes are pinned, unpin */
572 if (dentry && sysfs_is_shadowed_inode(dentry->d_inode))
573 dput(dentry);
574 dput(dentry);
575
576 /* adjust nlink and update timestamp */
577 inode = ilookup(sysfs_sb, sd->s_ino);
578 if (inode) {
579 mutex_lock(&inode->i_mutex);
580
581 inode->i_ctime = CURRENT_TIME;
582 drop_nlink(inode);
583 if (sysfs_type(sd) == SYSFS_DIR)
584 drop_nlink(inode);
585
586 mutex_unlock(&inode->i_mutex);
587 iput(inode);
588 }
146} 589}
147 590
148static int init_dir(struct inode * inode) 591/**
592 * sysfs_addrm_finish - finish up sysfs_dirent add/remove
593 * @acxt: addrm context to finish up
594 *
595 * Finish up sysfs_dirent add/remove. Resources acquired by
596 * sysfs_addrm_start() are released and removed sysfs_dirents are
597 * cleaned up. Timestamps on the parent inode are updated.
598 *
599 * LOCKING:
600 * All mutexes acquired by sysfs_addrm_start() are released.
601 *
602 * RETURNS:
603 * Number of added/removed sysfs_dirents since sysfs_addrm_start().
604 */
605int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
149{ 606{
150 inode->i_op = &sysfs_dir_inode_operations; 607 /* release resources acquired by sysfs_addrm_start() */
151 inode->i_fop = &sysfs_dir_operations; 608 mutex_unlock(&sysfs_mutex);
609 if (acxt->parent_inode) {
610 struct inode *inode = acxt->parent_inode;
152 611
153 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 612 /* if added/removed, update timestamps on the parent */
154 inc_nlink(inode); 613 if (acxt->cnt)
155 return 0; 614 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
615
616 mutex_unlock(&inode->i_mutex);
617 iput(inode);
618 }
619
620 /* kill removed sysfs_dirents */
621 while (acxt->removed) {
622 struct sysfs_dirent *sd = acxt->removed;
623
624 acxt->removed = sd->s_sibling;
625 sd->s_sibling = NULL;
626
627 sysfs_drop_dentry(sd);
628 sysfs_deactivate(sd);
629 sysfs_put(sd);
630 }
631
632 return acxt->cnt;
156} 633}
157 634
158static int init_file(struct inode * inode) 635/**
636 * sysfs_find_dirent - find sysfs_dirent with the given name
637 * @parent_sd: sysfs_dirent to search under
638 * @name: name to look for
639 *
640 * Look for sysfs_dirent with name @name under @parent_sd.
641 *
642 * LOCKING:
643 * mutex_lock(sysfs_mutex)
644 *
645 * RETURNS:
646 * Pointer to sysfs_dirent if found, NULL if not.
647 */
648struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
649 const unsigned char *name)
159{ 650{
160 inode->i_size = PAGE_SIZE; 651 struct sysfs_dirent *sd;
161 inode->i_fop = &sysfs_file_operations; 652
162 return 0; 653 for (sd = parent_sd->s_children; sd; sd = sd->s_sibling)
654 if (sysfs_type(sd) && !strcmp(sd->s_name, name))
655 return sd;
656 return NULL;
163} 657}
164 658
165static int init_symlink(struct inode * inode) 659/**
660 * sysfs_get_dirent - find and get sysfs_dirent with the given name
661 * @parent_sd: sysfs_dirent to search under
662 * @name: name to look for
663 *
664 * Look for sysfs_dirent with name @name under @parent_sd and get
665 * it if found.
666 *
667 * LOCKING:
668 * Kernel thread context (may sleep). Grabs sysfs_mutex.
669 *
670 * RETURNS:
671 * Pointer to sysfs_dirent if found, NULL if not.
672 */
673struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
674 const unsigned char *name)
166{ 675{
167 inode->i_op = &sysfs_symlink_inode_operations; 676 struct sysfs_dirent *sd;
168 return 0; 677
678 mutex_lock(&sysfs_mutex);
679 sd = sysfs_find_dirent(parent_sd, name);
680 sysfs_get(sd);
681 mutex_unlock(&sysfs_mutex);
682
683 return sd;
169} 684}
170 685
171static int create_dir(struct kobject * k, struct dentry * p, 686static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
172 const char * n, struct dentry ** d) 687 const char *name, struct sysfs_dirent **p_sd)
173{ 688{
174 int error;
175 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 689 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
690 struct sysfs_addrm_cxt acxt;
691 struct sysfs_dirent *sd;
176 692
177 mutex_lock(&p->d_inode->i_mutex); 693 /* allocate */
178 *d = lookup_one_len(n, p, strlen(n)); 694 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
179 if (!IS_ERR(*d)) { 695 if (!sd)
180 if (sysfs_dirent_exist(p->d_fsdata, n)) 696 return -ENOMEM;
181 error = -EEXIST; 697 sd->s_elem.dir.kobj = kobj;
182 else
183 error = sysfs_make_dirent(p->d_fsdata, *d, k, mode,
184 SYSFS_DIR);
185 if (!error) {
186 error = sysfs_create(*d, mode, init_dir);
187 if (!error) {
188 inc_nlink(p->d_inode);
189 (*d)->d_op = &sysfs_dentry_ops;
190 d_rehash(*d);
191 }
192 }
193 if (error && (error != -EEXIST)) {
194 struct sysfs_dirent *sd = (*d)->d_fsdata;
195 if (sd) {
196 list_del_init(&sd->s_sibling);
197 sysfs_put(sd);
198 }
199 d_drop(*d);
200 }
201 dput(*d);
202 } else
203 error = PTR_ERR(*d);
204 mutex_unlock(&p->d_inode->i_mutex);
205 return error;
206}
207 698
699 /* link in */
700 sysfs_addrm_start(&acxt, parent_sd);
701 if (!sysfs_find_dirent(parent_sd, name)) {
702 sysfs_add_one(&acxt, sd);
703 sysfs_link_sibling(sd);
704 }
705 if (sysfs_addrm_finish(&acxt)) {
706 *p_sd = sd;
707 return 0;
708 }
208 709
209int sysfs_create_subdir(struct kobject * k, const char * n, struct dentry ** d) 710 sysfs_put(sd);
711 return -EEXIST;
712}
713
714int sysfs_create_subdir(struct kobject *kobj, const char *name,
715 struct sysfs_dirent **p_sd)
210{ 716{
211 return create_dir(k,k->dentry,n,d); 717 return create_dir(kobj, kobj->sd, name, p_sd);
212} 718}
213 719
214/** 720/**
215 * sysfs_create_dir - create a directory for an object. 721 * sysfs_create_dir - create a directory for an object.
216 * @kobj: object we're creating directory for. 722 * @kobj: object we're creating directory for.
217 * @shadow_parent: parent parent object. 723 * @shadow_parent: parent object.
218 */ 724 */
219 725int sysfs_create_dir(struct kobject *kobj,
220int sysfs_create_dir(struct kobject * kobj, struct dentry *shadow_parent) 726 struct sysfs_dirent *shadow_parent_sd)
221{ 727{
222 struct dentry * dentry = NULL; 728 struct sysfs_dirent *parent_sd, *sd;
223 struct dentry * parent;
224 int error = 0; 729 int error = 0;
225 730
226 BUG_ON(!kobj); 731 BUG_ON(!kobj);
227 732
228 if (shadow_parent) 733 if (shadow_parent_sd)
229 parent = shadow_parent; 734 parent_sd = shadow_parent_sd;
230 else if (kobj->parent) 735 else if (kobj->parent)
231 parent = kobj->parent->dentry; 736 parent_sd = kobj->parent->sd;
232 else if (sysfs_mount && sysfs_mount->mnt_sb) 737 else if (sysfs_mount && sysfs_mount->mnt_sb)
233 parent = sysfs_mount->mnt_sb->s_root; 738 parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata;
234 else 739 else
235 return -EFAULT; 740 return -EFAULT;
236 741
237 error = create_dir(kobj,parent,kobject_name(kobj),&dentry); 742 error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
238 if (!error) 743 if (!error)
239 kobj->dentry = dentry; 744 kobj->sd = sd;
240 return error; 745 return error;
241} 746}
242 747
243/* attaches attribute's sysfs_dirent to the dentry corresponding to the 748static int sysfs_count_nlink(struct sysfs_dirent *sd)
244 * attribute file
245 */
246static int sysfs_attach_attr(struct sysfs_dirent * sd, struct dentry * dentry)
247{ 749{
248 struct attribute * attr = NULL; 750 struct sysfs_dirent *child;
249 struct bin_attribute * bin_attr = NULL; 751 int nr = 0;
250 int (* init) (struct inode *) = NULL;
251 int error = 0;
252
253 if (sd->s_type & SYSFS_KOBJ_BIN_ATTR) {
254 bin_attr = sd->s_element;
255 attr = &bin_attr->attr;
256 } else {
257 attr = sd->s_element;
258 init = init_file;
259 }
260 752
261 dentry->d_fsdata = sysfs_get(sd); 753 for (child = sd->s_children; child; child = child->s_sibling)
262 /* protect sd->s_dentry against sysfs_d_iput */ 754 if (sysfs_type(child) == SYSFS_DIR)
263 spin_lock(&sysfs_lock); 755 nr++;
264 sd->s_dentry = dentry; 756 return nr + 2;
265 spin_unlock(&sysfs_lock);
266 error = sysfs_create(dentry, (attr->mode & S_IALLUGO) | S_IFREG, init);
267 if (error) {
268 sysfs_put(sd);
269 return error;
270 }
271
272 if (bin_attr) {
273 dentry->d_inode->i_size = bin_attr->size;
274 dentry->d_inode->i_fop = &bin_fops;
275 }
276 dentry->d_op = &sysfs_dentry_ops;
277 d_rehash(dentry);
278
279 return 0;
280}
281
282static int sysfs_attach_link(struct sysfs_dirent * sd, struct dentry * dentry)
283{
284 int err = 0;
285
286 dentry->d_fsdata = sysfs_get(sd);
287 /* protect sd->s_dentry against sysfs_d_iput */
288 spin_lock(&sysfs_lock);
289 sd->s_dentry = dentry;
290 spin_unlock(&sysfs_lock);
291 err = sysfs_create(dentry, S_IFLNK|S_IRWXUGO, init_symlink);
292 if (!err) {
293 dentry->d_op = &sysfs_dentry_ops;
294 d_rehash(dentry);
295 } else
296 sysfs_put(sd);
297
298 return err;
299} 757}
300 758
301static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, 759static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
@@ -303,24 +761,60 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
303{ 761{
304 struct sysfs_dirent * parent_sd = dentry->d_parent->d_fsdata; 762 struct sysfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
305 struct sysfs_dirent * sd; 763 struct sysfs_dirent * sd;
306 int err = 0; 764 struct bin_attribute *bin_attr;
765 struct inode *inode;
766 int found = 0;
307 767
308 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 768 for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) {
309 if (sd->s_type & SYSFS_NOT_PINNED) { 769 if (sysfs_type(sd) &&
310 const unsigned char * name = sysfs_get_name(sd); 770 !strcmp(sd->s_name, dentry->d_name.name)) {
771 found = 1;
772 break;
773 }
774 }
311 775
312 if (strcmp(name, dentry->d_name.name)) 776 /* no such entry */
313 continue; 777 if (!found)
778 return NULL;
314 779
315 if (sd->s_type & SYSFS_KOBJ_LINK) 780 /* attach dentry and inode */
316 err = sysfs_attach_link(sd, dentry); 781 inode = sysfs_get_inode(sd);
317 else 782 if (!inode)
318 err = sysfs_attach_attr(sd, dentry); 783 return ERR_PTR(-ENOMEM);
784
785 mutex_lock(&sysfs_mutex);
786
787 if (inode->i_state & I_NEW) {
788 /* initialize inode according to type */
789 switch (sysfs_type(sd)) {
790 case SYSFS_DIR:
791 inode->i_op = &sysfs_dir_inode_operations;
792 inode->i_fop = &sysfs_dir_operations;
793 inode->i_nlink = sysfs_count_nlink(sd);
794 break;
795 case SYSFS_KOBJ_ATTR:
796 inode->i_size = PAGE_SIZE;
797 inode->i_fop = &sysfs_file_operations;
798 break;
799 case SYSFS_KOBJ_BIN_ATTR:
800 bin_attr = sd->s_elem.bin_attr.bin_attr;
801 inode->i_size = bin_attr->size;
802 inode->i_fop = &bin_fops;
319 break; 803 break;
804 case SYSFS_KOBJ_LINK:
805 inode->i_op = &sysfs_symlink_inode_operations;
806 break;
807 default:
808 BUG();
320 } 809 }
321 } 810 }
322 811
323 return ERR_PTR(err); 812 sysfs_instantiate(dentry, inode);
813 sysfs_attach_dentry(sd, dentry);
814
815 mutex_unlock(&sysfs_mutex);
816
817 return NULL;
324} 818}
325 819
326const struct inode_operations sysfs_dir_inode_operations = { 820const struct inode_operations sysfs_dir_inode_operations = {
@@ -328,58 +822,46 @@ const struct inode_operations sysfs_dir_inode_operations = {
328 .setattr = sysfs_setattr, 822 .setattr = sysfs_setattr,
329}; 823};
330 824
331static void remove_dir(struct dentry * d) 825static void remove_dir(struct sysfs_dirent *sd)
332{ 826{
333 struct dentry * parent = dget(d->d_parent); 827 struct sysfs_addrm_cxt acxt;
334 struct sysfs_dirent * sd;
335
336 mutex_lock(&parent->d_inode->i_mutex);
337 d_delete(d);
338 sd = d->d_fsdata;
339 list_del_init(&sd->s_sibling);
340 sysfs_put(sd);
341 if (d->d_inode)
342 simple_rmdir(parent->d_inode,d);
343
344 pr_debug(" o %s removing done (%d)\n",d->d_name.name,
345 atomic_read(&d->d_count));
346 828
347 mutex_unlock(&parent->d_inode->i_mutex); 829 sysfs_addrm_start(&acxt, sd->s_parent);
348 dput(parent); 830 sysfs_unlink_sibling(sd);
831 sysfs_remove_one(&acxt, sd);
832 sysfs_addrm_finish(&acxt);
349} 833}
350 834
351void sysfs_remove_subdir(struct dentry * d) 835void sysfs_remove_subdir(struct sysfs_dirent *sd)
352{ 836{
353 remove_dir(d); 837 remove_dir(sd);
354} 838}
355 839
356 840
357static void __sysfs_remove_dir(struct dentry *dentry) 841static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
358{ 842{
359 struct sysfs_dirent * parent_sd; 843 struct sysfs_addrm_cxt acxt;
360 struct sysfs_dirent * sd, * tmp; 844 struct sysfs_dirent **pos;
361 845
362 dget(dentry); 846 if (!dir_sd)
363 if (!dentry)
364 return; 847 return;
365 848
366 pr_debug("sysfs %s: removing dir\n",dentry->d_name.name); 849 pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
367 mutex_lock(&dentry->d_inode->i_mutex); 850 sysfs_addrm_start(&acxt, dir_sd);
368 parent_sd = dentry->d_fsdata; 851 pos = &dir_sd->s_children;
369 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { 852 while (*pos) {
370 if (!sd->s_element || !(sd->s_type & SYSFS_NOT_PINNED)) 853 struct sysfs_dirent *sd = *pos;
371 continue; 854
372 list_del_init(&sd->s_sibling); 855 if (sysfs_type(sd) && sysfs_type(sd) != SYSFS_DIR) {
373 sysfs_drop_dentry(sd, dentry); 856 *pos = sd->s_sibling;
374 sysfs_put(sd); 857 sd->s_sibling = NULL;
858 sysfs_remove_one(&acxt, sd);
859 } else
860 pos = &(*pos)->s_sibling;
375 } 861 }
376 mutex_unlock(&dentry->d_inode->i_mutex); 862 sysfs_addrm_finish(&acxt);
377 863
378 remove_dir(dentry); 864 remove_dir(dir_sd);
379 /**
380 * Drop reference from dget() on entrance.
381 */
382 dput(dentry);
383} 865}
384 866
385/** 867/**
@@ -393,102 +875,166 @@ static void __sysfs_remove_dir(struct dentry *dentry)
393 875
394void sysfs_remove_dir(struct kobject * kobj) 876void sysfs_remove_dir(struct kobject * kobj)
395{ 877{
396 __sysfs_remove_dir(kobj->dentry); 878 struct sysfs_dirent *sd = kobj->sd;
397 kobj->dentry = NULL; 879
880 spin_lock(&sysfs_assoc_lock);
881 kobj->sd = NULL;
882 spin_unlock(&sysfs_assoc_lock);
883
884 __sysfs_remove_dir(sd);
398} 885}
399 886
400int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, 887int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
401 const char *new_name) 888 const char *new_name)
402{ 889{
403 int error = 0; 890 struct sysfs_dirent *sd = kobj->sd;
404 struct dentry * new_dentry; 891 struct dentry *new_parent = NULL;
892 struct dentry *old_dentry = NULL, *new_dentry = NULL;
893 const char *dup_name = NULL;
894 int error;
405 895
406 if (!new_parent) 896 /* get dentries */
407 return -EFAULT; 897 old_dentry = sysfs_get_dentry(sd);
898 if (IS_ERR(old_dentry)) {
899 error = PTR_ERR(old_dentry);
900 goto out_dput;
901 }
408 902
409 down_write(&sysfs_rename_sem); 903 new_parent = sysfs_get_dentry(new_parent_sd);
904 if (IS_ERR(new_parent)) {
905 error = PTR_ERR(new_parent);
906 goto out_dput;
907 }
908
909 /* lock new_parent and get dentry for new name */
410 mutex_lock(&new_parent->d_inode->i_mutex); 910 mutex_lock(&new_parent->d_inode->i_mutex);
411 911
412 new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name)); 912 new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name));
413 if (!IS_ERR(new_dentry)) { 913 if (IS_ERR(new_dentry)) {
414 /* By allowing two different directories with the 914 error = PTR_ERR(new_dentry);
415 * same d_parent we allow this routine to move 915 goto out_unlock;
416 * between different shadows of the same directory
417 */
418 if (kobj->dentry->d_parent->d_inode != new_parent->d_inode)
419 return -EINVAL;
420 else if (new_dentry->d_parent->d_inode != new_parent->d_inode)
421 error = -EINVAL;
422 else if (new_dentry == kobj->dentry)
423 error = -EINVAL;
424 else if (!new_dentry->d_inode) {
425 error = kobject_set_name(kobj, "%s", new_name);
426 if (!error) {
427 struct sysfs_dirent *sd, *parent_sd;
428
429 d_add(new_dentry, NULL);
430 d_move(kobj->dentry, new_dentry);
431
432 sd = kobj->dentry->d_fsdata;
433 parent_sd = new_parent->d_fsdata;
434
435 list_del_init(&sd->s_sibling);
436 list_add(&sd->s_sibling, &parent_sd->s_children);
437 }
438 else
439 d_drop(new_dentry);
440 } else
441 error = -EEXIST;
442 dput(new_dentry);
443 } 916 }
444 mutex_unlock(&new_parent->d_inode->i_mutex);
445 up_write(&sysfs_rename_sem);
446 917
918 /* By allowing two different directories with the same
919 * d_parent we allow this routine to move between different
920 * shadows of the same directory
921 */
922 error = -EINVAL;
923 if (old_dentry->d_parent->d_inode != new_parent->d_inode ||
924 new_dentry->d_parent->d_inode != new_parent->d_inode ||
925 old_dentry == new_dentry)
926 goto out_unlock;
927
928 error = -EEXIST;
929 if (new_dentry->d_inode)
930 goto out_unlock;
931
932 /* rename kobject and sysfs_dirent */
933 error = -ENOMEM;
934 new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
935 if (!new_name)
936 goto out_drop;
937
938 error = kobject_set_name(kobj, "%s", new_name);
939 if (error)
940 goto out_drop;
941
942 dup_name = sd->s_name;
943 sd->s_name = new_name;
944
945 /* move under the new parent */
946 d_add(new_dentry, NULL);
947 d_move(sd->s_dentry, new_dentry);
948
949 mutex_lock(&sysfs_mutex);
950
951 sysfs_unlink_sibling(sd);
952 sysfs_get(new_parent_sd);
953 sysfs_put(sd->s_parent);
954 sd->s_parent = new_parent_sd;
955 sysfs_link_sibling(sd);
956
957 mutex_unlock(&sysfs_mutex);
958
959 error = 0;
960 goto out_unlock;
961
962 out_drop:
963 d_drop(new_dentry);
964 out_unlock:
965 mutex_unlock(&new_parent->d_inode->i_mutex);
966 out_dput:
967 kfree(dup_name);
968 dput(new_parent);
969 dput(old_dentry);
970 dput(new_dentry);
447 return error; 971 return error;
448} 972}
449 973
450int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent) 974int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
451{ 975{
452 struct dentry *old_parent_dentry, *new_parent_dentry, *new_dentry; 976 struct sysfs_dirent *sd = kobj->sd;
453 struct sysfs_dirent *new_parent_sd, *sd; 977 struct sysfs_dirent *new_parent_sd;
978 struct dentry *old_parent, *new_parent = NULL;
979 struct dentry *old_dentry = NULL, *new_dentry = NULL;
454 int error; 980 int error;
455 981
456 old_parent_dentry = kobj->parent ? 982 BUG_ON(!sd->s_parent);
457 kobj->parent->dentry : sysfs_mount->mnt_sb->s_root; 983 new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
458 new_parent_dentry = new_parent ? 984
459 new_parent->dentry : sysfs_mount->mnt_sb->s_root; 985 /* get dentries */
986 old_dentry = sysfs_get_dentry(sd);
987 if (IS_ERR(old_dentry)) {
988 error = PTR_ERR(old_dentry);
989 goto out_dput;
990 }
991 old_parent = sd->s_parent->s_dentry;
992
993 new_parent = sysfs_get_dentry(new_parent_sd);
994 if (IS_ERR(new_parent)) {
995 error = PTR_ERR(new_parent);
996 goto out_dput;
997 }
460 998
461 if (old_parent_dentry->d_inode == new_parent_dentry->d_inode) 999 if (old_parent->d_inode == new_parent->d_inode) {
462 return 0; /* nothing to move */ 1000 error = 0;
1001 goto out_dput; /* nothing to move */
1002 }
463again: 1003again:
464 mutex_lock(&old_parent_dentry->d_inode->i_mutex); 1004 mutex_lock(&old_parent->d_inode->i_mutex);
465 if (!mutex_trylock(&new_parent_dentry->d_inode->i_mutex)) { 1005 if (!mutex_trylock(&new_parent->d_inode->i_mutex)) {
466 mutex_unlock(&old_parent_dentry->d_inode->i_mutex); 1006 mutex_unlock(&old_parent->d_inode->i_mutex);
467 goto again; 1007 goto again;
468 } 1008 }
469 1009
470 new_parent_sd = new_parent_dentry->d_fsdata; 1010 new_dentry = lookup_one_len(kobj->name, new_parent, strlen(kobj->name));
471 sd = kobj->dentry->d_fsdata;
472
473 new_dentry = lookup_one_len(kobj->name, new_parent_dentry,
474 strlen(kobj->name));
475 if (IS_ERR(new_dentry)) { 1011 if (IS_ERR(new_dentry)) {
476 error = PTR_ERR(new_dentry); 1012 error = PTR_ERR(new_dentry);
477 goto out; 1013 goto out_unlock;
478 } else 1014 } else
479 error = 0; 1015 error = 0;
480 d_add(new_dentry, NULL); 1016 d_add(new_dentry, NULL);
481 d_move(kobj->dentry, new_dentry); 1017 d_move(sd->s_dentry, new_dentry);
482 dput(new_dentry); 1018 dput(new_dentry);
483 1019
484 /* Remove from old parent's list and insert into new parent's list. */ 1020 /* Remove from old parent's list and insert into new parent's list. */
485 list_del_init(&sd->s_sibling); 1021 mutex_lock(&sysfs_mutex);
486 list_add(&sd->s_sibling, &new_parent_sd->s_children); 1022
1023 sysfs_unlink_sibling(sd);
1024 sysfs_get(new_parent_sd);
1025 sysfs_put(sd->s_parent);
1026 sd->s_parent = new_parent_sd;
1027 sysfs_link_sibling(sd);
487 1028
488out: 1029 mutex_unlock(&sysfs_mutex);
489 mutex_unlock(&new_parent_dentry->d_inode->i_mutex);
490 mutex_unlock(&old_parent_dentry->d_inode->i_mutex);
491 1030
1031 out_unlock:
1032 mutex_unlock(&new_parent->d_inode->i_mutex);
1033 mutex_unlock(&old_parent->d_inode->i_mutex);
1034 out_dput:
1035 dput(new_parent);
1036 dput(old_dentry);
1037 dput(new_dentry);
492 return error; 1038 return error;
493} 1039}
494 1040
@@ -496,23 +1042,27 @@ static int sysfs_dir_open(struct inode *inode, struct file *file)
496{ 1042{
497 struct dentry * dentry = file->f_path.dentry; 1043 struct dentry * dentry = file->f_path.dentry;
498 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 1044 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
1045 struct sysfs_dirent * sd;
499 1046
500 mutex_lock(&dentry->d_inode->i_mutex); 1047 sd = sysfs_new_dirent("_DIR_", 0, 0);
501 file->private_data = sysfs_new_dirent(parent_sd, NULL); 1048 if (sd) {
502 mutex_unlock(&dentry->d_inode->i_mutex); 1049 mutex_lock(&sysfs_mutex);
503 1050 sd->s_parent = sysfs_get(parent_sd);
504 return file->private_data ? 0 : -ENOMEM; 1051 sysfs_link_sibling(sd);
1052 mutex_unlock(&sysfs_mutex);
1053 }
505 1054
1055 file->private_data = sd;
1056 return sd ? 0 : -ENOMEM;
506} 1057}
507 1058
508static int sysfs_dir_close(struct inode *inode, struct file *file) 1059static int sysfs_dir_close(struct inode *inode, struct file *file)
509{ 1060{
510 struct dentry * dentry = file->f_path.dentry;
511 struct sysfs_dirent * cursor = file->private_data; 1061 struct sysfs_dirent * cursor = file->private_data;
512 1062
513 mutex_lock(&dentry->d_inode->i_mutex); 1063 mutex_lock(&sysfs_mutex);
514 list_del_init(&cursor->s_sibling); 1064 sysfs_unlink_sibling(cursor);
515 mutex_unlock(&dentry->d_inode->i_mutex); 1065 mutex_unlock(&sysfs_mutex);
516 1066
517 release_sysfs_dirent(cursor); 1067 release_sysfs_dirent(cursor);
518 1068
@@ -530,7 +1080,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
530 struct dentry *dentry = filp->f_path.dentry; 1080 struct dentry *dentry = filp->f_path.dentry;
531 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 1081 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
532 struct sysfs_dirent *cursor = filp->private_data; 1082 struct sysfs_dirent *cursor = filp->private_data;
533 struct list_head *p, *q = &cursor->s_sibling; 1083 struct sysfs_dirent **pos;
534 ino_t ino; 1084 ino_t ino;
535 int i = filp->f_pos; 1085 int i = filp->f_pos;
536 1086
@@ -543,38 +1093,52 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
543 i++; 1093 i++;
544 /* fallthrough */ 1094 /* fallthrough */
545 case 1: 1095 case 1:
546 ino = parent_ino(dentry); 1096 if (parent_sd->s_parent)
1097 ino = parent_sd->s_parent->s_ino;
1098 else
1099 ino = parent_sd->s_ino;
547 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 1100 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
548 break; 1101 break;
549 filp->f_pos++; 1102 filp->f_pos++;
550 i++; 1103 i++;
551 /* fallthrough */ 1104 /* fallthrough */
552 default: 1105 default:
1106 mutex_lock(&sysfs_mutex);
1107
1108 pos = &parent_sd->s_children;
1109 while (*pos != cursor)
1110 pos = &(*pos)->s_sibling;
1111
1112 /* unlink cursor */
1113 *pos = cursor->s_sibling;
1114
553 if (filp->f_pos == 2) 1115 if (filp->f_pos == 2)
554 list_move(q, &parent_sd->s_children); 1116 pos = &parent_sd->s_children;
555 1117
556 for (p=q->next; p!= &parent_sd->s_children; p=p->next) { 1118 for ( ; *pos; pos = &(*pos)->s_sibling) {
557 struct sysfs_dirent *next; 1119 struct sysfs_dirent *next = *pos;
558 const char * name; 1120 const char * name;
559 int len; 1121 int len;
560 1122
561 next = list_entry(p, struct sysfs_dirent, 1123 if (!sysfs_type(next))
562 s_sibling);
563 if (!next->s_element)
564 continue; 1124 continue;
565 1125
566 name = sysfs_get_name(next); 1126 name = next->s_name;
567 len = strlen(name); 1127 len = strlen(name);
568 ino = next->s_ino; 1128 ino = next->s_ino;
569 1129
570 if (filldir(dirent, name, len, filp->f_pos, ino, 1130 if (filldir(dirent, name, len, filp->f_pos, ino,
571 dt_type(next)) < 0) 1131 dt_type(next)) < 0)
572 return 0; 1132 break;
573 1133
574 list_move(q, p);
575 p = q;
576 filp->f_pos++; 1134 filp->f_pos++;
577 } 1135 }
1136
1137 /* put cursor back in */
1138 cursor->s_sibling = *pos;
1139 *pos = cursor;
1140
1141 mutex_unlock(&sysfs_mutex);
578 } 1142 }
579 return 0; 1143 return 0;
580} 1144}
@@ -583,7 +1147,6 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
583{ 1147{
584 struct dentry * dentry = file->f_path.dentry; 1148 struct dentry * dentry = file->f_path.dentry;
585 1149
586 mutex_lock(&dentry->d_inode->i_mutex);
587 switch (origin) { 1150 switch (origin) {
588 case 1: 1151 case 1:
589 offset += file->f_pos; 1152 offset += file->f_pos;
@@ -591,31 +1154,35 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
591 if (offset >= 0) 1154 if (offset >= 0)
592 break; 1155 break;
593 default: 1156 default:
594 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
595 return -EINVAL; 1157 return -EINVAL;
596 } 1158 }
597 if (offset != file->f_pos) { 1159 if (offset != file->f_pos) {
1160 mutex_lock(&sysfs_mutex);
1161
598 file->f_pos = offset; 1162 file->f_pos = offset;
599 if (file->f_pos >= 2) { 1163 if (file->f_pos >= 2) {
600 struct sysfs_dirent *sd = dentry->d_fsdata; 1164 struct sysfs_dirent *sd = dentry->d_fsdata;
601 struct sysfs_dirent *cursor = file->private_data; 1165 struct sysfs_dirent *cursor = file->private_data;
602 struct list_head *p; 1166 struct sysfs_dirent **pos;
603 loff_t n = file->f_pos - 2; 1167 loff_t n = file->f_pos - 2;
604 1168
605 list_del(&cursor->s_sibling); 1169 sysfs_unlink_sibling(cursor);
606 p = sd->s_children.next; 1170
607 while (n && p != &sd->s_children) { 1171 pos = &sd->s_children;
608 struct sysfs_dirent *next; 1172 while (n && *pos) {
609 next = list_entry(p, struct sysfs_dirent, 1173 struct sysfs_dirent *next = *pos;
610 s_sibling); 1174 if (sysfs_type(next))
611 if (next->s_element)
612 n--; 1175 n--;
613 p = p->next; 1176 pos = &(*pos)->s_sibling;
614 } 1177 }
615 list_add_tail(&cursor->s_sibling, p); 1178
1179 cursor->s_sibling = *pos;
1180 *pos = cursor;
616 } 1181 }
1182
1183 mutex_unlock(&sysfs_mutex);
617 } 1184 }
618 mutex_unlock(&dentry->d_inode->i_mutex); 1185
619 return offset; 1186 return offset;
620} 1187}
621 1188
@@ -628,12 +1195,20 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
628int sysfs_make_shadowed_dir(struct kobject *kobj, 1195int sysfs_make_shadowed_dir(struct kobject *kobj,
629 void * (*follow_link)(struct dentry *, struct nameidata *)) 1196 void * (*follow_link)(struct dentry *, struct nameidata *))
630{ 1197{
1198 struct dentry *dentry;
631 struct inode *inode; 1199 struct inode *inode;
632 struct inode_operations *i_op; 1200 struct inode_operations *i_op;
633 1201
634 inode = kobj->dentry->d_inode; 1202 /* get dentry for @kobj->sd, dentry of a shadowed dir is pinned */
635 if (inode->i_op != &sysfs_dir_inode_operations) 1203 dentry = sysfs_get_dentry(kobj->sd);
1204 if (IS_ERR(dentry))
1205 return PTR_ERR(dentry);
1206
1207 inode = dentry->d_inode;
1208 if (inode->i_op != &sysfs_dir_inode_operations) {
1209 dput(dentry);
636 return -EINVAL; 1210 return -EINVAL;
1211 }
637 1212
638 i_op = kmalloc(sizeof(*i_op), GFP_KERNEL); 1213 i_op = kmalloc(sizeof(*i_op), GFP_KERNEL);
639 if (!i_op) 1214 if (!i_op)
@@ -658,54 +1233,72 @@ int sysfs_make_shadowed_dir(struct kobject *kobj,
658 * directory. 1233 * directory.
659 */ 1234 */
660 1235
661struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) 1236struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj)
662{ 1237{
663 struct sysfs_dirent *sd; 1238 struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
664 struct dentry *parent, *dir, *shadow; 1239 struct dentry *dir, *parent, *shadow;
665 struct inode *inode; 1240 struct inode *inode;
1241 struct sysfs_dirent *sd;
1242 struct sysfs_addrm_cxt acxt;
666 1243
667 dir = kobj->dentry; 1244 dir = sysfs_get_dentry(kobj->sd);
668 inode = dir->d_inode; 1245 if (IS_ERR(dir)) {
1246 sd = (void *)dir;
1247 goto out;
1248 }
669 parent = dir->d_parent; 1249 parent = dir->d_parent;
670 shadow = ERR_PTR(-EINVAL); 1250
1251 inode = dir->d_inode;
1252 sd = ERR_PTR(-EINVAL);
671 if (!sysfs_is_shadowed_inode(inode)) 1253 if (!sysfs_is_shadowed_inode(inode))
672 goto out; 1254 goto out_dput;
673 1255
674 shadow = d_alloc(parent, &dir->d_name); 1256 shadow = d_alloc(parent, &dir->d_name);
675 if (!shadow) 1257 if (!shadow)
676 goto nomem; 1258 goto nomem;
677 1259
678 sd = __sysfs_make_dirent(shadow, kobj, inode->i_mode, SYSFS_DIR); 1260 sd = sysfs_new_dirent("_SHADOW_", inode->i_mode, SYSFS_DIR);
679 if (!sd) 1261 if (!sd)
680 goto nomem; 1262 goto nomem;
1263 sd->s_elem.dir.kobj = kobj;
681 1264
1265 sysfs_addrm_start(&acxt, parent_sd);
1266
1267 /* add but don't link into children list */
1268 sysfs_add_one(&acxt, sd);
1269
1270 /* attach and instantiate dentry */
1271 sysfs_attach_dentry(sd, shadow);
682 d_instantiate(shadow, igrab(inode)); 1272 d_instantiate(shadow, igrab(inode));
683 inc_nlink(inode); 1273 inc_nlink(inode); /* tj: synchronization? */
684 inc_nlink(parent->d_inode); 1274
685 shadow->d_op = &sysfs_dentry_ops; 1275 sysfs_addrm_finish(&acxt);
686 1276
687 dget(shadow); /* Extra count - pin the dentry in core */ 1277 dget(shadow); /* Extra count - pin the dentry in core */
688 1278
689out: 1279 goto out_dput;
690 return shadow; 1280
691nomem: 1281 nomem:
692 dput(shadow); 1282 dput(shadow);
693 shadow = ERR_PTR(-ENOMEM); 1283 sd = ERR_PTR(-ENOMEM);
694 goto out; 1284 out_dput:
1285 dput(dir);
1286 out:
1287 return sd;
695} 1288}
696 1289
697/** 1290/**
698 * sysfs_remove_shadow_dir - remove an object's directory. 1291 * sysfs_remove_shadow_dir - remove an object's directory.
699 * @shadow: dentry of shadow directory 1292 * @shadow_sd: sysfs_dirent of shadow directory
700 * 1293 *
701 * The only thing special about this is that we remove any files in 1294 * The only thing special about this is that we remove any files in
702 * the directory before we remove the directory, and we've inlined 1295 * the directory before we remove the directory, and we've inlined
703 * what used to be sysfs_rmdir() below, instead of calling separately. 1296 * what used to be sysfs_rmdir() below, instead of calling separately.
704 */ 1297 */
705 1298
706void sysfs_remove_shadow_dir(struct dentry *shadow) 1299void sysfs_remove_shadow_dir(struct sysfs_dirent *shadow_sd)
707{ 1300{
708 __sysfs_remove_dir(shadow); 1301 __sysfs_remove_dir(shadow_sd);
709} 1302}
710 1303
711const struct file_operations sysfs_dir_operations = { 1304const struct file_operations sysfs_dir_operations = {
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b502c7197e..cc497994b2 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -50,29 +50,15 @@ static struct sysfs_ops subsys_sysfs_ops = {
50 .store = subsys_attr_store, 50 .store = subsys_attr_store,
51}; 51};
52 52
53/** 53struct sysfs_buffer {
54 * add_to_collection - add buffer to a collection 54 size_t count;
55 * @buffer: buffer to be added 55 loff_t pos;
56 * @node: inode of set to add to 56 char * page;
57 */ 57 struct sysfs_ops * ops;
58 58 struct semaphore sem;
59static inline void 59 int needs_read_fill;
60add_to_collection(struct sysfs_buffer *buffer, struct inode *node) 60 int event;
61{ 61};
62 struct sysfs_buffer_collection *set = node->i_private;
63
64 mutex_lock(&node->i_mutex);
65 list_add(&buffer->associates, &set->associates);
66 mutex_unlock(&node->i_mutex);
67}
68
69static inline void
70remove_from_collection(struct sysfs_buffer *buffer, struct inode *node)
71{
72 mutex_lock(&node->i_mutex);
73 list_del(&buffer->associates);
74 mutex_unlock(&node->i_mutex);
75}
76 62
77/** 63/**
78 * fill_read_buffer - allocate and fill buffer from object. 64 * fill_read_buffer - allocate and fill buffer from object.
@@ -87,9 +73,8 @@ remove_from_collection(struct sysfs_buffer *buffer, struct inode *node)
87 */ 73 */
88static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) 74static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
89{ 75{
90 struct sysfs_dirent * sd = dentry->d_fsdata; 76 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
91 struct attribute * attr = to_attr(dentry); 77 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
92 struct kobject * kobj = to_kobj(dentry->d_parent);
93 struct sysfs_ops * ops = buffer->ops; 78 struct sysfs_ops * ops = buffer->ops;
94 int ret = 0; 79 int ret = 0;
95 ssize_t count; 80 ssize_t count;
@@ -99,8 +84,15 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
99 if (!buffer->page) 84 if (!buffer->page)
100 return -ENOMEM; 85 return -ENOMEM;
101 86
102 buffer->event = atomic_read(&sd->s_event); 87 /* need attr_sd for attr and ops, its parent for kobj */
103 count = ops->show(kobj,attr,buffer->page); 88 if (!sysfs_get_active_two(attr_sd))
89 return -ENODEV;
90
91 buffer->event = atomic_read(&attr_sd->s_event);
92 count = ops->show(kobj, attr_sd->s_elem.attr.attr, buffer->page);
93
94 sysfs_put_active_two(attr_sd);
95
104 BUG_ON(count > (ssize_t)PAGE_SIZE); 96 BUG_ON(count > (ssize_t)PAGE_SIZE);
105 if (count >= 0) { 97 if (count >= 0) {
106 buffer->needs_read_fill = 0; 98 buffer->needs_read_fill = 0;
@@ -138,10 +130,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
138 130
139 down(&buffer->sem); 131 down(&buffer->sem);
140 if (buffer->needs_read_fill) { 132 if (buffer->needs_read_fill) {
141 if (buffer->orphaned) 133 retval = fill_read_buffer(file->f_path.dentry,buffer);
142 retval = -ENODEV;
143 else
144 retval = fill_read_buffer(file->f_path.dentry,buffer);
145 if (retval) 134 if (retval)
146 goto out; 135 goto out;
147 } 136 }
@@ -196,14 +185,23 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t
196 * passing the buffer that we acquired in fill_write_buffer(). 185 * passing the buffer that we acquired in fill_write_buffer().
197 */ 186 */
198 187
199static int 188static int
200flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count) 189flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count)
201{ 190{
202 struct attribute * attr = to_attr(dentry); 191 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
203 struct kobject * kobj = to_kobj(dentry->d_parent); 192 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
204 struct sysfs_ops * ops = buffer->ops; 193 struct sysfs_ops * ops = buffer->ops;
194 int rc;
195
196 /* need attr_sd for attr and ops, its parent for kobj */
197 if (!sysfs_get_active_two(attr_sd))
198 return -ENODEV;
199
200 rc = ops->store(kobj, attr_sd->s_elem.attr.attr, buffer->page, count);
205 201
206 return ops->store(kobj,attr,buffer->page,count); 202 sysfs_put_active_two(attr_sd);
203
204 return rc;
207} 205}
208 206
209 207
@@ -231,37 +229,26 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t
231 ssize_t len; 229 ssize_t len;
232 230
233 down(&buffer->sem); 231 down(&buffer->sem);
234 if (buffer->orphaned) {
235 len = -ENODEV;
236 goto out;
237 }
238 len = fill_write_buffer(buffer, buf, count); 232 len = fill_write_buffer(buffer, buf, count);
239 if (len > 0) 233 if (len > 0)
240 len = flush_write_buffer(file->f_path.dentry, buffer, len); 234 len = flush_write_buffer(file->f_path.dentry, buffer, len);
241 if (len > 0) 235 if (len > 0)
242 *ppos += len; 236 *ppos += len;
243out:
244 up(&buffer->sem); 237 up(&buffer->sem);
245 return len; 238 return len;
246} 239}
247 240
248static int sysfs_open_file(struct inode *inode, struct file *file) 241static int sysfs_open_file(struct inode *inode, struct file *file)
249{ 242{
250 struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); 243 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
251 struct attribute * attr = to_attr(file->f_path.dentry); 244 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
252 struct sysfs_buffer_collection *set;
253 struct sysfs_buffer * buffer; 245 struct sysfs_buffer * buffer;
254 struct sysfs_ops * ops = NULL; 246 struct sysfs_ops * ops = NULL;
255 int error = 0; 247 int error;
256
257 if (!kobj || !attr)
258 goto Einval;
259 248
260 /* Grab the module reference for this attribute if we have one */ 249 /* need attr_sd for attr and ops, its parent for kobj */
261 if (!try_module_get(attr->owner)) { 250 if (!sysfs_get_active_two(attr_sd))
262 error = -ENODEV; 251 return -ENODEV;
263 goto Done;
264 }
265 252
266 /* if the kobject has no ktype, then we assume that it is a subsystem 253 /* if the kobject has no ktype, then we assume that it is a subsystem
267 * itself, and use ops for it. 254 * itself, and use ops for it.
@@ -273,33 +260,21 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
273 else 260 else
274 ops = &subsys_sysfs_ops; 261 ops = &subsys_sysfs_ops;
275 262
263 error = -EACCES;
264
276 /* No sysfs operations, either from having no subsystem, 265 /* No sysfs operations, either from having no subsystem,
277 * or the subsystem have no operations. 266 * or the subsystem have no operations.
278 */ 267 */
279 if (!ops) 268 if (!ops)
280 goto Eaccess; 269 goto err_out;
281
282 /* make sure we have a collection to add our buffers to */
283 mutex_lock(&inode->i_mutex);
284 if (!(set = inode->i_private)) {
285 if (!(set = inode->i_private = kmalloc(sizeof(struct sysfs_buffer_collection), GFP_KERNEL))) {
286 error = -ENOMEM;
287 goto Done;
288 } else {
289 INIT_LIST_HEAD(&set->associates);
290 }
291 }
292 mutex_unlock(&inode->i_mutex);
293 270
294 /* File needs write support. 271 /* File needs write support.
295 * The inode's perms must say it's ok, 272 * The inode's perms must say it's ok,
296 * and we must have a store method. 273 * and we must have a store method.
297 */ 274 */
298 if (file->f_mode & FMODE_WRITE) { 275 if (file->f_mode & FMODE_WRITE) {
299
300 if (!(inode->i_mode & S_IWUGO) || !ops->store) 276 if (!(inode->i_mode & S_IWUGO) || !ops->store)
301 goto Eaccess; 277 goto err_out;
302
303 } 278 }
304 279
305 /* File needs read support. 280 /* File needs read support.
@@ -308,48 +283,38 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
308 */ 283 */
309 if (file->f_mode & FMODE_READ) { 284 if (file->f_mode & FMODE_READ) {
310 if (!(inode->i_mode & S_IRUGO) || !ops->show) 285 if (!(inode->i_mode & S_IRUGO) || !ops->show)
311 goto Eaccess; 286 goto err_out;
312 } 287 }
313 288
314 /* No error? Great, allocate a buffer for the file, and store it 289 /* No error? Great, allocate a buffer for the file, and store it
315 * it in file->private_data for easy access. 290 * it in file->private_data for easy access.
316 */ 291 */
292 error = -ENOMEM;
317 buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); 293 buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL);
318 if (buffer) { 294 if (!buffer)
319 INIT_LIST_HEAD(&buffer->associates); 295 goto err_out;
320 init_MUTEX(&buffer->sem); 296
321 buffer->needs_read_fill = 1; 297 init_MUTEX(&buffer->sem);
322 buffer->ops = ops; 298 buffer->needs_read_fill = 1;
323 add_to_collection(buffer, inode); 299 buffer->ops = ops;
324 file->private_data = buffer; 300 file->private_data = buffer;
325 } else 301
326 error = -ENOMEM; 302 /* open succeeded, put active references and pin attr_sd */
327 goto Done; 303 sysfs_put_active_two(attr_sd);
328 304 sysfs_get(attr_sd);
329 Einval: 305 return 0;
330 error = -EINVAL; 306
331 goto Done; 307 err_out:
332 Eaccess: 308 sysfs_put_active_two(attr_sd);
333 error = -EACCES;
334 module_put(attr->owner);
335 Done:
336 if (error)
337 kobject_put(kobj);
338 return error; 309 return error;
339} 310}
340 311
341static int sysfs_release(struct inode * inode, struct file * filp) 312static int sysfs_release(struct inode * inode, struct file * filp)
342{ 313{
343 struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent); 314 struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
344 struct attribute * attr = to_attr(filp->f_path.dentry); 315 struct sysfs_buffer *buffer = filp->private_data;
345 struct module * owner = attr->owner;
346 struct sysfs_buffer * buffer = filp->private_data;
347 316
348 if (buffer) 317 sysfs_put(attr_sd);
349 remove_from_collection(buffer, inode);
350 kobject_put(kobj);
351 /* After this point, attr should not be accessed. */
352 module_put(owner);
353 318
354 if (buffer) { 319 if (buffer) {
355 if (buffer->page) 320 if (buffer->page)
@@ -376,57 +341,43 @@ static int sysfs_release(struct inode * inode, struct file * filp)
376static unsigned int sysfs_poll(struct file *filp, poll_table *wait) 341static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
377{ 342{
378 struct sysfs_buffer * buffer = filp->private_data; 343 struct sysfs_buffer * buffer = filp->private_data;
379 struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent); 344 struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
380 struct sysfs_dirent * sd = filp->f_path.dentry->d_fsdata; 345 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
381 int res = 0; 346
347 /* need parent for the kobj, grab both */
348 if (!sysfs_get_active_two(attr_sd))
349 goto trigger;
382 350
383 poll_wait(filp, &kobj->poll, wait); 351 poll_wait(filp, &kobj->poll, wait);
384 352
385 if (buffer->event != atomic_read(&sd->s_event)) { 353 sysfs_put_active_two(attr_sd);
386 res = POLLERR|POLLPRI;
387 buffer->needs_read_fill = 1;
388 }
389 354
390 return res; 355 if (buffer->event != atomic_read(&attr_sd->s_event))
391} 356 goto trigger;
392 357
358 return 0;
393 359
394static struct dentry *step_down(struct dentry *dir, const char * name) 360 trigger:
395{ 361 buffer->needs_read_fill = 1;
396 struct dentry * de; 362 return POLLERR|POLLPRI;
397
398 if (dir == NULL || dir->d_inode == NULL)
399 return NULL;
400
401 mutex_lock(&dir->d_inode->i_mutex);
402 de = lookup_one_len(name, dir, strlen(name));
403 mutex_unlock(&dir->d_inode->i_mutex);
404 dput(dir);
405 if (IS_ERR(de))
406 return NULL;
407 if (de->d_inode == NULL) {
408 dput(de);
409 return NULL;
410 }
411 return de;
412} 363}
413 364
414void sysfs_notify(struct kobject * k, char *dir, char *attr) 365void sysfs_notify(struct kobject *k, char *dir, char *attr)
415{ 366{
416 struct dentry *de = k->dentry; 367 struct sysfs_dirent *sd = k->sd;
417 if (de) 368
418 dget(de); 369 mutex_lock(&sysfs_mutex);
419 if (de && dir) 370
420 de = step_down(de, dir); 371 if (sd && dir)
421 if (de && attr) 372 sd = sysfs_find_dirent(sd, dir);
422 de = step_down(de, attr); 373 if (sd && attr)
423 if (de) { 374 sd = sysfs_find_dirent(sd, attr);
424 struct sysfs_dirent * sd = de->d_fsdata; 375 if (sd) {
425 if (sd) 376 atomic_inc(&sd->s_event);
426 atomic_inc(&sd->s_event);
427 wake_up_interruptible(&k->poll); 377 wake_up_interruptible(&k->poll);
428 dput(de);
429 } 378 }
379
380 mutex_unlock(&sysfs_mutex);
430} 381}
431EXPORT_SYMBOL_GPL(sysfs_notify); 382EXPORT_SYMBOL_GPL(sysfs_notify);
432 383
@@ -440,19 +391,30 @@ const struct file_operations sysfs_file_operations = {
440}; 391};
441 392
442 393
443int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) 394int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
395 int type)
444{ 396{
445 struct sysfs_dirent * parent_sd = dir->d_fsdata;
446 umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; 397 umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG;
447 int error = -EEXIST; 398 struct sysfs_addrm_cxt acxt;
399 struct sysfs_dirent *sd;
448 400
449 mutex_lock(&dir->d_inode->i_mutex); 401 sd = sysfs_new_dirent(attr->name, mode, type);
450 if (!sysfs_dirent_exist(parent_sd, attr->name)) 402 if (!sd)
451 error = sysfs_make_dirent(parent_sd, NULL, (void *)attr, 403 return -ENOMEM;
452 mode, type); 404 sd->s_elem.attr.attr = (void *)attr;
453 mutex_unlock(&dir->d_inode->i_mutex);
454 405
455 return error; 406 sysfs_addrm_start(&acxt, dir_sd);
407
408 if (!sysfs_find_dirent(dir_sd, attr->name)) {
409 sysfs_add_one(&acxt, sd);
410 sysfs_link_sibling(sd);
411 }
412
413 if (sysfs_addrm_finish(&acxt))
414 return 0;
415
416 sysfs_put(sd);
417 return -EEXIST;
456} 418}
457 419
458 420
@@ -464,9 +426,9 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type)
464 426
465int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) 427int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
466{ 428{
467 BUG_ON(!kobj || !kobj->dentry || !attr); 429 BUG_ON(!kobj || !kobj->sd || !attr);
468 430
469 return sysfs_add_file(kobj->dentry, attr, SYSFS_KOBJ_ATTR); 431 return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR);
470 432
471} 433}
472 434
@@ -480,16 +442,16 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
480int sysfs_add_file_to_group(struct kobject *kobj, 442int sysfs_add_file_to_group(struct kobject *kobj,
481 const struct attribute *attr, const char *group) 443 const struct attribute *attr, const char *group)
482{ 444{
483 struct dentry *dir; 445 struct sysfs_dirent *dir_sd;
484 int error; 446 int error;
485 447
486 dir = lookup_one_len(group, kobj->dentry, strlen(group)); 448 dir_sd = sysfs_get_dirent(kobj->sd, group);
487 if (IS_ERR(dir)) 449 if (!dir_sd)
488 error = PTR_ERR(dir); 450 return -ENOENT;
489 else { 451
490 error = sysfs_add_file(dir, attr, SYSFS_KOBJ_ATTR); 452 error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR);
491 dput(dir); 453 sysfs_put(dir_sd);
492 } 454
493 return error; 455 return error;
494} 456}
495EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); 457EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
@@ -502,30 +464,31 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
502 */ 464 */
503int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) 465int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
504{ 466{
505 struct dentry * dir = kobj->dentry; 467 struct sysfs_dirent *victim_sd = NULL;
506 struct dentry * victim; 468 struct dentry *victim = NULL;
507 int res = -ENOENT; 469 int rc;
508 470
509 mutex_lock(&dir->d_inode->i_mutex); 471 rc = -ENOENT;
510 victim = lookup_one_len(attr->name, dir, strlen(attr->name)); 472 victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
511 if (!IS_ERR(victim)) { 473 if (!victim_sd)
512 /* make sure dentry is really there */ 474 goto out;
513 if (victim->d_inode && 475
514 (victim->d_parent->d_inode == dir->d_inode)) { 476 victim = sysfs_get_dentry(victim_sd);
515 victim->d_inode->i_mtime = CURRENT_TIME; 477 if (IS_ERR(victim)) {
516 fsnotify_modify(victim); 478 rc = PTR_ERR(victim);
517 res = 0; 479 victim = NULL;
518 } else 480 goto out;
519 d_drop(victim);
520
521 /**
522 * Drop the reference acquired from lookup_one_len() above.
523 */
524 dput(victim);
525 } 481 }
526 mutex_unlock(&dir->d_inode->i_mutex);
527 482
528 return res; 483 mutex_lock(&victim->d_inode->i_mutex);
484 victim->d_inode->i_mtime = CURRENT_TIME;
485 fsnotify_modify(victim);
486 mutex_unlock(&victim->d_inode->i_mutex);
487 rc = 0;
488 out:
489 dput(victim);
490 sysfs_put(victim_sd);
491 return rc;
529} 492}
530 493
531 494
@@ -538,30 +501,34 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
538 */ 501 */
539int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) 502int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
540{ 503{
541 struct dentry *dir = kobj->dentry; 504 struct sysfs_dirent *victim_sd = NULL;
542 struct dentry *victim; 505 struct dentry *victim = NULL;
543 struct inode * inode; 506 struct inode * inode;
544 struct iattr newattrs; 507 struct iattr newattrs;
545 int res = -ENOENT; 508 int rc;
546 509
547 mutex_lock(&dir->d_inode->i_mutex); 510 rc = -ENOENT;
548 victim = lookup_one_len(attr->name, dir, strlen(attr->name)); 511 victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
549 if (!IS_ERR(victim)) { 512 if (!victim_sd)
550 if (victim->d_inode && 513 goto out;
551 (victim->d_parent->d_inode == dir->d_inode)) { 514
552 inode = victim->d_inode; 515 victim = sysfs_get_dentry(victim_sd);
553 mutex_lock(&inode->i_mutex); 516 if (IS_ERR(victim)) {
554 newattrs.ia_mode = (mode & S_IALLUGO) | 517 rc = PTR_ERR(victim);
555 (inode->i_mode & ~S_IALLUGO); 518 victim = NULL;
556 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 519 goto out;
557 res = notify_change(victim, &newattrs);
558 mutex_unlock(&inode->i_mutex);
559 }
560 dput(victim);
561 } 520 }
562 mutex_unlock(&dir->d_inode->i_mutex);
563 521
564 return res; 522 inode = victim->d_inode;
523 mutex_lock(&inode->i_mutex);
524 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
525 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
526 rc = notify_change(victim, &newattrs);
527 mutex_unlock(&inode->i_mutex);
528 out:
529 dput(victim);
530 sysfs_put(victim_sd);
531 return rc;
565} 532}
566EXPORT_SYMBOL_GPL(sysfs_chmod_file); 533EXPORT_SYMBOL_GPL(sysfs_chmod_file);
567 534
@@ -576,7 +543,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
576 543
577void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 544void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
578{ 545{
579 sysfs_hash_and_remove(kobj->dentry, attr->name); 546 sysfs_hash_and_remove(kobj->sd, attr->name);
580} 547}
581 548
582 549
@@ -589,12 +556,12 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
589void sysfs_remove_file_from_group(struct kobject *kobj, 556void sysfs_remove_file_from_group(struct kobject *kobj,
590 const struct attribute *attr, const char *group) 557 const struct attribute *attr, const char *group)
591{ 558{
592 struct dentry *dir; 559 struct sysfs_dirent *dir_sd;
593 560
594 dir = lookup_one_len(group, kobj->dentry, strlen(group)); 561 dir_sd = sysfs_get_dirent(kobj->sd, group);
595 if (!IS_ERR(dir)) { 562 if (dir_sd) {
596 sysfs_hash_and_remove(dir, attr->name); 563 sysfs_hash_and_remove(dir_sd, attr->name);
597 dput(dir); 564 sysfs_put(dir_sd);
598 } 565 }
599} 566}
600EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); 567EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 52eed2a7a5..f318b73c79 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,26 +18,25 @@
18#include "sysfs.h" 18#include "sysfs.h"
19 19
20 20
21static void remove_files(struct dentry * dir, 21static void remove_files(struct sysfs_dirent *dir_sd,
22 const struct attribute_group * grp) 22 const struct attribute_group *grp)
23{ 23{
24 struct attribute *const* attr; 24 struct attribute *const* attr;
25 25
26 for (attr = grp->attrs; *attr; attr++) 26 for (attr = grp->attrs; *attr; attr++)
27 sysfs_hash_and_remove(dir,(*attr)->name); 27 sysfs_hash_and_remove(dir_sd, (*attr)->name);
28} 28}
29 29
30static int create_files(struct dentry * dir, 30static int create_files(struct sysfs_dirent *dir_sd,
31 const struct attribute_group * grp) 31 const struct attribute_group *grp)
32{ 32{
33 struct attribute *const* attr; 33 struct attribute *const* attr;
34 int error = 0; 34 int error = 0;
35 35
36 for (attr = grp->attrs; *attr && !error; attr++) { 36 for (attr = grp->attrs; *attr && !error; attr++)
37 error = sysfs_add_file(dir, *attr, SYSFS_KOBJ_ATTR); 37 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
38 }
39 if (error) 38 if (error)
40 remove_files(dir,grp); 39 remove_files(dir_sd, grp);
41 return error; 40 return error;
42} 41}
43 42
@@ -45,44 +44,44 @@ static int create_files(struct dentry * dir,
45int sysfs_create_group(struct kobject * kobj, 44int sysfs_create_group(struct kobject * kobj,
46 const struct attribute_group * grp) 45 const struct attribute_group * grp)
47{ 46{
48 struct dentry * dir; 47 struct sysfs_dirent *sd;
49 int error; 48 int error;
50 49
51 BUG_ON(!kobj || !kobj->dentry); 50 BUG_ON(!kobj || !kobj->sd);
52 51
53 if (grp->name) { 52 if (grp->name) {
54 error = sysfs_create_subdir(kobj,grp->name,&dir); 53 error = sysfs_create_subdir(kobj, grp->name, &sd);
55 if (error) 54 if (error)
56 return error; 55 return error;
57 } else 56 } else
58 dir = kobj->dentry; 57 sd = kobj->sd;
59 dir = dget(dir); 58 sysfs_get(sd);
60 if ((error = create_files(dir,grp))) { 59 error = create_files(sd, grp);
60 if (error) {
61 if (grp->name) 61 if (grp->name)
62 sysfs_remove_subdir(dir); 62 sysfs_remove_subdir(sd);
63 } 63 }
64 dput(dir); 64 sysfs_put(sd);
65 return error; 65 return error;
66} 66}
67 67
68void sysfs_remove_group(struct kobject * kobj, 68void sysfs_remove_group(struct kobject * kobj,
69 const struct attribute_group * grp) 69 const struct attribute_group * grp)
70{ 70{
71 struct dentry * dir; 71 struct sysfs_dirent *dir_sd = kobj->sd;
72 struct sysfs_dirent *sd;
72 73
73 if (grp->name) { 74 if (grp->name) {
74 dir = lookup_one_len_kern(grp->name, kobj->dentry, 75 sd = sysfs_get_dirent(dir_sd, grp->name);
75 strlen(grp->name)); 76 BUG_ON(!sd);
76 BUG_ON(IS_ERR(dir)); 77 } else
77 } 78 sd = sysfs_get(dir_sd);
78 else
79 dir = dget(kobj->dentry);
80 79
81 remove_files(dir,grp); 80 remove_files(sd, grp);
82 if (grp->name) 81 if (grp->name)
83 sysfs_remove_subdir(dir); 82 sysfs_remove_subdir(sd);
84 /* release the ref. taken in this routine */ 83
85 dput(dir); 84 sysfs_put(sd);
86} 85}
87 86
88 87
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 5266eec15f..3756e15228 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -133,187 +133,94 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
133 */ 133 */
134static struct lock_class_key sysfs_inode_imutex_key; 134static struct lock_class_key sysfs_inode_imutex_key;
135 135
136struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) 136void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
137{ 137{
138 struct inode * inode = new_inode(sysfs_sb); 138 inode->i_blocks = 0;
139 if (inode) { 139 inode->i_mapping->a_ops = &sysfs_aops;
140 inode->i_blocks = 0; 140 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
141 inode->i_mapping->a_ops = &sysfs_aops; 141 inode->i_op = &sysfs_inode_operations;
142 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 142 inode->i_ino = sd->s_ino;
143 inode->i_op = &sysfs_inode_operations; 143 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
144 inode->i_ino = sd->s_ino; 144
145 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); 145 if (sd->s_iattr) {
146 146 /* sysfs_dirent has non-default attributes
147 if (sd->s_iattr) { 147 * get them for the new inode from persistent copy
148 /* sysfs_dirent has non-default attributes 148 * in sysfs_dirent
149 * get them for the new inode from persistent copy 149 */
150 * in sysfs_dirent 150 set_inode_attr(inode, sd->s_iattr);
151 */
152 set_inode_attr(inode, sd->s_iattr);
153 } else
154 set_default_inode_attr(inode, mode);
155 }
156 return inode;
157}
158
159int sysfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
160{
161 int error = 0;
162 struct inode * inode = NULL;
163 if (dentry) {
164 if (!dentry->d_inode) {
165 struct sysfs_dirent * sd = dentry->d_fsdata;
166 if ((inode = sysfs_new_inode(mode, sd))) {
167 if (dentry->d_parent && dentry->d_parent->d_inode) {
168 struct inode *p_inode = dentry->d_parent->d_inode;
169 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
170 }
171 goto Proceed;
172 }
173 else
174 error = -ENOMEM;
175 } else
176 error = -EEXIST;
177 } else
178 error = -ENOENT;
179 goto Done;
180
181 Proceed:
182 if (init)
183 error = init(inode);
184 if (!error) {
185 d_instantiate(dentry, inode);
186 if (S_ISDIR(mode))
187 dget(dentry); /* pin only directory dentry in core */
188 } else 151 } else
189 iput(inode); 152 set_default_inode_attr(inode, sd->s_mode);
190 Done:
191 return error;
192} 153}
193 154
194/* 155/**
195 * Get the name for corresponding element represented by the given sysfs_dirent 156 * sysfs_get_inode - get inode for sysfs_dirent
157 * @sd: sysfs_dirent to allocate inode for
158 *
159 * Get inode for @sd. If such inode doesn't exist, a new inode
160 * is allocated and basics are initialized. New inode is
161 * returned locked.
162 *
163 * LOCKING:
164 * Kernel thread context (may sleep).
165 *
166 * RETURNS:
167 * Pointer to allocated inode on success, NULL on failure.
196 */ 168 */
197const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) 169struct inode * sysfs_get_inode(struct sysfs_dirent *sd)
198{ 170{
199 struct attribute * attr; 171 struct inode *inode;
200 struct bin_attribute * bin_attr;
201 struct sysfs_symlink * sl;
202
203 BUG_ON(!sd || !sd->s_element);
204
205 switch (sd->s_type) {
206 case SYSFS_DIR:
207 /* Always have a dentry so use that */
208 return sd->s_dentry->d_name.name;
209
210 case SYSFS_KOBJ_ATTR:
211 attr = sd->s_element;
212 return attr->name;
213
214 case SYSFS_KOBJ_BIN_ATTR:
215 bin_attr = sd->s_element;
216 return bin_attr->attr.name;
217 172
218 case SYSFS_KOBJ_LINK: 173 inode = iget_locked(sysfs_sb, sd->s_ino);
219 sl = sd->s_element; 174 if (inode && (inode->i_state & I_NEW))
220 return sl->link_name; 175 sysfs_init_inode(sd, inode);
221 }
222 return NULL;
223}
224 176
225static inline void orphan_all_buffers(struct inode *node) 177 return inode;
226{
227 struct sysfs_buffer_collection *set;
228 struct sysfs_buffer *buf;
229
230 mutex_lock_nested(&node->i_mutex, I_MUTEX_CHILD);
231 set = node->i_private;
232 if (set) {
233 list_for_each_entry(buf, &set->associates, associates) {
234 down(&buf->sem);
235 buf->orphaned = 1;
236 up(&buf->sem);
237 }
238 }
239 mutex_unlock(&node->i_mutex);
240} 178}
241 179
242 180/**
243/* 181 * sysfs_instantiate - instantiate dentry
244 * Unhashes the dentry corresponding to given sysfs_dirent 182 * @dentry: dentry to be instantiated
245 * Called with parent inode's i_mutex held. 183 * @inode: inode associated with @sd
184 *
185 * Unlock @inode if locked and instantiate @dentry with @inode.
186 *
187 * LOCKING:
188 * None.
246 */ 189 */
247void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) 190void sysfs_instantiate(struct dentry *dentry, struct inode *inode)
248{ 191{
249 struct dentry *dentry = NULL; 192 BUG_ON(!dentry || dentry->d_inode);
250 struct inode *inode;
251 193
252 /* We're not holding a reference to ->s_dentry dentry but the 194 if (inode->i_state & I_NEW)
253 * field will stay valid as long as sysfs_lock is held. 195 unlock_new_inode(inode);
254 */
255 spin_lock(&sysfs_lock);
256 spin_lock(&dcache_lock);
257
258 /* dget dentry if it's still alive */
259 if (sd->s_dentry && sd->s_dentry->d_inode)
260 dentry = dget_locked(sd->s_dentry);
261
262 spin_unlock(&dcache_lock);
263 spin_unlock(&sysfs_lock);
264
265 /* drop dentry */
266 if (dentry) {
267 spin_lock(&dcache_lock);
268 spin_lock(&dentry->d_lock);
269 if (!d_unhashed(dentry) && dentry->d_inode) {
270 inode = dentry->d_inode;
271 spin_lock(&inode->i_lock);
272 __iget(inode);
273 spin_unlock(&inode->i_lock);
274 dget_locked(dentry);
275 __d_drop(dentry);
276 spin_unlock(&dentry->d_lock);
277 spin_unlock(&dcache_lock);
278 simple_unlink(parent->d_inode, dentry);
279 orphan_all_buffers(inode);
280 iput(inode);
281 } else {
282 spin_unlock(&dentry->d_lock);
283 spin_unlock(&dcache_lock);
284 }
285 196
286 dput(dentry); 197 d_instantiate(dentry, inode);
287 }
288} 198}
289 199
290int sysfs_hash_and_remove(struct dentry * dir, const char * name) 200int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
291{ 201{
292 struct sysfs_dirent * sd; 202 struct sysfs_addrm_cxt acxt;
293 struct sysfs_dirent * parent_sd; 203 struct sysfs_dirent **pos, *sd;
294 int found = 0;
295 204
296 if (!dir) 205 if (!dir_sd)
297 return -ENOENT; 206 return -ENOENT;
298 207
299 if (dir->d_inode == NULL) 208 sysfs_addrm_start(&acxt, dir_sd);
300 /* no inode means this hasn't been made visible yet */ 209
301 return -ENOENT; 210 for (pos = &dir_sd->s_children; *pos; pos = &(*pos)->s_sibling) {
211 sd = *pos;
302 212
303 parent_sd = dir->d_fsdata; 213 if (!sysfs_type(sd))
304 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
305 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
306 if (!sd->s_element)
307 continue; 214 continue;
308 if (!strcmp(sysfs_get_name(sd), name)) { 215 if (!strcmp(sd->s_name, name)) {
309 list_del_init(&sd->s_sibling); 216 *pos = sd->s_sibling;
310 sysfs_drop_dentry(sd, dir); 217 sd->s_sibling = NULL;
311 sysfs_put(sd); 218 sysfs_remove_one(&acxt, sd);
312 found = 1;
313 break; 219 break;
314 } 220 }
315 } 221 }
316 mutex_unlock(&dir->d_inode->i_mutex);
317 222
318 return found ? 0 : -ENOENT; 223 if (sysfs_addrm_finish(&acxt))
224 return 0;
225 return -ENOENT;
319} 226}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 00ab9125d3..402cc35620 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -19,28 +19,18 @@ struct vfsmount *sysfs_mount;
19struct super_block * sysfs_sb = NULL; 19struct super_block * sysfs_sb = NULL;
20struct kmem_cache *sysfs_dir_cachep; 20struct kmem_cache *sysfs_dir_cachep;
21 21
22static void sysfs_clear_inode(struct inode *inode);
23
24static const struct super_operations sysfs_ops = { 22static const struct super_operations sysfs_ops = {
25 .statfs = simple_statfs, 23 .statfs = simple_statfs,
26 .drop_inode = sysfs_delete_inode, 24 .drop_inode = sysfs_delete_inode,
27 .clear_inode = sysfs_clear_inode,
28}; 25};
29 26
30static struct sysfs_dirent sysfs_root = { 27struct sysfs_dirent sysfs_root = {
31 .s_sibling = LIST_HEAD_INIT(sysfs_root.s_sibling), 28 .s_count = ATOMIC_INIT(1),
32 .s_children = LIST_HEAD_INIT(sysfs_root.s_children), 29 .s_flags = SYSFS_ROOT,
33 .s_element = NULL, 30 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
34 .s_type = SYSFS_ROOT,
35 .s_iattr = NULL,
36 .s_ino = 1, 31 .s_ino = 1,
37}; 32};
38 33
39static void sysfs_clear_inode(struct inode *inode)
40{
41 kfree(inode->i_private);
42}
43
44static int sysfs_fill_super(struct super_block *sb, void *data, int silent) 34static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
45{ 35{
46 struct inode *inode; 36 struct inode *inode;
@@ -53,24 +43,26 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
53 sb->s_time_gran = 1; 43 sb->s_time_gran = 1;
54 sysfs_sb = sb; 44 sysfs_sb = sb;
55 45
56 inode = sysfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 46 inode = new_inode(sysfs_sb);
57 &sysfs_root); 47 if (!inode) {
58 if (inode) {
59 inode->i_op = &sysfs_dir_inode_operations;
60 inode->i_fop = &sysfs_dir_operations;
61 /* directory inodes start off with i_nlink == 2 (for "." entry) */
62 inc_nlink(inode);
63 } else {
64 pr_debug("sysfs: could not get root inode\n"); 48 pr_debug("sysfs: could not get root inode\n");
65 return -ENOMEM; 49 return -ENOMEM;
66 } 50 }
67 51
52 sysfs_init_inode(&sysfs_root, inode);
53
54 inode->i_op = &sysfs_dir_inode_operations;
55 inode->i_fop = &sysfs_dir_operations;
56 /* directory inodes start off with i_nlink == 2 (for "." entry) */
57 inc_nlink(inode);
58
68 root = d_alloc_root(inode); 59 root = d_alloc_root(inode);
69 if (!root) { 60 if (!root) {
70 pr_debug("%s: could not get root dentry!\n",__FUNCTION__); 61 pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
71 iput(inode); 62 iput(inode);
72 return -ENOMEM; 63 return -ENOMEM;
73 } 64 }
65 sysfs_root.s_dentry = root;
74 root->d_fsdata = &sysfs_root; 66 root->d_fsdata = &sysfs_root;
75 sb->s_root = root; 67 sb->s_root = root;
76 return 0; 68 return 0;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 7b9c5bfde9..2f86e04222 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,71 +11,39 @@
11 11
12#include "sysfs.h" 12#include "sysfs.h"
13 13
14static int object_depth(struct kobject * kobj) 14static int object_depth(struct sysfs_dirent *sd)
15{ 15{
16 struct kobject * p = kobj;
17 int depth = 0; 16 int depth = 0;
18 do { depth++; } while ((p = p->parent)); 17
18 for (; sd->s_parent; sd = sd->s_parent)
19 depth++;
20
19 return depth; 21 return depth;
20} 22}
21 23
22static int object_path_length(struct kobject * kobj) 24static int object_path_length(struct sysfs_dirent * sd)
23{ 25{
24 struct kobject * p = kobj;
25 int length = 1; 26 int length = 1;
26 do { 27
27 length += strlen(kobject_name(p)) + 1; 28 for (; sd->s_parent; sd = sd->s_parent)
28 p = p->parent; 29 length += strlen(sd->s_name) + 1;
29 } while (p); 30
30 return length; 31 return length;
31} 32}
32 33
33static void fill_object_path(struct kobject * kobj, char * buffer, int length) 34static void fill_object_path(struct sysfs_dirent *sd, char *buffer, int length)
34{ 35{
35 struct kobject * p;
36
37 --length; 36 --length;
38 for (p = kobj; p; p = p->parent) { 37 for (; sd->s_parent; sd = sd->s_parent) {
39 int cur = strlen(kobject_name(p)); 38 int cur = strlen(sd->s_name);
40 39
41 /* back up enough to print this bus id with '/' */ 40 /* back up enough to print this bus id with '/' */
42 length -= cur; 41 length -= cur;
43 strncpy(buffer + length,kobject_name(p),cur); 42 strncpy(buffer + length, sd->s_name, cur);
44 *(buffer + --length) = '/'; 43 *(buffer + --length) = '/';
45 } 44 }
46} 45}
47 46
48static int sysfs_add_link(struct dentry * parent, const char * name, struct kobject * target)
49{
50 struct sysfs_dirent * parent_sd = parent->d_fsdata;
51 struct sysfs_symlink * sl;
52 int error = 0;
53
54 error = -ENOMEM;
55 sl = kmalloc(sizeof(*sl), GFP_KERNEL);
56 if (!sl)
57 goto exit1;
58
59 sl->link_name = kmalloc(strlen(name) + 1, GFP_KERNEL);
60 if (!sl->link_name)
61 goto exit2;
62
63 strcpy(sl->link_name, name);
64 sl->target_kobj = kobject_get(target);
65
66 error = sysfs_make_dirent(parent_sd, NULL, sl, S_IFLNK|S_IRWXUGO,
67 SYSFS_KOBJ_LINK);
68 if (!error)
69 return 0;
70
71 kobject_put(target);
72 kfree(sl->link_name);
73exit2:
74 kfree(sl);
75exit1:
76 return error;
77}
78
79/** 47/**
80 * sysfs_create_link - create symlink between two objects. 48 * sysfs_create_link - create symlink between two objects.
81 * @kobj: object whose directory we're creating the link in. 49 * @kobj: object whose directory we're creating the link in.
@@ -84,24 +52,57 @@ exit1:
84 */ 52 */
85int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name) 53int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name)
86{ 54{
87 struct dentry *dentry = NULL; 55 struct sysfs_dirent *parent_sd = NULL;
88 int error = -EEXIST; 56 struct sysfs_dirent *target_sd = NULL;
57 struct sysfs_dirent *sd = NULL;
58 struct sysfs_addrm_cxt acxt;
59 int error;
89 60
90 BUG_ON(!name); 61 BUG_ON(!name);
91 62
92 if (!kobj) { 63 if (!kobj) {
93 if (sysfs_mount && sysfs_mount->mnt_sb) 64 if (sysfs_mount && sysfs_mount->mnt_sb)
94 dentry = sysfs_mount->mnt_sb->s_root; 65 parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata;
95 } else 66 } else
96 dentry = kobj->dentry; 67 parent_sd = kobj->sd;
68
69 error = -EFAULT;
70 if (!parent_sd)
71 goto out_put;
72
73 /* target->sd can go away beneath us but is protected with
74 * sysfs_assoc_lock. Fetch target_sd from it.
75 */
76 spin_lock(&sysfs_assoc_lock);
77 if (target->sd)
78 target_sd = sysfs_get(target->sd);
79 spin_unlock(&sysfs_assoc_lock);
80
81 error = -ENOENT;
82 if (!target_sd)
83 goto out_put;
84
85 error = -ENOMEM;
86 sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
87 if (!sd)
88 goto out_put;
89 sd->s_elem.symlink.target_sd = target_sd;
97 90
98 if (!dentry) 91 sysfs_addrm_start(&acxt, parent_sd);
99 return -EFAULT;
100 92
101 mutex_lock(&dentry->d_inode->i_mutex); 93 if (!sysfs_find_dirent(parent_sd, name)) {
102 if (!sysfs_dirent_exist(dentry->d_fsdata, name)) 94 sysfs_add_one(&acxt, sd);
103 error = sysfs_add_link(dentry, name, target); 95 sysfs_link_sibling(sd);
104 mutex_unlock(&dentry->d_inode->i_mutex); 96 }
97
98 if (sysfs_addrm_finish(&acxt))
99 return 0;
100
101 error = -EEXIST;
102 /* fall through */
103 out_put:
104 sysfs_put(target_sd);
105 sysfs_put(sd);
105 return error; 106 return error;
106} 107}
107 108
@@ -114,17 +115,17 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
114 115
115void sysfs_remove_link(struct kobject * kobj, const char * name) 116void sysfs_remove_link(struct kobject * kobj, const char * name)
116{ 117{
117 sysfs_hash_and_remove(kobj->dentry,name); 118 sysfs_hash_and_remove(kobj->sd, name);
118} 119}
119 120
120static int sysfs_get_target_path(struct kobject * kobj, struct kobject * target, 121static int sysfs_get_target_path(struct sysfs_dirent * parent_sd,
121 char *path) 122 struct sysfs_dirent * target_sd, char *path)
122{ 123{
123 char * s; 124 char * s;
124 int depth, size; 125 int depth, size;
125 126
126 depth = object_depth(kobj); 127 depth = object_depth(parent_sd);
127 size = object_path_length(target) + depth * 3 - 1; 128 size = object_path_length(target_sd) + depth * 3 - 1;
128 if (size > PATH_MAX) 129 if (size > PATH_MAX)
129 return -ENAMETOOLONG; 130 return -ENAMETOOLONG;
130 131
@@ -133,7 +134,7 @@ static int sysfs_get_target_path(struct kobject * kobj, struct kobject * target,
133 for (s = path; depth--; s += 3) 134 for (s = path; depth--; s += 3)
134 strcpy(s,"../"); 135 strcpy(s,"../");
135 136
136 fill_object_path(target, path, size); 137 fill_object_path(target_sd, path, size);
137 pr_debug("%s: path = '%s'\n", __FUNCTION__, path); 138 pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
138 139
139 return 0; 140 return 0;
@@ -141,27 +142,16 @@ static int sysfs_get_target_path(struct kobject * kobj, struct kobject * target,
141 142
142static int sysfs_getlink(struct dentry *dentry, char * path) 143static int sysfs_getlink(struct dentry *dentry, char * path)
143{ 144{
144 struct kobject *kobj, *target_kobj; 145 struct sysfs_dirent *sd = dentry->d_fsdata;
145 int error = 0; 146 struct sysfs_dirent *parent_sd = sd->s_parent;
147 struct sysfs_dirent *target_sd = sd->s_elem.symlink.target_sd;
148 int error;
146 149
147 kobj = sysfs_get_kobject(dentry->d_parent); 150 mutex_lock(&sysfs_mutex);
148 if (!kobj) 151 error = sysfs_get_target_path(parent_sd, target_sd, path);
149 return -EINVAL; 152 mutex_unlock(&sysfs_mutex);
150 153
151 target_kobj = sysfs_get_kobject(dentry);
152 if (!target_kobj) {
153 kobject_put(kobj);
154 return -EINVAL;
155 }
156
157 down_read(&sysfs_rename_sem);
158 error = sysfs_get_target_path(kobj, target_kobj, path);
159 up_read(&sysfs_rename_sem);
160
161 kobject_put(kobj);
162 kobject_put(target_kobj);
163 return error; 154 return error;
164
165} 155}
166 156
167static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) 157static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 502c949c40..6a37f2386a 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -1,9 +1,40 @@
1struct sysfs_elem_dir {
2 struct kobject * kobj;
3};
4
5struct sysfs_elem_symlink {
6 struct sysfs_dirent * target_sd;
7};
8
9struct sysfs_elem_attr {
10 struct attribute * attr;
11};
12
13struct sysfs_elem_bin_attr {
14 struct bin_attribute * bin_attr;
15};
16
17/*
18 * As long as s_count reference is held, the sysfs_dirent itself is
19 * accessible. Dereferencing s_elem or any other outer entity
20 * requires s_active reference.
21 */
1struct sysfs_dirent { 22struct sysfs_dirent {
2 atomic_t s_count; 23 atomic_t s_count;
3 struct list_head s_sibling; 24 atomic_t s_active;
4 struct list_head s_children; 25 struct sysfs_dirent * s_parent;
5 void * s_element; 26 struct sysfs_dirent * s_sibling;
6 int s_type; 27 struct sysfs_dirent * s_children;
28 const char * s_name;
29
30 union {
31 struct sysfs_elem_dir dir;
32 struct sysfs_elem_symlink symlink;
33 struct sysfs_elem_attr attr;
34 struct sysfs_elem_bin_attr bin_attr;
35 } s_elem;
36
37 unsigned int s_flags;
7 umode_t s_mode; 38 umode_t s_mode;
8 ino_t s_ino; 39 ino_t s_ino;
9 struct dentry * s_dentry; 40 struct dentry * s_dentry;
@@ -11,30 +42,60 @@ struct sysfs_dirent {
11 atomic_t s_event; 42 atomic_t s_event;
12}; 43};
13 44
45#define SD_DEACTIVATED_BIAS INT_MIN
46
47struct sysfs_addrm_cxt {
48 struct sysfs_dirent *parent_sd;
49 struct inode *parent_inode;
50 struct sysfs_dirent *removed;
51 int cnt;
52};
53
14extern struct vfsmount * sysfs_mount; 54extern struct vfsmount * sysfs_mount;
55extern struct sysfs_dirent sysfs_root;
15extern struct kmem_cache *sysfs_dir_cachep; 56extern struct kmem_cache *sysfs_dir_cachep;
16 57
17extern void sysfs_delete_inode(struct inode *inode); 58extern struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
18extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); 59extern void sysfs_link_sibling(struct sysfs_dirent *sd);
19extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); 60extern void sysfs_unlink_sibling(struct sysfs_dirent *sd);
61extern struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
62extern void sysfs_put_active(struct sysfs_dirent *sd);
63extern struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
64extern void sysfs_put_active_two(struct sysfs_dirent *sd);
65extern void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
66 struct sysfs_dirent *parent_sd);
67extern void sysfs_add_one(struct sysfs_addrm_cxt *acxt,
68 struct sysfs_dirent *sd);
69extern void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
70 struct sysfs_dirent *sd);
71extern int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
20 72
21extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *); 73extern void sysfs_delete_inode(struct inode *inode);
22extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, 74extern void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode);
23 umode_t, int); 75extern struct inode * sysfs_get_inode(struct sysfs_dirent *sd);
24 76extern void sysfs_instantiate(struct dentry *dentry, struct inode *inode);
25extern int sysfs_add_file(struct dentry *, const struct attribute *, int); 77
26extern int sysfs_hash_and_remove(struct dentry * dir, const char * name); 78extern void release_sysfs_dirent(struct sysfs_dirent * sd);
79extern struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
80 const unsigned char *name);
81extern struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
82 const unsigned char *name);
83extern struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode,
84 int type);
85
86extern int sysfs_add_file(struct sysfs_dirent *dir_sd,
87 const struct attribute *attr, int type);
88extern int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
27extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); 89extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name);
28 90
29extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); 91extern int sysfs_create_subdir(struct kobject *kobj, const char *name,
30extern void sysfs_remove_subdir(struct dentry *); 92 struct sysfs_dirent **p_sd);
93extern void sysfs_remove_subdir(struct sysfs_dirent *sd);
31 94
32extern const unsigned char * sysfs_get_name(struct sysfs_dirent *sd);
33extern void sysfs_drop_dentry(struct sysfs_dirent *sd, struct dentry *parent);
34extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 95extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
35 96
36extern spinlock_t sysfs_lock; 97extern spinlock_t sysfs_assoc_lock;
37extern struct rw_semaphore sysfs_rename_sem; 98extern struct mutex sysfs_mutex;
38extern struct super_block * sysfs_sb; 99extern struct super_block * sysfs_sb;
39extern const struct file_operations sysfs_dir_operations; 100extern const struct file_operations sysfs_dir_operations;
40extern const struct file_operations sysfs_file_operations; 101extern const struct file_operations sysfs_file_operations;
@@ -42,73 +103,9 @@ extern const struct file_operations bin_fops;
42extern const struct inode_operations sysfs_dir_inode_operations; 103extern const struct inode_operations sysfs_dir_inode_operations;
43extern const struct inode_operations sysfs_symlink_inode_operations; 104extern const struct inode_operations sysfs_symlink_inode_operations;
44 105
45struct sysfs_symlink { 106static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
46 char * link_name;
47 struct kobject * target_kobj;
48};
49
50struct sysfs_buffer {
51 struct list_head associates;
52 size_t count;
53 loff_t pos;
54 char * page;
55 struct sysfs_ops * ops;
56 struct semaphore sem;
57 int orphaned;
58 int needs_read_fill;
59 int event;
60};
61
62struct sysfs_buffer_collection {
63 struct list_head associates;
64};
65
66static inline struct kobject * to_kobj(struct dentry * dentry)
67{
68 struct sysfs_dirent * sd = dentry->d_fsdata;
69 return ((struct kobject *) sd->s_element);
70}
71
72static inline struct attribute * to_attr(struct dentry * dentry)
73{ 107{
74 struct sysfs_dirent * sd = dentry->d_fsdata; 108 return sd->s_flags & SYSFS_TYPE_MASK;
75 return ((struct attribute *) sd->s_element);
76}
77
78static inline struct bin_attribute * to_bin_attr(struct dentry * dentry)
79{
80 struct sysfs_dirent * sd = dentry->d_fsdata;
81 return ((struct bin_attribute *) sd->s_element);
82}
83
84static inline struct kobject *sysfs_get_kobject(struct dentry *dentry)
85{
86 struct kobject * kobj = NULL;
87
88 spin_lock(&dcache_lock);
89 if (!d_unhashed(dentry)) {
90 struct sysfs_dirent * sd = dentry->d_fsdata;
91 if (sd->s_type & SYSFS_KOBJ_LINK) {
92 struct sysfs_symlink * sl = sd->s_element;
93 kobj = kobject_get(sl->target_kobj);
94 } else
95 kobj = kobject_get(sd->s_element);
96 }
97 spin_unlock(&dcache_lock);
98
99 return kobj;
100}
101
102static inline void release_sysfs_dirent(struct sysfs_dirent * sd)
103{
104 if (sd->s_type & SYSFS_KOBJ_LINK) {
105 struct sysfs_symlink * sl = sd->s_element;
106 kfree(sl->link_name);
107 kobject_put(sl->target_kobj);
108 kfree(sl);
109 }
110 kfree(sd->s_iattr);
111 kmem_cache_free(sysfs_dir_cachep, sd);
112} 109}
113 110
114static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd) 111static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd)
@@ -122,7 +119,7 @@ static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd)
122 119
123static inline void sysfs_put(struct sysfs_dirent * sd) 120static inline void sysfs_put(struct sysfs_dirent * sd)
124{ 121{
125 if (atomic_dec_and_test(&sd->s_count)) 122 if (sd && atomic_dec_and_test(&sd->s_count))
126 release_sysfs_dirent(sd); 123 release_sysfs_dirent(sd);
127} 124}
128 125
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 0732ddb902..589be21d88 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -27,7 +27,7 @@ const struct file_operations sysv_file_operations = {
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = sysv_sync_file, 29 .fsync = sysv_sync_file,
30 .sendfile = generic_file_sendfile, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
33const struct inode_operations sysv_file_inode_operations = { 33const struct inode_operations sysv_file_inode_operations = {
diff --git a/fs/udf/crc.c b/fs/udf/crc.c
index 1b82a4adc2..ef2bfaa19d 100644
--- a/fs/udf/crc.c
+++ b/fs/udf/crc.c
@@ -106,8 +106,8 @@ int main(void)
106{ 106{
107 unsigned short x; 107 unsigned short x;
108 108
109 x = udf_crc16(bytes, sizeof bytes); 109 x = udf_crc(bytes, sizeof bytes);
110 printf("udf_crc16: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U); 110 printf("udf_crc: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U);
111 111
112 return 0; 112 return 0;
113} 113}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 51b5764685..df070bee8d 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -261,7 +261,7 @@ const struct file_operations udf_file_operations = {
261 .aio_write = udf_file_aio_write, 261 .aio_write = udf_file_aio_write,
262 .release = udf_release_file, 262 .release = udf_release_file,
263 .fsync = udf_fsync_file, 263 .fsync = udf_fsync_file,
264 .sendfile = generic_file_sendfile, 264 .splice_read = generic_file_splice_read,
265}; 265};
266 266
267const struct inode_operations udf_file_inode_operations = { 267const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 8206983f2e..10f3188738 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -50,7 +50,7 @@ void udf_free_inode(struct inode * inode)
50 else 50 else
51 UDF_SB_LVIDIU(sb)->numFiles = 51 UDF_SB_LVIDIU(sb)->numFiles =
52 cpu_to_le32(le32_to_cpu(UDF_SB_LVIDIU(sb)->numFiles) - 1); 52 cpu_to_le32(le32_to_cpu(UDF_SB_LVIDIU(sb)->numFiles) - 1);
53 53
54 mark_buffer_dirty(sbi->s_lvidbh); 54 mark_buffer_dirty(sbi->s_lvidbh);
55 } 55 }
56 mutex_unlock(&sbi->s_alloc_mutex); 56 mutex_unlock(&sbi->s_alloc_mutex);
@@ -136,6 +136,13 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
136 UDF_I_EFE(inode) = 0; 136 UDF_I_EFE(inode) = 0;
137 UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); 137 UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL);
138 } 138 }
139 if (!UDF_I_DATA(inode))
140 {
141 iput(inode);
142 *err = -ENOMEM;
143 mutex_unlock(&sbi->s_alloc_mutex);
144 return NULL;
145 }
139 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 146 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
140 UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB; 147 UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB;
141 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 148 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index bf7de0bdba..5b82e489af 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -49,6 +49,7 @@ MODULE_LICENSE("GPL");
49static mode_t udf_convert_permissions(struct fileEntry *); 49static mode_t udf_convert_permissions(struct fileEntry *);
50static int udf_update_inode(struct inode *, int); 50static int udf_update_inode(struct inode *, int);
51static void udf_fill_inode(struct inode *, struct buffer_head *); 51static void udf_fill_inode(struct inode *, struct buffer_head *);
52static int udf_alloc_i_data(struct inode *inode, size_t size);
52static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 53static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
53 long *, int *); 54 long *, int *);
54static int8_t udf_insert_aext(struct inode *, struct extent_position, 55static int8_t udf_insert_aext(struct inode *, struct extent_position,
@@ -734,7 +735,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset, int newbl
734 (*c) ++; 735 (*c) ++;
735 (*endnum) ++; 736 (*endnum) ++;
736 } 737 }
737 738
738 laarr[curr].extLocation.logicalBlockNum = newblocknum; 739 laarr[curr].extLocation.logicalBlockNum = newblocknum;
739 if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) 740 if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
740 laarr[curr].extLocation.partitionReferenceNum = 741 laarr[curr].extLocation.partitionReferenceNum =
@@ -836,7 +837,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
836 { 837 {
837 numalloc -= elen; 838 numalloc -= elen;
838 if (*endnum > (i+1)) 839 if (*endnum > (i+1))
839 memmove(&laarr[i], &laarr[i+1], 840 memmove(&laarr[i], &laarr[i+1],
840 sizeof(long_ad) * (*endnum - (i+1))); 841 sizeof(long_ad) * (*endnum - (i+1)));
841 i --; 842 i --;
842 (*endnum) --; 843 (*endnum) --;
@@ -1024,7 +1025,7 @@ void udf_truncate(struct inode * inode)
1024 { 1025 {
1025 block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block); 1026 block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block);
1026 udf_truncate_extents(inode); 1027 udf_truncate_extents(inode);
1027 } 1028 }
1028 1029
1029 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1030 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
1030 if (IS_SYNC(inode)) 1031 if (IS_SYNC(inode))
@@ -1087,10 +1088,10 @@ __udf_read_inode(struct inode *inode)
1087 { 1088 {
1088 kernel_lb_addr loc; 1089 kernel_lb_addr loc;
1089 ie = (struct indirectEntry *)ibh->b_data; 1090 ie = (struct indirectEntry *)ibh->b_data;
1090 1091
1091 loc = lelb_to_cpu(ie->indirectICB.extLocation); 1092 loc = lelb_to_cpu(ie->indirectICB.extLocation);
1092 1093
1093 if (ie->indirectICB.extLength && 1094 if (ie->indirectICB.extLength &&
1094 (nbh = udf_read_ptagged(inode->i_sb, loc, 0, &ident))) 1095 (nbh = udf_read_ptagged(inode->i_sb, loc, 0, &ident)))
1095 { 1096 {
1096 if (ident == TAG_IDENT_FE || 1097 if (ident == TAG_IDENT_FE ||
@@ -1156,14 +1157,22 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1156 { 1157 {
1157 UDF_I_EFE(inode) = 1; 1158 UDF_I_EFE(inode) = 1;
1158 UDF_I_USE(inode) = 0; 1159 UDF_I_USE(inode) = 0;
1159 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL); 1160 if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)))
1161 {
1162 make_bad_inode(inode);
1163 return;
1164 }
1160 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); 1165 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry));
1161 } 1166 }
1162 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_FE) 1167 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_FE)
1163 { 1168 {
1164 UDF_I_EFE(inode) = 0; 1169 UDF_I_EFE(inode) = 0;
1165 UDF_I_USE(inode) = 0; 1170 UDF_I_USE(inode) = 0;
1166 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); 1171 if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct fileEntry)))
1172 {
1173 make_bad_inode(inode);
1174 return;
1175 }
1167 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry)); 1176 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry));
1168 } 1177 }
1169 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_USE) 1178 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_USE)
@@ -1173,7 +1182,11 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1173 UDF_I_LENALLOC(inode) = 1182 UDF_I_LENALLOC(inode) =
1174 le32_to_cpu( 1183 le32_to_cpu(
1175 ((struct unallocSpaceEntry *)bh->b_data)->lengthAllocDescs); 1184 ((struct unallocSpaceEntry *)bh->b_data)->lengthAllocDescs);
1176 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry), GFP_KERNEL); 1185 if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)))
1186 {
1187 make_bad_inode(inode);
1188 return;
1189 }
1177 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); 1190 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry));
1178 return; 1191 return;
1179 } 1192 }
@@ -1191,7 +1204,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1191 inode->i_nlink = le16_to_cpu(fe->fileLinkCount); 1204 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1192 if (!inode->i_nlink) 1205 if (!inode->i_nlink)
1193 inode->i_nlink = 1; 1206 inode->i_nlink = 1;
1194 1207
1195 inode->i_size = le64_to_cpu(fe->informationLength); 1208 inode->i_size = le64_to_cpu(fe->informationLength);
1196 UDF_I_LENEXTENTS(inode) = inode->i_size; 1209 UDF_I_LENEXTENTS(inode) = inode->i_size;
1197 1210
@@ -1243,7 +1256,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1243 } 1256 }
1244 else 1257 else
1245 { 1258 {
1246 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << 1259 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
1247 (inode->i_sb->s_blocksize_bits - 9); 1260 (inode->i_sb->s_blocksize_bits - 9);
1248 1261
1249 if ( udf_stamp_to_time(&convtime, &convtime_usec, 1262 if ( udf_stamp_to_time(&convtime, &convtime_usec,
@@ -1374,6 +1387,20 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1374 } 1387 }
1375} 1388}
1376 1389
1390static int udf_alloc_i_data(struct inode *inode, size_t size)
1391{
1392 UDF_I_DATA(inode) = kmalloc(size, GFP_KERNEL);
1393
1394 if (!UDF_I_DATA(inode))
1395 {
1396 printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) no free memory\n",
1397 inode->i_ino);
1398 return -ENOMEM;
1399 }
1400
1401 return 0;
1402}
1403
1377static mode_t 1404static mode_t
1378udf_convert_permissions(struct fileEntry *fe) 1405udf_convert_permissions(struct fileEntry *fe)
1379{ 1406{
@@ -2072,7 +2099,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
2072 mark_buffer_dirty_inode(oepos.bh, inode); 2099 mark_buffer_dirty_inode(oepos.bh, inode);
2073 } 2100 }
2074 } 2101 }
2075 2102
2076 brelse(epos.bh); 2103 brelse(epos.bh);
2077 brelse(oepos.bh); 2104 brelse(oepos.bh);
2078 return (elen >> 30); 2105 return (elen >> 30);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6658afb41c..d6a504f5d7 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1356,7 +1356,7 @@ udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
1356 case UDF_VIRTUAL_MAP15: 1356 case UDF_VIRTUAL_MAP15:
1357 case UDF_VIRTUAL_MAP20: 1357 case UDF_VIRTUAL_MAP20:
1358 { 1358 {
1359 kernel_lb_addr ino; 1359 kernel_lb_addr uninitialized_var(ino);
1360 1360
1361 if (!UDF_SB_LASTBLOCK(sb)) 1361 if (!UDF_SB_LASTBLOCK(sb))
1362 { 1362 {
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 1e096323ba..6705d74c6d 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -60,5 +60,5 @@ const struct file_operations ufs_file_operations = {
60 .mmap = generic_file_mmap, 60 .mmap = generic_file_mmap,
61 .open = generic_file_open, 61 .open = generic_file_open,
62 .fsync = ufs_sync_file, 62 .fsync = ufs_sync_file,
63 .sendfile = generic_file_sendfile, 63 .splice_read = generic_file_splice_read,
64}; 64};
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 22ff6ed55c..2b3011689e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -87,6 +87,7 @@
87#include <linux/smp_lock.h> 87#include <linux/smp_lock.h>
88#include <linux/buffer_head.h> 88#include <linux/buffer_head.h>
89#include <linux/vfs.h> 89#include <linux/vfs.h>
90#include <linux/log2.h>
90 91
91#include "swab.h" 92#include "swab.h"
92#include "util.h" 93#include "util.h"
@@ -854,7 +855,7 @@ magic_found:
854 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); 855 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
855 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); 856 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
856 857
857 if (uspi->s_fsize & (uspi->s_fsize - 1)) { 858 if (!is_power_of_2(uspi->s_fsize)) {
858 printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", 859 printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n",
859 uspi->s_fsize); 860 uspi->s_fsize);
860 goto failed; 861 goto failed;
@@ -869,7 +870,7 @@ magic_found:
869 uspi->s_fsize); 870 uspi->s_fsize);
870 goto failed; 871 goto failed;
871 } 872 }
872 if (uspi->s_bsize & (uspi->s_bsize - 1)) { 873 if (!is_power_of_2(uspi->s_bsize)) {
873 printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", 874 printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n",
874 uspi->s_bsize); 875 uspi->s_bsize);
875 goto failed; 876 goto failed;
diff --git a/fs/utimes.c b/fs/utimes.c
index 480f7c8c29..682eb63b20 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -106,9 +106,16 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
106 if (IS_IMMUTABLE(inode)) 106 if (IS_IMMUTABLE(inode))
107 goto dput_and_out; 107 goto dput_and_out;
108 108
109 if (current->fsuid != inode->i_uid && 109 if (!is_owner_or_cap(inode)) {
110 (error = vfs_permission(&nd, MAY_WRITE)) != 0) 110 if (f) {
111 goto dput_and_out; 111 if (!(f->f_mode & FMODE_WRITE))
112 goto dput_and_out;
113 } else {
114 error = vfs_permission(&nd, MAY_WRITE);
115 if (error)
116 goto dput_and_out;
117 }
118 }
112 } 119 }
113 mutex_lock(&inode->i_mutex); 120 mutex_lock(&inode->i_mutex);
114 error = notify_change(dentry, &newattrs); 121 error = notify_change(dentry, &newattrs);
diff --git a/fs/xattr.c b/fs/xattr.c
index 4523aca796..a44fd92cac 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -60,8 +60,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
60 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 60 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
61 return -EPERM; 61 return -EPERM;
62 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && 62 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
63 (mask & MAY_WRITE) && (current->fsuid != inode->i_uid) && 63 (mask & MAY_WRITE) && !is_owner_or_cap(inode))
64 !capable(CAP_FOWNER))
65 return -EPERM; 64 return -EPERM;
66 } 65 }
67 66
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index b49989bb89..e7a9a83f00 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -64,6 +64,7 @@ xfs-y += xfs_alloc.o \
64 xfs_dir2_sf.o \ 64 xfs_dir2_sf.o \
65 xfs_error.o \ 65 xfs_error.o \
66 xfs_extfree_item.o \ 66 xfs_extfree_item.o \
67 xfs_filestream.o \
67 xfs_fsops.o \ 68 xfs_fsops.o \
68 xfs_ialloc.o \ 69 xfs_ialloc.o \
69 xfs_ialloc_btree.o \ 70 xfs_ialloc_btree.o \
@@ -77,6 +78,7 @@ xfs-y += xfs_alloc.o \
77 xfs_log.o \ 78 xfs_log.o \
78 xfs_log_recover.o \ 79 xfs_log_recover.o \
79 xfs_mount.o \ 80 xfs_mount.o \
81 xfs_mru_cache.o \
80 xfs_rename.o \ 82 xfs_rename.o \
81 xfs_trans.o \ 83 xfs_trans.o \
82 xfs_trans_ail.o \ 84 xfs_trans_ail.o \
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 9ebabdf782..4b6470cf87 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -100,25 +100,6 @@ kmem_zone_destroy(kmem_zone_t *zone)
100extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); 100extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
101extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); 101extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
102 102
103/*
104 * Low memory cache shrinkers
105 */
106
107typedef struct shrinker *kmem_shaker_t;
108typedef int (*kmem_shake_func_t)(int, gfp_t);
109
110static inline kmem_shaker_t
111kmem_shake_register(kmem_shake_func_t sfunc)
112{
113 return set_shrinker(DEFAULT_SEEKS, sfunc);
114}
115
116static inline void
117kmem_shake_deregister(kmem_shaker_t shrinker)
118{
119 remove_shrinker(shrinker);
120}
121
122static inline int 103static inline int
123kmem_shake_allow(gfp_t gfp_mask) 104kmem_shake_allow(gfp_t gfp_mask)
124{ 105{
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7361861e3a..fd4105d662 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -108,14 +108,19 @@ xfs_page_trace(
108 108
109/* 109/*
110 * Schedule IO completion handling on a xfsdatad if this was 110 * Schedule IO completion handling on a xfsdatad if this was
111 * the final hold on this ioend. 111 * the final hold on this ioend. If we are asked to wait,
112 * flush the workqueue.
112 */ 113 */
113STATIC void 114STATIC void
114xfs_finish_ioend( 115xfs_finish_ioend(
115 xfs_ioend_t *ioend) 116 xfs_ioend_t *ioend,
117 int wait)
116{ 118{
117 if (atomic_dec_and_test(&ioend->io_remaining)) 119 if (atomic_dec_and_test(&ioend->io_remaining)) {
118 queue_work(xfsdatad_workqueue, &ioend->io_work); 120 queue_work(xfsdatad_workqueue, &ioend->io_work);
121 if (wait)
122 flush_workqueue(xfsdatad_workqueue);
123 }
119} 124}
120 125
121/* 126/*
@@ -156,6 +161,8 @@ xfs_setfilesize(
156 xfs_fsize_t bsize; 161 xfs_fsize_t bsize;
157 162
158 ip = xfs_vtoi(ioend->io_vnode); 163 ip = xfs_vtoi(ioend->io_vnode);
164 if (!ip)
165 return;
159 166
160 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 167 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
161 ASSERT(ioend->io_type != IOMAP_READ); 168 ASSERT(ioend->io_type != IOMAP_READ);
@@ -334,7 +341,7 @@ xfs_end_bio(
334 bio->bi_end_io = NULL; 341 bio->bi_end_io = NULL;
335 bio_put(bio); 342 bio_put(bio);
336 343
337 xfs_finish_ioend(ioend); 344 xfs_finish_ioend(ioend, 0);
338 return 0; 345 return 0;
339} 346}
340 347
@@ -470,7 +477,7 @@ xfs_submit_ioend(
470 } 477 }
471 if (bio) 478 if (bio)
472 xfs_submit_ioend_bio(ioend, bio); 479 xfs_submit_ioend_bio(ioend, bio);
473 xfs_finish_ioend(ioend); 480 xfs_finish_ioend(ioend, 0);
474 } while ((ioend = next) != NULL); 481 } while ((ioend = next) != NULL);
475} 482}
476 483
@@ -1003,6 +1010,8 @@ xfs_page_state_convert(
1003 if (buffer_unwritten(bh) || buffer_delay(bh) || 1010 if (buffer_unwritten(bh) || buffer_delay(bh) ||
1004 ((buffer_uptodate(bh) || PageUptodate(page)) && 1011 ((buffer_uptodate(bh) || PageUptodate(page)) &&
1005 !buffer_mapped(bh) && (unmapped || startio))) { 1012 !buffer_mapped(bh) && (unmapped || startio))) {
1013 int new_ioend = 0;
1014
1006 /* 1015 /*
1007 * Make sure we don't use a read-only iomap 1016 * Make sure we don't use a read-only iomap
1008 */ 1017 */
@@ -1021,6 +1030,15 @@ xfs_page_state_convert(
1021 } 1030 }
1022 1031
1023 if (!iomap_valid) { 1032 if (!iomap_valid) {
1033 /*
1034 * if we didn't have a valid mapping then we
1035 * need to ensure that we put the new mapping
1036 * in a new ioend structure. This needs to be
1037 * done to ensure that the ioends correctly
1038 * reflect the block mappings at io completion
1039 * for unwritten extent conversion.
1040 */
1041 new_ioend = 1;
1024 if (type == IOMAP_NEW) { 1042 if (type == IOMAP_NEW) {
1025 size = xfs_probe_cluster(inode, 1043 size = xfs_probe_cluster(inode,
1026 page, bh, head, 0); 1044 page, bh, head, 0);
@@ -1040,7 +1058,7 @@ xfs_page_state_convert(
1040 if (startio) { 1058 if (startio) {
1041 xfs_add_to_ioend(inode, bh, offset, 1059 xfs_add_to_ioend(inode, bh, offset,
1042 type, &ioend, 1060 type, &ioend,
1043 !iomap_valid); 1061 new_ioend);
1044 } else { 1062 } else {
1045 set_buffer_dirty(bh); 1063 set_buffer_dirty(bh);
1046 unlock_buffer(bh); 1064 unlock_buffer(bh);
@@ -1416,6 +1434,13 @@ xfs_end_io_direct(
1416 * This is not necessary for synchronous direct I/O, but we do 1434 * This is not necessary for synchronous direct I/O, but we do
1417 * it anyway to keep the code uniform and simpler. 1435 * it anyway to keep the code uniform and simpler.
1418 * 1436 *
1437 * Well, if only it were that simple. Because synchronous direct I/O
1438 * requires extent conversion to occur *before* we return to userspace,
1439 * we have to wait for extent conversion to complete. Look at the
1440 * iocb that has been passed to us to determine if this is AIO or
1441 * not. If it is synchronous, tell xfs_finish_ioend() to kick the
1442 * workqueue and wait for it to complete.
1443 *
1419 * The core direct I/O code might be changed to always call the 1444 * The core direct I/O code might be changed to always call the
1420 * completion handler in the future, in which case all this can 1445 * completion handler in the future, in which case all this can
1421 * go away. 1446 * go away.
@@ -1423,9 +1448,9 @@ xfs_end_io_direct(
1423 ioend->io_offset = offset; 1448 ioend->io_offset = offset;
1424 ioend->io_size = size; 1449 ioend->io_size = size;
1425 if (ioend->io_type == IOMAP_READ) { 1450 if (ioend->io_type == IOMAP_READ) {
1426 xfs_finish_ioend(ioend); 1451 xfs_finish_ioend(ioend, 0);
1427 } else if (private && size > 0) { 1452 } else if (private && size > 0) {
1428 xfs_finish_ioend(ioend); 1453 xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
1429 } else { 1454 } else {
1430 /* 1455 /*
1431 * A direct I/O write ioend starts it's life in unwritten 1456 * A direct I/O write ioend starts it's life in unwritten
@@ -1434,7 +1459,7 @@ xfs_end_io_direct(
1434 * handler. 1459 * handler.
1435 */ 1460 */
1436 INIT_WORK(&ioend->io_work, xfs_end_bio_written); 1461 INIT_WORK(&ioend->io_work, xfs_end_bio_written);
1437 xfs_finish_ioend(ioend); 1462 xfs_finish_ioend(ioend, 0);
1438 } 1463 }
1439 1464
1440 /* 1465 /*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index fe4f66a5af..b0f0e58866 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -35,10 +35,13 @@
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36 36
37static kmem_zone_t *xfs_buf_zone; 37static kmem_zone_t *xfs_buf_zone;
38static kmem_shaker_t xfs_buf_shake;
39STATIC int xfsbufd(void *); 38STATIC int xfsbufd(void *);
40STATIC int xfsbufd_wakeup(int, gfp_t); 39STATIC int xfsbufd_wakeup(int, gfp_t);
41STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 40STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
41static struct shrinker xfs_buf_shake = {
42 .shrink = xfsbufd_wakeup,
43 .seeks = DEFAULT_SEEKS,
44};
42 45
43static struct workqueue_struct *xfslogd_workqueue; 46static struct workqueue_struct *xfslogd_workqueue;
44struct workqueue_struct *xfsdatad_workqueue; 47struct workqueue_struct *xfsdatad_workqueue;
@@ -314,7 +317,7 @@ xfs_buf_free(
314 317
315 ASSERT(list_empty(&bp->b_hash_list)); 318 ASSERT(list_empty(&bp->b_hash_list));
316 319
317 if (bp->b_flags & _XBF_PAGE_CACHE) { 320 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
318 uint i; 321 uint i;
319 322
320 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 323 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
@@ -323,18 +326,11 @@ xfs_buf_free(
323 for (i = 0; i < bp->b_page_count; i++) { 326 for (i = 0; i < bp->b_page_count; i++) {
324 struct page *page = bp->b_pages[i]; 327 struct page *page = bp->b_pages[i];
325 328
326 ASSERT(!PagePrivate(page)); 329 if (bp->b_flags & _XBF_PAGE_CACHE)
330 ASSERT(!PagePrivate(page));
327 page_cache_release(page); 331 page_cache_release(page);
328 } 332 }
329 _xfs_buf_free_pages(bp); 333 _xfs_buf_free_pages(bp);
330 } else if (bp->b_flags & _XBF_KMEM_ALLOC) {
331 /*
332 * XXX(hch): bp->b_count_desired might be incorrect (see
333 * xfs_buf_associate_memory for details), but fortunately
334 * the Linux version of kmem_free ignores the len argument..
335 */
336 kmem_free(bp->b_addr, bp->b_count_desired);
337 _xfs_buf_free_pages(bp);
338 } 334 }
339 335
340 xfs_buf_deallocate(bp); 336 xfs_buf_deallocate(bp);
@@ -764,43 +760,44 @@ xfs_buf_get_noaddr(
764 size_t len, 760 size_t len,
765 xfs_buftarg_t *target) 761 xfs_buftarg_t *target)
766{ 762{
767 size_t malloc_len = len; 763 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
764 int error, i;
768 xfs_buf_t *bp; 765 xfs_buf_t *bp;
769 void *data;
770 int error;
771 766
772 bp = xfs_buf_allocate(0); 767 bp = xfs_buf_allocate(0);
773 if (unlikely(bp == NULL)) 768 if (unlikely(bp == NULL))
774 goto fail; 769 goto fail;
775 _xfs_buf_initialize(bp, target, 0, len, 0); 770 _xfs_buf_initialize(bp, target, 0, len, 0);
776 771
777 try_again: 772 error = _xfs_buf_get_pages(bp, page_count, 0);
778 data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL | KM_LARGE); 773 if (error)
779 if (unlikely(data == NULL))
780 goto fail_free_buf; 774 goto fail_free_buf;
781 775
782 /* check whether alignment matches.. */ 776 for (i = 0; i < page_count; i++) {
783 if ((__psunsigned_t)data != 777 bp->b_pages[i] = alloc_page(GFP_KERNEL);
784 ((__psunsigned_t)data & ~target->bt_smask)) { 778 if (!bp->b_pages[i])
785 /* .. else double the size and try again */ 779 goto fail_free_mem;
786 kmem_free(data, malloc_len);
787 malloc_len <<= 1;
788 goto try_again;
789 } 780 }
781 bp->b_flags |= _XBF_PAGES;
790 782
791 error = xfs_buf_associate_memory(bp, data, len); 783 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
792 if (error) 784 if (unlikely(error)) {
785 printk(KERN_WARNING "%s: failed to map pages\n",
786 __FUNCTION__);
793 goto fail_free_mem; 787 goto fail_free_mem;
794 bp->b_flags |= _XBF_KMEM_ALLOC; 788 }
795 789
796 xfs_buf_unlock(bp); 790 xfs_buf_unlock(bp);
797 791
798 XB_TRACE(bp, "no_daddr", data); 792 XB_TRACE(bp, "no_daddr", len);
799 return bp; 793 return bp;
794
800 fail_free_mem: 795 fail_free_mem:
801 kmem_free(data, malloc_len); 796 while (--i >= 0)
797 __free_page(bp->b_pages[i]);
798 _xfs_buf_free_pages(bp);
802 fail_free_buf: 799 fail_free_buf:
803 xfs_buf_free(bp); 800 xfs_buf_deallocate(bp);
804 fail: 801 fail:
805 return NULL; 802 return NULL;
806} 803}
@@ -1453,6 +1450,7 @@ xfs_free_buftarg(
1453 int external) 1450 int external)
1454{ 1451{
1455 xfs_flush_buftarg(btp, 1); 1452 xfs_flush_buftarg(btp, 1);
1453 xfs_blkdev_issue_flush(btp);
1456 if (external) 1454 if (external)
1457 xfs_blkdev_put(btp->bt_bdev); 1455 xfs_blkdev_put(btp->bt_bdev);
1458 xfs_free_bufhash(btp); 1456 xfs_free_bufhash(btp);
@@ -1837,14 +1835,9 @@ xfs_buf_init(void)
1837 if (!xfsdatad_workqueue) 1835 if (!xfsdatad_workqueue)
1838 goto out_destroy_xfslogd_workqueue; 1836 goto out_destroy_xfslogd_workqueue;
1839 1837
1840 xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup); 1838 register_shrinker(&xfs_buf_shake);
1841 if (!xfs_buf_shake)
1842 goto out_destroy_xfsdatad_workqueue;
1843
1844 return 0; 1839 return 0;
1845 1840
1846 out_destroy_xfsdatad_workqueue:
1847 destroy_workqueue(xfsdatad_workqueue);
1848 out_destroy_xfslogd_workqueue: 1841 out_destroy_xfslogd_workqueue:
1849 destroy_workqueue(xfslogd_workqueue); 1842 destroy_workqueue(xfslogd_workqueue);
1850 out_free_buf_zone: 1843 out_free_buf_zone:
@@ -1859,7 +1852,7 @@ xfs_buf_init(void)
1859void 1852void
1860xfs_buf_terminate(void) 1853xfs_buf_terminate(void)
1861{ 1854{
1862 kmem_shake_deregister(xfs_buf_shake); 1855 unregister_shrinker(&xfs_buf_shake);
1863 destroy_workqueue(xfsdatad_workqueue); 1856 destroy_workqueue(xfsdatad_workqueue);
1864 destroy_workqueue(xfslogd_workqueue); 1857 destroy_workqueue(xfslogd_workqueue);
1865 kmem_zone_destroy(xfs_buf_zone); 1858 kmem_zone_destroy(xfs_buf_zone);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index b6241f6201..b5908a34b1 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -63,7 +63,7 @@ typedef enum {
63 63
64 /* flags used only internally */ 64 /* flags used only internally */
65 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ 65 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */
66 _XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ 66 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */
67 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ 67 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
68 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ 68 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
69} xfs_buf_flags_t; 69} xfs_buf_flags_t;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index cb51dc9613..cbcd40c8c2 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -124,30 +124,6 @@ xfs_file_aio_write_invis(
124} 124}
125 125
126STATIC ssize_t 126STATIC ssize_t
127xfs_file_sendfile(
128 struct file *filp,
129 loff_t *pos,
130 size_t count,
131 read_actor_t actor,
132 void *target)
133{
134 return bhv_vop_sendfile(vn_from_inode(filp->f_path.dentry->d_inode),
135 filp, pos, 0, count, actor, target, NULL);
136}
137
138STATIC ssize_t
139xfs_file_sendfile_invis(
140 struct file *filp,
141 loff_t *pos,
142 size_t count,
143 read_actor_t actor,
144 void *target)
145{
146 return bhv_vop_sendfile(vn_from_inode(filp->f_path.dentry->d_inode),
147 filp, pos, IO_INVIS, count, actor, target, NULL);
148}
149
150STATIC ssize_t
151xfs_file_splice_read( 127xfs_file_splice_read(
152 struct file *infilp, 128 struct file *infilp,
153 loff_t *ppos, 129 loff_t *ppos,
@@ -208,15 +184,6 @@ xfs_file_open(
208} 184}
209 185
210STATIC int 186STATIC int
211xfs_file_close(
212 struct file *filp,
213 fl_owner_t id)
214{
215 return -bhv_vop_close(vn_from_inode(filp->f_path.dentry->d_inode), 0,
216 file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL);
217}
218
219STATIC int
220xfs_file_release( 187xfs_file_release(
221 struct inode *inode, 188 struct inode *inode,
222 struct file *filp) 189 struct file *filp)
@@ -452,7 +419,6 @@ const struct file_operations xfs_file_operations = {
452 .write = do_sync_write, 419 .write = do_sync_write,
453 .aio_read = xfs_file_aio_read, 420 .aio_read = xfs_file_aio_read,
454 .aio_write = xfs_file_aio_write, 421 .aio_write = xfs_file_aio_write,
455 .sendfile = xfs_file_sendfile,
456 .splice_read = xfs_file_splice_read, 422 .splice_read = xfs_file_splice_read,
457 .splice_write = xfs_file_splice_write, 423 .splice_write = xfs_file_splice_write,
458 .unlocked_ioctl = xfs_file_ioctl, 424 .unlocked_ioctl = xfs_file_ioctl,
@@ -461,7 +427,6 @@ const struct file_operations xfs_file_operations = {
461#endif 427#endif
462 .mmap = xfs_file_mmap, 428 .mmap = xfs_file_mmap,
463 .open = xfs_file_open, 429 .open = xfs_file_open,
464 .flush = xfs_file_close,
465 .release = xfs_file_release, 430 .release = xfs_file_release,
466 .fsync = xfs_file_fsync, 431 .fsync = xfs_file_fsync,
467#ifdef HAVE_FOP_OPEN_EXEC 432#ifdef HAVE_FOP_OPEN_EXEC
@@ -475,7 +440,6 @@ const struct file_operations xfs_invis_file_operations = {
475 .write = do_sync_write, 440 .write = do_sync_write,
476 .aio_read = xfs_file_aio_read_invis, 441 .aio_read = xfs_file_aio_read_invis,
477 .aio_write = xfs_file_aio_write_invis, 442 .aio_write = xfs_file_aio_write_invis,
478 .sendfile = xfs_file_sendfile_invis,
479 .splice_read = xfs_file_splice_read_invis, 443 .splice_read = xfs_file_splice_read_invis,
480 .splice_write = xfs_file_splice_write_invis, 444 .splice_write = xfs_file_splice_write_invis,
481 .unlocked_ioctl = xfs_file_ioctl_invis, 445 .unlocked_ioctl = xfs_file_ioctl_invis,
@@ -484,7 +448,6 @@ const struct file_operations xfs_invis_file_operations = {
484#endif 448#endif
485 .mmap = xfs_file_mmap, 449 .mmap = xfs_file_mmap,
486 .open = xfs_file_open, 450 .open = xfs_file_open,
487 .flush = xfs_file_close,
488 .release = xfs_file_release, 451 .release = xfs_file_release,
489 .fsync = xfs_file_fsync, 452 .fsync = xfs_file_fsync,
490}; 453};
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index ed3a5e1b4b..bb72c3d414 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -46,6 +46,7 @@ xfs_param_t xfs_params = {
46 .inherit_nosym = { 0, 0, 1 }, 46 .inherit_nosym = { 0, 0, 1 },
47 .rotorstep = { 1, 1, 255 }, 47 .rotorstep = { 1, 1, 255 },
48 .inherit_nodfrg = { 0, 1, 1 }, 48 .inherit_nodfrg = { 0, 1, 1 },
49 .fstrm_timer = { 1, 50, 3600*100},
49}; 50};
50 51
51/* 52/*
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ff5c41ff8d..5917808abb 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1019,7 +1019,7 @@ xfs_ioc_bulkstat(
1019 1019
1020 if (cmd == XFS_IOC_FSINUMBERS) 1020 if (cmd == XFS_IOC_FSINUMBERS)
1021 error = xfs_inumbers(mp, &inlast, &count, 1021 error = xfs_inumbers(mp, &inlast, &count,
1022 bulkreq.ubuffer); 1022 bulkreq.ubuffer, xfs_inumbers_fmt);
1023 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) 1023 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
1024 error = xfs_bulkstat_single(mp, &inlast, 1024 error = xfs_bulkstat_single(mp, &inlast,
1025 bulkreq.ubuffer, &done); 1025 bulkreq.ubuffer, &done);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index b83cebc165..141cf15067 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -23,10 +23,25 @@
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <asm/uaccess.h> 24#include <asm/uaccess.h>
25#include "xfs.h" 25#include "xfs.h"
26#include "xfs_types.h"
27#include "xfs_fs.h" 26#include "xfs_fs.h"
27#include "xfs_bit.h"
28#include "xfs_log.h"
29#include "xfs_inum.h"
30#include "xfs_trans.h"
31#include "xfs_sb.h"
32#include "xfs_ag.h"
33#include "xfs_dir2.h"
34#include "xfs_dmapi.h"
35#include "xfs_mount.h"
36#include "xfs_bmap_btree.h"
37#include "xfs_attr_sf.h"
38#include "xfs_dir2_sf.h"
28#include "xfs_vfs.h" 39#include "xfs_vfs.h"
29#include "xfs_vnode.h" 40#include "xfs_vnode.h"
41#include "xfs_dinode.h"
42#include "xfs_inode.h"
43#include "xfs_itable.h"
44#include "xfs_error.h"
30#include "xfs_dfrag.h" 45#include "xfs_dfrag.h"
31 46
32#define _NATIVE_IOC(cmd, type) \ 47#define _NATIVE_IOC(cmd, type) \
@@ -34,6 +49,7 @@
34 49
35#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 50#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
36#define BROKEN_X86_ALIGNMENT 51#define BROKEN_X86_ALIGNMENT
52#define _PACKED __attribute__((packed))
37/* on ia32 l_start is on a 32-bit boundary */ 53/* on ia32 l_start is on a 32-bit boundary */
38typedef struct xfs_flock64_32 { 54typedef struct xfs_flock64_32 {
39 __s16 l_type; 55 __s16 l_type;
@@ -75,35 +91,276 @@ xfs_ioctl32_flock(
75 return (unsigned long)p; 91 return (unsigned long)p;
76} 92}
77 93
94typedef struct compat_xfs_fsop_geom_v1 {
95 __u32 blocksize; /* filesystem (data) block size */
96 __u32 rtextsize; /* realtime extent size */
97 __u32 agblocks; /* fsblocks in an AG */
98 __u32 agcount; /* number of allocation groups */
99 __u32 logblocks; /* fsblocks in the log */
100 __u32 sectsize; /* (data) sector size, bytes */
101 __u32 inodesize; /* inode size in bytes */
102 __u32 imaxpct; /* max allowed inode space(%) */
103 __u64 datablocks; /* fsblocks in data subvolume */
104 __u64 rtblocks; /* fsblocks in realtime subvol */
105 __u64 rtextents; /* rt extents in realtime subvol*/
106 __u64 logstart; /* starting fsblock of the log */
107 unsigned char uuid[16]; /* unique id of the filesystem */
108 __u32 sunit; /* stripe unit, fsblocks */
109 __u32 swidth; /* stripe width, fsblocks */
110 __s32 version; /* structure version */
111 __u32 flags; /* superblock version flags */
112 __u32 logsectsize; /* log sector size, bytes */
113 __u32 rtsectsize; /* realtime sector size, bytes */
114 __u32 dirblocksize; /* directory block size, bytes */
115} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
116
117#define XFS_IOC_FSGEOMETRY_V1_32 \
118 _IOR ('X', 100, struct compat_xfs_fsop_geom_v1)
119
120STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg)
121{
122 compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg;
123 xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p));
124
125 if (copy_in_user(p, p32, sizeof(*p32)))
126 return -EFAULT;
127 return (unsigned long)p;
128}
129
130typedef struct compat_xfs_inogrp {
131 __u64 xi_startino; /* starting inode number */
132 __s32 xi_alloccount; /* # bits set in allocmask */
133 __u64 xi_allocmask; /* mask of allocated inodes */
134} __attribute__((packed)) compat_xfs_inogrp_t;
135
136STATIC int xfs_inumbers_fmt_compat(
137 void __user *ubuffer,
138 const xfs_inogrp_t *buffer,
139 long count,
140 long *written)
141{
142 compat_xfs_inogrp_t *p32 = ubuffer;
143 long i;
144
145 for (i = 0; i < count; i++) {
146 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
147 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
148 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
149 return -EFAULT;
150 }
151 *written = count * sizeof(*p32);
152 return 0;
153}
154
78#else 155#else
79 156
80typedef struct xfs_fsop_bulkreq32 { 157#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
158#define _PACKED
159
160#endif
161
162/* XFS_IOC_FSBULKSTAT and friends */
163
164typedef struct compat_xfs_bstime {
165 __s32 tv_sec; /* seconds */
166 __s32 tv_nsec; /* and nanoseconds */
167} compat_xfs_bstime_t;
168
169STATIC int xfs_bstime_store_compat(
170 compat_xfs_bstime_t __user *p32,
171 const xfs_bstime_t *p)
172{
173 __s32 sec32;
174
175 sec32 = p->tv_sec;
176 if (put_user(sec32, &p32->tv_sec) ||
177 put_user(p->tv_nsec, &p32->tv_nsec))
178 return -EFAULT;
179 return 0;
180}
181
182typedef struct compat_xfs_bstat {
183 __u64 bs_ino; /* inode number */
184 __u16 bs_mode; /* type and mode */
185 __u16 bs_nlink; /* number of links */
186 __u32 bs_uid; /* user id */
187 __u32 bs_gid; /* group id */
188 __u32 bs_rdev; /* device value */
189 __s32 bs_blksize; /* block size */
190 __s64 bs_size; /* file size */
191 compat_xfs_bstime_t bs_atime; /* access time */
192 compat_xfs_bstime_t bs_mtime; /* modify time */
193 compat_xfs_bstime_t bs_ctime; /* inode change time */
194 int64_t bs_blocks; /* number of blocks */
195 __u32 bs_xflags; /* extended flags */
196 __s32 bs_extsize; /* extent size */
197 __s32 bs_extents; /* number of extents */
198 __u32 bs_gen; /* generation count */
199 __u16 bs_projid; /* project id */
200 unsigned char bs_pad[14]; /* pad space, unused */
201 __u32 bs_dmevmask; /* DMIG event mask */
202 __u16 bs_dmstate; /* DMIG state info */
203 __u16 bs_aextents; /* attribute number of extents */
204} _PACKED compat_xfs_bstat_t;
205
206STATIC int xfs_bulkstat_one_fmt_compat(
207 void __user *ubuffer,
208 const xfs_bstat_t *buffer)
209{
210 compat_xfs_bstat_t __user *p32 = ubuffer;
211
212 if (put_user(buffer->bs_ino, &p32->bs_ino) ||
213 put_user(buffer->bs_mode, &p32->bs_mode) ||
214 put_user(buffer->bs_nlink, &p32->bs_nlink) ||
215 put_user(buffer->bs_uid, &p32->bs_uid) ||
216 put_user(buffer->bs_gid, &p32->bs_gid) ||
217 put_user(buffer->bs_rdev, &p32->bs_rdev) ||
218 put_user(buffer->bs_blksize, &p32->bs_blksize) ||
219 put_user(buffer->bs_size, &p32->bs_size) ||
220 xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
221 xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
222 xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
223 put_user(buffer->bs_blocks, &p32->bs_blocks) ||
224 put_user(buffer->bs_xflags, &p32->bs_xflags) ||
225 put_user(buffer->bs_extsize, &p32->bs_extsize) ||
226 put_user(buffer->bs_extents, &p32->bs_extents) ||
227 put_user(buffer->bs_gen, &p32->bs_gen) ||
228 put_user(buffer->bs_projid, &p32->bs_projid) ||
229 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
230 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
231 put_user(buffer->bs_aextents, &p32->bs_aextents))
232 return -EFAULT;
233 return sizeof(*p32);
234}
235
236
237
238typedef struct compat_xfs_fsop_bulkreq {
81 compat_uptr_t lastip; /* last inode # pointer */ 239 compat_uptr_t lastip; /* last inode # pointer */
82 __s32 icount; /* count of entries in buffer */ 240 __s32 icount; /* count of entries in buffer */
83 compat_uptr_t ubuffer; /* user buffer for inode desc. */ 241 compat_uptr_t ubuffer; /* user buffer for inode desc. */
84 __s32 ocount; /* output count pointer */ 242 compat_uptr_t ocount; /* output count pointer */
85} xfs_fsop_bulkreq32_t; 243} compat_xfs_fsop_bulkreq_t;
86 244
87STATIC unsigned long 245#define XFS_IOC_FSBULKSTAT_32 \
88xfs_ioctl32_bulkstat( 246 _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
89 unsigned long arg) 247#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
248 _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
249#define XFS_IOC_FSINUMBERS_32 \
250 _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
251
252/* copied from xfs_ioctl.c */
253STATIC int
254xfs_ioc_bulkstat_compat(
255 xfs_mount_t *mp,
256 unsigned int cmd,
257 void __user *arg)
90{ 258{
91 xfs_fsop_bulkreq32_t __user *p32 = (void __user *)arg; 259 compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg;
92 xfs_fsop_bulkreq_t __user *p = compat_alloc_user_space(sizeof(*p));
93 u32 addr; 260 u32 addr;
261 xfs_fsop_bulkreq_t bulkreq;
262 int count; /* # of records returned */
263 xfs_ino_t inlast; /* last inode number */
264 int done;
265 int error;
266
267 /* done = 1 if there are more stats to get and if bulkstat */
268 /* should be called again (unused here, but used in dmapi) */
269
270 if (!capable(CAP_SYS_ADMIN))
271 return -EPERM;
272
273 if (XFS_FORCED_SHUTDOWN(mp))
274 return -XFS_ERROR(EIO);
275
276 if (get_user(addr, &p32->lastip))
277 return -EFAULT;
278 bulkreq.lastip = compat_ptr(addr);
279 if (get_user(bulkreq.icount, &p32->icount) ||
280 get_user(addr, &p32->ubuffer))
281 return -EFAULT;
282 bulkreq.ubuffer = compat_ptr(addr);
283 if (get_user(addr, &p32->ocount))
284 return -EFAULT;
285 bulkreq.ocount = compat_ptr(addr);
286
287 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
288 return -XFS_ERROR(EFAULT);
289
290 if ((count = bulkreq.icount) <= 0)
291 return -XFS_ERROR(EINVAL);
292
293 if (cmd == XFS_IOC_FSINUMBERS)
294 error = xfs_inumbers(mp, &inlast, &count,
295 bulkreq.ubuffer, xfs_inumbers_fmt_compat);
296 else {
297 /* declare a var to get a warning in case the type changes */
298 bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat;
299 error = xfs_bulkstat(mp, &inlast, &count,
300 xfs_bulkstat_one, formatter,
301 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer,
302 BULKSTAT_FG_QUICK, &done);
303 }
304 if (error)
305 return -error;
306
307 if (bulkreq.ocount != NULL) {
308 if (copy_to_user(bulkreq.lastip, &inlast,
309 sizeof(xfs_ino_t)))
310 return -XFS_ERROR(EFAULT);
311
312 if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
313 return -XFS_ERROR(EFAULT);
314 }
315
316 return 0;
317}
318
319
320
321typedef struct compat_xfs_fsop_handlereq {
322 __u32 fd; /* fd for FD_TO_HANDLE */
323 compat_uptr_t path; /* user pathname */
324 __u32 oflags; /* open flags */
325 compat_uptr_t ihandle; /* user supplied handle */
326 __u32 ihandlen; /* user supplied length */
327 compat_uptr_t ohandle; /* user buffer for handle */
328 compat_uptr_t ohandlen; /* user buffer length */
329} compat_xfs_fsop_handlereq_t;
330
331#define XFS_IOC_PATH_TO_FSHANDLE_32 \
332 _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
333#define XFS_IOC_PATH_TO_HANDLE_32 \
334 _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
335#define XFS_IOC_FD_TO_HANDLE_32 \
336 _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
337#define XFS_IOC_OPEN_BY_HANDLE_32 \
338 _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
339#define XFS_IOC_READLINK_BY_HANDLE_32 \
340 _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
341
342STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg)
343{
344 compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg;
345 xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p));
346 u32 addr;
94 347
95 if (get_user(addr, &p32->lastip) || 348 if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) ||
96 put_user(compat_ptr(addr), &p->lastip) || 349 get_user(addr, &p32->path) ||
97 copy_in_user(&p->icount, &p32->icount, sizeof(s32)) || 350 put_user(compat_ptr(addr), &p->path) ||
98 get_user(addr, &p32->ubuffer) || 351 copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) ||
99 put_user(compat_ptr(addr), &p->ubuffer) || 352 get_user(addr, &p32->ihandle) ||
100 get_user(addr, &p32->ocount) || 353 put_user(compat_ptr(addr), &p->ihandle) ||
101 put_user(compat_ptr(addr), &p->ocount)) 354 copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) ||
355 get_user(addr, &p32->ohandle) ||
356 put_user(compat_ptr(addr), &p->ohandle) ||
357 get_user(addr, &p32->ohandlen) ||
358 put_user(compat_ptr(addr), &p->ohandlen))
102 return -EFAULT; 359 return -EFAULT;
103 360
104 return (unsigned long)p; 361 return (unsigned long)p;
105} 362}
106#endif 363
107 364
108STATIC long 365STATIC long
109xfs_compat_ioctl( 366xfs_compat_ioctl(
@@ -118,7 +375,6 @@ xfs_compat_ioctl(
118 375
119 switch (cmd) { 376 switch (cmd) {
120 case XFS_IOC_DIOINFO: 377 case XFS_IOC_DIOINFO:
121 case XFS_IOC_FSGEOMETRY_V1:
122 case XFS_IOC_FSGEOMETRY: 378 case XFS_IOC_FSGEOMETRY:
123 case XFS_IOC_GETVERSION: 379 case XFS_IOC_GETVERSION:
124 case XFS_IOC_GETXFLAGS: 380 case XFS_IOC_GETXFLAGS:
@@ -131,12 +387,7 @@ xfs_compat_ioctl(
131 case XFS_IOC_GETBMAPA: 387 case XFS_IOC_GETBMAPA:
132 case XFS_IOC_GETBMAPX: 388 case XFS_IOC_GETBMAPX:
133/* not handled 389/* not handled
134 case XFS_IOC_FD_TO_HANDLE:
135 case XFS_IOC_PATH_TO_HANDLE:
136 case XFS_IOC_PATH_TO_FSHANDLE:
137 case XFS_IOC_OPEN_BY_HANDLE:
138 case XFS_IOC_FSSETDM_BY_HANDLE: 390 case XFS_IOC_FSSETDM_BY_HANDLE:
139 case XFS_IOC_READLINK_BY_HANDLE:
140 case XFS_IOC_ATTRLIST_BY_HANDLE: 391 case XFS_IOC_ATTRLIST_BY_HANDLE:
141 case XFS_IOC_ATTRMULTI_BY_HANDLE: 392 case XFS_IOC_ATTRMULTI_BY_HANDLE:
142*/ 393*/
@@ -166,6 +417,10 @@ xfs_compat_ioctl(
166 arg = xfs_ioctl32_flock(arg); 417 arg = xfs_ioctl32_flock(arg);
167 cmd = _NATIVE_IOC(cmd, struct xfs_flock64); 418 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
168 break; 419 break;
420 case XFS_IOC_FSGEOMETRY_V1_32:
421 arg = xfs_ioctl32_geom_v1(arg);
422 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1);
423 break;
169 424
170#else /* These are handled fine if no alignment issues */ 425#else /* These are handled fine if no alignment issues */
171 case XFS_IOC_ALLOCSP: 426 case XFS_IOC_ALLOCSP:
@@ -176,18 +431,28 @@ xfs_compat_ioctl(
176 case XFS_IOC_FREESP64: 431 case XFS_IOC_FREESP64:
177 case XFS_IOC_RESVSP64: 432 case XFS_IOC_RESVSP64:
178 case XFS_IOC_UNRESVSP64: 433 case XFS_IOC_UNRESVSP64:
434 case XFS_IOC_FSGEOMETRY_V1:
179 break; 435 break;
180 436
181 /* xfs_bstat_t still has wrong u32 vs u64 alignment */ 437 /* xfs_bstat_t still has wrong u32 vs u64 alignment */
182 case XFS_IOC_SWAPEXT: 438 case XFS_IOC_SWAPEXT:
183 break; 439 break;
184 440
185 case XFS_IOC_FSBULKSTAT_SINGLE:
186 case XFS_IOC_FSBULKSTAT:
187 case XFS_IOC_FSINUMBERS:
188 arg = xfs_ioctl32_bulkstat(arg);
189 break;
190#endif 441#endif
442 case XFS_IOC_FSBULKSTAT_32:
443 case XFS_IOC_FSBULKSTAT_SINGLE_32:
444 case XFS_IOC_FSINUMBERS_32:
445 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq);
446 return xfs_ioc_bulkstat_compat(XFS_BHVTOI(VNHEAD(vp))->i_mount,
447 cmd, (void*)arg);
448 case XFS_IOC_FD_TO_HANDLE_32:
449 case XFS_IOC_PATH_TO_HANDLE_32:
450 case XFS_IOC_PATH_TO_FSHANDLE_32:
451 case XFS_IOC_OPEN_BY_HANDLE_32:
452 case XFS_IOC_READLINK_BY_HANDLE_32:
453 arg = xfs_ioctl32_fshandle(arg);
454 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
455 break;
191 default: 456 default:
192 return -ENOIOCTLCMD; 457 return -ENOIOCTLCMD;
193 } 458 }
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 715adad7dd..330c4ba9d4 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -101,7 +101,6 @@
101 * Feature macros (disable/enable) 101 * Feature macros (disable/enable)
102 */ 102 */
103#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ 103#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */
104#define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */
105#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ 104#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */
106#ifdef CONFIG_SMP 105#ifdef CONFIG_SMP
107#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ 106#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
@@ -124,6 +123,7 @@
124#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val 123#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
125#define xfs_rotorstep xfs_params.rotorstep.val 124#define xfs_rotorstep xfs_params.rotorstep.val
126#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val 125#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
126#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
127 127
128#define current_cpu() (raw_smp_processor_id()) 128#define current_cpu() (raw_smp_processor_id())
129#define current_pid() (current->pid) 129#define current_pid() (current->pid)
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index ed90403f0e..765ec16a6e 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -287,50 +287,6 @@ xfs_read(
287} 287}
288 288
289ssize_t 289ssize_t
290xfs_sendfile(
291 bhv_desc_t *bdp,
292 struct file *filp,
293 loff_t *offset,
294 int ioflags,
295 size_t count,
296 read_actor_t actor,
297 void *target,
298 cred_t *credp)
299{
300 xfs_inode_t *ip = XFS_BHVTOI(bdp);
301 xfs_mount_t *mp = ip->i_mount;
302 ssize_t ret;
303
304 XFS_STATS_INC(xs_read_calls);
305 if (XFS_FORCED_SHUTDOWN(mp))
306 return -EIO;
307
308 xfs_ilock(ip, XFS_IOLOCK_SHARED);
309
310 if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
311 (!(ioflags & IO_INVIS))) {
312 bhv_vrwlock_t locktype = VRWLOCK_READ;
313 int error;
314
315 error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
316 *offset, count,
317 FILP_DELAY_FLAG(filp), &locktype);
318 if (error) {
319 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
320 return -error;
321 }
322 }
323 xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
324 (void *)(unsigned long)target, count, *offset, ioflags);
325 ret = generic_file_sendfile(filp, offset, count, actor, target);
326 if (ret > 0)
327 XFS_STATS_ADD(xs_read_bytes, ret);
328
329 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
330 return ret;
331}
332
333ssize_t
334xfs_splice_read( 290xfs_splice_read(
335 bhv_desc_t *bdp, 291 bhv_desc_t *bdp,
336 struct file *infilp, 292 struct file *infilp,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index 7ac51b1d21..7c60a1eed8 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -90,9 +90,6 @@ extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *,
90extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, 90extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *,
91 const struct iovec *, unsigned int, 91 const struct iovec *, unsigned int,
92 loff_t *, int, struct cred *); 92 loff_t *, int, struct cred *);
93extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *,
94 loff_t *, int, size_t, read_actor_t,
95 void *, struct cred *);
96extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, 93extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *,
97 struct pipe_inode_info *, size_t, int, int, 94 struct pipe_inode_info *, size_t, int, int,
98 struct cred *); 95 struct cred *);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bf9a9d5909..4528f9a3f3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -547,7 +547,8 @@ vfs_sync_worker(
547 547
548 if (!(vfsp->vfs_flag & VFS_RDONLY)) 548 if (!(vfsp->vfs_flag & VFS_RDONLY))
549 error = bhv_vfs_sync(vfsp, SYNC_FSDATA | SYNC_BDFLUSH | \ 549 error = bhv_vfs_sync(vfsp, SYNC_FSDATA | SYNC_BDFLUSH | \
550 SYNC_ATTR | SYNC_REFCACHE, NULL); 550 SYNC_ATTR | SYNC_REFCACHE | SYNC_SUPER,
551 NULL);
551 vfsp->vfs_sync_seq++; 552 vfsp->vfs_sync_seq++;
552 wake_up(&vfsp->vfs_wait_single_sync_task); 553 wake_up(&vfsp->vfs_wait_single_sync_task);
553} 554}
@@ -561,6 +562,7 @@ xfssyncd(
561 bhv_vfs_sync_work_t *work, *n; 562 bhv_vfs_sync_work_t *work, *n;
562 LIST_HEAD (tmp); 563 LIST_HEAD (tmp);
563 564
565 set_freezable();
564 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); 566 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
565 for (;;) { 567 for (;;) {
566 timeleft = schedule_timeout_interruptible(timeleft); 568 timeleft = schedule_timeout_interruptible(timeleft);
@@ -663,7 +665,7 @@ xfs_fs_sync_super(
663 * occur here so don't bother flushing the buftarg (i.e 665 * occur here so don't bother flushing the buftarg (i.e
664 * SYNC_QUIESCE) because it'll just get dirty again. 666 * SYNC_QUIESCE) because it'll just get dirty again.
665 */ 667 */
666 flags = SYNC_FSDATA | SYNC_DELWRI | SYNC_WAIT | SYNC_IOWAIT; 668 flags = SYNC_DATA_QUIESCE;
667 } else 669 } else
668 flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0); 670 flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0);
669 671
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 33dd1ca132..201cc3273c 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_SUPER_H__ 18#ifndef __XFS_SUPER_H__
19#define __XFS_SUPER_H__ 19#define __XFS_SUPER_H__
20 20
21#include <linux/exportfs.h>
22
21#ifdef CONFIG_XFS_DMAPI 23#ifdef CONFIG_XFS_DMAPI
22# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops) 24# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops)
23# define vfs_initdmapi() dmapi_init() 25# define vfs_initdmapi() dmapi_init()
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index cd6eaa44aa..bb997d75c0 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -210,6 +210,17 @@ static ctl_table xfs_table[] = {
210 .extra1 = &xfs_params.inherit_nodfrg.min, 210 .extra1 = &xfs_params.inherit_nodfrg.min,
211 .extra2 = &xfs_params.inherit_nodfrg.max 211 .extra2 = &xfs_params.inherit_nodfrg.max
212 }, 212 },
213 {
214 .ctl_name = XFS_FILESTREAM_TIMER,
215 .procname = "filestream_centisecs",
216 .data = &xfs_params.fstrm_timer.val,
217 .maxlen = sizeof(int),
218 .mode = 0644,
219 .proc_handler = &proc_dointvec_minmax,
220 .strategy = &sysctl_intvec,
221 .extra1 = &xfs_params.fstrm_timer.min,
222 .extra2 = &xfs_params.fstrm_timer.max,
223 },
213 /* please keep this the last entry */ 224 /* please keep this the last entry */
214#ifdef CONFIG_PROC_FS 225#ifdef CONFIG_PROC_FS
215 { 226 {
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index a631fb8cc5..98b97e399d 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -47,6 +47,7 @@ typedef struct xfs_param {
47 xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ 47 xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
48 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ 48 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
49 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ 49 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
50 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
50} xfs_param_t; 51} xfs_param_t;
51 52
52/* 53/*
@@ -86,6 +87,7 @@ enum {
86 XFS_INHERIT_NOSYM = 19, 87 XFS_INHERIT_NOSYM = 19,
87 XFS_ROTORSTEP = 20, 88 XFS_ROTORSTEP = 20,
88 XFS_INHERIT_NODFRG = 21, 89 XFS_INHERIT_NODFRG = 21,
90 XFS_FILESTREAM_TIMER = 22,
89}; 91};
90 92
91extern xfs_param_t xfs_params; 93extern xfs_param_t xfs_params;
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index e2c2ce98ab..dca3481aaa 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -92,6 +92,21 @@ typedef enum {
92#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ 92#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
93#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ 93#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
94#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ 94#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */
95#define SYNC_SUPER 0x0200 /* flush superblock to disk */
96
97/*
98 * When remounting a filesystem read-only or freezing the filesystem,
99 * we have two phases to execute. This first phase is syncing the data
100 * before we quiesce the fielsystem, and the second is flushing all the
101 * inodes out after we've waited for all the transactions created by
102 * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE
103 * to ensure that the inodes are written to their location on disk
104 * rather than just existing in transactions in the log. This means
105 * after a quiesce there is no log replay required to write the inodes
106 * to disk (this is the main difference between a sync and a quiesce).
107 */
108#define SYNC_DATA_QUIESCE (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT)
109#define SYNC_INODE_QUIESCE (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
95 110
96#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ 111#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
97#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ 112#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index d1b2d01843..5742d65f07 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -129,19 +129,13 @@ typedef enum bhv_vchange {
129 VCHANGE_FLAGS_IOEXCL_COUNT = 4 129 VCHANGE_FLAGS_IOEXCL_COUNT = 4
130} bhv_vchange_t; 130} bhv_vchange_t;
131 131
132typedef enum { L_FALSE, L_TRUE } lastclose_t;
133
134typedef int (*vop_open_t)(bhv_desc_t *, struct cred *); 132typedef int (*vop_open_t)(bhv_desc_t *, struct cred *);
135typedef int (*vop_close_t)(bhv_desc_t *, int, lastclose_t, struct cred *);
136typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *, 133typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
137 const struct iovec *, unsigned int, 134 const struct iovec *, unsigned int,
138 loff_t *, int, struct cred *); 135 loff_t *, int, struct cred *);
139typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, 136typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *,
140 const struct iovec *, unsigned int, 137 const struct iovec *, unsigned int,
141 loff_t *, int, struct cred *); 138 loff_t *, int, struct cred *);
142typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *,
143 loff_t *, int, size_t, read_actor_t,
144 void *, struct cred *);
145typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, 139typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *,
146 struct pipe_inode_info *, size_t, int, int, 140 struct pipe_inode_info *, size_t, int, int,
147 struct cred *); 141 struct cred *);
@@ -203,10 +197,8 @@ typedef int (*vop_iflush_t)(bhv_desc_t *, int);
203typedef struct bhv_vnodeops { 197typedef struct bhv_vnodeops {
204 bhv_position_t vn_position; /* position within behavior chain */ 198 bhv_position_t vn_position; /* position within behavior chain */
205 vop_open_t vop_open; 199 vop_open_t vop_open;
206 vop_close_t vop_close;
207 vop_read_t vop_read; 200 vop_read_t vop_read;
208 vop_write_t vop_write; 201 vop_write_t vop_write;
209 vop_sendfile_t vop_sendfile;
210 vop_splice_read_t vop_splice_read; 202 vop_splice_read_t vop_splice_read;
211 vop_splice_write_t vop_splice_write; 203 vop_splice_write_t vop_splice_write;
212 vop_ioctl_t vop_ioctl; 204 vop_ioctl_t vop_ioctl;
@@ -249,13 +241,10 @@ typedef struct bhv_vnodeops {
249#define VNHEAD(vp) ((vp)->v_bh.bh_first) 241#define VNHEAD(vp) ((vp)->v_bh.bh_first)
250#define VOP(op, vp) (*((bhv_vnodeops_t *)VNHEAD(vp)->bd_ops)->op) 242#define VOP(op, vp) (*((bhv_vnodeops_t *)VNHEAD(vp)->bd_ops)->op)
251#define bhv_vop_open(vp, cr) VOP(vop_open, vp)(VNHEAD(vp),cr) 243#define bhv_vop_open(vp, cr) VOP(vop_open, vp)(VNHEAD(vp),cr)
252#define bhv_vop_close(vp, f,last,cr) VOP(vop_close, vp)(VNHEAD(vp),f,last,cr)
253#define bhv_vop_read(vp,file,iov,segs,offset,ioflags,cr) \ 244#define bhv_vop_read(vp,file,iov,segs,offset,ioflags,cr) \
254 VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr) 245 VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
255#define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr) \ 246#define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr) \
256 VOP(vop_write, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr) 247 VOP(vop_write, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
257#define bhv_vop_sendfile(vp,f,off,ioflags,cnt,act,targ,cr) \
258 VOP(vop_sendfile, vp)(VNHEAD(vp),f,off,ioflags,cnt,act,targ,cr)
259#define bhv_vop_splice_read(vp,f,o,pipe,cnt,fl,iofl,cr) \ 248#define bhv_vop_splice_read(vp,f,o,pipe,cnt,fl,iofl,cr) \
260 VOP(vop_splice_read, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr) 249 VOP(vop_splice_read, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr)
261#define bhv_vop_splice_write(vp,f,o,pipe,cnt,fl,iofl,cr) \ 250#define bhv_vop_splice_write(vp,f,o,pipe,cnt,fl,iofl,cr) \
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 3e4a8ad8a3..2d274b23ad 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -62,10 +62,8 @@ uint ndquot;
62 62
63kmem_zone_t *qm_dqzone; 63kmem_zone_t *qm_dqzone;
64kmem_zone_t *qm_dqtrxzone; 64kmem_zone_t *qm_dqtrxzone;
65static kmem_shaker_t xfs_qm_shaker;
66 65
67static cred_t xfs_zerocr; 66static cred_t xfs_zerocr;
68static xfs_inode_t xfs_zeroino;
69 67
70STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 68STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
71STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 69STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
@@ -79,6 +77,11 @@ STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
79STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 77STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
80STATIC int xfs_qm_shake(int, gfp_t); 78STATIC int xfs_qm_shake(int, gfp_t);
81 79
80static struct shrinker xfs_qm_shaker = {
81 .shrink = xfs_qm_shake,
82 .seeks = DEFAULT_SEEKS,
83};
84
82#ifdef DEBUG 85#ifdef DEBUG
83extern mutex_t qcheck_lock; 86extern mutex_t qcheck_lock;
84#endif 87#endif
@@ -150,7 +153,7 @@ xfs_Gqm_init(void)
150 } else 153 } else
151 xqm->qm_dqzone = qm_dqzone; 154 xqm->qm_dqzone = qm_dqzone;
152 155
153 xfs_qm_shaker = kmem_shake_register(xfs_qm_shake); 156 register_shrinker(&xfs_qm_shaker);
154 157
155 /* 158 /*
156 * The t_dqinfo portion of transactions. 159 * The t_dqinfo portion of transactions.
@@ -182,7 +185,7 @@ xfs_qm_destroy(
182 185
183 ASSERT(xqm != NULL); 186 ASSERT(xqm != NULL);
184 ASSERT(xqm->qm_nrefs == 0); 187 ASSERT(xqm->qm_nrefs == 0);
185 kmem_shake_deregister(xfs_qm_shaker); 188 unregister_shrinker(&xfs_qm_shaker);
186 hsize = xqm->qm_dqhashmask + 1; 189 hsize = xqm->qm_dqhashmask + 1;
187 for (i = 0; i < hsize; i++) { 190 for (i = 0; i < hsize; i++) {
188 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); 191 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
@@ -1415,7 +1418,7 @@ xfs_qm_qino_alloc(
1415 return error; 1418 return error;
1416 } 1419 }
1417 1420
1418 if ((error = xfs_dir_ialloc(&tp, &xfs_zeroino, S_IFREG, 1, 0, 1421 if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0,
1419 &xfs_zerocr, 0, 1, ip, &committed))) { 1422 &xfs_zerocr, 0, 1, ip, &committed))) {
1420 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1423 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1421 XFS_TRANS_ABORT); 1424 XFS_TRANS_ABORT);
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index bf0a12040b..b5a7d92c68 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -38,6 +38,7 @@
38#define XFS_RW_TRACE 1 38#define XFS_RW_TRACE 1
39#define XFS_BUF_TRACE 1 39#define XFS_BUF_TRACE 1
40#define XFS_VNODE_TRACE 1 40#define XFS_VNODE_TRACE 1
41#define XFS_FILESTREAMS_TRACE 1
41#endif 42#endif
42 43
43#include <linux-2.6/xfs_linux.h> 44#include <linux-2.6/xfs_linux.h>
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 9ece7f87ec..51c09c114a 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -68,6 +68,7 @@ typedef struct xfs_agf {
68 __be32 agf_flcount; /* count of blocks in freelist */ 68 __be32 agf_flcount; /* count of blocks in freelist */
69 __be32 agf_freeblks; /* total free blocks */ 69 __be32 agf_freeblks; /* total free blocks */
70 __be32 agf_longest; /* longest free space */ 70 __be32 agf_longest; /* longest free space */
71 __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
71} xfs_agf_t; 72} xfs_agf_t;
72 73
73#define XFS_AGF_MAGICNUM 0x00000001 74#define XFS_AGF_MAGICNUM 0x00000001
@@ -81,7 +82,8 @@ typedef struct xfs_agf {
81#define XFS_AGF_FLCOUNT 0x00000100 82#define XFS_AGF_FLCOUNT 0x00000100
82#define XFS_AGF_FREEBLKS 0x00000200 83#define XFS_AGF_FREEBLKS 0x00000200
83#define XFS_AGF_LONGEST 0x00000400 84#define XFS_AGF_LONGEST 0x00000400
84#define XFS_AGF_NUM_BITS 11 85#define XFS_AGF_BTREEBLKS 0x00000800
86#define XFS_AGF_NUM_BITS 12
85#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) 87#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
86 88
87/* disk block (xfs_daddr_t) in the AG */ 89/* disk block (xfs_daddr_t) in the AG */
@@ -186,12 +188,15 @@ typedef struct xfs_perag
186 __uint32_t pagf_flcount; /* count of blocks in freelist */ 188 __uint32_t pagf_flcount; /* count of blocks in freelist */
187 xfs_extlen_t pagf_freeblks; /* total free blocks */ 189 xfs_extlen_t pagf_freeblks; /* total free blocks */
188 xfs_extlen_t pagf_longest; /* longest free space */ 190 xfs_extlen_t pagf_longest; /* longest free space */
191 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
189 xfs_agino_t pagi_freecount; /* number of free inodes */ 192 xfs_agino_t pagi_freecount; /* number of free inodes */
193 xfs_agino_t pagi_count; /* number of allocated inodes */
194 int pagb_count; /* pagb slots in use */
190#ifdef __KERNEL__ 195#ifdef __KERNEL__
191 lock_t pagb_lock; /* lock for pagb_list */ 196 lock_t pagb_lock; /* lock for pagb_list */
192#endif 197#endif
193 int pagb_count; /* pagb slots in use */
194 xfs_perag_busy_t *pagb_list; /* unstable blocks */ 198 xfs_perag_busy_t *pagb_list; /* unstable blocks */
199 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
195} xfs_perag_t; 200} xfs_perag_t;
196 201
197#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 202#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 8e9a40aa0c..012a649a19 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -55,17 +55,17 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
55ktrace_t *xfs_alloc_trace_buf; 55ktrace_t *xfs_alloc_trace_buf;
56 56
57#define TRACE_ALLOC(s,a) \ 57#define TRACE_ALLOC(s,a) \
58 xfs_alloc_trace_alloc(fname, s, a, __LINE__) 58 xfs_alloc_trace_alloc(__FUNCTION__, s, a, __LINE__)
59#define TRACE_FREE(s,a,b,x,f) \ 59#define TRACE_FREE(s,a,b,x,f) \
60 xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__) 60 xfs_alloc_trace_free(__FUNCTION__, s, mp, a, b, x, f, __LINE__)
61#define TRACE_MODAGF(s,a,f) \ 61#define TRACE_MODAGF(s,a,f) \
62 xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__) 62 xfs_alloc_trace_modagf(__FUNCTION__, s, mp, a, f, __LINE__)
63#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp) \ 63#define TRACE_BUSY(__FUNCTION__,s,ag,agb,l,sl,tp) \
64 xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) 64 xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
65#define TRACE_UNBUSY(fname,s,ag,sl,tp) \ 65#define TRACE_UNBUSY(__FUNCTION__,s,ag,sl,tp) \
66 xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) 66 xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
67#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) \ 67#define TRACE_BUSYSEARCH(__FUNCTION__,s,ag,agb,l,sl,tp) \
68 xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) 68 xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
69#else 69#else
70#define TRACE_ALLOC(s,a) 70#define TRACE_ALLOC(s,a)
71#define TRACE_FREE(s,a,b,x,f) 71#define TRACE_FREE(s,a,b,x,f)
@@ -420,7 +420,7 @@ xfs_alloc_read_agfl(
420 */ 420 */
421STATIC void 421STATIC void
422xfs_alloc_trace_alloc( 422xfs_alloc_trace_alloc(
423 char *name, /* function tag string */ 423 const char *name, /* function tag string */
424 char *str, /* additional string */ 424 char *str, /* additional string */
425 xfs_alloc_arg_t *args, /* allocation argument structure */ 425 xfs_alloc_arg_t *args, /* allocation argument structure */
426 int line) /* source line number */ 426 int line) /* source line number */
@@ -453,7 +453,7 @@ xfs_alloc_trace_alloc(
453 */ 453 */
454STATIC void 454STATIC void
455xfs_alloc_trace_free( 455xfs_alloc_trace_free(
456 char *name, /* function tag string */ 456 const char *name, /* function tag string */
457 char *str, /* additional string */ 457 char *str, /* additional string */
458 xfs_mount_t *mp, /* file system mount point */ 458 xfs_mount_t *mp, /* file system mount point */
459 xfs_agnumber_t agno, /* allocation group number */ 459 xfs_agnumber_t agno, /* allocation group number */
@@ -479,7 +479,7 @@ xfs_alloc_trace_free(
479 */ 479 */
480STATIC void 480STATIC void
481xfs_alloc_trace_modagf( 481xfs_alloc_trace_modagf(
482 char *name, /* function tag string */ 482 const char *name, /* function tag string */
483 char *str, /* additional string */ 483 char *str, /* additional string */
484 xfs_mount_t *mp, /* file system mount point */ 484 xfs_mount_t *mp, /* file system mount point */
485 xfs_agf_t *agf, /* new agf value */ 485 xfs_agf_t *agf, /* new agf value */
@@ -507,7 +507,7 @@ xfs_alloc_trace_modagf(
507 507
508STATIC void 508STATIC void
509xfs_alloc_trace_busy( 509xfs_alloc_trace_busy(
510 char *name, /* function tag string */ 510 const char *name, /* function tag string */
511 char *str, /* additional string */ 511 char *str, /* additional string */
512 xfs_mount_t *mp, /* file system mount point */ 512 xfs_mount_t *mp, /* file system mount point */
513 xfs_agnumber_t agno, /* allocation group number */ 513 xfs_agnumber_t agno, /* allocation group number */
@@ -549,9 +549,6 @@ xfs_alloc_ag_vextent(
549 xfs_alloc_arg_t *args) /* argument structure for allocation */ 549 xfs_alloc_arg_t *args) /* argument structure for allocation */
550{ 550{
551 int error=0; 551 int error=0;
552#ifdef XFS_ALLOC_TRACE
553 static char fname[] = "xfs_alloc_ag_vextent";
554#endif
555 552
556 ASSERT(args->minlen > 0); 553 ASSERT(args->minlen > 0);
557 ASSERT(args->maxlen > 0); 554 ASSERT(args->maxlen > 0);
@@ -635,9 +632,6 @@ xfs_alloc_ag_vextent_exact(
635 xfs_agblock_t fbno; /* start block of found extent */ 632 xfs_agblock_t fbno; /* start block of found extent */
636 xfs_agblock_t fend; /* end block of found extent */ 633 xfs_agblock_t fend; /* end block of found extent */
637 xfs_extlen_t flen; /* length of found extent */ 634 xfs_extlen_t flen; /* length of found extent */
638#ifdef XFS_ALLOC_TRACE
639 static char fname[] = "xfs_alloc_ag_vextent_exact";
640#endif
641 int i; /* success/failure of operation */ 635 int i; /* success/failure of operation */
642 xfs_agblock_t maxend; /* end of maximal extent */ 636 xfs_agblock_t maxend; /* end of maximal extent */
643 xfs_agblock_t minend; /* end of minimal extent */ 637 xfs_agblock_t minend; /* end of minimal extent */
@@ -737,9 +731,6 @@ xfs_alloc_ag_vextent_near(
737 xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ 731 xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */
738 xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ 732 xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */
739 xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ 733 xfs_btree_cur_t *cnt_cur; /* cursor for count btree */
740#ifdef XFS_ALLOC_TRACE
741 static char fname[] = "xfs_alloc_ag_vextent_near";
742#endif
743 xfs_agblock_t gtbno; /* start bno of right side entry */ 734 xfs_agblock_t gtbno; /* start bno of right side entry */
744 xfs_agblock_t gtbnoa; /* aligned ... */ 735 xfs_agblock_t gtbnoa; /* aligned ... */
745 xfs_extlen_t gtdiff; /* difference to right side entry */ 736 xfs_extlen_t gtdiff; /* difference to right side entry */
@@ -1270,9 +1261,6 @@ xfs_alloc_ag_vextent_size(
1270 int error; /* error result */ 1261 int error; /* error result */
1271 xfs_agblock_t fbno; /* start of found freespace */ 1262 xfs_agblock_t fbno; /* start of found freespace */
1272 xfs_extlen_t flen; /* length of found freespace */ 1263 xfs_extlen_t flen; /* length of found freespace */
1273#ifdef XFS_ALLOC_TRACE
1274 static char fname[] = "xfs_alloc_ag_vextent_size";
1275#endif
1276 int i; /* temp status variable */ 1264 int i; /* temp status variable */
1277 xfs_agblock_t rbno; /* returned block number */ 1265 xfs_agblock_t rbno; /* returned block number */
1278 xfs_extlen_t rlen; /* length of returned extent */ 1266 xfs_extlen_t rlen; /* length of returned extent */
@@ -1427,9 +1415,6 @@ xfs_alloc_ag_vextent_small(
1427 int error; 1415 int error;
1428 xfs_agblock_t fbno; 1416 xfs_agblock_t fbno;
1429 xfs_extlen_t flen; 1417 xfs_extlen_t flen;
1430#ifdef XFS_ALLOC_TRACE
1431 static char fname[] = "xfs_alloc_ag_vextent_small";
1432#endif
1433 int i; 1418 int i;
1434 1419
1435 if ((error = xfs_alloc_decrement(ccur, 0, &i))) 1420 if ((error = xfs_alloc_decrement(ccur, 0, &i)))
@@ -1447,7 +1432,8 @@ xfs_alloc_ag_vextent_small(
1447 else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && 1432 else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
1448 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) 1433 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
1449 > args->minleft)) { 1434 > args->minleft)) {
1450 if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno))) 1435 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
1436 if (error)
1451 goto error0; 1437 goto error0;
1452 if (fbno != NULLAGBLOCK) { 1438 if (fbno != NULLAGBLOCK) {
1453 if (args->userdata) { 1439 if (args->userdata) {
@@ -1515,9 +1501,6 @@ xfs_free_ag_extent(
1515 xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ 1501 xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
1516 xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ 1502 xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
1517 int error; /* error return value */ 1503 int error; /* error return value */
1518#ifdef XFS_ALLOC_TRACE
1519 static char fname[] = "xfs_free_ag_extent";
1520#endif
1521 xfs_agblock_t gtbno; /* start of right neighbor block */ 1504 xfs_agblock_t gtbno; /* start of right neighbor block */
1522 xfs_extlen_t gtlen; /* length of right neighbor block */ 1505 xfs_extlen_t gtlen; /* length of right neighbor block */
1523 int haveleft; /* have a left neighbor block */ 1506 int haveleft; /* have a left neighbor block */
@@ -1923,7 +1906,8 @@ xfs_alloc_fix_freelist(
1923 while (be32_to_cpu(agf->agf_flcount) > need) { 1906 while (be32_to_cpu(agf->agf_flcount) > need) {
1924 xfs_buf_t *bp; 1907 xfs_buf_t *bp;
1925 1908
1926 if ((error = xfs_alloc_get_freelist(tp, agbp, &bno))) 1909 error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
1910 if (error)
1927 return error; 1911 return error;
1928 if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) 1912 if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
1929 return error; 1913 return error;
@@ -1973,8 +1957,9 @@ xfs_alloc_fix_freelist(
1973 * Put each allocated block on the list. 1957 * Put each allocated block on the list.
1974 */ 1958 */
1975 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 1959 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
1976 if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp, 1960 error = xfs_alloc_put_freelist(tp, agbp,
1977 bno))) 1961 agflbp, bno, 0);
1962 if (error)
1978 return error; 1963 return error;
1979 } 1964 }
1980 } 1965 }
@@ -1991,16 +1976,15 @@ int /* error */
1991xfs_alloc_get_freelist( 1976xfs_alloc_get_freelist(
1992 xfs_trans_t *tp, /* transaction pointer */ 1977 xfs_trans_t *tp, /* transaction pointer */
1993 xfs_buf_t *agbp, /* buffer containing the agf structure */ 1978 xfs_buf_t *agbp, /* buffer containing the agf structure */
1994 xfs_agblock_t *bnop) /* block address retrieved from freelist */ 1979 xfs_agblock_t *bnop, /* block address retrieved from freelist */
1980 int btreeblk) /* destination is a AGF btree */
1995{ 1981{
1996 xfs_agf_t *agf; /* a.g. freespace structure */ 1982 xfs_agf_t *agf; /* a.g. freespace structure */
1997 xfs_agfl_t *agfl; /* a.g. freelist structure */ 1983 xfs_agfl_t *agfl; /* a.g. freelist structure */
1998 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ 1984 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
1999 xfs_agblock_t bno; /* block number returned */ 1985 xfs_agblock_t bno; /* block number returned */
2000 int error; 1986 int error;
2001#ifdef XFS_ALLOC_TRACE 1987 int logflags;
2002 static char fname[] = "xfs_alloc_get_freelist";
2003#endif
2004 xfs_mount_t *mp; /* mount structure */ 1988 xfs_mount_t *mp; /* mount structure */
2005 xfs_perag_t *pag; /* per allocation group data */ 1989 xfs_perag_t *pag; /* per allocation group data */
2006 1990
@@ -2032,8 +2016,16 @@ xfs_alloc_get_freelist(
2032 be32_add(&agf->agf_flcount, -1); 2016 be32_add(&agf->agf_flcount, -1);
2033 xfs_trans_agflist_delta(tp, -1); 2017 xfs_trans_agflist_delta(tp, -1);
2034 pag->pagf_flcount--; 2018 pag->pagf_flcount--;
2035 TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); 2019
2036 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); 2020 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2021 if (btreeblk) {
2022 be32_add(&agf->agf_btreeblks, 1);
2023 pag->pagf_btreeblks++;
2024 logflags |= XFS_AGF_BTREEBLKS;
2025 }
2026
2027 TRACE_MODAGF(NULL, agf, logflags);
2028 xfs_alloc_log_agf(tp, agbp, logflags);
2037 *bnop = bno; 2029 *bnop = bno;
2038 2030
2039 /* 2031 /*
@@ -2071,6 +2063,7 @@ xfs_alloc_log_agf(
2071 offsetof(xfs_agf_t, agf_flcount), 2063 offsetof(xfs_agf_t, agf_flcount),
2072 offsetof(xfs_agf_t, agf_freeblks), 2064 offsetof(xfs_agf_t, agf_freeblks),
2073 offsetof(xfs_agf_t, agf_longest), 2065 offsetof(xfs_agf_t, agf_longest),
2066 offsetof(xfs_agf_t, agf_btreeblks),
2074 sizeof(xfs_agf_t) 2067 sizeof(xfs_agf_t)
2075 }; 2068 };
2076 2069
@@ -2106,15 +2099,14 @@ xfs_alloc_put_freelist(
2106 xfs_trans_t *tp, /* transaction pointer */ 2099 xfs_trans_t *tp, /* transaction pointer */
2107 xfs_buf_t *agbp, /* buffer for a.g. freelist header */ 2100 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
2108 xfs_buf_t *agflbp,/* buffer for a.g. free block array */ 2101 xfs_buf_t *agflbp,/* buffer for a.g. free block array */
2109 xfs_agblock_t bno) /* block being freed */ 2102 xfs_agblock_t bno, /* block being freed */
2103 int btreeblk) /* block came from a AGF btree */
2110{ 2104{
2111 xfs_agf_t *agf; /* a.g. freespace structure */ 2105 xfs_agf_t *agf; /* a.g. freespace structure */
2112 xfs_agfl_t *agfl; /* a.g. free block array */ 2106 xfs_agfl_t *agfl; /* a.g. free block array */
2113 __be32 *blockp;/* pointer to array entry */ 2107 __be32 *blockp;/* pointer to array entry */
2114 int error; 2108 int error;
2115#ifdef XFS_ALLOC_TRACE 2109 int logflags;
2116 static char fname[] = "xfs_alloc_put_freelist";
2117#endif
2118 xfs_mount_t *mp; /* mount structure */ 2110 xfs_mount_t *mp; /* mount structure */
2119 xfs_perag_t *pag; /* per allocation group data */ 2111 xfs_perag_t *pag; /* per allocation group data */
2120 2112
@@ -2132,11 +2124,22 @@ xfs_alloc_put_freelist(
2132 be32_add(&agf->agf_flcount, 1); 2124 be32_add(&agf->agf_flcount, 1);
2133 xfs_trans_agflist_delta(tp, 1); 2125 xfs_trans_agflist_delta(tp, 1);
2134 pag->pagf_flcount++; 2126 pag->pagf_flcount++;
2127
2128 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2129 if (btreeblk) {
2130 be32_add(&agf->agf_btreeblks, -1);
2131 pag->pagf_btreeblks--;
2132 logflags |= XFS_AGF_BTREEBLKS;
2133 }
2134
2135 TRACE_MODAGF(NULL, agf, logflags);
2136 xfs_alloc_log_agf(tp, agbp, logflags);
2137
2135 ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); 2138 ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
2136 blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; 2139 blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
2137 *blockp = cpu_to_be32(bno); 2140 *blockp = cpu_to_be32(bno);
2138 TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); 2141 TRACE_MODAGF(NULL, agf, logflags);
2139 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); 2142 xfs_alloc_log_agf(tp, agbp, logflags);
2140 xfs_trans_log_buf(tp, agflbp, 2143 xfs_trans_log_buf(tp, agflbp,
2141 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), 2144 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
2142 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + 2145 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
@@ -2196,6 +2199,7 @@ xfs_alloc_read_agf(
2196 pag = &mp->m_perag[agno]; 2199 pag = &mp->m_perag[agno];
2197 if (!pag->pagf_init) { 2200 if (!pag->pagf_init) {
2198 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 2201 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
2202 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
2199 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 2203 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
2200 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 2204 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
2201 pag->pagf_levels[XFS_BTNUM_BNOi] = 2205 pag->pagf_levels[XFS_BTNUM_BNOi] =
@@ -2235,9 +2239,6 @@ xfs_alloc_vextent(
2235 xfs_agblock_t agsize; /* allocation group size */ 2239 xfs_agblock_t agsize; /* allocation group size */
2236 int error; 2240 int error;
2237 int flags; /* XFS_ALLOC_FLAG_... locking flags */ 2241 int flags; /* XFS_ALLOC_FLAG_... locking flags */
2238#ifdef XFS_ALLOC_TRACE
2239 static char fname[] = "xfs_alloc_vextent";
2240#endif
2241 xfs_extlen_t minleft;/* minimum left value, temp copy */ 2242 xfs_extlen_t minleft;/* minimum left value, temp copy */
2242 xfs_mount_t *mp; /* mount structure pointer */ 2243 xfs_mount_t *mp; /* mount structure pointer */
2243 xfs_agnumber_t sagno; /* starting allocation group number */ 2244 xfs_agnumber_t sagno; /* starting allocation group number */
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 5a4256120c..5aec15d065 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -136,7 +136,8 @@ int /* error */
136xfs_alloc_get_freelist( 136xfs_alloc_get_freelist(
137 struct xfs_trans *tp, /* transaction pointer */ 137 struct xfs_trans *tp, /* transaction pointer */
138 struct xfs_buf *agbp, /* buffer containing the agf structure */ 138 struct xfs_buf *agbp, /* buffer containing the agf structure */
139 xfs_agblock_t *bnop); /* block address retrieved from freelist */ 139 xfs_agblock_t *bnop, /* block address retrieved from freelist */
140 int btreeblk); /* destination is a AGF btree */
140 141
141/* 142/*
142 * Log the given fields from the agf structure. 143 * Log the given fields from the agf structure.
@@ -165,7 +166,8 @@ xfs_alloc_put_freelist(
165 struct xfs_trans *tp, /* transaction pointer */ 166 struct xfs_trans *tp, /* transaction pointer */
166 struct xfs_buf *agbp, /* buffer for a.g. freelist header */ 167 struct xfs_buf *agbp, /* buffer for a.g. freelist header */
167 struct xfs_buf *agflbp,/* buffer for a.g. free block array */ 168 struct xfs_buf *agflbp,/* buffer for a.g. free block array */
168 xfs_agblock_t bno); /* block being freed */ 169 xfs_agblock_t bno, /* block being freed */
170 int btreeblk); /* owner was a AGF btree */
169 171
170/* 172/*
171 * Read in the allocation group header (free/alloc section). 173 * Read in the allocation group header (free/alloc section).
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 74cadf95d4..1603ce5958 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -226,8 +226,9 @@ xfs_alloc_delrec(
226 /* 226 /*
227 * Put this buffer/block on the ag's freelist. 227 * Put this buffer/block on the ag's freelist.
228 */ 228 */
229 if ((error = xfs_alloc_put_freelist(cur->bc_tp, 229 error = xfs_alloc_put_freelist(cur->bc_tp,
230 cur->bc_private.a.agbp, NULL, bno))) 230 cur->bc_private.a.agbp, NULL, bno, 1);
231 if (error)
231 return error; 232 return error;
232 /* 233 /*
233 * Since blocks move to the free list without the 234 * Since blocks move to the free list without the
@@ -549,8 +550,9 @@ xfs_alloc_delrec(
549 /* 550 /*
550 * Free the deleting block by putting it on the freelist. 551 * Free the deleting block by putting it on the freelist.
551 */ 552 */
552 if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp, 553 error = xfs_alloc_put_freelist(cur->bc_tp,
553 NULL, rbno))) 554 cur->bc_private.a.agbp, NULL, rbno, 1);
555 if (error)
554 return error; 556 return error;
555 /* 557 /*
556 * Since blocks move to the free list without the coordination 558 * Since blocks move to the free list without the coordination
@@ -1320,8 +1322,9 @@ xfs_alloc_newroot(
1320 /* 1322 /*
1321 * Get a buffer from the freelist blocks, for the new root. 1323 * Get a buffer from the freelist blocks, for the new root.
1322 */ 1324 */
1323 if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, 1325 error = xfs_alloc_get_freelist(cur->bc_tp,
1324 &nbno))) 1326 cur->bc_private.a.agbp, &nbno, 1);
1327 if (error)
1325 return error; 1328 return error;
1326 /* 1329 /*
1327 * None available, we fail. 1330 * None available, we fail.
@@ -1604,8 +1607,9 @@ xfs_alloc_split(
1604 * Allocate the new block from the freelist. 1607 * Allocate the new block from the freelist.
1605 * If we can't do it, we're toast. Give up. 1608 * If we can't do it, we're toast. Give up.
1606 */ 1609 */
1607 if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, 1610 error = xfs_alloc_get_freelist(cur->bc_tp,
1608 &rbno))) 1611 cur->bc_private.a.agbp, &rbno, 1);
1612 if (error)
1609 return error; 1613 return error;
1610 if (rbno == NULLAGBLOCK) { 1614 if (rbno == NULLAGBLOCK) {
1611 *stat = 0; 1615 *stat = 0;
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index 1afe07f67e..fab0b6d5a4 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -66,44 +66,6 @@ static const char xfs_highbit[256] = {
66#endif 66#endif
67 67
68/* 68/*
69 * Count of bits set in byte, 0..8.
70 */
71static const char xfs_countbit[256] = {
72 0, 1, 1, 2, 1, 2, 2, 3, /* 00 .. 07 */
73 1, 2, 2, 3, 2, 3, 3, 4, /* 08 .. 0f */
74 1, 2, 2, 3, 2, 3, 3, 4, /* 10 .. 17 */
75 2, 3, 3, 4, 3, 4, 4, 5, /* 18 .. 1f */
76 1, 2, 2, 3, 2, 3, 3, 4, /* 20 .. 27 */
77 2, 3, 3, 4, 3, 4, 4, 5, /* 28 .. 2f */
78 2, 3, 3, 4, 3, 4, 4, 5, /* 30 .. 37 */
79 3, 4, 4, 5, 4, 5, 5, 6, /* 38 .. 3f */
80 1, 2, 2, 3, 2, 3, 3, 4, /* 40 .. 47 */
81 2, 3, 3, 4, 3, 4, 4, 5, /* 48 .. 4f */
82 2, 3, 3, 4, 3, 4, 4, 5, /* 50 .. 57 */
83 3, 4, 4, 5, 4, 5, 5, 6, /* 58 .. 5f */
84 2, 3, 3, 4, 3, 4, 4, 5, /* 60 .. 67 */
85 3, 4, 4, 5, 4, 5, 5, 6, /* 68 .. 6f */
86 3, 4, 4, 5, 4, 5, 5, 6, /* 70 .. 77 */
87 4, 5, 5, 6, 5, 6, 6, 7, /* 78 .. 7f */
88 1, 2, 2, 3, 2, 3, 3, 4, /* 80 .. 87 */
89 2, 3, 3, 4, 3, 4, 4, 5, /* 88 .. 8f */
90 2, 3, 3, 4, 3, 4, 4, 5, /* 90 .. 97 */
91 3, 4, 4, 5, 4, 5, 5, 6, /* 98 .. 9f */
92 2, 3, 3, 4, 3, 4, 4, 5, /* a0 .. a7 */
93 3, 4, 4, 5, 4, 5, 5, 6, /* a8 .. af */
94 3, 4, 4, 5, 4, 5, 5, 6, /* b0 .. b7 */
95 4, 5, 5, 6, 5, 6, 6, 7, /* b8 .. bf */
96 2, 3, 3, 4, 3, 4, 4, 5, /* c0 .. c7 */
97 3, 4, 4, 5, 4, 5, 5, 6, /* c8 .. cf */
98 3, 4, 4, 5, 4, 5, 5, 6, /* d0 .. d7 */
99 4, 5, 5, 6, 5, 6, 6, 7, /* d8 .. df */
100 3, 4, 4, 5, 4, 5, 5, 6, /* e0 .. e7 */
101 4, 5, 5, 6, 5, 6, 6, 7, /* e8 .. ef */
102 4, 5, 5, 6, 5, 6, 6, 7, /* f0 .. f7 */
103 5, 6, 6, 7, 6, 7, 7, 8, /* f8 .. ff */
104};
105
106/*
107 * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set. 69 * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
108 */ 70 */
109inline int 71inline int
@@ -167,56 +129,21 @@ xfs_highbit64(
167 129
168 130
169/* 131/*
170 * Count the number of bits set in the bitmap starting with bit 132 * Return whether bitmap is empty.
171 * start_bit. Size is the size of the bitmap in words. 133 * Size is number of words in the bitmap, which is padded to word boundary
172 * 134 * Returns 1 for empty, 0 for non-empty.
173 * Do the counting by mapping a byte value to the number of set
174 * bits for that value using the xfs_countbit array, i.e.
175 * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1,
176 * xfs_countbit[3] == 2, etc.
177 */ 135 */
178int 136int
179xfs_count_bits(uint *map, uint size, uint start_bit) 137xfs_bitmap_empty(uint *map, uint size)
180{ 138{
181 register int bits; 139 uint i;
182 register unsigned char *bytep; 140 uint ret = 0;
183 register unsigned char *end_map;
184 int byte_bit;
185
186 bits = 0;
187 end_map = (char*)(map + size);
188 bytep = (char*)(map + (start_bit & ~0x7));
189 byte_bit = start_bit & 0x7;
190
191 /*
192 * If the caller fell off the end of the map, return 0.
193 */
194 if (bytep >= end_map) {
195 return (0);
196 }
197
198 /*
199 * If start_bit is not byte aligned, then process the
200 * first byte separately.
201 */
202 if (byte_bit != 0) {
203 /*
204 * Shift off the bits we don't want to look at,
205 * before indexing into xfs_countbit.
206 */
207 bits += xfs_countbit[(*bytep >> byte_bit)];
208 bytep++;
209 }
210 141
211 /* 142 for (i = 0; i < size; i++) {
212 * Count the bits in each byte until the end of the bitmap. 143 ret |= map[i];
213 */
214 while (bytep < end_map) {
215 bits += xfs_countbit[*bytep];
216 bytep++;
217 } 144 }
218 145
219 return (bits); 146 return (ret == 0);
220} 147}
221 148
222/* 149/*
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 0bbe568175..082641a978 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -55,8 +55,8 @@ extern int xfs_lowbit64(__uint64_t v);
55/* Get high bit set out of 64-bit argument, -1 if none set */ 55/* Get high bit set out of 64-bit argument, -1 if none set */
56extern int xfs_highbit64(__uint64_t); 56extern int xfs_highbit64(__uint64_t);
57 57
58/* Count set bits in map starting with start_bit */ 58/* Return whether bitmap is empty (1 == empty) */
59extern int xfs_count_bits(uint *map, uint size, uint start_bit); 59extern int xfs_bitmap_empty(uint *map, uint size);
60 60
61/* Count continuous one bits in map starting with start_bit */ 61/* Count continuous one bits in map starting with start_bit */
62extern int xfs_contig_bits(uint *map, uint size, uint start_bit); 62extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index b1ea26e40a..94b5c5fe26 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -52,6 +52,7 @@
52#include "xfs_quota.h" 52#include "xfs_quota.h"
53#include "xfs_trans_space.h" 53#include "xfs_trans_space.h"
54#include "xfs_buf_item.h" 54#include "xfs_buf_item.h"
55#include "xfs_filestream.h"
55 56
56 57
57#ifdef DEBUG 58#ifdef DEBUG
@@ -277,7 +278,7 @@ xfs_bmap_isaeof(
277STATIC void 278STATIC void
278xfs_bmap_trace_addentry( 279xfs_bmap_trace_addentry(
279 int opcode, /* operation */ 280 int opcode, /* operation */
280 char *fname, /* function name */ 281 const char *fname, /* function name */
281 char *desc, /* operation description */ 282 char *desc, /* operation description */
282 xfs_inode_t *ip, /* incore inode pointer */ 283 xfs_inode_t *ip, /* incore inode pointer */
283 xfs_extnum_t idx, /* index of entry(ies) */ 284 xfs_extnum_t idx, /* index of entry(ies) */
@@ -291,7 +292,7 @@ xfs_bmap_trace_addentry(
291 */ 292 */
292STATIC void 293STATIC void
293xfs_bmap_trace_delete( 294xfs_bmap_trace_delete(
294 char *fname, /* function name */ 295 const char *fname, /* function name */
295 char *desc, /* operation description */ 296 char *desc, /* operation description */
296 xfs_inode_t *ip, /* incore inode pointer */ 297 xfs_inode_t *ip, /* incore inode pointer */
297 xfs_extnum_t idx, /* index of entry(entries) deleted */ 298 xfs_extnum_t idx, /* index of entry(entries) deleted */
@@ -304,7 +305,7 @@ xfs_bmap_trace_delete(
304 */ 305 */
305STATIC void 306STATIC void
306xfs_bmap_trace_insert( 307xfs_bmap_trace_insert(
307 char *fname, /* function name */ 308 const char *fname, /* function name */
308 char *desc, /* operation description */ 309 char *desc, /* operation description */
309 xfs_inode_t *ip, /* incore inode pointer */ 310 xfs_inode_t *ip, /* incore inode pointer */
310 xfs_extnum_t idx, /* index of entry(entries) inserted */ 311 xfs_extnum_t idx, /* index of entry(entries) inserted */
@@ -318,7 +319,7 @@ xfs_bmap_trace_insert(
318 */ 319 */
319STATIC void 320STATIC void
320xfs_bmap_trace_post_update( 321xfs_bmap_trace_post_update(
321 char *fname, /* function name */ 322 const char *fname, /* function name */
322 char *desc, /* operation description */ 323 char *desc, /* operation description */
323 xfs_inode_t *ip, /* incore inode pointer */ 324 xfs_inode_t *ip, /* incore inode pointer */
324 xfs_extnum_t idx, /* index of entry updated */ 325 xfs_extnum_t idx, /* index of entry updated */
@@ -329,17 +330,25 @@ xfs_bmap_trace_post_update(
329 */ 330 */
330STATIC void 331STATIC void
331xfs_bmap_trace_pre_update( 332xfs_bmap_trace_pre_update(
332 char *fname, /* function name */ 333 const char *fname, /* function name */
333 char *desc, /* operation description */ 334 char *desc, /* operation description */
334 xfs_inode_t *ip, /* incore inode pointer */ 335 xfs_inode_t *ip, /* incore inode pointer */
335 xfs_extnum_t idx, /* index of entry to be updated */ 336 xfs_extnum_t idx, /* index of entry to be updated */
336 int whichfork); /* data or attr fork */ 337 int whichfork); /* data or attr fork */
337 338
339#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \
340 xfs_bmap_trace_delete(__FUNCTION__,d,ip,i,c,w)
341#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \
342 xfs_bmap_trace_insert(__FUNCTION__,d,ip,i,c,r1,r2,w)
343#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \
344 xfs_bmap_trace_post_update(__FUNCTION__,d,ip,i,w)
345#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \
346 xfs_bmap_trace_pre_update(__FUNCTION__,d,ip,i,w)
338#else 347#else
339#define xfs_bmap_trace_delete(f,d,ip,i,c,w) 348#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)
340#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w) 349#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)
341#define xfs_bmap_trace_post_update(f,d,ip,i,w) 350#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)
342#define xfs_bmap_trace_pre_update(f,d,ip,i,w) 351#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)
343#endif /* XFS_BMAP_TRACE */ 352#endif /* XFS_BMAP_TRACE */
344 353
345/* 354/*
@@ -531,9 +540,6 @@ xfs_bmap_add_extent(
531 xfs_filblks_t da_new; /* new count del alloc blocks used */ 540 xfs_filblks_t da_new; /* new count del alloc blocks used */
532 xfs_filblks_t da_old; /* old count del alloc blocks used */ 541 xfs_filblks_t da_old; /* old count del alloc blocks used */
533 int error; /* error return value */ 542 int error; /* error return value */
534#ifdef XFS_BMAP_TRACE
535 static char fname[] = "xfs_bmap_add_extent";
536#endif
537 xfs_ifork_t *ifp; /* inode fork ptr */ 543 xfs_ifork_t *ifp; /* inode fork ptr */
538 int logflags; /* returned value */ 544 int logflags; /* returned value */
539 xfs_extnum_t nextents; /* number of extents in file now */ 545 xfs_extnum_t nextents; /* number of extents in file now */
@@ -551,8 +557,8 @@ xfs_bmap_add_extent(
551 * already extents in the list. 557 * already extents in the list.
552 */ 558 */
553 if (nextents == 0) { 559 if (nextents == 0) {
554 xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new, 560 XFS_BMAP_TRACE_INSERT("insert empty", ip, 0, 1, new, NULL,
555 NULL, whichfork); 561 whichfork);
556 xfs_iext_insert(ifp, 0, 1, new); 562 xfs_iext_insert(ifp, 0, 1, new);
557 ASSERT(cur == NULL); 563 ASSERT(cur == NULL);
558 ifp->if_lastex = 0; 564 ifp->if_lastex = 0;
@@ -710,9 +716,6 @@ xfs_bmap_add_extent_delay_real(
710 int diff; /* temp value */ 716 int diff; /* temp value */
711 xfs_bmbt_rec_t *ep; /* extent entry for idx */ 717 xfs_bmbt_rec_t *ep; /* extent entry for idx */
712 int error; /* error return value */ 718 int error; /* error return value */
713#ifdef XFS_BMAP_TRACE
714 static char fname[] = "xfs_bmap_add_extent_delay_real";
715#endif
716 int i; /* temp state */ 719 int i; /* temp state */
717 xfs_ifork_t *ifp; /* inode fork pointer */ 720 xfs_ifork_t *ifp; /* inode fork pointer */
718 xfs_fileoff_t new_endoff; /* end offset of new entry */ 721 xfs_fileoff_t new_endoff; /* end offset of new entry */
@@ -808,15 +811,14 @@ xfs_bmap_add_extent_delay_real(
808 * Filling in all of a previously delayed allocation extent. 811 * Filling in all of a previously delayed allocation extent.
809 * The left and right neighbors are both contiguous with new. 812 * The left and right neighbors are both contiguous with new.
810 */ 813 */
811 xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, 814 XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1,
812 XFS_DATA_FORK); 815 XFS_DATA_FORK);
813 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 816 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
814 LEFT.br_blockcount + PREV.br_blockcount + 817 LEFT.br_blockcount + PREV.br_blockcount +
815 RIGHT.br_blockcount); 818 RIGHT.br_blockcount);
816 xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, 819 XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1,
817 XFS_DATA_FORK);
818 xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
819 XFS_DATA_FORK); 820 XFS_DATA_FORK);
821 XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK);
820 xfs_iext_remove(ifp, idx, 2); 822 xfs_iext_remove(ifp, idx, 2);
821 ip->i_df.if_lastex = idx - 1; 823 ip->i_df.if_lastex = idx - 1;
822 ip->i_d.di_nextents--; 824 ip->i_d.di_nextents--;
@@ -855,15 +857,14 @@ xfs_bmap_add_extent_delay_real(
855 * Filling in all of a previously delayed allocation extent. 857 * Filling in all of a previously delayed allocation extent.
856 * The left neighbor is contiguous, the right is not. 858 * The left neighbor is contiguous, the right is not.
857 */ 859 */
858 xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, 860 XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1,
859 XFS_DATA_FORK); 861 XFS_DATA_FORK);
860 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 862 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
861 LEFT.br_blockcount + PREV.br_blockcount); 863 LEFT.br_blockcount + PREV.br_blockcount);
862 xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, 864 XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1,
863 XFS_DATA_FORK); 865 XFS_DATA_FORK);
864 ip->i_df.if_lastex = idx - 1; 866 ip->i_df.if_lastex = idx - 1;
865 xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, 867 XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK);
866 XFS_DATA_FORK);
867 xfs_iext_remove(ifp, idx, 1); 868 xfs_iext_remove(ifp, idx, 1);
868 if (cur == NULL) 869 if (cur == NULL)
869 rval = XFS_ILOG_DEXT; 870 rval = XFS_ILOG_DEXT;
@@ -892,16 +893,13 @@ xfs_bmap_add_extent_delay_real(
892 * Filling in all of a previously delayed allocation extent. 893 * Filling in all of a previously delayed allocation extent.
893 * The right neighbor is contiguous, the left is not. 894 * The right neighbor is contiguous, the left is not.
894 */ 895 */
895 xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, 896 XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK);
896 XFS_DATA_FORK);
897 xfs_bmbt_set_startblock(ep, new->br_startblock); 897 xfs_bmbt_set_startblock(ep, new->br_startblock);
898 xfs_bmbt_set_blockcount(ep, 898 xfs_bmbt_set_blockcount(ep,
899 PREV.br_blockcount + RIGHT.br_blockcount); 899 PREV.br_blockcount + RIGHT.br_blockcount);
900 xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, 900 XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK);
901 XFS_DATA_FORK);
902 ip->i_df.if_lastex = idx; 901 ip->i_df.if_lastex = idx;
903 xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, 902 XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK);
904 XFS_DATA_FORK);
905 xfs_iext_remove(ifp, idx + 1, 1); 903 xfs_iext_remove(ifp, idx + 1, 1);
906 if (cur == NULL) 904 if (cur == NULL)
907 rval = XFS_ILOG_DEXT; 905 rval = XFS_ILOG_DEXT;
@@ -931,11 +929,9 @@ xfs_bmap_add_extent_delay_real(
931 * Neither the left nor right neighbors are contiguous with 929 * Neither the left nor right neighbors are contiguous with
932 * the new one. 930 * the new one.
933 */ 931 */
934 xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, 932 XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK);
935 XFS_DATA_FORK);
936 xfs_bmbt_set_startblock(ep, new->br_startblock); 933 xfs_bmbt_set_startblock(ep, new->br_startblock);
937 xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, 934 XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK);
938 XFS_DATA_FORK);
939 ip->i_df.if_lastex = idx; 935 ip->i_df.if_lastex = idx;
940 ip->i_d.di_nextents++; 936 ip->i_d.di_nextents++;
941 if (cur == NULL) 937 if (cur == NULL)
@@ -963,17 +959,14 @@ xfs_bmap_add_extent_delay_real(
963 * Filling in the first part of a previous delayed allocation. 959 * Filling in the first part of a previous delayed allocation.
964 * The left neighbor is contiguous. 960 * The left neighbor is contiguous.
965 */ 961 */
966 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, 962 XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK);
967 XFS_DATA_FORK);
968 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 963 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
969 LEFT.br_blockcount + new->br_blockcount); 964 LEFT.br_blockcount + new->br_blockcount);
970 xfs_bmbt_set_startoff(ep, 965 xfs_bmbt_set_startoff(ep,
971 PREV.br_startoff + new->br_blockcount); 966 PREV.br_startoff + new->br_blockcount);
972 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, 967 XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK);
973 XFS_DATA_FORK);
974 temp = PREV.br_blockcount - new->br_blockcount; 968 temp = PREV.br_blockcount - new->br_blockcount;
975 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, 969 XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
976 XFS_DATA_FORK);
977 xfs_bmbt_set_blockcount(ep, temp); 970 xfs_bmbt_set_blockcount(ep, temp);
978 ip->i_df.if_lastex = idx - 1; 971 ip->i_df.if_lastex = idx - 1;
979 if (cur == NULL) 972 if (cur == NULL)
@@ -995,8 +988,7 @@ xfs_bmap_add_extent_delay_real(
995 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 988 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
996 STARTBLOCKVAL(PREV.br_startblock)); 989 STARTBLOCKVAL(PREV.br_startblock));
997 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 990 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
998 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, 991 XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
999 XFS_DATA_FORK);
1000 *dnew = temp; 992 *dnew = temp;
1001 /* DELTA: The boundary between two in-core extents moved. */ 993 /* DELTA: The boundary between two in-core extents moved. */
1002 temp = LEFT.br_startoff; 994 temp = LEFT.br_startoff;
@@ -1009,11 +1001,11 @@ xfs_bmap_add_extent_delay_real(
1009 * Filling in the first part of a previous delayed allocation. 1001 * Filling in the first part of a previous delayed allocation.
1010 * The left neighbor is not contiguous. 1002 * The left neighbor is not contiguous.
1011 */ 1003 */
1012 xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); 1004 XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK);
1013 xfs_bmbt_set_startoff(ep, new_endoff); 1005 xfs_bmbt_set_startoff(ep, new_endoff);
1014 temp = PREV.br_blockcount - new->br_blockcount; 1006 temp = PREV.br_blockcount - new->br_blockcount;
1015 xfs_bmbt_set_blockcount(ep, temp); 1007 xfs_bmbt_set_blockcount(ep, temp);
1016 xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, 1008 XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL,
1017 XFS_DATA_FORK); 1009 XFS_DATA_FORK);
1018 xfs_iext_insert(ifp, idx, 1, new); 1010 xfs_iext_insert(ifp, idx, 1, new);
1019 ip->i_df.if_lastex = idx; 1011 ip->i_df.if_lastex = idx;
@@ -1046,8 +1038,7 @@ xfs_bmap_add_extent_delay_real(
1046 (cur ? cur->bc_private.b.allocated : 0)); 1038 (cur ? cur->bc_private.b.allocated : 0));
1047 ep = xfs_iext_get_ext(ifp, idx + 1); 1039 ep = xfs_iext_get_ext(ifp, idx + 1);
1048 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1040 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1049 xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1, 1041 XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK);
1050 XFS_DATA_FORK);
1051 *dnew = temp; 1042 *dnew = temp;
1052 /* DELTA: One in-core extent is split in two. */ 1043 /* DELTA: One in-core extent is split in two. */
1053 temp = PREV.br_startoff; 1044 temp = PREV.br_startoff;
@@ -1060,17 +1051,14 @@ xfs_bmap_add_extent_delay_real(
1060 * The right neighbor is contiguous with the new allocation. 1051 * The right neighbor is contiguous with the new allocation.
1061 */ 1052 */
1062 temp = PREV.br_blockcount - new->br_blockcount; 1053 temp = PREV.br_blockcount - new->br_blockcount;
1063 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, 1054 XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
1064 XFS_DATA_FORK); 1055 XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK);
1065 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
1066 XFS_DATA_FORK);
1067 xfs_bmbt_set_blockcount(ep, temp); 1056 xfs_bmbt_set_blockcount(ep, temp);
1068 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 1057 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
1069 new->br_startoff, new->br_startblock, 1058 new->br_startoff, new->br_startblock,
1070 new->br_blockcount + RIGHT.br_blockcount, 1059 new->br_blockcount + RIGHT.br_blockcount,
1071 RIGHT.br_state); 1060 RIGHT.br_state);
1072 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, 1061 XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK);
1073 XFS_DATA_FORK);
1074 ip->i_df.if_lastex = idx + 1; 1062 ip->i_df.if_lastex = idx + 1;
1075 if (cur == NULL) 1063 if (cur == NULL)
1076 rval = XFS_ILOG_DEXT; 1064 rval = XFS_ILOG_DEXT;
@@ -1091,8 +1079,7 @@ xfs_bmap_add_extent_delay_real(
1091 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1079 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1092 STARTBLOCKVAL(PREV.br_startblock)); 1080 STARTBLOCKVAL(PREV.br_startblock));
1093 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1081 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1094 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, 1082 XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
1095 XFS_DATA_FORK);
1096 *dnew = temp; 1083 *dnew = temp;
1097 /* DELTA: The boundary between two in-core extents moved. */ 1084 /* DELTA: The boundary between two in-core extents moved. */
1098 temp = PREV.br_startoff; 1085 temp = PREV.br_startoff;
@@ -1106,10 +1093,10 @@ xfs_bmap_add_extent_delay_real(
1106 * The right neighbor is not contiguous. 1093 * The right neighbor is not contiguous.
1107 */ 1094 */
1108 temp = PREV.br_blockcount - new->br_blockcount; 1095 temp = PREV.br_blockcount - new->br_blockcount;
1109 xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); 1096 XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK);
1110 xfs_bmbt_set_blockcount(ep, temp); 1097 xfs_bmbt_set_blockcount(ep, temp);
1111 xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, 1098 XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL,
1112 new, NULL, XFS_DATA_FORK); 1099 XFS_DATA_FORK);
1113 xfs_iext_insert(ifp, idx + 1, 1, new); 1100 xfs_iext_insert(ifp, idx + 1, 1, new);
1114 ip->i_df.if_lastex = idx + 1; 1101 ip->i_df.if_lastex = idx + 1;
1115 ip->i_d.di_nextents++; 1102 ip->i_d.di_nextents++;
@@ -1141,7 +1128,7 @@ xfs_bmap_add_extent_delay_real(
1141 (cur ? cur->bc_private.b.allocated : 0)); 1128 (cur ? cur->bc_private.b.allocated : 0));
1142 ep = xfs_iext_get_ext(ifp, idx); 1129 ep = xfs_iext_get_ext(ifp, idx);
1143 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1130 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1144 xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); 1131 XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
1145 *dnew = temp; 1132 *dnew = temp;
1146 /* DELTA: One in-core extent is split in two. */ 1133 /* DELTA: One in-core extent is split in two. */
1147 temp = PREV.br_startoff; 1134 temp = PREV.br_startoff;
@@ -1155,7 +1142,7 @@ xfs_bmap_add_extent_delay_real(
1155 * This case is avoided almost all the time. 1142 * This case is avoided almost all the time.
1156 */ 1143 */
1157 temp = new->br_startoff - PREV.br_startoff; 1144 temp = new->br_startoff - PREV.br_startoff;
1158 xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); 1145 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK);
1159 xfs_bmbt_set_blockcount(ep, temp); 1146 xfs_bmbt_set_blockcount(ep, temp);
1160 r[0] = *new; 1147 r[0] = *new;
1161 r[1].br_state = PREV.br_state; 1148 r[1].br_state = PREV.br_state;
@@ -1163,7 +1150,7 @@ xfs_bmap_add_extent_delay_real(
1163 r[1].br_startoff = new_endoff; 1150 r[1].br_startoff = new_endoff;
1164 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1151 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1165 r[1].br_blockcount = temp2; 1152 r[1].br_blockcount = temp2;
1166 xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], 1153 XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1],
1167 XFS_DATA_FORK); 1154 XFS_DATA_FORK);
1168 xfs_iext_insert(ifp, idx + 1, 2, &r[0]); 1155 xfs_iext_insert(ifp, idx + 1, 2, &r[0]);
1169 ip->i_df.if_lastex = idx + 1; 1156 ip->i_df.if_lastex = idx + 1;
@@ -1222,13 +1209,11 @@ xfs_bmap_add_extent_delay_real(
1222 } 1209 }
1223 ep = xfs_iext_get_ext(ifp, idx); 1210 ep = xfs_iext_get_ext(ifp, idx);
1224 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1211 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1225 xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); 1212 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
1226 xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2, 1213 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
1227 XFS_DATA_FORK);
1228 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), 1214 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
1229 NULLSTARTBLOCK((int)temp2)); 1215 NULLSTARTBLOCK((int)temp2));
1230 xfs_bmap_trace_post_update(fname, "0", ip, idx + 2, 1216 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
1231 XFS_DATA_FORK);
1232 *dnew = temp + temp2; 1217 *dnew = temp + temp2;
1233 /* DELTA: One in-core extent is split in three. */ 1218 /* DELTA: One in-core extent is split in three. */
1234 temp = PREV.br_startoff; 1219 temp = PREV.br_startoff;
@@ -1287,9 +1272,6 @@ xfs_bmap_add_extent_unwritten_real(
1287 xfs_btree_cur_t *cur; /* btree cursor */ 1272 xfs_btree_cur_t *cur; /* btree cursor */
1288 xfs_bmbt_rec_t *ep; /* extent entry for idx */ 1273 xfs_bmbt_rec_t *ep; /* extent entry for idx */
1289 int error; /* error return value */ 1274 int error; /* error return value */
1290#ifdef XFS_BMAP_TRACE
1291 static char fname[] = "xfs_bmap_add_extent_unwritten_real";
1292#endif
1293 int i; /* temp state */ 1275 int i; /* temp state */
1294 xfs_ifork_t *ifp; /* inode fork pointer */ 1276 xfs_ifork_t *ifp; /* inode fork pointer */
1295 xfs_fileoff_t new_endoff; /* end offset of new entry */ 1277 xfs_fileoff_t new_endoff; /* end offset of new entry */
@@ -1390,15 +1372,14 @@ xfs_bmap_add_extent_unwritten_real(
1390 * Setting all of a previous oldext extent to newext. 1372 * Setting all of a previous oldext extent to newext.
1391 * The left and right neighbors are both contiguous with new. 1373 * The left and right neighbors are both contiguous with new.
1392 */ 1374 */
1393 xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, 1375 XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1,
1394 XFS_DATA_FORK); 1376 XFS_DATA_FORK);
1395 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1377 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
1396 LEFT.br_blockcount + PREV.br_blockcount + 1378 LEFT.br_blockcount + PREV.br_blockcount +
1397 RIGHT.br_blockcount); 1379 RIGHT.br_blockcount);
1398 xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, 1380 XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1,
1399 XFS_DATA_FORK);
1400 xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
1401 XFS_DATA_FORK); 1381 XFS_DATA_FORK);
1382 XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK);
1402 xfs_iext_remove(ifp, idx, 2); 1383 xfs_iext_remove(ifp, idx, 2);
1403 ip->i_df.if_lastex = idx - 1; 1384 ip->i_df.if_lastex = idx - 1;
1404 ip->i_d.di_nextents -= 2; 1385 ip->i_d.di_nextents -= 2;
@@ -1441,15 +1422,14 @@ xfs_bmap_add_extent_unwritten_real(
1441 * Setting all of a previous oldext extent to newext. 1422 * Setting all of a previous oldext extent to newext.
1442 * The left neighbor is contiguous, the right is not. 1423 * The left neighbor is contiguous, the right is not.
1443 */ 1424 */
1444 xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, 1425 XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1,
1445 XFS_DATA_FORK); 1426 XFS_DATA_FORK);
1446 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1427 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
1447 LEFT.br_blockcount + PREV.br_blockcount); 1428 LEFT.br_blockcount + PREV.br_blockcount);
1448 xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, 1429 XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1,
1449 XFS_DATA_FORK); 1430 XFS_DATA_FORK);
1450 ip->i_df.if_lastex = idx - 1; 1431 ip->i_df.if_lastex = idx - 1;
1451 xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, 1432 XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK);
1452 XFS_DATA_FORK);
1453 xfs_iext_remove(ifp, idx, 1); 1433 xfs_iext_remove(ifp, idx, 1);
1454 ip->i_d.di_nextents--; 1434 ip->i_d.di_nextents--;
1455 if (cur == NULL) 1435 if (cur == NULL)
@@ -1484,16 +1464,15 @@ xfs_bmap_add_extent_unwritten_real(
1484 * Setting all of a previous oldext extent to newext. 1464 * Setting all of a previous oldext extent to newext.
1485 * The right neighbor is contiguous, the left is not. 1465 * The right neighbor is contiguous, the left is not.
1486 */ 1466 */
1487 xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, 1467 XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx,
1488 XFS_DATA_FORK); 1468 XFS_DATA_FORK);
1489 xfs_bmbt_set_blockcount(ep, 1469 xfs_bmbt_set_blockcount(ep,
1490 PREV.br_blockcount + RIGHT.br_blockcount); 1470 PREV.br_blockcount + RIGHT.br_blockcount);
1491 xfs_bmbt_set_state(ep, newext); 1471 xfs_bmbt_set_state(ep, newext);
1492 xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, 1472 XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx,
1493 XFS_DATA_FORK); 1473 XFS_DATA_FORK);
1494 ip->i_df.if_lastex = idx; 1474 ip->i_df.if_lastex = idx;
1495 xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, 1475 XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK);
1496 XFS_DATA_FORK);
1497 xfs_iext_remove(ifp, idx + 1, 1); 1476 xfs_iext_remove(ifp, idx + 1, 1);
1498 ip->i_d.di_nextents--; 1477 ip->i_d.di_nextents--;
1499 if (cur == NULL) 1478 if (cur == NULL)
@@ -1529,10 +1508,10 @@ xfs_bmap_add_extent_unwritten_real(
1529 * Neither the left nor right neighbors are contiguous with 1508 * Neither the left nor right neighbors are contiguous with
1530 * the new one. 1509 * the new one.
1531 */ 1510 */
1532 xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, 1511 XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx,
1533 XFS_DATA_FORK); 1512 XFS_DATA_FORK);
1534 xfs_bmbt_set_state(ep, newext); 1513 xfs_bmbt_set_state(ep, newext);
1535 xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, 1514 XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx,
1536 XFS_DATA_FORK); 1515 XFS_DATA_FORK);
1537 ip->i_df.if_lastex = idx; 1516 ip->i_df.if_lastex = idx;
1538 if (cur == NULL) 1517 if (cur == NULL)
@@ -1559,21 +1538,21 @@ xfs_bmap_add_extent_unwritten_real(
1559 * Setting the first part of a previous oldext extent to newext. 1538 * Setting the first part of a previous oldext extent to newext.
1560 * The left neighbor is contiguous. 1539 * The left neighbor is contiguous.
1561 */ 1540 */
1562 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, 1541 XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1,
1563 XFS_DATA_FORK); 1542 XFS_DATA_FORK);
1564 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1543 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
1565 LEFT.br_blockcount + new->br_blockcount); 1544 LEFT.br_blockcount + new->br_blockcount);
1566 xfs_bmbt_set_startoff(ep, 1545 xfs_bmbt_set_startoff(ep,
1567 PREV.br_startoff + new->br_blockcount); 1546 PREV.br_startoff + new->br_blockcount);
1568 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, 1547 XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1,
1569 XFS_DATA_FORK); 1548 XFS_DATA_FORK);
1570 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, 1549 XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx,
1571 XFS_DATA_FORK); 1550 XFS_DATA_FORK);
1572 xfs_bmbt_set_startblock(ep, 1551 xfs_bmbt_set_startblock(ep,
1573 new->br_startblock + new->br_blockcount); 1552 new->br_startblock + new->br_blockcount);
1574 xfs_bmbt_set_blockcount(ep, 1553 xfs_bmbt_set_blockcount(ep,
1575 PREV.br_blockcount - new->br_blockcount); 1554 PREV.br_blockcount - new->br_blockcount);
1576 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, 1555 XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx,
1577 XFS_DATA_FORK); 1556 XFS_DATA_FORK);
1578 ip->i_df.if_lastex = idx - 1; 1557 ip->i_df.if_lastex = idx - 1;
1579 if (cur == NULL) 1558 if (cur == NULL)
@@ -1610,15 +1589,15 @@ xfs_bmap_add_extent_unwritten_real(
1610 * Setting the first part of a previous oldext extent to newext. 1589 * Setting the first part of a previous oldext extent to newext.
1611 * The left neighbor is not contiguous. 1590 * The left neighbor is not contiguous.
1612 */ 1591 */
1613 xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); 1592 XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK);
1614 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); 1593 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
1615 xfs_bmbt_set_startoff(ep, new_endoff); 1594 xfs_bmbt_set_startoff(ep, new_endoff);
1616 xfs_bmbt_set_blockcount(ep, 1595 xfs_bmbt_set_blockcount(ep,
1617 PREV.br_blockcount - new->br_blockcount); 1596 PREV.br_blockcount - new->br_blockcount);
1618 xfs_bmbt_set_startblock(ep, 1597 xfs_bmbt_set_startblock(ep,
1619 new->br_startblock + new->br_blockcount); 1598 new->br_startblock + new->br_blockcount);
1620 xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK); 1599 XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx, XFS_DATA_FORK);
1621 xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, 1600 XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL,
1622 XFS_DATA_FORK); 1601 XFS_DATA_FORK);
1623 xfs_iext_insert(ifp, idx, 1, new); 1602 xfs_iext_insert(ifp, idx, 1, new);
1624 ip->i_df.if_lastex = idx; 1603 ip->i_df.if_lastex = idx;
@@ -1653,18 +1632,18 @@ xfs_bmap_add_extent_unwritten_real(
1653 * Setting the last part of a previous oldext extent to newext. 1632 * Setting the last part of a previous oldext extent to newext.
1654 * The right neighbor is contiguous with the new allocation. 1633 * The right neighbor is contiguous with the new allocation.
1655 */ 1634 */
1656 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, 1635 XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx,
1657 XFS_DATA_FORK); 1636 XFS_DATA_FORK);
1658 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, 1637 XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1,
1659 XFS_DATA_FORK); 1638 XFS_DATA_FORK);
1660 xfs_bmbt_set_blockcount(ep, 1639 xfs_bmbt_set_blockcount(ep,
1661 PREV.br_blockcount - new->br_blockcount); 1640 PREV.br_blockcount - new->br_blockcount);
1662 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, 1641 XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx,
1663 XFS_DATA_FORK); 1642 XFS_DATA_FORK);
1664 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 1643 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
1665 new->br_startoff, new->br_startblock, 1644 new->br_startoff, new->br_startblock,
1666 new->br_blockcount + RIGHT.br_blockcount, newext); 1645 new->br_blockcount + RIGHT.br_blockcount, newext);
1667 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, 1646 XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1,
1668 XFS_DATA_FORK); 1647 XFS_DATA_FORK);
1669 ip->i_df.if_lastex = idx + 1; 1648 ip->i_df.if_lastex = idx + 1;
1670 if (cur == NULL) 1649 if (cur == NULL)
@@ -1700,12 +1679,12 @@ xfs_bmap_add_extent_unwritten_real(
1700 * Setting the last part of a previous oldext extent to newext. 1679 * Setting the last part of a previous oldext extent to newext.
1701 * The right neighbor is not contiguous. 1680 * The right neighbor is not contiguous.
1702 */ 1681 */
1703 xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); 1682 XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK);
1704 xfs_bmbt_set_blockcount(ep, 1683 xfs_bmbt_set_blockcount(ep,
1705 PREV.br_blockcount - new->br_blockcount); 1684 PREV.br_blockcount - new->br_blockcount);
1706 xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); 1685 XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
1707 xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, 1686 XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL,
1708 new, NULL, XFS_DATA_FORK); 1687 XFS_DATA_FORK);
1709 xfs_iext_insert(ifp, idx + 1, 1, new); 1688 xfs_iext_insert(ifp, idx + 1, 1, new);
1710 ip->i_df.if_lastex = idx + 1; 1689 ip->i_df.if_lastex = idx + 1;
1711 ip->i_d.di_nextents++; 1690 ip->i_d.di_nextents++;
@@ -1744,17 +1723,17 @@ xfs_bmap_add_extent_unwritten_real(
1744 * newext. Contiguity is impossible here. 1723 * newext. Contiguity is impossible here.
1745 * One extent becomes three extents. 1724 * One extent becomes three extents.
1746 */ 1725 */
1747 xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); 1726 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK);
1748 xfs_bmbt_set_blockcount(ep, 1727 xfs_bmbt_set_blockcount(ep,
1749 new->br_startoff - PREV.br_startoff); 1728 new->br_startoff - PREV.br_startoff);
1750 xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); 1729 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
1751 r[0] = *new; 1730 r[0] = *new;
1752 r[1].br_startoff = new_endoff; 1731 r[1].br_startoff = new_endoff;
1753 r[1].br_blockcount = 1732 r[1].br_blockcount =
1754 PREV.br_startoff + PREV.br_blockcount - new_endoff; 1733 PREV.br_startoff + PREV.br_blockcount - new_endoff;
1755 r[1].br_startblock = new->br_startblock + new->br_blockcount; 1734 r[1].br_startblock = new->br_startblock + new->br_blockcount;
1756 r[1].br_state = oldext; 1735 r[1].br_state = oldext;
1757 xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], 1736 XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1],
1758 XFS_DATA_FORK); 1737 XFS_DATA_FORK);
1759 xfs_iext_insert(ifp, idx + 1, 2, &r[0]); 1738 xfs_iext_insert(ifp, idx + 1, 2, &r[0]);
1760 ip->i_df.if_lastex = idx + 1; 1739 ip->i_df.if_lastex = idx + 1;
@@ -1845,9 +1824,6 @@ xfs_bmap_add_extent_hole_delay(
1845 int rsvd) /* OK to allocate reserved blocks */ 1824 int rsvd) /* OK to allocate reserved blocks */
1846{ 1825{
1847 xfs_bmbt_rec_t *ep; /* extent record for idx */ 1826 xfs_bmbt_rec_t *ep; /* extent record for idx */
1848#ifdef XFS_BMAP_TRACE
1849 static char fname[] = "xfs_bmap_add_extent_hole_delay";
1850#endif
1851 xfs_ifork_t *ifp; /* inode fork pointer */ 1827 xfs_ifork_t *ifp; /* inode fork pointer */
1852 xfs_bmbt_irec_t left; /* left neighbor extent entry */ 1828 xfs_bmbt_irec_t left; /* left neighbor extent entry */
1853 xfs_filblks_t newlen=0; /* new indirect size */ 1829 xfs_filblks_t newlen=0; /* new indirect size */
@@ -1919,7 +1895,7 @@ xfs_bmap_add_extent_hole_delay(
1919 */ 1895 */
1920 temp = left.br_blockcount + new->br_blockcount + 1896 temp = left.br_blockcount + new->br_blockcount +
1921 right.br_blockcount; 1897 right.br_blockcount;
1922 xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, 1898 XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
1923 XFS_DATA_FORK); 1899 XFS_DATA_FORK);
1924 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1900 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
1925 oldlen = STARTBLOCKVAL(left.br_startblock) + 1901 oldlen = STARTBLOCKVAL(left.br_startblock) +
@@ -1928,10 +1904,9 @@ xfs_bmap_add_extent_hole_delay(
1928 newlen = xfs_bmap_worst_indlen(ip, temp); 1904 newlen = xfs_bmap_worst_indlen(ip, temp);
1929 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1905 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
1930 NULLSTARTBLOCK((int)newlen)); 1906 NULLSTARTBLOCK((int)newlen));
1931 xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, 1907 XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
1932 XFS_DATA_FORK);
1933 xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1,
1934 XFS_DATA_FORK); 1908 XFS_DATA_FORK);
1909 XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK);
1935 xfs_iext_remove(ifp, idx, 1); 1910 xfs_iext_remove(ifp, idx, 1);
1936 ip->i_df.if_lastex = idx - 1; 1911 ip->i_df.if_lastex = idx - 1;
1937 /* DELTA: Two in-core extents were replaced by one. */ 1912 /* DELTA: Two in-core extents were replaced by one. */
@@ -1946,7 +1921,7 @@ xfs_bmap_add_extent_hole_delay(
1946 * Merge the new allocation with the left neighbor. 1921 * Merge the new allocation with the left neighbor.
1947 */ 1922 */
1948 temp = left.br_blockcount + new->br_blockcount; 1923 temp = left.br_blockcount + new->br_blockcount;
1949 xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, 1924 XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1,
1950 XFS_DATA_FORK); 1925 XFS_DATA_FORK);
1951 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1926 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
1952 oldlen = STARTBLOCKVAL(left.br_startblock) + 1927 oldlen = STARTBLOCKVAL(left.br_startblock) +
@@ -1954,7 +1929,7 @@ xfs_bmap_add_extent_hole_delay(
1954 newlen = xfs_bmap_worst_indlen(ip, temp); 1929 newlen = xfs_bmap_worst_indlen(ip, temp);
1955 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1930 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
1956 NULLSTARTBLOCK((int)newlen)); 1931 NULLSTARTBLOCK((int)newlen));
1957 xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, 1932 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1,
1958 XFS_DATA_FORK); 1933 XFS_DATA_FORK);
1959 ip->i_df.if_lastex = idx - 1; 1934 ip->i_df.if_lastex = idx - 1;
1960 /* DELTA: One in-core extent grew into a hole. */ 1935 /* DELTA: One in-core extent grew into a hole. */
@@ -1968,14 +1943,14 @@ xfs_bmap_add_extent_hole_delay(
1968 * on the right. 1943 * on the right.
1969 * Merge the new allocation with the right neighbor. 1944 * Merge the new allocation with the right neighbor.
1970 */ 1945 */
1971 xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK); 1946 XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK);
1972 temp = new->br_blockcount + right.br_blockcount; 1947 temp = new->br_blockcount + right.br_blockcount;
1973 oldlen = STARTBLOCKVAL(new->br_startblock) + 1948 oldlen = STARTBLOCKVAL(new->br_startblock) +
1974 STARTBLOCKVAL(right.br_startblock); 1949 STARTBLOCKVAL(right.br_startblock);
1975 newlen = xfs_bmap_worst_indlen(ip, temp); 1950 newlen = xfs_bmap_worst_indlen(ip, temp);
1976 xfs_bmbt_set_allf(ep, new->br_startoff, 1951 xfs_bmbt_set_allf(ep, new->br_startoff,
1977 NULLSTARTBLOCK((int)newlen), temp, right.br_state); 1952 NULLSTARTBLOCK((int)newlen), temp, right.br_state);
1978 xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK); 1953 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK);
1979 ip->i_df.if_lastex = idx; 1954 ip->i_df.if_lastex = idx;
1980 /* DELTA: One in-core extent grew into a hole. */ 1955 /* DELTA: One in-core extent grew into a hole. */
1981 temp2 = temp; 1956 temp2 = temp;
@@ -1989,7 +1964,7 @@ xfs_bmap_add_extent_hole_delay(
1989 * Insert a new entry. 1964 * Insert a new entry.
1990 */ 1965 */
1991 oldlen = newlen = 0; 1966 oldlen = newlen = 0;
1992 xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, 1967 XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL,
1993 XFS_DATA_FORK); 1968 XFS_DATA_FORK);
1994 xfs_iext_insert(ifp, idx, 1, new); 1969 xfs_iext_insert(ifp, idx, 1, new);
1995 ip->i_df.if_lastex = idx; 1970 ip->i_df.if_lastex = idx;
@@ -2039,9 +2014,6 @@ xfs_bmap_add_extent_hole_real(
2039{ 2014{
2040 xfs_bmbt_rec_t *ep; /* pointer to extent entry ins. point */ 2015 xfs_bmbt_rec_t *ep; /* pointer to extent entry ins. point */
2041 int error; /* error return value */ 2016 int error; /* error return value */
2042#ifdef XFS_BMAP_TRACE
2043 static char fname[] = "xfs_bmap_add_extent_hole_real";
2044#endif
2045 int i; /* temp state */ 2017 int i; /* temp state */
2046 xfs_ifork_t *ifp; /* inode fork pointer */ 2018 xfs_ifork_t *ifp; /* inode fork pointer */
2047 xfs_bmbt_irec_t left; /* left neighbor extent entry */ 2019 xfs_bmbt_irec_t left; /* left neighbor extent entry */
@@ -2118,15 +2090,14 @@ xfs_bmap_add_extent_hole_real(
2118 * left and on the right. 2090 * left and on the right.
2119 * Merge all three into a single extent record. 2091 * Merge all three into a single extent record.
2120 */ 2092 */
2121 xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, 2093 XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
2122 whichfork); 2094 whichfork);
2123 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 2095 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
2124 left.br_blockcount + new->br_blockcount + 2096 left.br_blockcount + new->br_blockcount +
2125 right.br_blockcount); 2097 right.br_blockcount);
2126 xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, 2098 XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
2127 whichfork); 2099 whichfork);
2128 xfs_bmap_trace_delete(fname, "LC|RC", ip, 2100 XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, whichfork);
2129 idx, 1, whichfork);
2130 xfs_iext_remove(ifp, idx, 1); 2101 xfs_iext_remove(ifp, idx, 1);
2131 ifp->if_lastex = idx - 1; 2102 ifp->if_lastex = idx - 1;
2132 XFS_IFORK_NEXT_SET(ip, whichfork, 2103 XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -2168,10 +2139,10 @@ xfs_bmap_add_extent_hole_real(
2168 * on the left. 2139 * on the left.
2169 * Merge the new allocation with the left neighbor. 2140 * Merge the new allocation with the left neighbor.
2170 */ 2141 */
2171 xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork); 2142 XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, whichfork);
2172 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 2143 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
2173 left.br_blockcount + new->br_blockcount); 2144 left.br_blockcount + new->br_blockcount);
2174 xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork); 2145 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork);
2175 ifp->if_lastex = idx - 1; 2146 ifp->if_lastex = idx - 1;
2176 if (cur == NULL) { 2147 if (cur == NULL) {
2177 rval = XFS_ILOG_FEXT(whichfork); 2148 rval = XFS_ILOG_FEXT(whichfork);
@@ -2202,11 +2173,11 @@ xfs_bmap_add_extent_hole_real(
2202 * on the right. 2173 * on the right.
2203 * Merge the new allocation with the right neighbor. 2174 * Merge the new allocation with the right neighbor.
2204 */ 2175 */
2205 xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork); 2176 XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, whichfork);
2206 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, 2177 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
2207 new->br_blockcount + right.br_blockcount, 2178 new->br_blockcount + right.br_blockcount,
2208 right.br_state); 2179 right.br_state);
2209 xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork); 2180 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork);
2210 ifp->if_lastex = idx; 2181 ifp->if_lastex = idx;
2211 if (cur == NULL) { 2182 if (cur == NULL) {
2212 rval = XFS_ILOG_FEXT(whichfork); 2183 rval = XFS_ILOG_FEXT(whichfork);
@@ -2237,8 +2208,7 @@ xfs_bmap_add_extent_hole_real(
2237 * real allocation. 2208 * real allocation.
2238 * Insert a new entry. 2209 * Insert a new entry.
2239 */ 2210 */
2240 xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, 2211 XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, whichfork);
2241 whichfork);
2242 xfs_iext_insert(ifp, idx, 1, new); 2212 xfs_iext_insert(ifp, idx, 1, new);
2243 ifp->if_lastex = idx; 2213 ifp->if_lastex = idx;
2244 XFS_IFORK_NEXT_SET(ip, whichfork, 2214 XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -2605,12 +2575,10 @@ xfs_bmap_rtalloc(
2605 xfs_extlen_t prod = 0; /* product factor for allocators */ 2575 xfs_extlen_t prod = 0; /* product factor for allocators */
2606 xfs_extlen_t ralen = 0; /* realtime allocation length */ 2576 xfs_extlen_t ralen = 0; /* realtime allocation length */
2607 xfs_extlen_t align; /* minimum allocation alignment */ 2577 xfs_extlen_t align; /* minimum allocation alignment */
2608 xfs_rtblock_t rtx; /* realtime extent number */
2609 xfs_rtblock_t rtb; 2578 xfs_rtblock_t rtb;
2610 2579
2611 mp = ap->ip->i_mount; 2580 mp = ap->ip->i_mount;
2612 align = ap->ip->i_d.di_extsize ? 2581 align = xfs_get_extsz_hint(ap->ip);
2613 ap->ip->i_d.di_extsize : mp->m_sb.sb_rextsize;
2614 prod = align / mp->m_sb.sb_rextsize; 2582 prod = align / mp->m_sb.sb_rextsize;
2615 error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, 2583 error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
2616 align, 1, ap->eof, 0, 2584 align, 1, ap->eof, 0,
@@ -2644,6 +2612,8 @@ xfs_bmap_rtalloc(
2644 * pick an extent that will space things out in the rt area. 2612 * pick an extent that will space things out in the rt area.
2645 */ 2613 */
2646 if (ap->eof && ap->off == 0) { 2614 if (ap->eof && ap->off == 0) {
2615 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
2616
2647 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); 2617 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
2648 if (error) 2618 if (error)
2649 return error; 2619 return error;
@@ -2715,9 +2685,7 @@ xfs_bmap_btalloc(
2715 int error; 2685 int error;
2716 2686
2717 mp = ap->ip->i_mount; 2687 mp = ap->ip->i_mount;
2718 align = (ap->userdata && ap->ip->i_d.di_extsize && 2688 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
2719 (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)) ?
2720 ap->ip->i_d.di_extsize : 0;
2721 if (unlikely(align)) { 2689 if (unlikely(align)) {
2722 error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, 2690 error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
2723 align, 0, ap->eof, 0, ap->conv, 2691 align, 0, ap->eof, 0, ap->conv,
@@ -2727,9 +2695,15 @@ xfs_bmap_btalloc(
2727 } 2695 }
2728 nullfb = ap->firstblock == NULLFSBLOCK; 2696 nullfb = ap->firstblock == NULLFSBLOCK;
2729 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2697 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
2730 if (nullfb) 2698 if (nullfb) {
2731 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); 2699 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
2732 else 2700 ag = xfs_filestream_lookup_ag(ap->ip);
2701 ag = (ag != NULLAGNUMBER) ? ag : 0;
2702 ap->rval = XFS_AGB_TO_FSB(mp, ag, 0);
2703 } else {
2704 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
2705 }
2706 } else
2733 ap->rval = ap->firstblock; 2707 ap->rval = ap->firstblock;
2734 2708
2735 xfs_bmap_adjacent(ap); 2709 xfs_bmap_adjacent(ap);
@@ -2753,13 +2727,22 @@ xfs_bmap_btalloc(
2753 args.firstblock = ap->firstblock; 2727 args.firstblock = ap->firstblock;
2754 blen = 0; 2728 blen = 0;
2755 if (nullfb) { 2729 if (nullfb) {
2756 args.type = XFS_ALLOCTYPE_START_BNO; 2730 if (ap->userdata && xfs_inode_is_filestream(ap->ip))
2731 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2732 else
2733 args.type = XFS_ALLOCTYPE_START_BNO;
2757 args.total = ap->total; 2734 args.total = ap->total;
2735
2758 /* 2736 /*
2759 * Find the longest available space. 2737 * Search for an allocation group with a single extent
2760 * We're going to try for the whole allocation at once. 2738 * large enough for the request.
2739 *
2740 * If one isn't found, then adjust the minimum allocation
2741 * size to the largest space found.
2761 */ 2742 */
2762 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); 2743 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
2744 if (startag == NULLAGNUMBER)
2745 startag = ag = 0;
2763 notinit = 0; 2746 notinit = 0;
2764 down_read(&mp->m_peraglock); 2747 down_read(&mp->m_peraglock);
2765 while (blen < ap->alen) { 2748 while (blen < ap->alen) {
@@ -2785,6 +2768,35 @@ xfs_bmap_btalloc(
2785 blen = longest; 2768 blen = longest;
2786 } else 2769 } else
2787 notinit = 1; 2770 notinit = 1;
2771
2772 if (xfs_inode_is_filestream(ap->ip)) {
2773 if (blen >= ap->alen)
2774 break;
2775
2776 if (ap->userdata) {
2777 /*
2778 * If startag is an invalid AG, we've
2779 * come here once before and
2780 * xfs_filestream_new_ag picked the
2781 * best currently available.
2782 *
2783 * Don't continue looping, since we
2784 * could loop forever.
2785 */
2786 if (startag == NULLAGNUMBER)
2787 break;
2788
2789 error = xfs_filestream_new_ag(ap, &ag);
2790 if (error) {
2791 up_read(&mp->m_peraglock);
2792 return error;
2793 }
2794
2795 /* loop again to set 'blen'*/
2796 startag = NULLAGNUMBER;
2797 continue;
2798 }
2799 }
2788 if (++ag == mp->m_sb.sb_agcount) 2800 if (++ag == mp->m_sb.sb_agcount)
2789 ag = 0; 2801 ag = 0;
2790 if (ag == startag) 2802 if (ag == startag)
@@ -2809,17 +2821,27 @@ xfs_bmap_btalloc(
2809 */ 2821 */
2810 else 2822 else
2811 args.minlen = ap->alen; 2823 args.minlen = ap->alen;
2824
2825 /*
2826 * set the failure fallback case to look in the selected
2827 * AG as the stream may have moved.
2828 */
2829 if (xfs_inode_is_filestream(ap->ip))
2830 ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2812 } else if (ap->low) { 2831 } else if (ap->low) {
2813 args.type = XFS_ALLOCTYPE_START_BNO; 2832 if (xfs_inode_is_filestream(ap->ip))
2833 args.type = XFS_ALLOCTYPE_FIRST_AG;
2834 else
2835 args.type = XFS_ALLOCTYPE_START_BNO;
2814 args.total = args.minlen = ap->minlen; 2836 args.total = args.minlen = ap->minlen;
2815 } else { 2837 } else {
2816 args.type = XFS_ALLOCTYPE_NEAR_BNO; 2838 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2817 args.total = ap->total; 2839 args.total = ap->total;
2818 args.minlen = ap->minlen; 2840 args.minlen = ap->minlen;
2819 } 2841 }
2820 if (unlikely(ap->userdata && ap->ip->i_d.di_extsize && 2842 /* apply extent size hints if obtained earlier */
2821 (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE))) { 2843 if (unlikely(align)) {
2822 args.prod = ap->ip->i_d.di_extsize; 2844 args.prod = align;
2823 if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) 2845 if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
2824 args.mod = (xfs_extlen_t)(args.prod - args.mod); 2846 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2825 } else if (mp->m_sb.sb_blocksize >= NBPP) { 2847 } else if (mp->m_sb.sb_blocksize >= NBPP) {
@@ -3051,9 +3073,6 @@ xfs_bmap_del_extent(
3051 xfs_bmbt_rec_t *ep; /* current extent entry pointer */ 3073 xfs_bmbt_rec_t *ep; /* current extent entry pointer */
3052 int error; /* error return value */ 3074 int error; /* error return value */
3053 int flags; /* inode logging flags */ 3075 int flags; /* inode logging flags */
3054#ifdef XFS_BMAP_TRACE
3055 static char fname[] = "xfs_bmap_del_extent";
3056#endif
3057 xfs_bmbt_irec_t got; /* current extent entry */ 3076 xfs_bmbt_irec_t got; /* current extent entry */
3058 xfs_fileoff_t got_endoff; /* first offset past got */ 3077 xfs_fileoff_t got_endoff; /* first offset past got */
3059 int i; /* temp state */ 3078 int i; /* temp state */
@@ -3147,7 +3166,7 @@ xfs_bmap_del_extent(
3147 /* 3166 /*
3148 * Matches the whole extent. Delete the entry. 3167 * Matches the whole extent. Delete the entry.
3149 */ 3168 */
3150 xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork); 3169 XFS_BMAP_TRACE_DELETE("3", ip, idx, 1, whichfork);
3151 xfs_iext_remove(ifp, idx, 1); 3170 xfs_iext_remove(ifp, idx, 1);
3152 ifp->if_lastex = idx; 3171 ifp->if_lastex = idx;
3153 if (delay) 3172 if (delay)
@@ -3168,7 +3187,7 @@ xfs_bmap_del_extent(
3168 /* 3187 /*
3169 * Deleting the first part of the extent. 3188 * Deleting the first part of the extent.
3170 */ 3189 */
3171 xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork); 3190 XFS_BMAP_TRACE_PRE_UPDATE("2", ip, idx, whichfork);
3172 xfs_bmbt_set_startoff(ep, del_endoff); 3191 xfs_bmbt_set_startoff(ep, del_endoff);
3173 temp = got.br_blockcount - del->br_blockcount; 3192 temp = got.br_blockcount - del->br_blockcount;
3174 xfs_bmbt_set_blockcount(ep, temp); 3193 xfs_bmbt_set_blockcount(ep, temp);
@@ -3177,13 +3196,13 @@ xfs_bmap_del_extent(
3177 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3196 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3178 da_old); 3197 da_old);
3179 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 3198 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
3180 xfs_bmap_trace_post_update(fname, "2", ip, idx, 3199 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx,
3181 whichfork); 3200 whichfork);
3182 da_new = temp; 3201 da_new = temp;
3183 break; 3202 break;
3184 } 3203 }
3185 xfs_bmbt_set_startblock(ep, del_endblock); 3204 xfs_bmbt_set_startblock(ep, del_endblock);
3186 xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork); 3205 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork);
3187 if (!cur) { 3206 if (!cur) {
3188 flags |= XFS_ILOG_FEXT(whichfork); 3207 flags |= XFS_ILOG_FEXT(whichfork);
3189 break; 3208 break;
@@ -3199,19 +3218,19 @@ xfs_bmap_del_extent(
3199 * Deleting the last part of the extent. 3218 * Deleting the last part of the extent.
3200 */ 3219 */
3201 temp = got.br_blockcount - del->br_blockcount; 3220 temp = got.br_blockcount - del->br_blockcount;
3202 xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork); 3221 XFS_BMAP_TRACE_PRE_UPDATE("1", ip, idx, whichfork);
3203 xfs_bmbt_set_blockcount(ep, temp); 3222 xfs_bmbt_set_blockcount(ep, temp);
3204 ifp->if_lastex = idx; 3223 ifp->if_lastex = idx;
3205 if (delay) { 3224 if (delay) {
3206 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3225 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3207 da_old); 3226 da_old);
3208 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 3227 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
3209 xfs_bmap_trace_post_update(fname, "1", ip, idx, 3228 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx,
3210 whichfork); 3229 whichfork);
3211 da_new = temp; 3230 da_new = temp;
3212 break; 3231 break;
3213 } 3232 }
3214 xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork); 3233 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork);
3215 if (!cur) { 3234 if (!cur) {
3216 flags |= XFS_ILOG_FEXT(whichfork); 3235 flags |= XFS_ILOG_FEXT(whichfork);
3217 break; 3236 break;
@@ -3228,7 +3247,7 @@ xfs_bmap_del_extent(
3228 * Deleting the middle of the extent. 3247 * Deleting the middle of the extent.
3229 */ 3248 */
3230 temp = del->br_startoff - got.br_startoff; 3249 temp = del->br_startoff - got.br_startoff;
3231 xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork); 3250 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, whichfork);
3232 xfs_bmbt_set_blockcount(ep, temp); 3251 xfs_bmbt_set_blockcount(ep, temp);
3233 new.br_startoff = del_endoff; 3252 new.br_startoff = del_endoff;
3234 temp2 = got_endoff - del_endoff; 3253 temp2 = got_endoff - del_endoff;
@@ -3315,8 +3334,8 @@ xfs_bmap_del_extent(
3315 } 3334 }
3316 } 3335 }
3317 } 3336 }
3318 xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork); 3337 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, whichfork);
3319 xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL, 3338 XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 1, &new, NULL,
3320 whichfork); 3339 whichfork);
3321 xfs_iext_insert(ifp, idx + 1, 1, &new); 3340 xfs_iext_insert(ifp, idx + 1, 1, &new);
3322 ifp->if_lastex = idx + 1; 3341 ifp->if_lastex = idx + 1;
@@ -3556,9 +3575,6 @@ xfs_bmap_local_to_extents(
3556{ 3575{
3557 int error; /* error return value */ 3576 int error; /* error return value */
3558 int flags; /* logging flags returned */ 3577 int flags; /* logging flags returned */
3559#ifdef XFS_BMAP_TRACE
3560 static char fname[] = "xfs_bmap_local_to_extents";
3561#endif
3562 xfs_ifork_t *ifp; /* inode fork pointer */ 3578 xfs_ifork_t *ifp; /* inode fork pointer */
3563 3579
3564 /* 3580 /*
@@ -3613,7 +3629,7 @@ xfs_bmap_local_to_extents(
3613 xfs_iext_add(ifp, 0, 1); 3629 xfs_iext_add(ifp, 0, 1);
3614 ep = xfs_iext_get_ext(ifp, 0); 3630 ep = xfs_iext_get_ext(ifp, 0);
3615 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); 3631 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
3616 xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork); 3632 XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork);
3617 XFS_IFORK_NEXT_SET(ip, whichfork, 1); 3633 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
3618 ip->i_d.di_nblocks = 1; 3634 ip->i_d.di_nblocks = 1;
3619 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip, 3635 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
@@ -3736,7 +3752,7 @@ ktrace_t *xfs_bmap_trace_buf;
3736STATIC void 3752STATIC void
3737xfs_bmap_trace_addentry( 3753xfs_bmap_trace_addentry(
3738 int opcode, /* operation */ 3754 int opcode, /* operation */
3739 char *fname, /* function name */ 3755 const char *fname, /* function name */
3740 char *desc, /* operation description */ 3756 char *desc, /* operation description */
3741 xfs_inode_t *ip, /* incore inode pointer */ 3757 xfs_inode_t *ip, /* incore inode pointer */
3742 xfs_extnum_t idx, /* index of entry(ies) */ 3758 xfs_extnum_t idx, /* index of entry(ies) */
@@ -3795,7 +3811,7 @@ xfs_bmap_trace_addentry(
3795 */ 3811 */
3796STATIC void 3812STATIC void
3797xfs_bmap_trace_delete( 3813xfs_bmap_trace_delete(
3798 char *fname, /* function name */ 3814 const char *fname, /* function name */
3799 char *desc, /* operation description */ 3815 char *desc, /* operation description */
3800 xfs_inode_t *ip, /* incore inode pointer */ 3816 xfs_inode_t *ip, /* incore inode pointer */
3801 xfs_extnum_t idx, /* index of entry(entries) deleted */ 3817 xfs_extnum_t idx, /* index of entry(entries) deleted */
@@ -3817,7 +3833,7 @@ xfs_bmap_trace_delete(
3817 */ 3833 */
3818STATIC void 3834STATIC void
3819xfs_bmap_trace_insert( 3835xfs_bmap_trace_insert(
3820 char *fname, /* function name */ 3836 const char *fname, /* function name */
3821 char *desc, /* operation description */ 3837 char *desc, /* operation description */
3822 xfs_inode_t *ip, /* incore inode pointer */ 3838 xfs_inode_t *ip, /* incore inode pointer */
3823 xfs_extnum_t idx, /* index of entry(entries) inserted */ 3839 xfs_extnum_t idx, /* index of entry(entries) inserted */
@@ -3846,7 +3862,7 @@ xfs_bmap_trace_insert(
3846 */ 3862 */
3847STATIC void 3863STATIC void
3848xfs_bmap_trace_post_update( 3864xfs_bmap_trace_post_update(
3849 char *fname, /* function name */ 3865 const char *fname, /* function name */
3850 char *desc, /* operation description */ 3866 char *desc, /* operation description */
3851 xfs_inode_t *ip, /* incore inode pointer */ 3867 xfs_inode_t *ip, /* incore inode pointer */
3852 xfs_extnum_t idx, /* index of entry updated */ 3868 xfs_extnum_t idx, /* index of entry updated */
@@ -3864,7 +3880,7 @@ xfs_bmap_trace_post_update(
3864 */ 3880 */
3865STATIC void 3881STATIC void
3866xfs_bmap_trace_pre_update( 3882xfs_bmap_trace_pre_update(
3867 char *fname, /* function name */ 3883 const char *fname, /* function name */
3868 char *desc, /* operation description */ 3884 char *desc, /* operation description */
3869 xfs_inode_t *ip, /* incore inode pointer */ 3885 xfs_inode_t *ip, /* incore inode pointer */
3870 xfs_extnum_t idx, /* index of entry to be updated */ 3886 xfs_extnum_t idx, /* index of entry to be updated */
@@ -4481,9 +4497,6 @@ xfs_bmap_read_extents(
4481 xfs_buf_t *bp; /* buffer for "block" */ 4497 xfs_buf_t *bp; /* buffer for "block" */
4482 int error; /* error return value */ 4498 int error; /* error return value */
4483 xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ 4499 xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
4484#ifdef XFS_BMAP_TRACE
4485 static char fname[] = "xfs_bmap_read_extents";
4486#endif
4487 xfs_extnum_t i, j; /* index into the extents list */ 4500 xfs_extnum_t i, j; /* index into the extents list */
4488 xfs_ifork_t *ifp; /* fork structure */ 4501 xfs_ifork_t *ifp; /* fork structure */
4489 int level; /* btree level, for checking */ 4502 int level; /* btree level, for checking */
@@ -4600,7 +4613,7 @@ xfs_bmap_read_extents(
4600 } 4613 }
4601 ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); 4614 ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
4602 ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); 4615 ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
4603 xfs_bmap_trace_exlist(fname, ip, i, whichfork); 4616 XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
4604 return 0; 4617 return 0;
4605error0: 4618error0:
4606 xfs_trans_brelse(tp, bp); 4619 xfs_trans_brelse(tp, bp);
@@ -4613,7 +4626,7 @@ error0:
4613 */ 4626 */
4614void 4627void
4615xfs_bmap_trace_exlist( 4628xfs_bmap_trace_exlist(
4616 char *fname, /* function name */ 4629 const char *fname, /* function name */
4617 xfs_inode_t *ip, /* incore inode pointer */ 4630 xfs_inode_t *ip, /* incore inode pointer */
4618 xfs_extnum_t cnt, /* count of entries in the list */ 4631 xfs_extnum_t cnt, /* count of entries in the list */
4619 int whichfork) /* data or attr fork */ 4632 int whichfork) /* data or attr fork */
@@ -4628,7 +4641,7 @@ xfs_bmap_trace_exlist(
4628 for (idx = 0; idx < cnt; idx++) { 4641 for (idx = 0; idx < cnt; idx++) {
4629 ep = xfs_iext_get_ext(ifp, idx); 4642 ep = xfs_iext_get_ext(ifp, idx);
4630 xfs_bmbt_get_all(ep, &s); 4643 xfs_bmbt_get_all(ep, &s);
4631 xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL, 4644 XFS_BMAP_TRACE_INSERT("exlist", ip, idx, 1, &s, NULL,
4632 whichfork); 4645 whichfork);
4633 } 4646 }
4634} 4647}
@@ -4868,12 +4881,7 @@ xfs_bmapi(
4868 xfs_extlen_t extsz; 4881 xfs_extlen_t extsz;
4869 4882
4870 /* Figure out the extent size, adjust alen */ 4883 /* Figure out the extent size, adjust alen */
4871 if (rt) { 4884 extsz = xfs_get_extsz_hint(ip);
4872 if (!(extsz = ip->i_d.di_extsize))
4873 extsz = mp->m_sb.sb_rextsize;
4874 } else {
4875 extsz = ip->i_d.di_extsize;
4876 }
4877 if (extsz) { 4885 if (extsz) {
4878 error = xfs_bmap_extsize_align(mp, 4886 error = xfs_bmap_extsize_align(mp,
4879 &got, &prev, extsz, 4887 &got, &prev, extsz,
@@ -5219,10 +5227,10 @@ xfs_bmapi(
5219 * Else go on to the next record. 5227 * Else go on to the next record.
5220 */ 5228 */
5221 ep = xfs_iext_get_ext(ifp, ++lastx); 5229 ep = xfs_iext_get_ext(ifp, ++lastx);
5222 if (lastx >= nextents) { 5230 prev = got;
5231 if (lastx >= nextents)
5223 eof = 1; 5232 eof = 1;
5224 prev = got; 5233 else
5225 } else
5226 xfs_bmbt_get_all(ep, &got); 5234 xfs_bmbt_get_all(ep, &got);
5227 } 5235 }
5228 ifp->if_lastex = lastx; 5236 ifp->if_lastex = lastx;
@@ -5813,8 +5821,7 @@ xfs_getbmap(
5813 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) 5821 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
5814 return XFS_ERROR(EINVAL); 5822 return XFS_ERROR(EINVAL);
5815 if (whichfork == XFS_DATA_FORK) { 5823 if (whichfork == XFS_DATA_FORK) {
5816 if ((ip->i_d.di_extsize && (ip->i_d.di_flags & 5824 if (xfs_get_extsz_hint(ip) ||
5817 (XFS_DIFLAG_REALTIME|XFS_DIFLAG_EXTSIZE))) ||
5818 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ 5825 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
5819 prealloced = 1; 5826 prealloced = 1;
5820 fixlen = XFS_MAXIOFFSET(mp); 5827 fixlen = XFS_MAXIOFFSET(mp);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 4f24c7e39b..524b1c9d52 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -144,12 +144,14 @@ extern ktrace_t *xfs_bmap_trace_buf;
144 */ 144 */
145void 145void
146xfs_bmap_trace_exlist( 146xfs_bmap_trace_exlist(
147 char *fname, /* function name */ 147 const char *fname, /* function name */
148 struct xfs_inode *ip, /* incore inode pointer */ 148 struct xfs_inode *ip, /* incore inode pointer */
149 xfs_extnum_t cnt, /* count of entries in list */ 149 xfs_extnum_t cnt, /* count of entries in list */
150 int whichfork); /* data or attr fork */ 150 int whichfork); /* data or attr fork */
151#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
152 xfs_bmap_trace_exlist(__FUNCTION__,ip,c,w)
151#else 153#else
152#define xfs_bmap_trace_exlist(f,ip,c,w) 154#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
153#endif 155#endif
154 156
155/* 157/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 0bf192fea3..89b891f51c 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -76,7 +76,7 @@ static char EXIT[] = "exit";
76 */ 76 */
77STATIC void 77STATIC void
78xfs_bmbt_trace_enter( 78xfs_bmbt_trace_enter(
79 char *func, 79 const char *func,
80 xfs_btree_cur_t *cur, 80 xfs_btree_cur_t *cur,
81 char *s, 81 char *s,
82 int type, 82 int type,
@@ -117,7 +117,7 @@ xfs_bmbt_trace_enter(
117 */ 117 */
118STATIC void 118STATIC void
119xfs_bmbt_trace_argbi( 119xfs_bmbt_trace_argbi(
120 char *func, 120 const char *func,
121 xfs_btree_cur_t *cur, 121 xfs_btree_cur_t *cur,
122 xfs_buf_t *b, 122 xfs_buf_t *b,
123 int i, 123 int i,
@@ -134,7 +134,7 @@ xfs_bmbt_trace_argbi(
134 */ 134 */
135STATIC void 135STATIC void
136xfs_bmbt_trace_argbii( 136xfs_bmbt_trace_argbii(
137 char *func, 137 const char *func,
138 xfs_btree_cur_t *cur, 138 xfs_btree_cur_t *cur,
139 xfs_buf_t *b, 139 xfs_buf_t *b,
140 int i0, 140 int i0,
@@ -153,7 +153,7 @@ xfs_bmbt_trace_argbii(
153 */ 153 */
154STATIC void 154STATIC void
155xfs_bmbt_trace_argfffi( 155xfs_bmbt_trace_argfffi(
156 char *func, 156 const char *func,
157 xfs_btree_cur_t *cur, 157 xfs_btree_cur_t *cur,
158 xfs_dfiloff_t o, 158 xfs_dfiloff_t o,
159 xfs_dfsbno_t b, 159 xfs_dfsbno_t b,
@@ -172,7 +172,7 @@ xfs_bmbt_trace_argfffi(
172 */ 172 */
173STATIC void 173STATIC void
174xfs_bmbt_trace_argi( 174xfs_bmbt_trace_argi(
175 char *func, 175 const char *func,
176 xfs_btree_cur_t *cur, 176 xfs_btree_cur_t *cur,
177 int i, 177 int i,
178 int line) 178 int line)
@@ -188,7 +188,7 @@ xfs_bmbt_trace_argi(
188 */ 188 */
189STATIC void 189STATIC void
190xfs_bmbt_trace_argifk( 190xfs_bmbt_trace_argifk(
191 char *func, 191 const char *func,
192 xfs_btree_cur_t *cur, 192 xfs_btree_cur_t *cur,
193 int i, 193 int i,
194 xfs_fsblock_t f, 194 xfs_fsblock_t f,
@@ -206,7 +206,7 @@ xfs_bmbt_trace_argifk(
206 */ 206 */
207STATIC void 207STATIC void
208xfs_bmbt_trace_argifr( 208xfs_bmbt_trace_argifr(
209 char *func, 209 const char *func,
210 xfs_btree_cur_t *cur, 210 xfs_btree_cur_t *cur,
211 int i, 211 int i,
212 xfs_fsblock_t f, 212 xfs_fsblock_t f,
@@ -235,7 +235,7 @@ xfs_bmbt_trace_argifr(
235 */ 235 */
236STATIC void 236STATIC void
237xfs_bmbt_trace_argik( 237xfs_bmbt_trace_argik(
238 char *func, 238 const char *func,
239 xfs_btree_cur_t *cur, 239 xfs_btree_cur_t *cur,
240 int i, 240 int i,
241 xfs_bmbt_key_t *k, 241 xfs_bmbt_key_t *k,
@@ -255,7 +255,7 @@ xfs_bmbt_trace_argik(
255 */ 255 */
256STATIC void 256STATIC void
257xfs_bmbt_trace_cursor( 257xfs_bmbt_trace_cursor(
258 char *func, 258 const char *func,
259 xfs_btree_cur_t *cur, 259 xfs_btree_cur_t *cur,
260 char *s, 260 char *s,
261 int line) 261 int line)
@@ -274,21 +274,21 @@ xfs_bmbt_trace_cursor(
274} 274}
275 275
276#define XFS_BMBT_TRACE_ARGBI(c,b,i) \ 276#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
277 xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__) 277 xfs_bmbt_trace_argbi(__FUNCTION__, c, b, i, __LINE__)
278#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ 278#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
279 xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__) 279 xfs_bmbt_trace_argbii(__FUNCTION__, c, b, i, j, __LINE__)
280#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ 280#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
281 xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__) 281 xfs_bmbt_trace_argfffi(__FUNCTION__, c, o, b, i, j, __LINE__)
282#define XFS_BMBT_TRACE_ARGI(c,i) \ 282#define XFS_BMBT_TRACE_ARGI(c,i) \
283 xfs_bmbt_trace_argi(fname, c, i, __LINE__) 283 xfs_bmbt_trace_argi(__FUNCTION__, c, i, __LINE__)
284#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \ 284#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \
285 xfs_bmbt_trace_argifk(fname, c, i, f, s, __LINE__) 285 xfs_bmbt_trace_argifk(__FUNCTION__, c, i, f, s, __LINE__)
286#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ 286#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
287 xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__) 287 xfs_bmbt_trace_argifr(__FUNCTION__, c, i, f, r, __LINE__)
288#define XFS_BMBT_TRACE_ARGIK(c,i,k) \ 288#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
289 xfs_bmbt_trace_argik(fname, c, i, k, __LINE__) 289 xfs_bmbt_trace_argik(__FUNCTION__, c, i, k, __LINE__)
290#define XFS_BMBT_TRACE_CURSOR(c,s) \ 290#define XFS_BMBT_TRACE_CURSOR(c,s) \
291 xfs_bmbt_trace_cursor(fname, c, s, __LINE__) 291 xfs_bmbt_trace_cursor(__FUNCTION__, c, s, __LINE__)
292#else 292#else
293#define XFS_BMBT_TRACE_ARGBI(c,b,i) 293#define XFS_BMBT_TRACE_ARGBI(c,b,i)
294#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) 294#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
@@ -318,9 +318,6 @@ xfs_bmbt_delrec(
318 xfs_fsblock_t bno; /* fs-relative block number */ 318 xfs_fsblock_t bno; /* fs-relative block number */
319 xfs_buf_t *bp; /* buffer for block */ 319 xfs_buf_t *bp; /* buffer for block */
320 int error; /* error return value */ 320 int error; /* error return value */
321#ifdef XFS_BMBT_TRACE
322 static char fname[] = "xfs_bmbt_delrec";
323#endif
324 int i; /* loop counter */ 321 int i; /* loop counter */
325 int j; /* temp state */ 322 int j; /* temp state */
326 xfs_bmbt_key_t key; /* bmap btree key */ 323 xfs_bmbt_key_t key; /* bmap btree key */
@@ -694,9 +691,6 @@ xfs_bmbt_insrec(
694 xfs_bmbt_block_t *block; /* bmap btree block */ 691 xfs_bmbt_block_t *block; /* bmap btree block */
695 xfs_buf_t *bp; /* buffer for block */ 692 xfs_buf_t *bp; /* buffer for block */
696 int error; /* error return value */ 693 int error; /* error return value */
697#ifdef XFS_BMBT_TRACE
698 static char fname[] = "xfs_bmbt_insrec";
699#endif
700 int i; /* loop index */ 694 int i; /* loop index */
701 xfs_bmbt_key_t key; /* bmap btree key */ 695 xfs_bmbt_key_t key; /* bmap btree key */
702 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ 696 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
@@ -881,9 +875,6 @@ xfs_bmbt_killroot(
881#ifdef DEBUG 875#ifdef DEBUG
882 int error; 876 int error;
883#endif 877#endif
884#ifdef XFS_BMBT_TRACE
885 static char fname[] = "xfs_bmbt_killroot";
886#endif
887 int i; 878 int i;
888 xfs_bmbt_key_t *kp; 879 xfs_bmbt_key_t *kp;
889 xfs_inode_t *ip; 880 xfs_inode_t *ip;
@@ -973,9 +964,6 @@ xfs_bmbt_log_keys(
973 int kfirst, 964 int kfirst,
974 int klast) 965 int klast)
975{ 966{
976#ifdef XFS_BMBT_TRACE
977 static char fname[] = "xfs_bmbt_log_keys";
978#endif
979 xfs_trans_t *tp; 967 xfs_trans_t *tp;
980 968
981 XFS_BMBT_TRACE_CURSOR(cur, ENTRY); 969 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
@@ -1012,9 +1000,6 @@ xfs_bmbt_log_ptrs(
1012 int pfirst, 1000 int pfirst,
1013 int plast) 1001 int plast)
1014{ 1002{
1015#ifdef XFS_BMBT_TRACE
1016 static char fname[] = "xfs_bmbt_log_ptrs";
1017#endif
1018 xfs_trans_t *tp; 1003 xfs_trans_t *tp;
1019 1004
1020 XFS_BMBT_TRACE_CURSOR(cur, ENTRY); 1005 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
@@ -1055,9 +1040,6 @@ xfs_bmbt_lookup(
1055 xfs_daddr_t d; 1040 xfs_daddr_t d;
1056 xfs_sfiloff_t diff; 1041 xfs_sfiloff_t diff;
1057 int error; /* error return value */ 1042 int error; /* error return value */
1058#ifdef XFS_BMBT_TRACE
1059 static char fname[] = "xfs_bmbt_lookup";
1060#endif
1061 xfs_fsblock_t fsbno=0; 1043 xfs_fsblock_t fsbno=0;
1062 int high; 1044 int high;
1063 int i; 1045 int i;
@@ -1195,9 +1177,6 @@ xfs_bmbt_lshift(
1195 int *stat) /* success/failure */ 1177 int *stat) /* success/failure */
1196{ 1178{
1197 int error; /* error return value */ 1179 int error; /* error return value */
1198#ifdef XFS_BMBT_TRACE
1199 static char fname[] = "xfs_bmbt_lshift";
1200#endif
1201#ifdef DEBUG 1180#ifdef DEBUG
1202 int i; /* loop counter */ 1181 int i; /* loop counter */
1203#endif 1182#endif
@@ -1331,9 +1310,6 @@ xfs_bmbt_rshift(
1331 int *stat) /* success/failure */ 1310 int *stat) /* success/failure */
1332{ 1311{
1333 int error; /* error return value */ 1312 int error; /* error return value */
1334#ifdef XFS_BMBT_TRACE
1335 static char fname[] = "xfs_bmbt_rshift";
1336#endif
1337 int i; /* loop counter */ 1313 int i; /* loop counter */
1338 xfs_bmbt_key_t key; /* bmap btree key */ 1314 xfs_bmbt_key_t key; /* bmap btree key */
1339 xfs_buf_t *lbp; /* left buffer pointer */ 1315 xfs_buf_t *lbp; /* left buffer pointer */
@@ -1492,9 +1468,6 @@ xfs_bmbt_split(
1492{ 1468{
1493 xfs_alloc_arg_t args; /* block allocation args */ 1469 xfs_alloc_arg_t args; /* block allocation args */
1494 int error; /* error return value */ 1470 int error; /* error return value */
1495#ifdef XFS_BMBT_TRACE
1496 static char fname[] = "xfs_bmbt_split";
1497#endif
1498 int i; /* loop counter */ 1471 int i; /* loop counter */
1499 xfs_fsblock_t lbno; /* left sibling block number */ 1472 xfs_fsblock_t lbno; /* left sibling block number */
1500 xfs_buf_t *lbp; /* left buffer pointer */ 1473 xfs_buf_t *lbp; /* left buffer pointer */
@@ -1641,9 +1614,6 @@ xfs_bmbt_updkey(
1641#ifdef DEBUG 1614#ifdef DEBUG
1642 int error; 1615 int error;
1643#endif 1616#endif
1644#ifdef XFS_BMBT_TRACE
1645 static char fname[] = "xfs_bmbt_updkey";
1646#endif
1647 xfs_bmbt_key_t *kp; 1617 xfs_bmbt_key_t *kp;
1648 int ptr; 1618 int ptr;
1649 1619
@@ -1712,9 +1682,6 @@ xfs_bmbt_decrement(
1712 xfs_bmbt_block_t *block; 1682 xfs_bmbt_block_t *block;
1713 xfs_buf_t *bp; 1683 xfs_buf_t *bp;
1714 int error; /* error return value */ 1684 int error; /* error return value */
1715#ifdef XFS_BMBT_TRACE
1716 static char fname[] = "xfs_bmbt_decrement";
1717#endif
1718 xfs_fsblock_t fsbno; 1685 xfs_fsblock_t fsbno;
1719 int lev; 1686 int lev;
1720 xfs_mount_t *mp; 1687 xfs_mount_t *mp;
@@ -1785,9 +1752,6 @@ xfs_bmbt_delete(
1785 int *stat) /* success/failure */ 1752 int *stat) /* success/failure */
1786{ 1753{
1787 int error; /* error return value */ 1754 int error; /* error return value */
1788#ifdef XFS_BMBT_TRACE
1789 static char fname[] = "xfs_bmbt_delete";
1790#endif
1791 int i; 1755 int i;
1792 int level; 1756 int level;
1793 1757
@@ -2000,9 +1964,6 @@ xfs_bmbt_increment(
2000 xfs_bmbt_block_t *block; 1964 xfs_bmbt_block_t *block;
2001 xfs_buf_t *bp; 1965 xfs_buf_t *bp;
2002 int error; /* error return value */ 1966 int error; /* error return value */
2003#ifdef XFS_BMBT_TRACE
2004 static char fname[] = "xfs_bmbt_increment";
2005#endif
2006 xfs_fsblock_t fsbno; 1967 xfs_fsblock_t fsbno;
2007 int lev; 1968 int lev;
2008 xfs_mount_t *mp; 1969 xfs_mount_t *mp;
@@ -2080,9 +2041,6 @@ xfs_bmbt_insert(
2080 int *stat) /* success/failure */ 2041 int *stat) /* success/failure */
2081{ 2042{
2082 int error; /* error return value */ 2043 int error; /* error return value */
2083#ifdef XFS_BMBT_TRACE
2084 static char fname[] = "xfs_bmbt_insert";
2085#endif
2086 int i; 2044 int i;
2087 int level; 2045 int level;
2088 xfs_fsblock_t nbno; 2046 xfs_fsblock_t nbno;
@@ -2142,9 +2100,6 @@ xfs_bmbt_log_block(
2142 int fields) 2100 int fields)
2143{ 2101{
2144 int first; 2102 int first;
2145#ifdef XFS_BMBT_TRACE
2146 static char fname[] = "xfs_bmbt_log_block";
2147#endif
2148 int last; 2103 int last;
2149 xfs_trans_t *tp; 2104 xfs_trans_t *tp;
2150 static const short offsets[] = { 2105 static const short offsets[] = {
@@ -2181,9 +2136,6 @@ xfs_bmbt_log_recs(
2181{ 2136{
2182 xfs_bmbt_block_t *block; 2137 xfs_bmbt_block_t *block;
2183 int first; 2138 int first;
2184#ifdef XFS_BMBT_TRACE
2185 static char fname[] = "xfs_bmbt_log_recs";
2186#endif
2187 int last; 2139 int last;
2188 xfs_bmbt_rec_t *rp; 2140 xfs_bmbt_rec_t *rp;
2189 xfs_trans_t *tp; 2141 xfs_trans_t *tp;
@@ -2245,9 +2197,6 @@ xfs_bmbt_newroot(
2245 xfs_bmbt_key_t *ckp; /* child key pointer */ 2197 xfs_bmbt_key_t *ckp; /* child key pointer */
2246 xfs_bmbt_ptr_t *cpp; /* child ptr pointer */ 2198 xfs_bmbt_ptr_t *cpp; /* child ptr pointer */
2247 int error; /* error return code */ 2199 int error; /* error return code */
2248#ifdef XFS_BMBT_TRACE
2249 static char fname[] = "xfs_bmbt_newroot";
2250#endif
2251#ifdef DEBUG 2200#ifdef DEBUG
2252 int i; /* loop counter */ 2201 int i; /* loop counter */
2253#endif 2202#endif
@@ -2630,9 +2579,6 @@ xfs_bmbt_update(
2630 xfs_bmbt_block_t *block; 2579 xfs_bmbt_block_t *block;
2631 xfs_buf_t *bp; 2580 xfs_buf_t *bp;
2632 int error; 2581 int error;
2633#ifdef XFS_BMBT_TRACE
2634 static char fname[] = "xfs_bmbt_update";
2635#endif
2636 xfs_bmbt_key_t key; 2582 xfs_bmbt_key_t key;
2637 int ptr; 2583 int ptr;
2638 xfs_bmbt_rec_t *rp; 2584 xfs_bmbt_rec_t *rp;
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4e27d55a1e..6e40a0a198 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -444,30 +444,14 @@ xfs_btree_setbuf(
444/* 444/*
445 * Min and max functions for extlen, agblock, fileoff, and filblks types. 445 * Min and max functions for extlen, agblock, fileoff, and filblks types.
446 */ 446 */
447#define XFS_EXTLEN_MIN(a,b) \ 447#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b))
448 ((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \ 448#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b))
449 (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) 449#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b))
450#define XFS_EXTLEN_MAX(a,b) \ 450#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b))
451 ((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \ 451#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b))
452 (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) 452#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b))
453#define XFS_AGBLOCK_MIN(a,b) \ 453#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
454 ((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \ 454#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
455 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
456#define XFS_AGBLOCK_MAX(a,b) \
457 ((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \
458 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
459#define XFS_FILEOFF_MIN(a,b) \
460 ((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \
461 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
462#define XFS_FILEOFF_MAX(a,b) \
463 ((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \
464 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
465#define XFS_FILBLKS_MIN(a,b) \
466 ((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \
467 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
468#define XFS_FILBLKS_MAX(a,b) \
469 ((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \
470 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
471 455
472#define XFS_FSB_SANITY_CHECK(mp,fsb) \ 456#define XFS_FSB_SANITY_CHECK(mp,fsb) \
473 (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ 457 (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 6c1bddc04e..b0667cb27d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -580,8 +580,8 @@ xfs_buf_item_unlock(
580 * If the buf item isn't tracking any data, free it. 580 * If the buf item isn't tracking any data, free it.
581 * Otherwise, if XFS_BLI_HOLD is set clear it. 581 * Otherwise, if XFS_BLI_HOLD is set clear it.
582 */ 582 */
583 if (xfs_count_bits(bip->bli_format.blf_data_map, 583 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
584 bip->bli_format.blf_map_size, 0) == 0) { 584 bip->bli_format.blf_map_size)) {
585 xfs_buf_item_relse(bp); 585 xfs_buf_item_relse(bp);
586 } else if (hold) { 586 } else if (hold) {
587 bip->bli_flags &= ~XFS_BLI_HOLD; 587 bip->bli_flags &= ~XFS_BLI_HOLD;
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index 5b7eb81453..f89196cb08 100644
--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -99,5 +99,7 @@ struct xfs_mount_args {
99 */ 99 */
100#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred 100#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred
101 * I/O size in stat(2) */ 101 * I/O size in stat(2) */
102#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams
103 * allocator */
102 104
103#endif /* __XFS_CLNT_H__ */ 105#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index b33826961c..fefd0116ba 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -257,6 +257,7 @@ typedef enum xfs_dinode_fmt
257#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ 257#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
258#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ 258#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
259#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ 259#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
260#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
260#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) 261#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
261#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) 262#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
262#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) 263#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
@@ -271,12 +272,13 @@ typedef enum xfs_dinode_fmt
271#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) 272#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
272#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) 273#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
273#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) 274#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
275#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
274 276
275#define XFS_DIFLAG_ANY \ 277#define XFS_DIFLAG_ANY \
276 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ 278 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
277 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ 279 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
278 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ 280 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
279 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ 281 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
280 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG) 282 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
281 283
282#endif /* __XFS_DINODE_H__ */ 284#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 8e8e527933..29e091914d 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -55,9 +55,9 @@ xfs_dir_mount(
55 XFS_MAX_BLOCKSIZE); 55 XFS_MAX_BLOCKSIZE);
56 mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); 56 mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
57 mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; 57 mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
58 mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp)); 58 mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
59 mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); 59 mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
60 mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp)); 60 mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
61 mp->m_attr_node_ents = 61 mp->m_attr_node_ents =
62 (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / 62 (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) /
63 (uint)sizeof(xfs_da_node_entry_t); 63 (uint)sizeof(xfs_da_node_entry_t);
@@ -554,7 +554,7 @@ xfs_dir2_grow_inode(
554 */ 554 */
555 if (mapp != &map) 555 if (mapp != &map)
556 kmem_free(mapp, sizeof(*mapp) * count); 556 kmem_free(mapp, sizeof(*mapp) * count);
557 *dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno); 557 *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
558 /* 558 /*
559 * Update file's size if this is the data space and it grew. 559 * Update file's size if this is the data space and it grew.
560 */ 560 */
@@ -706,7 +706,7 @@ xfs_dir2_shrink_inode(
706 dp = args->dp; 706 dp = args->dp;
707 mp = dp->i_mount; 707 mp = dp->i_mount;
708 tp = args->trans; 708 tp = args->trans;
709 da = XFS_DIR2_DB_TO_DA(mp, db); 709 da = xfs_dir2_db_to_da(mp, db);
710 /* 710 /*
711 * Unmap the fsblock(s). 711 * Unmap the fsblock(s).
712 */ 712 */
@@ -742,7 +742,7 @@ xfs_dir2_shrink_inode(
742 /* 742 /*
743 * If the block isn't the last one in the directory, we're done. 743 * If the block isn't the last one in the directory, we're done.
744 */ 744 */
745 if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0)) 745 if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0))
746 return 0; 746 return 0;
747 bno = da; 747 bno = da;
748 if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { 748 if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 3accc1dcd6..e4df1aaae2 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -115,13 +115,13 @@ xfs_dir2_block_addname(
115 xfs_da_brelse(tp, bp); 115 xfs_da_brelse(tp, bp);
116 return XFS_ERROR(EFSCORRUPTED); 116 return XFS_ERROR(EFSCORRUPTED);
117 } 117 }
118 len = XFS_DIR2_DATA_ENTSIZE(args->namelen); 118 len = xfs_dir2_data_entsize(args->namelen);
119 /* 119 /*
120 * Set up pointers to parts of the block. 120 * Set up pointers to parts of the block.
121 */ 121 */
122 bf = block->hdr.bestfree; 122 bf = block->hdr.bestfree;
123 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 123 btp = xfs_dir2_block_tail_p(mp, block);
124 blp = XFS_DIR2_BLOCK_LEAF_P(btp); 124 blp = xfs_dir2_block_leaf_p(btp);
125 /* 125 /*
126 * No stale entries? Need space for entry and new leaf. 126 * No stale entries? Need space for entry and new leaf.
127 */ 127 */
@@ -396,7 +396,7 @@ xfs_dir2_block_addname(
396 * Fill in the leaf entry. 396 * Fill in the leaf entry.
397 */ 397 */
398 blp[mid].hashval = cpu_to_be32(args->hashval); 398 blp[mid].hashval = cpu_to_be32(args->hashval);
399 blp[mid].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, 399 blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
400 (char *)dep - (char *)block)); 400 (char *)dep - (char *)block));
401 xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); 401 xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
402 /* 402 /*
@@ -411,7 +411,7 @@ xfs_dir2_block_addname(
411 dep->inumber = cpu_to_be64(args->inumber); 411 dep->inumber = cpu_to_be64(args->inumber);
412 dep->namelen = args->namelen; 412 dep->namelen = args->namelen;
413 memcpy(dep->name, args->name, args->namelen); 413 memcpy(dep->name, args->name, args->namelen);
414 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); 414 tagp = xfs_dir2_data_entry_tag_p(dep);
415 *tagp = cpu_to_be16((char *)dep - (char *)block); 415 *tagp = cpu_to_be16((char *)dep - (char *)block);
416 /* 416 /*
417 * Clean up the bestfree array and log the header, tail, and entry. 417 * Clean up the bestfree array and log the header, tail, and entry.
@@ -455,7 +455,7 @@ xfs_dir2_block_getdents(
455 /* 455 /*
456 * If the block number in the offset is out of range, we're done. 456 * If the block number in the offset is out of range, we're done.
457 */ 457 */
458 if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) { 458 if (xfs_dir2_dataptr_to_db(mp, uio->uio_offset) > mp->m_dirdatablk) {
459 *eofp = 1; 459 *eofp = 1;
460 return 0; 460 return 0;
461 } 461 }
@@ -471,15 +471,15 @@ xfs_dir2_block_getdents(
471 * Extract the byte offset we start at from the seek pointer. 471 * Extract the byte offset we start at from the seek pointer.
472 * We'll skip entries before this. 472 * We'll skip entries before this.
473 */ 473 */
474 wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset); 474 wantoff = xfs_dir2_dataptr_to_off(mp, uio->uio_offset);
475 block = bp->data; 475 block = bp->data;
476 xfs_dir2_data_check(dp, bp); 476 xfs_dir2_data_check(dp, bp);
477 /* 477 /*
478 * Set up values for the loop. 478 * Set up values for the loop.
479 */ 479 */
480 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 480 btp = xfs_dir2_block_tail_p(mp, block);
481 ptr = (char *)block->u; 481 ptr = (char *)block->u;
482 endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); 482 endptr = (char *)xfs_dir2_block_leaf_p(btp);
483 p.dbp = dbp; 483 p.dbp = dbp;
484 p.put = put; 484 p.put = put;
485 p.uio = uio; 485 p.uio = uio;
@@ -502,7 +502,7 @@ xfs_dir2_block_getdents(
502 /* 502 /*
503 * Bump pointer for the next iteration. 503 * Bump pointer for the next iteration.
504 */ 504 */
505 ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); 505 ptr += xfs_dir2_data_entsize(dep->namelen);
506 /* 506 /*
507 * The entry is before the desired starting point, skip it. 507 * The entry is before the desired starting point, skip it.
508 */ 508 */
@@ -513,7 +513,7 @@ xfs_dir2_block_getdents(
513 */ 513 */
514 p.namelen = dep->namelen; 514 p.namelen = dep->namelen;
515 515
516 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 516 p.cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
517 ptr - (char *)block); 517 ptr - (char *)block);
518 p.ino = be64_to_cpu(dep->inumber); 518 p.ino = be64_to_cpu(dep->inumber);
519#if XFS_BIG_INUMS 519#if XFS_BIG_INUMS
@@ -531,7 +531,7 @@ xfs_dir2_block_getdents(
531 */ 531 */
532 if (!p.done) { 532 if (!p.done) {
533 uio->uio_offset = 533 uio->uio_offset =
534 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 534 xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
535 (char *)dep - (char *)block); 535 (char *)dep - (char *)block);
536 xfs_da_brelse(tp, bp); 536 xfs_da_brelse(tp, bp);
537 return error; 537 return error;
@@ -545,7 +545,7 @@ xfs_dir2_block_getdents(
545 *eofp = 1; 545 *eofp = 1;
546 546
547 uio->uio_offset = 547 uio->uio_offset =
548 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); 548 xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0);
549 549
550 xfs_da_brelse(tp, bp); 550 xfs_da_brelse(tp, bp);
551 551
@@ -569,8 +569,8 @@ xfs_dir2_block_log_leaf(
569 569
570 mp = tp->t_mountp; 570 mp = tp->t_mountp;
571 block = bp->data; 571 block = bp->data;
572 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 572 btp = xfs_dir2_block_tail_p(mp, block);
573 blp = XFS_DIR2_BLOCK_LEAF_P(btp); 573 blp = xfs_dir2_block_leaf_p(btp);
574 xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block), 574 xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block),
575 (uint)((char *)&blp[last + 1] - (char *)block - 1)); 575 (uint)((char *)&blp[last + 1] - (char *)block - 1));
576} 576}
@@ -589,7 +589,7 @@ xfs_dir2_block_log_tail(
589 589
590 mp = tp->t_mountp; 590 mp = tp->t_mountp;
591 block = bp->data; 591 block = bp->data;
592 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 592 btp = xfs_dir2_block_tail_p(mp, block);
593 xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block), 593 xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block),
594 (uint)((char *)(btp + 1) - (char *)block - 1)); 594 (uint)((char *)(btp + 1) - (char *)block - 1));
595} 595}
@@ -623,13 +623,13 @@ xfs_dir2_block_lookup(
623 mp = dp->i_mount; 623 mp = dp->i_mount;
624 block = bp->data; 624 block = bp->data;
625 xfs_dir2_data_check(dp, bp); 625 xfs_dir2_data_check(dp, bp);
626 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 626 btp = xfs_dir2_block_tail_p(mp, block);
627 blp = XFS_DIR2_BLOCK_LEAF_P(btp); 627 blp = xfs_dir2_block_leaf_p(btp);
628 /* 628 /*
629 * Get the offset from the leaf entry, to point to the data. 629 * Get the offset from the leaf entry, to point to the data.
630 */ 630 */
631 dep = (xfs_dir2_data_entry_t *) 631 dep = (xfs_dir2_data_entry_t *)
632 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); 632 ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
633 /* 633 /*
634 * Fill in inode number, release the block. 634 * Fill in inode number, release the block.
635 */ 635 */
@@ -675,8 +675,8 @@ xfs_dir2_block_lookup_int(
675 ASSERT(bp != NULL); 675 ASSERT(bp != NULL);
676 block = bp->data; 676 block = bp->data;
677 xfs_dir2_data_check(dp, bp); 677 xfs_dir2_data_check(dp, bp);
678 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 678 btp = xfs_dir2_block_tail_p(mp, block);
679 blp = XFS_DIR2_BLOCK_LEAF_P(btp); 679 blp = xfs_dir2_block_leaf_p(btp);
680 /* 680 /*
681 * Loop doing a binary search for our hash value. 681 * Loop doing a binary search for our hash value.
682 * Find our entry, ENOENT if it's not there. 682 * Find our entry, ENOENT if it's not there.
@@ -713,7 +713,7 @@ xfs_dir2_block_lookup_int(
713 * Get pointer to the entry from the leaf. 713 * Get pointer to the entry from the leaf.
714 */ 714 */
715 dep = (xfs_dir2_data_entry_t *) 715 dep = (xfs_dir2_data_entry_t *)
716 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); 716 ((char *)block + xfs_dir2_dataptr_to_off(mp, addr));
717 /* 717 /*
718 * Compare, if it's right give back buffer & entry number. 718 * Compare, if it's right give back buffer & entry number.
719 */ 719 */
@@ -768,20 +768,20 @@ xfs_dir2_block_removename(
768 tp = args->trans; 768 tp = args->trans;
769 mp = dp->i_mount; 769 mp = dp->i_mount;
770 block = bp->data; 770 block = bp->data;
771 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 771 btp = xfs_dir2_block_tail_p(mp, block);
772 blp = XFS_DIR2_BLOCK_LEAF_P(btp); 772 blp = xfs_dir2_block_leaf_p(btp);
773 /* 773 /*
774 * Point to the data entry using the leaf entry. 774 * Point to the data entry using the leaf entry.
775 */ 775 */
776 dep = (xfs_dir2_data_entry_t *) 776 dep = (xfs_dir2_data_entry_t *)
777 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); 777 ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
778 /* 778 /*
779 * Mark the data entry's space free. 779 * Mark the data entry's space free.
780 */ 780 */
781 needlog = needscan = 0; 781 needlog = needscan = 0;
782 xfs_dir2_data_make_free(tp, bp, 782 xfs_dir2_data_make_free(tp, bp,
783 (xfs_dir2_data_aoff_t)((char *)dep - (char *)block), 783 (xfs_dir2_data_aoff_t)((char *)dep - (char *)block),
784 XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); 784 xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
785 /* 785 /*
786 * Fix up the block tail. 786 * Fix up the block tail.
787 */ 787 */
@@ -843,13 +843,13 @@ xfs_dir2_block_replace(
843 dp = args->dp; 843 dp = args->dp;
844 mp = dp->i_mount; 844 mp = dp->i_mount;
845 block = bp->data; 845 block = bp->data;
846 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 846 btp = xfs_dir2_block_tail_p(mp, block);
847 blp = XFS_DIR2_BLOCK_LEAF_P(btp); 847 blp = xfs_dir2_block_leaf_p(btp);
848 /* 848 /*
849 * Point to the data entry we need to change. 849 * Point to the data entry we need to change.
850 */ 850 */
851 dep = (xfs_dir2_data_entry_t *) 851 dep = (xfs_dir2_data_entry_t *)
852 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); 852 ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
853 ASSERT(be64_to_cpu(dep->inumber) != args->inumber); 853 ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
854 /* 854 /*
855 * Change the inode number to the new value. 855 * Change the inode number to the new value.
@@ -912,7 +912,7 @@ xfs_dir2_leaf_to_block(
912 mp = dp->i_mount; 912 mp = dp->i_mount;
913 leaf = lbp->data; 913 leaf = lbp->data;
914 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); 914 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC);
915 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 915 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
916 /* 916 /*
917 * If there are data blocks other than the first one, take this 917 * If there are data blocks other than the first one, take this
918 * opportunity to remove trailing empty data blocks that may have 918 * opportunity to remove trailing empty data blocks that may have
@@ -920,7 +920,7 @@ xfs_dir2_leaf_to_block(
920 * These will show up in the leaf bests table. 920 * These will show up in the leaf bests table.
921 */ 921 */
922 while (dp->i_d.di_size > mp->m_dirblksize) { 922 while (dp->i_d.di_size > mp->m_dirblksize) {
923 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); 923 bestsp = xfs_dir2_leaf_bests_p(ltp);
924 if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == 924 if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
925 mp->m_dirblksize - (uint)sizeof(block->hdr)) { 925 mp->m_dirblksize - (uint)sizeof(block->hdr)) {
926 if ((error = 926 if ((error =
@@ -974,14 +974,14 @@ xfs_dir2_leaf_to_block(
974 /* 974 /*
975 * Initialize the block tail. 975 * Initialize the block tail.
976 */ 976 */
977 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 977 btp = xfs_dir2_block_tail_p(mp, block);
978 btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); 978 btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale));
979 btp->stale = 0; 979 btp->stale = 0;
980 xfs_dir2_block_log_tail(tp, dbp); 980 xfs_dir2_block_log_tail(tp, dbp);
981 /* 981 /*
982 * Initialize the block leaf area. We compact out stale entries. 982 * Initialize the block leaf area. We compact out stale entries.
983 */ 983 */
984 lep = XFS_DIR2_BLOCK_LEAF_P(btp); 984 lep = xfs_dir2_block_leaf_p(btp);
985 for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { 985 for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) {
986 if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) 986 if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR)
987 continue; 987 continue;
@@ -1067,7 +1067,7 @@ xfs_dir2_sf_to_block(
1067 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 1067 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
1068 ASSERT(dp->i_df.if_u1.if_data != NULL); 1068 ASSERT(dp->i_df.if_u1.if_data != NULL);
1069 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; 1069 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1070 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); 1070 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
1071 /* 1071 /*
1072 * Copy the directory into the stack buffer. 1072 * Copy the directory into the stack buffer.
1073 * Then pitch the incore inode data so we can make extents. 1073 * Then pitch the incore inode data so we can make extents.
@@ -1119,10 +1119,10 @@ xfs_dir2_sf_to_block(
1119 /* 1119 /*
1120 * Fill in the tail. 1120 * Fill in the tail.
1121 */ 1121 */
1122 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 1122 btp = xfs_dir2_block_tail_p(mp, block);
1123 btp->count = cpu_to_be32(sfp->hdr.count + 2); /* ., .. */ 1123 btp->count = cpu_to_be32(sfp->hdr.count + 2); /* ., .. */
1124 btp->stale = 0; 1124 btp->stale = 0;
1125 blp = XFS_DIR2_BLOCK_LEAF_P(btp); 1125 blp = xfs_dir2_block_leaf_p(btp);
1126 endoffset = (uint)((char *)blp - (char *)block); 1126 endoffset = (uint)((char *)blp - (char *)block);
1127 /* 1127 /*
1128 * Remove the freespace, we'll manage it. 1128 * Remove the freespace, we'll manage it.
@@ -1138,25 +1138,25 @@ xfs_dir2_sf_to_block(
1138 dep->inumber = cpu_to_be64(dp->i_ino); 1138 dep->inumber = cpu_to_be64(dp->i_ino);
1139 dep->namelen = 1; 1139 dep->namelen = 1;
1140 dep->name[0] = '.'; 1140 dep->name[0] = '.';
1141 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); 1141 tagp = xfs_dir2_data_entry_tag_p(dep);
1142 *tagp = cpu_to_be16((char *)dep - (char *)block); 1142 *tagp = cpu_to_be16((char *)dep - (char *)block);
1143 xfs_dir2_data_log_entry(tp, bp, dep); 1143 xfs_dir2_data_log_entry(tp, bp, dep);
1144 blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); 1144 blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
1145 blp[0].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, 1145 blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
1146 (char *)dep - (char *)block)); 1146 (char *)dep - (char *)block));
1147 /* 1147 /*
1148 * Create entry for .. 1148 * Create entry for ..
1149 */ 1149 */
1150 dep = (xfs_dir2_data_entry_t *) 1150 dep = (xfs_dir2_data_entry_t *)
1151 ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET); 1151 ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
1152 dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); 1152 dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent));
1153 dep->namelen = 2; 1153 dep->namelen = 2;
1154 dep->name[0] = dep->name[1] = '.'; 1154 dep->name[0] = dep->name[1] = '.';
1155 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); 1155 tagp = xfs_dir2_data_entry_tag_p(dep);
1156 *tagp = cpu_to_be16((char *)dep - (char *)block); 1156 *tagp = cpu_to_be16((char *)dep - (char *)block);
1157 xfs_dir2_data_log_entry(tp, bp, dep); 1157 xfs_dir2_data_log_entry(tp, bp, dep);
1158 blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); 1158 blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
1159 blp[1].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, 1159 blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
1160 (char *)dep - (char *)block)); 1160 (char *)dep - (char *)block));
1161 offset = XFS_DIR2_DATA_FIRST_OFFSET; 1161 offset = XFS_DIR2_DATA_FIRST_OFFSET;
1162 /* 1162 /*
@@ -1165,7 +1165,7 @@ xfs_dir2_sf_to_block(
1165 if ((i = 0) == sfp->hdr.count) 1165 if ((i = 0) == sfp->hdr.count)
1166 sfep = NULL; 1166 sfep = NULL;
1167 else 1167 else
1168 sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); 1168 sfep = xfs_dir2_sf_firstentry(sfp);
1169 /* 1169 /*
1170 * Need to preserve the existing offset values in the sf directory. 1170 * Need to preserve the existing offset values in the sf directory.
1171 * Insert holes (unused entries) where necessary. 1171 * Insert holes (unused entries) where necessary.
@@ -1177,7 +1177,7 @@ xfs_dir2_sf_to_block(
1177 if (sfep == NULL) 1177 if (sfep == NULL)
1178 newoffset = endoffset; 1178 newoffset = endoffset;
1179 else 1179 else
1180 newoffset = XFS_DIR2_SF_GET_OFFSET(sfep); 1180 newoffset = xfs_dir2_sf_get_offset(sfep);
1181 /* 1181 /*
1182 * There should be a hole here, make one. 1182 * There should be a hole here, make one.
1183 */ 1183 */
@@ -1186,7 +1186,7 @@ xfs_dir2_sf_to_block(
1186 ((char *)block + offset); 1186 ((char *)block + offset);
1187 dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); 1187 dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
1188 dup->length = cpu_to_be16(newoffset - offset); 1188 dup->length = cpu_to_be16(newoffset - offset);
1189 *XFS_DIR2_DATA_UNUSED_TAG_P(dup) = cpu_to_be16( 1189 *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
1190 ((char *)dup - (char *)block)); 1190 ((char *)dup - (char *)block));
1191 xfs_dir2_data_log_unused(tp, bp, dup); 1191 xfs_dir2_data_log_unused(tp, bp, dup);
1192 (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block, 1192 (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block,
@@ -1198,22 +1198,22 @@ xfs_dir2_sf_to_block(
1198 * Copy a real entry. 1198 * Copy a real entry.
1199 */ 1199 */
1200 dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset); 1200 dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
1201 dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, 1201 dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp,
1202 XFS_DIR2_SF_INUMBERP(sfep))); 1202 xfs_dir2_sf_inumberp(sfep)));
1203 dep->namelen = sfep->namelen; 1203 dep->namelen = sfep->namelen;
1204 memcpy(dep->name, sfep->name, dep->namelen); 1204 memcpy(dep->name, sfep->name, dep->namelen);
1205 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); 1205 tagp = xfs_dir2_data_entry_tag_p(dep);
1206 *tagp = cpu_to_be16((char *)dep - (char *)block); 1206 *tagp = cpu_to_be16((char *)dep - (char *)block);
1207 xfs_dir2_data_log_entry(tp, bp, dep); 1207 xfs_dir2_data_log_entry(tp, bp, dep);
1208 blp[2 + i].hashval = cpu_to_be32(xfs_da_hashname( 1208 blp[2 + i].hashval = cpu_to_be32(xfs_da_hashname(
1209 (char *)sfep->name, sfep->namelen)); 1209 (char *)sfep->name, sfep->namelen));
1210 blp[2 + i].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, 1210 blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
1211 (char *)dep - (char *)block)); 1211 (char *)dep - (char *)block));
1212 offset = (int)((char *)(tagp + 1) - (char *)block); 1212 offset = (int)((char *)(tagp + 1) - (char *)block);
1213 if (++i == sfp->hdr.count) 1213 if (++i == sfp->hdr.count)
1214 sfep = NULL; 1214 sfep = NULL;
1215 else 1215 else
1216 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); 1216 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
1217 } 1217 }
1218 /* Done with the temporary buffer */ 1218 /* Done with the temporary buffer */
1219 kmem_free(buf, buf_len); 1219 kmem_free(buf, buf_len);
diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h
index 6722effd0b..e7c2606161 100644
--- a/fs/xfs/xfs_dir2_block.h
+++ b/fs/xfs/xfs_dir2_block.h
@@ -60,7 +60,6 @@ typedef struct xfs_dir2_block {
60/* 60/*
61 * Pointer to the leaf header embedded in a data block (1-block format) 61 * Pointer to the leaf header embedded in a data block (1-block format)
62 */ 62 */
63#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block)
64static inline xfs_dir2_block_tail_t * 63static inline xfs_dir2_block_tail_t *
65xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block) 64xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block)
66{ 65{
@@ -71,7 +70,6 @@ xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block)
71/* 70/*
72 * Pointer to the leaf entries embedded in a data block (1-block format) 71 * Pointer to the leaf entries embedded in a data block (1-block format)
73 */ 72 */
74#define XFS_DIR2_BLOCK_LEAF_P(btp) xfs_dir2_block_leaf_p(btp)
75static inline struct xfs_dir2_leaf_entry * 73static inline struct xfs_dir2_leaf_entry *
76xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp) 74xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp)
77{ 75{
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index c211c37ef6..7ebe295bd6 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -72,8 +72,8 @@ xfs_dir2_data_check(
72 bf = d->hdr.bestfree; 72 bf = d->hdr.bestfree;
73 p = (char *)d->u; 73 p = (char *)d->u;
74 if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { 74 if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) {
75 btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); 75 btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d);
76 lep = XFS_DIR2_BLOCK_LEAF_P(btp); 76 lep = xfs_dir2_block_leaf_p(btp);
77 endp = (char *)lep; 77 endp = (char *)lep;
78 } else 78 } else
79 endp = (char *)d + mp->m_dirblksize; 79 endp = (char *)d + mp->m_dirblksize;
@@ -107,7 +107,7 @@ xfs_dir2_data_check(
107 */ 107 */
108 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 108 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
109 ASSERT(lastfree == 0); 109 ASSERT(lastfree == 0);
110 ASSERT(be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup)) == 110 ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
111 (char *)dup - (char *)d); 111 (char *)dup - (char *)d);
112 dfp = xfs_dir2_data_freefind(d, dup); 112 dfp = xfs_dir2_data_freefind(d, dup);
113 if (dfp) { 113 if (dfp) {
@@ -131,12 +131,12 @@ xfs_dir2_data_check(
131 dep = (xfs_dir2_data_entry_t *)p; 131 dep = (xfs_dir2_data_entry_t *)p;
132 ASSERT(dep->namelen != 0); 132 ASSERT(dep->namelen != 0);
133 ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); 133 ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0);
134 ASSERT(be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep)) == 134 ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
135 (char *)dep - (char *)d); 135 (char *)dep - (char *)d);
136 count++; 136 count++;
137 lastfree = 0; 137 lastfree = 0;
138 if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { 138 if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) {
139 addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 139 addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
140 (xfs_dir2_data_aoff_t) 140 (xfs_dir2_data_aoff_t)
141 ((char *)dep - (char *)d)); 141 ((char *)dep - (char *)d));
142 hash = xfs_da_hashname((char *)dep->name, dep->namelen); 142 hash = xfs_da_hashname((char *)dep->name, dep->namelen);
@@ -147,7 +147,7 @@ xfs_dir2_data_check(
147 } 147 }
148 ASSERT(i < be32_to_cpu(btp->count)); 148 ASSERT(i < be32_to_cpu(btp->count));
149 } 149 }
150 p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); 150 p += xfs_dir2_data_entsize(dep->namelen);
151 } 151 }
152 /* 152 /*
153 * Need to have seen all the entries and all the bestfree slots. 153 * Need to have seen all the entries and all the bestfree slots.
@@ -346,8 +346,8 @@ xfs_dir2_data_freescan(
346 */ 346 */
347 p = (char *)d->u; 347 p = (char *)d->u;
348 if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { 348 if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) {
349 btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); 349 btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d);
350 endp = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); 350 endp = (char *)xfs_dir2_block_leaf_p(btp);
351 } else 351 } else
352 endp = (char *)d + mp->m_dirblksize; 352 endp = (char *)d + mp->m_dirblksize;
353 /* 353 /*
@@ -360,7 +360,7 @@ xfs_dir2_data_freescan(
360 */ 360 */
361 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 361 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
362 ASSERT((char *)dup - (char *)d == 362 ASSERT((char *)dup - (char *)d ==
363 be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup))); 363 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
364 xfs_dir2_data_freeinsert(d, dup, loghead); 364 xfs_dir2_data_freeinsert(d, dup, loghead);
365 p += be16_to_cpu(dup->length); 365 p += be16_to_cpu(dup->length);
366 } 366 }
@@ -370,8 +370,8 @@ xfs_dir2_data_freescan(
370 else { 370 else {
371 dep = (xfs_dir2_data_entry_t *)p; 371 dep = (xfs_dir2_data_entry_t *)p;
372 ASSERT((char *)dep - (char *)d == 372 ASSERT((char *)dep - (char *)d ==
373 be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep))); 373 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)));
374 p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); 374 p += xfs_dir2_data_entsize(dep->namelen);
375 } 375 }
376 } 376 }
377} 377}
@@ -402,7 +402,7 @@ xfs_dir2_data_init(
402 /* 402 /*
403 * Get the buffer set up for the block. 403 * Get the buffer set up for the block.
404 */ 404 */
405 error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp, 405 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
406 XFS_DATA_FORK); 406 XFS_DATA_FORK);
407 if (error) { 407 if (error) {
408 return error; 408 return error;
@@ -427,7 +427,7 @@ xfs_dir2_data_init(
427 t=mp->m_dirblksize - (uint)sizeof(d->hdr); 427 t=mp->m_dirblksize - (uint)sizeof(d->hdr);
428 d->hdr.bestfree[0].length = cpu_to_be16(t); 428 d->hdr.bestfree[0].length = cpu_to_be16(t);
429 dup->length = cpu_to_be16(t); 429 dup->length = cpu_to_be16(t);
430 *XFS_DIR2_DATA_UNUSED_TAG_P(dup) = cpu_to_be16((char *)dup - (char *)d); 430 *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)d);
431 /* 431 /*
432 * Log it and return it. 432 * Log it and return it.
433 */ 433 */
@@ -452,7 +452,7 @@ xfs_dir2_data_log_entry(
452 ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || 452 ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
453 be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); 453 be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
454 xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d), 454 xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d),
455 (uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) - 455 (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) -
456 (char *)d - 1)); 456 (char *)d - 1));
457} 457}
458 458
@@ -497,8 +497,8 @@ xfs_dir2_data_log_unused(
497 * Log the end (tag) of the unused entry. 497 * Log the end (tag) of the unused entry.
498 */ 498 */
499 xfs_da_log_buf(tp, bp, 499 xfs_da_log_buf(tp, bp,
500 (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d), 500 (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d),
501 (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d + 501 (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d +
502 sizeof(xfs_dir2_data_off_t) - 1)); 502 sizeof(xfs_dir2_data_off_t) - 1));
503} 503}
504 504
@@ -535,8 +535,8 @@ xfs_dir2_data_make_free(
535 xfs_dir2_block_tail_t *btp; /* block tail */ 535 xfs_dir2_block_tail_t *btp; /* block tail */
536 536
537 ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); 537 ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
538 btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); 538 btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d);
539 endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); 539 endptr = (char *)xfs_dir2_block_leaf_p(btp);
540 } 540 }
541 /* 541 /*
542 * If this isn't the start of the block, then back up to 542 * If this isn't the start of the block, then back up to
@@ -587,7 +587,7 @@ xfs_dir2_data_make_free(
587 * Fix up the new big freespace. 587 * Fix up the new big freespace.
588 */ 588 */
589 be16_add(&prevdup->length, len + be16_to_cpu(postdup->length)); 589 be16_add(&prevdup->length, len + be16_to_cpu(postdup->length));
590 *XFS_DIR2_DATA_UNUSED_TAG_P(prevdup) = 590 *xfs_dir2_data_unused_tag_p(prevdup) =
591 cpu_to_be16((char *)prevdup - (char *)d); 591 cpu_to_be16((char *)prevdup - (char *)d);
592 xfs_dir2_data_log_unused(tp, bp, prevdup); 592 xfs_dir2_data_log_unused(tp, bp, prevdup);
593 if (!needscan) { 593 if (!needscan) {
@@ -621,7 +621,7 @@ xfs_dir2_data_make_free(
621 else if (prevdup) { 621 else if (prevdup) {
622 dfp = xfs_dir2_data_freefind(d, prevdup); 622 dfp = xfs_dir2_data_freefind(d, prevdup);
623 be16_add(&prevdup->length, len); 623 be16_add(&prevdup->length, len);
624 *XFS_DIR2_DATA_UNUSED_TAG_P(prevdup) = 624 *xfs_dir2_data_unused_tag_p(prevdup) =
625 cpu_to_be16((char *)prevdup - (char *)d); 625 cpu_to_be16((char *)prevdup - (char *)d);
626 xfs_dir2_data_log_unused(tp, bp, prevdup); 626 xfs_dir2_data_log_unused(tp, bp, prevdup);
627 /* 627 /*
@@ -649,7 +649,7 @@ xfs_dir2_data_make_free(
649 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); 649 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
650 newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); 650 newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
651 newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); 651 newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
652 *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = 652 *xfs_dir2_data_unused_tag_p(newdup) =
653 cpu_to_be16((char *)newdup - (char *)d); 653 cpu_to_be16((char *)newdup - (char *)d);
654 xfs_dir2_data_log_unused(tp, bp, newdup); 654 xfs_dir2_data_log_unused(tp, bp, newdup);
655 /* 655 /*
@@ -676,7 +676,7 @@ xfs_dir2_data_make_free(
676 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); 676 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
677 newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); 677 newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
678 newdup->length = cpu_to_be16(len); 678 newdup->length = cpu_to_be16(len);
679 *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = 679 *xfs_dir2_data_unused_tag_p(newdup) =
680 cpu_to_be16((char *)newdup - (char *)d); 680 cpu_to_be16((char *)newdup - (char *)d);
681 xfs_dir2_data_log_unused(tp, bp, newdup); 681 xfs_dir2_data_log_unused(tp, bp, newdup);
682 (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); 682 (void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
@@ -712,7 +712,7 @@ xfs_dir2_data_use_free(
712 ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); 712 ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
713 ASSERT(offset >= (char *)dup - (char *)d); 713 ASSERT(offset >= (char *)dup - (char *)d);
714 ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)d); 714 ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)d);
715 ASSERT((char *)dup - (char *)d == be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup))); 715 ASSERT((char *)dup - (char *)d == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
716 /* 716 /*
717 * Look up the entry in the bestfree table. 717 * Look up the entry in the bestfree table.
718 */ 718 */
@@ -745,7 +745,7 @@ xfs_dir2_data_use_free(
745 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len); 745 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
746 newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); 746 newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
747 newdup->length = cpu_to_be16(oldlen - len); 747 newdup->length = cpu_to_be16(oldlen - len);
748 *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = 748 *xfs_dir2_data_unused_tag_p(newdup) =
749 cpu_to_be16((char *)newdup - (char *)d); 749 cpu_to_be16((char *)newdup - (char *)d);
750 xfs_dir2_data_log_unused(tp, bp, newdup); 750 xfs_dir2_data_log_unused(tp, bp, newdup);
751 /* 751 /*
@@ -772,7 +772,7 @@ xfs_dir2_data_use_free(
772 else if (matchback) { 772 else if (matchback) {
773 newdup = dup; 773 newdup = dup;
774 newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); 774 newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup);
775 *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = 775 *xfs_dir2_data_unused_tag_p(newdup) =
776 cpu_to_be16((char *)newdup - (char *)d); 776 cpu_to_be16((char *)newdup - (char *)d);
777 xfs_dir2_data_log_unused(tp, bp, newdup); 777 xfs_dir2_data_log_unused(tp, bp, newdup);
778 /* 778 /*
@@ -799,13 +799,13 @@ xfs_dir2_data_use_free(
799 else { 799 else {
800 newdup = dup; 800 newdup = dup;
801 newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); 801 newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup);
802 *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = 802 *xfs_dir2_data_unused_tag_p(newdup) =
803 cpu_to_be16((char *)newdup - (char *)d); 803 cpu_to_be16((char *)newdup - (char *)d);
804 xfs_dir2_data_log_unused(tp, bp, newdup); 804 xfs_dir2_data_log_unused(tp, bp, newdup);
805 newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len); 805 newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
806 newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); 806 newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
807 newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); 807 newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
808 *XFS_DIR2_DATA_UNUSED_TAG_P(newdup2) = 808 *xfs_dir2_data_unused_tag_p(newdup2) =
809 cpu_to_be16((char *)newdup2 - (char *)d); 809 cpu_to_be16((char *)newdup2 - (char *)d);
810 xfs_dir2_data_log_unused(tp, bp, newdup2); 810 xfs_dir2_data_log_unused(tp, bp, newdup2);
811 /* 811 /*
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index c94c9099cf..b816e02527 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -44,7 +44,7 @@ struct xfs_trans;
44#define XFS_DIR2_DATA_SPACE 0 44#define XFS_DIR2_DATA_SPACE 0
45#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) 45#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
46#define XFS_DIR2_DATA_FIRSTDB(mp) \ 46#define XFS_DIR2_DATA_FIRSTDB(mp) \
47 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET) 47 xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET)
48 48
49/* 49/*
50 * Offsets of . and .. in data space (always block 0) 50 * Offsets of . and .. in data space (always block 0)
@@ -52,9 +52,9 @@ struct xfs_trans;
52#define XFS_DIR2_DATA_DOT_OFFSET \ 52#define XFS_DIR2_DATA_DOT_OFFSET \
53 ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t)) 53 ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t))
54#define XFS_DIR2_DATA_DOTDOT_OFFSET \ 54#define XFS_DIR2_DATA_DOTDOT_OFFSET \
55 (XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1)) 55 (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1))
56#define XFS_DIR2_DATA_FIRST_OFFSET \ 56#define XFS_DIR2_DATA_FIRST_OFFSET \
57 (XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2)) 57 (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2))
58 58
59/* 59/*
60 * Structures. 60 * Structures.
@@ -123,7 +123,6 @@ typedef struct xfs_dir2_data {
123/* 123/*
124 * Size of a data entry. 124 * Size of a data entry.
125 */ 125 */
126#define XFS_DIR2_DATA_ENTSIZE(n) xfs_dir2_data_entsize(n)
127static inline int xfs_dir2_data_entsize(int n) 126static inline int xfs_dir2_data_entsize(int n)
128{ 127{
129 return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \ 128 return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \
@@ -133,19 +132,16 @@ static inline int xfs_dir2_data_entsize(int n)
133/* 132/*
134 * Pointer to an entry's tag word. 133 * Pointer to an entry's tag word.
135 */ 134 */
136#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) xfs_dir2_data_entry_tag_p(dep)
137static inline __be16 * 135static inline __be16 *
138xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep) 136xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep)
139{ 137{
140 return (__be16 *)((char *)dep + 138 return (__be16 *)((char *)dep +
141 XFS_DIR2_DATA_ENTSIZE(dep->namelen) - sizeof(__be16)); 139 xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
142} 140}
143 141
144/* 142/*
145 * Pointer to a freespace's tag word. 143 * Pointer to a freespace's tag word.
146 */ 144 */
147#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \
148 xfs_dir2_data_unused_tag_p(dup)
149static inline __be16 * 145static inline __be16 *
150xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup) 146xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup)
151{ 147{
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index db14ea7145..1b73c9ad64 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -92,7 +92,7 @@ xfs_dir2_block_to_leaf(
92 if ((error = xfs_da_grow_inode(args, &blkno))) { 92 if ((error = xfs_da_grow_inode(args, &blkno))) {
93 return error; 93 return error;
94 } 94 }
95 ldb = XFS_DIR2_DA_TO_DB(mp, blkno); 95 ldb = xfs_dir2_da_to_db(mp, blkno);
96 ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); 96 ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
97 /* 97 /*
98 * Initialize the leaf block, get a buffer for it. 98 * Initialize the leaf block, get a buffer for it.
@@ -104,8 +104,8 @@ xfs_dir2_block_to_leaf(
104 leaf = lbp->data; 104 leaf = lbp->data;
105 block = dbp->data; 105 block = dbp->data;
106 xfs_dir2_data_check(dp, dbp); 106 xfs_dir2_data_check(dp, dbp);
107 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 107 btp = xfs_dir2_block_tail_p(mp, block);
108 blp = XFS_DIR2_BLOCK_LEAF_P(btp); 108 blp = xfs_dir2_block_leaf_p(btp);
109 /* 109 /*
110 * Set the counts in the leaf header. 110 * Set the counts in the leaf header.
111 */ 111 */
@@ -137,9 +137,9 @@ xfs_dir2_block_to_leaf(
137 /* 137 /*
138 * Set up leaf tail and bests table. 138 * Set up leaf tail and bests table.
139 */ 139 */
140 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 140 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
141 ltp->bestcount = cpu_to_be32(1); 141 ltp->bestcount = cpu_to_be32(1);
142 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); 142 bestsp = xfs_dir2_leaf_bests_p(ltp);
143 bestsp[0] = block->hdr.bestfree[0].length; 143 bestsp[0] = block->hdr.bestfree[0].length;
144 /* 144 /*
145 * Log the data header and leaf bests table. 145 * Log the data header and leaf bests table.
@@ -209,9 +209,9 @@ xfs_dir2_leaf_addname(
209 */ 209 */
210 index = xfs_dir2_leaf_search_hash(args, lbp); 210 index = xfs_dir2_leaf_search_hash(args, lbp);
211 leaf = lbp->data; 211 leaf = lbp->data;
212 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 212 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
213 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); 213 bestsp = xfs_dir2_leaf_bests_p(ltp);
214 length = XFS_DIR2_DATA_ENTSIZE(args->namelen); 214 length = xfs_dir2_data_entsize(args->namelen);
215 /* 215 /*
216 * See if there are any entries with the same hash value 216 * See if there are any entries with the same hash value
217 * and space in their block for the new entry. 217 * and space in their block for the new entry.
@@ -223,7 +223,7 @@ xfs_dir2_leaf_addname(
223 index++, lep++) { 223 index++, lep++) {
224 if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) 224 if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
225 continue; 225 continue;
226 i = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); 226 i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
227 ASSERT(i < be32_to_cpu(ltp->bestcount)); 227 ASSERT(i < be32_to_cpu(ltp->bestcount));
228 ASSERT(be16_to_cpu(bestsp[i]) != NULLDATAOFF); 228 ASSERT(be16_to_cpu(bestsp[i]) != NULLDATAOFF);
229 if (be16_to_cpu(bestsp[i]) >= length) { 229 if (be16_to_cpu(bestsp[i]) >= length) {
@@ -378,7 +378,7 @@ xfs_dir2_leaf_addname(
378 */ 378 */
379 else { 379 else {
380 if ((error = 380 if ((error =
381 xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block), 381 xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
382 -1, &dbp, XFS_DATA_FORK))) { 382 -1, &dbp, XFS_DATA_FORK))) {
383 xfs_da_brelse(tp, lbp); 383 xfs_da_brelse(tp, lbp);
384 return error; 384 return error;
@@ -407,7 +407,7 @@ xfs_dir2_leaf_addname(
407 dep->inumber = cpu_to_be64(args->inumber); 407 dep->inumber = cpu_to_be64(args->inumber);
408 dep->namelen = args->namelen; 408 dep->namelen = args->namelen;
409 memcpy(dep->name, args->name, dep->namelen); 409 memcpy(dep->name, args->name, dep->namelen);
410 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); 410 tagp = xfs_dir2_data_entry_tag_p(dep);
411 *tagp = cpu_to_be16((char *)dep - (char *)data); 411 *tagp = cpu_to_be16((char *)dep - (char *)data);
412 /* 412 /*
413 * Need to scan fix up the bestfree table. 413 * Need to scan fix up the bestfree table.
@@ -529,7 +529,7 @@ xfs_dir2_leaf_addname(
529 * Fill in the new leaf entry. 529 * Fill in the new leaf entry.
530 */ 530 */
531 lep->hashval = cpu_to_be32(args->hashval); 531 lep->hashval = cpu_to_be32(args->hashval);
532 lep->address = cpu_to_be32(XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, 532 lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block,
533 be16_to_cpu(*tagp))); 533 be16_to_cpu(*tagp)));
534 /* 534 /*
535 * Log the leaf fields and give up the buffers. 535 * Log the leaf fields and give up the buffers.
@@ -567,13 +567,13 @@ xfs_dir2_leaf_check(
567 * Should factor in the size of the bests table as well. 567 * Should factor in the size of the bests table as well.
568 * We can deduce a value for that from di_size. 568 * We can deduce a value for that from di_size.
569 */ 569 */
570 ASSERT(be16_to_cpu(leaf->hdr.count) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); 570 ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
571 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 571 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
572 /* 572 /*
573 * Leaves and bests don't overlap. 573 * Leaves and bests don't overlap.
574 */ 574 */
575 ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= 575 ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <=
576 (char *)XFS_DIR2_LEAF_BESTS_P(ltp)); 576 (char *)xfs_dir2_leaf_bests_p(ltp));
577 /* 577 /*
578 * Check hash value order, count stale entries. 578 * Check hash value order, count stale entries.
579 */ 579 */
@@ -815,12 +815,12 @@ xfs_dir2_leaf_getdents(
815 * Inside the loop we keep the main offset value as a byte offset 815 * Inside the loop we keep the main offset value as a byte offset
816 * in the directory file. 816 * in the directory file.
817 */ 817 */
818 curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset); 818 curoff = xfs_dir2_dataptr_to_byte(mp, uio->uio_offset);
819 /* 819 /*
820 * Force this conversion through db so we truncate the offset 820 * Force this conversion through db so we truncate the offset
821 * down to get the start of the data block. 821 * down to get the start of the data block.
822 */ 822 */
823 map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff)); 823 map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff));
824 /* 824 /*
825 * Loop over directory entries until we reach the end offset. 825 * Loop over directory entries until we reach the end offset.
826 * Get more blocks and readahead as necessary. 826 * Get more blocks and readahead as necessary.
@@ -870,7 +870,7 @@ xfs_dir2_leaf_getdents(
870 */ 870 */
871 if (1 + ra_want > map_blocks && 871 if (1 + ra_want > map_blocks &&
872 map_off < 872 map_off <
873 XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) { 873 xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
874 /* 874 /*
875 * Get more bmaps, fill in after the ones 875 * Get more bmaps, fill in after the ones
876 * we already have in the table. 876 * we already have in the table.
@@ -878,7 +878,7 @@ xfs_dir2_leaf_getdents(
878 nmap = map_size - map_valid; 878 nmap = map_size - map_valid;
879 error = xfs_bmapi(tp, dp, 879 error = xfs_bmapi(tp, dp,
880 map_off, 880 map_off,
881 XFS_DIR2_BYTE_TO_DA(mp, 881 xfs_dir2_byte_to_da(mp,
882 XFS_DIR2_LEAF_OFFSET) - map_off, 882 XFS_DIR2_LEAF_OFFSET) - map_off,
883 XFS_BMAPI_METADATA, NULL, 0, 883 XFS_BMAPI_METADATA, NULL, 0,
884 &map[map_valid], &nmap, NULL, NULL); 884 &map[map_valid], &nmap, NULL, NULL);
@@ -903,7 +903,7 @@ xfs_dir2_leaf_getdents(
903 map[map_valid + nmap - 1].br_blockcount; 903 map[map_valid + nmap - 1].br_blockcount;
904 else 904 else
905 map_off = 905 map_off =
906 XFS_DIR2_BYTE_TO_DA(mp, 906 xfs_dir2_byte_to_da(mp,
907 XFS_DIR2_LEAF_OFFSET); 907 XFS_DIR2_LEAF_OFFSET);
908 /* 908 /*
909 * Look for holes in the mapping, and 909 * Look for holes in the mapping, and
@@ -931,14 +931,14 @@ xfs_dir2_leaf_getdents(
931 * No valid mappings, so no more data blocks. 931 * No valid mappings, so no more data blocks.
932 */ 932 */
933 if (!map_valid) { 933 if (!map_valid) {
934 curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off); 934 curoff = xfs_dir2_da_to_byte(mp, map_off);
935 break; 935 break;
936 } 936 }
937 /* 937 /*
938 * Read the directory block starting at the first 938 * Read the directory block starting at the first
939 * mapping. 939 * mapping.
940 */ 940 */
941 curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff); 941 curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
942 error = xfs_da_read_buf(tp, dp, map->br_startoff, 942 error = xfs_da_read_buf(tp, dp, map->br_startoff,
943 map->br_blockcount >= mp->m_dirblkfsbs ? 943 map->br_blockcount >= mp->m_dirblkfsbs ?
944 XFS_FSB_TO_DADDR(mp, map->br_startblock) : 944 XFS_FSB_TO_DADDR(mp, map->br_startblock) :
@@ -1014,7 +1014,7 @@ xfs_dir2_leaf_getdents(
1014 /* 1014 /*
1015 * Having done a read, we need to set a new offset. 1015 * Having done a read, we need to set a new offset.
1016 */ 1016 */
1017 newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0); 1017 newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0);
1018 /* 1018 /*
1019 * Start of the current block. 1019 * Start of the current block.
1020 */ 1020 */
@@ -1024,7 +1024,7 @@ xfs_dir2_leaf_getdents(
1024 * Make sure we're in the right block. 1024 * Make sure we're in the right block.
1025 */ 1025 */
1026 else if (curoff > newoff) 1026 else if (curoff > newoff)
1027 ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) == 1027 ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
1028 curdb); 1028 curdb);
1029 data = bp->data; 1029 data = bp->data;
1030 xfs_dir2_data_check(dp, bp); 1030 xfs_dir2_data_check(dp, bp);
@@ -1032,7 +1032,7 @@ xfs_dir2_leaf_getdents(
1032 * Find our position in the block. 1032 * Find our position in the block.
1033 */ 1033 */
1034 ptr = (char *)&data->u; 1034 ptr = (char *)&data->u;
1035 byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff); 1035 byteoff = xfs_dir2_byte_to_off(mp, curoff);
1036 /* 1036 /*
1037 * Skip past the header. 1037 * Skip past the header.
1038 */ 1038 */
@@ -1054,15 +1054,15 @@ xfs_dir2_leaf_getdents(
1054 } 1054 }
1055 dep = (xfs_dir2_data_entry_t *)ptr; 1055 dep = (xfs_dir2_data_entry_t *)ptr;
1056 length = 1056 length =
1057 XFS_DIR2_DATA_ENTSIZE(dep->namelen); 1057 xfs_dir2_data_entsize(dep->namelen);
1058 ptr += length; 1058 ptr += length;
1059 } 1059 }
1060 /* 1060 /*
1061 * Now set our real offset. 1061 * Now set our real offset.
1062 */ 1062 */
1063 curoff = 1063 curoff =
1064 XFS_DIR2_DB_OFF_TO_BYTE(mp, 1064 xfs_dir2_db_off_to_byte(mp,
1065 XFS_DIR2_BYTE_TO_DB(mp, curoff), 1065 xfs_dir2_byte_to_db(mp, curoff),
1066 (char *)ptr - (char *)data); 1066 (char *)ptr - (char *)data);
1067 if (ptr >= (char *)data + mp->m_dirblksize) { 1067 if (ptr >= (char *)data + mp->m_dirblksize) {
1068 continue; 1068 continue;
@@ -1091,9 +1091,9 @@ xfs_dir2_leaf_getdents(
1091 1091
1092 p->namelen = dep->namelen; 1092 p->namelen = dep->namelen;
1093 1093
1094 length = XFS_DIR2_DATA_ENTSIZE(p->namelen); 1094 length = xfs_dir2_data_entsize(p->namelen);
1095 1095
1096 p->cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length); 1096 p->cook = xfs_dir2_byte_to_dataptr(mp, curoff + length);
1097 1097
1098 p->ino = be64_to_cpu(dep->inumber); 1098 p->ino = be64_to_cpu(dep->inumber);
1099#if XFS_BIG_INUMS 1099#if XFS_BIG_INUMS
@@ -1121,10 +1121,10 @@ xfs_dir2_leaf_getdents(
1121 * All done. Set output offset value to current offset. 1121 * All done. Set output offset value to current offset.
1122 */ 1122 */
1123 *eofp = eof; 1123 *eofp = eof;
1124 if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR)) 1124 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
1125 uio->uio_offset = XFS_DIR2_MAX_DATAPTR; 1125 uio->uio_offset = XFS_DIR2_MAX_DATAPTR;
1126 else 1126 else
1127 uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff); 1127 uio->uio_offset = xfs_dir2_byte_to_dataptr(mp, curoff);
1128 kmem_free(map, map_size * sizeof(*map)); 1128 kmem_free(map, map_size * sizeof(*map));
1129 kmem_free(p, sizeof(*p)); 1129 kmem_free(p, sizeof(*p));
1130 if (bp) 1130 if (bp)
@@ -1159,7 +1159,7 @@ xfs_dir2_leaf_init(
1159 /* 1159 /*
1160 * Get the buffer for the block. 1160 * Get the buffer for the block.
1161 */ 1161 */
1162 error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp, 1162 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
1163 XFS_DATA_FORK); 1163 XFS_DATA_FORK);
1164 if (error) { 1164 if (error) {
1165 return error; 1165 return error;
@@ -1181,7 +1181,7 @@ xfs_dir2_leaf_init(
1181 * the block. 1181 * the block.
1182 */ 1182 */
1183 if (magic == XFS_DIR2_LEAF1_MAGIC) { 1183 if (magic == XFS_DIR2_LEAF1_MAGIC) {
1184 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 1184 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1185 ltp->bestcount = 0; 1185 ltp->bestcount = 0;
1186 xfs_dir2_leaf_log_tail(tp, bp); 1186 xfs_dir2_leaf_log_tail(tp, bp);
1187 } 1187 }
@@ -1206,9 +1206,9 @@ xfs_dir2_leaf_log_bests(
1206 1206
1207 leaf = bp->data; 1207 leaf = bp->data;
1208 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); 1208 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC);
1209 ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf); 1209 ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf);
1210 firstb = XFS_DIR2_LEAF_BESTS_P(ltp) + first; 1210 firstb = xfs_dir2_leaf_bests_p(ltp) + first;
1211 lastb = XFS_DIR2_LEAF_BESTS_P(ltp) + last; 1211 lastb = xfs_dir2_leaf_bests_p(ltp) + last;
1212 xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), 1212 xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
1213 (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); 1213 (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
1214} 1214}
@@ -1268,7 +1268,7 @@ xfs_dir2_leaf_log_tail(
1268 mp = tp->t_mountp; 1268 mp = tp->t_mountp;
1269 leaf = bp->data; 1269 leaf = bp->data;
1270 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); 1270 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC);
1271 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 1271 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1272 xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), 1272 xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
1273 (uint)(mp->m_dirblksize - 1)); 1273 (uint)(mp->m_dirblksize - 1));
1274} 1274}
@@ -1312,7 +1312,7 @@ xfs_dir2_leaf_lookup(
1312 */ 1312 */
1313 dep = (xfs_dir2_data_entry_t *) 1313 dep = (xfs_dir2_data_entry_t *)
1314 ((char *)dbp->data + 1314 ((char *)dbp->data +
1315 XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address))); 1315 xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
1316 /* 1316 /*
1317 * Return the found inode number. 1317 * Return the found inode number.
1318 */ 1318 */
@@ -1381,7 +1381,7 @@ xfs_dir2_leaf_lookup_int(
1381 /* 1381 /*
1382 * Get the new data block number. 1382 * Get the new data block number.
1383 */ 1383 */
1384 newdb = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); 1384 newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
1385 /* 1385 /*
1386 * If it's not the same as the old data block number, 1386 * If it's not the same as the old data block number,
1387 * need to pitch the old one and read the new one. 1387 * need to pitch the old one and read the new one.
@@ -1391,7 +1391,7 @@ xfs_dir2_leaf_lookup_int(
1391 xfs_da_brelse(tp, dbp); 1391 xfs_da_brelse(tp, dbp);
1392 if ((error = 1392 if ((error =
1393 xfs_da_read_buf(tp, dp, 1393 xfs_da_read_buf(tp, dp,
1394 XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp, 1394 xfs_dir2_db_to_da(mp, newdb), -1, &dbp,
1395 XFS_DATA_FORK))) { 1395 XFS_DATA_FORK))) {
1396 xfs_da_brelse(tp, lbp); 1396 xfs_da_brelse(tp, lbp);
1397 return error; 1397 return error;
@@ -1404,7 +1404,7 @@ xfs_dir2_leaf_lookup_int(
1404 */ 1404 */
1405 dep = (xfs_dir2_data_entry_t *) 1405 dep = (xfs_dir2_data_entry_t *)
1406 ((char *)dbp->data + 1406 ((char *)dbp->data +
1407 XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); 1407 xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
1408 /* 1408 /*
1409 * If it matches then return it. 1409 * If it matches then return it.
1410 */ 1410 */
@@ -1469,20 +1469,20 @@ xfs_dir2_leaf_removename(
1469 * Point to the leaf entry, use that to point to the data entry. 1469 * Point to the leaf entry, use that to point to the data entry.
1470 */ 1470 */
1471 lep = &leaf->ents[index]; 1471 lep = &leaf->ents[index];
1472 db = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); 1472 db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
1473 dep = (xfs_dir2_data_entry_t *) 1473 dep = (xfs_dir2_data_entry_t *)
1474 ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); 1474 ((char *)data + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
1475 needscan = needlog = 0; 1475 needscan = needlog = 0;
1476 oldbest = be16_to_cpu(data->hdr.bestfree[0].length); 1476 oldbest = be16_to_cpu(data->hdr.bestfree[0].length);
1477 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 1477 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1478 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); 1478 bestsp = xfs_dir2_leaf_bests_p(ltp);
1479 ASSERT(be16_to_cpu(bestsp[db]) == oldbest); 1479 ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
1480 /* 1480 /*
1481 * Mark the former data entry unused. 1481 * Mark the former data entry unused.
1482 */ 1482 */
1483 xfs_dir2_data_make_free(tp, dbp, 1483 xfs_dir2_data_make_free(tp, dbp,
1484 (xfs_dir2_data_aoff_t)((char *)dep - (char *)data), 1484 (xfs_dir2_data_aoff_t)((char *)dep - (char *)data),
1485 XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); 1485 xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
1486 /* 1486 /*
1487 * We just mark the leaf entry stale by putting a null in it. 1487 * We just mark the leaf entry stale by putting a null in it.
1488 */ 1488 */
@@ -1602,7 +1602,7 @@ xfs_dir2_leaf_replace(
1602 */ 1602 */
1603 dep = (xfs_dir2_data_entry_t *) 1603 dep = (xfs_dir2_data_entry_t *)
1604 ((char *)dbp->data + 1604 ((char *)dbp->data +
1605 XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address))); 1605 xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
1606 ASSERT(args->inumber != be64_to_cpu(dep->inumber)); 1606 ASSERT(args->inumber != be64_to_cpu(dep->inumber));
1607 /* 1607 /*
1608 * Put the new inode number in, log it. 1608 * Put the new inode number in, log it.
@@ -1698,7 +1698,7 @@ xfs_dir2_leaf_trim_data(
1698 /* 1698 /*
1699 * Read the offending data block. We need its buffer. 1699 * Read the offending data block. We need its buffer.
1700 */ 1700 */
1701 if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp, 1701 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp,
1702 XFS_DATA_FORK))) { 1702 XFS_DATA_FORK))) {
1703 return error; 1703 return error;
1704 } 1704 }
@@ -1712,7 +1712,7 @@ xfs_dir2_leaf_trim_data(
1712 */ 1712 */
1713 1713
1714 leaf = lbp->data; 1714 leaf = lbp->data;
1715 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 1715 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1716 ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) == 1716 ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) ==
1717 mp->m_dirblksize - (uint)sizeof(data->hdr)); 1717 mp->m_dirblksize - (uint)sizeof(data->hdr));
1718 ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); 1718 ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
@@ -1727,7 +1727,7 @@ xfs_dir2_leaf_trim_data(
1727 /* 1727 /*
1728 * Eliminate the last bests entry from the table. 1728 * Eliminate the last bests entry from the table.
1729 */ 1729 */
1730 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); 1730 bestsp = xfs_dir2_leaf_bests_p(ltp);
1731 be32_add(&ltp->bestcount, -1); 1731 be32_add(&ltp->bestcount, -1);
1732 memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); 1732 memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
1733 xfs_dir2_leaf_log_tail(tp, lbp); 1733 xfs_dir2_leaf_log_tail(tp, lbp);
@@ -1838,12 +1838,12 @@ xfs_dir2_node_to_leaf(
1838 /* 1838 /*
1839 * Set up the leaf tail from the freespace block. 1839 * Set up the leaf tail from the freespace block.
1840 */ 1840 */
1841 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 1841 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1842 ltp->bestcount = free->hdr.nvalid; 1842 ltp->bestcount = free->hdr.nvalid;
1843 /* 1843 /*
1844 * Set up the leaf bests table. 1844 * Set up the leaf bests table.
1845 */ 1845 */
1846 memcpy(XFS_DIR2_LEAF_BESTS_P(ltp), free->bests, 1846 memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests,
1847 be32_to_cpu(ltp->bestcount) * sizeof(leaf->bests[0])); 1847 be32_to_cpu(ltp->bestcount) * sizeof(leaf->bests[0]));
1848 xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); 1848 xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
1849 xfs_dir2_leaf_log_tail(tp, lbp); 1849 xfs_dir2_leaf_log_tail(tp, lbp);
diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h
index f57ca11624..70c97f3f81 100644
--- a/fs/xfs/xfs_dir2_leaf.h
+++ b/fs/xfs/xfs_dir2_leaf.h
@@ -32,7 +32,7 @@ struct xfs_trans;
32#define XFS_DIR2_LEAF_SPACE 1 32#define XFS_DIR2_LEAF_SPACE 1
33#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) 33#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
34#define XFS_DIR2_LEAF_FIRSTDB(mp) \ 34#define XFS_DIR2_LEAF_FIRSTDB(mp) \
35 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET) 35 xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET)
36 36
37/* 37/*
38 * Offset in data space of a data entry. 38 * Offset in data space of a data entry.
@@ -82,7 +82,6 @@ typedef struct xfs_dir2_leaf {
82 * DB blocks here are logical directory block numbers, not filesystem blocks. 82 * DB blocks here are logical directory block numbers, not filesystem blocks.
83 */ 83 */
84 84
85#define XFS_DIR2_MAX_LEAF_ENTS(mp) xfs_dir2_max_leaf_ents(mp)
86static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) 85static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
87{ 86{
88 return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / 87 return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) /
@@ -92,7 +91,6 @@ static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
92/* 91/*
93 * Get address of the bestcount field in the single-leaf block. 92 * Get address of the bestcount field in the single-leaf block.
94 */ 93 */
95#define XFS_DIR2_LEAF_TAIL_P(mp,lp) xfs_dir2_leaf_tail_p(mp, lp)
96static inline xfs_dir2_leaf_tail_t * 94static inline xfs_dir2_leaf_tail_t *
97xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp) 95xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp)
98{ 96{
@@ -104,7 +102,6 @@ xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp)
104/* 102/*
105 * Get address of the bests array in the single-leaf block. 103 * Get address of the bests array in the single-leaf block.
106 */ 104 */
107#define XFS_DIR2_LEAF_BESTS_P(ltp) xfs_dir2_leaf_bests_p(ltp)
108static inline __be16 * 105static inline __be16 *
109xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp) 106xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp)
110{ 107{
@@ -114,7 +111,6 @@ xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp)
114/* 111/*
115 * Convert dataptr to byte in file space 112 * Convert dataptr to byte in file space
116 */ 113 */
117#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp)
118static inline xfs_dir2_off_t 114static inline xfs_dir2_off_t
119xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) 115xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
120{ 116{
@@ -124,7 +120,6 @@ xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
124/* 120/*
125 * Convert byte in file space to dataptr. It had better be aligned. 121 * Convert byte in file space to dataptr. It had better be aligned.
126 */ 122 */
127#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by)
128static inline xfs_dir2_dataptr_t 123static inline xfs_dir2_dataptr_t
129xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) 124xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by)
130{ 125{
@@ -134,7 +129,6 @@ xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by)
134/* 129/*
135 * Convert byte in space to (DB) block 130 * Convert byte in space to (DB) block
136 */ 131 */
137#define XFS_DIR2_BYTE_TO_DB(mp,by) xfs_dir2_byte_to_db(mp, by)
138static inline xfs_dir2_db_t 132static inline xfs_dir2_db_t
139xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) 133xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by)
140{ 134{
@@ -145,17 +139,15 @@ xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by)
145/* 139/*
146 * Convert dataptr to a block number 140 * Convert dataptr to a block number
147 */ 141 */
148#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) xfs_dir2_dataptr_to_db(mp, dp)
149static inline xfs_dir2_db_t 142static inline xfs_dir2_db_t
150xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) 143xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
151{ 144{
152 return XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)); 145 return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp));
153} 146}
154 147
155/* 148/*
156 * Convert byte in space to offset in a block 149 * Convert byte in space to offset in a block
157 */ 150 */
158#define XFS_DIR2_BYTE_TO_OFF(mp,by) xfs_dir2_byte_to_off(mp, by)
159static inline xfs_dir2_data_aoff_t 151static inline xfs_dir2_data_aoff_t
160xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) 152xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by)
161{ 153{
@@ -166,18 +158,15 @@ xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by)
166/* 158/*
167 * Convert dataptr to a byte offset in a block 159 * Convert dataptr to a byte offset in a block
168 */ 160 */
169#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) xfs_dir2_dataptr_to_off(mp, dp)
170static inline xfs_dir2_data_aoff_t 161static inline xfs_dir2_data_aoff_t
171xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) 162xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
172{ 163{
173 return XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)); 164 return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp));
174} 165}
175 166
176/* 167/*
177 * Convert block and offset to byte in space 168 * Convert block and offset to byte in space
178 */ 169 */
179#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \
180 xfs_dir2_db_off_to_byte(mp, db, o)
181static inline xfs_dir2_off_t 170static inline xfs_dir2_off_t
182xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, 171xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
183 xfs_dir2_data_aoff_t o) 172 xfs_dir2_data_aoff_t o)
@@ -189,7 +178,6 @@ xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
189/* 178/*
190 * Convert block (DB) to block (dablk) 179 * Convert block (DB) to block (dablk)
191 */ 180 */
192#define XFS_DIR2_DB_TO_DA(mp,db) xfs_dir2_db_to_da(mp, db)
193static inline xfs_dablk_t 181static inline xfs_dablk_t
194xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) 182xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db)
195{ 183{
@@ -199,29 +187,25 @@ xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db)
199/* 187/*
200 * Convert byte in space to (DA) block 188 * Convert byte in space to (DA) block
201 */ 189 */
202#define XFS_DIR2_BYTE_TO_DA(mp,by) xfs_dir2_byte_to_da(mp, by)
203static inline xfs_dablk_t 190static inline xfs_dablk_t
204xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) 191xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by)
205{ 192{
206 return XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by)); 193 return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by));
207} 194}
208 195
209/* 196/*
210 * Convert block and offset to dataptr 197 * Convert block and offset to dataptr
211 */ 198 */
212#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \
213 xfs_dir2_db_off_to_dataptr(mp, db, o)
214static inline xfs_dir2_dataptr_t 199static inline xfs_dir2_dataptr_t
215xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, 200xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
216 xfs_dir2_data_aoff_t o) 201 xfs_dir2_data_aoff_t o)
217{ 202{
218 return XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o)); 203 return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o));
219} 204}
220 205
221/* 206/*
222 * Convert block (dablk) to block (DB) 207 * Convert block (dablk) to block (DB)
223 */ 208 */
224#define XFS_DIR2_DA_TO_DB(mp,da) xfs_dir2_da_to_db(mp, da)
225static inline xfs_dir2_db_t 209static inline xfs_dir2_db_t
226xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) 210xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da)
227{ 211{
@@ -231,11 +215,10 @@ xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da)
231/* 215/*
232 * Convert block (dablk) to byte offset in space 216 * Convert block (dablk) to byte offset in space
233 */ 217 */
234#define XFS_DIR2_DA_TO_BYTE(mp,da) xfs_dir2_da_to_byte(mp, da)
235static inline xfs_dir2_off_t 218static inline xfs_dir2_off_t
236xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) 219xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
237{ 220{
238 return XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0); 221 return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0);
239} 222}
240 223
241/* 224/*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index d083c38199..91c61d9632 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -136,14 +136,14 @@ xfs_dir2_leaf_to_node(
136 /* 136 /*
137 * Get the buffer for the new freespace block. 137 * Get the buffer for the new freespace block.
138 */ 138 */
139 if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp, 139 if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
140 XFS_DATA_FORK))) { 140 XFS_DATA_FORK))) {
141 return error; 141 return error;
142 } 142 }
143 ASSERT(fbp != NULL); 143 ASSERT(fbp != NULL);
144 free = fbp->data; 144 free = fbp->data;
145 leaf = lbp->data; 145 leaf = lbp->data;
146 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); 146 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
147 /* 147 /*
148 * Initialize the freespace block header. 148 * Initialize the freespace block header.
149 */ 149 */
@@ -155,7 +155,7 @@ xfs_dir2_leaf_to_node(
155 * Copy freespace entries from the leaf block to the new block. 155 * Copy freespace entries from the leaf block to the new block.
156 * Count active entries. 156 * Count active entries.
157 */ 157 */
158 for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P(ltp), to = free->bests; 158 for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests;
159 i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { 159 i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
160 if ((off = be16_to_cpu(*from)) != NULLDATAOFF) 160 if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
161 n++; 161 n++;
@@ -215,7 +215,7 @@ xfs_dir2_leafn_add(
215 * a compact. 215 * a compact.
216 */ 216 */
217 217
218 if (be16_to_cpu(leaf->hdr.count) == XFS_DIR2_MAX_LEAF_ENTS(mp)) { 218 if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) {
219 if (!leaf->hdr.stale) 219 if (!leaf->hdr.stale)
220 return XFS_ERROR(ENOSPC); 220 return XFS_ERROR(ENOSPC);
221 compact = be16_to_cpu(leaf->hdr.stale) > 1; 221 compact = be16_to_cpu(leaf->hdr.stale) > 1;
@@ -327,7 +327,7 @@ xfs_dir2_leafn_add(
327 * Insert the new entry, log everything. 327 * Insert the new entry, log everything.
328 */ 328 */
329 lep->hashval = cpu_to_be32(args->hashval); 329 lep->hashval = cpu_to_be32(args->hashval);
330 lep->address = cpu_to_be32(XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 330 lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp,
331 args->blkno, args->index)); 331 args->blkno, args->index));
332 xfs_dir2_leaf_log_header(tp, bp); 332 xfs_dir2_leaf_log_header(tp, bp);
333 xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); 333 xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
@@ -352,7 +352,7 @@ xfs_dir2_leafn_check(
352 leaf = bp->data; 352 leaf = bp->data;
353 mp = dp->i_mount; 353 mp = dp->i_mount;
354 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); 354 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
355 ASSERT(be16_to_cpu(leaf->hdr.count) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); 355 ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
356 for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { 356 for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) {
357 if (i + 1 < be16_to_cpu(leaf->hdr.count)) { 357 if (i + 1 < be16_to_cpu(leaf->hdr.count)) {
358 ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= 358 ASSERT(be32_to_cpu(leaf->ents[i].hashval) <=
@@ -440,7 +440,7 @@ xfs_dir2_leafn_lookup_int(
440 if (args->addname) { 440 if (args->addname) {
441 curfdb = curbp ? state->extrablk.blkno : -1; 441 curfdb = curbp ? state->extrablk.blkno : -1;
442 curdb = -1; 442 curdb = -1;
443 length = XFS_DIR2_DATA_ENTSIZE(args->namelen); 443 length = xfs_dir2_data_entsize(args->namelen);
444 if ((free = (curbp ? curbp->data : NULL))) 444 if ((free = (curbp ? curbp->data : NULL)))
445 ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); 445 ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
446 } 446 }
@@ -465,7 +465,7 @@ xfs_dir2_leafn_lookup_int(
465 /* 465 /*
466 * Pull the data block number from the entry. 466 * Pull the data block number from the entry.
467 */ 467 */
468 newdb = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); 468 newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
469 /* 469 /*
470 * For addname, we're looking for a place to put the new entry. 470 * For addname, we're looking for a place to put the new entry.
471 * We want to use a data block with an entry of equal 471 * We want to use a data block with an entry of equal
@@ -482,7 +482,7 @@ xfs_dir2_leafn_lookup_int(
482 * Convert the data block to the free block 482 * Convert the data block to the free block
483 * holding its freespace information. 483 * holding its freespace information.
484 */ 484 */
485 newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb); 485 newfdb = xfs_dir2_db_to_fdb(mp, newdb);
486 /* 486 /*
487 * If it's not the one we have in hand, 487 * If it's not the one we have in hand,
488 * read it in. 488 * read it in.
@@ -497,7 +497,7 @@ xfs_dir2_leafn_lookup_int(
497 * Read the free block. 497 * Read the free block.
498 */ 498 */
499 if ((error = xfs_da_read_buf(tp, dp, 499 if ((error = xfs_da_read_buf(tp, dp,
500 XFS_DIR2_DB_TO_DA(mp, 500 xfs_dir2_db_to_da(mp,
501 newfdb), 501 newfdb),
502 -1, &curbp, 502 -1, &curbp,
503 XFS_DATA_FORK))) { 503 XFS_DATA_FORK))) {
@@ -517,7 +517,7 @@ xfs_dir2_leafn_lookup_int(
517 /* 517 /*
518 * Get the index for our entry. 518 * Get the index for our entry.
519 */ 519 */
520 fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb); 520 fi = xfs_dir2_db_to_fdindex(mp, curdb);
521 /* 521 /*
522 * If it has room, return it. 522 * If it has room, return it.
523 */ 523 */
@@ -561,7 +561,7 @@ xfs_dir2_leafn_lookup_int(
561 */ 561 */
562 if ((error = 562 if ((error =
563 xfs_da_read_buf(tp, dp, 563 xfs_da_read_buf(tp, dp,
564 XFS_DIR2_DB_TO_DA(mp, newdb), -1, 564 xfs_dir2_db_to_da(mp, newdb), -1,
565 &curbp, XFS_DATA_FORK))) { 565 &curbp, XFS_DATA_FORK))) {
566 return error; 566 return error;
567 } 567 }
@@ -573,7 +573,7 @@ xfs_dir2_leafn_lookup_int(
573 */ 573 */
574 dep = (xfs_dir2_data_entry_t *) 574 dep = (xfs_dir2_data_entry_t *)
575 ((char *)curbp->data + 575 ((char *)curbp->data +
576 XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); 576 xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
577 /* 577 /*
578 * Compare the entry, return it if it matches. 578 * Compare the entry, return it if it matches.
579 */ 579 */
@@ -876,9 +876,9 @@ xfs_dir2_leafn_remove(
876 /* 876 /*
877 * Extract the data block and offset from the entry. 877 * Extract the data block and offset from the entry.
878 */ 878 */
879 db = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); 879 db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
880 ASSERT(dblk->blkno == db); 880 ASSERT(dblk->blkno == db);
881 off = XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address)); 881 off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address));
882 ASSERT(dblk->index == off); 882 ASSERT(dblk->index == off);
883 /* 883 /*
884 * Kill the leaf entry by marking it stale. 884 * Kill the leaf entry by marking it stale.
@@ -898,7 +898,7 @@ xfs_dir2_leafn_remove(
898 longest = be16_to_cpu(data->hdr.bestfree[0].length); 898 longest = be16_to_cpu(data->hdr.bestfree[0].length);
899 needlog = needscan = 0; 899 needlog = needscan = 0;
900 xfs_dir2_data_make_free(tp, dbp, off, 900 xfs_dir2_data_make_free(tp, dbp, off,
901 XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); 901 xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
902 /* 902 /*
903 * Rescan the data block freespaces for bestfree. 903 * Rescan the data block freespaces for bestfree.
904 * Log the data block header if needed. 904 * Log the data block header if needed.
@@ -924,8 +924,8 @@ xfs_dir2_leafn_remove(
924 * Convert the data block number to a free block, 924 * Convert the data block number to a free block,
925 * read in the free block. 925 * read in the free block.
926 */ 926 */
927 fdb = XFS_DIR2_DB_TO_FDB(mp, db); 927 fdb = xfs_dir2_db_to_fdb(mp, db);
928 if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), 928 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
929 -1, &fbp, XFS_DATA_FORK))) { 929 -1, &fbp, XFS_DATA_FORK))) {
930 return error; 930 return error;
931 } 931 }
@@ -937,7 +937,7 @@ xfs_dir2_leafn_remove(
937 /* 937 /*
938 * Calculate which entry we need to fix. 938 * Calculate which entry we need to fix.
939 */ 939 */
940 findex = XFS_DIR2_DB_TO_FDINDEX(mp, db); 940 findex = xfs_dir2_db_to_fdindex(mp, db);
941 longest = be16_to_cpu(data->hdr.bestfree[0].length); 941 longest = be16_to_cpu(data->hdr.bestfree[0].length);
942 /* 942 /*
943 * If the data block is now empty we can get rid of it 943 * If the data block is now empty we can get rid of it
@@ -1073,7 +1073,7 @@ xfs_dir2_leafn_split(
1073 /* 1073 /*
1074 * Initialize the new leaf block. 1074 * Initialize the new leaf block.
1075 */ 1075 */
1076 error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno), 1076 error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno),
1077 &newblk->bp, XFS_DIR2_LEAFN_MAGIC); 1077 &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
1078 if (error) { 1078 if (error) {
1079 return error; 1079 return error;
@@ -1385,7 +1385,7 @@ xfs_dir2_node_addname_int(
1385 dp = args->dp; 1385 dp = args->dp;
1386 mp = dp->i_mount; 1386 mp = dp->i_mount;
1387 tp = args->trans; 1387 tp = args->trans;
1388 length = XFS_DIR2_DATA_ENTSIZE(args->namelen); 1388 length = xfs_dir2_data_entsize(args->namelen);
1389 /* 1389 /*
1390 * If we came in with a freespace block that means that lookup 1390 * If we came in with a freespace block that means that lookup
1391 * found an entry with our hash value. This is the freespace 1391 * found an entry with our hash value. This is the freespace
@@ -1438,7 +1438,7 @@ xfs_dir2_node_addname_int(
1438 1438
1439 if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) 1439 if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
1440 return error; 1440 return error;
1441 lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo); 1441 lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo);
1442 fbno = ifbno; 1442 fbno = ifbno;
1443 } 1443 }
1444 /* 1444 /*
@@ -1474,7 +1474,7 @@ xfs_dir2_node_addname_int(
1474 * to avoid it. 1474 * to avoid it.
1475 */ 1475 */
1476 if ((error = xfs_da_read_buf(tp, dp, 1476 if ((error = xfs_da_read_buf(tp, dp,
1477 XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, 1477 xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
1478 XFS_DATA_FORK))) { 1478 XFS_DATA_FORK))) {
1479 return error; 1479 return error;
1480 } 1480 }
@@ -1550,9 +1550,9 @@ xfs_dir2_node_addname_int(
1550 * Get the freespace block corresponding to the data block 1550 * Get the freespace block corresponding to the data block
1551 * that was just allocated. 1551 * that was just allocated.
1552 */ 1552 */
1553 fbno = XFS_DIR2_DB_TO_FDB(mp, dbno); 1553 fbno = xfs_dir2_db_to_fdb(mp, dbno);
1554 if (unlikely(error = xfs_da_read_buf(tp, dp, 1554 if (unlikely(error = xfs_da_read_buf(tp, dp,
1555 XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, 1555 xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
1556 XFS_DATA_FORK))) { 1556 XFS_DATA_FORK))) {
1557 xfs_da_buf_done(dbp); 1557 xfs_da_buf_done(dbp);
1558 return error; 1558 return error;
@@ -1567,14 +1567,14 @@ xfs_dir2_node_addname_int(
1567 return error; 1567 return error;
1568 } 1568 }
1569 1569
1570 if (unlikely(XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno)) { 1570 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
1571 cmn_err(CE_ALERT, 1571 cmn_err(CE_ALERT,
1572 "xfs_dir2_node_addname_int: dir ino " 1572 "xfs_dir2_node_addname_int: dir ino "
1573 "%llu needed freesp block %lld for\n" 1573 "%llu needed freesp block %lld for\n"
1574 " data block %lld, got %lld\n" 1574 " data block %lld, got %lld\n"
1575 " ifbno %llu lastfbno %d\n", 1575 " ifbno %llu lastfbno %d\n",
1576 (unsigned long long)dp->i_ino, 1576 (unsigned long long)dp->i_ino,
1577 (long long)XFS_DIR2_DB_TO_FDB(mp, dbno), 1577 (long long)xfs_dir2_db_to_fdb(mp, dbno),
1578 (long long)dbno, (long long)fbno, 1578 (long long)dbno, (long long)fbno,
1579 (unsigned long long)ifbno, lastfbno); 1579 (unsigned long long)ifbno, lastfbno);
1580 if (fblk) { 1580 if (fblk) {
@@ -1598,7 +1598,7 @@ xfs_dir2_node_addname_int(
1598 * Get a buffer for the new block. 1598 * Get a buffer for the new block.
1599 */ 1599 */
1600 if ((error = xfs_da_get_buf(tp, dp, 1600 if ((error = xfs_da_get_buf(tp, dp,
1601 XFS_DIR2_DB_TO_DA(mp, fbno), 1601 xfs_dir2_db_to_da(mp, fbno),
1602 -1, &fbp, XFS_DATA_FORK))) { 1602 -1, &fbp, XFS_DATA_FORK))) {
1603 return error; 1603 return error;
1604 } 1604 }
@@ -1623,7 +1623,7 @@ xfs_dir2_node_addname_int(
1623 /* 1623 /*
1624 * Set the freespace block index from the data block number. 1624 * Set the freespace block index from the data block number.
1625 */ 1625 */
1626 findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno); 1626 findex = xfs_dir2_db_to_fdindex(mp, dbno);
1627 /* 1627 /*
1628 * If it's after the end of the current entries in the 1628 * If it's after the end of the current entries in the
1629 * freespace block, extend that table. 1629 * freespace block, extend that table.
@@ -1669,7 +1669,7 @@ xfs_dir2_node_addname_int(
1669 * Read the data block in. 1669 * Read the data block in.
1670 */ 1670 */
1671 if (unlikely( 1671 if (unlikely(
1672 error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno), 1672 error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno),
1673 -1, &dbp, XFS_DATA_FORK))) { 1673 -1, &dbp, XFS_DATA_FORK))) {
1674 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) 1674 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1675 xfs_da_buf_done(fbp); 1675 xfs_da_buf_done(fbp);
@@ -1698,7 +1698,7 @@ xfs_dir2_node_addname_int(
1698 dep->inumber = cpu_to_be64(args->inumber); 1698 dep->inumber = cpu_to_be64(args->inumber);
1699 dep->namelen = args->namelen; 1699 dep->namelen = args->namelen;
1700 memcpy(dep->name, args->name, dep->namelen); 1700 memcpy(dep->name, args->name, dep->namelen);
1701 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); 1701 tagp = xfs_dir2_data_entry_tag_p(dep);
1702 *tagp = cpu_to_be16((char *)dep - (char *)data); 1702 *tagp = cpu_to_be16((char *)dep - (char *)data);
1703 xfs_dir2_data_log_entry(tp, dbp, dep); 1703 xfs_dir2_data_log_entry(tp, dbp, dep);
1704 /* 1704 /*
@@ -1904,7 +1904,7 @@ xfs_dir2_node_replace(
1904 ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); 1904 ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC);
1905 dep = (xfs_dir2_data_entry_t *) 1905 dep = (xfs_dir2_data_entry_t *)
1906 ((char *)data + 1906 ((char *)data +
1907 XFS_DIR2_DATAPTR_TO_OFF(state->mp, be32_to_cpu(lep->address))); 1907 xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address)));
1908 ASSERT(inum != be64_to_cpu(dep->inumber)); 1908 ASSERT(inum != be64_to_cpu(dep->inumber));
1909 /* 1909 /*
1910 * Fill in the new inode number and log the entry. 1910 * Fill in the new inode number and log the entry.
@@ -1980,7 +1980,7 @@ xfs_dir2_node_trim_free(
1980 * Blow the block away. 1980 * Blow the block away.
1981 */ 1981 */
1982 if ((error = 1982 if ((error =
1983 xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo), 1983 xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo),
1984 bp))) { 1984 bp))) {
1985 /* 1985 /*
1986 * Can't fail with ENOSPC since that only happens with no 1986 * Can't fail with ENOSPC since that only happens with no
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
index c7c870ee78..dde72db3d6 100644
--- a/fs/xfs/xfs_dir2_node.h
+++ b/fs/xfs/xfs_dir2_node.h
@@ -36,7 +36,7 @@ struct xfs_trans;
36#define XFS_DIR2_FREE_SPACE 2 36#define XFS_DIR2_FREE_SPACE 2
37#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) 37#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
38#define XFS_DIR2_FREE_FIRSTDB(mp) \ 38#define XFS_DIR2_FREE_FIRSTDB(mp) \
39 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET) 39 xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET)
40 40
41#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */ 41#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */
42 42
@@ -60,7 +60,6 @@ typedef struct xfs_dir2_free {
60/* 60/*
61 * Convert data space db to the corresponding free db. 61 * Convert data space db to the corresponding free db.
62 */ 62 */
63#define XFS_DIR2_DB_TO_FDB(mp,db) xfs_dir2_db_to_fdb(mp, db)
64static inline xfs_dir2_db_t 63static inline xfs_dir2_db_t
65xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) 64xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
66{ 65{
@@ -70,7 +69,6 @@ xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
70/* 69/*
71 * Convert data space db to the corresponding index in a free db. 70 * Convert data space db to the corresponding index in a free db.
72 */ 71 */
73#define XFS_DIR2_DB_TO_FDINDEX(mp,db) xfs_dir2_db_to_fdindex(mp, db)
74static inline int 72static inline int
75xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) 73xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
76{ 74{
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 0cd77b17bf..38fc4f22b7 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -89,8 +89,8 @@ xfs_dir2_block_sfsize(
89 mp = dp->i_mount; 89 mp = dp->i_mount;
90 90
91 count = i8count = namelen = 0; 91 count = i8count = namelen = 0;
92 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 92 btp = xfs_dir2_block_tail_p(mp, block);
93 blp = XFS_DIR2_BLOCK_LEAF_P(btp); 93 blp = xfs_dir2_block_leaf_p(btp);
94 94
95 /* 95 /*
96 * Iterate over the block's data entries by using the leaf pointers. 96 * Iterate over the block's data entries by using the leaf pointers.
@@ -102,7 +102,7 @@ xfs_dir2_block_sfsize(
102 * Calculate the pointer to the entry at hand. 102 * Calculate the pointer to the entry at hand.
103 */ 103 */
104 dep = (xfs_dir2_data_entry_t *) 104 dep = (xfs_dir2_data_entry_t *)
105 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); 105 ((char *)block + xfs_dir2_dataptr_to_off(mp, addr));
106 /* 106 /*
107 * Detect . and .., so we can special-case them. 107 * Detect . and .., so we can special-case them.
108 * . is not included in sf directories. 108 * . is not included in sf directories.
@@ -124,7 +124,7 @@ xfs_dir2_block_sfsize(
124 /* 124 /*
125 * Calculate the new size, see if we should give up yet. 125 * Calculate the new size, see if we should give up yet.
126 */ 126 */
127 size = XFS_DIR2_SF_HDR_SIZE(i8count) + /* header */ 127 size = xfs_dir2_sf_hdr_size(i8count) + /* header */
128 count + /* namelen */ 128 count + /* namelen */
129 count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ 129 count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
130 namelen + /* name */ 130 namelen + /* name */
@@ -139,7 +139,7 @@ xfs_dir2_block_sfsize(
139 */ 139 */
140 sfhp->count = count; 140 sfhp->count = count;
141 sfhp->i8count = i8count; 141 sfhp->i8count = i8count;
142 XFS_DIR2_SF_PUT_INUMBER((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent); 142 xfs_dir2_sf_put_inumber((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent);
143 return size; 143 return size;
144} 144}
145 145
@@ -199,15 +199,15 @@ xfs_dir2_block_to_sf(
199 * Copy the header into the newly allocate local space. 199 * Copy the header into the newly allocate local space.
200 */ 200 */
201 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; 201 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
202 memcpy(sfp, sfhp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count)); 202 memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
203 dp->i_d.di_size = size; 203 dp->i_d.di_size = size;
204 /* 204 /*
205 * Set up to loop over the block's entries. 205 * Set up to loop over the block's entries.
206 */ 206 */
207 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); 207 btp = xfs_dir2_block_tail_p(mp, block);
208 ptr = (char *)block->u; 208 ptr = (char *)block->u;
209 endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); 209 endptr = (char *)xfs_dir2_block_leaf_p(btp);
210 sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); 210 sfep = xfs_dir2_sf_firstentry(sfp);
211 /* 211 /*
212 * Loop over the active and unused entries. 212 * Loop over the active and unused entries.
213 * Stop when we reach the leaf/tail portion of the block. 213 * Stop when we reach the leaf/tail portion of the block.
@@ -233,22 +233,22 @@ xfs_dir2_block_to_sf(
233 else if (dep->namelen == 2 && 233 else if (dep->namelen == 2 &&
234 dep->name[0] == '.' && dep->name[1] == '.') 234 dep->name[0] == '.' && dep->name[1] == '.')
235 ASSERT(be64_to_cpu(dep->inumber) == 235 ASSERT(be64_to_cpu(dep->inumber) ==
236 XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); 236 xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent));
237 /* 237 /*
238 * Normal entry, copy it into shortform. 238 * Normal entry, copy it into shortform.
239 */ 239 */
240 else { 240 else {
241 sfep->namelen = dep->namelen; 241 sfep->namelen = dep->namelen;
242 XFS_DIR2_SF_PUT_OFFSET(sfep, 242 xfs_dir2_sf_put_offset(sfep,
243 (xfs_dir2_data_aoff_t) 243 (xfs_dir2_data_aoff_t)
244 ((char *)dep - (char *)block)); 244 ((char *)dep - (char *)block));
245 memcpy(sfep->name, dep->name, dep->namelen); 245 memcpy(sfep->name, dep->name, dep->namelen);
246 temp = be64_to_cpu(dep->inumber); 246 temp = be64_to_cpu(dep->inumber);
247 XFS_DIR2_SF_PUT_INUMBER(sfp, &temp, 247 xfs_dir2_sf_put_inumber(sfp, &temp,
248 XFS_DIR2_SF_INUMBERP(sfep)); 248 xfs_dir2_sf_inumberp(sfep));
249 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); 249 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
250 } 250 }
251 ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); 251 ptr += xfs_dir2_data_entsize(dep->namelen);
252 } 252 }
253 ASSERT((char *)sfep - (char *)sfp == size); 253 ASSERT((char *)sfep - (char *)sfp == size);
254 xfs_dir2_sf_check(args); 254 xfs_dir2_sf_check(args);
@@ -294,11 +294,11 @@ xfs_dir2_sf_addname(
294 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 294 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
295 ASSERT(dp->i_df.if_u1.if_data != NULL); 295 ASSERT(dp->i_df.if_u1.if_data != NULL);
296 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; 296 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
297 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); 297 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
298 /* 298 /*
299 * Compute entry (and change in) size. 299 * Compute entry (and change in) size.
300 */ 300 */
301 add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); 301 add_entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen);
302 incr_isize = add_entsize; 302 incr_isize = add_entsize;
303 objchange = 0; 303 objchange = 0;
304#if XFS_BIG_INUMS 304#if XFS_BIG_INUMS
@@ -392,7 +392,7 @@ xfs_dir2_sf_addname_easy(
392 /* 392 /*
393 * Grow the in-inode space. 393 * Grow the in-inode space.
394 */ 394 */
395 xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen), 395 xfs_idata_realloc(dp, xfs_dir2_sf_entsize_byname(sfp, args->namelen),
396 XFS_DATA_FORK); 396 XFS_DATA_FORK);
397 /* 397 /*
398 * Need to set up again due to realloc of the inode data. 398 * Need to set up again due to realloc of the inode data.
@@ -403,10 +403,10 @@ xfs_dir2_sf_addname_easy(
403 * Fill in the new entry. 403 * Fill in the new entry.
404 */ 404 */
405 sfep->namelen = args->namelen; 405 sfep->namelen = args->namelen;
406 XFS_DIR2_SF_PUT_OFFSET(sfep, offset); 406 xfs_dir2_sf_put_offset(sfep, offset);
407 memcpy(sfep->name, args->name, sfep->namelen); 407 memcpy(sfep->name, args->name, sfep->namelen);
408 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, 408 xfs_dir2_sf_put_inumber(sfp, &args->inumber,
409 XFS_DIR2_SF_INUMBERP(sfep)); 409 xfs_dir2_sf_inumberp(sfep));
410 /* 410 /*
411 * Update the header and inode. 411 * Update the header and inode.
412 */ 412 */
@@ -463,14 +463,14 @@ xfs_dir2_sf_addname_hard(
463 * If it's going to end up at the end then oldsfep will point there. 463 * If it's going to end up at the end then oldsfep will point there.
464 */ 464 */
465 for (offset = XFS_DIR2_DATA_FIRST_OFFSET, 465 for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
466 oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp), 466 oldsfep = xfs_dir2_sf_firstentry(oldsfp),
467 add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen), 467 add_datasize = xfs_dir2_data_entsize(args->namelen),
468 eof = (char *)oldsfep == &buf[old_isize]; 468 eof = (char *)oldsfep == &buf[old_isize];
469 !eof; 469 !eof;
470 offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen), 470 offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen),
471 oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep), 471 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep),
472 eof = (char *)oldsfep == &buf[old_isize]) { 472 eof = (char *)oldsfep == &buf[old_isize]) {
473 new_offset = XFS_DIR2_SF_GET_OFFSET(oldsfep); 473 new_offset = xfs_dir2_sf_get_offset(oldsfep);
474 if (offset + add_datasize <= new_offset) 474 if (offset + add_datasize <= new_offset)
475 break; 475 break;
476 } 476 }
@@ -495,10 +495,10 @@ xfs_dir2_sf_addname_hard(
495 * Fill in the new entry, and update the header counts. 495 * Fill in the new entry, and update the header counts.
496 */ 496 */
497 sfep->namelen = args->namelen; 497 sfep->namelen = args->namelen;
498 XFS_DIR2_SF_PUT_OFFSET(sfep, offset); 498 xfs_dir2_sf_put_offset(sfep, offset);
499 memcpy(sfep->name, args->name, sfep->namelen); 499 memcpy(sfep->name, args->name, sfep->namelen);
500 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, 500 xfs_dir2_sf_put_inumber(sfp, &args->inumber,
501 XFS_DIR2_SF_INUMBERP(sfep)); 501 xfs_dir2_sf_inumberp(sfep));
502 sfp->hdr.count++; 502 sfp->hdr.count++;
503#if XFS_BIG_INUMS 503#if XFS_BIG_INUMS
504 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) 504 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
@@ -508,7 +508,7 @@ xfs_dir2_sf_addname_hard(
508 * If there's more left to copy, do that. 508 * If there's more left to copy, do that.
509 */ 509 */
510 if (!eof) { 510 if (!eof) {
511 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); 511 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
512 memcpy(sfep, oldsfep, old_isize - nbytes); 512 memcpy(sfep, oldsfep, old_isize - nbytes);
513 } 513 }
514 kmem_free(buf, old_isize); 514 kmem_free(buf, old_isize);
@@ -544,9 +544,9 @@ xfs_dir2_sf_addname_pick(
544 mp = dp->i_mount; 544 mp = dp->i_mount;
545 545
546 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; 546 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
547 size = XFS_DIR2_DATA_ENTSIZE(args->namelen); 547 size = xfs_dir2_data_entsize(args->namelen);
548 offset = XFS_DIR2_DATA_FIRST_OFFSET; 548 offset = XFS_DIR2_DATA_FIRST_OFFSET;
549 sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); 549 sfep = xfs_dir2_sf_firstentry(sfp);
550 holefit = 0; 550 holefit = 0;
551 /* 551 /*
552 * Loop over sf entries. 552 * Loop over sf entries.
@@ -555,10 +555,10 @@ xfs_dir2_sf_addname_pick(
555 */ 555 */
556 for (i = 0; i < sfp->hdr.count; i++) { 556 for (i = 0; i < sfp->hdr.count; i++) {
557 if (!holefit) 557 if (!holefit)
558 holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET(sfep); 558 holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
559 offset = XFS_DIR2_SF_GET_OFFSET(sfep) + 559 offset = xfs_dir2_sf_get_offset(sfep) +
560 XFS_DIR2_DATA_ENTSIZE(sfep->namelen); 560 xfs_dir2_data_entsize(sfep->namelen);
561 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); 561 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
562 } 562 }
563 /* 563 /*
564 * Calculate data bytes used excluding the new entry, if this 564 * Calculate data bytes used excluding the new entry, if this
@@ -617,18 +617,18 @@ xfs_dir2_sf_check(
617 617
618 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; 618 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
619 offset = XFS_DIR2_DATA_FIRST_OFFSET; 619 offset = XFS_DIR2_DATA_FIRST_OFFSET;
620 ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); 620 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
621 i8count = ino > XFS_DIR2_MAX_SHORT_INUM; 621 i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
622 622
623 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); 623 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
624 i < sfp->hdr.count; 624 i < sfp->hdr.count;
625 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { 625 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
626 ASSERT(XFS_DIR2_SF_GET_OFFSET(sfep) >= offset); 626 ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
627 ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); 627 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
628 i8count += ino > XFS_DIR2_MAX_SHORT_INUM; 628 i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
629 offset = 629 offset =
630 XFS_DIR2_SF_GET_OFFSET(sfep) + 630 xfs_dir2_sf_get_offset(sfep) +
631 XFS_DIR2_DATA_ENTSIZE(sfep->namelen); 631 xfs_dir2_data_entsize(sfep->namelen);
632 } 632 }
633 ASSERT(i8count == sfp->hdr.i8count); 633 ASSERT(i8count == sfp->hdr.i8count);
634 ASSERT(XFS_BIG_INUMS || i8count == 0); 634 ASSERT(XFS_BIG_INUMS || i8count == 0);
@@ -671,7 +671,7 @@ xfs_dir2_sf_create(
671 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 671 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
672 ASSERT(dp->i_df.if_bytes == 0); 672 ASSERT(dp->i_df.if_bytes == 0);
673 i8count = pino > XFS_DIR2_MAX_SHORT_INUM; 673 i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
674 size = XFS_DIR2_SF_HDR_SIZE(i8count); 674 size = xfs_dir2_sf_hdr_size(i8count);
675 /* 675 /*
676 * Make a buffer for the data. 676 * Make a buffer for the data.
677 */ 677 */
@@ -684,7 +684,7 @@ xfs_dir2_sf_create(
684 /* 684 /*
685 * Now can put in the inode number, since i8count is set. 685 * Now can put in the inode number, since i8count is set.
686 */ 686 */
687 XFS_DIR2_SF_PUT_INUMBER(sfp, &pino, &sfp->hdr.parent); 687 xfs_dir2_sf_put_inumber(sfp, &pino, &sfp->hdr.parent);
688 sfp->hdr.count = 0; 688 sfp->hdr.count = 0;
689 dp->i_d.di_size = size; 689 dp->i_d.di_size = size;
690 xfs_dir2_sf_check(args); 690 xfs_dir2_sf_check(args);
@@ -727,12 +727,12 @@ xfs_dir2_sf_getdents(
727 727
728 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; 728 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
729 729
730 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); 730 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
731 731
732 /* 732 /*
733 * If the block number in the offset is out of range, we're done. 733 * If the block number in the offset is out of range, we're done.
734 */ 734 */
735 if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) { 735 if (xfs_dir2_dataptr_to_db(mp, dir_offset) > mp->m_dirdatablk) {
736 *eofp = 1; 736 *eofp = 1;
737 return 0; 737 return 0;
738 } 738 }
@@ -747,9 +747,9 @@ xfs_dir2_sf_getdents(
747 * Put . entry unless we're starting past it. 747 * Put . entry unless we're starting past it.
748 */ 748 */
749 if (dir_offset <= 749 if (dir_offset <=
750 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 750 xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
751 XFS_DIR2_DATA_DOT_OFFSET)) { 751 XFS_DIR2_DATA_DOT_OFFSET)) {
752 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0, 752 p.cook = xfs_dir2_db_off_to_dataptr(mp, 0,
753 XFS_DIR2_DATA_DOTDOT_OFFSET); 753 XFS_DIR2_DATA_DOTDOT_OFFSET);
754 p.ino = dp->i_ino; 754 p.ino = dp->i_ino;
755#if XFS_BIG_INUMS 755#if XFS_BIG_INUMS
@@ -762,7 +762,7 @@ xfs_dir2_sf_getdents(
762 762
763 if (!p.done) { 763 if (!p.done) {
764 uio->uio_offset = 764 uio->uio_offset =
765 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 765 xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
766 XFS_DIR2_DATA_DOT_OFFSET); 766 XFS_DIR2_DATA_DOT_OFFSET);
767 return error; 767 return error;
768 } 768 }
@@ -772,11 +772,11 @@ xfs_dir2_sf_getdents(
772 * Put .. entry unless we're starting past it. 772 * Put .. entry unless we're starting past it.
773 */ 773 */
774 if (dir_offset <= 774 if (dir_offset <=
775 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 775 xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
776 XFS_DIR2_DATA_DOTDOT_OFFSET)) { 776 XFS_DIR2_DATA_DOTDOT_OFFSET)) {
777 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 777 p.cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
778 XFS_DIR2_DATA_FIRST_OFFSET); 778 XFS_DIR2_DATA_FIRST_OFFSET);
779 p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); 779 p.ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
780#if XFS_BIG_INUMS 780#if XFS_BIG_INUMS
781 p.ino += mp->m_inoadd; 781 p.ino += mp->m_inoadd;
782#endif 782#endif
@@ -787,7 +787,7 @@ xfs_dir2_sf_getdents(
787 787
788 if (!p.done) { 788 if (!p.done) {
789 uio->uio_offset = 789 uio->uio_offset =
790 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 790 xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
791 XFS_DIR2_DATA_DOTDOT_OFFSET); 791 XFS_DIR2_DATA_DOTDOT_OFFSET);
792 return error; 792 return error;
793 } 793 }
@@ -796,23 +796,23 @@ xfs_dir2_sf_getdents(
796 /* 796 /*
797 * Loop while there are more entries and put'ing works. 797 * Loop while there are more entries and put'ing works.
798 */ 798 */
799 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); 799 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
800 i < sfp->hdr.count; 800 i < sfp->hdr.count;
801 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { 801 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
802 802
803 off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 803 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
804 XFS_DIR2_SF_GET_OFFSET(sfep)); 804 xfs_dir2_sf_get_offset(sfep));
805 805
806 if (dir_offset > off) 806 if (dir_offset > off)
807 continue; 807 continue;
808 808
809 p.namelen = sfep->namelen; 809 p.namelen = sfep->namelen;
810 810
811 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 811 p.cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
812 XFS_DIR2_SF_GET_OFFSET(sfep) + 812 xfs_dir2_sf_get_offset(sfep) +
813 XFS_DIR2_DATA_ENTSIZE(p.namelen)); 813 xfs_dir2_data_entsize(p.namelen));
814 814
815 p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); 815 p.ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
816#if XFS_BIG_INUMS 816#if XFS_BIG_INUMS
817 p.ino += mp->m_inoadd; 817 p.ino += mp->m_inoadd;
818#endif 818#endif
@@ -832,7 +832,7 @@ xfs_dir2_sf_getdents(
832 *eofp = 1; 832 *eofp = 1;
833 833
834 uio->uio_offset = 834 uio->uio_offset =
835 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); 835 xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0);
836 836
837 return 0; 837 return 0;
838} 838}
@@ -865,7 +865,7 @@ xfs_dir2_sf_lookup(
865 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 865 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
866 ASSERT(dp->i_df.if_u1.if_data != NULL); 866 ASSERT(dp->i_df.if_u1.if_data != NULL);
867 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; 867 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
868 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); 868 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
869 /* 869 /*
870 * Special case for . 870 * Special case for .
871 */ 871 */
@@ -878,21 +878,21 @@ xfs_dir2_sf_lookup(
878 */ 878 */
879 if (args->namelen == 2 && 879 if (args->namelen == 2 &&
880 args->name[0] == '.' && args->name[1] == '.') { 880 args->name[0] == '.' && args->name[1] == '.') {
881 args->inumber = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); 881 args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
882 return XFS_ERROR(EEXIST); 882 return XFS_ERROR(EEXIST);
883 } 883 }
884 /* 884 /*
885 * Loop over all the entries trying to match ours. 885 * Loop over all the entries trying to match ours.
886 */ 886 */
887 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); 887 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
888 i < sfp->hdr.count; 888 i < sfp->hdr.count;
889 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { 889 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
890 if (sfep->namelen == args->namelen && 890 if (sfep->namelen == args->namelen &&
891 sfep->name[0] == args->name[0] && 891 sfep->name[0] == args->name[0] &&
892 memcmp(args->name, sfep->name, args->namelen) == 0) { 892 memcmp(args->name, sfep->name, args->namelen) == 0) {
893 args->inumber = 893 args->inumber =
894 XFS_DIR2_SF_GET_INUMBER(sfp, 894 xfs_dir2_sf_get_inumber(sfp,
895 XFS_DIR2_SF_INUMBERP(sfep)); 895 xfs_dir2_sf_inumberp(sfep));
896 return XFS_ERROR(EEXIST); 896 return XFS_ERROR(EEXIST);
897 } 897 }
898 } 898 }
@@ -934,19 +934,19 @@ xfs_dir2_sf_removename(
934 ASSERT(dp->i_df.if_bytes == oldsize); 934 ASSERT(dp->i_df.if_bytes == oldsize);
935 ASSERT(dp->i_df.if_u1.if_data != NULL); 935 ASSERT(dp->i_df.if_u1.if_data != NULL);
936 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; 936 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
937 ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); 937 ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
938 /* 938 /*
939 * Loop over the old directory entries. 939 * Loop over the old directory entries.
940 * Find the one we're deleting. 940 * Find the one we're deleting.
941 */ 941 */
942 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); 942 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
943 i < sfp->hdr.count; 943 i < sfp->hdr.count;
944 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { 944 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
945 if (sfep->namelen == args->namelen && 945 if (sfep->namelen == args->namelen &&
946 sfep->name[0] == args->name[0] && 946 sfep->name[0] == args->name[0] &&
947 memcmp(sfep->name, args->name, args->namelen) == 0) { 947 memcmp(sfep->name, args->name, args->namelen) == 0) {
948 ASSERT(XFS_DIR2_SF_GET_INUMBER(sfp, 948 ASSERT(xfs_dir2_sf_get_inumber(sfp,
949 XFS_DIR2_SF_INUMBERP(sfep)) == 949 xfs_dir2_sf_inumberp(sfep)) ==
950 args->inumber); 950 args->inumber);
951 break; 951 break;
952 } 952 }
@@ -961,7 +961,7 @@ xfs_dir2_sf_removename(
961 * Calculate sizes. 961 * Calculate sizes.
962 */ 962 */
963 byteoff = (int)((char *)sfep - (char *)sfp); 963 byteoff = (int)((char *)sfep - (char *)sfp);
964 entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); 964 entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen);
965 newsize = oldsize - entsize; 965 newsize = oldsize - entsize;
966 /* 966 /*
967 * Copy the part if any after the removed entry, sliding it down. 967 * Copy the part if any after the removed entry, sliding it down.
@@ -1027,7 +1027,7 @@ xfs_dir2_sf_replace(
1027 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 1027 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
1028 ASSERT(dp->i_df.if_u1.if_data != NULL); 1028 ASSERT(dp->i_df.if_u1.if_data != NULL);
1029 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; 1029 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1030 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); 1030 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
1031#if XFS_BIG_INUMS 1031#if XFS_BIG_INUMS
1032 /* 1032 /*
1033 * New inode number is large, and need to convert to 8-byte inodes. 1033 * New inode number is large, and need to convert to 8-byte inodes.
@@ -1067,28 +1067,28 @@ xfs_dir2_sf_replace(
1067 if (args->namelen == 2 && 1067 if (args->namelen == 2 &&
1068 args->name[0] == '.' && args->name[1] == '.') { 1068 args->name[0] == '.' && args->name[1] == '.') {
1069#if XFS_BIG_INUMS || defined(DEBUG) 1069#if XFS_BIG_INUMS || defined(DEBUG)
1070 ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); 1070 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
1071 ASSERT(args->inumber != ino); 1071 ASSERT(args->inumber != ino);
1072#endif 1072#endif
1073 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, &sfp->hdr.parent); 1073 xfs_dir2_sf_put_inumber(sfp, &args->inumber, &sfp->hdr.parent);
1074 } 1074 }
1075 /* 1075 /*
1076 * Normal entry, look for the name. 1076 * Normal entry, look for the name.
1077 */ 1077 */
1078 else { 1078 else {
1079 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); 1079 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
1080 i < sfp->hdr.count; 1080 i < sfp->hdr.count;
1081 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { 1081 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
1082 if (sfep->namelen == args->namelen && 1082 if (sfep->namelen == args->namelen &&
1083 sfep->name[0] == args->name[0] && 1083 sfep->name[0] == args->name[0] &&
1084 memcmp(args->name, sfep->name, args->namelen) == 0) { 1084 memcmp(args->name, sfep->name, args->namelen) == 0) {
1085#if XFS_BIG_INUMS || defined(DEBUG) 1085#if XFS_BIG_INUMS || defined(DEBUG)
1086 ino = XFS_DIR2_SF_GET_INUMBER(sfp, 1086 ino = xfs_dir2_sf_get_inumber(sfp,
1087 XFS_DIR2_SF_INUMBERP(sfep)); 1087 xfs_dir2_sf_inumberp(sfep));
1088 ASSERT(args->inumber != ino); 1088 ASSERT(args->inumber != ino);
1089#endif 1089#endif
1090 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, 1090 xfs_dir2_sf_put_inumber(sfp, &args->inumber,
1091 XFS_DIR2_SF_INUMBERP(sfep)); 1091 xfs_dir2_sf_inumberp(sfep));
1092 break; 1092 break;
1093 } 1093 }
1094 } 1094 }
@@ -1189,22 +1189,22 @@ xfs_dir2_sf_toino4(
1189 */ 1189 */
1190 sfp->hdr.count = oldsfp->hdr.count; 1190 sfp->hdr.count = oldsfp->hdr.count;
1191 sfp->hdr.i8count = 0; 1191 sfp->hdr.i8count = 0;
1192 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); 1192 ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent);
1193 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); 1193 xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent);
1194 /* 1194 /*
1195 * Copy the entries field by field. 1195 * Copy the entries field by field.
1196 */ 1196 */
1197 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), 1197 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
1198 oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); 1198 oldsfep = xfs_dir2_sf_firstentry(oldsfp);
1199 i < sfp->hdr.count; 1199 i < sfp->hdr.count;
1200 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), 1200 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
1201 oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { 1201 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
1202 sfep->namelen = oldsfep->namelen; 1202 sfep->namelen = oldsfep->namelen;
1203 sfep->offset = oldsfep->offset; 1203 sfep->offset = oldsfep->offset;
1204 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1204 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1205 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, 1205 ino = xfs_dir2_sf_get_inumber(oldsfp,
1206 XFS_DIR2_SF_INUMBERP(oldsfep)); 1206 xfs_dir2_sf_inumberp(oldsfep));
1207 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); 1207 xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep));
1208 } 1208 }
1209 /* 1209 /*
1210 * Clean up the inode. 1210 * Clean up the inode.
@@ -1266,22 +1266,22 @@ xfs_dir2_sf_toino8(
1266 */ 1266 */
1267 sfp->hdr.count = oldsfp->hdr.count; 1267 sfp->hdr.count = oldsfp->hdr.count;
1268 sfp->hdr.i8count = 1; 1268 sfp->hdr.i8count = 1;
1269 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); 1269 ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent);
1270 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); 1270 xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent);
1271 /* 1271 /*
1272 * Copy the entries field by field. 1272 * Copy the entries field by field.
1273 */ 1273 */
1274 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), 1274 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
1275 oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); 1275 oldsfep = xfs_dir2_sf_firstentry(oldsfp);
1276 i < sfp->hdr.count; 1276 i < sfp->hdr.count;
1277 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), 1277 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
1278 oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { 1278 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
1279 sfep->namelen = oldsfep->namelen; 1279 sfep->namelen = oldsfep->namelen;
1280 sfep->offset = oldsfep->offset; 1280 sfep->offset = oldsfep->offset;
1281 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1281 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1282 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, 1282 ino = xfs_dir2_sf_get_inumber(oldsfp,
1283 XFS_DIR2_SF_INUMBERP(oldsfep)); 1283 xfs_dir2_sf_inumberp(oldsfep));
1284 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); 1284 xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep));
1285 } 1285 }
1286 /* 1286 /*
1287 * Clean up the inode. 1287 * Clean up the inode.
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
index 42f015b700..11e503209a 100644
--- a/fs/xfs/xfs_dir2_sf.h
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -90,7 +90,6 @@ typedef struct xfs_dir2_sf {
90 xfs_dir2_sf_entry_t list[1]; /* shortform entries */ 90 xfs_dir2_sf_entry_t list[1]; /* shortform entries */
91} xfs_dir2_sf_t; 91} xfs_dir2_sf_t;
92 92
93#define XFS_DIR2_SF_HDR_SIZE(i8count) xfs_dir2_sf_hdr_size(i8count)
94static inline int xfs_dir2_sf_hdr_size(int i8count) 93static inline int xfs_dir2_sf_hdr_size(int i8count)
95{ 94{
96 return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \ 95 return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \
@@ -98,14 +97,11 @@ static inline int xfs_dir2_sf_hdr_size(int i8count)
98 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); 97 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)));
99} 98}
100 99
101#define XFS_DIR2_SF_INUMBERP(sfep) xfs_dir2_sf_inumberp(sfep)
102static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep) 100static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep)
103{ 101{
104 return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen]; 102 return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen];
105} 103}
106 104
107#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \
108 xfs_dir2_sf_get_inumber(sfp, from)
109static inline xfs_intino_t 105static inline xfs_intino_t
110xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from) 106xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from)
111{ 107{
@@ -114,8 +110,6 @@ xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from)
114 (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8)); 110 (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8));
115} 111}
116 112
117#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \
118 xfs_dir2_sf_put_inumber(sfp,from,to)
119static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, 113static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from,
120 xfs_dir2_inou_t *to) 114 xfs_dir2_inou_t *to)
121{ 115{
@@ -125,24 +119,18 @@ static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from,
125 XFS_PUT_DIR_INO8(*(from), (to)->i8); 119 XFS_PUT_DIR_INO8(*(from), (to)->i8);
126} 120}
127 121
128#define XFS_DIR2_SF_GET_OFFSET(sfep) \
129 xfs_dir2_sf_get_offset(sfep)
130static inline xfs_dir2_data_aoff_t 122static inline xfs_dir2_data_aoff_t
131xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) 123xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
132{ 124{
133 return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i); 125 return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i);
134} 126}
135 127
136#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \
137 xfs_dir2_sf_put_offset(sfep,off)
138static inline void 128static inline void
139xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) 129xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
140{ 130{
141 INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off); 131 INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off);
142} 132}
143 133
144#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) \
145 xfs_dir2_sf_entsize_byname(sfp,len)
146static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len) 134static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len)
147{ 135{
148 return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \ 136 return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \
@@ -150,8 +138,6 @@ static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len)
150 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); 138 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)));
151} 139}
152 140
153#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) \
154 xfs_dir2_sf_entsize_byentry(sfp,sfep)
155static inline int 141static inline int
156xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) 142xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
157{ 143{
@@ -160,19 +146,17 @@ xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
160 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); 146 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)));
161} 147}
162 148
163#define XFS_DIR2_SF_FIRSTENTRY(sfp) xfs_dir2_sf_firstentry(sfp)
164static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp) 149static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp)
165{ 150{
166 return ((xfs_dir2_sf_entry_t *) \ 151 return ((xfs_dir2_sf_entry_t *) \
167 ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count))); 152 ((char *)(sfp) + xfs_dir2_sf_hdr_size(sfp->hdr.i8count)));
168} 153}
169 154
170#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) xfs_dir2_sf_nextentry(sfp,sfep)
171static inline xfs_dir2_sf_entry_t * 155static inline xfs_dir2_sf_entry_t *
172xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) 156xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
173{ 157{
174 return ((xfs_dir2_sf_entry_t *) \ 158 return ((xfs_dir2_sf_entry_t *) \
175 ((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep))); 159 ((char *)(sfep) + xfs_dir2_sf_entsize_byentry(sfp,sfep)));
176} 160}
177 161
178/* 162/*
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
new file mode 100644
index 0000000000..ce2278611b
--- /dev/null
+++ b/fs/xfs/xfs_filestream.c
@@ -0,0 +1,771 @@
1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_bmap_btree.h"
20#include "xfs_inum.h"
21#include "xfs_dir2.h"
22#include "xfs_dir2_sf.h"
23#include "xfs_attr_sf.h"
24#include "xfs_dinode.h"
25#include "xfs_inode.h"
26#include "xfs_ag.h"
27#include "xfs_dmapi.h"
28#include "xfs_log.h"
29#include "xfs_trans.h"
30#include "xfs_sb.h"
31#include "xfs_mount.h"
32#include "xfs_bmap.h"
33#include "xfs_alloc.h"
34#include "xfs_utils.h"
35#include "xfs_mru_cache.h"
36#include "xfs_filestream.h"
37
38#ifdef XFS_FILESTREAMS_TRACE
39
40ktrace_t *xfs_filestreams_trace_buf;
41
/*
 * Record one filestreams event in the global ktrace buffer.
 *
 * The event type and the source line number are packed into a single
 * word (type in the low 16 bits, line in the high bits) so a single
 * ktrace slot identifies both.  Up to six pointer-sized arguments are
 * captured along with the calling task's pid and the mount point.
 * Only compiled when XFS_FILESTREAMS_TRACE is defined.
 */
STATIC void
xfs_filestreams_trace(
	xfs_mount_t	*mp,	/* mount point */
	int		type,	/* type of trace */
	const char	*func,	/* source function */
	int		line,	/* source line number */
	__psunsigned_t	arg0,
	__psunsigned_t	arg1,
	__psunsigned_t	arg2,
	__psunsigned_t	arg3,
	__psunsigned_t	arg4,
	__psunsigned_t	arg5)
{
	ktrace_enter(xfs_filestreams_trace_buf,
		(void *)(__psint_t)(type | (line << 16)),
		(void *)func,
		(void *)(__psunsigned_t)current_pid(),
		(void *)mp,
		(void *)(__psunsigned_t)arg0,
		(void *)(__psunsigned_t)arg1,
		(void *)(__psunsigned_t)arg2,
		(void *)(__psunsigned_t)arg3,
		(void *)(__psunsigned_t)arg4,
		(void *)(__psunsigned_t)arg5,
		NULL, NULL, NULL, NULL, NULL, NULL);
}
68
69#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0)
70#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0)
71#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0)
72#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0)
73#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
74#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
75#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
76 xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \
77 (__psunsigned_t)a0, (__psunsigned_t)a1, \
78 (__psunsigned_t)a2, (__psunsigned_t)a3, \
79 (__psunsigned_t)a4, (__psunsigned_t)a5)
80
81#define TRACE_AG_SCAN(mp, ag, ag2) \
82 TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
83#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
84 TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
85#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
86 TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
87 cnt, free, scan, flag)
88#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
89 TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
90#define TRACE_FREE(mp, ip, pip, ag, cnt) \
91 TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
92#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
93 TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
94#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
95 TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
96#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
97 TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
98#define TRACE_ORPHAN(mp, ip, ag) \
99 TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
100
101
102#else
103#define TRACE_AG_SCAN(mp, ag, ag2)
104#define TRACE_AG_PICK1(mp, max_ag, maxfree)
105#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
106#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
107#define TRACE_FREE(mp, ip, pip, ag, cnt)
108#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
109#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
110#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
111#define TRACE_ORPHAN(mp, ip, ag)
112#endif
113
114static kmem_zone_t *item_zone;
115
/*
 * Structure for associating a file or a directory with an allocation group.
 * The parent directory pointer is only needed for files, but since there will
 * generally be vastly more files than directories in the cache, using the same
 * data structure simplifies the code with very little memory overhead.
 *
 * Items are allocated from item_zone and cached in the per-mount MRU cache
 * (mp->m_filestream), keyed by inode number.
 */
typedef struct fstrm_item
{
	xfs_agnumber_t	ag;	/* AG currently in use for the file/directory. */
	xfs_inode_t	*ip;	/* inode self-pointer. */
	xfs_inode_t	*pip;	/* Parent directory inode pointer (NULL for
				 * directory items). */
} fstrm_item_t;
128
129
/*
 * Scan the AGs starting at startag looking for an AG that isn't in use and has
 * at least minlen blocks free.
 *
 * The scan runs in up to three passes over all AGs:
 *   1st pass: trylock when initialising per-AG structures (don't sleep);
 *   2nd pass: allow sleeping in xfs_alloc_pagf_init();
 *   3rd pass: additionally accept low-space/metadata AGs (XFS_PICK_LOWSPACE).
 * If no AG qualifies after three passes, fall back to the AG with the most
 * free blocks, or AG 0 if none was ever usable.
 *
 * On success *agp holds the chosen AG (with its filestream reference count
 * already incremented, except for the final AG-0 fallback) or NULLAGNUMBER.
 * Returns 0, or an error from xfs_alloc_pagf_init() on a non-trylock pass.
 */
static int
_xfs_filestream_pick_ag(
	xfs_mount_t	*mp,
	xfs_agnumber_t	startag,
	xfs_agnumber_t	*agp,
	int		flags,
	xfs_extlen_t	minlen)
{
	int		err, trylock, nscan;
	xfs_extlen_t	delta, longest, need, free, minfree, maxfree = 0;
	xfs_agnumber_t	ag, max_ag = NULLAGNUMBER;
	struct xfs_perag	*pag;

	/* 2% of an AG's blocks must be free for it to be chosen. */
	minfree = mp->m_sb.sb_agblocks / 50;

	ag = startag;
	*agp = NULLAGNUMBER;

	/* For the first pass, don't sleep trying to init the per-AG. */
	trylock = XFS_ALLOC_FLAG_TRYLOCK;

	for (nscan = 0; 1; nscan++) {

		TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag));

		pag = mp->m_perag + ag;

		if (!pag->pagf_init) {
			err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
			if (err && !trylock)
				return err;
		}

		/* Might fail sometimes during the 1st pass with trylock set. */
		if (!pag->pagf_init)
			goto next_ag;

		/* Keep track of the AG with the most free blocks. */
		if (pag->pagf_freeblks > maxfree) {
			maxfree = pag->pagf_freeblks;
			max_ag = ag;
		}

		/*
		 * The AG reference count does two things: it enforces mutual
		 * exclusion when examining the suitability of an AG in this
		 * loop, and it guards against two filestreams being established
		 * in the same AG as each other.
		 */
		if (xfs_filestream_get_ag(mp, ag) > 1) {
			xfs_filestream_put_ag(mp, ag);
			goto next_ag;
		}

		/*
		 * Estimate the longest allocatable extent after reserving
		 * blocks the freelist still needs.  If the AG has no usable
		 * long extent, the boolean fallback yields 1 or 0 blocks --
		 * NOTE(review): presumably intentional ("at least one block
		 * if anything is free"), verify against the allocator.
		 */
		need = XFS_MIN_FREELIST_PAG(pag, mp);
		delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
		longest = (pag->pagf_longest > delta) ?
		          (pag->pagf_longest - delta) :
		          (pag->pagf_flcount > 0 || pag->pagf_longest > 0);

		if (((minlen && longest >= minlen) ||
		     (!minlen && pag->pagf_freeblks >= minfree)) &&
		    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
		     (flags & XFS_PICK_LOWSPACE))) {

			/* Break out, retaining the reference on the AG. */
			free = pag->pagf_freeblks;
			*agp = ag;
			break;
		}

		/* Drop the reference on this AG, it's not usable. */
		xfs_filestream_put_ag(mp, ag);
next_ag:
		/* Move to the next AG, wrapping to AG 0 if necessary. */
		if (++ag >= mp->m_sb.sb_agcount)
			ag = 0;

		/* If a full pass of the AGs hasn't been done yet, continue. */
		if (ag != startag)
			continue;

		/* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
		if (trylock != 0) {
			trylock = 0;
			continue;
		}

		/* Finally, if lowspace wasn't set, set it for the 3rd pass. */
		if (!(flags & XFS_PICK_LOWSPACE)) {
			flags |= XFS_PICK_LOWSPACE;
			continue;
		}

		/*
		 * Take the AG with the most free space, regardless of whether
		 * it's already in use by another filestream.
		 */
		if (max_ag != NULLAGNUMBER) {
			xfs_filestream_get_ag(mp, max_ag);
			TRACE_AG_PICK1(mp, max_ag, maxfree);
			free = maxfree;
			*agp = max_ag;
			break;
		}

		/* take AG 0 if none matched */
		TRACE_AG_PICK1(mp, max_ag, maxfree);
		*agp = 0;
		return 0;
	}

	TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp),
			free, nscan, flags);

	return 0;
}
252
253/*
254 * Set the allocation group number for a file or a directory, updating inode
255 * references and per-AG references as appropriate. Must be called with the
256 * m_peraglock held in read mode.
257 */
258static int
259_xfs_filestream_update_ag(
260 xfs_inode_t *ip,
261 xfs_inode_t *pip,
262 xfs_agnumber_t ag)
263{
264 int err = 0;
265 xfs_mount_t *mp;
266 xfs_mru_cache_t *cache;
267 fstrm_item_t *item;
268 xfs_agnumber_t old_ag;
269 xfs_inode_t *old_pip;
270
271 /*
272 * Either ip is a regular file and pip is a directory, or ip is a
273 * directory and pip is NULL.
274 */
275 ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip &&
276 (pip->i_d.di_mode & S_IFDIR)) ||
277 ((ip->i_d.di_mode & S_IFDIR) && !pip)));
278
279 mp = ip->i_mount;
280 cache = mp->m_filestream;
281
282 item = xfs_mru_cache_lookup(cache, ip->i_ino);
283 if (item) {
284 ASSERT(item->ip == ip);
285 old_ag = item->ag;
286 item->ag = ag;
287 old_pip = item->pip;
288 item->pip = pip;
289 xfs_mru_cache_done(cache);
290
291 /*
292 * If the AG has changed, drop the old ref and take a new one,
293 * effectively transferring the reference from old to new AG.
294 */
295 if (ag != old_ag) {
296 xfs_filestream_put_ag(mp, old_ag);
297 xfs_filestream_get_ag(mp, ag);
298 }
299
300 /*
301 * If ip is a file and its pip has changed, drop the old ref and
302 * take a new one.
303 */
304 if (pip && pip != old_pip) {
305 IRELE(old_pip);
306 IHOLD(pip);
307 }
308
309 TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
310 ag, xfs_filestream_peek_ag(mp, ag));
311 return 0;
312 }
313
314 item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
315 if (!item)
316 return ENOMEM;
317
318 item->ag = ag;
319 item->ip = ip;
320 item->pip = pip;
321
322 err = xfs_mru_cache_insert(cache, ip->i_ino, item);
323 if (err) {
324 kmem_zone_free(item_zone, item);
325 return err;
326 }
327
328 /* Take a reference on the AG. */
329 xfs_filestream_get_ag(mp, ag);
330
331 /*
332 * Take a reference on the inode itself regardless of whether it's a
333 * regular file or a directory.
334 */
335 IHOLD(ip);
336
337 /*
338 * In the case of a regular file, take a reference on the parent inode
339 * as well to ensure it remains in-core.
340 */
341 if (pip)
342 IHOLD(pip);
343
344 TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
345 ag, xfs_filestream_peek_ag(mp, ag));
346
347 return 0;
348}
349
350/* xfs_fstrm_free_func(): callback for freeing cached stream items. */
351void
352xfs_fstrm_free_func(
353 xfs_ino_t ino,
354 fstrm_item_t *item)
355{
356 xfs_inode_t *ip = item->ip;
357 int ref;
358
359 ASSERT(ip->i_ino == ino);
360
361 xfs_iflags_clear(ip, XFS_IFILESTREAM);
362
363 /* Drop the reference taken on the AG when the item was added. */
364 ref = xfs_filestream_put_ag(ip->i_mount, item->ag);
365
366 ASSERT(ref >= 0);
367 TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
368 xfs_filestream_peek_ag(ip->i_mount, item->ag));
369
370 /*
371 * _xfs_filestream_update_ag() always takes a reference on the inode
372 * itself, whether it's a file or a directory. Release it here.
373 * This can result in the inode being freed and so we must
374 * not hold any inode locks when freeing filesstreams objects
375 * otherwise we can deadlock here.
376 */
377 IRELE(ip);
378
379 /*
380 * In the case of a regular file, _xfs_filestream_update_ag() also
381 * takes a ref on the parent inode to keep it in-core. Release that
382 * too.
383 */
384 if (item->pip)
385 IRELE(item->pip);
386
387 /* Finally, free the memory allocated for the item. */
388 kmem_zone_free(item_zone, item);
389}
390
391/*
392 * xfs_filestream_init() is called at xfs initialisation time to set up the
393 * memory zone that will be used for filestream data structure allocation.
394 */
395int
396xfs_filestream_init(void)
397{
398 item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
399#ifdef XFS_FILESTREAMS_TRACE
400 xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP);
401#endif
402 return item_zone ? 0 : -ENOMEM;
403}
404
/*
 * xfs_filestream_uninit() is called at xfs termination time to destroy the
 * memory zone that was used for filestream data structure allocation.
 * Also frees the trace buffer when filestreams tracing is compiled in.
 */
void
xfs_filestream_uninit(void)
{
#ifdef XFS_FILESTREAMS_TRACE
	ktrace_free(xfs_filestreams_trace_buf);
#endif
	kmem_zone_destroy(item_zone);
}
417
418/*
419 * xfs_filestream_mount() is called when a file system is mounted with the
420 * filestream option. It is responsible for allocating the data structures
421 * needed to track the new file system's file streams.
422 */
423int
424xfs_filestream_mount(
425 xfs_mount_t *mp)
426{
427 int err;
428 unsigned int lifetime, grp_count;
429
430 /*
431 * The filestream timer tunable is currently fixed within the range of
432 * one second to four minutes, with five seconds being the default. The
433 * group count is somewhat arbitrary, but it'd be nice to adhere to the
434 * timer tunable to within about 10 percent. This requires at least 10
435 * groups.
436 */
437 lifetime = xfs_fstrm_centisecs * 10;
438 grp_count = 10;
439
440 err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
441 (xfs_mru_cache_free_func_t)xfs_fstrm_free_func);
442
443 return err;
444}
445
/*
 * xfs_filestream_unmount() is called when a file system that was mounted with
 * the filestream option is unmounted.  It drains the data structures created
 * to track the file system's file streams and frees all the memory that was
 * allocated.
 */
void
xfs_filestream_unmount(
	xfs_mount_t	*mp)
{
	xfs_mru_cache_destroy(mp->m_filestream);
}
458
/*
 * If the mount point's m_perag array is going to be reallocated, all
 * outstanding cache entries must be flushed to avoid accessing reference count
 * addresses that have been freed.  The call to xfs_filestream_flush() must be
 * made inside the block that holds the m_peraglock in write mode to do the
 * reallocation.
 */
void
xfs_filestream_flush(
	xfs_mount_t	*mp)
{
	/* point in time flush, so keep the reaper running */
	xfs_mru_cache_flush(mp->m_filestream, 1);
}
473
474/*
475 * Return the AG of the filestream the file or directory belongs to, or
476 * NULLAGNUMBER otherwise.
477 */
478xfs_agnumber_t
479xfs_filestream_lookup_ag(
480 xfs_inode_t *ip)
481{
482 xfs_mru_cache_t *cache;
483 fstrm_item_t *item;
484 xfs_agnumber_t ag;
485 int ref;
486
487 if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) {
488 ASSERT(0);
489 return NULLAGNUMBER;
490 }
491
492 cache = ip->i_mount->m_filestream;
493 item = xfs_mru_cache_lookup(cache, ip->i_ino);
494 if (!item) {
495 TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0);
496 return NULLAGNUMBER;
497 }
498
499 ASSERT(ip == item->ip);
500 ag = item->ag;
501 ref = xfs_filestream_peek_ag(ip->i_mount, ag);
502 xfs_mru_cache_done(cache);
503
504 TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref);
505 return ag;
506}
507
/*
 * xfs_filestream_associate() should only be called to associate a regular file
 * with its parent directory.  Calling it with a child directory isn't
 * appropriate because filestreams don't apply to entire directory hierarchies.
 * Creating a file in a child directory of an existing filestream directory
 * starts a new filestream with its own allocation group association.
 *
 * Returns < 0 on error, 0 if successful association occurred, > 0 if
 * we failed to get an association because of locking issues.
 *
 * NOTE(review): internal helpers return positive errno values here; they are
 * negated on return (see `return -err` below) to match the caller convention.
 */
int
xfs_filestream_associate(
	xfs_inode_t	*pip,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_mru_cache_t	*cache;
	fstrm_item_t	*item;
	xfs_agnumber_t	ag, rotorstep, startag;
	int		err = 0;

	ASSERT(pip->i_d.di_mode & S_IFDIR);
	ASSERT(ip->i_d.di_mode & S_IFREG);
	if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG))
		return -EINVAL;

	mp = pip->i_mount;
	cache = mp->m_filestream;
	down_read(&mp->m_peraglock);

	/*
	 * We have a problem, Houston.
	 *
	 * Taking the iolock here violates inode locking order - we already
	 * hold the ilock. Hence if we block getting this lock we may never
	 * wake. Unfortunately, that means if we can't get the lock, we're
	 * screwed in terms of getting a stream association - we can't spin
	 * waiting for the lock because someone else is waiting on the lock we
	 * hold and we cannot drop that as we are in a transaction here.
	 *
	 * Lucky for us, this inversion is rarely a problem because it's a
	 * directory inode that we are trying to lock here and that means the
	 * only place that matters is xfs_sync_inodes() and SYNC_DELWRI is
	 * used. i.e. freeze, remount-ro, quotasync or unmount.
	 *
	 * So, if we can't get the iolock without sleeping then just give up
	 */
	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) {
		up_read(&mp->m_peraglock);
		return 1;
	}

	/* If the parent directory is already in the cache, use its AG. */
	item = xfs_mru_cache_lookup(cache, pip->i_ino);
	if (item) {
		ASSERT(item->ip == pip);
		ag = item->ag;
		xfs_mru_cache_done(cache);

		TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
		err = _xfs_filestream_update_ag(ip, pip, ag);

		/* Skip exit_did_pick: no pick reference was taken here. */
		goto exit;
	}

	/*
	 * Set the starting AG using the rotor for inode32, otherwise
	 * use the directory inode's AG.
	 */
	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
		rotorstep = xfs_rotorstep;
		startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
		mp->m_agfrotor = (mp->m_agfrotor + 1) %
		                 (mp->m_sb.sb_agcount * rotorstep);
	} else
		startag = XFS_INO_TO_AGNO(mp, pip->i_ino);

	/* Pick a new AG for the parent inode starting at startag. */
	err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
	if (err || ag == NULLAGNUMBER)
		goto exit_did_pick;

	/* Associate the parent inode with the AG. */
	err = _xfs_filestream_update_ag(pip, NULL, ag);
	if (err)
		goto exit_did_pick;

	/* Associate the file inode with the AG. */
	err = _xfs_filestream_update_ag(ip, pip, ag);
	if (err)
		goto exit_did_pick;

	TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));

exit_did_pick:
	/*
	 * If _xfs_filestream_pick_ag() returned a valid AG, remove the
	 * reference it took on it, since the file and directory will have taken
	 * their own now if they were successfully cached.
	 */
	if (ag != NULLAGNUMBER)
		xfs_filestream_put_ag(mp, ag);

exit:
	xfs_iunlock(pip, XFS_IOLOCK_EXCL);
	up_read(&mp->m_peraglock);
	return -err;
}
616
/*
 * Pick a new allocation group for the current file and its file stream.  This
 * function is called by xfs_bmap_filestreams() with the mount point's per-ag
 * lock held.
 *
 * On return *agp is the chosen AG (or 0 if no pick succeeded).  Returns 0 on
 * success or a positive error from the cache insert / AG pick helpers.
 */
int
xfs_filestream_new_ag(
	xfs_bmalloca_t	*ap,
	xfs_agnumber_t	*agp)
{
	int		flags, err;
	xfs_inode_t	*ip, *pip = NULL;
	xfs_mount_t	*mp;
	xfs_mru_cache_t	*cache;
	xfs_extlen_t	minlen;
	fstrm_item_t	*dir, *file;
	xfs_agnumber_t	ag = NULLAGNUMBER;

	ip = ap->ip;
	mp = ip->i_mount;
	cache = mp->m_filestream;
	minlen = ap->alen;	/* require room for the whole allocation */
	*agp = NULLAGNUMBER;

	/*
	 * Look for the file in the cache, removing it if it's found. Doing
	 * this allows it to be held across the dir lookup that follows.
	 */
	file = xfs_mru_cache_remove(cache, ip->i_ino);
	if (file) {
		ASSERT(ip == file->ip);

		/* Save the file's parent inode and old AG number for later. */
		pip = file->pip;
		ag = file->ag;

		/* Look for the file's directory in the cache. */
		dir = xfs_mru_cache_lookup(cache, pip->i_ino);
		if (dir) {
			ASSERT(pip == dir->ip);

			/*
			 * If the directory has already moved on to a new AG,
			 * use that AG as the new AG for the file. Don't
			 * forget to twiddle the AG refcounts to match the
			 * movement.
			 */
			if (dir->ag != file->ag) {
				xfs_filestream_put_ag(mp, file->ag);
				xfs_filestream_get_ag(mp, dir->ag);
				*agp = file->ag = dir->ag;
			}

			xfs_mru_cache_done(cache);
		}

		/*
		 * Put the file back in the cache. If this fails, the free
		 * function needs to be called to tidy up in the same way as if
		 * the item had simply expired from the cache.
		 */
		err = xfs_mru_cache_insert(cache, ip->i_ino, file);
		if (err) {
			xfs_fstrm_free_func(ip->i_ino, file);
			return err;
		}

		/*
		 * If the file's AG was moved to the directory's new AG, there's
		 * nothing more to be done.
		 */
		if (*agp != NULLAGNUMBER) {
			TRACE_MOVEAG(mp, ip, pip,
					ag, xfs_filestream_peek_ag(mp, ag),
					*agp, xfs_filestream_peek_ag(mp, *agp));
			return 0;
		}
	}

	/*
	 * If the file's parent directory is known, take its iolock in exclusive
	 * mode to prevent two sibling files from racing each other to migrate
	 * themselves and their parent to different AGs.
	 *
	 * NOTE(review): pip is dereferenced from here on after the file item
	 * was re-inserted into the cache; this appears to rely on the IHOLD
	 * the item holds on pip surviving until we're done -- confirm the
	 * item cannot expire and release pip concurrently.
	 */
	if (pip)
		xfs_ilock(pip, XFS_IOLOCK_EXCL);

	/*
	 * A new AG needs to be found for the file. If the file's parent
	 * directory is also known, it will be moved to the new AG as well to
	 * ensure that files created inside it in future use the new AG.
	 */
	ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
	flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
	        (ap->low ? XFS_PICK_LOWSPACE : 0);

	err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
	if (err || *agp == NULLAGNUMBER)
		goto exit;

	/*
	 * If the file wasn't found in the file cache, then its parent directory
	 * inode isn't known. For this to have happened, the file must either
	 * be pre-existing, or it was created long enough ago that its cache
	 * entry has expired. This isn't the sort of usage that the filestreams
	 * allocator is trying to optimise, so there's no point trying to track
	 * its new AG somehow in the filestream data structures.
	 */
	if (!pip) {
		TRACE_ORPHAN(mp, ip, *agp);
		goto exit;
	}

	/* Associate the parent inode with the AG. */
	err = _xfs_filestream_update_ag(pip, NULL, *agp);
	if (err)
		goto exit;

	/* Associate the file inode with the AG. */
	err = _xfs_filestream_update_ag(ip, pip, *agp);
	if (err)
		goto exit;

	TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
			*agp, xfs_filestream_peek_ag(mp, *agp));

exit:
	/*
	 * If _xfs_filestream_pick_ag() returned a valid AG, remove the
	 * reference it took on it, since the file and directory will have taken
	 * their own now if they were successfully cached.
	 */
	if (*agp != NULLAGNUMBER)
		xfs_filestream_put_ag(mp, *agp);
	else
		*agp = 0;

	if (pip)
		xfs_iunlock(pip, XFS_IOLOCK_EXCL);

	return err;
}
759
760/*
761 * Remove an association between an inode and a filestream object.
762 * Typically this is done on last close of an unlinked file.
763 */
764void
765xfs_filestream_deassociate(
766 xfs_inode_t *ip)
767{
768 xfs_mru_cache_t *cache = ip->i_mount->m_filestream;
769
770 xfs_mru_cache_delete(cache, ip->i_ino);
771}
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
new file mode 100644
index 0000000000..f655f7dc33
--- /dev/null
+++ b/fs/xfs/xfs_filestream.h
@@ -0,0 +1,136 @@
1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_FILESTREAM_H__
19#define __XFS_FILESTREAM_H__
20
21#ifdef __KERNEL__
22
23struct xfs_mount;
24struct xfs_inode;
25struct xfs_perag;
26struct xfs_bmalloca;
27
28#ifdef XFS_FILESTREAMS_TRACE
29#define XFS_FSTRM_KTRACE_INFO 1
30#define XFS_FSTRM_KTRACE_AGSCAN 2
31#define XFS_FSTRM_KTRACE_AGPICK1 3
32#define XFS_FSTRM_KTRACE_AGPICK2 4
33#define XFS_FSTRM_KTRACE_UPDATE 5
34#define XFS_FSTRM_KTRACE_FREE 6
35#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7
36#define XFS_FSTRM_KTRACE_ASSOCIATE 8
37#define XFS_FSTRM_KTRACE_MOVEAG 9
38#define XFS_FSTRM_KTRACE_ORPHAN 10
39
40#define XFS_FSTRM_KTRACE_SIZE 16384
41extern ktrace_t *xfs_filestreams_trace_buf;
42
43#endif
44
45/*
46 * Allocation group filestream associations are tracked with per-ag atomic
47 * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a
48 * particular AG already has active filestreams associated with it. The mount
49 * point's m_peraglock is used to protect these counters from per-ag array
50 * re-allocation during a growfs operation. When xfs_growfs_data_private() is
51 * about to reallocate the array, it calls xfs_filestream_flush() with the
52 * m_peraglock held in write mode.
53 *
54 * Since xfs_mru_cache_flush() guarantees that all the free functions for all
55 * the cache elements have finished executing before it returns, it's safe for
56 * the free functions to use the atomic counters without m_peraglock protection.
57 * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
58 * whether it was called with the m_peraglock held in read mode, write mode or
59 * not held at all. The race condition this addresses is the following:
60 *
61 * - The work queue scheduler fires and pulls a filestream directory cache
62 * element off the LRU end of the cache for deletion, then gets pre-empted.
63 * - A growfs operation grabs the m_peraglock in write mode, flushes all the
64 * remaining items from the cache and reallocates the mount point's per-ag
65 * array, resetting all the counters to zero.
66 * - The work queue thread resumes and calls the free function for the element
67 * it started cleaning up earlier. In the process it decrements the
68 * filestreams counter for an AG that now has no references.
69 *
70 * With a shrinkfs feature, the above scenario could panic the system.
71 *
72 * All other uses of the following macros should be protected by either the
73 * m_peraglock held in read mode, or the cache's internal locking exposed by the
74 * interval between a call to xfs_mru_cache_lookup() and a call to
75 * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
76 * when new elements are added to the cache.
77 *
78 * Combined, these locking rules ensure that no associations will ever exist in
79 * the cache that reference per-ag array elements that have since been
80 * reallocated.
81 */
/* Read AG agno's filestream reference count without modifying it. */
STATIC_INLINE int
xfs_filestream_peek_ag(
	xfs_mount_t	*mp,
	xfs_agnumber_t	agno)
{
	return atomic_read(&mp->m_perag[agno].pagf_fstrms);
}
89
/*
 * Take a filestream reference on AG agno; returns the new count so callers
 * can detect whether the AG was already in use (result > 1).
 */
STATIC_INLINE int
xfs_filestream_get_ag(
	xfs_mount_t	*mp,
	xfs_agnumber_t	agno)
{
	return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms);
}
97
/* Drop a filestream reference on AG agno; returns the new count. */
STATIC_INLINE int
xfs_filestream_put_ag(
	xfs_mount_t	*mp,
	xfs_agnumber_t	agno)
{
	return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms);
}
105
106/* allocation selection flags */
107typedef enum xfs_fstrm_alloc {
108 XFS_PICK_USERDATA = 1,
109 XFS_PICK_LOWSPACE = 2,
110} xfs_fstrm_alloc_t;
111
112/* prototypes for filestream.c */
113int xfs_filestream_init(void);
114void xfs_filestream_uninit(void);
115int xfs_filestream_mount(struct xfs_mount *mp);
116void xfs_filestream_unmount(struct xfs_mount *mp);
117void xfs_filestream_flush(struct xfs_mount *mp);
118xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
119int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
120void xfs_filestream_deassociate(struct xfs_inode *ip);
121int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
122
123
/*
 * Should the filestreams allocator be used for this inode?  True when the
 * filesystem is mounted with -o filestreams, the inode has the in-core
 * XFS_IFILESTREAM flag set, or the on-disk FILESTREAM diflag is set.
 */
STATIC_INLINE int
xfs_inode_is_filestream(
	struct xfs_inode	*ip)
{
	return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
		xfs_iflags_test(ip, XFS_IFILESTREAM) ||
		(ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
}
133
134#endif /* __KERNEL__ */
135
136#endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 1335449841..ec3c9c27e0 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -66,6 +66,7 @@ struct fsxattr {
66#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ 66#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
67#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ 67#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
68#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ 68#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
69#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
69#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ 70#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
70 71
71/* 72/*
@@ -238,6 +239,7 @@ typedef struct xfs_fsop_resblks {
238#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ 239#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
239#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ 240#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
240#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ 241#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
242#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
241 243
242 244
243/* 245/*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b599e6be9e..432e82347e 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -44,6 +44,7 @@
44#include "xfs_trans_space.h" 44#include "xfs_trans_space.h"
45#include "xfs_rtalloc.h" 45#include "xfs_rtalloc.h"
46#include "xfs_rw.h" 46#include "xfs_rw.h"
47#include "xfs_filestream.h"
47 48
48/* 49/*
49 * File system operations 50 * File system operations
@@ -94,6 +95,8 @@ xfs_fs_geometry(
94 XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | 95 XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
95 (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? 96 (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
96 XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | 97 XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
98 (xfs_sb_version_haslazysbcount(&mp->m_sb) ?
99 XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
97 (XFS_SB_VERSION_HASATTR2(&mp->m_sb) ? 100 (XFS_SB_VERSION_HASATTR2(&mp->m_sb) ?
98 XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); 101 XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
99 geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? 102 geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
@@ -140,6 +143,8 @@ xfs_growfs_data_private(
140 pct = in->imaxpct; 143 pct = in->imaxpct;
141 if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) 144 if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
142 return XFS_ERROR(EINVAL); 145 return XFS_ERROR(EINVAL);
146 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
147 return error;
143 dpct = pct - mp->m_sb.sb_imax_pct; 148 dpct = pct - mp->m_sb.sb_imax_pct;
144 error = xfs_read_buf(mp, mp->m_ddev_targp, 149 error = xfs_read_buf(mp, mp->m_ddev_targp,
145 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 150 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
@@ -161,6 +166,7 @@ xfs_growfs_data_private(
161 new = nb - mp->m_sb.sb_dblocks; 166 new = nb - mp->m_sb.sb_dblocks;
162 oagcount = mp->m_sb.sb_agcount; 167 oagcount = mp->m_sb.sb_agcount;
163 if (nagcount > oagcount) { 168 if (nagcount > oagcount) {
169 xfs_filestream_flush(mp);
164 down_write(&mp->m_peraglock); 170 down_write(&mp->m_peraglock);
165 mp->m_perag = kmem_realloc(mp->m_perag, 171 mp->m_perag = kmem_realloc(mp->m_perag,
166 sizeof(xfs_perag_t) * nagcount, 172 sizeof(xfs_perag_t) * nagcount,
@@ -173,6 +179,7 @@ xfs_growfs_data_private(
173 up_write(&mp->m_peraglock); 179 up_write(&mp->m_peraglock);
174 } 180 }
175 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); 181 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
182 tp->t_flags |= XFS_TRANS_RESERVE;
176 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), 183 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
177 XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { 184 XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
178 xfs_trans_cancel(tp, 0); 185 xfs_trans_cancel(tp, 0);
@@ -328,6 +335,7 @@ xfs_growfs_data_private(
328 be32_add(&agf->agf_length, new); 335 be32_add(&agf->agf_length, new);
329 ASSERT(be32_to_cpu(agf->agf_length) == 336 ASSERT(be32_to_cpu(agf->agf_length) ==
330 be32_to_cpu(agi->agi_length)); 337 be32_to_cpu(agi->agi_length));
338 xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
331 /* 339 /*
332 * Free the new space. 340 * Free the new space.
333 */ 341 */
@@ -494,8 +502,9 @@ xfs_reserve_blocks(
494 unsigned long s; 502 unsigned long s;
495 503
496 /* If inval is null, report current values and return */ 504 /* If inval is null, report current values and return */
497
498 if (inval == (__uint64_t *)NULL) { 505 if (inval == (__uint64_t *)NULL) {
506 if (!outval)
507 return EINVAL;
499 outval->resblks = mp->m_resblks; 508 outval->resblks = mp->m_resblks;
500 outval->resblks_avail = mp->m_resblks_avail; 509 outval->resblks_avail = mp->m_resblks_avail;
501 return 0; 510 return 0;
@@ -558,8 +567,10 @@ retry:
558 } 567 }
559 } 568 }
560out: 569out:
561 outval->resblks = mp->m_resblks; 570 if (outval) {
562 outval->resblks_avail = mp->m_resblks_avail; 571 outval->resblks = mp->m_resblks;
572 outval->resblks_avail = mp->m_resblks_avail;
573 }
563 XFS_SB_UNLOCK(mp, s); 574 XFS_SB_UNLOCK(mp, s);
564 575
565 if (fdblks_delta) { 576 if (fdblks_delta) {
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index b5feb3e771..f943368c9b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -123,6 +123,7 @@ xfs_ialloc_ag_alloc(
123 int blks_per_cluster; /* fs blocks per inode cluster */ 123 int blks_per_cluster; /* fs blocks per inode cluster */
124 xfs_btree_cur_t *cur; /* inode btree cursor */ 124 xfs_btree_cur_t *cur; /* inode btree cursor */
125 xfs_daddr_t d; /* disk addr of buffer */ 125 xfs_daddr_t d; /* disk addr of buffer */
126 xfs_agnumber_t agno;
126 int error; 127 int error;
127 xfs_buf_t *fbuf; /* new free inodes' buffer */ 128 xfs_buf_t *fbuf; /* new free inodes' buffer */
128 xfs_dinode_t *free; /* new free inode structure */ 129 xfs_dinode_t *free; /* new free inode structure */
@@ -302,15 +303,15 @@ xfs_ialloc_ag_alloc(
302 } 303 }
303 be32_add(&agi->agi_count, newlen); 304 be32_add(&agi->agi_count, newlen);
304 be32_add(&agi->agi_freecount, newlen); 305 be32_add(&agi->agi_freecount, newlen);
306 agno = be32_to_cpu(agi->agi_seqno);
305 down_read(&args.mp->m_peraglock); 307 down_read(&args.mp->m_peraglock);
306 args.mp->m_perag[be32_to_cpu(agi->agi_seqno)].pagi_freecount += newlen; 308 args.mp->m_perag[agno].pagi_freecount += newlen;
307 up_read(&args.mp->m_peraglock); 309 up_read(&args.mp->m_peraglock);
308 agi->agi_newino = cpu_to_be32(newino); 310 agi->agi_newino = cpu_to_be32(newino);
309 /* 311 /*
310 * Insert records describing the new inode chunk into the btree. 312 * Insert records describing the new inode chunk into the btree.
311 */ 313 */
312 cur = xfs_btree_init_cursor(args.mp, tp, agbp, 314 cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno,
313 be32_to_cpu(agi->agi_seqno),
314 XFS_BTNUM_INO, (xfs_inode_t *)0, 0); 315 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
315 for (thisino = newino; 316 for (thisino = newino;
316 thisino < newino + newlen; 317 thisino < newino + newlen;
@@ -1387,6 +1388,7 @@ xfs_ialloc_read_agi(
1387 pag = &mp->m_perag[agno]; 1388 pag = &mp->m_perag[agno];
1388 if (!pag->pagi_init) { 1389 if (!pag->pagi_init) {
1389 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1390 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1391 pag->pagi_count = be32_to_cpu(agi->agi_count);
1390 pag->pagi_init = 1; 1392 pag->pagi_init = 1;
1391 } else { 1393 } else {
1392 /* 1394 /*
@@ -1410,3 +1412,23 @@ xfs_ialloc_read_agi(
1410 *bpp = bp; 1412 *bpp = bp;
1411 return 0; 1413 return 0;
1412} 1414}
1415
1416/*
1417 * Read in the agi to initialise the per-ag data in the mount structure
1418 */
1419int
1420xfs_ialloc_pagi_init(
1421 xfs_mount_t *mp, /* file system mount structure */
1422 xfs_trans_t *tp, /* transaction pointer */
1423 xfs_agnumber_t agno) /* allocation group number */
1424{
1425 xfs_buf_t *bp = NULL;
1426 int error;
1427
1428 error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
1429 if (error)
1430 return error;
1431 if (bp)
1432 xfs_trans_brelse(tp, bp);
1433 return 0;
1434}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 7f5debe1ac..97f4040931 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -149,6 +149,16 @@ xfs_ialloc_read_agi(
149 xfs_agnumber_t agno, /* allocation group number */ 149 xfs_agnumber_t agno, /* allocation group number */
150 struct xfs_buf **bpp); /* allocation group hdr buf */ 150 struct xfs_buf **bpp); /* allocation group hdr buf */
151 151
152/*
153 * Read in the allocation group header to initialise the per-ag data
154 * in the mount structure
155 */
156int
157xfs_ialloc_pagi_init(
158 struct xfs_mount *mp, /* file system mount structure */
159 struct xfs_trans *tp, /* transaction pointer */
160 xfs_agnumber_t agno); /* allocation group number */
161
152#endif /* __KERNEL__ */ 162#endif /* __KERNEL__ */
153 163
154#endif /* __XFS_IALLOC_H__ */ 164#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3ca5d43b83..cdc4c28926 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -48,7 +48,9 @@
48#include "xfs_dir2_trace.h" 48#include "xfs_dir2_trace.h"
49#include "xfs_quota.h" 49#include "xfs_quota.h"
50#include "xfs_acl.h" 50#include "xfs_acl.h"
51#include "xfs_filestream.h"
51 52
53#include <linux/log2.h>
52 54
53kmem_zone_t *xfs_ifork_zone; 55kmem_zone_t *xfs_ifork_zone;
54kmem_zone_t *xfs_inode_zone; 56kmem_zone_t *xfs_inode_zone;
@@ -643,8 +645,7 @@ xfs_iformat_extents(
643 ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1), 645 ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1),
644 ARCH_CONVERT); 646 ARCH_CONVERT);
645 } 647 }
646 xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex, 648 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
647 whichfork);
648 if (whichfork != XFS_DATA_FORK || 649 if (whichfork != XFS_DATA_FORK ||
649 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) 650 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
650 if (unlikely(xfs_check_nostate_extents( 651 if (unlikely(xfs_check_nostate_extents(
@@ -817,6 +818,8 @@ _xfs_dic2xflags(
817 flags |= XFS_XFLAG_EXTSZINHERIT; 818 flags |= XFS_XFLAG_EXTSZINHERIT;
818 if (di_flags & XFS_DIFLAG_NODEFRAG) 819 if (di_flags & XFS_DIFLAG_NODEFRAG)
819 flags |= XFS_XFLAG_NODEFRAG; 820 flags |= XFS_XFLAG_NODEFRAG;
821 if (di_flags & XFS_DIFLAG_FILESTREAM)
822 flags |= XFS_XFLAG_FILESTREAM;
820 } 823 }
821 824
822 return flags; 825 return flags;
@@ -1074,6 +1077,11 @@ xfs_iread_extents(
1074 * also returns the [locked] bp pointing to the head of the freelist 1077 * also returns the [locked] bp pointing to the head of the freelist
1075 * as ialloc_context. The caller should hold this buffer across 1078 * as ialloc_context. The caller should hold this buffer across
1076 * the commit and pass it back into this routine on the second call. 1079 * the commit and pass it back into this routine on the second call.
1080 *
1081 * If we are allocating quota inodes, we do not have a parent inode
1082 * to attach to or associate with (i.e. pip == NULL) because they
1083 * are not linked into the directory structure - they are attached
1084 * directly to the superblock - and so have no parent.
1077 */ 1085 */
1078int 1086int
1079xfs_ialloc( 1087xfs_ialloc(
@@ -1099,7 +1107,7 @@ xfs_ialloc(
1099 * Call the space management code to pick 1107 * Call the space management code to pick
1100 * the on-disk inode to be allocated. 1108 * the on-disk inode to be allocated.
1101 */ 1109 */
1102 error = xfs_dialloc(tp, pip->i_ino, mode, okalloc, 1110 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
1103 ialloc_context, call_again, &ino); 1111 ialloc_context, call_again, &ino);
1104 if (error != 0) { 1112 if (error != 0) {
1105 return error; 1113 return error;
@@ -1150,10 +1158,10 @@ xfs_ialloc(
1150 /* 1158 /*
1151 * Project ids won't be stored on disk if we are using a version 1 inode. 1159 * Project ids won't be stored on disk if we are using a version 1 inode.
1152 */ 1160 */
1153 if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) 1161 if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
1154 xfs_bump_ino_vers2(tp, ip); 1162 xfs_bump_ino_vers2(tp, ip);
1155 1163
1156 if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { 1164 if (pip && XFS_INHERIT_GID(pip, vp->v_vfsp)) {
1157 ip->i_d.di_gid = pip->i_d.di_gid; 1165 ip->i_d.di_gid = pip->i_d.di_gid;
1158 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { 1166 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
1159 ip->i_d.di_mode |= S_ISGID; 1167 ip->i_d.di_mode |= S_ISGID;
@@ -1195,8 +1203,16 @@ xfs_ialloc(
1195 flags |= XFS_ILOG_DEV; 1203 flags |= XFS_ILOG_DEV;
1196 break; 1204 break;
1197 case S_IFREG: 1205 case S_IFREG:
1206 if (pip && xfs_inode_is_filestream(pip)) {
1207 error = xfs_filestream_associate(pip, ip);
1208 if (error < 0)
1209 return -error;
1210 if (!error)
1211 xfs_iflags_set(ip, XFS_IFILESTREAM);
1212 }
1213 /* fall through */
1198 case S_IFDIR: 1214 case S_IFDIR:
1199 if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1215 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1200 uint di_flags = 0; 1216 uint di_flags = 0;
1201 1217
1202 if ((mode & S_IFMT) == S_IFDIR) { 1218 if ((mode & S_IFMT) == S_IFDIR) {
@@ -1233,6 +1249,8 @@ xfs_ialloc(
1233 if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && 1249 if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
1234 xfs_inherit_nodefrag) 1250 xfs_inherit_nodefrag)
1235 di_flags |= XFS_DIFLAG_NODEFRAG; 1251 di_flags |= XFS_DIFLAG_NODEFRAG;
1252 if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
1253 di_flags |= XFS_DIFLAG_FILESTREAM;
1236 ip->i_d.di_flags |= di_flags; 1254 ip->i_d.di_flags |= di_flags;
1237 } 1255 }
1238 /* FALLTHROUGH */ 1256 /* FALLTHROUGH */
@@ -2875,9 +2893,6 @@ xfs_iextents_copy(
2875 int copied; 2893 int copied;
2876 xfs_bmbt_rec_t *dest_ep; 2894 xfs_bmbt_rec_t *dest_ep;
2877 xfs_bmbt_rec_t *ep; 2895 xfs_bmbt_rec_t *ep;
2878#ifdef XFS_BMAP_TRACE
2879 static char fname[] = "xfs_iextents_copy";
2880#endif
2881 int i; 2896 int i;
2882 xfs_ifork_t *ifp; 2897 xfs_ifork_t *ifp;
2883 int nrecs; 2898 int nrecs;
@@ -2888,7 +2903,7 @@ xfs_iextents_copy(
2888 ASSERT(ifp->if_bytes > 0); 2903 ASSERT(ifp->if_bytes > 0);
2889 2904
2890 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2905 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2891 xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork); 2906 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2892 ASSERT(nrecs > 0); 2907 ASSERT(nrecs > 0);
2893 2908
2894 /* 2909 /*
@@ -4184,7 +4199,7 @@ xfs_iext_realloc_direct(
4184 ifp->if_bytes = new_size; 4199 ifp->if_bytes = new_size;
4185 return; 4200 return;
4186 } 4201 }
4187 if ((new_size & (new_size - 1)) != 0) { 4202 if (!is_power_of_2(new_size)){
4188 rnew_size = xfs_iroundup(new_size); 4203 rnew_size = xfs_iroundup(new_size);
4189 } 4204 }
4190 if (rnew_size != ifp->if_real_bytes) { 4205 if (rnew_size != ifp->if_real_bytes) {
@@ -4207,7 +4222,7 @@ xfs_iext_realloc_direct(
4207 */ 4222 */
4208 else { 4223 else {
4209 new_size += ifp->if_bytes; 4224 new_size += ifp->if_bytes;
4210 if ((new_size & (new_size - 1)) != 0) { 4225 if (!is_power_of_2(new_size)) {
4211 rnew_size = xfs_iroundup(new_size); 4226 rnew_size = xfs_iroundup(new_size);
4212 } 4227 }
4213 xfs_iext_inline_to_direct(ifp, rnew_size); 4228 xfs_iext_inline_to_direct(ifp, rnew_size);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f75afecef8..012dfd4a95 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -379,6 +379,7 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
379#define XFS_ISTALE 0x0010 /* inode has been staled */ 379#define XFS_ISTALE 0x0010 /* inode has been staled */
380#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ 380#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */
381#define XFS_INEW 0x0040 381#define XFS_INEW 0x0040
382#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */
382 383
383/* 384/*
384 * Flags for inode locking. 385 * Flags for inode locking.
@@ -414,19 +415,22 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
414 * gets a lockdep subclass of 1 and the second lock will have a lockdep 415 * gets a lockdep subclass of 1 and the second lock will have a lockdep
415 * subclass of 0. 416 * subclass of 0.
416 * 417 *
417 * XFS_I[O]LOCK_INUMORDER - for locking several inodes at the some time 418 * XFS_LOCK_INUMORDER - for locking several inodes at the some time
418 * with xfs_lock_inodes(). This flag is used as the starting subclass 419 * with xfs_lock_inodes(). This flag is used as the starting subclass
419 * and each subsequent lock acquired will increment the subclass by one. 420 * and each subsequent lock acquired will increment the subclass by one.
420 * So the first lock acquired will have a lockdep subclass of 2, the 421 * So the first lock acquired will have a lockdep subclass of 2, the
421 * second lock will have a lockdep subclass of 3, and so on. 422 * second lock will have a lockdep subclass of 3, and so on. It is
423 * the responsibility of the class builder to shift this to the correct
424 * portion of the lock_mode lockdep mask.
422 */ 425 */
426#define XFS_LOCK_PARENT 1
427#define XFS_LOCK_INUMORDER 2
428
423#define XFS_IOLOCK_SHIFT 16 429#define XFS_IOLOCK_SHIFT 16
424#define XFS_IOLOCK_PARENT (1 << XFS_IOLOCK_SHIFT) 430#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
425#define XFS_IOLOCK_INUMORDER (2 << XFS_IOLOCK_SHIFT)
426 431
427#define XFS_ILOCK_SHIFT 24 432#define XFS_ILOCK_SHIFT 24
428#define XFS_ILOCK_PARENT (1 << XFS_ILOCK_SHIFT) 433#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
429#define XFS_ILOCK_INUMORDER (2 << XFS_ILOCK_SHIFT)
430 434
431#define XFS_IOLOCK_DEP_MASK 0x00ff0000 435#define XFS_IOLOCK_DEP_MASK 0x00ff0000
432#define XFS_ILOCK_DEP_MASK 0xff000000 436#define XFS_ILOCK_DEP_MASK 0xff000000
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3f2b9f2a7b..bf57b75acb 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -451,19 +451,14 @@ xfs_iomap_write_direct(
451 return XFS_ERROR(error); 451 return XFS_ERROR(error);
452 452
453 rt = XFS_IS_REALTIME_INODE(ip); 453 rt = XFS_IS_REALTIME_INODE(ip);
454 if (unlikely(rt)) { 454 extsz = xfs_get_extsz_hint(ip);
455 if (!(extsz = ip->i_d.di_extsize))
456 extsz = mp->m_sb.sb_rextsize;
457 } else {
458 extsz = ip->i_d.di_extsize;
459 }
460 455
461 isize = ip->i_size; 456 isize = ip->i_size;
462 if (io->io_new_size > isize) 457 if (io->io_new_size > isize)
463 isize = io->io_new_size; 458 isize = io->io_new_size;
464 459
465 offset_fsb = XFS_B_TO_FSBT(mp, offset); 460 offset_fsb = XFS_B_TO_FSBT(mp, offset);
466 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 461 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
467 if ((offset + count) > isize) { 462 if ((offset + count) > isize) {
468 error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, 463 error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz,
469 &last_fsb); 464 &last_fsb);
@@ -489,13 +484,13 @@ xfs_iomap_write_direct(
489 if (unlikely(rt)) { 484 if (unlikely(rt)) {
490 resrtextents = qblocks = resaligned; 485 resrtextents = qblocks = resaligned;
491 resrtextents /= mp->m_sb.sb_rextsize; 486 resrtextents /= mp->m_sb.sb_rextsize;
492 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 487 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
493 quota_flag = XFS_QMOPT_RES_RTBLKS; 488 quota_flag = XFS_QMOPT_RES_RTBLKS;
494 } else { 489 } else {
495 resrtextents = 0; 490 resrtextents = 0;
496 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); 491 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
497 quota_flag = XFS_QMOPT_RES_REGBLKS; 492 quota_flag = XFS_QMOPT_RES_REGBLKS;
498 } 493 }
499 494
500 /* 495 /*
501 * Allocate and setup the transaction 496 * Allocate and setup the transaction
@@ -666,13 +661,7 @@ xfs_iomap_write_delay(
666 if (error) 661 if (error)
667 return XFS_ERROR(error); 662 return XFS_ERROR(error);
668 663
669 if (XFS_IS_REALTIME_INODE(ip)) { 664 extsz = xfs_get_extsz_hint(ip);
670 if (!(extsz = ip->i_d.di_extsize))
671 extsz = mp->m_sb.sb_rextsize;
672 } else {
673 extsz = ip->i_d.di_extsize;
674 }
675
676 offset_fsb = XFS_B_TO_FSBT(mp, offset); 665 offset_fsb = XFS_B_TO_FSBT(mp, offset);
677 666
678retry: 667retry:
@@ -788,18 +777,12 @@ xfs_iomap_write_allocate(
788 nimaps = 0; 777 nimaps = 0;
789 while (nimaps == 0) { 778 while (nimaps == 0) {
790 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); 779 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
780 tp->t_flags |= XFS_TRANS_RESERVE;
791 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); 781 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
792 error = xfs_trans_reserve(tp, nres, 782 error = xfs_trans_reserve(tp, nres,
793 XFS_WRITE_LOG_RES(mp), 783 XFS_WRITE_LOG_RES(mp),
794 0, XFS_TRANS_PERM_LOG_RES, 784 0, XFS_TRANS_PERM_LOG_RES,
795 XFS_WRITE_LOG_COUNT); 785 XFS_WRITE_LOG_COUNT);
796 if (error == ENOSPC) {
797 error = xfs_trans_reserve(tp, 0,
798 XFS_WRITE_LOG_RES(mp),
799 0,
800 XFS_TRANS_PERM_LOG_RES,
801 XFS_WRITE_LOG_COUNT);
802 }
803 if (error) { 786 if (error) {
804 xfs_trans_cancel(tp, 0); 787 xfs_trans_cancel(tp, 0);
805 return XFS_ERROR(error); 788 return XFS_ERROR(error);
@@ -917,8 +900,8 @@ xfs_iomap_write_unwritten(
917 * from unwritten to real. Do allocations in a loop until 900 * from unwritten to real. Do allocations in a loop until
918 * we have covered the range passed in. 901 * we have covered the range passed in.
919 */ 902 */
920
921 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); 903 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
904 tp->t_flags |= XFS_TRANS_RESERVE;
922 error = xfs_trans_reserve(tp, resblks, 905 error = xfs_trans_reserve(tp, resblks,
923 XFS_WRITE_LOG_RES(mp), 0, 906 XFS_WRITE_LOG_RES(mp), 0,
924 XFS_TRANS_PERM_LOG_RES, 907 XFS_TRANS_PERM_LOG_RES,
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index e725ddd3de..4c2454bcc7 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -202,6 +202,16 @@ xfs_bulkstat_one_dinode(
202 return 0; 202 return 0;
203} 203}
204 204
205STATIC int
206xfs_bulkstat_one_fmt(
207 void __user *ubuffer,
208 const xfs_bstat_t *buffer)
209{
210 if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
211 return -EFAULT;
212 return sizeof(*buffer);
213}
214
205/* 215/*
206 * Return stat information for one inode. 216 * Return stat information for one inode.
207 * Return 0 if ok, else errno. 217 * Return 0 if ok, else errno.
@@ -221,6 +231,7 @@ xfs_bulkstat_one(
221 xfs_bstat_t *buf; /* return buffer */ 231 xfs_bstat_t *buf; /* return buffer */
222 int error = 0; /* error value */ 232 int error = 0; /* error value */
223 xfs_dinode_t *dip; /* dinode inode pointer */ 233 xfs_dinode_t *dip; /* dinode inode pointer */
234 bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt;
224 235
225 dip = (xfs_dinode_t *)dibuff; 236 dip = (xfs_dinode_t *)dibuff;
226 *stat = BULKSTAT_RV_NOTHING; 237 *stat = BULKSTAT_RV_NOTHING;
@@ -243,14 +254,15 @@ xfs_bulkstat_one(
243 xfs_bulkstat_one_dinode(mp, ino, dip, buf); 254 xfs_bulkstat_one_dinode(mp, ino, dip, buf);
244 } 255 }
245 256
246 if (copy_to_user(buffer, buf, sizeof(*buf))) { 257 error = formatter(buffer, buf);
258 if (error < 0) {
247 error = EFAULT; 259 error = EFAULT;
248 goto out_free; 260 goto out_free;
249 } 261 }
250 262
251 *stat = BULKSTAT_RV_DIDONE; 263 *stat = BULKSTAT_RV_DIDONE;
252 if (ubused) 264 if (ubused)
253 *ubused = sizeof(*buf); 265 *ubused = error;
254 266
255 out_free: 267 out_free:
256 kmem_free(buf, sizeof(*buf)); 268 kmem_free(buf, sizeof(*buf));
@@ -748,6 +760,19 @@ xfs_bulkstat_single(
748 return 0; 760 return 0;
749} 761}
750 762
763int
764xfs_inumbers_fmt(
765 void __user *ubuffer, /* buffer to write to */
766 const xfs_inogrp_t *buffer, /* buffer to read from */
767 long count, /* # of elements to read */
768 long *written) /* # of bytes written */
769{
770 if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer)))
771 return -EFAULT;
772 *written = count * sizeof(*buffer);
773 return 0;
774}
775
751/* 776/*
752 * Return inode number table for the filesystem. 777 * Return inode number table for the filesystem.
753 */ 778 */
@@ -756,7 +781,8 @@ xfs_inumbers(
756 xfs_mount_t *mp, /* mount point for filesystem */ 781 xfs_mount_t *mp, /* mount point for filesystem */
757 xfs_ino_t *lastino, /* last inode returned */ 782 xfs_ino_t *lastino, /* last inode returned */
758 int *count, /* size of buffer/count returned */ 783 int *count, /* size of buffer/count returned */
759 xfs_inogrp_t __user *ubuffer)/* buffer with inode descriptions */ 784 void __user *ubuffer,/* buffer with inode descriptions */
785 inumbers_fmt_pf formatter)
760{ 786{
761 xfs_buf_t *agbp; 787 xfs_buf_t *agbp;
762 xfs_agino_t agino; 788 xfs_agino_t agino;
@@ -835,12 +861,12 @@ xfs_inumbers(
835 bufidx++; 861 bufidx++;
836 left--; 862 left--;
837 if (bufidx == bcount) { 863 if (bufidx == bcount) {
838 if (copy_to_user(ubuffer, buffer, 864 long written;
839 bufidx * sizeof(*buffer))) { 865 if (formatter(ubuffer, buffer, bufidx, &written)) {
840 error = XFS_ERROR(EFAULT); 866 error = XFS_ERROR(EFAULT);
841 break; 867 break;
842 } 868 }
843 ubuffer += bufidx; 869 ubuffer += written;
844 *count += bufidx; 870 *count += bufidx;
845 bufidx = 0; 871 bufidx = 0;
846 } 872 }
@@ -862,8 +888,8 @@ xfs_inumbers(
862 } 888 }
863 if (!error) { 889 if (!error) {
864 if (bufidx) { 890 if (bufidx) {
865 if (copy_to_user(ubuffer, buffer, 891 long written;
866 bufidx * sizeof(*buffer))) 892 if (formatter(ubuffer, buffer, bufidx, &written))
867 error = XFS_ERROR(EFAULT); 893 error = XFS_ERROR(EFAULT);
868 else 894 else
869 *count += bufidx; 895 *count += bufidx;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index f25a28862a..a1f18fce9b 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -69,6 +69,10 @@ xfs_bulkstat_single(
69 char __user *buffer, 69 char __user *buffer,
70 int *done); 70 int *done);
71 71
72typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
73 void __user *ubuffer, /* buffer to write to */
74 const xfs_bstat_t *buffer); /* buffer to read from */
75
72int 76int
73xfs_bulkstat_one( 77xfs_bulkstat_one(
74 xfs_mount_t *mp, 78 xfs_mount_t *mp,
@@ -86,11 +90,25 @@ xfs_internal_inum(
86 xfs_mount_t *mp, 90 xfs_mount_t *mp,
87 xfs_ino_t ino); 91 xfs_ino_t ino);
88 92
93typedef int (*inumbers_fmt_pf)(
94 void __user *ubuffer, /* buffer to write to */
95 const xfs_inogrp_t *buffer, /* buffer to read from */
96 long count, /* # of elements to read */
97 long *written); /* # of bytes written */
98
99int
100xfs_inumbers_fmt(
101 void __user *ubuffer, /* buffer to write to */
102 const xfs_inogrp_t *buffer, /* buffer to read from */
103 long count, /* # of elements to read */
104 long *written); /* # of bytes written */
105
89int /* error status */ 106int /* error status */
90xfs_inumbers( 107xfs_inumbers(
91 xfs_mount_t *mp, /* mount point for filesystem */ 108 xfs_mount_t *mp, /* mount point for filesystem */
92 xfs_ino_t *last, /* last inode returned */ 109 xfs_ino_t *last, /* last inode returned */
93 int *count, /* size of buffer/count returned */ 110 int *count, /* size of buffer/count returned */
94 xfs_inogrp_t __user *buffer);/* buffer with inode info */ 111 void __user *buffer, /* buffer with inode info */
112 inumbers_fmt_pf formatter);
95 113
96#endif /* __XFS_ITABLE_H__ */ 114#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c48bf61f17..9d4c4fbeb3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -817,10 +817,8 @@ xfs_log_need_covered(xfs_mount_t *mp)
817 SPLDECL(s); 817 SPLDECL(s);
818 int needed = 0, gen; 818 int needed = 0, gen;
819 xlog_t *log = mp->m_log; 819 xlog_t *log = mp->m_log;
820 bhv_vfs_t *vfsp = XFS_MTOVFS(mp);
821 820
822 if (vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) || 821 if (!xfs_fs_writable(mp))
823 (vfsp->vfs_flag & VFS_RDONLY))
824 return 0; 822 return 0;
825 823
826 s = LOG_LOCK(log); 824 s = LOG_LOCK(log);
@@ -967,14 +965,16 @@ xlog_iodone(xfs_buf_t *bp)
967 } else if (iclog->ic_state & XLOG_STATE_IOERROR) { 965 } else if (iclog->ic_state & XLOG_STATE_IOERROR) {
968 aborted = XFS_LI_ABORTED; 966 aborted = XFS_LI_ABORTED;
969 } 967 }
968
969 /* log I/O is always issued ASYNC */
970 ASSERT(XFS_BUF_ISASYNC(bp));
970 xlog_state_done_syncing(iclog, aborted); 971 xlog_state_done_syncing(iclog, aborted);
971 if (!(XFS_BUF_ISASYNC(bp))) { 972 /*
972 /* 973 * do not reference the buffer (bp) here as we could race
973 * Corresponding psema() will be done in bwrite(). If we don't 974 * with it being freed after writing the unmount record to the
974 * vsema() here, panic. 975 * log.
975 */ 976 */
976 XFS_BUF_V_IODONESEMA(bp); 977
977 }
978} /* xlog_iodone */ 978} /* xlog_iodone */
979 979
980/* 980/*
@@ -1199,11 +1199,18 @@ xlog_alloc_log(xfs_mount_t *mp,
1199 *iclogp = (xlog_in_core_t *) 1199 *iclogp = (xlog_in_core_t *)
1200 kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP); 1200 kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
1201 iclog = *iclogp; 1201 iclog = *iclogp;
1202 iclog->hic_data = (xlog_in_core_2_t *)
1203 kmem_zalloc(iclogsize, KM_SLEEP | KM_LARGE);
1204
1205 iclog->ic_prev = prev_iclog; 1202 iclog->ic_prev = prev_iclog;
1206 prev_iclog = iclog; 1203 prev_iclog = iclog;
1204
1205 bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
1206 if (!XFS_BUF_CPSEMA(bp))
1207 ASSERT(0);
1208 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1209 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1210 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1211 iclog->ic_bp = bp;
1212 iclog->hic_data = bp->b_addr;
1213
1207 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); 1214 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
1208 1215
1209 head = &iclog->ic_header; 1216 head = &iclog->ic_header;
@@ -1216,11 +1223,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1216 INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT); 1223 INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT);
1217 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); 1224 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
1218 1225
1219 bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
1220 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1221 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1222 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1223 iclog->ic_bp = bp;
1224 1226
1225 iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; 1227 iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
1226 iclog->ic_state = XLOG_STATE_ACTIVE; 1228 iclog->ic_state = XLOG_STATE_ACTIVE;
@@ -1432,7 +1434,7 @@ xlog_sync(xlog_t *log,
1432 } else { 1434 } else {
1433 iclog->ic_bwritecnt = 1; 1435 iclog->ic_bwritecnt = 1;
1434 } 1436 }
1435 XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); 1437 XFS_BUF_SET_COUNT(bp, count);
1436 XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ 1438 XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */
1437 XFS_BUF_ZEROFLAGS(bp); 1439 XFS_BUF_ZEROFLAGS(bp);
1438 XFS_BUF_BUSY(bp); 1440 XFS_BUF_BUSY(bp);
@@ -1528,7 +1530,6 @@ xlog_dealloc_log(xlog_t *log)
1528 } 1530 }
1529#endif 1531#endif
1530 next_iclog = iclog->ic_next; 1532 next_iclog = iclog->ic_next;
1531 kmem_free(iclog->hic_data, log->l_iclog_size);
1532 kmem_free(iclog, sizeof(xlog_in_core_t)); 1533 kmem_free(iclog, sizeof(xlog_in_core_t));
1533 iclog = next_iclog; 1534 iclog = next_iclog;
1534 } 1535 }
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 080fabf61c..fddbb091a8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -927,6 +927,14 @@ xlog_find_tail(
927 ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle, 927 ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle,
928 after_umount_blk); 928 after_umount_blk);
929 *tail_blk = after_umount_blk; 929 *tail_blk = after_umount_blk;
930
931 /*
932 * Note that the unmount was clean. If the unmount
933 * was not clean, we need to know this to rebuild the
934 * superblock counters from the perag headers if we
935 * have a filesystem using non-persistent counters.
936 */
937 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
930 } 938 }
931 } 939 }
932 940
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a96bde6df9..a66b398051 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -202,6 +202,27 @@ xfs_mount_free(
202 kmem_free(mp, sizeof(xfs_mount_t)); 202 kmem_free(mp, sizeof(xfs_mount_t));
203} 203}
204 204
205/*
206 * Check size of device based on the (data/realtime) block count.
207 * Note: this check is used by the growfs code as well as mount.
208 */
209int
210xfs_sb_validate_fsb_count(
211 xfs_sb_t *sbp,
212 __uint64_t nblocks)
213{
214 ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
215 ASSERT(sbp->sb_blocklog >= BBSHIFT);
216
217#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
218 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
219 return E2BIG;
220#else /* Limited by UINT_MAX of sectors */
221 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
222 return E2BIG;
223#endif
224 return 0;
225}
205 226
206/* 227/*
207 * Check the validity of the SB found. 228 * Check the validity of the SB found.
@@ -284,18 +305,8 @@ xfs_mount_validate_sb(
284 return XFS_ERROR(EFSCORRUPTED); 305 return XFS_ERROR(EFSCORRUPTED);
285 } 306 }
286 307
287 ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); 308 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
288 ASSERT(sbp->sb_blocklog >= BBSHIFT); 309 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
289
290#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
291 if (unlikely(
292 (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX ||
293 (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) {
294#else /* Limited by UINT_MAX of sectors */
295 if (unlikely(
296 (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX ||
297 (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) {
298#endif
299 xfs_fs_mount_cmn_err(flags, 310 xfs_fs_mount_cmn_err(flags,
300 "file system too large to be mounted on this system."); 311 "file system too large to be mounted on this system.");
301 return XFS_ERROR(E2BIG); 312 return XFS_ERROR(E2BIG);
@@ -632,6 +643,64 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
632 sbp->sb_inopblock); 643 sbp->sb_inopblock);
633 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; 644 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
634} 645}
646
647/*
648 * xfs_initialize_perag_data
649 *
650 * Read in each per-ag structure so we can count up the number of
651 * allocated inodes, free inodes and used filesystem blocks as this
652 * information is no longer persistent in the superblock. Once we have
653 * this information, write it into the in-core superblock structure.
654 */
655STATIC int
656xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
657{
658 xfs_agnumber_t index;
659 xfs_perag_t *pag;
660 xfs_sb_t *sbp = &mp->m_sb;
661 uint64_t ifree = 0;
662 uint64_t ialloc = 0;
663 uint64_t bfree = 0;
664 uint64_t bfreelst = 0;
665 uint64_t btree = 0;
666 int error;
667 int s;
668
669 for (index = 0; index < agcount; index++) {
670 /*
671 * read the agf, then the agi. This gets us
 672 * all the information we need and populates the
673 * per-ag structures for us.
674 */
675 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
676 if (error)
677 return error;
678
679 error = xfs_ialloc_pagi_init(mp, NULL, index);
680 if (error)
681 return error;
682 pag = &mp->m_perag[index];
683 ifree += pag->pagi_freecount;
684 ialloc += pag->pagi_count;
685 bfree += pag->pagf_freeblks;
686 bfreelst += pag->pagf_flcount;
687 btree += pag->pagf_btreeblks;
688 }
689 /*
690 * Overwrite incore superblock counters with just-read data
691 */
692 s = XFS_SB_LOCK(mp);
693 sbp->sb_ifree = ifree;
694 sbp->sb_icount = ialloc;
695 sbp->sb_fdblocks = bfree + bfreelst + btree;
696 XFS_SB_UNLOCK(mp, s);
697
698 /* Fixup the per-cpu counters as well. */
699 xfs_icsb_reinit_counters(mp);
700
701 return 0;
702}
703
635/* 704/*
636 * xfs_mountfs 705 * xfs_mountfs
637 * 706 *
@@ -656,7 +725,7 @@ xfs_mountfs(
656 bhv_vnode_t *rvp = NULL; 725 bhv_vnode_t *rvp = NULL;
657 int readio_log, writeio_log; 726 int readio_log, writeio_log;
658 xfs_daddr_t d; 727 xfs_daddr_t d;
659 __uint64_t ret64; 728 __uint64_t resblks;
660 __int64_t update_flags; 729 __int64_t update_flags;
661 uint quotamount, quotaflags; 730 uint quotamount, quotaflags;
662 int agno; 731 int agno;
@@ -773,6 +842,7 @@ xfs_mountfs(
773 */ 842 */
774 if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && 843 if ((mfsi_flags & XFS_MFSI_SECOND) == 0 &&
775 (mp->m_flags & XFS_MOUNT_NOUUID) == 0) { 844 (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
845 __uint64_t ret64;
776 if (xfs_uuid_mount(mp)) { 846 if (xfs_uuid_mount(mp)) {
777 error = XFS_ERROR(EINVAL); 847 error = XFS_ERROR(EINVAL);
778 goto error1; 848 goto error1;
@@ -976,6 +1046,34 @@ xfs_mountfs(
976 } 1046 }
977 1047
978 /* 1048 /*
1049 * Now the log is mounted, we know if it was an unclean shutdown or
1050 * not. If it was, with the first phase of recovery has completed, we
1051 * have consistent AG blocks on disk. We have not recovered EFIs yet,
1052 * but they are recovered transactionally in the second recovery phase
1053 * later.
1054 *
1055 * Hence we can safely re-initialise incore superblock counters from
1056 * the per-ag data. These may not be correct if the filesystem was not
1057 * cleanly unmounted, so we need to wait for recovery to finish before
1058 * doing this.
1059 *
1060 * If the filesystem was cleanly unmounted, then we can trust the
1061 * values in the superblock to be correct and we don't need to do
1062 * anything here.
1063 *
1064 * If we are currently making the filesystem, the initialisation will
1065 * fail as the perag data is in an undefined state.
1066 */
1067
1068 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
1069 !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
1070 !mp->m_sb.sb_inprogress) {
1071 error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
1072 if (error) {
1073 goto error2;
1074 }
1075 }
1076 /*
979 * Get and sanity-check the root inode. 1077 * Get and sanity-check the root inode.
980 * Save the pointer to it in the mount structure. 1078 * Save the pointer to it in the mount structure.
981 */ 1079 */
@@ -1044,6 +1142,23 @@ xfs_mountfs(
1044 if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags))) 1142 if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags)))
1045 goto error4; 1143 goto error4;
1046 1144
1145 /*
1146 * Now we are mounted, reserve a small amount of unused space for
1147 * privileged transactions. This is needed so that transaction
1148 * space required for critical operations can dip into this pool
1149 * when at ENOSPC. This is needed for operations like create with
1150 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
1151 * are not allowed to use this reserved space.
1152 *
1153 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
1154 * This may drive us straight to ENOSPC on mount, but that implies
1155 * we were already there on the last unmount.
1156 */
1157 resblks = mp->m_sb.sb_dblocks;
1158 do_div(resblks, 20);
1159 resblks = min_t(__uint64_t, resblks, 1024);
1160 xfs_reserve_blocks(mp, &resblks, NULL);
1161
1047 return 0; 1162 return 0;
1048 1163
1049 error4: 1164 error4:
@@ -1083,7 +1198,19 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
1083#if defined(DEBUG) || defined(INDUCE_IO_ERROR) 1198#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
1084 int64_t fsid; 1199 int64_t fsid;
1085#endif 1200#endif
1201 __uint64_t resblks;
1086 1202
1203 /*
1204 * We can potentially deadlock here if we have an inode cluster
 1205 * that has been freed has its buffer still pinned in memory because
1206 * the transaction is still sitting in a iclog. The stale inodes
1207 * on that buffer will have their flush locks held until the
1208 * transaction hits the disk and the callbacks run. the inode
1209 * flush takes the flush lock unconditionally and with nothing to
1210 * push out the iclog we will never get that unlocked. hence we
1211 * need to force the log first.
1212 */
1213 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1087 xfs_iflush_all(mp); 1214 xfs_iflush_all(mp);
1088 1215
1089 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 1216 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
@@ -1100,10 +1227,26 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
1100 xfs_binval(mp->m_rtdev_targp); 1227 xfs_binval(mp->m_rtdev_targp);
1101 } 1228 }
1102 1229
1103 xfs_unmountfs_writesb(mp); 1230 /*
1231 * Unreserve any blocks we have so that when we unmount we don't account
1232 * the reserved free space as used. This is really only necessary for
1233 * lazy superblock counting because it trusts the incore superblock
 1234 * counters to be absolutely correct on clean unmount.
1235 *
1236 * We don't bother correcting this elsewhere for lazy superblock
1237 * counting because on mount of an unclean filesystem we reconstruct the
1238 * correct counter value and this is irrelevant.
1239 *
1240 * For non-lazy counter filesystems, this doesn't matter at all because
 1241 * we only ever apply deltas to the superblock and hence the incore
1242 * value does not matter....
1243 */
1244 resblks = 0;
1245 xfs_reserve_blocks(mp, &resblks, NULL);
1104 1246
1247 xfs_log_sbcount(mp, 1);
1248 xfs_unmountfs_writesb(mp);
1105 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1249 xfs_unmountfs_wait(mp); /* wait for async bufs */
1106
1107 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1250 xfs_log_unmount(mp); /* Done! No more fs ops. */
1108 1251
1109 xfs_freesb(mp); 1252 xfs_freesb(mp);
@@ -1150,6 +1293,62 @@ xfs_unmountfs_wait(xfs_mount_t *mp)
1150} 1293}
1151 1294
1152int 1295int
1296xfs_fs_writable(xfs_mount_t *mp)
1297{
1298 bhv_vfs_t *vfsp = XFS_MTOVFS(mp);
1299
1300 return !(vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
1301 (vfsp->vfs_flag & VFS_RDONLY));
1302}
1303
1304/*
1305 * xfs_log_sbcount
1306 *
1307 * Called either periodically to keep the on disk superblock values
1308 * roughly up to date or from unmount to make sure the values are
1309 * correct on a clean unmount.
1310 *
1311 * Note this code can be called during the process of freezing, so
 1312 * we may need to use the transaction allocator which does not
1313 * block when the transaction subsystem is in its frozen state.
1314 */
1315int
1316xfs_log_sbcount(
1317 xfs_mount_t *mp,
1318 uint sync)
1319{
1320 xfs_trans_t *tp;
1321 int error;
1322
1323 if (!xfs_fs_writable(mp))
1324 return 0;
1325
1326 xfs_icsb_sync_counters(mp);
1327
1328 /*
1329 * we don't need to do this if we are updating the superblock
1330 * counters on every modification.
1331 */
1332 if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
1333 return 0;
1334
1335 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
1336 error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1337 XFS_DEFAULT_LOG_COUNT);
1338 if (error) {
1339 xfs_trans_cancel(tp, 0);
1340 return error;
1341 }
1342
1343 xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
1344 if (sync)
1345 xfs_trans_set_sync(tp);
1346 xfs_trans_commit(tp, 0);
1347
1348 return 0;
1349}
1350
1351int
1153xfs_unmountfs_writesb(xfs_mount_t *mp) 1352xfs_unmountfs_writesb(xfs_mount_t *mp)
1154{ 1353{
1155 xfs_buf_t *sbp; 1354 xfs_buf_t *sbp;
@@ -1160,16 +1359,15 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1160 * skip superblock write if fs is read-only, or 1359 * skip superblock write if fs is read-only, or
1161 * if we are doing a forced umount. 1360 * if we are doing a forced umount.
1162 */ 1361 */
1163 sbp = xfs_getsb(mp, 0);
1164 if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY || 1362 if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
1165 XFS_FORCED_SHUTDOWN(mp))) { 1363 XFS_FORCED_SHUTDOWN(mp))) {
1166 1364
1167 xfs_icsb_sync_counters(mp); 1365 sbp = xfs_getsb(mp, 0);
1366 sb = XFS_BUF_TO_SBP(sbp);
1168 1367
1169 /* 1368 /*
1170 * mark shared-readonly if desired 1369 * mark shared-readonly if desired
1171 */ 1370 */
1172 sb = XFS_BUF_TO_SBP(sbp);
1173 if (mp->m_mk_sharedro) { 1371 if (mp->m_mk_sharedro) {
1174 if (!(sb->sb_flags & XFS_SBF_READONLY)) 1372 if (!(sb->sb_flags & XFS_SBF_READONLY))
1175 sb->sb_flags |= XFS_SBF_READONLY; 1373 sb->sb_flags |= XFS_SBF_READONLY;
@@ -1178,6 +1376,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1178 xfs_fs_cmn_err(CE_NOTE, mp, 1376 xfs_fs_cmn_err(CE_NOTE, mp,
1179 "Unmounting, marking shared read-only"); 1377 "Unmounting, marking shared read-only");
1180 } 1378 }
1379
1181 XFS_BUF_UNDONE(sbp); 1380 XFS_BUF_UNDONE(sbp);
1182 XFS_BUF_UNREAD(sbp); 1381 XFS_BUF_UNREAD(sbp);
1183 XFS_BUF_UNDELAYWRITE(sbp); 1382 XFS_BUF_UNDELAYWRITE(sbp);
@@ -1192,8 +1391,8 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1192 mp, sbp, XFS_BUF_ADDR(sbp)); 1391 mp, sbp, XFS_BUF_ADDR(sbp));
1193 if (error && mp->m_mk_sharedro) 1392 if (error && mp->m_mk_sharedro)
1194 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly"); 1393 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
1394 xfs_buf_relse(sbp);
1195 } 1395 }
1196 xfs_buf_relse(sbp);
1197 return error; 1396 return error;
1198} 1397}
1199 1398
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 82304b9464..76ad747586 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,6 +66,7 @@ struct xfs_bmbt_irec;
66struct xfs_bmap_free; 66struct xfs_bmap_free;
67struct xfs_extdelta; 67struct xfs_extdelta;
68struct xfs_swapext; 68struct xfs_swapext;
69struct xfs_mru_cache;
69 70
70extern struct bhv_vfsops xfs_vfsops; 71extern struct bhv_vfsops xfs_vfsops;
71extern struct bhv_vnodeops xfs_vnodeops; 72extern struct bhv_vnodeops xfs_vnodeops;
@@ -424,17 +425,18 @@ typedef struct xfs_mount {
424 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ 425 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
425 struct mutex m_icsb_mutex; /* balancer sync lock */ 426 struct mutex m_icsb_mutex; /* balancer sync lock */
426#endif 427#endif
428 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
427} xfs_mount_t; 429} xfs_mount_t;
428 430
429/* 431/*
430 * Flags for m_flags. 432 * Flags for m_flags.
431 */ 433 */
432#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 434#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
433 must be synchronous except 435 must be synchronous except
434 for space allocations */ 436 for space allocations */
435#define XFS_MOUNT_INO64 (1ULL << 1) 437#define XFS_MOUNT_INO64 (1ULL << 1)
436 /* (1ULL << 2) -- currently unused */ 438 /* (1ULL << 2) -- currently unused */
437 /* (1ULL << 3) -- currently unused */ 439#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
438#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 440#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
439 operations, typically for 441 operations, typically for
440 disk errors in metadata */ 442 disk errors in metadata */
@@ -463,6 +465,8 @@ typedef struct xfs_mount {
463 * I/O size in stat() */ 465 * I/O size in stat() */
464#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock 466#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock
465 counters */ 467 counters */
468#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
469 allocator */
466 470
467 471
468/* 472/*
@@ -511,6 +515,8 @@ xfs_preferred_iosize(xfs_mount_t *mp)
511 515
512#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) 516#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset)
513 517
518#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
519 ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
514#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) 520#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
515#define xfs_force_shutdown(m,f) \ 521#define xfs_force_shutdown(m,f) \
516 bhv_vfs_force_shutdown((XFS_MTOVFS(m)), f, __FILE__, __LINE__) 522 bhv_vfs_force_shutdown((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
@@ -602,6 +608,7 @@ typedef struct xfs_mod_sb {
602 608
603extern xfs_mount_t *xfs_mount_init(void); 609extern xfs_mount_t *xfs_mount_init(void);
604extern void xfs_mod_sb(xfs_trans_t *, __int64_t); 610extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
611extern int xfs_log_sbcount(xfs_mount_t *, uint);
605extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv); 612extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
606extern int xfs_mountfs(struct bhv_vfs *, xfs_mount_t *mp, int); 613extern int xfs_mountfs(struct bhv_vfs *, xfs_mount_t *mp, int);
607extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 614extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
@@ -618,12 +625,14 @@ extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
618extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 625extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
619extern int xfs_readsb(xfs_mount_t *, int); 626extern int xfs_readsb(xfs_mount_t *, int);
620extern void xfs_freesb(xfs_mount_t *); 627extern void xfs_freesb(xfs_mount_t *);
628extern int xfs_fs_writable(xfs_mount_t *);
621extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int); 629extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int);
622extern int xfs_syncsub(xfs_mount_t *, int, int *); 630extern int xfs_syncsub(xfs_mount_t *, int, int *);
623extern int xfs_sync_inodes(xfs_mount_t *, int, int *); 631extern int xfs_sync_inodes(xfs_mount_t *, int, int *);
624extern xfs_agnumber_t xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *, 632extern xfs_agnumber_t xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *,
625 xfs_agnumber_t); 633 xfs_agnumber_t);
626extern void xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t); 634extern void xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t);
635extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
627 636
628extern struct xfs_dmops xfs_dmcore_stub; 637extern struct xfs_dmops xfs_dmcore_stub;
629extern struct xfs_qmops xfs_qmcore_stub; 638extern struct xfs_qmops xfs_qmcore_stub;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
new file mode 100644
index 0000000000..7deb9e3cbb
--- /dev/null
+++ b/fs/xfs/xfs_mru_cache.c
@@ -0,0 +1,608 @@
1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_mru_cache.h"
20
21/*
22 * The MRU Cache data structure consists of a data store, an array of lists and
23 * a lock to protect its internal state. At initialisation time, the client
24 * supplies an element lifetime in milliseconds and a group count, as well as a
25 * function pointer to call when deleting elements. A data structure for
26 * queueing up work in the form of timed callbacks is also included.
27 *
28 * The group count controls how many lists are created, and thereby how finely
29 * the elements are grouped in time. When reaping occurs, all the elements in
30 * all the lists whose time has expired are deleted.
31 *
32 * To give an example of how this works in practice, consider a client that
33 * initialises an MRU Cache with a lifetime of ten seconds and a group count of
34 * five. Five internal lists will be created, each representing a two second
35 * period in time. When the first element is added, time zero for the data
36 * structure is initialised to the current time.
37 *
38 * All the elements added in the first two seconds are appended to the first
39 * list. Elements added in the third second go into the second list, and so on.
40 * If an element is accessed at any point, it is removed from its list and
41 * inserted at the head of the current most-recently-used list.
42 *
43 * The reaper function will have nothing to do until at least twelve seconds
44 * have elapsed since the first element was added. The reason for this is that
45 * if it were called at t=11s, there could be elements in the first list that
46 * have only been inactive for nine seconds, so it still does nothing. If it is
47 * called anywhere between t=12 and t=14 seconds, it will delete all the
48 * elements that remain in the first list. It's therefore possible for elements
49 * to remain in the data store even after they've been inactive for up to
50 * (t + t/g) seconds, where t is the inactive element lifetime and g is the
51 * number of groups.
52 *
53 * The above example assumes that the reaper function gets called at least once
54 * every (t/g) seconds. If it is called less frequently, unused elements will
55 * accumulate in the reap list until the reaper function is eventually called.
56 * The current implementation uses work queue callbacks to carefully time the
57 * reaper function calls, so this should happen rarely, if at all.
58 *
59 * From a design perspective, the primary reason for the choice of a list array
60 * representing discrete time intervals is that it's only practical to reap
61 * expired elements in groups of some appreciable size. This automatically
62 * introduces a granularity to element lifetimes, so there's no point storing an
63 * individual timeout with each element that specifies a more precise reap time.
64 * The bonus is a saving of sizeof(long) bytes of memory per element stored.
65 *
66 * The elements could have been stored in just one list, but an array of
67 * counters or pointers would need to be maintained to allow them to be divided
68 * up into discrete time groups. More critically, the process of touching or
69 * removing an element would involve walking large portions of the entire list,
70 * which would have a detrimental effect on performance. The additional memory
71 * requirement for the array of list heads is minimal.
72 *
73 * When an element is touched or deleted, it needs to be removed from its
74 * current list. Doubly linked lists are used to make the list maintenance
75 * portion of these operations O(1). Since reaper timing can be imprecise,
76 * inserts and lookups can occur when there are no free lists available. When
77 * this happens, all the elements on the LRU list need to be migrated to the end
78 * of the reap list. To keep the list maintenance portion of these operations
79 * O(1) also, list tails need to be accessible without walking the entire list.
80 * This is the reason why doubly linked list heads are used.
81 */
82
83/*
84 * An MRU Cache is a dynamic data structure that stores its elements in a way
85 * that allows efficient lookups, but also groups them into discrete time
86 * intervals based on insertion time. This allows elements to be efficiently
87 * and automatically reaped after a fixed period of inactivity.
88 *
89 * When a client data pointer is stored in the MRU Cache it needs to be added to
90 * both the data store and to one of the lists. It must also be possible to
91 * access each of these entries via the other, i.e. to:
92 *
93 * a) Walk a list, removing the corresponding data store entry for each item.
94 * b) Look up a data store entry, then access its list entry directly.
95 *
96 * To achieve both of these goals, each entry must contain both a list entry and
97 * a key, in addition to the user's data pointer. Note that it's not a good
98 * idea to have the client embed one of these structures at the top of their own
99 * data structure, because inserting the same item more than once would most
100 * likely result in a loop in one of the lists. That's a sure-fire recipe for
101 * an infinite loop in the code.
102 */
103typedef struct xfs_mru_cache_elem
104{
105 struct list_head list_node;
106 unsigned long key;
107 void *value;
108} xfs_mru_cache_elem_t;
109
110static kmem_zone_t *xfs_mru_elem_zone;
111static struct workqueue_struct *xfs_mru_reap_wq;
112
113/*
114 * When inserting, destroying or reaping, it's first necessary to update the
115 * lists relative to a particular time. In the case of destroying, that time
116 * will be well in the future to ensure that all items are moved to the reap
117 * list. In all other cases though, the time will be the current time.
118 *
119 * This function enters a loop, moving the contents of the LRU list to the reap
120 * list again and again until either a) the lists are all empty, or b) time zero
121 * has been advanced sufficiently to be within the immediate element lifetime.
122 *
123 * Case a) above is detected by counting how many groups are migrated and
124 * stopping when they've all been moved. Case b) is detected by monitoring the
125 * time_zero field, which is updated as each group is migrated.
126 *
127 * The return value is the earliest time that more migration could be needed, or
128 * zero if there's no need to schedule more work because the lists are empty.
129 */
130STATIC unsigned long
131_xfs_mru_cache_migrate(
132 xfs_mru_cache_t *mru,
133 unsigned long now)
134{
135 unsigned int grp;
136 unsigned int migrated = 0;
137 struct list_head *lru_list;
138
139 /* Nothing to do if the data store is empty. */
140 if (!mru->time_zero)
141 return 0;
142
143 /* While time zero is older than the time spanned by all the lists. */
144 while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {
145
146 /*
147 * If the LRU list isn't empty, migrate its elements to the tail
148 * of the reap list.
149 */
150 lru_list = mru->lists + mru->lru_grp;
151 if (!list_empty(lru_list))
152 list_splice_init(lru_list, mru->reap_list.prev);
153
154 /*
155 * Advance the LRU group number, freeing the old LRU list to
156 * become the new MRU list; advance time zero accordingly.
157 */
158 mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
159 mru->time_zero += mru->grp_time;
160
161 /*
162 * If reaping is so far behind that all the elements on all the
163 * lists have been migrated to the reap list, it's now empty.
164 */
165 if (++migrated == mru->grp_count) {
166 mru->lru_grp = 0;
167 mru->time_zero = 0;
168 return 0;
169 }
170 }
171
172 /* Find the first non-empty list from the LRU end. */
173 for (grp = 0; grp < mru->grp_count; grp++) {
174
175 /* Check the grp'th list from the LRU end. */
176 lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
177 if (!list_empty(lru_list))
178 return mru->time_zero +
179 (mru->grp_count + grp) * mru->grp_time;
180 }
181
182 /* All the lists must be empty. */
183 mru->lru_grp = 0;
184 mru->time_zero = 0;
185 return 0;
186}
187
188/*
189 * When inserting or doing a lookup, an element needs to be inserted into the
190 * MRU list. The lists must be migrated first to ensure that they're
191 * up-to-date, otherwise the new element could be given a shorter lifetime in
192 * the cache than it should.
193 */
194STATIC void
195_xfs_mru_cache_list_insert(
196 xfs_mru_cache_t *mru,
197 xfs_mru_cache_elem_t *elem)
198{
199 unsigned int grp = 0;
200 unsigned long now = jiffies;
201
202 /*
203 * If the data store is empty, initialise time zero, leave grp set to
204 * zero and start the work queue timer if necessary. Otherwise, set grp
205 * to the number of group times that have elapsed since time zero.
206 */
207 if (!_xfs_mru_cache_migrate(mru, now)) {
208 mru->time_zero = now;
209 if (!mru->next_reap)
210 mru->next_reap = mru->grp_count * mru->grp_time;
211 } else {
212 grp = (now - mru->time_zero) / mru->grp_time;
213 grp = (mru->lru_grp + grp) % mru->grp_count;
214 }
215
216 /* Insert the element at the tail of the corresponding list. */
217 list_add_tail(&elem->list_node, mru->lists + grp);
218}
219
220/*
221 * When destroying or reaping, all the elements that were migrated to the reap
222 * list need to be deleted. For each element this involves removing it from the
223 * data store, removing it from the reap list, calling the client's free
224 * function and deleting the element from the element zone.
225 */
226STATIC void
227_xfs_mru_cache_clear_reap_list(
228 xfs_mru_cache_t *mru)
229{
230 xfs_mru_cache_elem_t *elem, *next;
231 struct list_head tmp;
232
233 INIT_LIST_HEAD(&tmp);
234 list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {
235
236 /* Remove the element from the data store. */
237 radix_tree_delete(&mru->store, elem->key);
238
239 /*
240 * remove to temp list so it can be freed without
241 * needing to hold the lock
242 */
243 list_move(&elem->list_node, &tmp);
244 }
245 mutex_spinunlock(&mru->lock, 0);
246
247 list_for_each_entry_safe(elem, next, &tmp, list_node) {
248
249 /* Remove the element from the reap list. */
250 list_del_init(&elem->list_node);
251
252 /* Call the client's free function with the key and value pointer. */
253 mru->free_func(elem->key, elem->value);
254
255 /* Free the element structure. */
256 kmem_zone_free(xfs_mru_elem_zone, elem);
257 }
258
259 mutex_spinlock(&mru->lock);
260}
261
262/*
263 * We fire the reap timer every group expiry interval so
264 * we always have a reaper ready to run. This makes shutdown
265 * and flushing of the reaper easy to do. Hence we need to
266 * keep when the next reap must occur so we can determine
267 * at each interval whether there is anything we need to do.
268 */
269STATIC void
270_xfs_mru_cache_reap(
271 struct work_struct *work)
272{
273 xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work);
274 unsigned long now;
275
276 ASSERT(mru && mru->lists);
277 if (!mru || !mru->lists)
278 return;
279
280 mutex_spinlock(&mru->lock);
281 now = jiffies;
282 if (mru->reap_all ||
283 (mru->next_reap && time_after(now, mru->next_reap))) {
284 if (mru->reap_all)
285 now += mru->grp_count * mru->grp_time * 2;
286 mru->next_reap = _xfs_mru_cache_migrate(mru, now);
287 _xfs_mru_cache_clear_reap_list(mru);
288 }
289
290 /*
291 * the process that triggered the reap_all is responsible
 292 * for restarting the periodic reap if it is required.
293 */
294 if (!mru->reap_all)
295 queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
296 mru->reap_all = 0;
297 mutex_spinunlock(&mru->lock, 0);
298}
299
300int
301xfs_mru_cache_init(void)
302{
303 xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t),
304 "xfs_mru_cache_elem");
305 if (!xfs_mru_elem_zone)
306 return ENOMEM;
307
308 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache");
309 if (!xfs_mru_reap_wq) {
310 kmem_zone_destroy(xfs_mru_elem_zone);
311 return ENOMEM;
312 }
313
314 return 0;
315}
316
317void
318xfs_mru_cache_uninit(void)
319{
320 destroy_workqueue(xfs_mru_reap_wq);
321 kmem_zone_destroy(xfs_mru_elem_zone);
322}
323
324/*
325 * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create()
326 * with the address of the pointer, a lifetime value in milliseconds, a group
327 * count and a free function to use when deleting elements. This function
328 * returns 0 if the initialisation was successful.
329 */
330int
331xfs_mru_cache_create(
332 xfs_mru_cache_t **mrup,
333 unsigned int lifetime_ms,
334 unsigned int grp_count,
335 xfs_mru_cache_free_func_t free_func)
336{
337 xfs_mru_cache_t *mru = NULL;
338 int err = 0, grp;
339 unsigned int grp_time;
340
341 if (mrup)
342 *mrup = NULL;
343
344 if (!mrup || !grp_count || !lifetime_ms || !free_func)
345 return EINVAL;
346
347 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
348 return EINVAL;
349
350 if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
351 return ENOMEM;
352
353 /* An extra list is needed to avoid reaping up to a grp_time early. */
354 mru->grp_count = grp_count + 1;
355 mru->lists = kmem_alloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
356
357 if (!mru->lists) {
358 err = ENOMEM;
359 goto exit;
360 }
361
362 for (grp = 0; grp < mru->grp_count; grp++)
363 INIT_LIST_HEAD(mru->lists + grp);
364
365 /*
366 * We use GFP_KERNEL radix tree preload and do inserts under a
367 * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
368 */
369 INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
370 INIT_LIST_HEAD(&mru->reap_list);
371 spinlock_init(&mru->lock, "xfs_mru_cache");
372 INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);
373
374 mru->grp_time = grp_time;
375 mru->free_func = free_func;
376
377 /* start up the reaper event */
378 mru->next_reap = 0;
379 mru->reap_all = 0;
380 queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
381
382 *mrup = mru;
383
384exit:
385 if (err && mru && mru->lists)
386 kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
387 if (err && mru)
388 kmem_free(mru, sizeof(*mru));
389
390 return err;
391}
392
/*
 * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
 * free functions as they're deleted. When this function returns, the caller is
 * guaranteed that all the free functions for all the elements have finished
 * executing.
 *
 * While we are flushing, we stop the periodic reaper event from triggering.
 * Normally, we want to restart this periodic event, but if we are shutting
 * down the cache we do not want it restarted. hence the restart parameter
 * where 0 = do not restart reaper and 1 = restart reaper.
 */
void
xfs_mru_cache_flush(
	xfs_mru_cache_t	*mru,
	int		restart)
{
	if (!mru || !mru->lists)
		return;

	/* Stop the periodic reaper from re-queueing itself while we flush. */
	cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);

	/* Tell the reaper to reap every group, not just the expired ones. */
	mutex_spinlock(&mru->lock);
	mru->reap_all = 1;
	mutex_spinunlock(&mru->lock, 0);

	/*
	 * Run the reaper once synchronously: queue it and drain the whole
	 * workqueue, which guarantees all free functions have finished by the
	 * time flush_workqueue() returns.
	 */
	queue_work(xfs_mru_reap_wq, &mru->work.work);
	flush_workqueue(xfs_mru_reap_wq);

	mutex_spinlock(&mru->lock);
	/*
	 * NOTE(review): this assumes _xfs_mru_cache_reap() clears reap_all
	 * once it has reaped everything -- confirm against the reaper code,
	 * which is outside this view.
	 */
	WARN_ON_ONCE(mru->reap_all != 0);
	mru->reap_all = 0;
	if (restart)
		queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
	mutex_spinunlock(&mru->lock, 0);
}
428
429void
430xfs_mru_cache_destroy(
431 xfs_mru_cache_t *mru)
432{
433 if (!mru || !mru->lists)
434 return;
435
436 /* we don't want the reaper to restart here */
437 xfs_mru_cache_flush(mru, 0);
438
439 kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
440 kmem_free(mru, sizeof(*mru));
441}
442
443/*
444 * To insert an element, call xfs_mru_cache_insert() with the data store, the
445 * element's key and the client data pointer. This function returns 0 on
446 * success or ENOMEM if memory for the data element couldn't be allocated.
447 */
448int
449xfs_mru_cache_insert(
450 xfs_mru_cache_t *mru,
451 unsigned long key,
452 void *value)
453{
454 xfs_mru_cache_elem_t *elem;
455
456 ASSERT(mru && mru->lists);
457 if (!mru || !mru->lists)
458 return EINVAL;
459
460 elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP);
461 if (!elem)
462 return ENOMEM;
463
464 if (radix_tree_preload(GFP_KERNEL)) {
465 kmem_zone_free(xfs_mru_elem_zone, elem);
466 return ENOMEM;
467 }
468
469 INIT_LIST_HEAD(&elem->list_node);
470 elem->key = key;
471 elem->value = value;
472
473 mutex_spinlock(&mru->lock);
474
475 radix_tree_insert(&mru->store, key, elem);
476 radix_tree_preload_end();
477 _xfs_mru_cache_list_insert(mru, elem);
478
479 mutex_spinunlock(&mru->lock, 0);
480
481 return 0;
482}
483
484/*
485 * To remove an element without calling the free function, call
486 * xfs_mru_cache_remove() with the data store and the element's key. On success
487 * the client data pointer for the removed element is returned, otherwise this
488 * function will return a NULL pointer.
489 */
490void *
491xfs_mru_cache_remove(
492 xfs_mru_cache_t *mru,
493 unsigned long key)
494{
495 xfs_mru_cache_elem_t *elem;
496 void *value = NULL;
497
498 ASSERT(mru && mru->lists);
499 if (!mru || !mru->lists)
500 return NULL;
501
502 mutex_spinlock(&mru->lock);
503 elem = radix_tree_delete(&mru->store, key);
504 if (elem) {
505 value = elem->value;
506 list_del(&elem->list_node);
507 }
508
509 mutex_spinunlock(&mru->lock, 0);
510
511 if (elem)
512 kmem_zone_free(xfs_mru_elem_zone, elem);
513
514 return value;
515}
516
517/*
518 * To remove and element and call the free function, call xfs_mru_cache_delete()
519 * with the data store and the element's key.
520 */
521void
522xfs_mru_cache_delete(
523 xfs_mru_cache_t *mru,
524 unsigned long key)
525{
526 void *value = xfs_mru_cache_remove(mru, key);
527
528 if (value)
529 mru->free_func(key, value);
530}
531
/*
 * To look up an element using its key, call xfs_mru_cache_lookup() with the
 * data store and the element's key. If found, the element will be moved to the
 * head of the MRU list to indicate that it's been touched.
 *
 * The internal data structures are protected by a spinlock that is STILL HELD
 * when this function returns. Call xfs_mru_cache_done() to release it. Note
 * that it is not safe to call any function that might sleep in the interim.
 *
 * The implementation could have used reference counting to avoid this
 * restriction, but since most clients simply want to get, set or test a member
 * of the returned data structure, the extra per-element memory isn't warranted.
 *
 * If the element isn't found, this function returns NULL and the spinlock is
 * released. xfs_mru_cache_done() should NOT be called when this occurs.
 */
void *
xfs_mru_cache_lookup(
	xfs_mru_cache_t	*mru,
	unsigned long	key)
{
	xfs_mru_cache_elem_t *elem;

	ASSERT(mru && mru->lists);
	if (!mru || !mru->lists)
		return NULL;

	mutex_spinlock(&mru->lock);
	elem = radix_tree_lookup(&mru->store, key);
	if (elem) {
		/*
		 * Hit: re-file the element at the most-recently-used position.
		 * Deliberately return WITHOUT unlocking; the caller must call
		 * xfs_mru_cache_done() to release mru->lock.
		 */
		list_del(&elem->list_node);
		_xfs_mru_cache_list_insert(mru, elem);
	}
	else
		/* Miss: drop the lock here; caller must NOT call done(). */
		mutex_spinunlock(&mru->lock, 0);

	return elem ? elem->value : NULL;
}
570
/*
 * To look up an element using its key, but leave its location in the internal
 * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this
 * function returns NULL.
 *
 * See the comments above the declaration of the xfs_mru_cache_lookup() function
 * for important locking information pertaining to this call: on a hit the
 * spinlock is STILL HELD on return and must be released with
 * xfs_mru_cache_done(); on a miss it has already been released.
 */
void *
xfs_mru_cache_peek(
	xfs_mru_cache_t	*mru,
	unsigned long	key)
{
	xfs_mru_cache_elem_t *elem;

	ASSERT(mru && mru->lists);
	if (!mru || !mru->lists)
		return NULL;

	mutex_spinlock(&mru->lock);
	elem = radix_tree_lookup(&mru->store, key);
	/* Miss: release the lock; on a hit it stays held for the caller. */
	if (!elem)
		mutex_spinunlock(&mru->lock, 0);

	return elem ? elem->value : NULL;
}
597
/*
 * To release the internal data structure spinlock after having performed an
 * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
 * with the data store pointer.
 */
void
xfs_mru_cache_done(
	xfs_mru_cache_t	*mru)
{
	/* Pairs with the lock left held by a successful lookup/peek hit. */
	mutex_spinunlock(&mru->lock, 0);
}
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
new file mode 100644
index 0000000000..624fd10ee8
--- /dev/null
+++ b/fs/xfs/xfs_mru_cache.h
@@ -0,0 +1,57 @@
1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_MRU_CACHE_H__
19#define __XFS_MRU_CACHE_H__
20
21
/* Function pointer type for callback to free a client's data pointer.
 * Arguments are the element's key and the client value being discarded. */
typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*);

/*
 * MRU cache: a radix tree keyed store whose elements are also threaded onto
 * an array of time-bucketed lists so a periodic reaper can expire them.
 * All mutable fields below are protected by 'lock'.
 */
typedef struct xfs_mru_cache
{
	struct radix_tree_root	store;     /* Core storage data structure.  */
	struct list_head	*lists;    /* Array of lists, one per grp.  */
	struct list_head	reap_list; /* Elements overdue for reaping. */
	spinlock_t		lock;      /* Lock to protect this struct.  */
	unsigned int		grp_count; /* Number of discrete groups.    */
	unsigned int		grp_time;  /* Time period spanned by grps.  */
	unsigned int		lru_grp;   /* Group containing time zero.   */
	unsigned long		time_zero; /* Time first element was added. */
	unsigned long		next_reap; /* Time that the reaper should
					      next do something. */
	unsigned int		reap_all;  /* if set, reap all lists */
	xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
	struct delayed_work	work;      /* Workqueue data for reaping.   */
} xfs_mru_cache_t;
41
42int xfs_mru_cache_init(void);
43void xfs_mru_cache_uninit(void);
44int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
45 unsigned int grp_count,
46 xfs_mru_cache_free_func_t free_func);
47void xfs_mru_cache_flush(xfs_mru_cache_t *mru, int restart);
48void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
49int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
50 void *value);
51void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
52void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
53void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
54void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
55void xfs_mru_cache_done(struct xfs_mru_cache *mru);
56
57#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index b3a5f07bd0..47082c0187 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1882,11 +1882,13 @@ xfs_growfs_rt(
1882 (nrblocks = in->newblocks) <= sbp->sb_rblocks || 1882 (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
1883 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) 1883 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
1884 return XFS_ERROR(EINVAL); 1884 return XFS_ERROR(EINVAL);
1885 if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
1886 return error;
1885 /* 1887 /*
1886 * Read in the last block of the device, make sure it exists. 1888 * Read in the last block of the device, make sure it exists.
1887 */ 1889 */
1888 error = xfs_read_buf(mp, mp->m_rtdev_targp, 1890 error = xfs_read_buf(mp, mp->m_rtdev_targp,
1889 XFS_FSB_TO_BB(mp, in->newblocks - 1), 1891 XFS_FSB_TO_BB(mp, nrblocks - 1),
1890 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1892 XFS_FSB_TO_BB(mp, 1), 0, &bp);
1891 if (error) 1893 if (error)
1892 return error; 1894 return error;
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 188b296ff5..fcf28dbded 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -72,6 +72,34 @@ xfs_fsb_to_db_io(struct xfs_iocore *io, xfs_fsblock_t fsb)
72} 72}
73 73
74/* 74/*
75 * Flags for xfs_free_eofblocks
76 */
77#define XFS_FREE_EOF_LOCK (1<<0)
78#define XFS_FREE_EOF_NOLOCK (1<<1)
79
80
81/*
82 * helper function to extract extent size hint from inode
83 */
84STATIC_INLINE xfs_extlen_t
85xfs_get_extsz_hint(
86 xfs_inode_t *ip)
87{
88 xfs_extlen_t extsz;
89
90 if (unlikely(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
91 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
92 ? ip->i_d.di_extsize
93 : ip->i_mount->m_sb.sb_rextsize;
94 ASSERT(extsz);
95 } else {
96 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
97 ? ip->i_d.di_extsize : 0;
98 }
99 return extsz;
100}
101
102/*
75 * Prototypes for functions in xfs_rw.c. 103 * Prototypes for functions in xfs_rw.c.
76 */ 104 */
77extern int xfs_write_clear_setuid(struct xfs_inode *ip); 105extern int xfs_write_clear_setuid(struct xfs_inode *ip);
@@ -91,10 +119,12 @@ extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
91extern int xfs_rwlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock); 119extern int xfs_rwlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock);
92extern void xfs_rwunlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock); 120extern void xfs_rwunlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock);
93extern int xfs_setattr(bhv_desc_t *, bhv_vattr_t *vap, int flags, 121extern int xfs_setattr(bhv_desc_t *, bhv_vattr_t *vap, int flags,
94 cred_t *credp); 122 cred_t *credp);
95extern int xfs_change_file_space(bhv_desc_t *bdp, int cmd, xfs_flock64_t *bf, 123extern int xfs_change_file_space(bhv_desc_t *bdp, int cmd, xfs_flock64_t *bf,
96 xfs_off_t offset, cred_t *credp, int flags); 124 xfs_off_t offset, cred_t *credp, int flags);
97extern int xfs_set_dmattrs(bhv_desc_t *bdp, u_int evmask, u_int16_t state, 125extern int xfs_set_dmattrs(bhv_desc_t *bdp, u_int evmask, u_int16_t state,
98 cred_t *credp); 126 cred_t *credp);
127extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
128 int flags);
99 129
100#endif /* __XFS_RW_H__ */ 130#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 467854b45c..ef42537a60 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -74,12 +74,13 @@ struct xfs_mount;
74 */ 74 */
75#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ 75#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */
76#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 76#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
77#define XFS_SB_VERSION2_RESERVED2BIT 0x00000002 77#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
78#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 78#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
79#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 79#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
80 80
81#define XFS_SB_VERSION2_OKREALFBITS \ 81#define XFS_SB_VERSION2_OKREALFBITS \
82 (XFS_SB_VERSION2_ATTR2BIT) 82 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
83 XFS_SB_VERSION2_ATTR2BIT)
83#define XFS_SB_VERSION2_OKSASHFBITS \ 84#define XFS_SB_VERSION2_OKSASHFBITS \
84 (0) 85 (0)
85#define XFS_SB_VERSION2_OKREALBITS \ 86#define XFS_SB_VERSION2_OKREALBITS \
@@ -181,6 +182,9 @@ typedef enum {
181#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) 182#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
182#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) 183#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
183#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) 184#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
185#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
186#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
187#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
184#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) 188#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
185#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) 189#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
186#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) 190#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
@@ -188,7 +192,7 @@ typedef enum {
188 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ 192 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
189 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ 193 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
190 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ 194 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
191 XFS_SB_FEATURES2) 195 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2)
192 196
193 197
194/* 198/*
@@ -414,6 +418,12 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
414 * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT) 418 * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT)
415 */ 419 */
416 420
421static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
422{
423 return (XFS_SB_VERSION_HASMOREBITS(sbp) && \
424 ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
425}
426
417#define XFS_SB_VERSION_HASATTR2(sbp) xfs_sb_version_hasattr2(sbp) 427#define XFS_SB_VERSION_HASATTR2(sbp) xfs_sb_version_hasattr2(sbp)
418static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) 428static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
419{ 429{
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index cc2d60951e..356d6627f5 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -427,6 +427,14 @@ undo_blocks:
427 * 427 *
428 * Mark the transaction structure to indicate that the superblock 428 * Mark the transaction structure to indicate that the superblock
429 * needs to be updated before committing. 429 * needs to be updated before committing.
430 *
431 * Because we may not be keeping track of allocated/free inodes and
432 * used filesystem blocks in the superblock, we do not mark the
433 * superblock dirty in this transaction if we modify these fields.
434 * We still need to update the transaction deltas so that they get
435 * applied to the incore superblock, but we don't want them to
436 * cause the superblock to get locked and logged if these are the
437 * only fields in the superblock that the transaction modifies.
430 */ 438 */
431void 439void
432xfs_trans_mod_sb( 440xfs_trans_mod_sb(
@@ -434,13 +442,19 @@ xfs_trans_mod_sb(
434 uint field, 442 uint field,
435 int64_t delta) 443 int64_t delta)
436{ 444{
445 uint32_t flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY);
446 xfs_mount_t *mp = tp->t_mountp;
437 447
438 switch (field) { 448 switch (field) {
439 case XFS_TRANS_SB_ICOUNT: 449 case XFS_TRANS_SB_ICOUNT:
440 tp->t_icount_delta += delta; 450 tp->t_icount_delta += delta;
451 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
452 flags &= ~XFS_TRANS_SB_DIRTY;
441 break; 453 break;
442 case XFS_TRANS_SB_IFREE: 454 case XFS_TRANS_SB_IFREE:
443 tp->t_ifree_delta += delta; 455 tp->t_ifree_delta += delta;
456 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
457 flags &= ~XFS_TRANS_SB_DIRTY;
444 break; 458 break;
445 case XFS_TRANS_SB_FDBLOCKS: 459 case XFS_TRANS_SB_FDBLOCKS:
446 /* 460 /*
@@ -453,6 +467,8 @@ xfs_trans_mod_sb(
453 ASSERT(tp->t_blk_res_used <= tp->t_blk_res); 467 ASSERT(tp->t_blk_res_used <= tp->t_blk_res);
454 } 468 }
455 tp->t_fdblocks_delta += delta; 469 tp->t_fdblocks_delta += delta;
470 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
471 flags &= ~XFS_TRANS_SB_DIRTY;
456 break; 472 break;
457 case XFS_TRANS_SB_RES_FDBLOCKS: 473 case XFS_TRANS_SB_RES_FDBLOCKS:
458 /* 474 /*
@@ -462,6 +478,8 @@ xfs_trans_mod_sb(
462 */ 478 */
463 ASSERT(delta < 0); 479 ASSERT(delta < 0);
464 tp->t_res_fdblocks_delta += delta; 480 tp->t_res_fdblocks_delta += delta;
481 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
482 flags &= ~XFS_TRANS_SB_DIRTY;
465 break; 483 break;
466 case XFS_TRANS_SB_FREXTENTS: 484 case XFS_TRANS_SB_FREXTENTS:
467 /* 485 /*
@@ -515,7 +533,7 @@ xfs_trans_mod_sb(
515 return; 533 return;
516 } 534 }
517 535
518 tp->t_flags |= (XFS_TRANS_SB_DIRTY | XFS_TRANS_DIRTY); 536 tp->t_flags |= flags;
519} 537}
520 538
521/* 539/*
@@ -544,18 +562,23 @@ xfs_trans_apply_sb_deltas(
544 (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + 562 (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
545 tp->t_ag_btree_delta)); 563 tp->t_ag_btree_delta));
546 564
547 if (tp->t_icount_delta != 0) { 565 /*
548 INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta); 566 * Only update the superblock counters if we are logging them
549 } 567 */
550 if (tp->t_ifree_delta != 0) { 568 if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) {
551 INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta); 569 if (tp->t_icount_delta != 0) {
552 } 570 INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta);
571 }
572 if (tp->t_ifree_delta != 0) {
573 INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta);
574 }
553 575
554 if (tp->t_fdblocks_delta != 0) { 576 if (tp->t_fdblocks_delta != 0) {
555 INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta); 577 INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta);
556 } 578 }
557 if (tp->t_res_fdblocks_delta != 0) { 579 if (tp->t_res_fdblocks_delta != 0) {
558 INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta); 580 INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta);
581 }
559 } 582 }
560 583
561 if (tp->t_frextents_delta != 0) { 584 if (tp->t_frextents_delta != 0) {
@@ -615,11 +638,23 @@ xfs_trans_apply_sb_deltas(
615} 638}
616 639
617/* 640/*
618 * xfs_trans_unreserve_and_mod_sb() is called to release unused 641 * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
619 * reservations and apply superblock counter changes to the in-core 642 * and apply superblock counter changes to the in-core superblock. The
620 * superblock. 643 * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT
644 * applied to the in-core superblock. The idea is that that has already been
645 * done.
621 * 646 *
622 * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). 647 * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
648 * However, we have to ensure that we only modify each superblock field only
649 * once because the application of the delta values may not be atomic. That can
650 * lead to ENOSPC races occurring if we have two separate modifcations of the
651 * free space counter to put back the entire reservation and then take away
652 * what we used.
653 *
654 * If we are not logging superblock counters, then the inode allocated/free and
655 * used block counts are not updated in the on disk superblock. In this case,
656 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
657 * still need to update the incore superblock with the changes.
623 */ 658 */
624STATIC void 659STATIC void
625xfs_trans_unreserve_and_mod_sb( 660xfs_trans_unreserve_and_mod_sb(
@@ -627,40 +662,49 @@ xfs_trans_unreserve_and_mod_sb(
627{ 662{
628 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ 663 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */
629 xfs_mod_sb_t *msbp; 664 xfs_mod_sb_t *msbp;
665 xfs_mount_t *mp = tp->t_mountp;
630 /* REFERENCED */ 666 /* REFERENCED */
631 int error; 667 int error;
632 int rsvd; 668 int rsvd;
669 int64_t blkdelta = 0;
670 int64_t rtxdelta = 0;
633 671
634 msbp = msb; 672 msbp = msb;
635 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 673 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
636 674
637 /* 675 /* calculate free blocks delta */
638 * Release any reserved blocks. Any that were allocated 676 if (tp->t_blk_res > 0)
639 * will be taken back again by fdblocks_delta below. 677 blkdelta = tp->t_blk_res;
640 */ 678
641 if (tp->t_blk_res > 0) { 679 if ((tp->t_fdblocks_delta != 0) &&
680 (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
681 (tp->t_flags & XFS_TRANS_SB_DIRTY)))
682 blkdelta += tp->t_fdblocks_delta;
683
684 if (blkdelta != 0) {
642 msbp->msb_field = XFS_SBS_FDBLOCKS; 685 msbp->msb_field = XFS_SBS_FDBLOCKS;
643 msbp->msb_delta = tp->t_blk_res; 686 msbp->msb_delta = blkdelta;
644 msbp++; 687 msbp++;
645 } 688 }
646 689
647 /* 690 /* calculate free realtime extents delta */
648 * Release any reserved real time extents . Any that were 691 if (tp->t_rtx_res > 0)
649 * allocated will be taken back again by frextents_delta below. 692 rtxdelta = tp->t_rtx_res;
650 */ 693
651 if (tp->t_rtx_res > 0) { 694 if ((tp->t_frextents_delta != 0) &&
695 (tp->t_flags & XFS_TRANS_SB_DIRTY))
696 rtxdelta += tp->t_frextents_delta;
697
698 if (rtxdelta != 0) {
652 msbp->msb_field = XFS_SBS_FREXTENTS; 699 msbp->msb_field = XFS_SBS_FREXTENTS;
653 msbp->msb_delta = tp->t_rtx_res; 700 msbp->msb_delta = rtxdelta;
654 msbp++; 701 msbp++;
655 } 702 }
656 703
657 /* 704 /* apply remaining deltas */
658 * Apply any superblock modifications to the in-core version. 705
659 * The t_res_fdblocks_delta and t_res_frextents_delta fields are 706 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
660 * explicitly NOT applied to the in-core superblock. 707 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
661 * The idea is that that has already been done.
662 */
663 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
664 if (tp->t_icount_delta != 0) { 708 if (tp->t_icount_delta != 0) {
665 msbp->msb_field = XFS_SBS_ICOUNT; 709 msbp->msb_field = XFS_SBS_ICOUNT;
666 msbp->msb_delta = tp->t_icount_delta; 710 msbp->msb_delta = tp->t_icount_delta;
@@ -671,16 +715,9 @@ xfs_trans_unreserve_and_mod_sb(
671 msbp->msb_delta = tp->t_ifree_delta; 715 msbp->msb_delta = tp->t_ifree_delta;
672 msbp++; 716 msbp++;
673 } 717 }
674 if (tp->t_fdblocks_delta != 0) { 718 }
675 msbp->msb_field = XFS_SBS_FDBLOCKS; 719
676 msbp->msb_delta = tp->t_fdblocks_delta; 720 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
677 msbp++;
678 }
679 if (tp->t_frextents_delta != 0) {
680 msbp->msb_field = XFS_SBS_FREXTENTS;
681 msbp->msb_delta = tp->t_frextents_delta;
682 msbp++;
683 }
684 if (tp->t_dblocks_delta != 0) { 721 if (tp->t_dblocks_delta != 0) {
685 msbp->msb_field = XFS_SBS_DBLOCKS; 722 msbp->msb_field = XFS_SBS_DBLOCKS;
686 msbp->msb_delta = tp->t_dblocks_delta; 723 msbp->msb_delta = tp->t_dblocks_delta;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 7dfcc45036..0e26e72902 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -94,7 +94,8 @@ typedef struct xfs_trans_header {
94#define XFS_TRANS_GROWFSRT_ZERO 38 94#define XFS_TRANS_GROWFSRT_ZERO 38
95#define XFS_TRANS_GROWFSRT_FREE 39 95#define XFS_TRANS_GROWFSRT_FREE 39
96#define XFS_TRANS_SWAPEXT 40 96#define XFS_TRANS_SWAPEXT 40
97#define XFS_TRANS_TYPE_MAX 40 97#define XFS_TRANS_SB_COUNT 41
98#define XFS_TRANS_TYPE_MAX 41
98/* new transaction types need to be reflected in xfs_logprint(8) */ 99/* new transaction types need to be reflected in xfs_logprint(8) */
99 100
100 101
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 65c561201c..11f5ea29a0 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -51,6 +51,8 @@
51#include "xfs_acl.h" 51#include "xfs_acl.h"
52#include "xfs_attr.h" 52#include "xfs_attr.h"
53#include "xfs_clnt.h" 53#include "xfs_clnt.h"
54#include "xfs_mru_cache.h"
55#include "xfs_filestream.h"
54#include "xfs_fsops.h" 56#include "xfs_fsops.h"
55 57
56STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); 58STATIC int xfs_sync(bhv_desc_t *, int, cred_t *);
@@ -81,6 +83,8 @@ xfs_init(void)
81 xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); 83 xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
82 xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); 84 xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
83 xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); 85 xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
86 xfs_mru_cache_init();
87 xfs_filestream_init();
84 88
85 /* 89 /*
86 * The size of the zone allocated buf log item is the maximum 90 * The size of the zone allocated buf log item is the maximum
@@ -164,6 +168,8 @@ xfs_cleanup(void)
164 xfs_cleanup_procfs(); 168 xfs_cleanup_procfs();
165 xfs_sysctl_unregister(); 169 xfs_sysctl_unregister();
166 xfs_refcache_destroy(); 170 xfs_refcache_destroy();
171 xfs_filestream_uninit();
172 xfs_mru_cache_uninit();
167 xfs_acl_zone_destroy(xfs_acl_zone); 173 xfs_acl_zone_destroy(xfs_acl_zone);
168 174
169#ifdef XFS_DIR2_TRACE 175#ifdef XFS_DIR2_TRACE
@@ -320,6 +326,9 @@ xfs_start_flags(
320 else 326 else
321 mp->m_flags &= ~XFS_MOUNT_BARRIER; 327 mp->m_flags &= ~XFS_MOUNT_BARRIER;
322 328
329 if (ap->flags2 & XFSMNT2_FILESTREAMS)
330 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
331
323 return 0; 332 return 0;
324} 333}
325 334
@@ -518,6 +527,9 @@ xfs_mount(
518 if (mp->m_flags & XFS_MOUNT_BARRIER) 527 if (mp->m_flags & XFS_MOUNT_BARRIER)
519 xfs_mountfs_check_barriers(mp); 528 xfs_mountfs_check_barriers(mp);
520 529
530 if ((error = xfs_filestream_mount(mp)))
531 goto error2;
532
521 error = XFS_IOINIT(vfsp, args, flags); 533 error = XFS_IOINIT(vfsp, args, flags);
522 if (error) 534 if (error)
523 goto error2; 535 goto error2;
@@ -575,6 +587,13 @@ xfs_unmount(
575 */ 587 */
576 xfs_refcache_purge_mp(mp); 588 xfs_refcache_purge_mp(mp);
577 589
590 /*
591 * Blow away any referenced inode in the filestreams cache.
592 * This can and will cause log traffic as inodes go inactive
593 * here.
594 */
595 xfs_filestream_unmount(mp);
596
578 XFS_bflush(mp->m_ddev_targp); 597 XFS_bflush(mp->m_ddev_targp);
579 error = xfs_unmount_flush(mp, 0); 598 error = xfs_unmount_flush(mp, 0);
580 if (error) 599 if (error)
@@ -640,7 +659,7 @@ xfs_quiesce_fs(
640 * we can write the unmount record. 659 * we can write the unmount record.
641 */ 660 */
642 do { 661 do {
643 xfs_syncsub(mp, SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT, NULL); 662 xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL);
644 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 663 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
645 if (!pincount) { 664 if (!pincount) {
646 delay(50); 665 delay(50);
@@ -651,6 +670,30 @@ xfs_quiesce_fs(
651 return 0; 670 return 0;
652} 671}
653 672
673/*
674 * Second stage of a quiesce. The data is already synced, now we have to take
675 * care of the metadata. New transactions are already blocked, so we need to
676 * wait for any remaining transactions to drain out before proceding.
677 */
678STATIC void
679xfs_attr_quiesce(
680 xfs_mount_t *mp)
681{
682 /* wait for all modifications to complete */
683 while (atomic_read(&mp->m_active_trans) > 0)
684 delay(100);
685
686 /* flush inodes and push all remaining buffers out to disk */
687 xfs_quiesce_fs(mp);
688
689 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
690
691 /* Push the superblock and write an unmount record */
692 xfs_log_sbcount(mp, 1);
693 xfs_log_unmount_write(mp);
694 xfs_unmountfs_writesb(mp);
695}
696
654STATIC int 697STATIC int
655xfs_mntupdate( 698xfs_mntupdate(
656 bhv_desc_t *bdp, 699 bhv_desc_t *bdp,
@@ -670,10 +713,9 @@ xfs_mntupdate(
670 mp->m_flags &= ~XFS_MOUNT_BARRIER; 713 mp->m_flags &= ~XFS_MOUNT_BARRIER;
671 } 714 }
672 } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ 715 } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */
673 bhv_vfs_sync(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL); 716 xfs_filestream_flush(mp);
674 xfs_quiesce_fs(mp); 717 bhv_vfs_sync(vfsp, SYNC_DATA_QUIESCE, NULL);
675 xfs_log_unmount_write(mp); 718 xfs_attr_quiesce(mp);
676 xfs_unmountfs_writesb(mp);
677 vfsp->vfs_flag |= VFS_RDONLY; 719 vfsp->vfs_flag |= VFS_RDONLY;
678 } 720 }
679 return 0; 721 return 0;
@@ -887,6 +929,9 @@ xfs_sync(
887{ 929{
888 xfs_mount_t *mp = XFS_BHVTOM(bdp); 930 xfs_mount_t *mp = XFS_BHVTOM(bdp);
889 931
932 if (flags & SYNC_IOWAIT)
933 xfs_filestream_flush(mp);
934
890 return xfs_syncsub(mp, flags, NULL); 935 return xfs_syncsub(mp, flags, NULL);
891} 936}
892 937
@@ -1128,58 +1173,41 @@ xfs_sync_inodes(
1128 * in the inode list. 1173 * in the inode list.
1129 */ 1174 */
1130 1175
1131 if ((flags & SYNC_CLOSE) && (vp != NULL)) { 1176 /*
1132 /* 1177 * If we have to flush data or wait for I/O completion
1133 * This is the shutdown case. We just need to 1178 * we need to drop the ilock that we currently hold.
1134 * flush and invalidate all the pages associated 1179 * If we need to drop the lock, insert a marker if we
1135 * with the inode. Drop the inode lock since 1180 * have not already done so.
1136 * we can't hold it across calls to the buffer 1181 */
1137 * cache. 1182 if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
1138 * 1183 ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
1139 * We don't set the VREMAPPING bit in the vnode 1184 if (mount_locked) {
1140 * here, because we don't hold the vnode lock 1185 IPOINTER_INSERT(ip, mp);
1141 * exclusively. It doesn't really matter, though,
1142 * because we only come here when we're shutting
1143 * down anyway.
1144 */
1145 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1146
1147 if (XFS_FORCED_SHUTDOWN(mp)) {
1148 bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
1149 } else {
1150 error = bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF);
1151 } 1186 }
1187 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1152 1188
1153 xfs_ilock(ip, XFS_ILOCK_SHARED); 1189 if (flags & SYNC_CLOSE) {
1154 1190 /* Shutdown case. Flush and invalidate. */
1155 } else if ((flags & SYNC_DELWRI) && (vp != NULL)) { 1191 if (XFS_FORCED_SHUTDOWN(mp))
1156 if (VN_DIRTY(vp)) { 1192 bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
1157 /* We need to have dropped the lock here, 1193 else
1158 * so insert a marker if we have not already 1194 error = bhv_vop_flushinval_pages(vp, 0,
1159 * done so. 1195 -1, FI_REMAPF);
1160 */ 1196 } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
1161 if (mount_locked) {
1162 IPOINTER_INSERT(ip, mp);
1163 }
1164
1165 /*
1166 * Drop the inode lock since we can't hold it
1167 * across calls to the buffer cache.
1168 */
1169 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1170 error = bhv_vop_flush_pages(vp, (xfs_off_t)0, 1197 error = bhv_vop_flush_pages(vp, (xfs_off_t)0,
1171 -1, fflag, FI_NONE); 1198 -1, fflag, FI_NONE);
1172 xfs_ilock(ip, XFS_ILOCK_SHARED);
1173 } 1199 }
1174 1200
1201 /*
1202 * When freezing, we need to wait ensure all I/O (including direct
1203 * I/O) is complete to ensure no further data modification can take
1204 * place after this point
1205 */
1206 if (flags & SYNC_IOWAIT)
1207 vn_iowait(vp);
1208
1209 xfs_ilock(ip, XFS_ILOCK_SHARED);
1175 } 1210 }
1176 /*
1177 * When freezing, we need to wait ensure all I/O (including direct
1178 * I/O) is complete to ensure no further data modification can take
1179 * place after this point
1180 */
1181 if (flags & SYNC_IOWAIT)
1182 vn_iowait(vp);
1183 1211
1184 if (flags & SYNC_BDFLUSH) { 1212 if (flags & SYNC_BDFLUSH) {
1185 if ((flags & SYNC_ATTR) && 1213 if ((flags & SYNC_ATTR) &&
@@ -1514,6 +1542,15 @@ xfs_syncsub(
1514 } 1542 }
1515 1543
1516 /* 1544 /*
1545 * If asked, update the disk superblock with incore counter values if we
1546 * are using non-persistent counters so that they don't get too far out
1547 * of sync if we crash or get a forced shutdown. We don't want to force
1548 * this to disk, just get a transaction into the iclogs....
1549 */
1550 if (flags & SYNC_SUPER)
1551 xfs_log_sbcount(mp, 0);
1552
1553 /*
1517 * Now check to see if the log needs a "dummy" transaction. 1554 * Now check to see if the log needs a "dummy" transaction.
1518 */ 1555 */
1519 1556
@@ -1645,6 +1682,7 @@ xfs_vget(
1645 * in stat(). */ 1682 * in stat(). */
1646#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ 1683#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */
1647#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ 1684#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */
1685#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */
1648 1686
1649STATIC unsigned long 1687STATIC unsigned long
1650suffix_strtoul(char *s, char **endp, unsigned int base) 1688suffix_strtoul(char *s, char **endp, unsigned int base)
@@ -1831,6 +1869,8 @@ xfs_parseargs(
1831 args->flags |= XFSMNT_ATTR2; 1869 args->flags |= XFSMNT_ATTR2;
1832 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { 1870 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
1833 args->flags &= ~XFSMNT_ATTR2; 1871 args->flags &= ~XFSMNT_ATTR2;
1872 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
1873 args->flags2 |= XFSMNT2_FILESTREAMS;
1834 } else if (!strcmp(this_char, "osyncisdsync")) { 1874 } else if (!strcmp(this_char, "osyncisdsync")) {
1835 /* no-op, this is now the default */ 1875 /* no-op, this is now the default */
1836 cmn_err(CE_WARN, 1876 cmn_err(CE_WARN,
@@ -1959,9 +1999,9 @@ xfs_showargs(
1959} 1999}
1960 2000
1961/* 2001/*
1962 * Second stage of a freeze. The data is already frozen, now we have to take 2002 * Second stage of a freeze. The data is already frozen so we only
1963 * care of the metadata. New transactions are already blocked, so we need to 2003 * need to take care of themetadata. Once that's done write a dummy
1964 * wait for any remaining transactions to drain out before proceding. 2004 * record to dirty the log in case of a crash while frozen.
1965 */ 2005 */
1966STATIC void 2006STATIC void
1967xfs_freeze( 2007xfs_freeze(
@@ -1969,18 +2009,7 @@ xfs_freeze(
1969{ 2009{
1970 xfs_mount_t *mp = XFS_BHVTOM(bdp); 2010 xfs_mount_t *mp = XFS_BHVTOM(bdp);
1971 2011
1972 /* wait for all modifications to complete */ 2012 xfs_attr_quiesce(mp);
1973 while (atomic_read(&mp->m_active_trans) > 0)
1974 delay(100);
1975
1976 /* flush inodes and push all remaining buffers out to disk */
1977 xfs_quiesce_fs(mp);
1978
1979 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
1980
1981 /* Push the superblock and write an unmount record */
1982 xfs_log_unmount_write(mp);
1983 xfs_unmountfs_writesb(mp);
1984 xfs_fs_log_dummy(mp); 2013 xfs_fs_log_dummy(mp);
1985} 2014}
1986 2015
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index de17aed578..79b522779a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -51,6 +51,7 @@
51#include "xfs_refcache.h" 51#include "xfs_refcache.h"
52#include "xfs_trans_space.h" 52#include "xfs_trans_space.h"
53#include "xfs_log_priv.h" 53#include "xfs_log_priv.h"
54#include "xfs_filestream.h"
54 55
55STATIC int 56STATIC int
56xfs_open( 57xfs_open(
@@ -77,36 +78,6 @@ xfs_open(
77 return 0; 78 return 0;
78} 79}
79 80
80STATIC int
81xfs_close(
82 bhv_desc_t *bdp,
83 int flags,
84 lastclose_t lastclose,
85 cred_t *credp)
86{
87 bhv_vnode_t *vp = BHV_TO_VNODE(bdp);
88 xfs_inode_t *ip = XFS_BHVTOI(bdp);
89
90 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
91 return XFS_ERROR(EIO);
92
93 if (lastclose != L_TRUE || !VN_ISREG(vp))
94 return 0;
95
96 /*
97 * If we previously truncated this file and removed old data in
98 * the process, we want to initiate "early" writeout on the last
99 * close. This is an attempt to combat the notorious NULL files
100 * problem which is particularly noticable from a truncate down,
101 * buffered (re-)write (delalloc), followed by a crash. What we
102 * are effectively doing here is significantly reducing the time
103 * window where we'd otherwise be exposed to that problem.
104 */
105 if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
106 return bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
107 return 0;
108}
109
110/* 81/*
111 * xfs_getattr 82 * xfs_getattr
112 */ 83 */
@@ -183,9 +154,8 @@ xfs_getattr(
183 * realtime extent size or the realtime volume's 154 * realtime extent size or the realtime volume's
184 * extent size. 155 * extent size.
185 */ 156 */
186 vap->va_blocksize = ip->i_d.di_extsize ? 157 vap->va_blocksize =
187 (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) : 158 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
188 (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
189 } 159 }
190 break; 160 break;
191 } 161 }
@@ -814,6 +784,8 @@ xfs_setattr(
814 di_flags |= XFS_DIFLAG_PROJINHERIT; 784 di_flags |= XFS_DIFLAG_PROJINHERIT;
815 if (vap->va_xflags & XFS_XFLAG_NODEFRAG) 785 if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
816 di_flags |= XFS_DIFLAG_NODEFRAG; 786 di_flags |= XFS_DIFLAG_NODEFRAG;
787 if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
788 di_flags |= XFS_DIFLAG_FILESTREAM;
817 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 789 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
818 if (vap->va_xflags & XFS_XFLAG_RTINHERIT) 790 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
819 di_flags |= XFS_DIFLAG_RTINHERIT; 791 di_flags |= XFS_DIFLAG_RTINHERIT;
@@ -1201,13 +1173,15 @@ xfs_fsync(
1201} 1173}
1202 1174
1203/* 1175/*
1204 * This is called by xfs_inactive to free any blocks beyond eof, 1176 * This is called by xfs_inactive to free any blocks beyond eof
1205 * when the link count isn't zero. 1177 * when the link count isn't zero and by xfs_dm_punch_hole() when
1178 * punching a hole to EOF.
1206 */ 1179 */
1207STATIC int 1180int
1208xfs_inactive_free_eofblocks( 1181xfs_free_eofblocks(
1209 xfs_mount_t *mp, 1182 xfs_mount_t *mp,
1210 xfs_inode_t *ip) 1183 xfs_inode_t *ip,
1184 int flags)
1211{ 1185{
1212 xfs_trans_t *tp; 1186 xfs_trans_t *tp;
1213 int error; 1187 int error;
@@ -1216,6 +1190,7 @@ xfs_inactive_free_eofblocks(
1216 xfs_filblks_t map_len; 1190 xfs_filblks_t map_len;
1217 int nimaps; 1191 int nimaps;
1218 xfs_bmbt_irec_t imap; 1192 xfs_bmbt_irec_t imap;
1193 int use_iolock = (flags & XFS_FREE_EOF_LOCK);
1219 1194
1220 /* 1195 /*
1221 * Figure out if there are any blocks beyond the end 1196 * Figure out if there are any blocks beyond the end
@@ -1256,11 +1231,14 @@ xfs_inactive_free_eofblocks(
1256 * cache and we can't 1231 * cache and we can't
1257 * do that within a transaction. 1232 * do that within a transaction.
1258 */ 1233 */
1259 xfs_ilock(ip, XFS_IOLOCK_EXCL); 1234 if (use_iolock)
1235 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1260 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 1236 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1261 ip->i_size); 1237 ip->i_size);
1262 if (error) { 1238 if (error) {
1263 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 1239 xfs_trans_cancel(tp, 0);
1240 if (use_iolock)
1241 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1264 return error; 1242 return error;
1265 } 1243 }
1266 1244
@@ -1297,7 +1275,8 @@ xfs_inactive_free_eofblocks(
1297 error = xfs_trans_commit(tp, 1275 error = xfs_trans_commit(tp,
1298 XFS_TRANS_RELEASE_LOG_RES); 1276 XFS_TRANS_RELEASE_LOG_RES);
1299 } 1277 }
1300 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 1278 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1279 : XFS_ILOCK_EXCL));
1301 } 1280 }
1302 return error; 1281 return error;
1303} 1282}
@@ -1560,6 +1539,31 @@ xfs_release(
1560 if (vp->v_vfsp->vfs_flag & VFS_RDONLY) 1539 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1561 return 0; 1540 return 0;
1562 1541
1542 if (!XFS_FORCED_SHUTDOWN(mp)) {
1543 /*
1544 * If we are using filestreams, and we have an unlinked
1545 * file that we are processing the last close on, then nothing
1546 * will be able to reopen and write to this file. Purge this
1547 * inode from the filestreams cache so that it doesn't delay
1548 * teardown of the inode.
1549 */
1550 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1551 xfs_filestream_deassociate(ip);
1552
1553 /*
1554 * If we previously truncated this file and removed old data
1555 * in the process, we want to initiate "early" writeout on
1556 * the last close. This is an attempt to combat the notorious
1557 * NULL files problem which is particularly noticable from a
1558 * truncate down, buffered (re-)write (delalloc), followed by
1559 * a crash. What we are effectively doing here is
1560 * significantly reducing the time window where we'd otherwise
1561 * be exposed to that problem.
1562 */
1563 if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1564 bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
1565 }
1566
1563#ifdef HAVE_REFCACHE 1567#ifdef HAVE_REFCACHE
1564 /* If we are in the NFS reference cache then don't do this now */ 1568 /* If we are in the NFS reference cache then don't do this now */
1565 if (ip->i_refcache) 1569 if (ip->i_refcache)
@@ -1573,7 +1577,8 @@ xfs_release(
1573 (ip->i_df.if_flags & XFS_IFEXTENTS)) && 1577 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1574 (!(ip->i_d.di_flags & 1578 (!(ip->i_d.di_flags &
1575 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { 1579 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1576 if ((error = xfs_inactive_free_eofblocks(mp, ip))) 1580 error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1581 if (error)
1577 return error; 1582 return error;
1578 /* Update linux inode block count after free above */ 1583 /* Update linux inode block count after free above */
1579 vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp, 1584 vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
@@ -1654,7 +1659,8 @@ xfs_inactive(
1654 (!(ip->i_d.di_flags & 1659 (!(ip->i_d.di_flags &
1655 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || 1660 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1656 (ip->i_delayed_blks != 0)))) { 1661 (ip->i_delayed_blks != 0)))) {
1657 if ((error = xfs_inactive_free_eofblocks(mp, ip))) 1662 error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1663 if (error)
1658 return VN_INACTIVE_CACHE; 1664 return VN_INACTIVE_CACHE;
1659 /* Update linux inode block count after free above */ 1665 /* Update linux inode block count after free above */
1660 vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp, 1666 vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
@@ -1680,6 +1686,7 @@ xfs_inactive(
1680 1686
1681 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0); 1687 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1682 if (error) { 1688 if (error) {
1689 xfs_trans_cancel(tp, 0);
1683 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 1690 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1684 return VN_INACTIVE_CACHE; 1691 return VN_INACTIVE_CACHE;
1685 } 1692 }
@@ -2217,9 +2224,9 @@ static inline int
2217xfs_lock_inumorder(int lock_mode, int subclass) 2224xfs_lock_inumorder(int lock_mode, int subclass)
2218{ 2225{
2219 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) 2226 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2220 lock_mode |= (subclass + XFS_IOLOCK_INUMORDER) << XFS_IOLOCK_SHIFT; 2227 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2221 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) 2228 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2222 lock_mode |= (subclass + XFS_ILOCK_INUMORDER) << XFS_ILOCK_SHIFT; 2229 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2223 2230
2224 return lock_mode; 2231 return lock_mode;
2225} 2232}
@@ -2546,6 +2553,15 @@ xfs_remove(
2546 */ 2553 */
2547 xfs_refcache_purge_ip(ip); 2554 xfs_refcache_purge_ip(ip);
2548 2555
2556 /*
2557 * If we are using filestreams, kill the stream association.
2558 * If the file is still open it may get a new one but that
2559 * will get killed on last close in xfs_close() so we don't
2560 * have to worry about that.
2561 */
2562 if (link_zero && xfs_inode_is_filestream(ip))
2563 xfs_filestream_deassociate(ip);
2564
2549 vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); 2565 vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2550 2566
2551 /* 2567 /*
@@ -4047,22 +4063,16 @@ xfs_alloc_file_space(
4047 if (XFS_FORCED_SHUTDOWN(mp)) 4063 if (XFS_FORCED_SHUTDOWN(mp))
4048 return XFS_ERROR(EIO); 4064 return XFS_ERROR(EIO);
4049 4065
4050 rt = XFS_IS_REALTIME_INODE(ip);
4051 if (unlikely(rt)) {
4052 if (!(extsz = ip->i_d.di_extsize))
4053 extsz = mp->m_sb.sb_rextsize;
4054 } else {
4055 extsz = ip->i_d.di_extsize;
4056 }
4057
4058 if ((error = XFS_QM_DQATTACH(mp, ip, 0))) 4066 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4059 return error; 4067 return error;
4060 4068
4061 if (len <= 0) 4069 if (len <= 0)
4062 return XFS_ERROR(EINVAL); 4070 return XFS_ERROR(EINVAL);
4063 4071
4072 rt = XFS_IS_REALTIME_INODE(ip);
4073 extsz = xfs_get_extsz_hint(ip);
4074
4064 count = len; 4075 count = len;
4065 error = 0;
4066 imapp = &imaps[0]; 4076 imapp = &imaps[0];
4067 nimaps = 1; 4077 nimaps = 1;
4068 bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); 4078 bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
@@ -4678,11 +4688,7 @@ xfs_change_file_space(
4678bhv_vnodeops_t xfs_vnodeops = { 4688bhv_vnodeops_t xfs_vnodeops = {
4679 BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS), 4689 BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4680 .vop_open = xfs_open, 4690 .vop_open = xfs_open,
4681 .vop_close = xfs_close,
4682 .vop_read = xfs_read, 4691 .vop_read = xfs_read,
4683#ifdef HAVE_SENDFILE
4684 .vop_sendfile = xfs_sendfile,
4685#endif
4686#ifdef HAVE_SPLICE 4692#ifdef HAVE_SPLICE
4687 .vop_splice_read = xfs_splice_read, 4693 .vop_splice_read = xfs_splice_read,
4688 .vop_splice_write = xfs_splice_write, 4694 .vop_splice_write = xfs_splice_write,