Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 7
-rw-r--r--  fs/9p/vfs_file.c | 4
-rw-r--r--  fs/9p/vfs_inode.c | 4
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 8
-rw-r--r--  fs/adfs/inode.c | 2
-rw-r--r--  fs/affs/file.c | 4
-rw-r--r--  fs/afs/dir.c | 37
-rw-r--r--  fs/aio.c | 748
-rw-r--r--  fs/anon_inodes.c | 66
-rw-r--r--  fs/autofs4/dev-ioctl.c | 23
-rw-r--r--  fs/autofs4/waitq.c | 13
-rw-r--r--  fs/bfs/file.c | 2
-rw-r--r--  fs/binfmt_elf.c | 30
-rw-r--r--  fs/bio-integrity.c | 11
-rw-r--r--  fs/bio.c | 6
-rw-r--r--  fs/block_dev.c | 6
-rw-r--r--  fs/btrfs/dev-replace.c | 2
-rw-r--r--  fs/btrfs/extent_io.c | 8
-rw-r--r--  fs/btrfs/file.c | 2
-rw-r--r--  fs/btrfs/free-space-cache.c | 4
-rw-r--r--  fs/btrfs/inode.c | 4
-rw-r--r--  fs/btrfs/send.c | 2
-rw-r--r--  fs/btrfs/volumes.c | 2
-rw-r--r--  fs/cachefiles/interface.c | 26
-rw-r--r--  fs/cachefiles/internal.h | 1
-rw-r--r--  fs/cachefiles/namei.c | 2
-rw-r--r--  fs/cachefiles/xattr.c | 37
-rw-r--r--  fs/ceph/Kconfig | 9
-rw-r--r--  fs/ceph/Makefile | 1
-rw-r--r--  fs/ceph/addr.c | 116
-rw-r--r--  fs/ceph/cache.c | 398
-rw-r--r--  fs/ceph/cache.h | 159
-rw-r--r--  fs/ceph/caps.c | 87
-rw-r--r--  fs/ceph/dir.c | 2
-rw-r--r--  fs/ceph/file.c | 299
-rw-r--r--  fs/ceph/inode.c | 46
-rw-r--r--  fs/ceph/ioctl.c | 12
-rw-r--r--  fs/ceph/mds_client.c | 34
-rw-r--r--  fs/ceph/super.c | 35
-rw-r--r--  fs/ceph/super.h | 17
-rw-r--r--  fs/cifs/AUTHORS | 55
-rw-r--r--  fs/cifs/CHANGES | 1065
-rw-r--r--  fs/cifs/Makefile | 2
-rw-r--r--  fs/cifs/README | 753
-rw-r--r--  fs/cifs/TODO | 129
-rw-r--r--  fs/cifs/cifs_unicode.h | 2
-rw-r--r--  fs/cifs/cifsfs.c | 49
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 66
-rw-r--r--  fs/cifs/cifspdu.h | 32
-rw-r--r--  fs/cifs/cifsproto.h | 12
-rw-r--r--  fs/cifs/cifssmb.c | 111
-rw-r--r--  fs/cifs/connect.c | 43
-rw-r--r--  fs/cifs/dir.c | 59
-rw-r--r--  fs/cifs/file.c | 60
-rw-r--r--  fs/cifs/fscache.c | 7
-rw-r--r--  fs/cifs/fscache.h | 13
-rw-r--r--  fs/cifs/inode.c | 62
-rw-r--r--  fs/cifs/link.c | 24
-rw-r--r--  fs/cifs/misc.c | 13
-rw-r--r--  fs/cifs/readdir.c | 6
-rw-r--r--  fs/cifs/sess.c | 142
-rw-r--r--  fs/cifs/smb1ops.c | 44
-rw-r--r--  fs/cifs/smb2file.c | 25
-rw-r--r--  fs/cifs/smb2inode.c | 9
-rw-r--r--  fs/cifs/smb2misc.c | 186
-rw-r--r--  fs/cifs/smb2ops.c | 344
-rw-r--r--  fs/cifs/smb2pdu.c | 135
-rw-r--r--  fs/cifs/smb2pdu.h | 37
-rw-r--r--  fs/cifs/smb2proto.h | 5
-rw-r--r--  fs/cifs/smb2transport.c | 70
-rw-r--r--  fs/cifs/winucase.c | 663
-rw-r--r--  fs/coredump.c | 5
-rw-r--r--  fs/dcache.c | 1064
-rw-r--r--  fs/direct-io.c | 127
-rw-r--r--  fs/dlm/ast.c | 5
-rw-r--r--  fs/dlm/user.c | 24
-rw-r--r--  fs/drop_caches.c | 1
-rw-r--r--  fs/ecryptfs/crypto.c | 16
-rw-r--r--  fs/eventpoll.c | 33
-rw-r--r--  fs/exec.c | 122
-rw-r--r--  fs/exofs/inode.c | 2
-rw-r--r--  fs/exportfs/expfs.c | 2
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext3/dir.c | 2
-rw-r--r--  fs/ext3/super.c | 43
-rw-r--r--  fs/ext4/balloc.c | 24
-rw-r--r--  fs/ext4/dir.c | 2
-rw-r--r--  fs/ext4/ext4.h | 69
-rw-r--r--  fs/ext4/ext4_extents.h | 6
-rw-r--r--  fs/ext4/ext4_jbd2.h | 2
-rw-r--r--  fs/ext4/extents.c | 296
-rw-r--r--  fs/ext4/extents_status.c | 158
-rw-r--r--  fs/ext4/extents_status.h | 51
-rw-r--r--  fs/ext4/file.c | 2
-rw-r--r--  fs/ext4/ialloc.c | 90
-rw-r--r--  fs/ext4/indirect.c | 1
-rw-r--r--  fs/ext4/inode.c | 320
-rw-r--r--  fs/ext4/ioctl.c | 4
-rw-r--r--  fs/ext4/mballoc.c | 49
-rw-r--r--  fs/ext4/migrate.c | 4
-rw-r--r--  fs/ext4/move_extent.c | 2
-rw-r--r--  fs/ext4/namei.c | 35
-rw-r--r--  fs/ext4/page-io.c | 30
-rw-r--r--  fs/ext4/super.c | 67
-rw-r--r--  fs/f2fs/checkpoint.c | 24
-rw-r--r--  fs/f2fs/data.c | 28
-rw-r--r--  fs/f2fs/debug.c | 34
-rw-r--r--  fs/f2fs/dir.c | 19
-rw-r--r--  fs/f2fs/f2fs.h | 106
-rw-r--r--  fs/f2fs/file.c | 25
-rw-r--r--  fs/f2fs/gc.c | 58
-rw-r--r--  fs/f2fs/gc.h | 38
-rw-r--r--  fs/f2fs/inode.c | 15
-rw-r--r--  fs/f2fs/namei.c | 33
-rw-r--r--  fs/f2fs/node.c | 100
-rw-r--r--  fs/f2fs/node.h | 44
-rw-r--r--  fs/f2fs/recovery.c | 29
-rw-r--r--  fs/f2fs/segment.c | 41
-rw-r--r--  fs/f2fs/segment.h | 6
-rw-r--r--  fs/f2fs/super.c | 209
-rw-r--r--  fs/f2fs/xattr.c | 289
-rw-r--r--  fs/f2fs/xattr.h | 15
-rw-r--r--  fs/fat/inode.c | 2
-rw-r--r--  fs/file_table.c | 9
-rw-r--r--  fs/fs-writeback.c | 20
-rw-r--r--  fs/fscache/cookie.c | 72
-rw-r--r--  fs/fscache/internal.h | 6
-rw-r--r--  fs/fscache/page.c | 73
-rw-r--r--  fs/fuse/cuse.c | 13
-rw-r--r--  fs/fuse/dev.c | 6
-rw-r--r--  fs/fuse/dir.c | 132
-rw-r--r--  fs/fuse/file.c | 34
-rw-r--r--  fs/fuse/fuse_i.h | 4
-rw-r--r--  fs/fuse/inode.c | 7
-rw-r--r--  fs/gfs2/aops.c | 44
-rw-r--r--  fs/gfs2/bmap.c | 4
-rw-r--r--  fs/gfs2/dentry.c | 9
-rw-r--r--  fs/gfs2/dir.c | 2
-rw-r--r--  fs/gfs2/file.c | 4
-rw-r--r--  fs/gfs2/glock.c | 43
-rw-r--r--  fs/gfs2/inode.c | 4
-rw-r--r--  fs/gfs2/lops.c | 18
-rw-r--r--  fs/gfs2/main.c | 3
-rw-r--r--  fs/gfs2/meta_io.c | 18
-rw-r--r--  fs/gfs2/meta_io.h | 26
-rw-r--r--  fs/gfs2/ops_fstype.c | 53
-rw-r--r--  fs/gfs2/quota.c | 18
-rw-r--r--  fs/gfs2/quota.h | 6
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfsplus/Kconfig | 18
-rw-r--r--  fs/hfsplus/Makefile | 2
-rw-r--r--  fs/hfsplus/acl.h | 30
-rw-r--r--  fs/hfsplus/dir.c | 4
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 1
-rw-r--r--  fs/hfsplus/inode.c | 13
-rw-r--r--  fs/hfsplus/posix_acl.c | 274
-rw-r--r--  fs/hfsplus/xattr.c | 62
-rw-r--r--  fs/hfsplus/xattr.h | 33
-rw-r--r--  fs/hfsplus/xattr_security.c | 13
-rw-r--r--  fs/hostfs/hostfs_kern.c | 9
-rw-r--r--  fs/hpfs/file.c | 2
-rw-r--r--  fs/inode.c | 195
-rw-r--r--  fs/internal.h | 10
-rw-r--r--  fs/isofs/inode.c | 16
-rw-r--r--  fs/jbd/commit.c | 2
-rw-r--r--  fs/jbd/journal.c | 18
-rw-r--r--  fs/jbd2/commit.c | 6
-rw-r--r--  fs/jbd2/journal.c | 5
-rw-r--r--  fs/jbd2/recovery.c | 24
-rw-r--r--  fs/jfs/inode.c | 2
-rw-r--r--  fs/mbcache.c | 49
-rw-r--r--  fs/minix/inode.c | 2
-rw-r--r--  fs/namei.c | 377
-rw-r--r--  fs/namespace.c | 136
-rw-r--r--  fs/nfs/Makefile | 9
-rw-r--r--  fs/nfs/callback_proc.c | 11
-rw-r--r--  fs/nfs/client.c | 6
-rw-r--r--  fs/nfs/delegation.c | 3
-rw-r--r--  fs/nfs/dir.c | 111
-rw-r--r--  fs/nfs/direct.c | 1
-rw-r--r--  fs/nfs/file.c | 20
-rw-r--r--  fs/nfs/idmap.c | 195
-rw-r--r--  fs/nfs/inode.c | 36
-rw-r--r--  fs/nfs/internal.h | 37
-rw-r--r--  fs/nfs/nfs3proc.c | 10
-rw-r--r--  fs/nfs/nfs4_fs.h | 98
-rw-r--r--  fs/nfs/nfs4client.c | 240
-rw-r--r--  fs/nfs/nfs4file.c | 3
-rw-r--r--  fs/nfs/nfs4filelayout.c | 39
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 20
-rw-r--r--  fs/nfs/nfs4getroot.c | 4
-rw-r--r--  fs/nfs/nfs4namespace.c | 21
-rw-r--r--  fs/nfs/nfs4proc.c | 857
-rw-r--r--  fs/nfs/nfs4session.c | 86
-rw-r--r--  fs/nfs/nfs4session.h | 37
-rw-r--r--  fs/nfs/nfs4state.c | 174
-rw-r--r--  fs/nfs/nfs4super.c | 2
-rw-r--r--  fs/nfs/nfs4trace.c | 17
-rw-r--r--  fs/nfs/nfs4trace.h | 1148
-rw-r--r--  fs/nfs/nfs4xdr.c | 231
-rw-r--r--  fs/nfs/nfstrace.c | 9
-rw-r--r--  fs/nfs/nfstrace.h | 729
-rw-r--r--  fs/nfs/pagelist.c | 22
-rw-r--r--  fs/nfs/pnfs.c | 3
-rw-r--r--  fs/nfs/proc.c | 6
-rw-r--r--  fs/nfs/read.c | 7
-rw-r--r--  fs/nfs/super.c | 102
-rw-r--r--  fs/nfs/unlink.c | 38
-rw-r--r--  fs/nfs/write.c | 57
-rw-r--r--  fs/nfsd/nfs4recover.c | 2
-rw-r--r--  fs/nfsd/nfs4state.c | 33
-rw-r--r--  fs/nfsd/nfs4xdr.c | 14
-rw-r--r--  fs/nfsd/nfscache.c | 32
-rw-r--r--  fs/nilfs2/inode.c | 2
-rw-r--r--  fs/nilfs2/page.c | 2
-rw-r--r--  fs/nilfs2/segment.c | 11
-rw-r--r--  fs/nilfs2/super.c | 26
-rw-r--r--  fs/ntfs/file.c | 2
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/ocfs2/aops.c | 10
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 32
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 60
-rw-r--r--  fs/ocfs2/dcache.c | 7
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 18
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 15
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 35
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 9
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 18
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 13
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 19
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 3
-rw-r--r--  fs/ocfs2/extent_map.c | 11
-rw-r--r--  fs/ocfs2/file.c | 13
-rw-r--r--  fs/ocfs2/ioctl.c | 2
-rw-r--r--  fs/ocfs2/journal.c | 43
-rw-r--r--  fs/ocfs2/journal.h | 11
-rw-r--r--  fs/ocfs2/localalloc.c | 4
-rw-r--r--  fs/ocfs2/move_extents.c | 3
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 2
-rw-r--r--  fs/ocfs2/quota_global.c | 6
-rw-r--r--  fs/ocfs2/quota_local.c | 12
-rw-r--r--  fs/ocfs2/refcounttree.c | 10
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/ocfs2/xattr.c | 11
-rw-r--r--  fs/ocfs2/xattr.h | 2
-rw-r--r--  fs/omfs/file.c | 2
-rw-r--r--  fs/open.c | 34
-rw-r--r--  fs/pnode.h | 5
-rw-r--r--  fs/proc/fd.c | 2
-rw-r--r--  fs/proc/inode.c | 16
-rw-r--r--  fs/proc/meminfo.c | 6
-rw-r--r--  fs/proc/root.c | 6
-rw-r--r--  fs/proc/task_mmu.c | 50
-rw-r--r--  fs/proc/vmcore.c | 154
-rw-r--r--  fs/pstore/Kconfig | 2
-rw-r--r--  fs/pstore/inode.c | 10
-rw-r--r--  fs/pstore/internal.h | 5
-rw-r--r--  fs/pstore/platform.c | 229
-rw-r--r--  fs/pstore/ram.c | 47
-rw-r--r--  fs/quota/dquot.c | 80
-rw-r--r--  fs/quota/quota.c | 29
-rw-r--r--  fs/ramfs/inode.c | 26
-rw-r--r--  fs/read_write.c | 3
-rw-r--r--  fs/reiserfs/bitmap.c | 22
-rw-r--r--  fs/reiserfs/dir.c | 7
-rw-r--r--  fs/reiserfs/fix_node.c | 26
-rw-r--r--  fs/reiserfs/inode.c | 114
-rw-r--r--  fs/reiserfs/ioctl.c | 7
-rw-r--r--  fs/reiserfs/journal.c | 171
-rw-r--r--  fs/reiserfs/lock.c | 43
-rw-r--r--  fs/reiserfs/namei.c | 24
-rw-r--r--  fs/reiserfs/prints.c | 5
-rw-r--r--  fs/reiserfs/reiserfs.h | 36
-rw-r--r--  fs/reiserfs/resize.c | 10
-rw-r--r--  fs/reiserfs/stree.c | 74
-rw-r--r--  fs/reiserfs/super.c | 75
-rw-r--r--  fs/reiserfs/xattr.c | 46
-rw-r--r--  fs/reiserfs/xattr_acl.c | 16
-rw-r--r--  fs/squashfs/block.c | 11
-rw-r--r--  fs/squashfs/dir.c | 17
-rw-r--r--  fs/squashfs/namei.c | 8
-rw-r--r--  fs/squashfs/squashfs_fs.h | 5
-rw-r--r--  fs/stat.c | 11
-rw-r--r--  fs/super.c | 131
-rw-r--r--  fs/sysfs/bin.c | 13
-rw-r--r--  fs/sysfs/dir.c | 61
-rw-r--r--  fs/sysfs/file.c | 82
-rw-r--r--  fs/sysfs/group.c | 92
-rw-r--r--  fs/sysfs/inode.c | 21
-rw-r--r--  fs/sysfs/mount.c | 13
-rw-r--r--  fs/sysfs/symlink.c | 18
-rw-r--r--  fs/sysfs/sysfs.h | 18
-rw-r--r--  fs/sysv/itree.c | 2
-rw-r--r--  fs/sysv/super.c | 1
-rw-r--r--  fs/ubifs/debug.c | 7
-rw-r--r--  fs/ubifs/shrinker.c | 29
-rw-r--r--  fs/ubifs/super.c | 3
-rw-r--r--  fs/ubifs/ubifs.h | 5
-rw-r--r--  fs/udf/file.c | 2
-rw-r--r--  fs/udf/ialloc.c | 16
-rw-r--r--  fs/udf/inode.c | 2
-rw-r--r--  fs/udf/super.c | 402
-rw-r--r--  fs/udf/udf_sb.h | 2
-rw-r--r--  fs/ufs/inode.c | 2
-rw-r--r--  fs/xfs/Makefile | 20
-rw-r--r--  fs/xfs/kmem.c | 15
-rw-r--r--  fs/xfs/kmem.h | 9
-rw-r--r--  fs/xfs/xfs_acl.c | 36
-rw-r--r--  fs/xfs/xfs_ag.h | 53
-rw-r--r--  fs/xfs/xfs_alloc.c | 6
-rw-r--r--  fs/xfs/xfs_aops.c | 55
-rw-r--r--  fs/xfs/xfs_aops.h | 3
-rw-r--r--  fs/xfs/xfs_attr.c | 427
-rw-r--r--  fs/xfs/xfs_attr.h | 9
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 453
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 657
-rw-r--r--  fs/xfs/xfs_attr_leaf.h | 2
-rw-r--r--  fs/xfs/xfs_attr_list.c | 655
-rw-r--r--  fs/xfs/xfs_attr_remote.c | 18
-rw-r--r--  fs/xfs/xfs_bmap.c | 825
-rw-r--r--  fs/xfs/xfs_bmap.h | 56
-rw-r--r--  fs/xfs/xfs_bmap_btree.c | 50
-rw-r--r--  fs/xfs/xfs_bmap_btree.h | 4
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 2045
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 110
-rw-r--r--  fs/xfs/xfs_btree.c | 177
-rw-r--r--  fs/xfs/xfs_btree.h | 21
-rw-r--r--  fs/xfs/xfs_buf.c | 258
-rw-r--r--  fs/xfs/xfs_buf.h | 17
-rw-r--r--  fs/xfs/xfs_buf_item.c | 77
-rw-r--r--  fs/xfs/xfs_buf_item.h | 100
-rw-r--r--  fs/xfs/xfs_da_btree.c | 14
-rw-r--r--  fs/xfs/xfs_da_btree.h | 12
-rw-r--r--  fs/xfs/xfs_dfrag.c | 459
-rw-r--r--  fs/xfs/xfs_dfrag.h | 53
-rw-r--r--  fs/xfs/xfs_dir2.c | 58
-rw-r--r--  fs/xfs/xfs_dir2.h | 46
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 128
-rw-r--r--  fs/xfs/xfs_dir2_data.c | 25
-rw-r--r--  fs/xfs/xfs_dir2_format.h | 217
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 424
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 14
-rw-r--r--  fs/xfs/xfs_dir2_priv.h | 49
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 695
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 242
-rw-r--r--  fs/xfs/xfs_discard.c | 5
-rw-r--r--  fs/xfs/xfs_dquot.c | 34
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 26
-rw-r--r--  fs/xfs/xfs_error.c | 1
-rw-r--r--  fs/xfs/xfs_export.c | 5
-rw-r--r--  fs/xfs/xfs_extent_busy.c | 5
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 50
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 88
-rw-r--r--  fs/xfs/xfs_file.c | 3
-rw-r--r--  fs/xfs/xfs_filestream.c | 8
-rw-r--r--  fs/xfs/xfs_filestream.h | 4
-rw-r--r--  fs/xfs/xfs_format.h | 169
-rw-r--r--  fs/xfs/xfs_fs.h | 40
-rw-r--r--  fs/xfs/xfs_fsops.c | 8
-rw-r--r--  fs/xfs/xfs_ialloc.c | 7
-rw-r--r--  fs/xfs/xfs_icache.c | 32
-rw-r--r--  fs/xfs/xfs_icache.h | 56
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 21
-rw-r--r--  fs/xfs/xfs_icreate_item.h | 18
-rw-r--r--  fs/xfs/xfs_inode.c | 3749
-rw-r--r--  fs/xfs/xfs_inode.h | 312
-rw-r--r--  fs/xfs/xfs_inode_buf.c | 481
-rw-r--r--  fs/xfs/xfs_inode_buf.h | 53
-rw-r--r--  fs/xfs/xfs_inode_fork.c | 1920
-rw-r--r--  fs/xfs/xfs_inode_fork.h | 171
-rw-r--r--  fs/xfs/xfs_inode_item.c | 53
-rw-r--r--  fs/xfs/xfs_inode_item.h | 115
-rw-r--r--  fs/xfs/xfs_ioctl.c | 184
-rw-r--r--  fs/xfs/xfs_ioctl.h | 10
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 22
-rw-r--r--  fs/xfs/xfs_iomap.c | 21
-rw-r--r--  fs/xfs/xfs_iops.c | 78
-rw-r--r--  fs/xfs/xfs_iops.h | 13
-rw-r--r--  fs/xfs/xfs_itable.c | 7
-rw-r--r--  fs/xfs/xfs_linux.h | 60
-rw-r--r--  fs/xfs/xfs_log.c | 116
-rw-r--r--  fs/xfs/xfs_log.h | 90
-rw-r--r--  fs/xfs/xfs_log_cil.c | 371
-rw-r--r--  fs/xfs/xfs_log_format.h | 856
-rw-r--r--  fs/xfs/xfs_log_priv.h | 155
-rw-r--r--  fs/xfs/xfs_log_recover.c | 569
-rw-r--r--  fs/xfs/xfs_log_rlimit.c | 147
-rw-r--r--  fs/xfs/xfs_mount.c | 755
-rw-r--r--  fs/xfs/xfs_mount.h | 113
-rw-r--r--  fs/xfs/xfs_qm.c | 382
-rw-r--r--  fs/xfs/xfs_qm.h | 6
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 1
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 126
-rw-r--r--  fs/xfs/xfs_quota.h | 278
-rw-r--r--  fs/xfs/xfs_quota_defs.h | 157
-rw-r--r--  fs/xfs/xfs_quotaops.c | 17
-rw-r--r--  fs/xfs/xfs_rename.c | 346
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 28
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 53
-rw-r--r--  fs/xfs/xfs_sb.c | 834
-rw-r--r--  fs/xfs/xfs_sb.h | 72
-rw-r--r--  fs/xfs/xfs_super.c | 43
-rw-r--r--  fs/xfs/xfs_symlink.c | 196
-rw-r--r--  fs/xfs/xfs_symlink.h | 41
-rw-r--r--  fs/xfs/xfs_symlink_remote.c | 200
-rw-r--r--  fs/xfs/xfs_trace.c | 1
-rw-r--r--  fs/xfs/xfs_trans.c | 732
-rw-r--r--  fs/xfs/xfs_trans.h | 301
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 18
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 2
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 1
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 15
-rw-r--r--  fs/xfs/xfs_trans_resv.c | 803
-rw-r--r--  fs/xfs/xfs_trans_resv.h | 116
-rw-r--r--  fs/xfs/xfs_types.h | 60
-rw-r--r--  fs/xfs/xfs_utils.c | 314
-rw-r--r--  fs/xfs/xfs_utils.h | 27
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 1870
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 55
-rw-r--r--  fs/xfs/xfs_xattr.c | 2
424 files changed, 27083 insertions(+), 19078 deletions(-)
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 58e6cbce4156..08f2e1e9a7e6 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -603,10 +603,11 @@ static int v9fs_cache_register(void)
 	if (ret < 0)
 		return ret;
 #ifdef CONFIG_9P_FSCACHE
-	return fscache_register_netfs(&v9fs_cache_netfs);
-#else
-	return ret;
+	ret = fscache_register_netfs(&v9fs_cache_netfs);
+	if (ret < 0)
+		v9fs_destroy_inode_cache();
 #endif
+	return ret;
 }
 
 static void v9fs_cache_unregister(void)
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index d384a8b77ee8..aa5ecf479a57 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -183,7 +183,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	else
 		flock.length = fl->fl_end - fl->fl_start + 1;
 	flock.proc_id = fl->fl_pid;
-	flock.client_id = utsname()->nodename;
+	flock.client_id = fid->clnt->name;
 	if (IS_SETLKW(cmd))
 		flock.flags = P9_LOCK_FLAGS_BLOCK;
 
@@ -260,7 +260,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 	else
 		glock.length = fl->fl_end - fl->fl_start + 1;
 	glock.proc_id = fl->fl_pid;
-	glock.client_id = utsname()->nodename;
+	glock.client_id = fid->clnt->name;
 
 	res = p9_client_getlock_dotl(fid, &glock);
 	if (res < 0)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 25b018efb8ab..94de6d1482e2 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -146,7 +146,7 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
 	char type = 0, ext[32];
 	int major = -1, minor = -1;
 
-	strncpy(ext, stat->extension, sizeof(ext));
+	strlcpy(ext, stat->extension, sizeof(ext));
 	sscanf(ext, "%c %u %u", &type, &major, &minor);
 	switch (type) {
 	case 'c':
@@ -1186,7 +1186,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 		 * this even with .u extension. So check
 		 * for non NULL stat->extension
 		 */
-		strncpy(ext, stat->extension, sizeof(ext));
+		strlcpy(ext, stat->extension, sizeof(ext));
 		/* HARDLINKCOUNT %u */
 		sscanf(ext, "%13s %u", tag_name, &i_nlink);
 		if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 53687bbf2296..a7c481402c46 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -267,14 +267,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	/* Only creates */
-	if (!(flags & O_CREAT))
+	if (!(flags & O_CREAT) || dentry->d_inode)
 		return finish_no_open(file, res);
-	else if (dentry->d_inode) {
-		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
-			return -EEXIST;
-		else
-			return finish_no_open(file, res);
-	}
 
 	v9ses = v9fs_inode2v9ses(dir);
 
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 5f95d1ed9c6d..b9acadafa4a1 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 static int adfs_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index af3261b78102..8669b6ecddee 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		affs_truncate(inode);
 	}
 }
@@ -836,7 +836,7 @@ affs_truncate(struct inode *inode)
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	void *fsdata;
-	u32 size = inode->i_size;
+	loff_t size = inode->i_size;
 	int res;
 
 	res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 34494fbead0a..529300327f45 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -600,9 +600,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 
 	/* lock down the parent dentry so we can peer at it */
 	parent = dget_parent(dentry);
-	if (!parent->d_inode)
-		goto out_bad;
-
 	dir = AFS_FS_I(parent->d_inode);
 
 	/* validate the parent directory */
@@ -685,16 +682,12 @@ not_found:
 	spin_unlock(&dentry->d_lock);
 
 out_bad:
-	if (dentry->d_inode) {
-		/* don't unhash if we have submounts */
-		if (have_submounts(dentry))
-			goto out_skip;
-	}
+	/* don't unhash if we have submounts */
+	if (check_submounts_and_drop(dentry) != 0)
+		goto out_skip;
 
 	_debug("dropping dentry %s/%s",
 	       parent->d_name.name, dentry->d_name.name);
-	shrink_dcache_parent(dentry);
-	d_drop(dentry);
 	dput(parent);
 	key_put(key);
 
@@ -755,10 +748,6 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	_enter("{%x:%u},{%s},%ho",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
 
-	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len >= AFSNAMEMAX)
-		goto error;
-
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
@@ -820,10 +809,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	_enter("{%x:%u},{%s}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
-	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len >= AFSNAMEMAX)
-		goto error;
-
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
@@ -940,10 +925,6 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	_enter("{%x:%u},{%s},%ho,",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
 
-	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len >= AFSNAMEMAX)
-		goto error;
-
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
@@ -1009,10 +990,6 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	       dvnode->fid.vid, dvnode->fid.vnode,
 	       dentry->d_name.name);
 
-	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len >= AFSNAMEMAX)
-		goto error;
-
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
@@ -1057,10 +1034,6 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
 	       content);
 
-	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len >= AFSNAMEMAX)
-		goto error;
-
 	ret = -EINVAL;
 	if (strlen(content) >= AFSPATHMAX)
 		goto error;
@@ -1131,10 +1104,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	       new_dvnode->fid.vid, new_dvnode->fid.vnode,
 	       new_dentry->d_name.name);
 
-	ret = -ENAMETOOLONG;
-	if (new_dentry->d_name.len >= AFSNAMEMAX)
-		goto error;
-
 	key = afs_request_key(orig_dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
diff --git a/fs/aio.c b/fs/aio.c
index 9b5ca1137419..067e3d340c35 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -26,6 +26,7 @@
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/mmu_context.h>
+#include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
@@ -35,6 +36,10 @@
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
+#include <linux/anon_inodes.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>
+#include <linux/percpu-refcount.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -61,14 +66,29 @@ struct aio_ring {
 
 #define AIO_RING_PAGES	8
 
+struct kioctx_table {
+	struct rcu_head	rcu;
+	unsigned	nr;
+	struct kioctx	*table[];
+};
+
+struct kioctx_cpu {
+	unsigned		reqs_available;
+};
+
 struct kioctx {
-	atomic_t		users;
+	struct percpu_ref	users;
 	atomic_t		dead;
 
-	/* This needs improving */
 	unsigned long		user_id;
-	struct hlist_node	list;
 
+	struct __percpu kioctx_cpu *cpu;
+
+	/*
+	 * For percpu reqs_available, number of slots we move to/from global
+	 * counter at a time:
+	 */
+	unsigned		req_batch;
 	/*
 	 * This is what userspace passed to io_setup(), it's not used for
 	 * anything but counting against the global max_reqs quota.
@@ -88,10 +108,18 @@ struct kioctx {
 	long nr_pages;
 
 	struct rcu_head	rcu_head;
-	struct work_struct rcu_work;
+	struct work_struct free_work;
 
 	struct {
-		atomic_t	reqs_active;
+		/*
+		 * This counts the number of available slots in the ringbuffer,
+		 * so we avoid overflowing it: it's decremented (if positive)
+		 * when allocating a kiocb and incremented when the resulting
+		 * io_event is pulled off the ringbuffer.
+		 *
+		 * We batch accesses to it with a percpu version.
+		 */
+		atomic_t	reqs_available;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
@@ -110,6 +138,9 @@ struct kioctx {
 	} ____cacheline_aligned_in_smp;
 
 	struct page		*internal_pages[AIO_RING_PAGES];
+	struct file		*aio_ring_file;
+
+	unsigned		id;
 };
 
 /*------ sysctl variables----*/
@@ -136,17 +167,102 @@ static int __init aio_setup(void)
 }
 __initcall(aio_setup);
 
+static void put_aio_ring_file(struct kioctx *ctx)
+{
+	struct file *aio_ring_file = ctx->aio_ring_file;
+	if (aio_ring_file) {
+		truncate_setsize(aio_ring_file->f_inode, 0);
+
+		/* Prevent further access to the kioctx from migratepages */
+		spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
+		aio_ring_file->f_inode->i_mapping->private_data = NULL;
+		ctx->aio_ring_file = NULL;
+		spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
+
+		fput(aio_ring_file);
+	}
+}
+
 static void aio_free_ring(struct kioctx *ctx)
 {
-	long i;
+	int i;
 
-	for (i = 0; i < ctx->nr_pages; i++)
+	for (i = 0; i < ctx->nr_pages; i++) {
+		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+			 page_count(ctx->ring_pages[i]));
 		put_page(ctx->ring_pages[i]);
+	}
+
+	put_aio_ring_file(ctx);
 
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
 		kfree(ctx->ring_pages);
 }
 
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+static const struct file_operations aio_ring_fops = {
+	.mmap = aio_ring_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_MIGRATION)
+static int aio_migratepage(struct address_space *mapping, struct page *new,
+			   struct page *old, enum migrate_mode mode)
+{
+	struct kioctx *ctx;
+	unsigned long flags;
+	int rc;
+
+	/* Writeback must be complete */
+	BUG_ON(PageWriteback(old));
+	put_page(old);
+
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	if (rc != MIGRATEPAGE_SUCCESS) {
+		get_page(old);
+		return rc;
+	}
+
+	get_page(new);
+
+	/* We can potentially race against kioctx teardown here.  Use the
+	 * address_space's private data lock to protect the mapping's
+	 * private_data.
+	 */
+	spin_lock(&mapping->private_lock);
+	ctx = mapping->private_data;
+	if (ctx) {
+		pgoff_t idx;
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		migrate_page_copy(new, old);
+		idx = old->index;
+		if (idx < (pgoff_t)ctx->nr_pages)
+			ctx->ring_pages[idx] = new;
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	} else
+		rc = -EBUSY;
+	spin_unlock(&mapping->private_lock);
+
+	return rc;
+}
+#endif
+
+static const struct address_space_operations aio_ctx_aops = {
+	.set_page_dirty = aio_set_page_dirty,
+#if IS_ENABLED(CONFIG_MIGRATION)
+	.migratepage	= aio_migratepage,
+#endif
+};
+
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
@@ -154,20 +270,45 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct mm_struct *mm = current->mm;
 	unsigned long size, populate;
 	int nr_pages;
+	int i;
+	struct file *file;
 
 	/* Compensate for the ring buffer's head/tail overlap entry */
 	nr_events += 2;	/* 1 is required, 2 for good luck */
 
 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
-	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
 
+	nr_pages = PFN_UP(size);
 	if (nr_pages < 0)
 		return -EINVAL;
 
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
+	if (IS_ERR(file)) {
+		ctx->aio_ring_file = NULL;
+		return -EAGAIN;
+	}
+
+	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
+	file->f_inode->i_mapping->private_data = ctx;
+	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+		page = find_or_create_page(file->f_inode->i_mapping,
+					   i, GFP_HIGHUSER | __GFP_ZERO);
+		if (!page)
+			break;
+		pr_debug("pid(%d) page[%d]->count=%d\n",
+			 current->pid, i, page_count(page));
+		SetPageUptodate(page);
+		SetPageDirty(page);
+		unlock_page(page);
+	}
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
 
-	ctx->nr_events = 0;
 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
 		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
@@ -178,10 +319,11 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+
 	down_write(&mm->mmap_sem);
-	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
-				       PROT_READ|PROT_WRITE,
-				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
+				       PROT_READ | PROT_WRITE,
+				       MAP_SHARED | MAP_POPULATE, 0, &populate);
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
@@ -190,23 +332,34 @@ static int aio_setup_ring(struct kioctx *ctx)
 	}
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+
+	/* We must do this while still holding mmap_sem for write, as we
+	 * need to be protected against userspace attempting to mremap()
+	 * or munmap() the ring buffer.
+	 */
 	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
 				       1, 0, ctx->ring_pages, NULL);
+
+	/* Dropping the reference here is safe as the page cache will hold
+	 * onto the pages for us.  It is also required so that page migration
+	 * can unmap the pages and get the right reference count.
+	 */
+	for (i = 0; i < ctx->nr_pages; i++)
+		put_page(ctx->ring_pages[i]);
+
 	up_write(&mm->mmap_sem);
 
 	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
-	if (populate)
-		mm_populate(ctx->mmap_base, populate);
 
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events; /* trusted copy */
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->nr = nr_events;	/* user copy */
-	ring->id = ctx->user_id;
+	ring->id = ~0U;
 	ring->head = ring->tail = 0;
 	ring->magic = AIO_RING_MAGIC;
 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
@@ -238,11 +391,9 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
-static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
-			struct io_event *res)
+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
 {
 	kiocb_cancel_fn *old, *cancel;
-	int ret = -EINVAL;
 
 	/*
 	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
@@ -252,28 +403,20 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
 	cancel = ACCESS_ONCE(kiocb->ki_cancel);
 	do {
 		if (!cancel || cancel == KIOCB_CANCELLED)
-			return ret;
+			return -EINVAL;
 
 		old = cancel;
 		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
 	} while (cancel != old);
 
-	atomic_inc(&kiocb->ki_users);
-	spin_unlock_irq(&ctx->ctx_lock);
-
-	memset(res, 0, sizeof(*res));
-	res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
-	res->data = kiocb->ki_user_data;
-	ret = cancel(kiocb, res);
-
-	spin_lock_irq(&ctx->ctx_lock);
-
-	return ret;
+	return cancel(kiocb);
 }
 
 static void free_ioctx_rcu(struct rcu_head *head)
 {
 	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+
+	free_percpu(ctx->cpu);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -282,12 +425,13 @@ static void free_ioctx_rcu(struct rcu_head *head)
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  * now it's safe to cancel any that need to be.
  */
-static void free_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct work_struct *work)
 {
+	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
 	struct aio_ring *ring;
-	struct io_event res;
 	struct kiocb *req;
-	unsigned head, avail;
+	unsigned cpu, avail;
+	DEFINE_WAIT(wait);
 
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -296,28 +440,38 @@ static void free_ioctx(struct kioctx *ctx)
 				       struct kiocb, ki_list);
 
 		list_del_init(&req->ki_list);
-		kiocb_cancel(ctx, req, &res);
+		kiocb_cancel(ctx, req);
 	}
 
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	ring = kmap_atomic(ctx->ring_pages[0]);
-	head = ring->head;
-	kunmap_atomic(ring);
+	for_each_possible_cpu(cpu) {
+		struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
+
+		atomic_add(kcpu->reqs_available, &ctx->reqs_available);
+		kcpu->reqs_available = 0;
+	}
 
-	while (atomic_read(&ctx->reqs_active) > 0) {
-		wait_event(ctx->wait,
-			   head != ctx->tail ||
-			   atomic_read(&ctx->reqs_active) <= 0);
+	while (1) {
+		prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		avail = (ring->head <= ring->tail)
+			 ? ring->tail - ring->head
+			 : ctx->nr_events - ring->head + ring->tail;
 
-		atomic_sub(avail, &ctx->reqs_active);
-		head += avail;
-		head %= ctx->nr_events;
+		atomic_add(avail, &ctx->reqs_available);
+		ring->head = ring->tail;
+		kunmap_atomic(ring);
+
+		if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
+			break;
+
+		schedule();
 	}
+	finish_wait(&ctx->wait, &wait);
 
-	WARN_ON(atomic_read(&ctx->reqs_active) < 0);
+	WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
 
 	aio_free_ring(ctx);
 
@@ -333,10 +487,68 @@ static void free_ioctx(struct kioctx *ctx)
 	call_rcu(&ctx->rcu_head, free_ioctx_rcu);
 }
 
-static void put_ioctx(struct kioctx *ctx)
+static void free_ioctx_ref(struct percpu_ref *ref)
 {
-	if (unlikely(atomic_dec_and_test(&ctx->users)))
-		free_ioctx(ctx);
+	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+
+	INIT_WORK(&ctx->free_work, free_ioctx);
+	schedule_work(&ctx->free_work);
+}
+
+static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
+{
+	unsigned i, new_nr;
+	struct kioctx_table *table, *old;
+	struct aio_ring *ring;
+
+	spin_lock(&mm->ioctx_lock);
+	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
+
+	while (1) {
+		if (table)
+			for (i = 0; i < table->nr; i++)
+				if (!table->table[i]) {
+					ctx->id = i;
+					table->table[i] = ctx;
+					rcu_read_unlock();
+					spin_unlock(&mm->ioctx_lock);
+
+					ring = kmap_atomic(ctx->ring_pages[0]);
+					ring->id = ctx->id;
+					kunmap_atomic(ring);
+					return 0;
+				}
+
+		new_nr = (table ? table->nr : 1) * 4;
+
+		rcu_read_unlock();
+		spin_unlock(&mm->ioctx_lock);
+
+		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
+				new_nr, GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+
+		table->nr = new_nr;
+
+		spin_lock(&mm->ioctx_lock);
+		rcu_read_lock();
+		old = rcu_dereference(mm->ioctx_table);
+
+		if (!old) {
+			rcu_assign_pointer(mm->ioctx_table, table);
+		} else if (table->nr > old->nr) {
+			memcpy(table->table, old->table,
+			       old->nr * sizeof(struct kioctx *));
+
+			rcu_assign_pointer(mm->ioctx_table, table);
+			kfree_rcu(old, rcu);
+		} else {
+			kfree(table);
+			table = old;
+		}
+	}
 }
 
 /* ioctx_alloc
@@ -348,6 +560,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	struct kioctx *ctx;
 	int err = -ENOMEM;
 
+	/*
+	 * We keep track of the number of available ringbuffer slots, to prevent
+	 * overflow (reqs_available), and we also use percpu counters for this.
+	 *
+	 * So since up to half the slots might be on other cpu's percpu counters
+	 * and unavailable, double nr_events so userspace sees what they
+	 * expected: additionally, we move req_batch slots to/from percpu
+	 * counters at a time, so make sure that isn't 0:
+	 */
+	nr_events = max(nr_events, num_possible_cpus() * 4);
+	nr_events *= 2;
+
 	/* Prevent overflows */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
 	    (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
@@ -355,7 +579,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)nr_events > aio_max_nr)
+	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -364,8 +588,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->max_reqs = nr_events;
 
-	atomic_set(&ctx->users, 2);
-	atomic_set(&ctx->dead, 0);
+	if (percpu_ref_init(&ctx->users, free_ioctx_ref))
+		goto out_freectx;
+
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
@@ -373,12 +598,21 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
+	ctx->cpu = alloc_percpu(struct kioctx_cpu);
+	if (!ctx->cpu)
+		goto out_freeref;
+
 	if (aio_setup_ring(ctx) < 0)
-		goto out_freectx;
+		goto out_freepcpu;
+
+	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
+	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
+	if (ctx->req_batch < 1)
+		ctx->req_batch = 1;
 
 	/* limit the number of system wide aios */
 	spin_lock(&aio_nr_lock);
-	if (aio_nr + nr_events > aio_max_nr ||
+	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
 	    aio_nr + nr_events < aio_nr) {
 		spin_unlock(&aio_nr_lock);
 		goto out_cleanup;
@@ -386,49 +620,53 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
 
-	/* now link into global list. */
-	spin_lock(&mm->ioctx_lock);
-	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
-	spin_unlock(&mm->ioctx_lock);
+	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
+
+	err = ioctx_add_table(ctx, mm);
+	if (err)
+		goto out_cleanup_put;
 
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
 
+out_cleanup_put:
+	percpu_ref_put(&ctx->users);
 out_cleanup:
 	err = -EAGAIN;
 	aio_free_ring(ctx);
+out_freepcpu:
+	free_percpu(ctx->cpu);
+out_freeref:
+	free_percpu(ctx->users.pcpu_count);
 out_freectx:
+	put_aio_ring_file(ctx);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
 }
 
-static void kill_ioctx_work(struct work_struct *work)
-{
-	struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);
-
-	wake_up_all(&ctx->wait);
-	put_ioctx(ctx);
-}
-
-static void kill_ioctx_rcu(struct rcu_head *head)
-{
-	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
-
-	INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
-	schedule_work(&ctx->rcu_work);
-}
-
 /* kill_ioctx
  *	Cancels all outstanding aio requests on an aio context.  Used
  *	when the processes owning a context have all exited to encourage
  *	the rapid destruction of the kioctx.
  */
-static void kill_ioctx(struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
-		hlist_del_rcu(&ctx->list);
+		struct kioctx_table *table;
+
+		spin_lock(&mm->ioctx_lock);
+		rcu_read_lock();
+		table = rcu_dereference(mm->ioctx_table);
+
+		WARN_ON(ctx != table->table[ctx->id]);
+		table->table[ctx->id] = NULL;
+		rcu_read_unlock();
+		spin_unlock(&mm->ioctx_lock);
+
+		/* percpu_ref_kill() will do the necessary call_rcu() */
+		wake_up_all(&ctx->wait);
 
 		/*
 		 * It'd be more correct to do this in free_ioctx(), after all
@@ -445,24 +683,23 @@ static void kill_ioctx(struct kioctx *ctx)
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
-		/* Between hlist_del_rcu() and dropping the initial ref */
-		call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+		percpu_ref_kill(&ctx->users);
 	}
 }
 
 /* wait_on_sync_kiocb:
  *	Waits on the given sync kiocb to complete.
  */
-ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
+ssize_t wait_on_sync_kiocb(struct kiocb *req)
 {
-	while (atomic_read(&iocb->ki_users)) {
+	while (!req->ki_ctx) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!atomic_read(&iocb->ki_users))
+		if (req->ki_ctx)
 			break;
 		io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
-	return iocb->ki_user_data;
+	return req->ki_user_data;
 }
 EXPORT_SYMBOL(wait_on_sync_kiocb);
 
@@ -476,16 +713,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
+	struct kioctx_table *table;
 	struct kioctx *ctx;
-	struct hlist_node *n;
+	unsigned i = 0;
 
-	hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
-		if (1 != atomic_read(&ctx->users))
-			printk(KERN_DEBUG
-				"exit_aio:ioctx still alive: %d %d %d\n",
-				atomic_read(&ctx->users),
-				atomic_read(&ctx->dead),
-				atomic_read(&ctx->reqs_active));
+	while (1) {
+		rcu_read_lock();
+		table = rcu_dereference(mm->ioctx_table);
+
+		do {
+			if (!table || i >= table->nr) {
+				rcu_read_unlock();
+				rcu_assign_pointer(mm->ioctx_table, NULL);
+				if (table)
+					kfree(table);
+				return;
+			}
+
+			ctx = table->table[i++];
+		} while (!ctx);
+
+		rcu_read_unlock();
+
 		/*
 		 * We don't need to bother with munmap() here -
 		 * exit_mmap(mm) is coming and it'll unmap everything.
@@ -496,40 +745,75 @@ void exit_aio(struct mm_struct *mm)
 		 */
 		ctx->mmap_size = 0;
 
-		kill_ioctx(ctx);
+		kill_ioctx(mm, ctx);
+	}
+}
+
+static void put_reqs_available(struct kioctx *ctx, unsigned nr)
+{
+	struct kioctx_cpu *kcpu;
+
+	preempt_disable();
+	kcpu = this_cpu_ptr(ctx->cpu);
+
+	kcpu->reqs_available += nr;
+	while (kcpu->reqs_available >= ctx->req_batch * 2) {
+		kcpu->reqs_available -= ctx->req_batch;
+		atomic_add(ctx->req_batch, &ctx->reqs_available);
 	}
+
+	preempt_enable();
+}
+
+static bool get_reqs_available(struct kioctx *ctx)
+{
+	struct kioctx_cpu *kcpu;
+	bool ret = false;
+
+	preempt_disable();
+	kcpu = this_cpu_ptr(ctx->cpu);
+
+	if (!kcpu->reqs_available) {
+		int old, avail = atomic_read(&ctx->reqs_available);
+
+		do {
+			if (avail < ctx->req_batch)
+				goto out;
+
+			old = avail;
+			avail = atomic_cmpxchg(&ctx->reqs_available,
+					       avail, avail - ctx->req_batch);
+		} while (avail != old);
+
+		kcpu->reqs_available += ctx->req_batch;
+	}
+
+	ret = true;
+	kcpu->reqs_available--;
+out:
+	preempt_enable();
+	return ret;
 }
 
 /* aio_get_req
- *	Allocate a slot for an aio request.  Increments the ki_users count
- *	of the kioctx so that the kioctx stays around until all requests are
- *	complete.  Returns NULL if no requests are free.
- *
- *	Returns with kiocb->ki_users set to 2.  The io submit code path holds
- *	an extra reference while submitting the i/o.
- *	This prevents races between the aio code path referencing the
- *	req (after submitting it) and aio_complete() freeing the req.
+ *	Allocate a slot for an aio request.
+ *	Returns NULL if no requests are free.
  */
 static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (atomic_read(&ctx->reqs_active) >= ctx->nr_events)
+	if (!get_reqs_available(ctx))
 		return NULL;
 
-	if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1)
-		goto out_put;
-
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
 		goto out_put;
 
-	atomic_set(&req->ki_users, 2);
 	req->ki_ctx = ctx;
-
 	return req;
 out_put:
-	atomic_dec(&ctx->reqs_active);
+	put_reqs_available(ctx, 1);
 	return NULL;
 }
 
@@ -539,35 +823,32 @@ static void kiocb_free(struct kiocb *req)
 		fput(req->ki_filp);
 	if (req->ki_eventfd != NULL)
 		eventfd_ctx_put(req->ki_eventfd);
-	if (req->ki_dtor)
-		req->ki_dtor(req);
-	if (req->ki_iovec != &req->ki_inline_vec)
-		kfree(req->ki_iovec);
 	kmem_cache_free(kiocb_cachep, req);
 }
 
-void aio_put_req(struct kiocb *req)
-{
-	if (atomic_dec_and_test(&req->ki_users))
-		kiocb_free(req);
-}
-EXPORT_SYMBOL(aio_put_req);
-
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
+	struct aio_ring __user *ring  = (void __user *)ctx_id;
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx, *ret = NULL;
+	struct kioctx_table *table;
+	unsigned id;
+
+	if (get_user(id, &ring->id))
+		return NULL;
 
 	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
 
-	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-		if (ctx->user_id == ctx_id) {
-			atomic_inc(&ctx->users);
-			ret = ctx;
-			break;
-		}
-	}
+	if (!table || id >= table->nr)
+		goto out;
 
+	ctx = table->table[id];
+	if (ctx && ctx->user_id == ctx_id) {
+		percpu_ref_get(&ctx->users);
+		ret = ctx;
+	}
+out:
 	rcu_read_unlock();
 	return ret;
 }
@@ -591,16 +872,16 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	 *  - the sync task helpfully left a reference to itself in the iocb
 	 */
 	if (is_sync_kiocb(iocb)) {
-		BUG_ON(atomic_read(&iocb->ki_users) != 1);
 		iocb->ki_user_data = res;
-		atomic_set(&iocb->ki_users, 0);
+		smp_wmb();
+		iocb->ki_ctx = ERR_PTR(-EXDEV);
 		wake_up_process(iocb->ki_obj.tsk);
 		return;
 	}
 
 	/*
 	 * Take rcu_read_lock() in case the kioctx is being destroyed, as we
-	 * need to issue a wakeup after decrementing reqs_active.
+	 * need to issue a wakeup after incrementing reqs_available.
 	 */
 	rcu_read_lock();
 
@@ -613,17 +894,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	}
 
 	/*
-	 * cancelled requests don't get events, userland was given one
-	 * when the event got cancelled.
-	 */
-	if (unlikely(xchg(&iocb->ki_cancel,
-			  KIOCB_CANCELLED) == KIOCB_CANCELLED)) {
-		atomic_dec(&ctx->reqs_active);
-		/* Still need the wake_up in case free_ioctx is waiting */
-		goto put_rq;
-	}
-
-	/*
 	 * Add a completion event to the ring buffer. Must be done holding
 	 * ctx->completion_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
@@ -675,9 +945,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	if (iocb->ki_eventfd != NULL)
 		eventfd_signal(iocb->ki_eventfd, 1);
 
-put_rq:
 	/* everything turned out well, dispose of the aiocb. */
-	aio_put_req(iocb);
+	kiocb_free(iocb);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -702,7 +971,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 				  struct io_event __user *event, long nr)
 {
 	struct aio_ring *ring;
-	unsigned head, pos;
+	unsigned head, tail, pos;
 	long ret = 0;
 	int copy_ret;
 
@@ -710,11 +979,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
+	tail = ring->tail;
 	kunmap_atomic(ring);
 
-	pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events);
+	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
-	if (head == ctx->tail)
+	if (head == tail)
 		goto out;
 
 	while (ret < nr) {
@@ -722,8 +992,8 @@ static long aio_read_events_ring(struct kioctx *ctx,
 		struct io_event *ev;
 		struct page *page;
 
-		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
-		if (head == ctx->tail)
+		avail = (head <= tail ? tail : ctx->nr_events) - head;
+		if (head == tail)
 			break;
 
 		avail = min(avail, nr - ret);
@@ -754,9 +1024,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
-	pr_debug("%li h%u t%u\n", ret, head, ctx->tail);
+	pr_debug("%li h%u t%u\n", ret, head, tail);
 
-	atomic_sub(ret, &ctx->reqs_active);
+	put_reqs_available(ctx, ret);
 out:
 	mutex_unlock(&ctx->ring_lock);
 
@@ -854,8 +1124,8 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
854 if (!IS_ERR(ioctx)) { 1124 if (!IS_ERR(ioctx)) {
855 ret = put_user(ioctx->user_id, ctxp); 1125 ret = put_user(ioctx->user_id, ctxp);
856 if (ret) 1126 if (ret)
857 kill_ioctx(ioctx); 1127 kill_ioctx(current->mm, ioctx);
858 put_ioctx(ioctx); 1128 percpu_ref_put(&ioctx->users);
859 } 1129 }
860 1130
861out: 1131out:
@@ -872,101 +1142,37 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
872{ 1142{
873 struct kioctx *ioctx = lookup_ioctx(ctx); 1143 struct kioctx *ioctx = lookup_ioctx(ctx);
874 if (likely(NULL != ioctx)) { 1144 if (likely(NULL != ioctx)) {
875 kill_ioctx(ioctx); 1145 kill_ioctx(current->mm, ioctx);
876 put_ioctx(ioctx); 1146 percpu_ref_put(&ioctx->users);
877 return 0; 1147 return 0;
878 } 1148 }
879 pr_debug("EINVAL: io_destroy: invalid context id\n"); 1149 pr_debug("EINVAL: io_destroy: invalid context id\n");
880 return -EINVAL; 1150 return -EINVAL;
881} 1151}
882 1152
883static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
884{
885 struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];
886
887 BUG_ON(ret <= 0);
888
889 while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
890 ssize_t this = min((ssize_t)iov->iov_len, ret);
891 iov->iov_base += this;
892 iov->iov_len -= this;
893 iocb->ki_left -= this;
894 ret -= this;
895 if (iov->iov_len == 0) {
896 iocb->ki_cur_seg++;
897 iov++;
898 }
899 }
900
901 /* the caller should not have done more io than what fit in
902 * the remaining iovecs */
903 BUG_ON(ret > 0 && iocb->ki_left == 0);
904}
905
906typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, 1153typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
907 unsigned long, loff_t); 1154 unsigned long, loff_t);
908 1155
909static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) 1156static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
910{ 1157 int rw, char __user *buf,
911 struct file *file = iocb->ki_filp; 1158 unsigned long *nr_segs,
912 struct address_space *mapping = file->f_mapping; 1159 struct iovec **iovec,
913 struct inode *inode = mapping->host; 1160 bool compat)
914 ssize_t ret = 0;
915
916 /* This matches the pread()/pwrite() logic */
917 if (iocb->ki_pos < 0)
918 return -EINVAL;
919
920 if (rw == WRITE)
921 file_start_write(file);
922 do {
923 ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
924 iocb->ki_nr_segs - iocb->ki_cur_seg,
925 iocb->ki_pos);
926 if (ret > 0)
927 aio_advance_iovec(iocb, ret);
928
929 /* retry all partial writes. retry partial reads as long as its a
930 * regular file. */
931 } while (ret > 0 && iocb->ki_left > 0 &&
932 (rw == WRITE ||
933 (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
934 if (rw == WRITE)
935 file_end_write(file);
936
937 /* This means we must have transferred all that we could */
938 /* No need to retry anymore */
939 if ((ret == 0) || (iocb->ki_left == 0))
940 ret = iocb->ki_nbytes - iocb->ki_left;
941
942 /* If we managed to write some out we return that, rather than
943 * the eventual error. */
944 if (rw == WRITE
945 && ret < 0 && ret != -EIOCBQUEUED
946 && iocb->ki_nbytes - iocb->ki_left)
947 ret = iocb->ki_nbytes - iocb->ki_left;
948
949 return ret;
950}
951
952static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
953{ 1161{
954 ssize_t ret; 1162 ssize_t ret;
955 1163
956 kiocb->ki_nr_segs = kiocb->ki_nbytes; 1164 *nr_segs = kiocb->ki_nbytes;
957 1165
958#ifdef CONFIG_COMPAT 1166#ifdef CONFIG_COMPAT
959 if (compat) 1167 if (compat)
960 ret = compat_rw_copy_check_uvector(rw, 1168 ret = compat_rw_copy_check_uvector(rw,
961 (struct compat_iovec __user *)kiocb->ki_buf, 1169 (struct compat_iovec __user *)buf,
962 kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, 1170 *nr_segs, 1, *iovec, iovec);
963 &kiocb->ki_iovec);
964 else 1171 else
965#endif 1172#endif
966 ret = rw_copy_check_uvector(rw, 1173 ret = rw_copy_check_uvector(rw,
967 (struct iovec __user *)kiocb->ki_buf, 1174 (struct iovec __user *)buf,
968 kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, 1175 *nr_segs, 1, *iovec, iovec);
969 &kiocb->ki_iovec);
970 if (ret < 0) 1176 if (ret < 0)
971 return ret; 1177 return ret;
972 1178
@@ -975,15 +1181,17 @@ static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
975 return 0; 1181 return 0;
976} 1182}
977 1183
978static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) 1184static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
1185 int rw, char __user *buf,
1186 unsigned long *nr_segs,
1187 struct iovec *iovec)
979{ 1188{
980 if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) 1189 if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
981 return -EFAULT; 1190 return -EFAULT;
982 1191
983 kiocb->ki_iovec = &kiocb->ki_inline_vec; 1192 iovec->iov_base = buf;
984 kiocb->ki_iovec->iov_base = kiocb->ki_buf; 1193 iovec->iov_len = kiocb->ki_nbytes;
985 kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; 1194 *nr_segs = 1;
986 kiocb->ki_nr_segs = 1;
987 return 0; 1195 return 0;
988} 1196}
989 1197
@@ -992,15 +1200,18 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
992 * Performs the initial checks and aio retry method 1200 * Performs the initial checks and aio retry method
993 * setup for the kiocb at the time of io submission. 1201 * setup for the kiocb at the time of io submission.
994 */ 1202 */
995static ssize_t aio_run_iocb(struct kiocb *req, bool compat) 1203static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
1204 char __user *buf, bool compat)
996{ 1205{
997 struct file *file = req->ki_filp; 1206 struct file *file = req->ki_filp;
998 ssize_t ret; 1207 ssize_t ret;
1208 unsigned long nr_segs;
999 int rw; 1209 int rw;
1000 fmode_t mode; 1210 fmode_t mode;
1001 aio_rw_op *rw_op; 1211 aio_rw_op *rw_op;
1212 struct iovec inline_vec, *iovec = &inline_vec;
1002 1213
1003 switch (req->ki_opcode) { 1214 switch (opcode) {
1004 case IOCB_CMD_PREAD: 1215 case IOCB_CMD_PREAD:
1005 case IOCB_CMD_PREADV: 1216 case IOCB_CMD_PREADV:
1006 mode = FMODE_READ; 1217 mode = FMODE_READ;
@@ -1021,21 +1232,38 @@ rw_common:
1021 if (!rw_op) 1232 if (!rw_op)
1022 return -EINVAL; 1233 return -EINVAL;
1023 1234
1024 ret = (req->ki_opcode == IOCB_CMD_PREADV || 1235 ret = (opcode == IOCB_CMD_PREADV ||
1025 req->ki_opcode == IOCB_CMD_PWRITEV) 1236 opcode == IOCB_CMD_PWRITEV)
1026 ? aio_setup_vectored_rw(rw, req, compat) 1237 ? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
1027 : aio_setup_single_vector(rw, req); 1238 &iovec, compat)
1239 : aio_setup_single_vector(req, rw, buf, &nr_segs,
1240 iovec);
1028 if (ret) 1241 if (ret)
1029 return ret; 1242 return ret;
1030 1243
1031 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); 1244 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
1032 if (ret < 0) 1245 if (ret < 0) {
1246 if (iovec != &inline_vec)
1247 kfree(iovec);
1033 return ret; 1248 return ret;
1249 }
1034 1250
1035 req->ki_nbytes = ret; 1251 req->ki_nbytes = ret;
1036 req->ki_left = ret;
1037 1252
1038 ret = aio_rw_vect_retry(req, rw, rw_op); 1253 /* XXX: move/kill - rw_verify_area()? */
1254 /* This matches the pread()/pwrite() logic */
1255 if (req->ki_pos < 0) {
1256 ret = -EINVAL;
1257 break;
1258 }
1259
1260 if (rw == WRITE)
1261 file_start_write(file);
1262
1263 ret = rw_op(req, iovec, nr_segs, req->ki_pos);
1264
1265 if (rw == WRITE)
1266 file_end_write(file);
1039 break; 1267 break;
1040 1268
1041 case IOCB_CMD_FDSYNC: 1269 case IOCB_CMD_FDSYNC:
@@ -1057,6 +1285,9 @@ rw_common:
1057 return -EINVAL; 1285 return -EINVAL;
1058 } 1286 }
1059 1287
1288 if (iovec != &inline_vec)
1289 kfree(iovec);
1290
1060 if (ret != -EIOCBQUEUED) { 1291 if (ret != -EIOCBQUEUED) {
1061 /* 1292 /*
1062 * There's no easy way to restart the syscall since other AIO's 1293 * There's no easy way to restart the syscall since other AIO's
@@ -1128,21 +1359,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1128 req->ki_obj.user = user_iocb; 1359 req->ki_obj.user = user_iocb;
1129 req->ki_user_data = iocb->aio_data; 1360 req->ki_user_data = iocb->aio_data;
1130 req->ki_pos = iocb->aio_offset; 1361 req->ki_pos = iocb->aio_offset;
1362 req->ki_nbytes = iocb->aio_nbytes;
1131 1363
1132 req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; 1364 ret = aio_run_iocb(req, iocb->aio_lio_opcode,
1133 req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1365 (char __user *)(unsigned long)iocb->aio_buf,
1134 req->ki_opcode = iocb->aio_lio_opcode; 1366 compat);
1135
1136 ret = aio_run_iocb(req, compat);
1137 if (ret) 1367 if (ret)
1138 goto out_put_req; 1368 goto out_put_req;
1139 1369
1140 aio_put_req(req); /* drop extra ref to req */
1141 return 0; 1370 return 0;
1142out_put_req: 1371out_put_req:
1143 atomic_dec(&ctx->reqs_active); 1372 put_reqs_available(ctx, 1);
1144 aio_put_req(req); /* drop extra ref to req */ 1373 kiocb_free(req);
1145 aio_put_req(req); /* drop i/o ref to req */
1146 return ret; 1374 return ret;
1147} 1375}
1148 1376
@@ -1195,7 +1423,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1195 } 1423 }
1196 blk_finish_plug(&plug); 1424 blk_finish_plug(&plug);
1197 1425
1198 put_ioctx(ctx); 1426 percpu_ref_put(&ctx->users);
1199 return i ? i : ret; 1427 return i ? i : ret;
1200} 1428}
1201 1429
@@ -1252,7 +1480,6 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
1252SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, 1480SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
1253 struct io_event __user *, result) 1481 struct io_event __user *, result)
1254{ 1482{
1255 struct io_event res;
1256 struct kioctx *ctx; 1483 struct kioctx *ctx;
1257 struct kiocb *kiocb; 1484 struct kiocb *kiocb;
1258 u32 key; 1485 u32 key;
@@ -1270,21 +1497,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
1270 1497
1271 kiocb = lookup_kiocb(ctx, iocb, key); 1498 kiocb = lookup_kiocb(ctx, iocb, key);
1272 if (kiocb) 1499 if (kiocb)
1273 ret = kiocb_cancel(ctx, kiocb, &res); 1500 ret = kiocb_cancel(ctx, kiocb);
1274 else 1501 else
1275 ret = -EINVAL; 1502 ret = -EINVAL;
1276 1503
1277 spin_unlock_irq(&ctx->ctx_lock); 1504 spin_unlock_irq(&ctx->ctx_lock);
1278 1505
1279 if (!ret) { 1506 if (!ret) {
1280 /* Cancellation succeeded -- copy the result 1507 /*
1281 * into the user's buffer. 1508 * The result argument is no longer used - the io_event is
1509 * always delivered via the ring buffer. -EINPROGRESS indicates
1510 * cancellation is in progress:
1282 */ 1511 */
1283 if (copy_to_user(result, &res, sizeof(res))) 1512 ret = -EINPROGRESS;
1284 ret = -EFAULT;
1285 } 1513 }
1286 1514
1287 put_ioctx(ctx); 1515 percpu_ref_put(&ctx->users);
1288 1516
1289 return ret; 1517 return ret;
1290} 1518}
@@ -1313,7 +1541,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1313 if (likely(ioctx)) { 1541 if (likely(ioctx)) {
1314 if (likely(min_nr <= nr && min_nr >= 0)) 1542 if (likely(min_nr <= nr && min_nr >= 0))
1315 ret = read_events(ioctx, min_nr, nr, events, timeout); 1543 ret = read_events(ioctx, min_nr, nr, events, timeout);
1316 put_ioctx(ioctx); 1544 percpu_ref_put(&ioctx->users);
1317 } 1545 }
1318 return ret; 1546 return ret;
1319} 1547}
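
The io_cancel() change above is user-visible: on successful cancellation the
syscall now returns -EINPROGRESS and the io_event is delivered through the
completion ring rather than copied into the result argument. A minimal
userspace sketch of the new contract, assuming the raw syscall interface;
the file descriptor and buffer are illustrative only:

	#include <linux/aio_abi.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>
	#include <errno.h>

	int main(void)
	{
		aio_context_t ctx = 0;
		struct iocb cb, *cbs[1] = { &cb };
		struct io_event ev;
		char buf[512];

		if (syscall(SYS_io_setup, 8, &ctx) < 0)
			return 1;

		memset(&cb, 0, sizeof(cb));
		cb.aio_lio_opcode = IOCB_CMD_PREAD;
		cb.aio_fildes = 0;			/* illustrative fd */
		cb.aio_buf = (unsigned long)buf;
		cb.aio_nbytes = sizeof(buf);

		if (syscall(SYS_io_submit, ctx, 1, cbs) == 1 &&
		    syscall(SYS_io_cancel, ctx, &cb, &ev) < 0 &&
		    errno == EINPROGRESS) {
			/* 'ev' was not filled in; wait for the ring event */
			syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL);
		}

		return syscall(SYS_io_destroy, ctx);
	}
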
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47a65df8c871..85c961849953 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -109,6 +109,72 @@ static struct file_system_type anon_inode_fs_type = {
109}; 109};
110 110
111/** 111/**
112 * anon_inode_getfile_private - creates a new file instance by hooking it up to an
 113 * anonymous inode, and a dentry that describes the "class"
114 * of the file
115 *
116 * @name: [in] name of the "class" of the new file
117 * @fops: [in] file operations for the new file
118 * @priv: [in] private data for the new file (will be file's private_data)
119 * @flags: [in] flags
 120 *
122 * Similar to anon_inode_getfile, but each file holds a single inode.
123 *
124 */
125struct file *anon_inode_getfile_private(const char *name,
126 const struct file_operations *fops,
127 void *priv, int flags)
128{
129 struct qstr this;
130 struct path path;
131 struct file *file;
132 struct inode *inode;
133
134 if (fops->owner && !try_module_get(fops->owner))
135 return ERR_PTR(-ENOENT);
136
137 inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb);
138 if (IS_ERR(inode)) {
139 file = ERR_PTR(-ENOMEM);
140 goto err_module;
141 }
142
143 /*
144 * Link the inode to a directory entry by creating a unique name
145 * using the inode sequence number.
146 */
147 file = ERR_PTR(-ENOMEM);
148 this.name = name;
149 this.len = strlen(name);
150 this.hash = 0;
151 path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
152 if (!path.dentry)
153 goto err_module;
154
155 path.mnt = mntget(anon_inode_mnt);
156
157 d_instantiate(path.dentry, inode);
158
159 file = alloc_file(&path, OPEN_FMODE(flags), fops);
160 if (IS_ERR(file))
161 goto err_dput;
162
163 file->f_mapping = inode->i_mapping;
164 file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
165 file->private_data = priv;
166
167 return file;
168
169err_dput:
170 path_put(&path);
171err_module:
172 module_put(fops->owner);
173 return file;
174}
175EXPORT_SYMBOL_GPL(anon_inode_getfile_private);
176
177/**
112 * anon_inode_getfile - creates a new file instance by hooking it up to an 178 * anon_inode_getfile - creates a new file instance by hooking it up to an
 113 * anonymous inode, and a dentry that describes the "class" 179 * anonymous inode, and a dentry that describes the "class"
114 * of the file 180 * of the file
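
Unlike anon_inode_getfile(), the new anon_inode_getfile_private() above
allocates a fresh inode per call, so state such as file->f_mapping is not
shared between files. A hedged sketch of a caller, where myfs_fops and the
priv payload are hypothetical:

	#include <linux/anon_inodes.h>
	#include <linux/file.h>
	#include <linux/fs.h>

	static const struct file_operations myfs_fops;	/* hypothetical */

	static int myfs_new_instance(void *priv)
	{
		struct file *file;

		file = anon_inode_getfile_private("[myfs]", &myfs_fops,
						  priv, O_RDWR);
		if (IS_ERR(file))
			return PTR_ERR(file);

		/* this file owns its inode; f_mapping is private to it */
		fput(file);
		return 0;
	}
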
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 743c7c2c949d..0f00da329e71 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -183,13 +183,14 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
183 return 0; 183 return 0;
184} 184}
185 185
186/* Find the topmost mount satisfying test() */
186static int find_autofs_mount(const char *pathname, 187static int find_autofs_mount(const char *pathname,
187 struct path *res, 188 struct path *res,
188 int test(struct path *path, void *data), 189 int test(struct path *path, void *data),
189 void *data) 190 void *data)
190{ 191{
191 struct path path; 192 struct path path;
192 int err = kern_path(pathname, 0, &path); 193 int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
193 if (err) 194 if (err)
194 return err; 195 return err;
195 err = -ENOENT; 196 err = -ENOENT;
@@ -197,10 +198,9 @@ static int find_autofs_mount(const char *pathname,
197 if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { 198 if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
198 if (test(&path, data)) { 199 if (test(&path, data)) {
199 path_get(&path); 200 path_get(&path);
200 if (!err) /* already found some */
201 path_put(res);
202 *res = path; 201 *res = path;
203 err = 0; 202 err = 0;
203 break;
204 } 204 }
205 } 205 }
206 if (!follow_up(&path)) 206 if (!follow_up(&path))
@@ -486,12 +486,11 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
486 * mount if there is one or 0 if it isn't a mountpoint. 486 * mount if there is one or 0 if it isn't a mountpoint.
487 * 487 *
488 * If we aren't supplied with a file descriptor then we 488 * If we aren't supplied with a file descriptor then we
489 * lookup the nameidata of the path and check if it is the 489 * lookup the path and check if it is the root of a mount.
490 * root of a mount. If a type is given we are looking for 490 * If a type is given we are looking for a particular autofs
491 * a particular autofs mount and if we don't find a match 491 * mount and if we don't find a match we return fail. If the
492 * we return fail. If the located nameidata path is the 492 * located path is the root of a mount we return 1 along with
493 * root of a mount we return 1 along with the super magic 493 * the super magic of the mount or 0 otherwise.
494 * of the mount or 0 otherwise.
495 * 494 *
 496 * In both cases the device number (as returned by 495 * In both cases the device number (as returned by
497 * new_encode_dev()) is also returned. 496 * new_encode_dev()) is also returned.
@@ -519,9 +518,11 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
519 518
520 if (!fp || param->ioctlfd == -1) { 519 if (!fp || param->ioctlfd == -1) {
521 if (autofs_type_any(type)) 520 if (autofs_type_any(type))
522 err = kern_path(name, LOOKUP_FOLLOW, &path); 521 err = kern_path_mountpoint(AT_FDCWD,
522 name, &path, LOOKUP_FOLLOW);
523 else 523 else
524 err = find_autofs_mount(name, &path, test_by_type, &type); 524 err = find_autofs_mount(name, &path,
525 test_by_type, &type);
525 if (err) 526 if (err)
526 goto out; 527 goto out;
527 devid = new_encode_dev(path.dentry->d_sb->s_dev); 528 devid = new_encode_dev(path.dentry->d_sb->s_dev);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 3db70dae40d3..689e40d983ad 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -109,13 +109,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
109 109
110 pkt.hdr.proto_version = sbi->version; 110 pkt.hdr.proto_version = sbi->version;
111 pkt.hdr.type = type; 111 pkt.hdr.type = type;
112 mutex_lock(&sbi->wq_mutex);
113 112
114 /* Check if we have become catatonic */
115 if (sbi->catatonic) {
116 mutex_unlock(&sbi->wq_mutex);
117 return;
118 }
119 switch (type) { 113 switch (type) {
120 /* Kernel protocol v4 missing and expire packets */ 114 /* Kernel protocol v4 missing and expire packets */
121 case autofs_ptype_missing: 115 case autofs_ptype_missing:
@@ -427,7 +421,6 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
427 wq->tgid = current->tgid; 421 wq->tgid = current->tgid;
428 wq->status = -EINTR; /* Status return if interrupted */ 422 wq->status = -EINTR; /* Status return if interrupted */
429 wq->wait_ctr = 2; 423 wq->wait_ctr = 2;
430 mutex_unlock(&sbi->wq_mutex);
431 424
432 if (sbi->version < 5) { 425 if (sbi->version < 5) {
433 if (notify == NFY_MOUNT) 426 if (notify == NFY_MOUNT)
@@ -449,15 +442,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
449 (unsigned long) wq->wait_queue_token, wq->name.len, 442 (unsigned long) wq->wait_queue_token, wq->name.len,
450 wq->name.name, notify); 443 wq->name.name, notify);
451 444
452 /* autofs4_notify_daemon() may block */ 445 /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
453 autofs4_notify_daemon(sbi, wq, type); 446 autofs4_notify_daemon(sbi, wq, type);
454 } else { 447 } else {
455 wq->wait_ctr++; 448 wq->wait_ctr++;
456 mutex_unlock(&sbi->wq_mutex);
457 kfree(qstr.name);
458 DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", 449 DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
459 (unsigned long) wq->wait_queue_token, wq->name.len, 450 (unsigned long) wq->wait_queue_token, wq->name.len,
460 wq->name.name, notify); 451 wq->name.name, notify);
452 mutex_unlock(&sbi->wq_mutex);
453 kfree(qstr.name);
461 } 454 }
462 455
463 /* 456 /*
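
The waitq.c hunks hand ->wq_mutex ownership to the callee:
autofs4_notify_daemon() is now entered with the mutex held and drops it
itself, so the potentially blocking daemon write happens unlocked. A generic
sketch of this callee-unlocks pattern, with hypothetical names throughout:

	#include <linux/mutex.h>

	struct pkt;				/* hypothetical packet type */
	static void send_pkt(struct pkt *pkt);	/* may block */

	static DEFINE_MUTEX(wq_mutex);

	/* entered with wq_mutex held; drops it before blocking */
	static void notify_daemon(struct pkt *pkt)
	{
		mutex_unlock(&wq_mutex);
		send_pkt(pkt);
	}

	static void queue_and_notify(struct pkt *pkt)
	{
		mutex_lock(&wq_mutex);
		/* ... queue the wait entry under the lock ... */
		notify_daemon(pkt);	/* unlocks wq_mutex */
	}
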
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index ad3ea1497cc3..ae2892218335 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
166 struct inode *inode = mapping->host; 166 struct inode *inode = mapping->host;
167 167
168 if (to > inode->i_size) 168 if (to > inode->i_size)
169 truncate_pagecache(inode, to, inode->i_size); 169 truncate_pagecache(inode, inode->i_size);
170} 170}
171 171
172static int bfs_write_begin(struct file *file, struct address_space *mapping, 172static int bfs_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 100edcc5e312..4c94a79991bb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1413,7 +1413,7 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
1413 * long file_ofs 1413 * long file_ofs
1414 * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... 1414 * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
1415 */ 1415 */
1416static void fill_files_note(struct memelfnote *note) 1416static int fill_files_note(struct memelfnote *note)
1417{ 1417{
1418 struct vm_area_struct *vma; 1418 struct vm_area_struct *vma;
1419 unsigned count, size, names_ofs, remaining, n; 1419 unsigned count, size, names_ofs, remaining, n;
@@ -1428,11 +1428,11 @@ static void fill_files_note(struct memelfnote *note)
1428 names_ofs = (2 + 3 * count) * sizeof(data[0]); 1428 names_ofs = (2 + 3 * count) * sizeof(data[0]);
1429 alloc: 1429 alloc:
1430 if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ 1430 if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
1431 goto err; 1431 return -EINVAL;
1432 size = round_up(size, PAGE_SIZE); 1432 size = round_up(size, PAGE_SIZE);
1433 data = vmalloc(size); 1433 data = vmalloc(size);
1434 if (!data) 1434 if (!data)
1435 goto err; 1435 return -ENOMEM;
1436 1436
1437 start_end_ofs = data + 2; 1437 start_end_ofs = data + 2;
1438 name_base = name_curpos = ((char *)data) + names_ofs; 1438 name_base = name_curpos = ((char *)data) + names_ofs;
@@ -1485,7 +1485,7 @@ static void fill_files_note(struct memelfnote *note)
1485 1485
1486 size = name_curpos - (char *)data; 1486 size = name_curpos - (char *)data;
1487 fill_note(note, "CORE", NT_FILE, size, data); 1487 fill_note(note, "CORE", NT_FILE, size, data);
1488 err: ; 1488 return 0;
1489} 1489}
1490 1490
1491#ifdef CORE_DUMP_USE_REGSET 1491#ifdef CORE_DUMP_USE_REGSET
@@ -1686,8 +1686,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1686 fill_auxv_note(&info->auxv, current->mm); 1686 fill_auxv_note(&info->auxv, current->mm);
1687 info->size += notesize(&info->auxv); 1687 info->size += notesize(&info->auxv);
1688 1688
1689 fill_files_note(&info->files); 1689 if (fill_files_note(&info->files) == 0)
1690 info->size += notesize(&info->files); 1690 info->size += notesize(&info->files);
1691 1691
1692 return 1; 1692 return 1;
1693} 1693}
@@ -1719,7 +1719,8 @@ static int write_note_info(struct elf_note_info *info,
1719 return 0; 1719 return 0;
1720 if (first && !writenote(&info->auxv, file, foffset)) 1720 if (first && !writenote(&info->auxv, file, foffset))
1721 return 0; 1721 return 0;
1722 if (first && !writenote(&info->files, file, foffset)) 1722 if (first && info->files.data &&
1723 !writenote(&info->files, file, foffset))
1723 return 0; 1724 return 0;
1724 1725
1725 for (i = 1; i < info->thread_notes; ++i) 1726 for (i = 1; i < info->thread_notes; ++i)
@@ -1806,6 +1807,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
1806 1807
1807struct elf_note_info { 1808struct elf_note_info {
1808 struct memelfnote *notes; 1809 struct memelfnote *notes;
1810 struct memelfnote *notes_files;
1809 struct elf_prstatus *prstatus; /* NT_PRSTATUS */ 1811 struct elf_prstatus *prstatus; /* NT_PRSTATUS */
1810 struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */ 1812 struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */
1811 struct list_head thread_list; 1813 struct list_head thread_list;
@@ -1896,9 +1898,12 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1896 1898
1897 fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo); 1899 fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
1898 fill_auxv_note(info->notes + 3, current->mm); 1900 fill_auxv_note(info->notes + 3, current->mm);
1899 fill_files_note(info->notes + 4); 1901 info->numnote = 4;
1900 1902
1901 info->numnote = 5; 1903 if (fill_files_note(info->notes + info->numnote) == 0) {
1904 info->notes_files = info->notes + info->numnote;
1905 info->numnote++;
1906 }
1902 1907
1903 /* Try to dump the FPU. */ 1908 /* Try to dump the FPU. */
1904 info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, 1909 info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
@@ -1960,8 +1965,9 @@ static void free_note_info(struct elf_note_info *info)
1960 kfree(list_entry(tmp, struct elf_thread_status, list)); 1965 kfree(list_entry(tmp, struct elf_thread_status, list));
1961 } 1966 }
1962 1967
1963 /* Free data allocated by fill_files_note(): */ 1968 /* Free data possibly allocated by fill_files_note(): */
1964 vfree(info->notes[4].data); 1969 if (info->notes_files)
1970 vfree(info->notes_files->data);
1965 1971
1966 kfree(info->prstatus); 1972 kfree(info->prstatus);
1967 kfree(info->psinfo); 1973 kfree(info->psinfo);
@@ -2044,7 +2050,7 @@ static int elf_core_dump(struct coredump_params *cprm)
2044 struct vm_area_struct *vma, *gate_vma; 2050 struct vm_area_struct *vma, *gate_vma;
2045 struct elfhdr *elf = NULL; 2051 struct elfhdr *elf = NULL;
2046 loff_t offset = 0, dataoff, foffset; 2052 loff_t offset = 0, dataoff, foffset;
2047 struct elf_note_info info; 2053 struct elf_note_info info = { };
2048 struct elf_phdr *phdr4note = NULL; 2054 struct elf_phdr *phdr4note = NULL;
2049 struct elf_shdr *shdr4extnum = NULL; 2055 struct elf_shdr *shdr4extnum = NULL;
2050 Elf_Half e_phnum; 2056 Elf_Half e_phnum;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 8fb42916d8a2..fc60b31453ee 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -716,13 +716,14 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
716 return 0; 716 return 0;
717 717
718 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); 718 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
719 719 if (!bs->bio_integrity_pool)
720 bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
721 if (!bs->bvec_integrity_pool)
722 return -1; 720 return -1;
723 721
724 if (!bs->bio_integrity_pool) 722 bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
723 if (!bs->bvec_integrity_pool) {
724 mempool_destroy(bs->bio_integrity_pool);
725 return -1; 725 return -1;
726 }
726 727
727 return 0; 728 return 0;
728} 729}
@@ -734,7 +735,7 @@ void bioset_integrity_free(struct bio_set *bs)
734 mempool_destroy(bs->bio_integrity_pool); 735 mempool_destroy(bs->bio_integrity_pool);
735 736
736 if (bs->bvec_integrity_pool) 737 if (bs->bvec_integrity_pool)
737 mempool_destroy(bs->bio_integrity_pool); 738 mempool_destroy(bs->bvec_integrity_pool);
738} 739}
739EXPORT_SYMBOL(bioset_integrity_free); 740EXPORT_SYMBOL(bioset_integrity_free);
740 741
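
The bioset_integrity_create() fix above restores the usual
allocate-then-unwind shape: check each mempool immediately after creating it
and destroy the earlier ones on failure. The bare pattern, sketched with a
hypothetical two-pool structure:

	#include <linux/mempool.h>
	#include <linux/slab.h>

	struct two_pools {
		mempool_t *a, *b;	/* hypothetical pool pair */
	};

	static int two_pools_create(struct two_pools *p, int pool_size,
				    struct kmem_cache *slab)
	{
		p->a = mempool_create_slab_pool(pool_size, slab);
		if (!p->a)
			return -ENOMEM;

		p->b = mempool_create_slab_pool(pool_size, slab);
		if (!p->b) {
			mempool_destroy(p->a);	/* unwind the first pool */
			p->a = NULL;
			return -ENOMEM;
		}
		return 0;
	}
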
diff --git a/fs/bio.c b/fs/bio.c
index c5eae7251490..ea5035da4d9a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -917,8 +917,8 @@ void bio_copy_data(struct bio *dst, struct bio *src)
917 src_p = kmap_atomic(src_bv->bv_page); 917 src_p = kmap_atomic(src_bv->bv_page);
918 dst_p = kmap_atomic(dst_bv->bv_page); 918 dst_p = kmap_atomic(dst_bv->bv_page);
919 919
920 memcpy(dst_p + dst_bv->bv_offset, 920 memcpy(dst_p + dst_offset,
921 src_p + src_bv->bv_offset, 921 src_p + src_offset,
922 bytes); 922 bytes);
923 923
924 kunmap_atomic(dst_p); 924 kunmap_atomic(dst_p);
@@ -1956,7 +1956,7 @@ int bio_associate_current(struct bio *bio)
1956 1956
1957 /* associate blkcg if exists */ 1957 /* associate blkcg if exists */
1958 rcu_read_lock(); 1958 rcu_read_lock();
1959 css = task_subsys_state(current, blkio_subsys_id); 1959 css = task_css(current, blkio_subsys_id);
1960 if (css && css_tryget(css)) 1960 if (css && css_tryget(css))
1961 bio->bi_css = css; 1961 bio->bi_css = css;
1962 rcu_read_unlock(); 1962 rcu_read_unlock();
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c7bda5cd3da7..1e86823a9cbd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -592,7 +592,7 @@ static struct block_device *bd_acquire(struct inode *inode)
592 return bdev; 592 return bdev;
593} 593}
594 594
595static inline int sb_is_blkdev_sb(struct super_block *sb) 595int sb_is_blkdev_sb(struct super_block *sb)
596{ 596{
597 return sb == blockdev_superblock; 597 return sb == blockdev_superblock;
598} 598}
@@ -1519,7 +1519,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1519 1519
1520 blk_start_plug(&plug); 1520 blk_start_plug(&plug);
1521 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1521 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1522 if (ret > 0 || ret == -EIOCBQUEUED) { 1522 if (ret > 0) {
1523 ssize_t err; 1523 ssize_t err;
1524 1524
1525 err = generic_write_sync(file, pos, ret); 1525 err = generic_write_sync(file, pos, ret);
@@ -1542,7 +1542,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1542 return 0; 1542 return 0;
1543 1543
1544 size -= pos; 1544 size -= pos;
1545 if (size < iocb->ki_left) 1545 if (size < iocb->ki_nbytes)
1546 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); 1546 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
1547 return generic_file_aio_read(iocb, iov, nr_segs, pos); 1547 return generic_file_aio_read(iocb, iov, nr_segs, pos);
1548} 1548}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2a9bd5bd24c3..9efb94e95858 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -744,7 +744,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
744 WARN_ON(atomic_xchg( 744 WARN_ON(atomic_xchg(
745 &fs_info->mutually_exclusive_operation_running, 1)); 745 &fs_info->mutually_exclusive_operation_running, 1));
746 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); 746 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
747 return PTR_RET(task); 747 return PTR_ERR_OR_ZERO(task);
748} 748}
749 749
750static int btrfs_dev_replace_kthread(void *data) 750static int btrfs_dev_replace_kthread(void *data)
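
PTR_RET() is renamed to PTR_ERR_OR_ZERO() here and in the other btrfs hunks
below; the new name says what it does. A short sketch, where worker_fn is a
hypothetical thread body:

	#include <linux/err.h>
	#include <linux/kthread.h>

	static int worker_fn(void *data);	/* hypothetical */

	static int start_worker(void *data)
	{
		struct task_struct *tsk = kthread_run(worker_fn, data, "worker");

		/* 0 if kthread_run() succeeded, -errno from the ERR_PTR otherwise */
		return PTR_ERR_OR_ZERO(tsk);
	}
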
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d8ea0cb200b4..51731b76900d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -145,8 +145,16 @@ int __init extent_io_init(void)
145 offsetof(struct btrfs_io_bio, bio)); 145 offsetof(struct btrfs_io_bio, bio));
146 if (!btrfs_bioset) 146 if (!btrfs_bioset)
147 goto free_buffer_cache; 147 goto free_buffer_cache;
148
149 if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
150 goto free_bioset;
151
148 return 0; 152 return 0;
149 153
154free_bioset:
155 bioset_free(btrfs_bioset);
156 btrfs_bioset = NULL;
157
150free_buffer_cache: 158free_buffer_cache:
151 kmem_cache_destroy(extent_buffer_cache); 159 kmem_cache_destroy(extent_buffer_cache);
152 extent_buffer_cache = NULL; 160 extent_buffer_cache = NULL;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d12107e90987..72da4df53c9a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1718,7 +1718,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1718 */ 1718 */
1719 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1719 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1720 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1720 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1721 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1721 if (num_written > 0) {
1722 err = generic_write_sync(file, pos, num_written); 1722 err = generic_write_sync(file, pos, num_written);
1723 if (err < 0 && num_written > 0) 1723 if (err < 0 && num_written > 0)
1724 num_written = err; 1724 num_written = err;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4f419bafd071..b4f9904c4c6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
221 struct btrfs_path *path, 221 struct btrfs_path *path,
222 struct inode *inode) 222 struct inode *inode)
223{ 223{
224 loff_t oldsize;
225 int ret = 0; 224 int ret = 0;
226 225
227 oldsize = i_size_read(inode);
228 btrfs_i_size_write(inode, 0); 226 btrfs_i_size_write(inode, 0);
229 truncate_pagecache(inode, oldsize, 0); 227 truncate_pagecache(inode, 0);
230 228
231 /* 229 /*
232 * We don't need an orphan item because truncating the free space cache 230 * We don't need an orphan item because truncating the free space cache
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b4ffaf0cd52..b0ef7b07b1b3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3106,7 +3106,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3106 found_key.type = BTRFS_INODE_ITEM_KEY; 3106 found_key.type = BTRFS_INODE_ITEM_KEY;
3107 found_key.offset = 0; 3107 found_key.offset = 0;
3108 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 3108 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3109 ret = PTR_RET(inode); 3109 ret = PTR_ERR_OR_ZERO(inode);
3110 if (ret && ret != -ESTALE) 3110 if (ret && ret != -ESTALE)
3111 goto out; 3111 goto out;
3112 3112
@@ -4349,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4349 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); 4349 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
4350 4350
4351 if (newsize > oldsize) { 4351 if (newsize > oldsize) {
4352 truncate_pagecache(inode, oldsize, newsize); 4352 truncate_pagecache(inode, newsize);
4353 ret = btrfs_cont_expand(inode, oldsize, newsize); 4353 ret = btrfs_cont_expand(inode, oldsize, newsize);
4354 if (ret) 4354 if (ret)
4355 return ret; 4355 return ret;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b4b15467426b..e46e0ed74925 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -220,7 +220,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
220 len = PAGE_ALIGN(len); 220 len = PAGE_ALIGN(len);
221 221
222 if (p->buf == p->inline_buf) { 222 if (p->buf == p->inline_buf) {
223 tmp_buf = kmalloc(len, GFP_NOFS); 223 tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);
224 if (!tmp_buf) { 224 if (!tmp_buf) {
225 tmp_buf = vmalloc(len); 225 tmp_buf = vmalloc(len);
226 if (!tmp_buf) 226 if (!tmp_buf)
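
__GFP_NOWARN is added above because a vmalloc() fallback follows, so a
failed kmalloc() is not worth a warning splat. The general fallback pattern,
sketched with illustrative helpers:

	#include <linux/mm.h>
	#include <linux/slab.h>
	#include <linux/vmalloc.h>

	static void *alloc_buf(size_t len)
	{
		/* suppress the warning; vmalloc() is tried next */
		void *buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);

		if (!buf)
			buf = vmalloc(len);
		return buf;
	}

	static void free_buf(void *buf)
	{
		if (is_vmalloc_addr(buf))
			vfree(buf);
		else
			kfree(buf);
	}
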
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b0203b1322ac..043b215769c2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3329,7 +3329,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3329 } 3329 }
3330 3330
3331 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3331 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3332 return PTR_RET(tsk); 3332 return PTR_ERR_OR_ZERO(tsk);
3333} 3333}
3334 3334
3335int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3335int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index d4c1206af9fc..43eb5592cdea 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -378,6 +378,31 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
378} 378}
379 379
380/* 380/*
 381 * check whether the backing cache is consistent with FS-Cache
 382 * - called by FS-Cache when it evaluates whether the cache needs invalidating
383 */
384static bool cachefiles_check_consistency(struct fscache_operation *op)
385{
386 struct cachefiles_object *object;
387 struct cachefiles_cache *cache;
388 const struct cred *saved_cred;
389 int ret;
390
391 _enter("{OBJ%x}", op->object->debug_id);
392
393 object = container_of(op->object, struct cachefiles_object, fscache);
394 cache = container_of(object->fscache.cache,
395 struct cachefiles_cache, cache);
396
397 cachefiles_begin_secure(cache, &saved_cred);
398 ret = cachefiles_check_auxdata(object);
399 cachefiles_end_secure(cache, saved_cred);
400
401 _leave(" = %d", ret);
402 return ret;
403}
404
405/*
381 * notification the attributes on an object have changed 406 * notification the attributes on an object have changed
382 * - called with reads/writes excluded by FS-Cache 407 * - called with reads/writes excluded by FS-Cache
383 */ 408 */
@@ -522,4 +547,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
522 .write_page = cachefiles_write_page, 547 .write_page = cachefiles_write_page,
523 .uncache_page = cachefiles_uncache_page, 548 .uncache_page = cachefiles_uncache_page,
524 .dissociate_pages = cachefiles_dissociate_pages, 549 .dissociate_pages = cachefiles_dissociate_pages,
550 .check_consistency = cachefiles_check_consistency,
525}; 551};
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 49382519907a..5349473df1b1 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -235,6 +235,7 @@ extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
235 struct cachefiles_xattr *auxdata); 235 struct cachefiles_xattr *auxdata);
236extern int cachefiles_update_object_xattr(struct cachefiles_object *object, 236extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
237 struct cachefiles_xattr *auxdata); 237 struct cachefiles_xattr *auxdata);
238extern int cachefiles_check_auxdata(struct cachefiles_object *object);
238extern int cachefiles_check_object_xattr(struct cachefiles_object *object, 239extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
239 struct cachefiles_xattr *auxdata); 240 struct cachefiles_xattr *auxdata);
240extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, 241extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 25badd1aec5c..f4a08d7fa2f7 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -56,7 +56,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
56 object->fscache.cookie->parent, 56 object->fscache.cookie->parent,
57 object->fscache.cookie->netfs_data, 57 object->fscache.cookie->netfs_data,
58 object->fscache.cookie->flags); 58 object->fscache.cookie->flags);
59 if (keybuf) 59 if (keybuf && cookie->def)
60 keylen = cookie->def->get_key(cookie->netfs_data, keybuf, 60 keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
61 CACHEFILES_KEYBUF_SIZE); 61 CACHEFILES_KEYBUF_SIZE);
62 else 62 else
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 2476e5162609..12b0eef84183 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -157,6 +157,43 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
157} 157}
158 158
159/* 159/*
160 * check the consistency between the backing cache and the FS-Cache cookie
161 */
162int cachefiles_check_auxdata(struct cachefiles_object *object)
163{
164 struct cachefiles_xattr *auxbuf;
165 enum fscache_checkaux validity;
166 struct dentry *dentry = object->dentry;
167 ssize_t xlen;
168 int ret;
169
170 ASSERT(dentry);
171 ASSERT(dentry->d_inode);
172 ASSERT(object->fscache.cookie->def->check_aux);
173
174 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
175 if (!auxbuf)
176 return -ENOMEM;
177
178 xlen = vfs_getxattr(dentry, cachefiles_xattr_cache,
179 &auxbuf->type, 512 + 1);
180 ret = -ESTALE;
181 if (xlen < 1 ||
182 auxbuf->type != object->fscache.cookie->def->type)
183 goto error;
184
185 xlen--;
186 validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen);
187 if (validity != FSCACHE_CHECKAUX_OKAY)
188 goto error;
189
190 ret = 0;
191error:
192 kfree(auxbuf);
193 return ret;
194}
195
196/*
160 * check the state xattr on a cache file 197 * check the state xattr on a cache file
161 * - return -ESTALE if the object should be deleted 198 * - return -ESTALE if the object should be deleted
162 */ 199 */
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 49bc78243db9..ac9a2ef5bb9b 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -16,3 +16,12 @@ config CEPH_FS
16 16
17 If unsure, say N. 17 If unsure, say N.
18 18
19if CEPH_FS
20config CEPH_FSCACHE
21 bool "Enable Ceph client caching support"
22 depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
23 help
24 Choose Y here to enable persistent, read-only local
25 caching support for Ceph clients using FS-Cache
26
27endif
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index bd352125e829..32e30106a2f0 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -9,3 +9,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
9 mds_client.o mdsmap.o strings.o ceph_frag.o \ 9 mds_client.o mdsmap.o strings.o ceph_frag.o \
10 debugfs.o 10 debugfs.o
11 11
12ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5318a3b704f6..6df8bd481425 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -11,6 +11,7 @@
11 11
12#include "super.h" 12#include "super.h"
13#include "mds_client.h" 13#include "mds_client.h"
14#include "cache.h"
14#include <linux/ceph/osd_client.h> 15#include <linux/ceph/osd_client.h>
15 16
16/* 17/*
@@ -70,15 +71,16 @@ static int ceph_set_page_dirty(struct page *page)
70 struct address_space *mapping = page->mapping; 71 struct address_space *mapping = page->mapping;
71 struct inode *inode; 72 struct inode *inode;
72 struct ceph_inode_info *ci; 73 struct ceph_inode_info *ci;
73 int undo = 0;
74 struct ceph_snap_context *snapc; 74 struct ceph_snap_context *snapc;
75 int ret;
75 76
76 if (unlikely(!mapping)) 77 if (unlikely(!mapping))
77 return !TestSetPageDirty(page); 78 return !TestSetPageDirty(page);
78 79
79 if (TestSetPageDirty(page)) { 80 if (PageDirty(page)) {
80 dout("%p set_page_dirty %p idx %lu -- already dirty\n", 81 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
81 mapping->host, page, page->index); 82 mapping->host, page, page->index);
83 BUG_ON(!PagePrivate(page));
82 return 0; 84 return 0;
83 } 85 }
84 86
@@ -107,35 +109,19 @@ static int ceph_set_page_dirty(struct page *page)
107 snapc, snapc->seq, snapc->num_snaps); 109 snapc, snapc->seq, snapc->num_snaps);
108 spin_unlock(&ci->i_ceph_lock); 110 spin_unlock(&ci->i_ceph_lock);
109 111
110 /* now adjust page */ 112 /*
111 spin_lock_irq(&mapping->tree_lock); 113 * Reference snap context in page->private. Also set
112 if (page->mapping) { /* Race with truncate? */ 114 * PagePrivate so that we get invalidatepage callback.
113 WARN_ON_ONCE(!PageUptodate(page)); 115 */
114 account_page_dirtied(page, page->mapping); 116 BUG_ON(PagePrivate(page));
115 radix_tree_tag_set(&mapping->page_tree, 117 page->private = (unsigned long)snapc;
116 page_index(page), PAGECACHE_TAG_DIRTY); 118 SetPagePrivate(page);
117
118 /*
119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback.
121 */
122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page);
124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1;
127 }
128
129 spin_unlock_irq(&mapping->tree_lock);
130
131 if (undo)
132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134 119
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 120 ret = __set_page_dirty_nobuffers(page);
121 WARN_ON(!PageLocked(page));
122 WARN_ON(!page->mapping);
136 123
137 BUG_ON(!PageDirty(page)); 124 return ret;
138 return 1;
139} 125}
140 126
141/* 127/*
@@ -150,11 +136,19 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
150 struct ceph_inode_info *ci; 136 struct ceph_inode_info *ci;
151 struct ceph_snap_context *snapc = page_snap_context(page); 137 struct ceph_snap_context *snapc = page_snap_context(page);
152 138
153 BUG_ON(!PageLocked(page));
154 BUG_ON(!PagePrivate(page));
155 BUG_ON(!page->mapping);
156
157 inode = page->mapping->host; 139 inode = page->mapping->host;
140 ci = ceph_inode(inode);
141
142 if (offset != 0 || length != PAGE_CACHE_SIZE) {
143 dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
144 inode, page, page->index, offset, length);
145 return;
146 }
147
148 ceph_invalidate_fscache_page(inode, page);
149
150 if (!PagePrivate(page))
151 return;
158 152
159 /* 153 /*
160 * We can get non-dirty pages here due to races between 154 * We can get non-dirty pages here due to races between
@@ -164,31 +158,28 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
164 if (!PageDirty(page)) 158 if (!PageDirty(page))
165 pr_err("%p invalidatepage %p page not dirty\n", inode, page); 159 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
166 160
167 if (offset == 0 && length == PAGE_CACHE_SIZE) 161 ClearPageChecked(page);
168 ClearPageChecked(page);
169 162
170 ci = ceph_inode(inode); 163 dout("%p invalidatepage %p idx %lu full dirty page\n",
171 if (offset == 0 && length == PAGE_CACHE_SIZE) { 164 inode, page, page->index);
172 dout("%p invalidatepage %p idx %lu full dirty page\n", 165
173 inode, page, page->index); 166 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
174 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 167 ceph_put_snap_context(snapc);
175 ceph_put_snap_context(snapc); 168 page->private = 0;
176 page->private = 0; 169 ClearPagePrivate(page);
177 ClearPagePrivate(page);
178 } else {
179 dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
180 inode, page, page->index, offset, length);
181 }
182} 170}
183 171
184/* just a sanity check */
185static int ceph_releasepage(struct page *page, gfp_t g) 172static int ceph_releasepage(struct page *page, gfp_t g)
186{ 173{
187 struct inode *inode = page->mapping ? page->mapping->host : NULL; 174 struct inode *inode = page->mapping ? page->mapping->host : NULL;
188 dout("%p releasepage %p idx %lu\n", inode, page, page->index); 175 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
189 WARN_ON(PageDirty(page)); 176 WARN_ON(PageDirty(page));
190 WARN_ON(PagePrivate(page)); 177
191 return 0; 178 /* Can we release the page from the cache? */
179 if (!ceph_release_fscache_page(page, g))
180 return 0;
181
182 return !PagePrivate(page);
192} 183}
193 184
194/* 185/*
@@ -198,11 +189,16 @@ static int readpage_nounlock(struct file *filp, struct page *page)
198{ 189{
199 struct inode *inode = file_inode(filp); 190 struct inode *inode = file_inode(filp);
200 struct ceph_inode_info *ci = ceph_inode(inode); 191 struct ceph_inode_info *ci = ceph_inode(inode);
201 struct ceph_osd_client *osdc = 192 struct ceph_osd_client *osdc =
202 &ceph_inode_to_client(inode)->client->osdc; 193 &ceph_inode_to_client(inode)->client->osdc;
203 int err = 0; 194 int err = 0;
204 u64 len = PAGE_CACHE_SIZE; 195 u64 len = PAGE_CACHE_SIZE;
205 196
197 err = ceph_readpage_from_fscache(inode, page);
198
199 if (err == 0)
200 goto out;
201
206 dout("readpage inode %p file %p page %p index %lu\n", 202 dout("readpage inode %p file %p page %p index %lu\n",
207 inode, filp, page, page->index); 203 inode, filp, page, page->index);
208 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
@@ -220,6 +216,9 @@ static int readpage_nounlock(struct file *filp, struct page *page)
220 } 216 }
221 SetPageUptodate(page); 217 SetPageUptodate(page);
222 218
219 if (err == 0)
220 ceph_readpage_to_fscache(inode, page);
221
223out: 222out:
224 return err < 0 ? err : 0; 223 return err < 0 ? err : 0;
225} 224}
@@ -262,6 +261,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
262 page->index); 261 page->index);
263 flush_dcache_page(page); 262 flush_dcache_page(page);
264 SetPageUptodate(page); 263 SetPageUptodate(page);
264 ceph_readpage_to_fscache(inode, page);
265 unlock_page(page); 265 unlock_page(page);
266 page_cache_release(page); 266 page_cache_release(page);
267 bytes -= PAGE_CACHE_SIZE; 267 bytes -= PAGE_CACHE_SIZE;
@@ -331,11 +331,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
331 page = list_entry(page_list->prev, struct page, lru); 331 page = list_entry(page_list->prev, struct page, lru);
332 BUG_ON(PageLocked(page)); 332 BUG_ON(PageLocked(page));
333 list_del(&page->lru); 333 list_del(&page->lru);
334 334
335 dout("start_read %p adding %p idx %lu\n", inode, page, 335 dout("start_read %p adding %p idx %lu\n", inode, page,
336 page->index); 336 page->index);
337 if (add_to_page_cache_lru(page, &inode->i_data, page->index, 337 if (add_to_page_cache_lru(page, &inode->i_data, page->index,
338 GFP_NOFS)) { 338 GFP_NOFS)) {
339 ceph_fscache_uncache_page(inode, page);
339 page_cache_release(page); 340 page_cache_release(page);
340 dout("start_read %p add_to_page_cache failed %p\n", 341 dout("start_read %p add_to_page_cache failed %p\n",
341 inode, page); 342 inode, page);
@@ -378,6 +379,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
378 int rc = 0; 379 int rc = 0;
379 int max = 0; 380 int max = 0;
380 381
382 rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
383 &nr_pages);
384
385 if (rc == 0)
386 goto out;
387
381 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) 388 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
382 max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) 389 max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
383 >> PAGE_SHIFT; 390 >> PAGE_SHIFT;
@@ -392,6 +399,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
392 BUG_ON(rc == 0); 399 BUG_ON(rc == 0);
393 } 400 }
394out: 401out:
402 ceph_fscache_readpages_cancel(inode, page_list);
403
395 dout("readpages %p file %p ret %d\n", inode, file, rc); 404 dout("readpages %p file %p ret %d\n", inode, file, rc);
396 return rc; 405 return rc;
397} 406}
@@ -497,6 +506,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
497 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) 506 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
498 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); 507 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
499 508
509 ceph_readpage_to_fscache(inode, page);
510
500 set_page_writeback(page); 511 set_page_writeback(page);
501 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 512 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
502 &ci->i_layout, snapc, 513 &ci->i_layout, snapc,
@@ -552,7 +563,6 @@ static void ceph_release_pages(struct page **pages, int num)
552 pagevec_release(&pvec); 563 pagevec_release(&pvec);
553} 564}
554 565
555
556/* 566/*
557 * async writeback completion handler. 567 * async writeback completion handler.
558 * 568 *
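
Taken together, the addr.c hunks give Ceph a read-through cache: try
FS-Cache first, fall back to the OSDs on a miss, then push the fresh page
back into the cache. The flow, sketched with the patch's real helpers and a
hypothetical read_from_osds() stand-in for the OSD read:

	static int read_from_osds(struct inode *inode, struct page *page);

	static int readpage_flow(struct inode *inode, struct page *page)
	{
		int err = ceph_readpage_from_fscache(inode, page);

		if (err == 0)		/* read submitted from the cache */
			return 0;

		err = read_from_osds(inode, page);
		if (err == 0)
			ceph_readpage_to_fscache(inode, page);	/* populate */
		return err;
	}
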
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
new file mode 100644
index 000000000000..6bfe65e0b038
--- /dev/null
+++ b/fs/ceph/cache.c
@@ -0,0 +1,398 @@
1/*
2 * Ceph cache definitions.
3 *
4 * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
5 * Written by Milosz Tanski (milosz@adfin.com)
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2
9 * as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to:
18 * Free Software Foundation
19 * 51 Franklin Street, Fifth Floor
20 * Boston, MA 02111-1301 USA
21 *
22 */
23
24#include "super.h"
25#include "cache.h"
26
27struct ceph_aux_inode {
28 struct timespec mtime;
29 loff_t size;
30};
31
32struct fscache_netfs ceph_cache_netfs = {
33 .name = "ceph",
34 .version = 0,
35};
36
37static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
38 void *buffer, uint16_t maxbuf)
39{
40 const struct ceph_fs_client* fsc = cookie_netfs_data;
41 uint16_t klen;
42
43 klen = sizeof(fsc->client->fsid);
44 if (klen > maxbuf)
45 return 0;
46
47 memcpy(buffer, &fsc->client->fsid, klen);
48 return klen;
49}
50
51static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
52 .name = "CEPH.fsid",
53 .type = FSCACHE_COOKIE_TYPE_INDEX,
54 .get_key = ceph_fscache_session_get_key,
55};
56
57int ceph_fscache_register(void)
58{
59 return fscache_register_netfs(&ceph_cache_netfs);
60}
61
62void ceph_fscache_unregister(void)
63{
64 fscache_unregister_netfs(&ceph_cache_netfs);
65}
66
67int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
68{
69 fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
70 &ceph_fscache_fsid_object_def,
71 fsc);
72
73 if (fsc->fscache == NULL) {
 74 pr_err("Unable to register fsid: %p fscache cookie", fsc);
75 return 0;
76 }
77
78 fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
79 if (fsc->revalidate_wq == NULL)
80 return -ENOMEM;
81
82 return 0;
83}
84
85static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
86 void *buffer, uint16_t maxbuf)
87{
88 const struct ceph_inode_info* ci = cookie_netfs_data;
89 uint16_t klen;
90
 91 /* use ceph virtual inode (id + snapshot) */
92 klen = sizeof(ci->i_vino);
93 if (klen > maxbuf)
94 return 0;
95
96 memcpy(buffer, &ci->i_vino, klen);
97 return klen;
98}
99
100static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
101 void *buffer, uint16_t bufmax)
102{
103 struct ceph_aux_inode aux;
104 const struct ceph_inode_info* ci = cookie_netfs_data;
105 const struct inode* inode = &ci->vfs_inode;
106
107 memset(&aux, 0, sizeof(aux));
108 aux.mtime = inode->i_mtime;
109 aux.size = inode->i_size;
110
111 memcpy(buffer, &aux, sizeof(aux));
112
113 return sizeof(aux);
114}
115
116static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
117 uint64_t *size)
118{
119 const struct ceph_inode_info* ci = cookie_netfs_data;
120 const struct inode* inode = &ci->vfs_inode;
121
122 *size = inode->i_size;
123}
124
125static enum fscache_checkaux ceph_fscache_inode_check_aux(
126 void *cookie_netfs_data, const void *data, uint16_t dlen)
127{
128 struct ceph_aux_inode aux;
129 struct ceph_inode_info* ci = cookie_netfs_data;
130 struct inode* inode = &ci->vfs_inode;
131
132 if (dlen != sizeof(aux))
133 return FSCACHE_CHECKAUX_OBSOLETE;
134
135 memset(&aux, 0, sizeof(aux));
136 aux.mtime = inode->i_mtime;
137 aux.size = inode->i_size;
138
139 if (memcmp(data, &aux, sizeof(aux)) != 0)
140 return FSCACHE_CHECKAUX_OBSOLETE;
141
142 dout("ceph inode 0x%p cached okay", ci);
143 return FSCACHE_CHECKAUX_OKAY;
144}
145
146static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
147{
148 struct ceph_inode_info* ci = cookie_netfs_data;
149 struct pagevec pvec;
150 pgoff_t first;
151 int loop, nr_pages;
152
153 pagevec_init(&pvec, 0);
154 first = 0;
155
156 dout("ceph inode 0x%p now uncached", ci);
157
158 while (1) {
159 nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
160 PAGEVEC_SIZE - pagevec_count(&pvec));
161
162 if (!nr_pages)
163 break;
164
165 for (loop = 0; loop < nr_pages; loop++)
166 ClearPageFsCache(pvec.pages[loop]);
167
168 first = pvec.pages[nr_pages - 1]->index + 1;
169
170 pvec.nr = nr_pages;
171 pagevec_release(&pvec);
172 cond_resched();
173 }
174}
175
176static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
177 .name = "CEPH.inode",
178 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
179 .get_key = ceph_fscache_inode_get_key,
180 .get_attr = ceph_fscache_inode_get_attr,
181 .get_aux = ceph_fscache_inode_get_aux,
182 .check_aux = ceph_fscache_inode_check_aux,
183 .now_uncached = ceph_fscache_inode_now_uncached,
184};
185
186void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
187 struct ceph_inode_info* ci)
188{
189 struct inode* inode = &ci->vfs_inode;
190
191 /* No caching for filesystem */
192 if (fsc->fscache == NULL)
193 return;
194
195 /* Only cache for regular files that are read only */
196 if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
197 return;
198
199 /* Avoid multiple racing open requests */
200 mutex_lock(&inode->i_mutex);
201
202 if (ci->fscache)
203 goto done;
204
205 ci->fscache = fscache_acquire_cookie(fsc->fscache,
206 &ceph_fscache_inode_object_def,
207 ci);
208done:
209 mutex_unlock(&inode->i_mutex);
210
211}
212
213void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
214{
215 struct fscache_cookie* cookie;
216
217 if ((cookie = ci->fscache) == NULL)
218 return;
219
220 ci->fscache = NULL;
221
222 fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
223 fscache_relinquish_cookie(cookie, 0);
224}
225
226static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
227{
228 if (!error)
229 SetPageUptodate(page);
230}
231
232static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
233{
234 if (!error)
235 SetPageUptodate(page);
236
237 unlock_page(page);
238}
239
240static inline int cache_valid(struct ceph_inode_info *ci)
241{
242 return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
243 (ci->i_fscache_gen == ci->i_rdcache_gen));
244}
245
246
 247/* Attempt to read from the fscache.
248 *
249 * This function is called from the readpage_nounlock context. DO NOT attempt to
250 * unlock the page here (or in the callback).
251 */
252int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
253{
254 struct ceph_inode_info *ci = ceph_inode(inode);
255 int ret;
256
257 if (!cache_valid(ci))
258 return -ENOBUFS;
259
260 ret = fscache_read_or_alloc_page(ci->fscache, page,
261 ceph_vfs_readpage_complete, NULL,
262 GFP_KERNEL);
263
264 switch (ret) {
265 case 0: /* Page found */
266 dout("page read submitted\n");
267 return 0;
268 case -ENOBUFS: /* Pages were not found, and can't be */
269 case -ENODATA: /* Pages were not found */
270 dout("page/inode not in cache\n");
271 return ret;
272 default:
273 dout("%s: unknown error ret = %i\n", __func__, ret);
274 return ret;
275 }
276}
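A hypothetical caller sketch may help pin down the contract: on 0 the cache read was submitted and ceph_vfs_readpage_complete() will mark the page up to date, while -ENODATA/-ENOBUFS tell the caller to fall back to a network read. Here read_from_osds() is a made-up stand-in for that fallback, not a real function:

/* Hypothetical caller; read_from_osds() is a stand-in, not real code. */
static int readpage_sketch(struct inode *inode, struct page *page)
{
	int err = ceph_readpage_from_fscache(inode, page);

	if (err == 0)
		return 0;	/* async cache read in flight */
	return read_from_osds(inode, page);	/* cache miss: go to the OSDs */
}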
277
278int ceph_readpages_from_fscache(struct inode *inode,
279 struct address_space *mapping,
280 struct list_head *pages,
281 unsigned *nr_pages)
282{
283 struct ceph_inode_info *ci = ceph_inode(inode);
284 int ret;
285
286 if (!cache_valid(ci))
287 return -ENOBUFS;
288
289 ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
290 ceph_vfs_readpage_complete_unlock,
291 NULL, mapping_gfp_mask(mapping));
292
293 switch (ret) {
294 case 0: /* All pages found */
295 dout("all-page read submitted\n");
296 return 0;
297 case -ENOBUFS: /* Some pages were not found, and can't be */
298	case -ENODATA: /* Some pages were not found */
299 dout("page/inode not in cache\n");
300 return ret;
301 default:
302 dout("%s: unknown error ret = %i\n", __func__, ret);
303 return ret;
304 }
305}
306
307void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
308{
309 struct ceph_inode_info *ci = ceph_inode(inode);
310 int ret;
311
312 if (!PageFsCache(page))
313 return;
314
315 if (!cache_valid(ci))
316 return;
317
318 ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
319 if (ret)
320 fscache_uncache_page(ci->fscache, page);
321}
322
323void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
324{
325 struct ceph_inode_info *ci = ceph_inode(inode);
326
327 fscache_wait_on_page_write(ci->fscache, page);
328 fscache_uncache_page(ci->fscache, page);
329}
330
331void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
332{
333 if (fsc->revalidate_wq)
334 destroy_workqueue(fsc->revalidate_wq);
335
336 fscache_relinquish_cookie(fsc->fscache, 0);
337 fsc->fscache = NULL;
338}
339
340static void ceph_revalidate_work(struct work_struct *work)
341{
342 int issued;
343 u32 orig_gen;
344 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
345 i_revalidate_work);
346 struct inode *inode = &ci->vfs_inode;
347
348 spin_lock(&ci->i_ceph_lock);
349 issued = __ceph_caps_issued(ci, NULL);
350 orig_gen = ci->i_rdcache_gen;
351 spin_unlock(&ci->i_ceph_lock);
352
353 if (!(issued & CEPH_CAP_FILE_CACHE)) {
354 dout("revalidate_work lost cache before validation %p\n",
355 inode);
356 goto out;
357 }
358
359 if (!fscache_check_consistency(ci->fscache))
360 fscache_invalidate(ci->fscache);
361
362 spin_lock(&ci->i_ceph_lock);
363	/* Update to the new valid generation (and sanity check it never goes backwards) */
364 if (orig_gen > ci->i_fscache_gen) {
365 ci->i_fscache_gen = orig_gen;
366 }
367 spin_unlock(&ci->i_ceph_lock);
368
369out:
370 iput(&ci->vfs_inode);
371}
372
373void ceph_queue_revalidate(struct inode *inode)
374{
375 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
376 struct ceph_inode_info *ci = ceph_inode(inode);
377
378 if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
379 return;
380
381 ihold(inode);
382
383 if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
384 &ci->i_revalidate_work)) {
385 dout("ceph_queue_revalidate %p\n", inode);
386 } else {
387 dout("ceph_queue_revalidate %p failed\n)", inode);
388 iput(inode);
389 }
390}
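The ihold()/iput() pairing above is the usual queue-and-hold idiom: pin the inode before queueing, drop the reference immediately if queue_work() reports the item was already pending, and otherwise let the work function drop it (ceph_revalidate_work() above ends with iput()). A minimal sketch with hypothetical names:

/* Hypothetical illustration of the queue-and-hold idiom used above. */
static void sketch_queue(struct workqueue_struct *wq, struct inode *inode,
			 struct work_struct *work)
{
	ihold(inode);		/* pin the inode for the worker */
	if (!queue_work(wq, work))
		iput(inode);	/* already queued: drop our extra ref */
	/* on success, the work function is responsible for the iput() */
}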
391
392void ceph_fscache_inode_init(struct ceph_inode_info *ci)
393{
394 ci->fscache = NULL;
395	/* The first load is verified at cookie open time */
396 ci->i_fscache_gen = 1;
397 INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
398}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
new file mode 100644
index 000000000000..ba949408a336
--- /dev/null
+++ b/fs/ceph/cache.h
@@ -0,0 +1,159 @@
1/*
2 * Ceph cache definitions.
3 *
4 * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
5 * Written by Milosz Tanski (milosz@adfin.com)
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2
9 * as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to:
18 * Free Software Foundation
19 * 51 Franklin Street, Fifth Floor
20 * Boston, MA 02111-1301 USA
21 *
22 */
23
24#ifndef _CEPH_CACHE_H
25#define _CEPH_CACHE_H
26
27#ifdef CONFIG_CEPH_FSCACHE
28
29extern struct fscache_netfs ceph_cache_netfs;
30
31int ceph_fscache_register(void);
32void ceph_fscache_unregister(void);
33
34int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
35void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
36
37void ceph_fscache_inode_init(struct ceph_inode_info *ci);
38void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
39 struct ceph_inode_info* ci);
40void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
41
42int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
43int ceph_readpages_from_fscache(struct inode *inode,
44 struct address_space *mapping,
45 struct list_head *pages,
46 unsigned *nr_pages);
47void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
48void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
49void ceph_queue_revalidate(struct inode *inode);
50
51static inline void ceph_fscache_invalidate(struct inode *inode)
52{
53 fscache_invalidate(ceph_inode(inode)->fscache);
54}
55
56static inline void ceph_fscache_uncache_page(struct inode *inode,
57 struct page *page)
58{
59 struct ceph_inode_info *ci = ceph_inode(inode);
60 return fscache_uncache_page(ci->fscache, page);
61}
62
63static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
64{
65 struct inode* inode = page->mapping->host;
66 struct ceph_inode_info *ci = ceph_inode(inode);
67 return fscache_maybe_release_page(ci->fscache, page, gfp);
68}
69
70static inline void ceph_fscache_readpages_cancel(struct inode *inode,
71 struct list_head *pages)
72{
73 struct ceph_inode_info *ci = ceph_inode(inode);
74 return fscache_readpages_cancel(ci->fscache, pages);
75}
76
77#else
78
79static inline int ceph_fscache_register(void)
80{
81 return 0;
82}
83
84static inline void ceph_fscache_unregister(void)
85{
86}
87
88static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
89{
90 return 0;
91}
92
93static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
94{
95}
96
97static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
98{
99}
100
101static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
102 struct ceph_inode_info* ci)
103{
104}
105
106static inline void ceph_fscache_uncache_page(struct inode *inode,
107 struct page *pages)
108{
109}
110
111static inline int ceph_readpage_from_fscache(struct inode* inode,
112 struct page *page)
113{
114 return -ENOBUFS;
115}
116
117static inline int ceph_readpages_from_fscache(struct inode *inode,
118 struct address_space *mapping,
119 struct list_head *pages,
120 unsigned *nr_pages)
121{
122 return -ENOBUFS;
123}
124
125static inline void ceph_readpage_to_fscache(struct inode *inode,
126 struct page *page)
127{
128}
129
130static inline void ceph_fscache_invalidate(struct inode *inode)
131{
132}
133
134static inline void ceph_invalidate_fscache_page(struct inode *inode,
135 struct page *page)
136{
137}
138
139static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
140{
141}
142
143static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
144{
145 return 1;
146}
147
148static inline void ceph_fscache_readpages_cancel(struct inode *inode,
149 struct list_head *pages)
150{
151}
152
153static inline void ceph_queue_revalidate(struct inode *inode)
154{
155}
156
157#endif
158
159#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 25442b40c25a..13976c33332e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -10,6 +10,7 @@
10 10
11#include "super.h" 11#include "super.h"
12#include "mds_client.h" 12#include "mds_client.h"
13#include "cache.h"
13#include <linux/ceph/decode.h> 14#include <linux/ceph/decode.h>
14#include <linux/ceph/messenger.h> 15#include <linux/ceph/messenger.h>
15 16
@@ -479,8 +480,9 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
479 * i_rdcache_gen. 480 * i_rdcache_gen.
480 */ 481 */
481 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 482 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
482 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) 483 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
483 ci->i_rdcache_gen++; 484 ci->i_rdcache_gen++;
485 }
484 486
485 /* 487 /*
486 * if we are newly issued FILE_SHARED, mark dir not complete; we 488 * if we are newly issued FILE_SHARED, mark dir not complete; we
@@ -2072,19 +2074,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2072 /* finish pending truncate */ 2074 /* finish pending truncate */
2073 while (ci->i_truncate_pending) { 2075 while (ci->i_truncate_pending) {
2074 spin_unlock(&ci->i_ceph_lock); 2076 spin_unlock(&ci->i_ceph_lock);
2075 if (!(need & CEPH_CAP_FILE_WR))
2076 mutex_lock(&inode->i_mutex);
2077 __ceph_do_pending_vmtruncate(inode); 2077 __ceph_do_pending_vmtruncate(inode);
2078 if (!(need & CEPH_CAP_FILE_WR))
2079 mutex_unlock(&inode->i_mutex);
2080 spin_lock(&ci->i_ceph_lock); 2078 spin_lock(&ci->i_ceph_lock);
2081 } 2079 }
2082 2080
2083 if (need & CEPH_CAP_FILE_WR) { 2081 have = __ceph_caps_issued(ci, &implemented);
2082
2083 if (have & need & CEPH_CAP_FILE_WR) {
2084 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { 2084 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
2085 dout("get_cap_refs %p endoff %llu > maxsize %llu\n", 2085 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
2086 inode, endoff, ci->i_max_size); 2086 inode, endoff, ci->i_max_size);
2087 if (endoff > ci->i_wanted_max_size) { 2087 if (endoff > ci->i_requested_max_size) {
2088 *check_max = 1; 2088 *check_max = 1;
2089 ret = 1; 2089 ret = 1;
2090 } 2090 }
@@ -2099,7 +2099,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2099 goto out; 2099 goto out;
2100 } 2100 }
2101 } 2101 }
2102 have = __ceph_caps_issued(ci, &implemented);
2103 2102
2104 if ((have & need) == need) { 2103 if ((have & need) == need) {
2105 /* 2104 /*
@@ -2141,14 +2140,17 @@ static void check_max_size(struct inode *inode, loff_t endoff)
2141 2140
2142 /* do we need to explicitly request a larger max_size? */ 2141 /* do we need to explicitly request a larger max_size? */
2143 spin_lock(&ci->i_ceph_lock); 2142 spin_lock(&ci->i_ceph_lock);
2144 if ((endoff >= ci->i_max_size || 2143 if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
2145 endoff > (inode->i_size << 1)) &&
2146 endoff > ci->i_wanted_max_size) {
2147 dout("write %p at large endoff %llu, req max_size\n", 2144 dout("write %p at large endoff %llu, req max_size\n",
2148 inode, endoff); 2145 inode, endoff);
2149 ci->i_wanted_max_size = endoff; 2146 ci->i_wanted_max_size = endoff;
2150 check = 1;
2151 } 2147 }
2148 /* duplicate ceph_check_caps()'s logic */
2149 if (ci->i_auth_cap &&
2150 (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
2151 ci->i_wanted_max_size > ci->i_max_size &&
2152 ci->i_wanted_max_size > ci->i_requested_max_size)
2153 check = 1;
2152 spin_unlock(&ci->i_ceph_lock); 2154 spin_unlock(&ci->i_ceph_lock);
2153 if (check) 2155 if (check)
2154 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2156 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
@@ -2334,6 +2336,38 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2334} 2336}
2335 2337
2336/* 2338/*
2339 * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
2340 */
2341static void invalidate_aliases(struct inode *inode)
2342{
2343 struct dentry *dn, *prev = NULL;
2344
2345 dout("invalidate_aliases inode %p\n", inode);
2346 d_prune_aliases(inode);
2347 /*
2348	 * For a non-directory inode, d_find_alias() only returns a
2349	 * connected dentry. After calling d_invalidate(), the
2350	 * dentry becomes disconnected.
2351	 *
2352	 * For a directory inode, d_find_alias() can return a
2353	 * disconnected dentry. But a directory inode should have
2354	 * at most one alias.
2355 */
2356 while ((dn = d_find_alias(inode))) {
2357 if (dn == prev) {
2358 dput(dn);
2359 break;
2360 }
2361 d_invalidate(dn);
2362 if (prev)
2363 dput(prev);
2364 prev = dn;
2365 }
2366 if (prev)
2367 dput(prev);
2368}
2369
2370/*
2337 * Handle a cap GRANT message from the MDS. (Note that a GRANT may 2371 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2338 * actually be a revocation if it specifies a smaller cap set.) 2372 * actually be a revocation if it specifies a smaller cap set.)
2339 * 2373 *
@@ -2361,8 +2395,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2361 int check_caps = 0; 2395 int check_caps = 0;
2362 int wake = 0; 2396 int wake = 0;
2363 int writeback = 0; 2397 int writeback = 0;
2364 int revoked_rdcache = 0;
2365 int queue_invalidate = 0; 2398 int queue_invalidate = 0;
2399 int deleted_inode = 0;
2400 int queue_revalidate = 0;
2366 2401
2367 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2402 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2368 inode, cap, mds, seq, ceph_cap_string(newcaps)); 2403 inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2377,9 +2412,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2377 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && 2412 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2378 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && 2413 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2379 !ci->i_wrbuffer_ref) { 2414 !ci->i_wrbuffer_ref) {
2380 if (try_nonblocking_invalidate(inode) == 0) { 2415 if (try_nonblocking_invalidate(inode)) {
2381 revoked_rdcache = 1;
2382 } else {
2383 /* there were locked pages.. invalidate later 2416 /* there were locked pages.. invalidate later
2384 in a separate thread. */ 2417 in a separate thread. */
2385 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 2418 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
@@ -2387,6 +2420,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2387 ci->i_rdcache_revoking = ci->i_rdcache_gen; 2420 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2388 } 2421 }
2389 } 2422 }
2423
2424 ceph_fscache_invalidate(inode);
2390 } 2425 }
2391 2426
2392 /* side effects now are allowed */ 2427 /* side effects now are allowed */
@@ -2407,8 +2442,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2407 from_kgid(&init_user_ns, inode->i_gid)); 2442 from_kgid(&init_user_ns, inode->i_gid));
2408 } 2443 }
2409 2444
2410 if ((issued & CEPH_CAP_LINK_EXCL) == 0) 2445 if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
2411 set_nlink(inode, le32_to_cpu(grant->nlink)); 2446 set_nlink(inode, le32_to_cpu(grant->nlink));
2447 if (inode->i_nlink == 0 &&
2448 (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
2449 deleted_inode = 1;
2450 }
2412 2451
2413 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { 2452 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2414 int len = le32_to_cpu(grant->xattr_len); 2453 int len = le32_to_cpu(grant->xattr_len);
@@ -2424,6 +2463,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2424 } 2463 }
2425 } 2464 }
2426 2465
2466	/* Do we need to revalidate our fscache cookie? Don't bother on the
2467 * first cache cap as we already validate at cookie creation time. */
2468 if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
2469 queue_revalidate = 1;
2470
2427 /* size/ctime/mtime/atime? */ 2471 /* size/ctime/mtime/atime? */
2428 ceph_fill_file_size(inode, issued, 2472 ceph_fill_file_size(inode, issued,
2429 le32_to_cpu(grant->truncate_seq), 2473 le32_to_cpu(grant->truncate_seq),
@@ -2508,6 +2552,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2508 BUG_ON(cap->issued & ~cap->implemented); 2552 BUG_ON(cap->issued & ~cap->implemented);
2509 2553
2510 spin_unlock(&ci->i_ceph_lock); 2554 spin_unlock(&ci->i_ceph_lock);
2555
2511 if (writeback) 2556 if (writeback)
2512 /* 2557 /*
2513 * queue inode for writeback: we can't actually call 2558 * queue inode for writeback: we can't actually call
@@ -2517,6 +2562,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2517 ceph_queue_writeback(inode); 2562 ceph_queue_writeback(inode);
2518 if (queue_invalidate) 2563 if (queue_invalidate)
2519 ceph_queue_invalidate(inode); 2564 ceph_queue_invalidate(inode);
2565 if (deleted_inode)
2566 invalidate_aliases(inode);
2567 if (queue_revalidate)
2568 ceph_queue_revalidate(inode);
2520 if (wake) 2569 if (wake)
2521 wake_up_all(&ci->i_cap_wq); 2570 wake_up_all(&ci->i_cap_wq);
2522 2571
@@ -2673,8 +2722,10 @@ static void handle_cap_trunc(struct inode *inode,
2673 truncate_seq, truncate_size, size); 2722 truncate_seq, truncate_size, size);
2674 spin_unlock(&ci->i_ceph_lock); 2723 spin_unlock(&ci->i_ceph_lock);
2675 2724
2676 if (queue_trunc) 2725 if (queue_trunc) {
2677 ceph_queue_vmtruncate(inode); 2726 ceph_queue_vmtruncate(inode);
2727 ceph_fscache_invalidate(inode);
2728 }
2678} 2729}
2679 2730
2680/* 2731/*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a40ceda47a32..868b61d56cac 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -793,6 +793,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
793 req->r_locked_dir = dir; 793 req->r_locked_dir = dir;
794 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 794 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
795 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 795 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
796 /* release LINK_SHARED on source inode (mds will lock it) */
797 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
796 err = ceph_mdsc_do_request(mdsc, dir, req); 798 err = ceph_mdsc_do_request(mdsc, dir, req);
797 if (err) { 799 if (err) {
798 d_drop(dentry); 800 d_drop(dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2ddf061c1c4a..3de89829e2a1 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -8,9 +8,11 @@
8#include <linux/namei.h> 8#include <linux/namei.h>
9#include <linux/writeback.h> 9#include <linux/writeback.h>
10#include <linux/aio.h> 10#include <linux/aio.h>
11#include <linux/falloc.h>
11 12
12#include "super.h" 13#include "super.h"
13#include "mds_client.h" 14#include "mds_client.h"
15#include "cache.h"
14 16
15/* 17/*
16 * Ceph file operations 18 * Ceph file operations
@@ -68,9 +70,23 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
68{ 70{
69 struct ceph_file_info *cf; 71 struct ceph_file_info *cf;
70 int ret = 0; 72 int ret = 0;
73 struct ceph_inode_info *ci = ceph_inode(inode);
74 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
75 struct ceph_mds_client *mdsc = fsc->mdsc;
71 76
72 switch (inode->i_mode & S_IFMT) { 77 switch (inode->i_mode & S_IFMT) {
73 case S_IFREG: 78 case S_IFREG:
79		/* First file open request creates the cookie; we want to keep
80		 * this cookie around for the lifetime of the inode so as not to
81		 * have to worry about fscache register / revoke / operation
82		 * races.
83		 *
84		 * Also, if we know the operation is going to invalidate data
85		 * (non-readonly), just nuke the cache right away.
86 */
87 ceph_fscache_register_inode_cookie(mdsc->fsc, ci);
88		if (fmode & CEPH_FILE_MODE_WR)
89 ceph_fscache_invalidate(inode);
74 case S_IFDIR: 90 case S_IFDIR:
75 dout("init_file %p %p 0%o (regular)\n", inode, file, 91 dout("init_file %p %p 0%o (regular)\n", inode, file,
76 inode->i_mode); 92 inode->i_mode);
@@ -181,6 +197,7 @@ int ceph_open(struct inode *inode, struct file *file)
181 spin_unlock(&ci->i_ceph_lock); 197 spin_unlock(&ci->i_ceph_lock);
182 return ceph_init_file(inode, file, fmode); 198 return ceph_init_file(inode, file, fmode);
183 } 199 }
200
184 spin_unlock(&ci->i_ceph_lock); 201 spin_unlock(&ci->i_ceph_lock);
185 202
186 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); 203 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
@@ -191,6 +208,7 @@ int ceph_open(struct inode *inode, struct file *file)
191 } 208 }
192 req->r_inode = inode; 209 req->r_inode = inode;
193 ihold(inode); 210 ihold(inode);
211
194 req->r_num_caps = 1; 212 req->r_num_caps = 1;
195 if (flags & (O_CREAT|O_TRUNC)) 213 if (flags & (O_CREAT|O_TRUNC))
196 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 214 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
@@ -313,9 +331,9 @@ static int striped_read(struct inode *inode,
313{ 331{
314 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 332 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
315 struct ceph_inode_info *ci = ceph_inode(inode); 333 struct ceph_inode_info *ci = ceph_inode(inode);
316 u64 pos, this_len; 334 u64 pos, this_len, left;
317 int io_align, page_align; 335 int io_align, page_align;
318 int left, pages_left; 336 int pages_left;
319 int read; 337 int read;
320 struct page **page_pos; 338 struct page **page_pos;
321 int ret; 339 int ret;
@@ -346,47 +364,40 @@ more:
346 ret = 0; 364 ret = 0;
347 hit_stripe = this_len < left; 365 hit_stripe = this_len < left;
348 was_short = ret >= 0 && ret < this_len; 366 was_short = ret >= 0 && ret < this_len;
349 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, 367 dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
350 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); 368 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
351 369
352 if (ret > 0) { 370 if (ret >= 0) {
353 int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; 371 int didpages;
354 372 if (was_short && (pos + ret < inode->i_size)) {
355 if (read < pos - off) { 373 u64 tmp = min(this_len - ret,
356 dout(" zero gap %llu to %llu\n", off + read, pos); 374 inode->i_size - pos - ret);
357 ceph_zero_page_vector_range(page_align + read, 375 dout(" zero gap %llu to %llu\n",
358 pos - off - read, pages); 376 pos + ret, pos + ret + tmp);
377 ceph_zero_page_vector_range(page_align + read + ret,
378 tmp, pages);
379 ret += tmp;
359 } 380 }
381
382 didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
360 pos += ret; 383 pos += ret;
361 read = pos - off; 384 read = pos - off;
362 left -= ret; 385 left -= ret;
363 page_pos += didpages; 386 page_pos += didpages;
364 pages_left -= didpages; 387 pages_left -= didpages;
365 388
366		/* hit stripe? */				389		/* hit stripe and need to continue */
367 if (left && hit_stripe) 390 if (left && hit_stripe && pos < inode->i_size)
368 goto more; 391 goto more;
369 } 392 }
370 393
371 if (was_short) { 394 if (read > 0) {
395 ret = read;
372 /* did we bounce off eof? */ 396 /* did we bounce off eof? */
373 if (pos + left > inode->i_size) 397 if (pos + left > inode->i_size)
374 *checkeof = 1; 398 *checkeof = 1;
375
376 /* zero trailing bytes (inside i_size) */
377 if (left > 0 && pos < inode->i_size) {
378 if (pos + left > inode->i_size)
379 left = inode->i_size - pos;
380
381 dout("zero tail %d\n", left);
382 ceph_zero_page_vector_range(page_align + read, left,
383 pages);
384 read += left;
385 }
386 } 399 }
387 400
388 if (ret >= 0)
389 ret = read;
390 dout("striped_read returns %d\n", ret); 401 dout("striped_read returns %d\n", ret);
391 return ret; 402 return ret;
392} 403}
@@ -618,6 +629,8 @@ out:
618 if (check_caps) 629 if (check_caps)
619 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, 630 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
620 NULL); 631 NULL);
632 } else if (ret != -EOLDSNAPC && written > 0) {
633 ret = written;
621 } 634 }
622 return ret; 635 return ret;
623} 636}
@@ -659,7 +672,6 @@ again:
659 672
660 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 673 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
661 (iocb->ki_filp->f_flags & O_DIRECT) || 674 (iocb->ki_filp->f_flags & O_DIRECT) ||
662 (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
663 (fi->flags & CEPH_F_SYNC)) 675 (fi->flags & CEPH_F_SYNC))
664 /* hmm, this isn't really async... */ 676 /* hmm, this isn't really async... */
665 ret = ceph_sync_read(filp, base, len, ppos, &checkeof); 677 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
@@ -711,13 +723,11 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
711 &ceph_sb_to_client(inode->i_sb)->client->osdc; 723 &ceph_sb_to_client(inode->i_sb)->client->osdc;
712 ssize_t count, written = 0; 724 ssize_t count, written = 0;
713 int err, want, got; 725 int err, want, got;
714 bool hold_mutex;
715 726
716 if (ceph_snap(inode) != CEPH_NOSNAP) 727 if (ceph_snap(inode) != CEPH_NOSNAP)
717 return -EROFS; 728 return -EROFS;
718 729
719 mutex_lock(&inode->i_mutex); 730 mutex_lock(&inode->i_mutex);
720 hold_mutex = true;
721 731
722 err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); 732 err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
723 if (err) 733 if (err)
@@ -763,18 +773,31 @@ retry_snap:
763 773
764 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 774 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
765 (iocb->ki_filp->f_flags & O_DIRECT) || 775 (iocb->ki_filp->f_flags & O_DIRECT) ||
766 (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
767 (fi->flags & CEPH_F_SYNC)) { 776 (fi->flags & CEPH_F_SYNC)) {
768 mutex_unlock(&inode->i_mutex); 777 mutex_unlock(&inode->i_mutex);
769 written = ceph_sync_write(file, iov->iov_base, count, 778 written = ceph_sync_write(file, iov->iov_base, count,
770 pos, &iocb->ki_pos); 779 pos, &iocb->ki_pos);
780 if (written == -EOLDSNAPC) {
781 dout("aio_write %p %llx.%llx %llu~%u"
782 "got EOLDSNAPC, retrying\n",
783 inode, ceph_vinop(inode),
784 pos, (unsigned)iov->iov_len);
785 mutex_lock(&inode->i_mutex);
786 goto retry_snap;
787 }
771 } else { 788 } else {
789 /*
790		 * No need to acquire the i_truncate_mutex, because
791		 * the MDS revokes Fwb caps before sending a truncate
792		 * message to us. We can't get the Fwb cap while there
793		 * is a pending vmtruncate, so write and vmtruncate
794		 * cannot run at the same time.
795 */
772 written = generic_file_buffered_write(iocb, iov, nr_segs, 796 written = generic_file_buffered_write(iocb, iov, nr_segs,
773 pos, &iocb->ki_pos, 797 pos, &iocb->ki_pos,
774 count, 0); 798 count, 0);
775 mutex_unlock(&inode->i_mutex); 799 mutex_unlock(&inode->i_mutex);
776 } 800 }
777 hold_mutex = false;
778 801
779 if (written >= 0) { 802 if (written >= 0) {
780 int dirty; 803 int dirty;
@@ -798,18 +821,12 @@ retry_snap:
798 written = err; 821 written = err;
799 } 822 }
800 823
801 if (written == -EOLDSNAPC) { 824 goto out_unlocked;
802 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", 825
803 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
804 mutex_lock(&inode->i_mutex);
805 hold_mutex = true;
806 goto retry_snap;
807 }
808out: 826out:
809 if (hold_mutex) 827 mutex_unlock(&inode->i_mutex);
810 mutex_unlock(&inode->i_mutex); 828out_unlocked:
811 current->backing_dev_info = NULL; 829 current->backing_dev_info = NULL;
812
813 return written ? written : err; 830 return written ? written : err;
814} 831}
815 832
@@ -822,7 +839,6 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
822 int ret; 839 int ret;
823 840
824 mutex_lock(&inode->i_mutex); 841 mutex_lock(&inode->i_mutex);
825 __ceph_do_pending_vmtruncate(inode);
826 842
827 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 843 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
828 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 844 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
@@ -871,6 +887,204 @@ out:
871 return offset; 887 return offset;
872} 888}
873 889
890static inline void ceph_zero_partial_page(
891 struct inode *inode, loff_t offset, unsigned size)
892{
893 struct page *page;
894 pgoff_t index = offset >> PAGE_CACHE_SHIFT;
895
896 page = find_lock_page(inode->i_mapping, index);
897 if (page) {
898 wait_on_page_writeback(page);
899 zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
900 unlock_page(page);
901 page_cache_release(page);
902 }
903}
904
905static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
906 loff_t length)
907{
908 loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
909 if (offset < nearly) {
910 loff_t size = nearly - offset;
911 if (length < size)
912 size = length;
913 ceph_zero_partial_page(inode, offset, size);
914 offset += size;
915 length -= size;
916 }
917 if (length >= PAGE_CACHE_SIZE) {
918 loff_t size = round_down(length, PAGE_CACHE_SIZE);
919 truncate_pagecache_range(inode, offset, offset + size - 1);
920 offset += size;
921 length -= size;
922 }
923 if (length)
924 ceph_zero_partial_page(inode, offset, length);
925}
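A quick worked example of the head/middle/tail split above, assuming 4 KB pages (the values are illustrative):

/* offset = 1000, length = 10000, PAGE_CACHE_SIZE = 4096:
 *   head:   zero bytes 1000..4095 of the first page     (3096 bytes)
 *   middle: truncate_pagecache_range(inode, 4096, 8191) (4096 bytes)
 *   tail:   zero bytes 8192..10999 of the last page     (2808 bytes)
 */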
926
927static int ceph_zero_partial_object(struct inode *inode,
928 loff_t offset, loff_t *length)
929{
930 struct ceph_inode_info *ci = ceph_inode(inode);
931 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
932 struct ceph_osd_request *req;
933 int ret = 0;
934 loff_t zero = 0;
935 int op;
936
937 if (!length) {
938 op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
939 length = &zero;
940 } else {
941 op = CEPH_OSD_OP_ZERO;
942 }
943
944 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
945 ceph_vino(inode),
946 offset, length,
947 1, op,
948 CEPH_OSD_FLAG_WRITE |
949 CEPH_OSD_FLAG_ONDISK,
950 NULL, 0, 0, false);
951 if (IS_ERR(req)) {
952 ret = PTR_ERR(req);
953 goto out;
954 }
955
956 ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
957 &inode->i_mtime);
958
959 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
960 if (!ret) {
961 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
962 if (ret == -ENOENT)
963 ret = 0;
964 }
965 ceph_osdc_put_request(req);
966
967out:
968 return ret;
969}
970
971static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
972{
973 int ret = 0;
974 struct ceph_inode_info *ci = ceph_inode(inode);
975 s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
976 s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
977 s32 object_size = ceph_file_layout_object_size(ci->i_layout);
978 u64 object_set_size = object_size * stripe_count;
979 u64 nearly, t;
980
981 /* round offset up to next period boundary */
982 nearly = offset + object_set_size - 1;
983 t = nearly;
984 nearly -= do_div(t, object_set_size);
985
986 while (length && offset < nearly) {
987 loff_t size = length;
988 ret = ceph_zero_partial_object(inode, offset, &size);
989 if (ret < 0)
990 return ret;
991 offset += size;
992 length -= size;
993 }
994 while (length >= object_set_size) {
995 int i;
996 loff_t pos = offset;
997 for (i = 0; i < stripe_count; ++i) {
998 ret = ceph_zero_partial_object(inode, pos, NULL);
999 if (ret < 0)
1000 return ret;
1001 pos += stripe_unit;
1002 }
1003 offset += object_set_size;
1004 length -= object_set_size;
1005 }
1006 while (length) {
1007 loff_t size = length;
1008 ret = ceph_zero_partial_object(inode, offset, &size);
1009 if (ret < 0)
1010 return ret;
1011 offset += size;
1012 length -= size;
1013 }
1014 return ret;
1015}
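Similarly, a worked example of the period arithmetic (the layout values are assumed for illustration, not taken from the patch): with su = object_size = 4 MB and stripe_count = 2, object_set_size is 8 MB. For offset = 5 MB and length = 12 MB:

/* nearly = 5 MB rounded up to the 8 MB period boundary = 8 MB
 *   loop 1: zero 5..8 MB object by object              (9 MB left)
 *   loop 2: one full period 8..16 MB, issuing one whole-object
 *           truncate/delete per stripe object          (1 MB left)
 *   loop 3: zero the trailing 16..17 MB partial object
 */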
1016
1017static long ceph_fallocate(struct file *file, int mode,
1018 loff_t offset, loff_t length)
1019{
1020 struct ceph_file_info *fi = file->private_data;
1021 struct inode *inode = file->f_dentry->d_inode;
1022 struct ceph_inode_info *ci = ceph_inode(inode);
1023 struct ceph_osd_client *osdc =
1024 &ceph_inode_to_client(inode)->client->osdc;
1025 int want, got = 0;
1026 int dirty;
1027 int ret = 0;
1028 loff_t endoff = 0;
1029 loff_t size;
1030
1031 if (!S_ISREG(inode->i_mode))
1032 return -EOPNOTSUPP;
1033
1034 if (IS_SWAPFILE(inode))
1035 return -ETXTBSY;
1036
1037 mutex_lock(&inode->i_mutex);
1038
1039 if (ceph_snap(inode) != CEPH_NOSNAP) {
1040 ret = -EROFS;
1041 goto unlock;
1042 }
1043
1044 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) &&
1045 !(mode & FALLOC_FL_PUNCH_HOLE)) {
1046 ret = -ENOSPC;
1047 goto unlock;
1048 }
1049
1050 size = i_size_read(inode);
1051 if (!(mode & FALLOC_FL_KEEP_SIZE))
1052 endoff = offset + length;
1053
1054 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1055 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
1056 else
1057 want = CEPH_CAP_FILE_BUFFER;
1058
1059 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
1060 if (ret < 0)
1061 goto unlock;
1062
1063 if (mode & FALLOC_FL_PUNCH_HOLE) {
1064 if (offset < size)
1065 ceph_zero_pagecache_range(inode, offset, length);
1066 ret = ceph_zero_objects(inode, offset, length);
1067 } else if (endoff > size) {
1068 truncate_pagecache_range(inode, size, -1);
1069 if (ceph_inode_set_size(inode, endoff))
1070 ceph_check_caps(ceph_inode(inode),
1071 CHECK_CAPS_AUTHONLY, NULL);
1072 }
1073
1074 if (!ret) {
1075 spin_lock(&ci->i_ceph_lock);
1076 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
1077 spin_unlock(&ci->i_ceph_lock);
1078 if (dirty)
1079 __mark_inode_dirty(inode, dirty);
1080 }
1081
1082 ceph_put_cap_refs(ci, got);
1083unlock:
1084 mutex_unlock(&inode->i_mutex);
1085 return ret;
1086}
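From userspace the new hook is reached through the plain fallocate(2) syscall; a minimal sketch follows (the mount path is illustrative, and FALLOC_FL_PUNCH_HOLE must be paired with FALLOC_FL_KEEP_SIZE per the generic VFS rules):

/* Userspace sketch with an illustrative path; build with _GNU_SOURCE. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

int punch_hole_demo(void)
{
	int fd = open("/mnt/ceph/bigfile", O_RDWR);

	if (fd < 0)
		return -1;
	/* punch a 1 MB hole at offset 4 MB without changing i_size */
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 4 << 20, 1 << 20);
}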
1087
874const struct file_operations ceph_file_fops = { 1088const struct file_operations ceph_file_fops = {
875 .open = ceph_open, 1089 .open = ceph_open,
876 .release = ceph_release, 1090 .release = ceph_release,
@@ -887,5 +1101,6 @@ const struct file_operations ceph_file_fops = {
887 .splice_write = generic_file_splice_write, 1101 .splice_write = generic_file_splice_write,
888 .unlocked_ioctl = ceph_ioctl, 1102 .unlocked_ioctl = ceph_ioctl,
889 .compat_ioctl = ceph_ioctl, 1103 .compat_ioctl = ceph_ioctl,
1104 .fallocate = ceph_fallocate,
890}; 1105};
891 1106
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f3a2abf28a77..8549a48115f7 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -12,6 +12,7 @@
12 12
13#include "super.h" 13#include "super.h"
14#include "mds_client.h" 14#include "mds_client.h"
15#include "cache.h"
15#include <linux/ceph/decode.h> 16#include <linux/ceph/decode.h>
16 17
17/* 18/*
@@ -344,6 +345,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
344 for (i = 0; i < CEPH_FILE_MODE_NUM; i++) 345 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
345 ci->i_nr_by_mode[i] = 0; 346 ci->i_nr_by_mode[i] = 0;
346 347
348 mutex_init(&ci->i_truncate_mutex);
347 ci->i_truncate_seq = 0; 349 ci->i_truncate_seq = 0;
348 ci->i_truncate_size = 0; 350 ci->i_truncate_size = 0;
349 ci->i_truncate_pending = 0; 351 ci->i_truncate_pending = 0;
@@ -377,6 +379,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
377 379
378 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); 380 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
379 381
382 ceph_fscache_inode_init(ci);
383
380 return &ci->vfs_inode; 384 return &ci->vfs_inode;
381} 385}
382 386
@@ -396,6 +400,8 @@ void ceph_destroy_inode(struct inode *inode)
396 400
397 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); 401 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
398 402
403 ceph_fscache_unregister_inode_cookie(ci);
404
399 ceph_queue_caps_release(inode); 405 ceph_queue_caps_release(inode);
400 406
401 /* 407 /*
@@ -430,7 +436,6 @@ void ceph_destroy_inode(struct inode *inode)
430 call_rcu(&inode->i_rcu, ceph_i_callback); 436 call_rcu(&inode->i_rcu, ceph_i_callback);
431} 437}
432 438
433
434/* 439/*
435 * Helpers to fill in size, ctime, mtime, and atime. We have to be 440 * Helpers to fill in size, ctime, mtime, and atime. We have to be
436 * careful because either the client or MDS may have more up to date 441 * careful because either the client or MDS may have more up to date
@@ -455,16 +460,20 @@ int ceph_fill_file_size(struct inode *inode, int issued,
455 dout("truncate_seq %u -> %u\n", 460 dout("truncate_seq %u -> %u\n",
456 ci->i_truncate_seq, truncate_seq); 461 ci->i_truncate_seq, truncate_seq);
457 ci->i_truncate_seq = truncate_seq; 462 ci->i_truncate_seq = truncate_seq;
463
464 /* the MDS should have revoked these caps */
465 WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
466 CEPH_CAP_FILE_RD |
467 CEPH_CAP_FILE_WR |
468 CEPH_CAP_FILE_LAZYIO));
458 /* 469 /*
459 * If we hold relevant caps, or in the case where we're 470 * If we hold relevant caps, or in the case where we're
460 * not the only client referencing this file and we 471 * not the only client referencing this file and we
461 * don't hold those caps, then we need to check whether 472 * don't hold those caps, then we need to check whether
462 * the file is either opened or mmaped 473 * the file is either opened or mmaped
463 */ 474 */
464 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| 475 if ((issued & (CEPH_CAP_FILE_CACHE|
465 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| 476 CEPH_CAP_FILE_BUFFER)) ||
466 CEPH_CAP_FILE_EXCL|
467 CEPH_CAP_FILE_LAZYIO)) ||
468 mapping_mapped(inode->i_mapping) || 477 mapping_mapped(inode->i_mapping) ||
469 __ceph_caps_file_wanted(ci)) { 478 __ceph_caps_file_wanted(ci)) {
470 ci->i_truncate_pending++; 479 ci->i_truncate_pending++;
@@ -478,6 +487,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
478 truncate_size); 487 truncate_size);
479 ci->i_truncate_size = truncate_size; 488 ci->i_truncate_size = truncate_size;
480 } 489 }
490
491 if (queue_trunc)
492 ceph_fscache_invalidate(inode);
493
481 return queue_trunc; 494 return queue_trunc;
482} 495}
483 496
@@ -1066,7 +1079,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1066 * complete. 1079 * complete.
1067 */ 1080 */
1068 ceph_set_dentry_offset(req->r_old_dentry); 1081 ceph_set_dentry_offset(req->r_old_dentry);
1069 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1082 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1070 ceph_dentry(req->r_old_dentry)->offset); 1083 ceph_dentry(req->r_old_dentry)->offset);
1071 1084
1072 dn = req->r_old_dentry; /* use old_dentry */ 1085 dn = req->r_old_dentry; /* use old_dentry */
@@ -1419,18 +1432,20 @@ static void ceph_invalidate_work(struct work_struct *work)
1419 u32 orig_gen; 1432 u32 orig_gen;
1420 int check = 0; 1433 int check = 0;
1421 1434
1435 mutex_lock(&ci->i_truncate_mutex);
1422 spin_lock(&ci->i_ceph_lock); 1436 spin_lock(&ci->i_ceph_lock);
1423 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1437 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1424 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1438 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1425 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 1439 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1426 /* nevermind! */ 1440 /* nevermind! */
1427 spin_unlock(&ci->i_ceph_lock); 1441 spin_unlock(&ci->i_ceph_lock);
1442 mutex_unlock(&ci->i_truncate_mutex);
1428 goto out; 1443 goto out;
1429 } 1444 }
1430 orig_gen = ci->i_rdcache_gen; 1445 orig_gen = ci->i_rdcache_gen;
1431 spin_unlock(&ci->i_ceph_lock); 1446 spin_unlock(&ci->i_ceph_lock);
1432 1447
1433 truncate_inode_pages(&inode->i_data, 0); 1448 truncate_inode_pages(inode->i_mapping, 0);
1434 1449
1435 spin_lock(&ci->i_ceph_lock); 1450 spin_lock(&ci->i_ceph_lock);
1436 if (orig_gen == ci->i_rdcache_gen && 1451 if (orig_gen == ci->i_rdcache_gen &&
@@ -1445,6 +1460,7 @@ static void ceph_invalidate_work(struct work_struct *work)
1445 ci->i_rdcache_revoking); 1460 ci->i_rdcache_revoking);
1446 } 1461 }
1447 spin_unlock(&ci->i_ceph_lock); 1462 spin_unlock(&ci->i_ceph_lock);
1463 mutex_unlock(&ci->i_truncate_mutex);
1448 1464
1449 if (check) 1465 if (check)
1450 ceph_check_caps(ci, 0, NULL); 1466 ceph_check_caps(ci, 0, NULL);
@@ -1465,9 +1481,7 @@ static void ceph_vmtruncate_work(struct work_struct *work)
1465 struct inode *inode = &ci->vfs_inode; 1481 struct inode *inode = &ci->vfs_inode;
1466 1482
1467 dout("vmtruncate_work %p\n", inode); 1483 dout("vmtruncate_work %p\n", inode);
1468 mutex_lock(&inode->i_mutex);
1469 __ceph_do_pending_vmtruncate(inode); 1484 __ceph_do_pending_vmtruncate(inode);
1470 mutex_unlock(&inode->i_mutex);
1471 iput(inode); 1485 iput(inode);
1472} 1486}
1473 1487
@@ -1480,6 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1480 struct ceph_inode_info *ci = ceph_inode(inode); 1494 struct ceph_inode_info *ci = ceph_inode(inode);
1481 1495
1482 ihold(inode); 1496 ihold(inode);
1497
1483 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, 1498 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1484 &ci->i_vmtruncate_work)) { 1499 &ci->i_vmtruncate_work)) {
1485 dout("ceph_queue_vmtruncate %p\n", inode); 1500 dout("ceph_queue_vmtruncate %p\n", inode);
@@ -1500,11 +1515,13 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
1500 u64 to; 1515 u64 to;
1501 int wrbuffer_refs, finish = 0; 1516 int wrbuffer_refs, finish = 0;
1502 1517
1518 mutex_lock(&ci->i_truncate_mutex);
1503retry: 1519retry:
1504 spin_lock(&ci->i_ceph_lock); 1520 spin_lock(&ci->i_ceph_lock);
1505 if (ci->i_truncate_pending == 0) { 1521 if (ci->i_truncate_pending == 0) {
1506 dout("__do_pending_vmtruncate %p none pending\n", inode); 1522 dout("__do_pending_vmtruncate %p none pending\n", inode);
1507 spin_unlock(&ci->i_ceph_lock); 1523 spin_unlock(&ci->i_ceph_lock);
1524 mutex_unlock(&ci->i_truncate_mutex);
1508 return; 1525 return;
1509 } 1526 }
1510 1527
@@ -1521,6 +1538,9 @@ retry:
1521 goto retry; 1538 goto retry;
1522 } 1539 }
1523 1540
1541 /* there should be no reader or writer */
1542 WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
1543
1524 to = ci->i_truncate_size; 1544 to = ci->i_truncate_size;
1525 wrbuffer_refs = ci->i_wrbuffer_ref; 1545 wrbuffer_refs = ci->i_wrbuffer_ref;
1526 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, 1546 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
@@ -1538,13 +1558,14 @@ retry:
1538 if (!finish) 1558 if (!finish)
1539 goto retry; 1559 goto retry;
1540 1560
1561 mutex_unlock(&ci->i_truncate_mutex);
1562
1541 if (wrbuffer_refs == 0) 1563 if (wrbuffer_refs == 0)
1542 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 1564 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1543 1565
1544 wake_up_all(&ci->i_cap_wq); 1566 wake_up_all(&ci->i_cap_wq);
1545} 1567}
1546 1568
1547
1548/* 1569/*
1549 * symlinks 1570 * symlinks
1550 */ 1571 */
@@ -1586,8 +1607,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1586 if (ceph_snap(inode) != CEPH_NOSNAP) 1607 if (ceph_snap(inode) != CEPH_NOSNAP)
1587 return -EROFS; 1608 return -EROFS;
1588 1609
1589 __ceph_do_pending_vmtruncate(inode);
1590
1591 err = inode_change_ok(inode, attr); 1610 err = inode_change_ok(inode, attr);
1592 if (err != 0) 1611 if (err != 0)
1593 return err; 1612 return err;
@@ -1768,7 +1787,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1768 ceph_cap_string(dirtied), mask); 1787 ceph_cap_string(dirtied), mask);
1769 1788
1770 ceph_mdsc_put_request(req); 1789 ceph_mdsc_put_request(req);
1771 __ceph_do_pending_vmtruncate(inode); 1790 if (mask & CEPH_SETATTR_SIZE)
1791 __ceph_do_pending_vmtruncate(inode);
1772 return err; 1792 return err;
1773out: 1793out:
1774 spin_unlock(&ci->i_ceph_lock); 1794 spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index e0b4ef31d3c8..669622fd1ae3 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -196,8 +196,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
196 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, 196 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
197 &dl.object_no, &dl.object_offset, 197 &dl.object_no, &dl.object_offset,
198 &olen); 198 &olen);
199 if (r < 0) 199 if (r < 0) {
200 up_read(&osdc->map_sem);
200 return -EIO; 201 return -EIO;
202 }
201 dl.file_offset -= dl.object_offset; 203 dl.file_offset -= dl.object_offset;
202 dl.object_size = ceph_file_layout_object_size(ci->i_layout); 204 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
203 dl.block_size = ceph_file_layout_su(ci->i_layout); 205 dl.block_size = ceph_file_layout_su(ci->i_layout);
@@ -209,8 +211,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
209 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", 211 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
210 ceph_ino(inode), dl.object_no); 212 ceph_ino(inode), dl.object_no);
211 213
212 ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, 214 r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
213 ceph_file_layout_pg_pool(ci->i_layout)); 215 ceph_file_layout_pg_pool(ci->i_layout));
216 if (r < 0) {
217 up_read(&osdc->map_sem);
218 return r;
219 }
214 220
215 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); 221 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
216 if (dl.osd >= 0) { 222 if (dl.osd >= 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 187bf214444d..b7bda5d9611d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -414,6 +414,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
414{ 414{
415 struct ceph_mds_session *s; 415 struct ceph_mds_session *s;
416 416
417 if (mds >= mdsc->mdsmap->m_max_mds)
418 return ERR_PTR(-EINVAL);
419
417 s = kzalloc(sizeof(*s), GFP_NOFS); 420 s = kzalloc(sizeof(*s), GFP_NOFS);
418 if (!s) 421 if (!s)
419 return ERR_PTR(-ENOMEM); 422 return ERR_PTR(-ENOMEM);
@@ -1028,6 +1031,37 @@ static void remove_session_caps(struct ceph_mds_session *session)
1028{ 1031{
1029 dout("remove_session_caps on %p\n", session); 1032 dout("remove_session_caps on %p\n", session);
1030 iterate_session_caps(session, remove_session_caps_cb, NULL); 1033 iterate_session_caps(session, remove_session_caps_cb, NULL);
1034
1035 spin_lock(&session->s_cap_lock);
1036 if (session->s_nr_caps > 0) {
1037 struct super_block *sb = session->s_mdsc->fsc->sb;
1038 struct inode *inode;
1039 struct ceph_cap *cap, *prev = NULL;
1040 struct ceph_vino vino;
1041 /*
1042		 * iterate_session_caps() skips inodes that are being
1043		 * deleted, so we need to wait until deletions are complete.
1044		 * __wait_on_freeing_inode() is designed for the job,
1045		 * but it is not exported, so use the inode lookup function
1046		 * to achieve the same effect.
1047 */
1048 while (!list_empty(&session->s_caps)) {
1049 cap = list_entry(session->s_caps.next,
1050 struct ceph_cap, session_caps);
1051 if (cap == prev)
1052 break;
1053 prev = cap;
1054 vino = cap->ci->i_vino;
1055 spin_unlock(&session->s_cap_lock);
1056
1057 inode = ceph_find_inode(sb, vino);
1058 iput(inode);
1059
1060 spin_lock(&session->s_cap_lock);
1061 }
1062 }
1063 spin_unlock(&session->s_cap_lock);
1064
1031 BUG_ON(session->s_nr_caps > 0); 1065 BUG_ON(session->s_nr_caps > 0);
1032 BUG_ON(!list_empty(&session->s_cap_flushing)); 1066 BUG_ON(!list_empty(&session->s_cap_flushing));
1033 cleanup_cap_releases(session); 1067 cleanup_cap_releases(session);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6627b26a800c..6a0951e43044 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -17,6 +17,7 @@
17 17
18#include "super.h" 18#include "super.h"
19#include "mds_client.h" 19#include "mds_client.h"
20#include "cache.h"
20 21
21#include <linux/ceph/ceph_features.h> 22#include <linux/ceph/ceph_features.h>
22#include <linux/ceph/decode.h> 23#include <linux/ceph/decode.h>
@@ -142,6 +143,8 @@ enum {
142 Opt_nodcache, 143 Opt_nodcache,
143 Opt_ino32, 144 Opt_ino32,
144 Opt_noino32, 145 Opt_noino32,
146 Opt_fscache,
147 Opt_nofscache
145}; 148};
146 149
147static match_table_t fsopt_tokens = { 150static match_table_t fsopt_tokens = {
@@ -167,6 +170,8 @@ static match_table_t fsopt_tokens = {
167 {Opt_nodcache, "nodcache"}, 170 {Opt_nodcache, "nodcache"},
168 {Opt_ino32, "ino32"}, 171 {Opt_ino32, "ino32"},
169 {Opt_noino32, "noino32"}, 172 {Opt_noino32, "noino32"},
173 {Opt_fscache, "fsc"},
174 {Opt_nofscache, "nofsc"},
170 {-1, NULL} 175 {-1, NULL}
171}; 176};
172 177
@@ -260,6 +265,12 @@ static int parse_fsopt_token(char *c, void *private)
260 case Opt_noino32: 265 case Opt_noino32:
261 fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; 266 fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
262 break; 267 break;
268 case Opt_fscache:
269 fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
270 break;
271 case Opt_nofscache:
272 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
273 break;
263 default: 274 default:
264 BUG_ON(token); 275 BUG_ON(token);
265 } 276 }
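With these hooks in place, fscache stays strictly opt-in per mount: passing -o fsc sets CEPH_MOUNT_OPT_FSCACHE and -o nofsc clears it. The flag is not part of CEPH_MOUNT_OPT_DEFAULT, so existing mounts are unaffected; when it is set, create_fs_client() (later in this patch) calls ceph_fscache_register_fs() at mount time and tears the cookie back down on failure.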
@@ -422,6 +433,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
422 seq_puts(m, ",dcache"); 433 seq_puts(m, ",dcache");
423 else 434 else
424 seq_puts(m, ",nodcache"); 435 seq_puts(m, ",nodcache");
436 if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
437 seq_puts(m, ",fsc");
438 else
439 seq_puts(m, ",nofsc");
425 440
426 if (fsopt->wsize) 441 if (fsopt->wsize)
427 seq_printf(m, ",wsize=%d", fsopt->wsize); 442 seq_printf(m, ",wsize=%d", fsopt->wsize);
@@ -530,11 +545,18 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
530 if (!fsc->wb_pagevec_pool) 545 if (!fsc->wb_pagevec_pool)
531 goto fail_trunc_wq; 546 goto fail_trunc_wq;
532 547
548 /* setup fscache */
549 if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
550 (ceph_fscache_register_fs(fsc) != 0))
551 goto fail_fscache;
552
533 /* caps */ 553 /* caps */
534 fsc->min_caps = fsopt->max_readdir; 554 fsc->min_caps = fsopt->max_readdir;
535 555
536 return fsc; 556 return fsc;
537 557
558fail_fscache:
559 ceph_fscache_unregister_fs(fsc);
538fail_trunc_wq: 560fail_trunc_wq:
539 destroy_workqueue(fsc->trunc_wq); 561 destroy_workqueue(fsc->trunc_wq);
540fail_pg_inv_wq: 562fail_pg_inv_wq:
@@ -554,6 +576,8 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
554{ 576{
555 dout("destroy_fs_client %p\n", fsc); 577 dout("destroy_fs_client %p\n", fsc);
556 578
579 ceph_fscache_unregister_fs(fsc);
580
557 destroy_workqueue(fsc->wb_wq); 581 destroy_workqueue(fsc->wb_wq);
558 destroy_workqueue(fsc->pg_inv_wq); 582 destroy_workqueue(fsc->pg_inv_wq);
559 destroy_workqueue(fsc->trunc_wq); 583 destroy_workqueue(fsc->trunc_wq);
@@ -588,6 +612,8 @@ static void ceph_inode_init_once(void *foo)
588 612
589static int __init init_caches(void) 613static int __init init_caches(void)
590{ 614{
615 int error = -ENOMEM;
616
591 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 617 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
592 sizeof(struct ceph_inode_info), 618 sizeof(struct ceph_inode_info),
593 __alignof__(struct ceph_inode_info), 619 __alignof__(struct ceph_inode_info),
@@ -611,15 +637,17 @@ static int __init init_caches(void)
611 if (ceph_file_cachep == NULL) 637 if (ceph_file_cachep == NULL)
612 goto bad_file; 638 goto bad_file;
613 639
614 return 0; 640 if ((error = ceph_fscache_register()))
641 goto bad_file;
615 642
643 return 0;
616bad_file: 644bad_file:
617 kmem_cache_destroy(ceph_dentry_cachep); 645 kmem_cache_destroy(ceph_dentry_cachep);
618bad_dentry: 646bad_dentry:
619 kmem_cache_destroy(ceph_cap_cachep); 647 kmem_cache_destroy(ceph_cap_cachep);
620bad_cap: 648bad_cap:
621 kmem_cache_destroy(ceph_inode_cachep); 649 kmem_cache_destroy(ceph_inode_cachep);
622 return -ENOMEM; 650 return error;
623} 651}
624 652
625static void destroy_caches(void) 653static void destroy_caches(void)
@@ -629,10 +657,13 @@ static void destroy_caches(void)
629 * destroy cache. 657 * destroy cache.
630 */ 658 */
631 rcu_barrier(); 659 rcu_barrier();
660
632 kmem_cache_destroy(ceph_inode_cachep); 661 kmem_cache_destroy(ceph_inode_cachep);
633 kmem_cache_destroy(ceph_cap_cachep); 662 kmem_cache_destroy(ceph_cap_cachep);
634 kmem_cache_destroy(ceph_dentry_cachep); 663 kmem_cache_destroy(ceph_dentry_cachep);
635 kmem_cache_destroy(ceph_file_cachep); 664 kmem_cache_destroy(ceph_file_cachep);
665
666 ceph_fscache_unregister();
636} 667}
637 668
638 669
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cbded572345e..6014b0a3c405 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -16,6 +16,10 @@
16 16
17#include <linux/ceph/libceph.h> 17#include <linux/ceph/libceph.h>
18 18
19#ifdef CONFIG_CEPH_FSCACHE
20#include <linux/fscache.h>
21#endif
22
19/* f_type in struct statfs */ 23/* f_type in struct statfs */
20#define CEPH_SUPER_MAGIC 0x00c36400 24#define CEPH_SUPER_MAGIC 0x00c36400
21 25
@@ -29,6 +33,7 @@
29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ 33#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
30#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ 34#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
31#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ 35#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
36#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
32 37
33#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) 38#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
34 39
@@ -90,6 +95,11 @@ struct ceph_fs_client {
90 struct dentry *debugfs_bdi; 95 struct dentry *debugfs_bdi;
91 struct dentry *debugfs_mdsc, *debugfs_mdsmap; 96 struct dentry *debugfs_mdsc, *debugfs_mdsmap;
92#endif 97#endif
98
99#ifdef CONFIG_CEPH_FSCACHE
100 struct fscache_cookie *fscache;
101 struct workqueue_struct *revalidate_wq;
102#endif
93}; 103};
94 104
95 105
@@ -288,6 +298,7 @@ struct ceph_inode_info {
288 298
289 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 299 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
290 300
301 struct mutex i_truncate_mutex;
291 u32 i_truncate_seq; /* last truncate to smaller size */ 302 u32 i_truncate_seq; /* last truncate to smaller size */
292 u64 i_truncate_size; /* and the size we last truncated down to */ 303 u64 i_truncate_size; /* and the size we last truncated down to */
293 int i_truncate_pending; /* still need to call vmtruncate */ 304 int i_truncate_pending; /* still need to call vmtruncate */
@@ -319,6 +330,12 @@ struct ceph_inode_info {
319 330
320 struct work_struct i_vmtruncate_work; 331 struct work_struct i_vmtruncate_work;
321 332
333#ifdef CONFIG_CEPH_FSCACHE
334 struct fscache_cookie *fscache;
335 u32 i_fscache_gen; /* sequence, for delayed fscache validate */
336 struct work_struct i_revalidate_work;
337#endif
338
322 struct inode vfs_inode; /* at end */ 339 struct inode vfs_inode; /* at end */
323}; 340};
324 341
diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS
deleted file mode 100644
index ea940b1db77b..000000000000
--- a/fs/cifs/AUTHORS
+++ /dev/null
@@ -1,55 +0,0 @@
1Original Author
2===============
3Steve French (sfrench@samba.org)
4
5The author wishes to express his appreciation and thanks to:
6Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS
7improvements. Thanks to IBM for allowing me time and test resources to pursue
8this project, to Jim McDonough from IBM (and the Samba Team) for his help, to
9the IBM Linux JFS team for explaining many esoteric Linux filesystem features.
10Jeremy Allison of the Samba team has done invaluable work in adding the server
11side of the original CIFS Unix extensions and reviewing and implementing
12portions of the newer CIFS POSIX extensions into the Samba 3 file server. Thank
13Dave Boutcher of IBM Rochester (author of the OS/400 smb/cifs filesystem client)
14for proving years ago that very good smb/cifs clients could be done on Unix-like
15operating systems. Volker Lendecke, Andrew Tridgell, Urban Widmark, John
16Newbigin and others for their work on the Linux smbfs module. Thanks to
17the other members of the Storage Network Industry Association CIFS Technical
18Workgroup for their work specifying this highly complex protocol and finally
19thanks to the Samba team for their technical advice and encouragement.
20
21Patch Contributors
22------------------
23Zwane Mwaikambo
24Andi Kleen
25Amrut Joshi
26Shobhit Dayal
27Sergey Vlasov
28Richard Hughes
29Yury Umanets
30Mark Hamzy (for some of the early cifs IPv6 work)
31Domen Puncer
32Jesper Juhl (in particular for lots of whitespace/formatting cleanup)
33Vince Negri and Dave Stahl (for finding an important caching bug)
34Adrian Bunk (kcalloc cleanups)
35Miklos Szeredi
36Kazeon team for various fixes especially for 2.4 version.
37Asser Ferno (Change Notify support)
38Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup
39Gunter Kukkukk (testing and suggestions for support of old servers)
40Igor Mammedov (DFS support)
41Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code)
42
43Test case and Bug Report contributors
44-------------------------------------
45Thanks to those in the community who have submitted detailed bug reports
46and debug of problems they have found: Jochen Dolze, David Blaine,
47Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori,
48Lars Muller, Urban Widmark, Massimiliano Ferrero, Howard Owen,
49Olaf Kirch, Kieron Briggs, Nick Millington and others. Also special
50mention to the Stanford Checker (SWAT) which pointed out many minor
51bugs in error paths. Valuable suggestions also have come from Al Viro
52and Dave Miller.
53
54And thanks to the IBM LTC and Power test teams and SuSE testers for
55finding multiple bugs during excellent stress test runs.
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
deleted file mode 100644
index bc0025cdd1c9..000000000000
--- a/fs/cifs/CHANGES
+++ /dev/null
@@ -1,1065 +0,0 @@
1Version 1.62
2------------
3Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
4to more strictly handle corrupt frames.
5
6Version 1.61
7------------
8Fix append problem to Samba servers (files opened with O_APPEND could
9have duplicated data). Fix oops in cifs_lookup. Workaround problem
10mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
11Disable use of server inode numbers when server only
12partially supports them (e.g. for one server querying inode numbers on
13FindFirst fails but QPathInfo queries work). Fix oops with dfs in
14cifs_put_smb_ses. Fix mmap to work on directio mounts (needed
15e.g. for OpenOffice on a forcedirectio mount).
16
17Version 1.60
18-------------
19Fix memory leak in reconnect. Fix oops in DFS mount error path.
20Set s_maxbytes to smaller (the max that vfs can handle) so that
21sendfile will now work over cifs mounts again. Add noforcegid
22and noforceuid mount parameters. Fix small mem leak when using
23ntlmv2. Fix 2nd mount to same server but with different port to
24be allowed (rather than reusing the 1st port) - only when the
25user explicitly overrides the port on the 2nd mount.
26
27Version 1.59
28------------
29Client uses server inode numbers (which are persistent) rather than
30client generated ones by default (mount option "serverino" turned
31on by default if server supports it). Add forceuid and forcegid
32mount options (so that when negotiating unix extensions specifying
33which uid mounted does not immediately force the server's reported
34uids to be overridden). Add support for scope mount parm. Improve
35hard link detection to use same inode for both. Do not set
36read-only dos attribute on directories (for chmod) since Windows
37explorer special cases this attribute bit for directories for
38a different purpose.
39
40Version 1.58
41------------
42Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
43when the UTF-8 string is composed of unusually long (more than 4 byte) converted
44characters. Add support for mounting root of a share which redirects immediately
45to DFS target. Convert string conversion functions from Unicode to more
46accurately mark string length before allocating memory (which may help the
47rare cases where a UTF-8 string is much larger than the UCS2 string that
48we converted from). Fix endianness of the vcnum field used during
49session setup to distinguish multiple mounts to same server from different
50userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
51flag to be set to 2, and mount must enable krb5 to turn on extended security).
52Performance of file create to Samba improved (posix create on lookup
53removes 1 of 2 network requests sent on file create)
54
55Version 1.57
56------------
57Improve support for multiple security contexts to the same server. We
58used to use the same "vcnumber" for all connections which could cause
59the server to treat subsequent connections, especially those that
60are authenticated as guest, as reconnections, invalidating the earlier
61user's smb session. This fix allows cifs to mount multiple times to the
62same server with different userids without risking invalidating earlier
63established security contexts. fsync now sends SMB Flush operation
64to better ensure that we wait for server to write all of the data to
65server disk (not just write it over the network). Add new mount
66parameter to allow user to disable sending the (slow) SMB flush on
67fsync if desired (fsync still flushes all cached write data to the server).
68Posix file open support added (turned off after one attempt if server
69fails to support it properly, as with Samba server versions prior to 3.3.2)
70Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
71little memory for the "nativeFileSystem" field returned by the server
72during mount). Endian convert inode numbers if necessary (makes it easier
73to compare inode numbers on network files from big endian systems).
74
75Version 1.56
76------------
77Add "forcemandatorylock" mount option to allow user to use mandatory
78rather than posix (advisory) byte range locks, even though server would
79support posix byte range locks. Fix query of root inode when prefixpath
80specified and user does not have access to query information about the
81top of the share. Fix problem in 2.6.28 resolving DFS paths to
82Samba servers (worked to Windows). Fix rmdir so that pending search
83(readdir) requests do not get invalid results which include the now
84removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
85when using DFS. Add better file create support to servers which support
86the CIFS POSIX protocol extensions (this adds support for new flags
87on create, and improves semantics for write of locked ranges).
88
89Version 1.55
90------------
91Various fixes to make delete of open files behavior more predictable
92(when delete of an open file fails we mark the file as "delete-on-close"
93in a way that more servers accept, but only if we can first rename the
94file to a temporary name). Add experimental support for more safely
95handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp
96sends, and also let tcp autotune the socket send and receive buffers.
97This reduces the number of EAGAIN errors returned by TCP/IP in
98high stress workloads (and the number of retries on socket writes
99when sending large SMBWriteX requests). Fix case in which a portion of
100data can in some cases not get written to the file on the server before the
101file is closed. Fix DFS parsing to properly handle path consumed field,
102and to handle certain codepage conversions better. Fix mount and
103umount race that can cause oops in mount or umount or reconnect.
104
105Version 1.54
106------------
107Fix premature write failure on congested networks (we would give up
108on EAGAIN from the socket too quickly on large writes).
109Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
110Fix endian problems in acl (mode from/to cifs acl) on bigendian
111architectures. Fix problems with preserving timestamps on copying open
112files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit
113on parent directory when server supports Unix Extensions but not POSIX
114create. Update cifs.upcall version to handle new Kerberos sec flags
115(this requires update of cifs.upcall program from Samba). Fix memory leak
116on dns_upcall (resolving DFS referralls). Fix plain text password
117authentication (requires setting SecurityFlags to 0x30030 to enable
118lanman and plain text though). Fix writes to be at correct offset when
119file is open with O_APPEND and file is on a directio (forcedirectio) mount.
120Fix bug in rewinding readdir directory searches. Add nodfs mount option.
121
122Version 1.53
123------------
124DFS support added (Microsoft Distributed File System client support needed
125for referrals which enable a hierarchical name space among servers).
126Disable temporary caching of mode bits to servers which do not support
127storing of mode (e.g. Windows servers, when client mounts without cifsacl
128mount option) and add new "dynperm" mount option to enable temporary caching
129of mode (enable old behavior). Fix hang on mount caused when server crashes
130tcp session during negotiate protocol.
131
132Version 1.52
133------------
134Fix oops on second mount to server when null auth is used.
135Enable experimental Kerberos support. Return writebehind errors on flush
136and sync so that events like out of disk space get reported properly on
137cached files. Fix setxattr failure to certain Samba versions. Fix mount
138of second share to disconnected server session (autoreconnect on this).
139Add ability to modify cifs acls for handling chmod (when mounted with
140cifsacl flag). Fix prefixpath path separator so we can handle mounts
141with prefixpaths longer than one directory (one path component) when
142mounted to Windows servers. Fix slow file open when cifsacl
143enabled. Fix memory leak in FindNext when the SMB call returns -EBADF.
144
145
146Version 1.51
147------------
148Fix memory leak in statfs when mounted to very old servers (e.g.
149Windows 9x). Add new feature "POSIX open" which allows servers
150which support the current POSIX Extensions to provide better semantics
151(e.g. delete for open files opened with posix open). Take into
152account umask on posix mkdir not just older style mkdir. Add
153ability to mount to IPC$ share (which allows CIFS named pipes to be
154opened, read and written as if they were files). When 1st tree
155connect fails (e.g. due to signing negotiation failure) fix
156leak that causes cifsd not to stop and rmmod to fail to cleanup
157cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on
158bigendian architectures. Fix possible memory corruption when
159EAGAIN returned on kern_recvmsg. Return better error if server
160requires packet signing but client has disabled it. When mounted
161with cifsacl mount option - mode bits are approximated based
162on the contents of the ACL of the file or directory. When cifs
163mount helper is missing, convert the UNC name to make sure that it
164has a backslash (not forward slash) between the ip address of the server
165and the share name.
166
167Version 1.50
168------------
169Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is
170done with "serverino" mount option). Add support for POSIX Unlink
171(helps with certain sharing violation cases when server such as
172Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix"
173mount option to allow disabling the CIFS Unix Extensions for just
174that mount. Fix hang on spinlock in find_writable_file (race when
175reopening file after session crash). Byte range unlock request to
176windows server could unlock more bytes (on server copy of file)
177than intended if start of unlock request is well before start of
178a previous byte range lock that we issued.
179
180Version 1.49
181------------
182IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6
183address after the "ip=" mount option, at least until mount.cifs is fixed to
184handle DNS host to ipv6 name translation). Accept override of uid or gid
185on mount even when Unix Extensions are negotiated (it used to be ignored
186when Unix Extensions were negotiated). This allows users to override the
187default uid and gid for files when they are certain that the uids or
188gids on the server do not match those of the client. Make "sec=none"
189mount override username (so that null user connection is attempted)
190to match what documentation said. Support for very large reads, over 127K,
191available to some newer servers (such as Samba 3.0.26 and later but
192note that it also requires setting CIFSMaxBufSize at module install
193time to a larger value which may hurt performance in some cases).
194Make sign option force signing (or fail if server does not support it).
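As an illustration of the ip= option described in this entry, an ipv6 address can be passed directly among the mount options (the server, share, user and documentation-prefix address below are made up):

    mount -t cifs //server/share /mnt -o ip=2001:db8::1,user=jdoe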
195
196Version 1.48
197------------
198Fix mtime bouncing around from local idea of last write times to remote time.
199Fix hang (in i_size_read) when simultaneous size update of same remote file
200on smp systems corrupts the sequence number. Do not unnecessarily reread a partial page
201(which we are about to overwrite anyway) when writing out file opened rw.
202When DOS attribute of file on non-Unix server's file changes on the server side
203from read-only back to read-write, reflect this change in default file mode
204(we had been leaving a file's mode read-only until the inode was reloaded).
205Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
206when archive dos attribute not set and we are changing mode back to writeable
207on server which does not support the Unix Extensions). Remove read only dos
208attribute on chmod when adding any write permission (ie on any of
209user/group/other, not all of user/group/other ie 0222) when
210mounted to windows. Add support for POSIX MkDir (slight performance
211enhancement and eliminates the network race between the mkdir and set
212path info of the mode).
213
214
215Version 1.47
216------------
217Fix oops in list_del during mount caused by unaligned string.
218Fix file corruption which could occur on some large file
219copies caused by writepages page i/o completion bug.
220Seek to SEEK_END forces check for update of file size for non-cached
221files. Allow file size to be updated on remote extend of locally open,
222non-cached file. Fix reconnect to newer Samba servers (or other servers
223which support the CIFS Unix/POSIX extensions) so that we again tell the
224server the Unix/POSIX cifs capabilities which we support (SetFSInfo).
225Add experimental support for new POSIX Open/Mkdir (which returns
226stat information on the open, and allows setting the mode).
227
228Version 1.46
229------------
230Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps.
231Allow null user to be specified on mount ("username="). Do not return
232EINVAL on readdir when filldir fails due to overwritten blocksize
233(fixes FC problem). Return error in rename 2nd attempt retry (ie report
234if rename by handle also fails, after rename by path fails, we were
235not reporting whether the retry worked or not). Fix NTLMv2 to
236work to Windows servers (mount with option "sec=ntlmv2").
237
238Version 1.45
239------------
240Do not time out lockw calls when using posix extensions. Do not
241time out requests if server still responding reasonably fast
242on requests on other threads. Improve POSIX locking emulation,
243(lock cancel now works, and unlock of merged range works even
244to Windows servers now). Fix oops on mount to lanman servers
245(win9x, os/2 etc.) when null password. Do not send listxattr
246(SMB to query all EAs) if nouser_xattr specified. Fix SE Linux
247problem (instantiate inodes/dentries in right order for readdir).
248
249Version 1.44
250------------
251Rewritten sessionsetup support, including support for legacy SMB
252session setup needed for OS/2 and older servers such as Windows 95 and 98.
253Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst
254so we can do search (ls etc.) to OS/2. Do not send NTCreateX
255or recent levels of FindFirst unless server says it supports NT SMBs
256(instead use legacy equivalents from LANMAN dialect). Fix to allow
257NTLMv2 authentication support (now can use stronger password hashing
258on mount if the corresponding /proc/fs/cifs/SecurityFlags is set (0x4004)).
259Allow override of global cifs security flags on mount via "sec=" option(s).
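As an illustration of the /proc interface referenced above, the security flags can be set before mounting with a simple write (the 0x4004 value comes from this entry; other bit combinations appear elsewhere in this file):

    echo 0x4004 > /proc/fs/cifs/SecurityFlags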
260
261Version 1.43
262------------
263POSIX locking to servers which support CIFS POSIX Extensions
264(disabled by default controlled by proc/fs/cifs/Experimental).
265Handle conversion of long share names (especially Asian languages)
266to Unicode during mount. Fix memory leak in sess struct on reconnect.
267Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on
268cifs open which helps rare case when setpathinfo fails or server does
269not support it.
270
271Version 1.42
272------------
273Fix slow oplock break when mounted to different servers at the same time and
274the tids match and we try to find matching fid on wrong server. Fix read
275looping when signing required by server (2.6.16 kernel only). Fix readdir
276vs. rename race which could cause each to hang. Return . and .. even
277if server does not. Allow searches to skip first three entries and
278begin at any location. Fix oops in find_writeable_file.
279
280Version 1.41
281------------
282Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can
283configure stronger authentication. Fix sfu symlinks so they can
284be followed (not just recognized). Fix wraparound of bcc on
285read responses when buffer size over 64K and also fix wrap of
286max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in
287cifs_user_read and cifs_readpages (when EAGAIN on send of smb
288on socket is returned over and over). Add POSIX (advisory) byte range
289locking support (requires server with newest CIFS UNIX Extensions
290to the protocol implemented). Slow down negprot slightly in port 139
291RFC1001 case to give session_init time on buggy servers.
292
293Version 1.40
294------------
295Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance
296of readpages by eliminating one extra memcpy. Allow update of file size
297from remote server even if file is open for write as long as mount is
298directio. Recognize share mode security and send NTLM encrypted password
299on tree connect if share mode negotiated.
300
301Version 1.39
302------------
303Defer close of a file handle slightly if pending writes depend on that handle
304(this reduces the EBADF bad file handle errors that can be logged under heavy
305stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2.
306Fix SFU style symlinks and mknod needed for servers which do not support the
307CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative
308dentries so files that the client sees as deleted but that later get created
309on the server will be recognized. Add client side permission check on setattr.
310Timeout stuck requests better (where server has never responded or sent corrupt
311responses)
312
313Version 1.38
314------------
315Fix tcp socket retransmission timeouts (e.g. on ENOSPACE from the socket)
316to be smaller at first (but increasing) so large write performance
317over GigE is better. Do not hang thread on illegal byte range lock response
318from Windows (Windows can send an RFC1001 size which does not match smb size) by
319allowing an SMB's TCP length to be up to a few bytes longer than it should be.
320wsize and rsize can now be larger than negotiated buffer size if server
321supports large readx/writex, even when directio mount flag not specified.
322Write size will in many cases now be 16K instead of 4K which greatly helps
323file copy performance on lightly loaded networks. Fix oops in dnotify
324when experimental config flag enabled. Make cifsFYI more granular.
325
326Version 1.37
327------------
328Fix readdir caching when unlink removes file in current search buffer,
329and this is followed by a rewind search to just before the deleted entry.
330Do not attempt to set ctime unless atime and/or mtime change requested
331(most servers throw it away anyway). Fix length check of received smbs
332to be more accurate. Fix big endian problem with mapchars mount option,
333and with a field returned by statfs.
334
335Version 1.36
336------------
337Add support for mounting to older pre-CIFS servers such as Windows9x and ME.
338For these older servers, add option for passing netbios name of server in
339on mount (servernetbiosname). Add suspend support for power management, to
340avoid cifsd thread preventing software suspend from working.
341Add mount option for disabling the default behavior of sending byte range lock
342requests to the server (necessary for certain applications which break with
343mandatory lock behavior such as Evolution), and also mount option for
344requesting case insensitive matching for path based requests (requesting
345case sensitive is the default).
346
347Version 1.35
348------------
349Add writepage performance improvements. Fix path name conversions
350for long filenames on mounts which were done with "mapchars" mount option
351specified. Ensure multiplex ids do not collide. Fix case in which
352rmmod can oops if done soon after last unmount. Fix truncated
353search (readdir) output when resume filename was a long filename.
354Fix filename conversion when mapchars mount option was specified and
355filename was a long filename.
356
357Version 1.34
358------------
359Fix error mapping of the TOO_MANY_LINKS (hardlinks) case.
360Do not oops if root user kills cifs oplock kernel thread or
361kills the cifsd thread (NB: killing the cifs kernel threads is not
362recommended, unmount and rmmod cifs will kill them when they are
363no longer needed). Fix readdir to ASCII servers (ie older servers
364which do not support Unicode) and also those which require an asterisk.
365Fix out of memory case in which data could be written one page
366off in the page cache.
367
368Version 1.33
369------------
370Fix caching problem, in which readdir of directory containing a file
371which was cached could cause the file's time stamp to be updated
372without invalidating the readahead data (so we could get stale
373file data on the client for that file even as the server copy changed).
374Cleanup response processing so cifsd can not loop when abnormally
375terminated.
376
377
378Version 1.32
379------------
380Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one
381transact response for an SMB request and search entry split across two frames.
382Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server)
383as new protocol extensions. Do not send Get/Set calls for POSIX ACLs
384unless server explicitly claims to support them in CIFS Unix extensions
385POSIX ACL capability bit. Fix packet signing when multiuser mounting with
386different users from the same client to the same server. Fix oops in
387cifs_close. Add mount option for remapping reserved characters in
388filenames (also allow files created by SFU which contain any
389of these seven reserved characters, except backslash, to be recognized).
390Fix invalid transact2 message (we were sometimes trying to interpret
391oplock breaks as SMB responses). Add ioctl for checking that the
392current uid matches the uid of the mounter (needed by umount.cifs).
393Reduce the number of large buffer allocations in cifs response processing
394(significantly reduces memory pressure under heavy stress with multiple
395processes accessing the same server at the same time).
396
397Version 1.31
398------------
399Fix updates of DOS attributes and time fields so that files on NT4 servers
400do not get marked delete on close. Display sizes of cifs buffer pools in
401cifs stats. Fix oops in unmount when cifsd thread being killed by
402shutdown. Add generic readv/writev and aio support. Report inode numbers
403consistently in readdir and lookup (when serverino mount option is
404specified use the inode number that the server reports - for both lookup
405and readdir, otherwise by default the locally generated inode number is used
406for inodes created in either path since servers are not always able to
407provide unique inode numbers when exporting multiple volumes from under one
408sharename).
409
410Version 1.30
411------------
412Allow new nouser_xattr mount parm to disable xattr support for user namespace.
413Do not flag user_xattr mount parm in dmesg. Retry failures setting file time
414(mostly affects NT4 servers) by retry with handle based network operation.
415Add new POSIX Query FS Info for returning statfs info more accurately.
416Handle passwords with multiple commas in them.
417
418Version 1.29
419------------
420Fix default mode in sysfs of cifs module parms. Remove old readdir routine.
421Fix capabilities flags for large readx so as to allow reads larger than 64K.
422
423Version 1.28
424------------
425Add module init parm for large SMB buffer size (to allow it to be changed
426from its default of 16K) which is especially useful for large file copy
427when mounting with the directio mount option. Fix oops after
428returning from mount when experimental ExtendedSecurity enabled and
429SpnegoNegotiated returning invalid error. Fix case to retry better when
430peek returns from 1 to 3 bytes on socket which should have more data.
431Fixed path based calls (such as cifs lookup) to handle path names
432longer than 530 (now can handle PATH_MAX). Fix pass through authentication
433from Samba server to DC (Samba required dummy LM password).
434
435Version 1.27
436------------
437Turn off DNOTIFY (directory change notification support) by default
438(unless built with the experimental flag) to fix hang with KDE
439file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event
440waiting on an SMB response) in SendReceive when session dies but
441reconnects quickly from another task. Add module init parms for
442minimum number of large and small network buffers in the buffer pools,
443and for the maximum number of simultaneous requests.
444
445Version 1.26
446------------
447Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later
448and other POSIX CIFS compliant servers. Fix error mapping for getfacl
449to EOPNOTSUPP when server does not support posix acls on the wire. Fix
450improperly zeroed buffer in CIFS Unix extensions set times call.
451
452Version 1.25
453------------
454Fix internationalization problem in cifs readdir with filenames whose UTF-8
455form is longer than the Unicode string that was on the wire. Add workaround
456for readdir to netapp servers. Fix search rewind (seek into readdir to return
457non-consecutive entries). Do not do readdir when server negotiates
458buffer size too small to fit a filename. Add support for reading POSIX ACLs from
459the server (add also acl and noacl mount options).
460
461Version 1.24
462------------
463Optionally allow using server side inode numbers, rather than client generated
464ones by specifying mount option "serverino" - this is required for some apps
465to work which double check hardlinked files and have persistent inode numbers.
466
467Version 1.23
468------------
469Multiple bigendian fixes. On little endian systems (for reconnect after
470network failure) fix tcp session reconnect code so we do not try first
471to reconnect on reverse of port 445. Treat reparse points (NTFS junctions)
472as directories rather than symlinks because we can do follow link on them.
473
474Version 1.22
475------------
476Add config option to enable XATTR (extended attribute) support, mapping
477xattr names in the "user." namespace space to SMB/CIFS EAs. Lots of
478minor fixes pointed out by the Stanford SWAT checker (mostly missing
479or out of order NULL pointer checks in little used error paths).
480
481Version 1.21
482------------
483Add new mount parm to control whether mode check (generic_permission) is done
484on the client. If Unix extensions are enabled and the uids on the client
485and server do not match, client permission checks are meaningless on
486server uids that do not exist on the client (this does not affect the
487normal ACL check which occurs on the server). Fix default uid
488on mknod to match create and mkdir. Add optional mount parm to allow
489override of the default uid behavior (in which the server sets the uid
490and gid of newly created files). Normally for network filesystem mounts
491users want the server to set the uid/gid on newly created files (rather than
492using the uid of the client process as you would in a local filesystem).
493
494Version 1.20
495------------
496Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps
497info into /proc/fs/cifs/DebugData. Fix rare oops in readdir
498(in build_wildcard_path_from_dentry). Fix mknod to pass type field
499(block/char/fifo) properly. Remove spurious mount warning log entry when
500credentials passed as mount argument. Set major/minor device number in
501inode for block and char devices when unix extensions enabled.
502
503Version 1.19
504------------
505Fix /proc/fs/cifs/Stats and DebugData display to handle larger
506amounts of return data. Properly limit requests to MAX_REQ (50
507is the usual maximum active multiplex SMB/CIFS requests per server).
508Do not kill cifsd (and thus hurt the other SMB session) when more than one
509session to the same server (but with different userids) exists and one
510of the two user's smb sessions is being removed while leaving the other.
511Do not loop reconnecting in cifsd demultiplex thread when admin
512kills the thread without going through unmount.
513
514Version 1.18
515------------
516Do not rename hardlinked files (since that should be a noop). Flush
517cached write behind data when reopening a file after session abend,
518except when already in write. Grab per socket sem during reconnect
519to avoid oops in sendmsg if overlapping with reconnect. Do not
520reset cached inode file size on readdir for files open for write on
521client.
522
523
524Version 1.17
525------------
526Update number of blocks in file so du command is happier (in Linux a fake
527blocksize of 512 is required for calculating number of blocks in inode).
528Fix prepare write of partial pages to read in data from server if possible.
529Fix race on tcpStatus field between unmount and reconnection code, causing
530cifsd process sometimes to hang around forever. Improve out of memory
531checks in cifs_filldir
532
533Version 1.16
534------------
535Fix incorrect file size in file handle based setattr on big endian hardware.
536Fix oops in build_path_from_dentry when out of memory. Add checks for invalid
537and closing file structs in writepage/partialpagewrite. Add statistics
538for each mounted share (new menuconfig option). Fix endianness problem in
539volume information displayed in /proc/fs/cifs/DebugData (only affects
540big endian architectures). Prevent renames while constructing
541path names for open, mkdir and rmdir.
542
543Version 1.15
544------------
545Change to mempools for alloc smb request buffers and multiplex structs
546to better handle low memory problems (and potential deadlocks).
547
548Version 1.14
549------------
550Fix incomplete listings of large directories on Samba servers when Unix
551extensions enabled. Fix oops when smb_buffer can not be allocated. Fix
552rename deadlock when writing out dirty pages at same time.
553
554Version 1.13
555------------
556Fix open of files in which O_CREAT can cause the mode to change in
557some cases. Fix case in which retry of write overlaps file close.
558Fix PPC64 build error. Reduce excessive stack usage in smb password
559hashing. Fix overwrite of Linux user's view of file mode to Windows servers.
560
561Version 1.12
562------------
563Fixes for large file copy, signal handling, socket retry, buffer
564allocation and low memory situations.
565
566Version 1.11
567------------
568Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize)
569also now allowing support for specifying client netbiosname. NT4 support added.
570
571Version 1.10
572------------
573Fix reconnection (and certain failed mounts) to properly wake up the
574blocked user's thread so it does not seem hung (in some cases it was blocked
575until the cifs receive timeout expired). Fix spurious error logging
576to kernel log when application with open network files killed.
577
578Version 1.09
579------------
580Fix /proc/fs module unload warning message (that could be logged
581to the kernel log). Fix intermittent failure in connectathon
582test7 (hardlink count not immediately refreshed in case in which
583inode metadata can be incorrectly kept cached when time is near zero).
584
585Version 1.08
586------------
587Allow file_mode and dir_mode (specified at mount time) to be enforced
588locally (the server already enforced its own ACLs too) for servers
589that do not report the correct mode (do not support the
590CIFS Unix Extensions).
591
592Version 1.07
593------------
594Fix some small memory leaks in some unmount error paths. Fix major leak
595of cache pages in readpages causing multiple read oriented stress
596testcases (including fsx, and even large file copy) to fail over time.
597
598Version 1.06
599------------
600Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server.
601This allows files that differ only in case and improves performance of file
602creation and file open to such servers. Fix semaphore conflict which causes
603slow delete of open file to Samba (which unfortunately can cause an oplock
604break to self while vfs_unlink held i_sem) which can hang for 20 seconds.
605
606Version 1.05
607------------
608Fixes to cifs_readpages for the fsx test case.
609
610Version 1.04
611------------
612Fix caching data integrity bug when extending file size especially when no
613oplock on file. Fix spurious logging of valid already parsed mount options
614that are parsed outside of the cifs vfs such as nosuid.
615
616
617Version 1.03
618------------
619Connect to server when port number override not specified, and tcp port
620uninitialized. Reset search to restart at correct file when kernel routine
621filldir returns error during large directory searches (readdir).
622
623Version 1.02
624------------
625Fix caching problem when files opened by multiple clients in which
626page cache could contain stale data, and write through did
627not occur often enough while file was still open when read ahead
628(read oplock) not allowed. Treat "sep=", when it is the first mount option,
629as an override of comma as the default separator between mount
630options.
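A hypothetical invocation showing the effect: with "sep=" given first, semicolons separate the options, so the password itself may contain commas (the names and credentials below are made up):

    mount -t cifs //server/share /mnt -o 'sep=;user=jdoe;password=a,b'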
631
632Version 1.01
633------------
634Allow passwords longer than 16 bytes. Allow null password string.
635
636Version 1.00
637------------
638Gracefully clean up failed mounts when attempting to mount to servers such as
639Windows 98 that terminate tcp sessions during protocol negotiation. Handle
640embedded commas in mount parsing of passwords.
641
642Version 0.99
643------------
644Invalidate local inode cached pages on oplock break and when last file
645instance is closed so that the client does not continue using stale local
646copy rather than later modified server copy of file. Do not reconnect
647when server drops the tcp session prematurely before negotiate
648protocol response. Fix oops in reopen_file when dentry freed. Allow
649the support for CIFS Unix Extensions to be disabled via proc interface.
650
651Version 0.98
652------------
653Fix hang in commit_write during reconnection of open files under heavy load.
654Fix unload_nls oops in a mount failure path. Serialize writes to same socket
655which also fixes any possible races when cifs signatures are enabled in SMBs
656being sent out of signature sequence number order.
657
658Version 0.97
659------------
660Fix byte range locking bug (endian problem) causing bad offset and
661length.
662
663Version 0.96
664------------
665Fix oops (in send_sig) caused by CIFS unmount code trying to
666wake up the demultiplex thread after it had exited. Do not log
667error on harmless oplock release of closed handle.
668
669Version 0.95
670------------
671Fix unsafe global variable usage and password hash failure on gcc 3.3.1
672Fix problem reconnecting secondary mounts to same server after session
673failure. Fix invalid dentry - race in mkdir when directory gets created
674by another client between the lookup and mkdir.
675
676Version 0.94
677------------
678Fix to list processing in reopen_files. Fix reconnection when server hung
679but tcpip session still alive. Set proper timeout on socket read.
680
681Version 0.93
682------------
683Add missing mount options including iocharset. SMP fixes in write and open.
684Fix errors in reconnecting after TCP session failure. Fix module unloading
685of default nls codepage
686
687Version 0.92
688------------
689Active smb transactions should never go negative (fix double FreeXid). Fix
690list processing in file routines. Check return code on kmalloc in open.
691Fix spinlock usage for SMP.
692
693Version 0.91
694------------
695Fix oops in reopen_files when invalid dentry. Drop dentry on server rename
696and on revalidate errors. Fix cases where pid is now tgid. Fix return code
697on create hard link when server does not support them.
698
699Version 0.90
700------------
701Fix scheduling while atomic error in getting inode info on newly created file.
702Fix truncate of existing files opened with O_CREAT but not O_TRUNC set.
703
704Version 0.89
705------------
706Fix oops on write to dead tcp session. Remove error log write for case when file opened
707with O_CREAT but not O_EXCL.
708
709Version 0.88
710------------
711Fix non-POSIX behavior on rename of open file and delete of open file by taking
712advantage of trans2 SetFileInfo rename facility if available on target server.
713Retry on ENOSPC and EAGAIN socket errors.
714
715Version 0.87
716------------
717Fix oops on big endian readdir. Set blksize to be an even power of two (2**blkbits) to fix
718allocation size miscalculation. After oplock token lost do not read through
719cache.
720
721Version 0.86
722------------
723Fix oops on empty file readahead. Fix for file size handling for locally cached files.
724
725Version 0.85
726------------
727Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files
728during auto reconnection to server after server recovered from failure.
729
730Version 0.84
731------------
732Finish support for Linux 2.5 open/create changes, which removes the
733redundant NTCreate/QPathInfo/close that was sent during file create.
734Enable oplock by default. Enable packet signing by default (needed to
735access many recent Windows servers)
736
737Version 0.83
738------------
739Fix oops when mounting to long server names caused by inverted parms to kmalloc.
740Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled
741we will choose a cifs user session (smb uid) that better matches the local
742uid if a) the mount uid does not match the current uid and b) we have another
743session to the same server (ip address) for a different mount which
744matches the current local uid.
745
746Version 0.82
747------------
748Add support for mknod of block or character devices. Fix oplock
749code (distributed caching) to properly send response to oplock
750break from server.
751
752Version 0.81
753------------
754Finish up CIFS packet digital signing for the default
755NTLM security case. This should help Windows 2003
756network interoperability since it is common for
757packet signing to be required now. Fix statfs (stat -f)
758which recently started returning errors due to
759invalid value (-1 instead of 0) being set in the
760struct kstatfs f_ffiles field.
761
762Version 0.80
763------------
764Fix oops on stopping oplock thread when removing cifs when
765built as module.
766
767Version 0.79
768------------
769Fix mount options for ro (readonly), uid, gid and file and directory mode.
770
771Version 0.78
772------------
773Fix errors displayed on failed mounts to be more understandable.
774Fixed various incorrect or misleading smb to posix error code mappings.
775
776Version 0.77
777------------
778Fix display of NTFS DFS junctions to display as symlinks.
779They are the network equivalent. Fix oops in
780cifs_partialpagewrite caused by missing spinlock protection
781of openfile linked list. Allow writebehind caching errors to
782be returned to the application at file close.
783
784Version 0.76
785------------
786Clean up options displayed in /proc/mounts by show_options to
787be more consistent with other filesystems.
788
789Version 0.75
790------------
791Fix delete of readonly file to Windows servers. Reflect
792presence or absence of read only dos attribute in mode
793bits for servers that do not support CIFS Unix extensions.
794Fix shortened results on readdir of large directories to
795servers supporting CIFS Unix extensions (caused by
796incorrect resume key).
797
798Version 0.74
799------------
800Fix truncate bug (set file size) that could cause hangs e.g. running fsx
801
802Version 0.73
803------------
804Unload nls if mount fails.
805
806Version 0.72
807------------
808Add resume key support to search (readdir) code to workaround
809Windows bug. Add /proc/fs/cifs/LookupCacheEnable which
810allows disabling caching of attribute information for
811lookups.
812
813Version 0.71
814------------
815Add more oplock handling (distributed caching code). Remove
816dead code. Remove excessive stack space utilization from
817symlink routines.
818
819Version 0.70
820------------
821Fix oops in get dfs referral (triggered when null path sent in to
822mount). Add support for overriding rsize at mount time.
823
824Version 0.69
825------------
826Fix buffer overrun in readdir which caused intermittent kernel oopses.
827Fix writepage code to release kmap on write data. Allow new "-ip="
828mount option to be passed in as a parameter distinct from the first part
829(the server name portion) of the UNC name. Allow override of the
830tcp port of the target server via new mount option "-port=".
831
832Version 0.68
833------------
834Fix search handle leak on rewind. Fix setuid and gid so that they are
835reflected in the local inode immediately. Cleanup of whitespace
836to make 2.4 and 2.5 versions more consistent.
837
838
839Version 0.67
840------------
841Fix signal sending so that captive thread (cifsd) exits on umount
842(which was causing the warning in kmem_cache_free of the request buffers
843at rmmod time). This had broken as a sideeffect of the recent global
844kernel change to daemonize. Fix memory leak in readdir code which
845showed up in "ls -R" (and applications that did search rewinding).
846
847Version 0.66
848------------
849Reconnect tids and fids after session reconnection (still do not
850reconnect byte range locks though). Fix problem caching
851lookup information for directory inodes, improving performance,
852especially in deep directory trees. Fix various build warnings.
853
854Version 0.65
855------------
856Finish fixes to commit write for caching/readahead consistency. fsx
857now works to Samba servers. Fix oops caused when readahead
858was interrupted by a signal.
859
860Version 0.64
861------------
862Fix data corruption (in partial page after truncate) that caused fsx to
863fail to Windows servers. Cleaned up some extraneous error logging in
864common error paths. Add generic sendfile support.
865
866Version 0.63
867------------
868Fix memory leak in AllocMidQEntry.
869Finish reconnection logic, so connection with server can be dropped
870(or server rebooted) and the cifs client will reconnect.
871
872Version 0.62
873------------
874Fix temporary socket leak when bad userid or password specified
875(or other SMBSessSetup failure). Increase maximum buffer size to slightly
876over 16K to allow negotiation of up to Samba and Windows server default read
877sizes. Add support for readpages
878
879Version 0.61
880------------
881Fix oops when username not passed in on mount. Extensive fixes and improvements
882to error logging (strip redundant newlines, change debug macros to ensure newline
883passed in and to be more consistent). Fix writepage wrong file handle problem:
884a readonly file handle could be incorrectly used to attempt to write out
885file updates through the page cache to multiply open files. This could cause
886the iozone benchmark to fail on the fwrite test. Fix bug mounting two different
887shares to the same Windows server when using different usernames
888(doing this to Samba servers worked but Windows was rejecting it) - now it is
889possible to use different userids when connecting to the same server from a
890Linux client. Fix oops when treeDisconnect called during unmount on
891previously freed socket.
892
893Version 0.60
894------------
895Fix oops in readpages caused by not setting address space operations in inode in
896rare code path.
897
898Version 0.59
899------------
900Includes support for deleting of open files and renaming over existing files (per POSIX
901requirement). Add readlink support for Windows junction points (directory symlinks).
902
903Version 0.58
904------------
905Changed read and write to go through pagecache. Added additional address space operations.
906Memory mapped operations now working.
907
908Version 0.57
909------------
910Added writepage code for additional memory mapping support. Fixed leak in xids causing
911the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on
912every stat call. Additional formatting cleanup.
913
914Version 0.56
915------------
916Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup.
917
918Version 0.55
919------------
920Fixes from Zwane Mwaikambo for adding missing return code checking in a few places.
921Also included a modified version of his fix to protect global list manipulation of
922the smb session and tree connection and mid related global variables.
923
924Version 0.54
925------------
926Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre
927changes to superblock layout. Remove wasteful allocation of smb buffers (now the send
928buffer is reused for responses). Add more oplock handling. Additional minor cleanup.
929
930Version 0.53
931------------
932More stylistic updates to better match kernel style. Add additional statistics
933for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2
934and CIFS Packet Signing enablement.
935
936Version 0.52
937------------
938Replace call to sleep_on with safer wait_on_event.
939Make stylistic changes to better match kernel style recommendations.
940Remove most typedef usage (except for the PDUs themselves).
941
942Version 0.51
943------------
944Update mount so the -unc mount option is no longer required (the ip address can be specified
945in a UNC style device name). Implementation of readpage/writepage started.
946
947Version 0.50
948------------
949Fix intermittent problem with incorrect smb header checking on badly
950fragmented tcp responses
951
952Version 0.49
953------------
954Fixes to setting of allocation size and file size.
955
956Version 0.48
957------------
958Various 2.5.38 fixes. Now works on 2.5.38
959
960Version 0.47
961------------
962Prepare for 2.5 kernel merge. Remove ifdefs.
963
964Version 0.46
965------------
966Socket buffer management fixes. Fix dual free.
967
968Version 0.45
969------------
970Various big endian fixes for hardlinks and symlinks and also for dfs.
971
972Version 0.44
973------------
974Various big endian fixes for servers with Unix extensions such as Samba
975
976Version 0.43
977------------
978Various FindNext fixes for incorrect filenames on large directory searches on big endian
979clients. Basic posix file i/o tests now work on big endian machines, not just little endian ones.
980
981Version 0.42
982------------
983SessionSetup and NegotiateProtocol now work from Big Endian machines.
984Various Big Endian fixes found during testing of Linux on the 390. Various fixes for compatibility with older
985versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7).
986
987Version 0.41
988------------
989Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked
990files now return the correct number of links on fstat as they are repeatedly linked and unlinked.
991
992Version 0.40
993------------
994Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate
995advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP.
996Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs
997(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos).
998
999Version 0.38
1000------------
1001Introduced optional mount helper utility mount.cifs and made corequisite changes to the cifs vfs to enable
1002it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU).
1003
1004Version 0.37
1005------------
1006Rewrote much of connection and mount/unmount logic to handle bugs with
1007multiple uses to same share, multiple users to same server etc.
1008
1009Version 0.36
1010------------
1011Fixed major problem with dentry corruption (missing call to dput)
1012
1013Version 0.35
1014------------
1015Rewrite of readdir code to fix bug. Various fixes for bigendian machines.
1016Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs
1017although corresponding function not fully implemented in the vfs yet
1018
1019Version 0.34
1020------------
1021Fixed dentry caching bug, misc. cleanup
1022
1023Version 0.33
1024------------
1025Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes. Now can build
1026on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels.
1027Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added.
1028
1029Version 0.32
1030------------
1031Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented
1032and tested against Samba 2.2.5
1033
1034
1035Version 0.31
1036------------
10371) Fixed lockrange to be correct (it was one byte too short)
1038
10392) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly
1040show range as locked when there is a conflict with an existing lock.
1041
10423) Default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories
1043in most cases. Eventually will offer optional ability to query server for the correct perms.
1044
10454) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded
1046but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb
1047session)
1048
10495) Fixed error logging of valid mount options
1050
10516) Removed logging of password field.
1052
10537) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c
1054and cleaned them up and made them more consistent with other cifs functions.
1055
10568) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways
1057(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed,
1058nor is the symlink support using the Unix extensions
1059
10609) Started adding the readlink and follow_link code
1061
1062Version 0.3
1063-----------
1064Initial drop
1065
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index aa0d68b086eb..1964d212ab08 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
 	  cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
-	  readdir.o ioctl.o sess.o export.o smb1ops.o
+	  readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o
 
 cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
 
diff --git a/fs/cifs/README b/fs/cifs/README
deleted file mode 100644
index 2d5622f60e11..000000000000
--- a/fs/cifs/README
+++ /dev/null
@@ -1,753 +0,0 @@
1The CIFS VFS support for Linux supports many advanced network filesystem
2features such as hierarchical dfs like namespace, hardlinks, locking and more.
3It was designed to comply with the SNIA CIFS Technical Reference (which
4supersedes the 1992 X/Open SMB Standard) as well as to provide best practice,
5practical interoperability with Windows 2000, Windows XP, Samba and equivalent
6servers. This code was developed in participation with the Protocol Freedom
7Information Foundation.
8
9Please see
10 http://protocolfreedom.org/ and
11 http://samba.org/samba/PFIF/
12for more details.
13
14
15For questions or bug reports please contact:
16 sfrench@samba.org (sfrench@us.ibm.com)
17
18Build instructions:
19==================
20For Linux 2.4:
211) Get the kernel source (e.g. from http://www.kernel.org)
22and download the cifs vfs source (see the project page
23at http://us1.samba.org/samba/Linux_CIFS_client.html)
24and change directory into the top of the kernel directory
25then patch the kernel (e.g. "patch -p1 < cifs_24.patch")
26to add the cifs vfs to your kernel configure options if
27it has not already been added (e.g. current SuSE and UL
28users do not need to apply the cifs_24.patch since the cifs vfs is
29already in the kernel configure menu) and then
30mkdir linux/fs/cifs and then copy the current cifs vfs files from
31the cifs download to your kernel build directory e.g.
32
33 cp <cifs_download_dir>/fs/cifs/* <kernel_download_dir>/fs/cifs
34
352) make menuconfig (or make xconfig)
363) select cifs from within the network filesystem choices
374) save and exit
385) make dep
396) make modules (or "make" if CIFS VFS not to be built as a module)
40
41For Linux 2.6:
421) Download the kernel (e.g. from http://www.kernel.org)
43and change directory into the top of the kernel directory tree
44(e.g. /usr/src/linux-2.5.73)
452) make menuconfig (or make xconfig)
463) select cifs from within the network filesystem choices
474) save and exit
485) make
49
50
51Installation instructions:
52=========================
53If you have built the CIFS vfs as a module (successfully), simply
54type "make modules_install" (or if you prefer, manually copy the file to
55the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.o).
56
57If you have built the CIFS vfs into the kernel itself, follow the instructions
58for your distribution on how to install a new kernel (usually you
59would simply type "make install").
60
61If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on
62the CIFS VFS web site) copy it to the same directory in which mount.smbfs and
63similar files reside (usually /sbin). Although the helper software is not
64required, mount.cifs is recommended. Eventually the Samba 3.0 utility program
65"net" may also be helpful since it may someday provide easier mount syntax for
66users who are used to Windows e.g.
67 net use <mount point> <UNC name or cifs URL>
68Note that running the Winbind pam/nss module (logon service) on all of your
69Linux clients is useful in mapping Uids and Gids consistently across the
70domain to the proper network user. The mount.cifs mount helper can be
71trivially built from Samba 3.0 or later source e.g. by executing:
72
73 gcc samba/source/client/mount.cifs.c -o mount.cifs
74
75If cifs is built as a module, then the size and number of network buffers
76and maximum number of simultaneous requests to one server can be configured.
77Changing these from their defaults is not recommended. By executing
78 modinfo kernel/fs/cifs/cifs.ko
79the list of configuration options that can be changed at module initialization
80time (by running insmod cifs.ko) can be seen.
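For example, the large SMB buffer size parameter mentioned in the CHANGES history (CIFSMaxBufSize) can be raised when loading the module; the value shown is illustrative only:

 insmod cifs.ko CIFSMaxBufSize=130048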
81
82Allowing User Mounts
83====================
84Permitting users to mount and unmount over directories they own is possible
85with the cifs vfs. A way to enable such mounting is to mark the mount.cifs
86utility as suid (e.g. "chmod +s /sbin/mount.cifs"). Enabling users to
87umount shares they have mounted requires
881) mount.cifs version 1.4 or later
892) an entry for the share in /etc/fstab indicating that a user may
90unmount it e.g.
91//server/usersharename /mnt/username cifs user 0 0
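With an entry like the one above in place, the user who owns /mnt/username can then mount and unmount the share without root privileges:

 mount /mnt/username
 umount /mnt/username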
92
93Note that when the mount.cifs utility is run suid (allowing user mounts),
94in order to reduce risks, the "nosuid" mount flag is passed in on mount to
95disallow execution of an suid program mounted on the remote target.
96When mount is executed as root, nosuid is not passed in by default,
97and execution of suid programs on the remote target would be enabled
98by default. This can be changed, as with nfs and other filesystems,
99by simply specifying "nosuid" among the mount options. For user mounts,
100though, being able to pass the suid flag to mount requires rebuilding
101mount.cifs with the following flag:
102
103 gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs
104
105There is a corresponding manual page for cifs mounting in the Samba 3.0 and
106later source tree in docs/manpages/mount.cifs.8
107
108Allowing User Unmounts
109======================
110To permit users to umount directories that they have user mounted (see above),
111the utility umount.cifs may be used. It may be invoked directly, or if
112umount.cifs is placed in /sbin, umount can invoke the cifs umount helper
113(at least for most versions of the umount utility) for umount of cifs
114mounts, unless umount is invoked with -i (which will avoid invoking a umount
115helper). As with mount.cifs, to enable user unmounts umount.cifs must be marked
116as suid (e.g. "chmod +s /sbin/umount.cifs") or equivalent (some distributions
117allow adding entries to the /etc/permissions file to achieve the
118equivalent suid effect). For this utility to succeed, the target path
119must be a cifs mount, and the uid of the current user must match the uid
120of the user who mounted the resource.
121
122Also note that the customary way of allowing user mounts and unmounts is
123(instead of using mount.cifs and umount.cifs as suid) to add a line
124to the file /etc/fstab for each //server/share you wish to mount, but
125this can become unwieldy when potential mount targets include many
126or unpredictable UNC names.
127
128Samba Considerations
129====================
130To get the maximum benefit from the CIFS VFS, we recommend using a server that
131supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or
132Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers.
133Note that uid, gid and file permissions will display default values if you do
134not have a server that supports the Unix extensions for CIFS (such as Samba
1352.2.5 or later). To enable the Unix CIFS Extensions in the Samba server, add
136the line:
137
138 unix extensions = yes
139
140to your smb.conf file on the server. Note that the following smb.conf settings
141are also useful (on the Samba server) when the majority of clients are Unix or
142Linux:
143
144 case sensitive = yes
145 delete readonly = yes
146 ea support = yes
147
148Note that server ea support is required for supporting xattrs from the Linux
149cifs client, and that EA support is present in later versions of Samba (e.g.
1503.0.6 and later); EA support also works in all versions of Windows, at least to
151shares on NTFS filesystems. Extended Attribute (xattr) support is an optional
152feature of most Linux filesystems which may require enabling via
153make menuconfig. Client support for extended attributes (user xattr) can be
154disabled on a per-mount basis by specifying "nouser_xattr" on mount.
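
For instance (the mount point and attribute name here are hypothetical),
user xattrs can then be manipulated with the standard tools:

 setfattr -n user.origin -v internet /mnt/cifs/somefile
 getfattr -n user.origin /mnt/cifs/somefile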
155
156The CIFS client can get and set POSIX ACLs (getfacl, setfacl) to Samba servers
157version 3.10 and later. Setting POSIX ACLs requires enabling both XATTR and
158then POSIX support in the CIFS configuration options when building the cifs
159module. POSIX ACL support can be disabled on a per-mount basis by specifying
160"noacl" on mount.
161
162Some administrators may want to change Samba's smb.conf "map archive" and
163"create mask" parameters from the default. Unless the create mask is changed
164newly created files can end up with an unnecessarily restrictive default mode,
165which may not be what you want, although if the CIFS Unix extensions are
166enabled on the server and client, subsequent setattr calls (e.g. chmod) can
167fix the mode. Note that creating special devices (mknod) remotely
168may require specifying a mkdev function to Samba if you are not using
169Samba 3.0.6 or later. For more information on these see the manual pages
170("man smb.conf") on the Samba server system. Note that the cifs vfs,
171unlike the smbfs vfs, does not read the smb.conf on the client system
172(the few optional settings are passed in on mount via -o parameters instead).
173Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete
174open files (required for strict POSIX compliance). Windows Servers already
175supported this feature. Samba server does not allow symlinks that refer to files
176outside of the share, so in Samba versions prior to 3.0.6, most symlinks to
177files with absolute paths (ie beginning with slash) such as:
178 ln -s /mnt/foo bar
179would be forbidden. Samba 3.0.6 server or later includes the ability to create
180such symlinks safely by converting unsafe symlinks (ie symlinks to server
181files that are outside of the share) to a samba specific format on the server
182that is ignored by local server applications and non-cifs clients and that will
183not be traversed by the Samba server. This is opaque to the Linux client
184application using the cifs vfs. Absolute symlinks will work to Samba 3.0.5 or
185later, but only for remote clients using the CIFS Unix extensions, and will
186be invisible to Windows clients and typically will not affect local
187applications running on the same server as Samba.
188
189Use instructions:
190================
191Once the CIFS VFS support is built into the kernel or installed as a module
192(cifs.ko), you can use mount syntax like the following to access Samba or Windows
193servers:
194
195 mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword
196
197Before -o the option -v may be specified to make the mount.cifs
198mount helper display the mount steps more verbosely.
199After -o the following commonly used cifs vfs specific options
200are supported:
201
202 user=<username>
203 pass=<password>
204 domain=<domain name>
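
For example (the server name and account shown are placeholders), a verbose
mount using these options might be invoked as:

 mount -t cifs //server/share /mnt -v -o user=myname,pass=mypassword,domain=mydom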
205
206Other cifs mount options are described below. Use of TCP names (in addition to
207ip addresses) is available if the mount helper (mount.cifs) is installed. If
208you do not trust the server to which you are mounted, or if you do not have
209cifs signing enabled (and the physical network is insecure), consider use
210of the standard mount options "noexec" and "nosuid" to reduce the risk of
211running an altered binary on your local system (downloaded from a hostile server
212or altered by a hostile router).
213
214Although mounting using a format corresponding to the CIFS URL specification is
215not yet possible in mount.cifs, it is possible to use an alternate format
216for the server and sharename (which is somewhat similar to NFS style mount
217syntax) instead of the more widely used UNC format (i.e. \\server\share):
218 mount -t cifs tcp_name_of_server:share_name /mnt -o user=myname,pass=mypasswd
219
220When using the mount helper mount.cifs, passwords may be specified via alternate
221mechanisms, instead of being specified after -o using the normal "pass=" syntax
222on the command line (examples follow the list below):
2231) By including it in a credential file. Specify credentials=filename as one
224of the mount options. Credential files contain two lines
225 username=someuser
226 password=your_password
2272) By specifying the password in the PASSWD environment variable (similarly
228the user name can be taken from the USER environment variable).
2293) By specifying the password in a file by name via PASSWD_FILE
2304) By specifying the password in a file by file descriptor via PASSWD_FD
231
232If no password is provided, mount.cifs will prompt for password entry.
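
As illustrations of the first two mechanisms (the credential file path and
all values shown are hypothetical):

 mount -t cifs //server/share /mnt -o credentials=/root/.cifs-creds
 PASSWD=mypassword mount -t cifs //server/share /mnt -o user=myname

Since a credential file contains a cleartext password, it should be
protected (e.g. with "chmod 600").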
233
234Restrictions
235============
236Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC
2371001/1002 ("Netbios-Over-TCP/IP") connections. This is not likely to be a
238problem as most servers support this.
239
240Valid filenames differ between Windows and Linux. Windows typically restricts
241filenames which contain certain reserved characters (e.g. the character ':',
242which Windows uses to delimit the beginning of a stream name), while
243Linux allows a slightly wider set of valid characters in filenames. Windows
244servers can remap such characters when an explicit mapping is specified in
245the Server's registry. Samba starting with version 3.10 will allow such
246filenames (ie those which contain valid Linux characters, which normally
247would be forbidden for Windows/CIFS semantics) as long as the server is
248configured for Unix Extensions (and the client has not disabled
249/proc/fs/cifs/LinuxExtensionsEnabled).
250
251
252CIFS VFS Mount Options
253======================
254A partial list of the supported mount options follows (a combined example
appears after the list):
255 user The user name to use when trying to establish
256 the CIFS session.
257 password The user password. If the mount helper is
258 installed, the user will be prompted for password
259 if not supplied.
260 ip The ip address of the target server
261 unc The target server Universal Network Name (export) to
262 mount.
263 domain Set the SMB/CIFS workgroup name prepended to the
264 username during CIFS session establishment
265 forceuid Set the default uid for inodes to the uid
266 passed in on mount. For mounts to servers
267 which do support the CIFS Unix extensions, such as a
268 properly configured Samba server, the server provides
269 the uid, gid and mode so this parameter should not be
270 specified unless the server and clients uid and gid
271 numbering differ. If the server and client are in the
272 same domain (e.g. running winbind or nss_ldap) and
273 the server supports the Unix Extensions then the uid
274 and gid can be retrieved from the server (and uid
275 and gid would not have to be specified on the mount).
276 For servers which do not support the CIFS Unix
277 extensions, the default uid (and gid) returned on lookup
278 of existing files will be the uid (gid) of the person
279 who executed the mount (root, except when mount.cifs
280 is configured setuid for user mounts) unless the "uid="
281 (gid) mount option is specified. Also note that permission
282 checks (authorization checks) on accesses to a file occur
283 at the server, but there are cases in which an administrator
284 may want to restrict at the client as well. For those
285 servers which do not report a uid/gid owner
286 (such as Windows), permissions can also be checked at the
287 client, and a crude form of client side permission checking
288 can be enabled by specifying file_mode and dir_mode on
289 the client. (default)
290 forcegid (similar to above but for the groupid instead of uid) (default)
291 noforceuid Fill in file owner information (uid) by requesting it from
292 the server if possible. With this option, the value given in
293 the uid= option (on mount) will only be used if the server
294 can not support returning uids on inodes.
295 noforcegid (similar to above but for the group owner, gid, instead of uid)
296 uid Set the default uid for inodes, and indicate to the
297 cifs kernel driver which local user mounted. If the server
298 supports the unix extensions the default uid is
299 not used to fill in the owner fields of inodes (files)
300 unless the "forceuid" parameter is specified.
301 gid Set the default gid for inodes (similar to above).
302 file_mode If CIFS Unix extensions are not supported by the server
303 this overrides the default mode for file inodes.
304 fsc Enable local disk caching using FS-Cache (off by default). This
305 option could be useful to improve performance on a slow link,
306 heavily loaded server and/or network where reading from the
307 disk is faster than reading from the server (over the network).
308 This could also impact scalability positively as the
309 number of calls to the server are reduced. However, local
310 caching is not suitable for all workloads, e.g. read-once
311 type workloads. So, you need to consider your workload and
312 scenario carefully before using this option. Currently, local
313 disk caching is functional for CIFS files opened as read-only.
314 dir_mode If CIFS Unix extensions are not supported by the server
315 this overrides the default mode for directory inodes.
316 port attempt to contact the server on this tcp port, before
317 trying the usual ports (port 445, then 139).
318 iocharset Codepage used to convert local path names to and from
319 Unicode. Unicode is used by default for network path
320 names if the server supports it. If iocharset is
321 not specified then the nls_default specified
322 during the local client kernel build will be used.
323 If server does not support Unicode, this parameter is
324 unused.
325 rsize default read size (usually 16K). The client currently
326 can not use rsize larger than CIFSMaxBufSize. CIFSMaxBufSize
327 defaults to 16K and may be changed (from 8K to the maximum
328 kmalloc size allowed by your kernel) at module install time
329 for cifs.ko. Setting CIFSMaxBufSize to a very large value
330 will cause cifs to use more memory and may reduce performance
331 in some cases. To use rsize greater than 127K (the original
332 cifs protocol maximum) also requires that the server support
333 a new Unix Capability flag (for very large read) which some
334 newer servers (e.g. Samba 3.0.26 or later) do. rsize can be
335 set from a minimum of 2048 to a maximum of 130048 (127K or
336 CIFSMaxBufSize, whichever is smaller)
337 wsize default write size (default 57344)
338 maximum wsize currently allowed by CIFS is 57344 (fourteen
339 4096 byte pages)
340 actimeo=n attribute cache timeout in seconds (default 1 second).
341 After this timeout, the cifs client requests fresh attribute
342 information from the server. This option allows tuning the
343 attribute cache timeout to suit the workload needs. Shorter
344 timeouts mean better cache coherency, but an increased number
345 of calls to the server. Longer timeouts mean a reduced number
346 of calls to the server at the expense of less strict cache
347 coherency checks (i.e. incorrect attribute cache for a short
348 period of time).
349 rw mount the network share read-write (note that the
350 server may still consider the share read-only)
351 ro mount network share read-only
352 version used to distinguish different versions of the
353 mount helper utility (not typically needed)
354 sep if first mount option (after the -o), overrides
355 the comma as the separator between the mount
356 parms. e.g.
357 -o user=myname,password=mypassword,domain=mydom
358 could be passed instead with period as the separator by
359 -o sep=.user=myname.password=mypassword.domain=mydom
360 this might be useful when comma is contained within username
361 or password or domain. This option is less important
362 when the cifs mount helper mount.cifs (version 1.1 or later)
363 is used.
364 nosuid Do not allow remote executables with the suid bit
365 set to be executed. This is only meaningful for mounts
366 to servers such as Samba which support the CIFS Unix Extensions.
367 If you do not trust the servers in your network (your mount
368 targets) it is recommended that you specify this option for
369 greater security.
370 exec Permit execution of binaries on the mount.
371 noexec Do not permit execution of binaries on the mount.
372 dev Recognize block devices on the remote mount.
373 nodev Do not recognize devices on the remote mount.
374 suid Allow remote files on this mountpoint with suid enabled to
375 be executed (default for mounts when executed as root,
376 nosuid is default for user mounts).
377 credentials Although ignored by the cifs kernel component, it is used by
378 the mount helper, mount.cifs. When mount.cifs is installed it
379 opens and reads the credential file specified in order
380 to obtain the userid and password arguments which are passed to
381 the cifs vfs.
382 guest Although ignored by the kernel component, the mount.cifs
383 mount helper will not prompt the user for a password
384 if guest is specified on the mount options. If no
385 password is specified a null password will be used.
386 perm Client does permission checks (vfs_permission check of uid
387 and gid of the file against the mode and desired operation).
388 Note that this is in addition to the normal ACL check on the
389 target machine done by the server software.
390 Client permission checking is enabled by default.
391 noperm Client does not do permission checks. This can expose
392 files on this mount to access by other users on the local
393 client system. It is typically only needed when the server
394 supports the CIFS Unix Extensions but the UIDs/GIDs on the
395 client and server system do not match closely enough to allow
396 access by the user doing the mount, but it may be useful with
397 non CIFS Unix Extension mounts for cases in which the default
398 mode is specified on the mount but is not to be enforced on the
399 client (e.g. perhaps when MultiUserMount is enabled)
400 Note that this does not affect the normal ACL check on the
401 target machine done by the server software (of the server
402 ACL against the user name provided at mount time).
403 serverino Use server's inode numbers instead of generating automatically
404 incrementing inode numbers on the client. Although this will
405 make it easier to spot hardlinked files (as they will have
406 the same inode numbers) and inode numbers may be persistent,
407 note that the server does not guarantee that the inode numbers
408 are unique if multiple server side mounts are exported under a
409 single share (since inode numbers on the servers might not
410 be unique if multiple filesystems are mounted under the same
411 shared higher level directory). Note that some older
412 servers (e.g. pre-Windows 2000) do not support returning UniqueIDs
413 or the CIFS Unix Extensions equivalent and for those
414 this mount option will have no effect. Exporting cifs mounts
415 under nfsd requires this mount option on the cifs mount.
416 This is now the default if server supports the
417 required network operation.
418 noserverino Client generates inode numbers (rather than using the actual one
419 from the server). These inode numbers will vary after
420 unmount or reboot which can confuse some applications,
421 but not all server filesystems support unique inode
422 numbers.
423 setuids If the CIFS Unix extensions are negotiated with the server
424 the client will attempt to set the effective uid and gid of
425 the local process on newly created files, directories, and
426 devices (create, mkdir, mknod). If the CIFS Unix Extensions
427 are not negotiated, for newly created files and directories
428 instead of using the default uid and gid specified on
429 the mount, cache the new file's uid and gid locally which means
430 that the uid for the file can change when the inode is
431 reloaded (or the user remounts the share).
432 nosetuids The client will not attempt to set the uid and gid on
433 newly created files, directories, and devices (create,
434 mkdir, mknod) which will result in the server setting the
435 uid and gid to the default (usually the server uid of the
436 user who mounted the share). Letting the server (rather than
437 the client) set the uid and gid is the default. If the CIFS
438 Unix Extensions are not negotiated then the uid and gid for
439 new files will appear to be the uid (gid) of the mounter or the
440 uid (gid) parameter specified on the mount.
441 netbiosname When mounting to servers via port 139, specifies the RFC1001
442 source name to use to represent the client netbios machine
443 name when doing the RFC1001 netbios session initialize.
444 direct Do not do inode data caching on files opened on this mount.
445 This precludes mmapping files on this mount. In some cases
446 with fast networks and little or no caching benefits on the
447 client (e.g. when the application is doing large sequential
448 reads bigger than page size without rereading the same data)
449 this can provide better performance than the default
450 behavior which caches reads (readahead) and writes
451 (writebehind) through the local Linux client pagecache
452 if oplock (caching token) is granted and held. Note that
453 direct allows write operations larger than page size
454 to be sent to the server.
455 strictcache Use for switching on strict cache mode. In this mode the
456 client reads from the cache as long as it holds a Level II
457 oplock; otherwise it reads from the server. All written data
458 are stored in the cache, but if the client does not hold an
459 Exclusive oplock, it also writes the data to the server.
460 rwpidforward Forward the pid of the process that opened a file to any read
461 or write operation on that file. This prevents applications
462 like WINE from failing on read and write if we use mandatory brlock style.
463 acl Allow setfacl and getfacl to manage posix ACLs if server
464 supports them. (default)
465 noacl Do not allow setfacl and getfacl calls on this mount
466 user_xattr Allow getting and setting user xattrs (those attributes whose
467 name begins with "user." or "os2.") as OS/2 EAs (extended
468 attributes) to the server. This allows support of the
469 setfattr and getfattr utilities. (default)
470 nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs
471 mapchars Translate six of the seven reserved characters (not backslash)
472 *?<>|:
473 to the remap range (above 0xF000), which also
474 allows the CIFS client to recognize files created with
475 such characters by Windows's POSIX emulation. This can
476 also be useful when mounting to most versions of Samba
477 (which also forbids creating and opening files
478 whose names contain any of these seven characters).
479 This has no effect if the server does not support
480 Unicode on the wire.
481 nomapchars Do not translate any of these seven characters (default).
482 nocase Request case insensitive path name matching (case
483 sensitive is the default if the server supports it).
484 (mount option "ignorecase" is identical to "nocase")
485 posixpaths If CIFS Unix extensions are supported, attempt to
486 negotiate posix path name support which allows certain
487 characters forbidden in typical CIFS filenames, without
488 requiring remapping. (default)
489 noposixpaths If CIFS Unix extensions are supported, do not request
490 posix path name support (this may cause servers to
491 reject creating files with certain reserved characters).
492 nounix Disable the CIFS Unix Extensions for this mount (tree
493 connection). This is rarely needed, but it may be useful
494 in order to turn off multiple settings all at once (ie
495 posix acls, posix locks, posix paths, symlink support
496 and retrieving uids/gids/mode from the server) or to
497 work around a bug in servers which implement the Unix
498 Extensions.
499 nobrl Do not send byte range lock requests to the server.
500 This is necessary for certain applications that break
501 with cifs style mandatory byte range locks (and most
502 cifs servers do not yet support requesting advisory
503 byte range locks).
504 forcemandatorylock Even if the server supports posix (advisory) byte range
505 locking, send only mandatory lock requests. For some
506 (presumably rare) applications, originally coded for
507 DOS/Windows, which require Windows style mandatory byte range
508 locking, they may be able to take advantage of this option,
509 forcing the cifs client to only send mandatory locks
510 even if the cifs server would support posix advisory locks.
511 "forcemand" is accepted as a shorter form of this mount
512 option.
513 nostrictsync If this mount option is set, when an application does an
514 fsync call then the cifs client does not send an SMB Flush
515 to the server (to force the server to write all dirty data
516 for this file immediately to disk), although cifs still sends
517 all dirty (cached) file data to the server and waits for the
518 server to respond to the write. Since SMB Flush can be
519 very slow, and some servers may be reliable enough (to risk
520 delaying slightly flushing the data to disk on the server),
521 turning on this option may be useful to improve performance for
522 applications that fsync too much, at a small risk of server
523 crash. If this mount option is not set, by default cifs will
524 send an SMB flush request (and wait for a response) on every
525 fsync call.
526 nodfs Disable DFS (global name space support) even if the
527 server claims to support it. This can help work around
528 a problem with parsing of DFS paths with Samba server
529 versions 3.0.24 and 3.0.25.
530 remount remount the share (often used to change from ro to rw mounts
531 or vice versa)
532 cifsacl Report mode bits (e.g. on stat) based on the Windows ACL for
533 the file. (EXPERIMENTAL)
534 servern Specify the server's netbios name (RFC1001 name) to use
535 when attempting to setup a session to the server.
536 This is needed for mounting to some older servers (such
537 as OS/2 or Windows 98 and Windows ME) since they do not
538 support a default server name. A server name can be up
539 to 15 characters long and is usually uppercased.
540 sfu When the CIFS Unix Extensions are not negotiated, attempt to
541 create device files and fifos in a format compatible with
542 Services for Unix (SFU). In addition retrieve bits 10-12
543 of the mode via the SETFILEBITS extended attribute (as
544 SFU does). In the future the bottom 9 bits of the
545 mode also will be emulated using queries of the security
546 descriptor (ACL).
547 mfsymlinks Enable support for Minshall+French symlinks
548 (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
549 This option is ignored when specified together with the
550 'sfu' option. Minshall+French symlinks are used even if
551 the server supports the CIFS Unix Extensions.
552 sign Must use packet signing (helps avoid unwanted data modification
553 by intermediate systems in the route). Note that signing
554 does not work with lanman or plaintext authentication.
555 seal Must seal (encrypt) all data on this mounted share before
556 sending on the network. Requires support for Unix Extensions.
557 Note that this differs from the sign mount option in that it
558 causes encryption of data sent over this mounted share but other
559 shares mounted to the same server are unaffected.
560 locallease This option is rarely needed. Fcntl F_SETLEASE is
561 used by some applications such as Samba and NFSv4 server to
562 check to see whether a file is cacheable. CIFS has no way
563 to explicitly request a lease, but can check whether a file
564 is cacheable (oplocked). Unfortunately, even if a file
565 is not oplocked, it could still be cacheable (ie cifs client
566 could grant fcntl leases if no other local processes are using
567 the file) in cases such as when the server does not
568 support oplocks and the user is sure that the only updates to
569 the file will be from this client. Specifying this mount option
570 will allow the cifs client to check for leases (only) locally
571 for files which are not oplocked instead of denying leases
572 in that case. (EXPERIMENTAL)
573 sec Security mode. Allowed values are:
574 none attempt to connect as a null user (no name)
575 krb5 Use Kerberos version 5 authentication
576 krb5i Use Kerberos authentication and packet signing
577 ntlm Use NTLM password hashing (default)
578 ntlmi Use NTLM password hashing with signing (if
579 /proc/fs/cifs/PacketSigningEnabled on or if
580 server requires signing also can be the default)
581 ntlmv2 Use NTLMv2 password hashing
582 ntlmv2i Use NTLMv2 password hashing with packet signing
583 lanman (if configured in kernel config) use older
584 lanman hash
585 hard Retry file operations if the server is not responding
586 soft Limit retries to unresponsive servers (usually only
587 one retry) before returning an error. (default)
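
As a combined example (the server, share, and ids shown are placeholders),
several of the options above might be used together as:

 mount -t cifs //server/share /mnt -o user=myname,uid=500,gid=500,file_mode=0644,dir_mode=0755,sec=ntlmv2i,nosuid,noexec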
588
589The mount.cifs mount helper also accepts a few mount options before -o
590including:
591
592 -S take password from stdin (equivalent to setting the environment
593 variable "PASSWD_FD=0")
594 -V print mount.cifs version
595 -? display simple usage information
596
597With most 2.6 kernel versions of modutils, the version of the cifs kernel
598module can be displayed via modinfo.
599
600Misc /proc/fs/cifs Flags and Debug Info
601=======================================
602Informational pseudo-files:
603DebugData Displays information about active CIFS sessions and
604 shares, features enabled as well as the cifs.ko
605 version.
606Stats Lists summary resource usage information as well as per
607 share statistics, if CONFIG_CIFS_STATS is enabled
608 in the kernel configuration.
609
610Configuration pseudo-files:
611PacketSigningEnabled If set to one, cifs packet signing is enabled
612 and will be used if the server requires
613 it. If set to two, cifs packet signing is
614 required even if the server considers packet
615 signing optional. (default 1)
616SecurityFlags Flags which control security negotiation and
617 also packet signing. Authentication (may/must)
618 flags (e.g. for NTLM and/or NTLMv2) may be combined with
619 the signing flags. Specifying two different password
620 hashing mechanisms (as "must use") on the other hand
621 does not make much sense. Default flags are 0x07007
623 (NTLM, NTLMv2 and packet signing allowed). The maximum
624 allowable flags if you want to allow mounts to servers
625 using weaker password hashes is 0x37037 (lanman,
626 plaintext, ntlm, ntlmv2, signing allowed). Some
627 SecurityFlags require the corresponding menuconfig
628 options to be enabled (lanman and plaintext require
629 CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
630 plaintext authentication currently requires also
631 enabling lanman authentication in the security flags
632 because the cifs module only supports sending
633 plaintext passwords using the older lanman dialect
634 form of the session setup SMB. (e.g. for authentication
635 using plain text passwords, set the SecurityFlags
636 to 0x30030):
637
638 may use packet signing 0x00001
639 must use packet signing 0x01001
640 may use NTLM (most common password hash) 0x00002
641 must use NTLM 0x02002
642 may use NTLMv2 0x00004
643 must use NTLMv2 0x04004
644 may use Kerberos security 0x00008
645 must use Kerberos 0x08008
646 may use lanman (weak) password hash 0x00010
647 must use lanman password hash 0x10010
648 may use plaintext passwords 0x00020
649 must use plaintext passwords 0x20020
650 (reserved for future packet encryption) 0x00040
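
 For example, to permit the weaker lanman and plaintext mechanisms
 described above (assuming the corresponding menuconfig options are
 enabled), the flags could be set with:

 echo 0x30030 > /proc/fs/cifs/SecurityFlags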
651
652cifsFYI If set to non-zero value, additional debug information
653 will be logged to the system error log. This field
654 contains three flags controlling different classes of
655 debugging entries. The maximum value it can be set
656 to is 7 which enables all debugging points (default 0).
657 Some debugging statements are not compiled into the
658 cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
659 kernel configuration. cifsFYI may be set to one or
660 more of the following flags (7 sets them all):
661
662 log cifs informational messages 0x01
663 log return codes from cifs entry points 0x02
664 log slow responses (ie those which take longer than 1 second;
665 CONFIG_CIFS_STATS2 must be enabled in .config) 0x04
666
667
668traceSMB If set to one, debug information is logged to the
669 system error log with the start of smb requests
670 and responses (default 0)
671LookupCacheEnable If set to one, inode information is kept cached
672 for one second improving performance of lookups
673 (default 1)
674OplockEnabled If set to one, safe distributed caching is enabled.
675 (default 1)
676LinuxExtensionsEnabled If set to one then the client will attempt to
677 use the CIFS "UNIX" extensions which are optional
678 protocol enhancements that allow CIFS servers
679 to return accurate UID/GID information as well
680 as support symbolic links. If you use servers
681 such as Samba that support the CIFS Unix
682 extensions but do not want to use symbolic link
683 support and want to map the uid and gid fields
684 to values supplied at mount (rather than the
685 actual values), then set this to zero. (default 1)
686
687These experimental features and tracing can be enabled by changing flags in
688/proc/fs/cifs (after the cifs module has been installed or built into the
689kernel, e.g. insmod cifs). To enable a feature, set it to 1, e.g. to enable
690tracing to the kernel message log type:
691
692 echo 7 > /proc/fs/cifs/cifsFYI
693
694cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel
695logging of various informational messages. 2 enables logging of non-zero
696SMB return codes while 4 enables logging of requests that take longer
697than one second to complete (except for byte range lock requests).
698Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the
699source code (typically by setting it in the beginning of cifsglob.h),
700and setting it to seven enables all three. Finally, tracing
701the start of smb requests and responses can be enabled via:
702
703 echo 1 > /proc/fs/cifs/traceSMB
704
705Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
706if the kernel was configured with cifs statistics enabled. The statistics
707represent the number of successful (ie zero return code from the server)
708SMB responses to some of the more common commands (open, delete, mkdir etc.).
709Also recorded is the total bytes read and bytes written to the server for
710that share. Note that due to client caching effects this can be less than the
711number of bytes read and written by the application running on the client.
712The statistics for the number of total SMBs and oplock breaks are different in
713that they represent all those for that share, not just those for which the server
714returned success.
715
716Also note that "cat /proc/fs/cifs/DebugData" will display information about
717the active sessions and the shares that are mounted.
718
719Enabling Kerberos (extended security) works but requires version 1.2 or later
720of the helper program cifs.upcall to be present and to be configured in the
721/etc/request-key.conf file. The cifs.upcall helper program is from the Samba
722project (http://www.samba.org). NTLM, NTLMv2 and LANMAN support do not
723require this helper. Note that NTLMv2 security (which does not require the
724cifs.upcall helper program) may be sufficient for some use cases, instead
725of using Kerberos.
726
727DFS support allows transparent redirection to shares in an MS-DFS name space.
728In addition, DFS support for target shares which are specified as UNC
729names which begin with host names (rather than IP addresses) requires
730a user space helper (such as cifs.upcall) to be present in order to
731translate host names to ip addresses, and the user space helper must also
732be configured in the file /etc/request-key.conf. Samba, Windows servers and
733many NAS appliances support DFS as a way of constructing a global name
734space to ease network configuration and improve reliability.
735
736To use cifs Kerberos and DFS support, the Linux keyutils package should be
737installed and something like the following lines should be added to the
738/etc/request-key.conf file:
739
740create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
741create dns_resolver * * /usr/local/sbin/cifs.upcall %k
742
743CIFS kernel module parameters
744=============================
745These module parameters can be specified or modified either at module load
746time or at runtime by using the interface
747 /sys/module/cifs/parameters/<param>
748
749i.e. echo "value" > /sys/module/cifs/parameters/<param>
750
7511. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
752 [Y/y/1]. To disable use any of [N/n/0].
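
For example, oplocks could be disabled either at module load time or at
runtime via the same interface:

 modprobe cifs enable_oplocks=0
 echo 0 > /sys/module/cifs/parameters/enable_oplocks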
753
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
deleted file mode 100644
index 355abcdcda98..000000000000
--- a/fs/cifs/TODO
+++ /dev/null
@@ -1,129 +0,0 @@
1Version 1.53 May 20, 2008
2
3A Partial List of Missing Features
4==================================
5
6Contributions are welcome. There are plenty of opportunities
7for visible, important contributions to this module. Here
8is a partial list of the known problems and missing features:
9
10a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown
11so that these operations can be supported to Windows servers
12
13b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS
14SecurityDescriptors
15
16c) Better pam/winbind integration (e.g. to handle uid mapping
17better)
18
19d) Cleanup now unneeded SessSetup code in
20fs/cifs/connect.c and add back in NTLMSSP code if any servers
21need it
22
23e) fix NTLMv2 signing when there are two mounts with different users to
24the same server.
25
26f) Directory entry caching relies on a 1 second timer, rather than
27using FindNotify or equivalent. - (started)
28
29g) quota support (needs minor kernel change since quota calls
30do not make it to network filesystems or deviceless filesystems)
31
32h) investigate sync behavior (including syncpage) and check
33for proper behavior of intr/nointr
34
35i) improve support for very old servers (OS/2 and Win9x for example)
36Including support for changing the time remotely (utimes command).
37
38j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
39extra copy in/out of the socket buffers in some cases.
40
41k) Better optimize open (and pathbased setfilesize) to reduce the
42oplock breaks coming from windows srv. Piggyback identical file
43opens on top of each other by incrementing reference count rather
44than resending (helps reduce server resource utilization and avoid
45spurious oplock breaks).
46
47l) Improve performance of readpages by sending more than one read
48at a time when 8 pages or more are requested. In conjunction
49add support for async_cifs_readpages.
50
51m) Add support for storing symlink info to Windows servers
52in the Extended Attribute format their SFU clients would recognize.
53
54n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
55will autorefresh (partially complete by Asser). Needs minor kernel
56vfs change to support removing D_NOTIFY on a file.
57
58o) Add GUI tool to configure /proc/fs/cifs settings and for display of
59the CIFS statistics (started)
60
61p) implement support for security and trusted categories of xattrs
62(requires minor protocol extension) to enable better support for SELINUX
63
64q) Implement O_DIRECT flag on open (already supported on mount)
65
66r) Create UID mapping facility so server UIDs can be mapped on a per
67mount or a per server basis to client UIDs or nobody if no mapping
68exists. This is helpful when Unix extensions are negotiated to
69allow better permission checking when UIDs differ on the server
70and client. Add new protocol request to the CIFS protocol
71standard for asking the server for the corresponding name of a
72particular uid.
73
74s) Add support for CIFS Unix and also the newer POSIX extensions to the
75server side for Samba 4.
76
77t) To support OS/2 (LANMAN 1.2 and LANMAN 2.1 based SMB servers) we
78need to add the ability to set the time on the server (utimes command)
79
80u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
81
82v) mount check for unmatched uids
83
84w) Add support for new vfs entry point for fallocate
85
86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
87processes can proceed better in parallel (on the server)
88
89y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
90restriction of wsize max being 127K)
91
92KNOWN BUGS (updated April 24, 2007)
93====================================
94See http://bugzilla.samba.org - search on product "CifsVFS" for
95current bug list.
96
971) existing symbolic links (Windows reparse points) are recognized but
98can not be created remotely. They are implemented for Samba and those that
99support the CIFS Unix extensions, although earlier versions of Samba
100overly restrict the pathnames.
1012) follow_link and readdir code does not follow dfs junctions
102but recognizes them
1033) create of new files to FAT partitions on Windows servers can
104succeed but still return access denied (appears to be Windows
105server not cifs client problem) and has not been reproduced recently.
106NTFS partitions do not have this problem.
1074) Unix/POSIX capabilities are reset after reconnection, and affect
108a few fields in the tree connection but we do not know which
109superblocks to apply these changes to. We should probably walk
110the list of superblocks to set these. Also need to check the
111flags on the second mount to the same share, and see if we
112can do the same trick that NFS does to remount duplicate shares.
113
114Misc testing to do
115==================
1161) check out max path names and max path name components against various server
117types. Try nested symlinks (8 deep). Return max path name in stat -f information
118
1192) Modify file portion of ltp so it can run against a mounted network
120share and run it against cifs vfs in automated fashion.
121
1223) Additional performance testing and optimization using iozone and similar -
123there are some easy changes that can be done to parallelize sequential writes,
124and when signing is disabled to request larger read sizes (larger than
125negotiated size) and send larger write sizes to modern servers.
126
1274) More exhaustively test against less common servers. More testing
128against Windows 9x, Windows ME servers.
129
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index fe8d6276410a..d8eac3b6cefb 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -91,6 +91,8 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
91#endif /* CONFIG_CIFS_SMB2 */ 91#endif /* CONFIG_CIFS_SMB2 */
92#endif 92#endif
93 93
94wchar_t cifs_toupper(wchar_t in);
95
94/* 96/*
95 * UniStrcat: Concatenate the second string to the first 97 * UniStrcat: Concatenate the second string to the first
96 * 98 *
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 85ea98d139fc..a16b4e58bcc6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -255,6 +255,7 @@ cifs_alloc_inode(struct super_block *sb)
255 cifs_inode->server_eof = 0; 255 cifs_inode->server_eof = 0;
256 cifs_inode->uniqueid = 0; 256 cifs_inode->uniqueid = 0;
257 cifs_inode->createtime = 0; 257 cifs_inode->createtime = 0;
258 cifs_inode->epoch = 0;
258#ifdef CONFIG_CIFS_SMB2 259#ifdef CONFIG_CIFS_SMB2
259 get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); 260 get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE);
260#endif 261#endif
@@ -357,6 +358,18 @@ cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
357 seq_printf(s, "loose"); 358 seq_printf(s, "loose");
358} 359}
359 360
361static void
362cifs_show_nls(struct seq_file *s, struct nls_table *cur)
363{
364 struct nls_table *def;
365
366 /* Display iocharset= option if it's not default charset */
367 def = load_nls_default();
368 if (def != cur)
369 seq_printf(s, ",iocharset=%s", cur->charset);
370 unload_nls(def);
371}
372
360/* 373/*
361 * cifs_show_options() is for displaying mount options in /proc/mounts. 374 * cifs_show_options() is for displaying mount options in /proc/mounts.
362 * Not all settable options are displayed but most of the important 375 * Not all settable options are displayed but most of the important
@@ -418,6 +431,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
418 seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho", 431 seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho",
419 cifs_sb->mnt_file_mode, 432 cifs_sb->mnt_file_mode,
420 cifs_sb->mnt_dir_mode); 433 cifs_sb->mnt_dir_mode);
434
435 cifs_show_nls(s, cifs_sb->local_nls);
436
421 if (tcon->seal) 437 if (tcon->seal)
422 seq_printf(s, ",seal"); 438 seq_printf(s, ",seal");
423 if (tcon->nocase) 439 if (tcon->nocase)
@@ -718,7 +734,7 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
718 734
719 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 735 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
720 736
721 if (CIFS_I(inode)->clientCanCacheAll) 737 if (CIFS_CACHE_WRITE(CIFS_I(inode)))
722 return written; 738 return written;
723 739
724 rc = filemap_fdatawrite(inode->i_mapping); 740 rc = filemap_fdatawrite(inode->i_mapping);
@@ -743,7 +759,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
743 * We need to be sure that all dirty pages are written and the 759 * We need to be sure that all dirty pages are written and the
744 * server has the newest file length. 760 * server has the newest file length.
745 */ 761 */
746 if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && 762 if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
747 inode->i_mapping->nrpages != 0) { 763 inode->i_mapping->nrpages != 0) {
748 rc = filemap_fdatawait(inode->i_mapping); 764 rc = filemap_fdatawait(inode->i_mapping);
749 if (rc) { 765 if (rc) {
@@ -767,8 +783,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
767 783
768static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 784static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
769{ 785{
770 /* note that this is called by vfs setlease with i_lock held 786 /*
771 to protect *lease from going away */ 787 * Note that this is called by vfs setlease with i_lock held to
788 * protect *lease from going away.
789 */
772 struct inode *inode = file_inode(file); 790 struct inode *inode = file_inode(file);
773 struct cifsFileInfo *cfile = file->private_data; 791 struct cifsFileInfo *cfile = file->private_data;
774 792
@@ -776,20 +794,19 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
776 return -EINVAL; 794 return -EINVAL;
777 795
778 /* check if file is oplocked */ 796 /* check if file is oplocked */
779 if (((arg == F_RDLCK) && 797 if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
780 (CIFS_I(inode)->clientCanCacheRead)) || 798 ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
781 ((arg == F_WRLCK) &&
782 (CIFS_I(inode)->clientCanCacheAll)))
783 return generic_setlease(file, arg, lease); 799 return generic_setlease(file, arg, lease);
784 else if (tlink_tcon(cfile->tlink)->local_lease && 800 else if (tlink_tcon(cfile->tlink)->local_lease &&
785 !CIFS_I(inode)->clientCanCacheRead) 801 !CIFS_CACHE_READ(CIFS_I(inode)))
786 /* If the server claims to support oplock on this 802 /*
787 file, then we still need to check oplock even 803 * If the server claims to support oplock on this file, then we
788 if the local_lease mount option is set, but there 804 * still need to check oplock even if the local_lease mount
789 are servers which do not support oplock for which 805 * option is set, but there are servers which do not support
790 this mount option may be useful if the user 806 * oplock for which this mount option may be useful if the user
791 knows that the file won't be changed on the server 807 * knows that the file won't be changed on the server by anyone
792 by anyone else */ 808 * else.
809 */
793 return generic_setlease(file, arg, lease); 810 return generic_setlease(file, arg, lease);
794 else 811 else
795 return -EAGAIN; 812 return -EAGAIN;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index ea723a5e8226..6d0b07217ac9 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
132extern const struct export_operations cifs_export_ops; 132extern const struct export_operations cifs_export_ops;
133#endif /* CONFIG_CIFS_NFSD_EXPORT */ 133#endif /* CONFIG_CIFS_NFSD_EXPORT */
134 134
135#define CIFS_VERSION "2.01" 135#define CIFS_VERSION "2.02"
136#endif /* _CIFSFS_H */ 136#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 52ca861ed35e..52b6f6c26bfc 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -28,6 +28,7 @@
28#include "cifsacl.h" 28#include "cifsacl.h"
29#include <crypto/internal/hash.h> 29#include <crypto/internal/hash.h>
30#include <linux/scatterlist.h> 30#include <linux/scatterlist.h>
31#include <uapi/linux/cifs/cifs_mount.h>
31#ifdef CONFIG_CIFS_SMB2 32#ifdef CONFIG_CIFS_SMB2
32#include "smb2pdu.h" 33#include "smb2pdu.h"
33#endif 34#endif
@@ -41,12 +42,7 @@
41#define MAX_SES_INFO 2 42#define MAX_SES_INFO 2
42#define MAX_TCON_INFO 4 43#define MAX_TCON_INFO 4
43 44
44#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) 45#define MAX_TREE_SIZE (2 + CIFS_NI_MAXHOST + 1 + CIFS_MAX_SHARE_LEN + 1)
45#define MAX_SERVER_SIZE 15
46#define MAX_SHARE_SIZE 80
47#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */
48#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */
49#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
50 46
51#define CIFS_MIN_RCV_POOL 4 47#define CIFS_MIN_RCV_POOL 4
52 48
@@ -135,6 +131,7 @@ struct cifs_secmech {
135 131
136/* per smb session structure/fields */ 132/* per smb session structure/fields */
137struct ntlmssp_auth { 133struct ntlmssp_auth {
134 bool sesskey_per_smbsess; /* whether session key is per smb session */
138 __u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */ 135 __u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */
139 __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */ 136 __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
140 unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */ 137 unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
@@ -308,6 +305,9 @@ struct smb_version_operations {
308 int (*create_hardlink)(const unsigned int, struct cifs_tcon *, 305 int (*create_hardlink)(const unsigned int, struct cifs_tcon *,
309 const char *, const char *, 306 const char *, const char *,
310 struct cifs_sb_info *); 307 struct cifs_sb_info *);
308 /* query symlink target */
309 int (*query_symlink)(const unsigned int, struct cifs_tcon *,
310 const char *, char **, struct cifs_sb_info *);
311 /* open a file for non-posix mounts */ 311 /* open a file for non-posix mounts */
312 int (*open)(const unsigned int, struct cifs_open_parms *, 312 int (*open)(const unsigned int, struct cifs_open_parms *,
313 __u32 *, FILE_ALL_INFO *); 313 __u32 *, FILE_ALL_INFO *);
@@ -361,18 +361,24 @@ struct smb_version_operations {
361 /* push brlocks from the cache to the server */ 361 /* push brlocks from the cache to the server */
362 int (*push_mand_locks)(struct cifsFileInfo *); 362 int (*push_mand_locks)(struct cifsFileInfo *);
363 /* get lease key of the inode */ 363 /* get lease key of the inode */
364 void (*get_lease_key)(struct inode *, struct cifs_fid *fid); 364 void (*get_lease_key)(struct inode *, struct cifs_fid *);
365 /* set lease key of the inode */ 365 /* set lease key of the inode */
366 void (*set_lease_key)(struct inode *, struct cifs_fid *fid); 366 void (*set_lease_key)(struct inode *, struct cifs_fid *);
367 /* generate new lease key */ 367 /* generate new lease key */
368 void (*new_lease_key)(struct cifs_fid *fid); 368 void (*new_lease_key)(struct cifs_fid *);
369 /* The next two functions will need to be changed to per smb session */ 369 int (*generate_signingkey)(struct cifs_ses *);
370 void (*generate_signingkey)(struct TCP_Server_Info *server); 370 int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
371 int (*calc_signature)(struct smb_rqst *rqst, 371 int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *,
372 struct TCP_Server_Info *server); 372 struct cifs_sb_info *, unsigned int);
373 int (*query_mf_symlink)(const unsigned char *path, char *pbuf, 373 /* if we can do cache read operations */
374 unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, 374 bool (*is_read_op)(__u32);
375 unsigned int xid); 375 /* set oplock level for the inode */
376 void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int,
377 bool *);
378 /* create lease context buffer for CREATE request */
379 char * (*create_lease_buf)(u8 *, u8);
380 /* parse lease context buffer and return oplock/epoch info */
381 __u8 (*parse_lease_buf)(void *, unsigned int *);
376}; 382};
377 383
378struct smb_version_values { 384struct smb_version_values {
@@ -390,9 +396,9 @@ struct smb_version_values {
390 unsigned int cap_unix; 396 unsigned int cap_unix;
391 unsigned int cap_nt_find; 397 unsigned int cap_nt_find;
392 unsigned int cap_large_files; 398 unsigned int cap_large_files;
393 unsigned int oplock_read;
394 __u16 signing_enabled; 399 __u16 signing_enabled;
395 __u16 signing_required; 400 __u16 signing_required;
401 size_t create_lease_size;
396}; 402};
397 403
398#define HEADER_SIZE(server) (server->vals->header_size) 404#define HEADER_SIZE(server) (server->vals->header_size)
@@ -541,14 +547,10 @@ struct TCP_Server_Info {
541 unsigned int max_rw; /* maxRw specifies the maximum */ 547 unsigned int max_rw; /* maxRw specifies the maximum */
542 /* message size the server can send or receive for */ 548 /* message size the server can send or receive for */
543 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ 549 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
544 unsigned int max_vcs; /* maximum number of smb sessions, at least
545 those that can be specified uniquely with
546 vcnumbers */
547 unsigned int capabilities; /* selective disabling of caps by smb sess */ 550 unsigned int capabilities; /* selective disabling of caps by smb sess */
548 int timeAdj; /* Adjust for difference in server time zone in sec */ 551 int timeAdj; /* Adjust for difference in server time zone in sec */
549 __u64 CurrentMid; /* multiplex id - rotating counter */ 552 __u64 CurrentMid; /* multiplex id - rotating counter */
550 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ 553 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
551 char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* for signing, protected by srv_mutex */
@@ -710,7 +712,6 @@ struct cifs_ses {
 	enum statusEnum status;
 	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
-	__u16 vcnum;
 	char *serverOS;		/* name of operating system underlying server */
 	char *serverNOS;	/* name of network operating system of server */
 	char *serverDomain;	/* security realm of server */
@@ -731,6 +732,7 @@ struct cifs_ses {
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 #ifdef CONFIG_CIFS_SMB2
 	__u16 session_flags;
+	char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
 #endif /* CONFIG_CIFS_SMB2 */
 };
 
@@ -935,6 +937,8 @@ struct cifs_fid {
 	__u8 lease_key[SMB2_LEASE_KEY_SIZE];	/* lease key for smb2 */
 #endif
 	struct cifs_pending_open *pending_open;
+	unsigned int epoch;
+	bool purge_cache;
 };
 
 struct cifs_fid_locks {
@@ -1032,6 +1036,17 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
 struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
 
+#define CIFS_CACHE_READ_FLG	1
+#define CIFS_CACHE_HANDLE_FLG	2
+#define CIFS_CACHE_RH_FLG	(CIFS_CACHE_READ_FLG | CIFS_CACHE_HANDLE_FLG)
+#define CIFS_CACHE_WRITE_FLG	4
+#define CIFS_CACHE_RW_FLG	(CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)
+#define CIFS_CACHE_RHW_FLG	(CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG)
+
+#define CIFS_CACHE_READ(cinode) (cinode->oplock & CIFS_CACHE_READ_FLG)
+#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG)
+#define CIFS_CACHE_WRITE(cinode) (cinode->oplock & CIFS_CACHE_WRITE_FLG)
+
 /*
  * One of these for each file inode
  */
@@ -1043,8 +1058,8 @@ struct cifsInodeInfo {
 	/* BB add in lists for dirty pages i.e. write caching info for oplock */
 	struct list_head openFileList;
 	__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
-	bool clientCanCacheRead;	/* read oplock */
-	bool clientCanCacheAll;		/* read and writebehind oplock */
+	unsigned int oplock;		/* oplock/lease level we have */
+	unsigned int epoch;		/* used to track lease state changes */
 	bool delete_pending;		/* DELETE_ON_CLOSE is set */
 	bool invalid_mapping;		/* pagecache is invalid */
 	unsigned long time;		/* jiffies of last update of inode */
@@ -1253,6 +1268,7 @@ struct dfs_info3_param {
 #define CIFS_FATTR_DELETE_PENDING	0x2
 #define CIFS_FATTR_NEED_REVAL		0x4
 #define CIFS_FATTR_INO_COLLISION	0x8
+#define CIFS_FATTR_UNKNOWN_NLINK	0x10
 
 struct cifs_fattr {
 	u32		cf_flags;
@@ -1502,7 +1518,7 @@ extern mempool_t *cifs_mid_poolp;
 extern struct smb_version_operations smb1_operations;
 extern struct smb_version_values smb1_values;
 #define SMB20_VERSION_STRING "2.0"
-/*extern struct smb_version_operations smb20_operations; */ /* not needed yet */
+extern struct smb_version_operations smb20_operations;
 extern struct smb_version_values smb20_values;
 #define SMB21_VERSION_STRING "2.1"
 extern struct smb_version_operations smb21_operations;
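
The CIFS_CACHE_* values added above collapse the old pair of booleans into one bitmask, so a lease level is a single field and compound levels are unions of the read, handle, and write bits. A minimal userspace sketch of the same bit arithmetic (fake_cinode is an illustrative stand-in, not the kernel's cifsInodeInfo):

    #include <assert.h>
    #include <stdio.h>

    /* mirror of the flag values added to cifsglob.h above */
    #define CIFS_CACHE_READ_FLG   1
    #define CIFS_CACHE_HANDLE_FLG 2
    #define CIFS_CACHE_WRITE_FLG  4
    #define CIFS_CACHE_RW_FLG     (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)
    #define CIFS_CACHE_RHW_FLG    (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG)

    struct fake_cinode { unsigned int oplock; }; /* stand-in struct */

    #define CIFS_CACHE_READ(c)  ((c)->oplock & CIFS_CACHE_READ_FLG)
    #define CIFS_CACHE_WRITE(c) ((c)->oplock & CIFS_CACHE_WRITE_FLG)

    int main(void)
    {
        struct fake_cinode c = { .oplock = CIFS_CACHE_RHW_FLG };

        /* a batch (RHW) lease implies both read and write caching */
        assert(CIFS_CACHE_READ(&c) && CIFS_CACHE_WRITE(&c));

        /* dropping to a read-caching lease clears only the write bit */
        c.oplock = CIFS_CACHE_READ_FLG;
        assert(CIFS_CACHE_READ(&c) && !CIFS_CACHE_WRITE(&c));

        puts("lease-bit checks passed");
        return 0;
    }
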
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 11ca24a8e054..a630475e421c 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1495,11 +1495,12 @@ struct reparse_data {
 	__u32	ReparseTag;
 	__u16	ReparseDataLength;
 	__u16	Reserved;
-	__u16	AltNameOffset;
-	__u16	AltNameLen;
-	__u16	TargetNameOffset;
-	__u16	TargetNameLen;
-	char	LinkNamesBuf[1];
+	__u16	SubstituteNameOffset;
+	__u16	SubstituteNameLength;
+	__u16	PrintNameOffset;
+	__u16	PrintNameLength;
+	__u32	Flags;
+	char	PathBuffer[0];
 } __attribute__((packed));
 
 struct cifs_quota_data {
@@ -2651,26 +2652,7 @@ typedef struct file_xattr_info {
 } __attribute__((packed)) FILE_XATTR_INFO;	/* extended attribute info
						   level 0x205 */
 
-
-/* flags for chattr command */
-#define EXT_SECURE_DELETE		0x00000001 /* EXT3_SECRM_FL */
-#define EXT_ENABLE_UNDELETE		0x00000002 /* EXT3_UNRM_FL */
-/* Reserved for compress file 0x4 */
-#define EXT_SYNCHRONOUS			0x00000008 /* EXT3_SYNC_FL */
-#define EXT_IMMUTABLE_FL		0x00000010 /* EXT3_IMMUTABLE_FL */
-#define EXT_OPEN_APPEND_ONLY		0x00000020 /* EXT3_APPEND_FL */
-#define EXT_DO_NOT_BACKUP		0x00000040 /* EXT3_NODUMP_FL */
-#define EXT_NO_UPDATE_ATIME		0x00000080 /* EXT3_NOATIME_FL */
-/* 0x100 through 0x800 reserved for compression flags and are GET-ONLY */
-#define EXT_HASH_TREE_INDEXED_DIR	0x00001000 /* GET-ONLY EXT3_INDEX_FL */
-/* 0x2000 reserved for IMAGIC_FL */
-#define EXT_JOURNAL_THIS_FILE	0x00004000 /* GET-ONLY EXT3_JOURNAL_DATA_FL */
-/* 0x8000 reserved for EXT3_NOTAIL_FL */
-#define EXT_SYNCHRONOUS_DIR		0x00010000 /* EXT3_DIRSYNC_FL */
-#define EXT_TOPDIR			0x00020000 /* EXT3_TOPDIR_FL */
-
-#define EXT_SET_MASK			0x000300FF
-#define EXT_GET_MASK			0x0003DFFF
+/* flags for lsattr and chflags commands removed; they are in uapi/linux/fs.h */
 
 typedef struct file_chattr_info {
 	__le64	mask; /* list of all possible attribute bits */
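
The renamed reparse_data fields follow the symlink reparse payload layout, where the substitute and print names are byte ranges inside the trailing PathBuffer located by the offset/length pairs above. A hedged userspace sketch of indexing into such a buffer (the sample bytes and values are made up):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* userspace mirror of the revised on-the-wire layout */
    struct reparse_data {
        uint32_t ReparseTag;
        uint32_t ReparseDataLength_and_Reserved; /* 2x u16, collapsed here */
        uint16_t SubstituteNameOffset;
        uint16_t SubstituteNameLength;
        uint16_t PrintNameOffset;
        uint16_t PrintNameLength;
        uint32_t Flags;
        char     PathBuffer[];  /* names live here, at the offsets above */
    } __attribute__((packed));

    int main(void)
    {
        /* hand-built example: one ASCII name, "target", stored twice */
        unsigned char buf[sizeof(struct reparse_data) + 12];
        struct reparse_data *rd = (struct reparse_data *)buf;

        memset(buf, 0, sizeof(buf));
        rd->SubstituteNameOffset = 0;
        rd->SubstituteNameLength = 6;
        rd->PrintNameOffset = 6;
        rd->PrintNameLength = 6;
        memcpy(rd->PathBuffer, "targettarget", 12);

        /* the offsets are relative to PathBuffer, exactly as in the hunk */
        printf("substitute name: %.*s\n", rd->SubstituteNameLength,
               rd->PathBuffer + rd->SubstituteNameOffset);
        return 0;
    }
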
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b29a012bed33..b5ec2a268f56 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -357,13 +357,9 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
 			struct cifs_tcon *tcon,
 			const unsigned char *searchName, char **syminfo,
 			const struct nls_table *nls_codepage);
-#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
-extern int CIFSSMBQueryReparseLinkInfo(const unsigned int xid,
-			struct cifs_tcon *tcon,
-			const unsigned char *searchName,
-			char *symlinkinfo, const int buflen, __u16 fid,
-			const struct nls_table *nls_codepage);
-#endif /* temporarily unused until cifs_symlink fixed */
+extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
+			       __u16 fid, char **symlinkinfo,
+			       const struct nls_table *nls_codepage);
 extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
 			const char *fileName, const int disposition,
 			const int access_flags, const int omode,
@@ -435,7 +431,7 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
 extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
 extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
 extern int calc_seckey(struct cifs_ses *);
-extern void generate_smb3signingkey(struct TCP_Server_Info *);
+extern int generate_smb3signingkey(struct cifs_ses *);
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern int calc_lanman_hash(const char *password, const char *cryptkey,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a89c4cb4e6cf..4baf35949b51 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -463,7 +463,6 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
 			cifs_max_pending);
 	set_credits(server, server->maxReq);
 	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
-	server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
 	/* even though we do not use raw we might as well set this
 	accurately, in case we ever find a need for it */
 	if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -3067,7 +3066,6 @@ querySymLinkRetry:
 	return rc;
 }
 
-#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
 /*
  * Recent Windows versions now create symlinks more frequently
  * and they use the "reparse point" mechanism below. We can of course
@@ -3079,18 +3077,22 @@ querySymLinkRetry:
  * it is not compiled in by default until callers fixed up and more tested.
  */
 int
-CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
-			const unsigned char *searchName,
-			char *symlinkinfo, const int buflen, __u16 fid,
-			const struct nls_table *nls_codepage)
+CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
+		    __u16 fid, char **symlinkinfo,
+		    const struct nls_table *nls_codepage)
 {
 	int rc = 0;
 	int bytes_returned;
 	struct smb_com_transaction_ioctl_req *pSMB;
 	struct smb_com_transaction_ioctl_rsp *pSMBr;
+	bool is_unicode;
+	unsigned int sub_len;
+	char *sub_start;
+	struct reparse_data *reparse_buf;
+	__u32 data_offset, data_count;
+	char *end_of_smb;
 
-	cifs_dbg(FYI, "In Windows reparse style QueryLink for path %s\n",
-		 searchName);
+	cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -3119,66 +3121,55 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
 		cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc);
-	} else {		/* decode response */
-		__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
-		__u32 data_count = le32_to_cpu(pSMBr->DataCount);
-		if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
-			/* BB also check enough total bytes returned */
-			rc = -EIO;	/* bad smb */
-			goto qreparse_out;
-		}
-		if (data_count && (data_count < 2048)) {
-			char *end_of_smb = 2 /* sizeof byte count */ +
-				get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
-
-			struct reparse_data *reparse_buf =
-						(struct reparse_data *)
-						((char *)&pSMBr->hdr.Protocol
-								 + data_offset);
-			if ((char *)reparse_buf >= end_of_smb) {
-				rc = -EIO;
-				goto qreparse_out;
-			}
-			if ((reparse_buf->LinkNamesBuf +
-				reparse_buf->TargetNameOffset +
-				reparse_buf->TargetNameLen) > end_of_smb) {
-				cifs_dbg(FYI, "reparse buf beyond SMB\n");
-				rc = -EIO;
-				goto qreparse_out;
-			}
-
-			if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
-				cifs_from_ucs2(symlinkinfo, (__le16 *)
-						(reparse_buf->LinkNamesBuf +
-						reparse_buf->TargetNameOffset),
-						buflen,
-						reparse_buf->TargetNameLen,
-						nls_codepage, 0);
-			} else { /* ASCII names */
-				strncpy(symlinkinfo,
-					reparse_buf->LinkNamesBuf +
-					reparse_buf->TargetNameOffset,
-					min_t(const int, buflen,
-					   reparse_buf->TargetNameLen));
-			}
-		} else {
-			rc = -EIO;
-			cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
-		}
-		symlinkinfo[buflen] = 0; /* just in case so the caller
-					does not go off the end of the buffer */
-		cifs_dbg(FYI, "readlink result - %s\n", symlinkinfo);
+		goto qreparse_out;
 	}
 
+	data_offset = le32_to_cpu(pSMBr->DataOffset);
+	data_count = le32_to_cpu(pSMBr->DataCount);
+	if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
+		/* BB also check enough total bytes returned */
+		rc = -EIO;	/* bad smb */
+		goto qreparse_out;
+	}
+	if (!data_count || (data_count > 2048)) {
+		rc = -EIO;
+		cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
+		goto qreparse_out;
+	}
+	end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
+	reparse_buf = (struct reparse_data *)
+				((char *)&pSMBr->hdr.Protocol + data_offset);
+	if ((char *)reparse_buf >= end_of_smb) {
+		rc = -EIO;
+		goto qreparse_out;
+	}
+	if ((reparse_buf->PathBuffer + reparse_buf->PrintNameOffset +
+				reparse_buf->PrintNameLength) > end_of_smb) {
+		cifs_dbg(FYI, "reparse buf beyond SMB\n");
+		rc = -EIO;
+		goto qreparse_out;
+	}
+	sub_start = reparse_buf->SubstituteNameOffset + reparse_buf->PathBuffer;
+	sub_len = reparse_buf->SubstituteNameLength;
+	if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+		is_unicode = true;
+	else
+		is_unicode = false;
 
+	/* BB FIXME investigate remapping reserved chars here */
+	*symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode,
+					       nls_codepage);
+	if (!*symlinkinfo)
+		rc = -ENOMEM;
 qreparse_out:
 	cifs_buf_release(pSMB);
 
-	/* Note: On -EAGAIN error only caller can retry on handle based calls
-		since file handle passed in no longer valid */
-
+	/*
+	 * Note: On -EAGAIN error only caller can retry on handle based calls
+	 * since file handle passed in no longer valid.
+	 */
 	return rc;
 }
-#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */
 
 #ifdef CONFIG_CIFS_POSIX
 
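
The rewritten decode path above checks every offset and length against the end of the received SMB before dereferencing the reparse payload. A generic sketch of that validate-before-dereference pattern for untrusted replies (names are illustrative, not the kernel's):

    #include <stdio.h>
    #include <stddef.h>

    /*
     * Validate that an (offset, length) pair taken from an untrusted reply
     * stays inside the received buffer before handing back a pointer.
     */
    static const char *pull_blob(const char *buf, size_t buflen,
                                 size_t offset, size_t len)
    {
        /* reject overflow as well as plain out-of-range values */
        if (offset > buflen || len > buflen - offset)
            return NULL;        /* would run past the reply */
        return buf + offset;
    }

    int main(void)
    {
        char reply[16] = "0123456789abcdef";

        printf("ok:  %p\n", (void *)pull_blob(reply, sizeof(reply), 4, 7));
        printf("bad: %p\n", (void *)pull_blob(reply, sizeof(reply), 12, 8));
        return 0;
    }
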
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d67c550c4980..a279ffc0bc29 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -379,6 +379,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		try_to_freeze();
 
 		/* we should try only the port we connected to before */
+		mutex_lock(&server->srv_mutex);
 		rc = generic_ip_connect(server);
 		if (rc) {
 			cifs_dbg(FYI, "reconnect error %d\n", rc);
@@ -390,6 +391,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 			server->tcpStatus = CifsNeedNegotiate;
 			spin_unlock(&GlobalMid_Lock);
 		}
+		mutex_unlock(&server->srv_mutex);
 	} while (server->tcpStatus == CifsNeedReconnect);
 
 	return rc;
@@ -1114,7 +1116,7 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
 		break;
 #ifdef CONFIG_CIFS_SMB2
 	case Smb_20:
-		vol->ops = &smb21_operations; /* currently identical with 2.1 */
+		vol->ops = &smb20_operations;
 		vol->vals = &smb20_values;
 		break;
 	case Smb_21:
@@ -1575,8 +1577,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (strnlen(string, MAX_USERNAME_SIZE) >
-							MAX_USERNAME_SIZE) {
+			if (strnlen(string, CIFS_MAX_USERNAME_LEN) >
+							CIFS_MAX_USERNAME_LEN) {
 				printk(KERN_WARNING "CIFS: username too long\n");
 				goto cifs_parse_mount_err;
 			}
@@ -2221,13 +2223,13 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
 		/* anything else takes username/password */
 		if (strncmp(ses->user_name,
 			    vol->username ? vol->username : "",
-			    MAX_USERNAME_SIZE))
+			    CIFS_MAX_USERNAME_LEN))
 			return 0;
 		if (strlen(vol->username) != 0 &&
 		    ses->password != NULL &&
 		    strncmp(ses->password,
 			    vol->password ? vol->password : "",
-			    MAX_PASSWORD_SIZE))
+			    CIFS_MAX_PASSWORD_LEN))
 			return 0;
 	}
 	return 1;
@@ -2352,7 +2354,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	}
 
 	len = delim - payload;
-	if (len > MAX_USERNAME_SIZE || len <= 0) {
+	if (len > CIFS_MAX_USERNAME_LEN || len <= 0) {
 		cifs_dbg(FYI, "Bad value from username search (len=%zd)\n",
 			 len);
 		rc = -EINVAL;
@@ -2369,7 +2371,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username);
 
 	len = key->datalen - (len + 1);
-	if (len > MAX_PASSWORD_SIZE || len <= 0) {
+	if (len > CIFS_MAX_PASSWORD_LEN || len <= 0) {
 		cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len);
 		rc = -EINVAL;
 		kfree(vol->username);
@@ -3826,33 +3828,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
 	if (server->ops->sess_setup)
 		rc = server->ops->sess_setup(xid, ses, nls_info);
 
-	if (rc) {
+	if (rc)
 		cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc);
-	} else {
-		mutex_lock(&server->srv_mutex);
-		if (!server->session_estab) {
-			server->session_key.response = ses->auth_key.response;
-			server->session_key.len = ses->auth_key.len;
-			server->sequence_number = 0x2;
-			server->session_estab = true;
-			ses->auth_key.response = NULL;
-			if (server->ops->generate_signingkey)
-				server->ops->generate_signingkey(server);
-		}
-		mutex_unlock(&server->srv_mutex);
-
-		cifs_dbg(FYI, "CIFS Session Established successfully\n");
-		spin_lock(&GlobalMid_Lock);
-		ses->status = CifsGood;
-		ses->need_reconnect = false;
-		spin_unlock(&GlobalMid_Lock);
-	}
-
-	kfree(ses->auth_key.response);
-	ses->auth_key.response = NULL;
-	ses->auth_key.len = 0;
-	kfree(ses->ntlmssp);
-	ses->ntlmssp = NULL;
 
 	return rc;
 }
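
Taking srv_mutex across generic_ip_connect() means the socket cannot be torn down and re-established while another task holding the same mutex is mid-send. A rough pthread sketch of the idea, under the assumption that senders and the reconnect path share one lock:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t srv_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int sock_generation;     /* stands in for the socket state */

    /* reconnect path: swap the socket while no request is mid-send */
    static void *reconnect(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&srv_mutex);
        sock_generation++;          /* tear down + re-establish socket */
        pthread_mutex_unlock(&srv_mutex);
        return NULL;
    }

    /* send path: must see a stable socket for the whole send */
    static void *send_req(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&srv_mutex);
        printf("sending on socket generation %d\n", sock_generation);
        pthread_mutex_unlock(&srv_mutex);
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, reconnect, NULL);
        pthread_create(&b, NULL, send_req, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
    }
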
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index d62ce0d48141..5384c2a640ca 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -32,6 +32,7 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
 
 static void
 renew_parental_timestamps(struct dentry *direntry)
@@ -499,6 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 		if (server->ops->close)
 			server->ops->close(xid, tcon, &fid);
 		cifs_del_pending_open(&open);
+		fput(file);
 		rc = -ENOMEM;
 	}
 
@@ -834,12 +836,17 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
 {
 	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
 	unsigned long hash;
-	int i;
+	wchar_t c;
+	int i, charlen;
 
 	hash = init_name_hash();
-	for (i = 0; i < q->len; i++)
-		hash = partial_name_hash(nls_tolower(codepage, q->name[i]),
-					 hash);
+	for (i = 0; i < q->len; i += charlen) {
+		charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+		/* error out if we can't convert the character */
+		if (unlikely(charlen < 0))
+			return charlen;
+		hash = partial_name_hash(cifs_toupper(c), hash);
+	}
 	q->hash = end_name_hash(hash);
 
 	return 0;
@@ -849,11 +856,47 @@ static int cifs_ci_compare(const struct dentry *parent, const struct dentry *den
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
+	wchar_t c1, c2;
+	int i, l1, l2;
 
-	if ((name->len == len) &&
-	    (nls_strnicmp(codepage, name->name, str, len) == 0))
-		return 0;
-	return 1;
+	/*
+	 * We make the assumption here that uppercase characters in the local
+	 * codepage are always the same length as their lowercase counterparts.
+	 *
+	 * If that's ever not the case, then this will fail to match it.
+	 */
+	if (name->len != len)
+		return 1;
+
+	for (i = 0; i < len; i += l1) {
+		/* Convert characters in both strings to UTF-16. */
+		l1 = codepage->char2uni(&str[i], len - i, &c1);
+		l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+
+		/*
+		 * If we can't convert either character, just declare it to
+		 * be 1 byte long and compare the original byte.
+		 */
+		if (unlikely(l1 < 0 && l2 < 0)) {
+			if (str[i] != name->name[i])
+				return 1;
+			l1 = 1;
+			continue;
+		}
+
+		/*
+		 * Here, we again ass|u|me that upper/lowercase versions of
+		 * a character are the same length in the local NLS.
+		 */
+		if (l1 != l2)
+			return 1;
+
+		/* Now compare uppercase versions of these characters */
+		if (cifs_toupper(c1) != cifs_toupper(c2))
+			return 1;
+	}
+
+	return 0;
 }
 
 const struct dentry_operations cifs_ci_dentry_ops = {
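
The new hash and compare callbacks walk names one NLS character at a time via char2uni() and uppercase the decoded codepoint, instead of case-folding byte by byte. A loose userspace analogue using mbrtowc() and towupper() in place of the kernel helpers (like the kernel hook, it assumes the caller already verified the two lengths are equal):

    #include <locale.h>
    #include <stdio.h>
    #include <string.h>
    #include <wchar.h>
    #include <wctype.h>

    /* returns 0 on a case-insensitive match, 1 otherwise */
    static int ci_compare(const char *a, const char *b, size_t len)
    {
        mbstate_t sa, sb;
        wchar_t ca, cb;
        size_t i, la, lb;

        memset(&sa, 0, sizeof(sa));
        memset(&sb, 0, sizeof(sb));
        for (i = 0; i < len; i += la) {
            la = mbrtowc(&ca, a + i, len - i, &sa);
            lb = mbrtowc(&cb, b + i, len - i, &sb);
            /* undecodable or differently-sized sequences: no match */
            if (la == (size_t)-1 || la == (size_t)-2 || la != lb)
                return 1;
            if (la == 0)        /* embedded NUL: stop */
                break;
            if (towupper(ca) != towupper(cb))
                return 1;
        }
        return 0;
    }

    int main(void)
    {
        setlocale(LC_ALL, "");  /* assumes a multibyte-capable locale */
        printf("%d\n", ci_compare("ReadMe.TXT", "readme.txt", 10)); /* 0 */
        printf("%d\n", ci_compare("abc", "abd", 3));                /* 1 */
        return 0;
    }
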
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7e36ae34e947..7ddddf2e2504 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -313,8 +313,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 	 * If the server returned a read oplock and we have mandatory brlocks,
 	 * set oplock level to None.
	 */
-	if (oplock == server->vals->oplock_read &&
-						cifs_has_mand_locks(cinode)) {
+	if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) {
 		cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n");
 		oplock = 0;
 	}
@@ -324,6 +323,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 		oplock = fid->pending_open->oplock;
 	list_del(&fid->pending_open->olist);
 
+	fid->purge_cache = false;
 	server->ops->set_fid(cfile, fid, oplock);
 
 	list_add(&cfile->tlist, &tcon->openFileList);
@@ -334,6 +334,9 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 		list_add_tail(&cfile->flist, &cinode->openFileList);
 	spin_unlock(&cifs_file_list_lock);
 
+	if (fid->purge_cache)
+		cifs_invalidate_mapping(inode);
+
 	file->private_data = cfile;
 	return cfile;
 }
@@ -1524,12 +1527,12 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
 		 * read won't conflict with non-overlapted locks due to
 		 * pagereading.
		 */
-		if (!CIFS_I(inode)->clientCanCacheAll &&
-					CIFS_I(inode)->clientCanCacheRead) {
+		if (!CIFS_CACHE_WRITE(CIFS_I(inode)) &&
+					CIFS_CACHE_READ(CIFS_I(inode))) {
 			cifs_invalidate_mapping(inode);
 			cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n",
 				 inode);
-			CIFS_I(inode)->clientCanCacheRead = false;
+			CIFS_I(inode)->oplock = 0;
 		}
 
 		rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
@@ -2213,7 +2216,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
 	cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
 		 file->f_path.dentry->d_name.name, datasync);
 
-	if (!CIFS_I(inode)->clientCanCacheRead) {
+	if (!CIFS_CACHE_READ(CIFS_I(inode))) {
 		rc = cifs_invalidate_mapping(inode);
 		if (rc) {
 			cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc);
@@ -2553,7 +2556,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
 		mutex_unlock(&inode->i_mutex);
 	}
 
-	if (rc > 0 || rc == -EIOCBQUEUED) {
+	if (rc > 0) {
 		ssize_t err;
 
 		err = generic_write_sync(file, pos, rc);
@@ -2577,7 +2580,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	ssize_t written;
 
-	if (cinode->clientCanCacheAll) {
+	if (CIFS_CACHE_WRITE(cinode)) {
 		if (cap_unix(tcon->ses) &&
 		(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
 		    && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
@@ -2591,7 +2594,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 	 * these pages but not on the region from pos to ppos+len-1.
	 */
 	written = cifs_user_writev(iocb, iov, nr_segs, pos);
-	if (written > 0 && cinode->clientCanCacheRead) {
+	if (written > 0 && CIFS_CACHE_READ(cinode)) {
 		/*
 		 * Windows 7 server can delay breaking level2 oplock if a write
 		 * request comes - break it on the client to prevent reading
@@ -2600,7 +2603,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 		cifs_invalidate_mapping(inode);
 		cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
 			 inode);
-		cinode->clientCanCacheRead = false;
+		cinode->oplock = 0;
 	}
 	return written;
 }
@@ -2957,7 +2960,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
 	 * on pages affected by this read but not on the region from pos to
 	 * pos+len-1.
	 */
-	if (!cinode->clientCanCacheRead)
+	if (!CIFS_CACHE_READ(cinode))
 		return cifs_user_readv(iocb, iov, nr_segs, pos);
 
 	if (cap_unix(tcon->ses) &&
@@ -3093,7 +3096,7 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
 
 	xid = get_xid();
 
-	if (!CIFS_I(inode)->clientCanCacheRead) {
+	if (!CIFS_CACHE_READ(CIFS_I(inode))) {
 		rc = cifs_invalidate_mapping(inode);
 		if (rc)
 			return rc;
@@ -3251,6 +3254,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	/*
 	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
 	 * immediately if the cookie is negative
+	 *
+	 * After this point, every page in the list might have PG_fscache set,
+	 * so we will need to clean that up off of every page we don't use.
	 */
 	rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
 					 &num_pages);
@@ -3373,9 +3379,17 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		kref_put(&rdata->refcount, cifs_readdata_release);
 	}
 
+	/* Any pages that have been shown to fscache but didn't get added to
+	 * the pagecache must be uncached before they get returned to the
+	 * allocator.
+	 */
+	cifs_fscache_readpages_cancel(mapping->host, page_list);
 	return rc;
 }
 
+/*
+ * cifs_readpage_worker must be called with the page pinned
+ */
 static int cifs_readpage_worker(struct file *file, struct page *page,
 	loff_t *poffset)
 {
@@ -3387,7 +3401,6 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	if (rc == 0)
 		goto read_complete;
 
-	page_cache_get(page);
 	read_data = kmap(page);
 	/* for reads over a certain size could initiate async read ahead */
 
@@ -3414,7 +3427,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 
 io_error:
 	kunmap(page);
-	page_cache_release(page);
+	unlock_page(page);
 
 read_complete:
 	return rc;
@@ -3439,8 +3452,6 @@ static int cifs_readpage(struct file *file, struct page *page)
 
 	rc = cifs_readpage_worker(file, page, &offset);
 
-	unlock_page(page);
-
 	free_xid(xid);
 	return rc;
 }
@@ -3494,6 +3505,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int oncethru = 0;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
 	loff_t page_start = pos & PAGE_MASK;
@@ -3503,6 +3515,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 
 	cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
 
+start:
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		rc = -ENOMEM;
@@ -3526,7 +3539,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 	 * is, when the page lies beyond the EOF, or straddles the EOF
 	 * and the write will cover all of the existing data.
	 */
-	if (CIFS_I(mapping->host)->clientCanCacheRead) {
+	if (CIFS_CACHE_READ(CIFS_I(mapping->host))) {
 		i_size = i_size_read(mapping->host);
 		if (page_start >= i_size ||
 		    (offset == 0 && (pos + len) >= i_size)) {
@@ -3544,13 +3557,16 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 		}
 	}
 
-	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY && !oncethru) {
 		/*
 		 * might as well read a page, it is fast enough. If we get
 		 * an error, we don't need to return it. cifs_write_end will
 		 * do a sync write instead since PG_uptodate isn't set.
		 */
 		cifs_readpage_worker(file, page, &page_start);
+		page_cache_release(page);
+		oncethru = 1;
+		goto start;
 	} else {
 		/* we could try using another file handle if there is one -
 		   but how would we lock it to prevent close of that handle
@@ -3609,20 +3625,20 @@ void cifs_oplock_break(struct work_struct *work)
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	int rc = 0;
 
-	if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead &&
+	if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
 	    cifs_has_mand_locks(cinode)) {
 		cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
 			 inode);
-		cinode->clientCanCacheRead = false;
+		cinode->oplock = 0;
 	}
 
 	if (inode && S_ISREG(inode->i_mode)) {
-		if (cinode->clientCanCacheRead)
+		if (CIFS_CACHE_READ(cinode))
 			break_lease(inode, O_RDONLY);
 		else
 			break_lease(inode, O_WRONLY);
 		rc = filemap_fdatawrite(inode->i_mapping);
-		if (cinode->clientCanCacheRead == 0) {
+		if (!CIFS_CACHE_READ(cinode)) {
 			rc = filemap_fdatawait(inode->i_mapping);
 			mapping_set_error(inode->i_mapping, rc);
 			cifs_invalidate_mapping(inode);
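
cifs_write_begin() now primes a not-uptodate page with a read, releases it, and jumps back to grab it again, so the second pass re-locks the page and the priming read happens at most once. A schematic sketch of that grab/prime/retry shape with stub page operations (none of these helpers are the real pagecache API):

    #include <stdio.h>
    #include <stdbool.h>

    struct page { bool uptodate; };

    static struct page the_page;    /* pretend pagecache slot */

    static struct page *grab_page(void) { return &the_page; }
    static void prime_page(struct page *p) { p->uptodate = true; }
    static void release_page(struct page *p) { (void)p; }

    static struct page *write_begin(void)
    {
        int oncethru = 0;
        struct page *page;

    start:
        page = grab_page();             /* returns the page "locked" */
        if (!page->uptodate && !oncethru) {
            /*
             * Prime the page with a read, then drop it and retry so
             * the second pass takes the already-uptodate fast path.
             */
            prime_page(page);
            release_page(page);
            oncethru = 1;
            goto start;
        }
        return page;                    /* ready to copy data into */
    }

    int main(void)
    {
        struct page *p = write_begin();
        printf("page uptodate: %d\n", p->uptodate);
        return 0;
    }
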
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 2f4bc5a58054..b3258f35e88a 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -223,6 +223,13 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
 		fscache_uncache_page(CIFS_I(inode)->fscache, page);
 }
 
+void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages)
+{
+	cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n",
+		 __func__, CIFS_I(inode)->fscache, inode);
+	fscache_readpages_cancel(CIFS_I(inode)->fscache, pages);
+}
+
 void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 63539323e0b9..24794b6cd8ec 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -54,6 +54,7 @@ extern int __cifs_readpages_from_fscache(struct inode *,
 					 struct address_space *,
 					 struct list_head *,
 					 unsigned *);
+extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *);
 
 extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
 
@@ -91,6 +92,13 @@ static inline void cifs_readpage_to_fscache(struct inode *inode,
 		__cifs_readpage_to_fscache(inode, page);
 }
 
+static inline void cifs_fscache_readpages_cancel(struct inode *inode,
+						 struct list_head *pages)
+{
+	if (CIFS_I(inode)->fscache)
+		return __cifs_fscache_readpages_cancel(inode, pages);
+}
+
 #else /* CONFIG_CIFS_FSCACHE */
 static inline int cifs_fscache_register(void) { return 0; }
 static inline void cifs_fscache_unregister(void) {}
@@ -131,6 +139,11 @@ static inline int cifs_readpages_from_fscache(struct inode *inode,
 static inline void cifs_readpage_to_fscache(struct inode *inode,
 			struct page *page) {}
 
+static inline void cifs_fscache_readpages_cancel(struct inode *inode,
+						 struct list_head *pages)
+{
+}
+
 #endif /* CONFIG_CIFS_FSCACHE */
 
 #endif /* _CIFS_FSCACHE_H */
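
fscache.h pairs each real helper with an empty inline stub under the #else branch, so call sites never need their own CONFIG_CIFS_FSCACHE ifdefs. A condensed sketch of the idiom:

    #include <stdio.h>

    /* flip this to 0 to get the no-op stub, as CONFIG_CIFS_FSCACHE does */
    #define CONFIG_FSCACHE 1

    #if CONFIG_FSCACHE
    static void cache_cancel(const char *who)
    {
        printf("cancelling cached reads for %s\n", who);
    }
    #else
    /* stub keeps every call site ifdef-free when caching is compiled out */
    static inline void cache_cancel(const char *who) { (void)who; }
    #endif

    int main(void)
    {
        cache_cancel("inode 42");   /* same call either way */
        return 0;
    }
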
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 449b6cf09b09..867b7cdc794a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -101,7 +101,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
 	}
 
 	/* don't bother with revalidation if we have an oplock */
-	if (cifs_i->clientCanCacheRead) {
+	if (CIFS_CACHE_READ(cifs_i)) {
 		cifs_dbg(FYI, "%s: inode %llu is oplocked\n",
 			 __func__, cifs_i->uniqueid);
 		return;
@@ -120,6 +120,33 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
 	cifs_i->invalid_mapping = true;
 }
 
+/*
+ * copy nlink to the inode, unless it wasn't provided. Provide
+ * sane values if we don't have an existing one and none was provided
+ */
+static void
+cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
+{
+	/*
+	 * if we're in a situation where we can't trust what we
+	 * got from the server (readdir, some non-unix cases)
+	 * fake reasonable values
+	 */
+	if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) {
+		/* only provide fake values on a new inode */
+		if (inode->i_state & I_NEW) {
+			if (fattr->cf_cifsattrs & ATTR_DIRECTORY)
+				set_nlink(inode, 2);
+			else
+				set_nlink(inode, 1);
+		}
+		return;
+	}
+
+	/* we trust the server, so update it */
+	set_nlink(inode, fattr->cf_nlink);
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -134,7 +161,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
 	inode->i_rdev = fattr->cf_rdev;
-	set_nlink(inode, fattr->cf_nlink);
+	cifs_nlink_fattr_to_inode(inode, fattr);
 	inode->i_uid = fattr->cf_uid;
 	inode->i_gid = fattr->cf_gid;
 
@@ -541,6 +568,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
 	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 
+	fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
 		fattr->cf_dtype = DT_DIR;
@@ -548,7 +576,12 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		 * Server can return wrong NumberOfLinks value for directories
 		 * when Unix extensions are disabled - fake it.
		 */
-		fattr->cf_nlink = 2;
+		if (!tcon->unix_ext)
+			fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+	} else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
+		fattr->cf_mode = S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
+		fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
@@ -557,11 +590,15 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		if (fattr->cf_cifsattrs & ATTR_READONLY)
 			fattr->cf_mode &= ~(S_IWUGO);
 
-		fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
-		if (fattr->cf_nlink < 1) {
-			cifs_dbg(1, "replacing bogus file nlink value %u\n",
+		/*
+		 * Don't accept zero nlink from non-unix servers unless
+		 * delete is pending. Instead mark it as unknown.
+		 */
+		if ((fattr->cf_nlink < 1) && !tcon->unix_ext &&
+		    !info->DeletePending) {
+			cifs_dbg(1, "bogus file nlink value %u\n",
				 fattr->cf_nlink);
-			fattr->cf_nlink = 1;
+			fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
 		}
 	}
 
@@ -646,7 +683,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	cifs_dbg(FYI, "Getting info on %s\n", full_path);
 
 	if ((data == NULL) && (*inode != NULL)) {
-		if (CIFS_I(*inode)->clientCanCacheRead) {
+		if (CIFS_CACHE_READ(CIFS_I(*inode))) {
 			cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
 			goto cgii_exit;
 		}
@@ -1657,7 +1694,7 @@ cifs_inode_needs_reval(struct inode *inode)
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
-	if (cifs_i->clientCanCacheRead)
+	if (CIFS_CACHE_READ(cifs_i))
 		return false;
 
 	if (!lookupCacheEnabled)
@@ -1800,7 +1837,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	 * We need to be sure that all dirty pages are written and the server
 	 * has actual ctime, mtime and file length.
	 */
-	if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
+	if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
 	    inode->i_mapping->nrpages != 0) {
 		rc = filemap_fdatawait(inode->i_mapping);
 		if (rc) {
@@ -1852,14 +1889,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 
 static void cifs_setsize(struct inode *inode, loff_t offset)
 {
-	loff_t oldsize;
-
 	spin_lock(&inode->i_lock);
-	oldsize = inode->i_size;
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
 
-	truncate_pagecache(inode, oldsize, offset);
+	truncate_pagecache(inode, offset);
 }
 
 static int
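
The nlink helper trusts the server's count only when CIFS_FATTR_UNKNOWN_NLINK is clear; otherwise it fakes 2 for directories and 1 for files, and only on a freshly created inode. A compact sketch of that decision table (the constant values are illustrative):

    #include <stdio.h>
    #include <stdbool.h>

    #define FATTR_UNKNOWN_NLINK 0x10
    #define ATTR_DIRECTORY      0x10   /* illustrative value */

    struct fattr { unsigned flags, cifsattrs, nlink; };

    /* mirrors the decision in cifs_nlink_fattr_to_inode() above */
    static unsigned pick_nlink(const struct fattr *f, bool new_inode,
                               unsigned existing)
    {
        if (f->flags & FATTR_UNKNOWN_NLINK) {
            if (!new_inode)
                return existing;    /* keep what we already have */
            return (f->cifsattrs & ATTR_DIRECTORY) ? 2 : 1;
        }
        return f->nlink;            /* server value is trustworthy */
    }

    int main(void)
    {
        struct fattr dir  = { FATTR_UNKNOWN_NLINK, ATTR_DIRECTORY, 0 };
        struct fattr file = { 0, 0, 7 };

        printf("new dir, unknown nlink -> %u\n", pick_nlink(&dir, true, 0));
        printf("old dir, unknown nlink -> %u\n", pick_nlink(&dir, false, 5));
        printf("file, server-provided  -> %u\n", pick_nlink(&file, true, 0));
        return 0;
    }
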
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 562044f700e5..7e36ceba0c7a 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -509,6 +509,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct tcon_link *tlink = NULL;
 	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
 
 	xid = get_xid();
 
@@ -519,25 +520,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 		goto out;
 	}
 	tcon = tlink_tcon(tlink);
-
-	/*
-	 * For now, we just handle symlinks with unix extensions enabled.
-	 * Eventually we should handle NTFS reparse points, and MacOS
-	 * symlink support. For instance...
-	 *
-	 * rc = CIFSSMBQueryReparseLinkInfo(...)
-	 *
-	 * For now, just return -EACCES when the server doesn't support posix
-	 * extensions. Note that we still allow querying symlinks when posix
-	 * extensions are manually disabled. We could disable these as well
-	 * but there doesn't seem to be any harm in allowing the client to
-	 * read them.
-	 */
-	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
-	    !cap_unix(tcon->ses)) {
-		rc = -EACCES;
-		goto out;
-	}
+	server = tcon->ses->server;
 
 	full_path = build_path_from_dentry(direntry);
 	if (!full_path)
@@ -559,6 +542,9 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	if ((rc != 0) && cap_unix(tcon->ses))
 		rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
 					     cifs_sb->local_nls);
+	else if (rc != 0 && server->ops->query_symlink)
+		rc = server->ops->query_symlink(xid, tcon, full_path,
+						&target_path, cifs_sb);
 
 	kfree(full_path);
 out:
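
cifs_follow_link() now falls through a chain of resolvers: Minshall+French format symlinks first, then the POSIX-extension query, then the per-dialect query_symlink hook. A sketch of such an errno-driven fallback chain (the resolver bodies here are dummies):

    #include <stdio.h>
    #include <errno.h>

    /* each resolver returns 0 on success or a negative errno */
    static int try_mf_symlink(const char *p)    { (void)p; return -ENOENT; }
    static int try_unix_query(const char *p)    { (void)p; return -EOPNOTSUPP; }
    static int try_reparse_query(const char *p)
    {
        printf("resolved %s via reparse query\n", p);
        return 0;
    }

    static int follow_link(const char *path)
    {
        int rc;

        rc = try_mf_symlink(path);          /* Minshall+French format file */
        if (rc != 0)
            rc = try_unix_query(path);      /* POSIX extensions, if any */
        if (rc != 0)
            rc = try_reparse_query(path);   /* dialect-specific hook */
        return rc;
    }

    int main(void)
    {
        return follow_link("/mnt/share/link") ? 1 : 0;
    }
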
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index f7d4b2285efe..138a011633fe 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -105,6 +105,7 @@ sesInfoFree(struct cifs_ses *buf_to_free)
 	}
 	kfree(buf_to_free->user_name);
 	kfree(buf_to_free->domainName);
+	kfree(buf_to_free->auth_key.response);
 	kfree(buf_to_free);
 }
 
@@ -545,19 +546,15 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 	oplock &= 0xF;
 
 	if (oplock == OPLOCK_EXCLUSIVE) {
-		cinode->clientCanCacheAll = true;
-		cinode->clientCanCacheRead = true;
+		cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG;
 		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
 			 &cinode->vfs_inode);
 	} else if (oplock == OPLOCK_READ) {
-		cinode->clientCanCacheAll = false;
-		cinode->clientCanCacheRead = true;
+		cinode->oplock = CIFS_CACHE_READ_FLG;
 		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
 			 &cinode->vfs_inode);
-	} else {
-		cinode->clientCanCacheAll = false;
-		cinode->clientCanCacheRead = false;
-	}
+	} else
+		cinode->oplock = 0;
 }
 
 bool
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 69d2c826a23b..53a75f3d0179 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -172,11 +172,17 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 		if (cifs_dfs_is_possible(cifs_sb) &&
 		    (fattr->cf_cifsattrs & ATTR_REPARSE))
 			fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+	} else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
+		fattr->cf_mode = S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
 	}
 
+	/* non-unix readdir doesn't provide nlink */
+	fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+
 	if (fattr->cf_cifsattrs & ATTR_READONLY)
 		fattr->cf_mode &= ~S_IWUGO;
 
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 08dd37bb23aa..352358de1d7e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,88 +32,6 @@
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include "cifs_spnego.h" 33#include "cifs_spnego.h"
34 34
35/*
36 * Checks if this is the first smb session to be reconnected after
37 * the socket has been reestablished (so we know whether to use vc 0).
38 * Called while holding the cifs_tcp_ses_lock, so do not block
39 */
40static bool is_first_ses_reconnect(struct cifs_ses *ses)
41{
42 struct list_head *tmp;
43 struct cifs_ses *tmp_ses;
44
45 list_for_each(tmp, &ses->server->smb_ses_list) {
46 tmp_ses = list_entry(tmp, struct cifs_ses,
47 smb_ses_list);
48 if (tmp_ses->need_reconnect == false)
49 return false;
50 }
51 /* could not find a session that was already connected,
52 this must be the first one we are reconnecting */
53 return true;
54}
55
56/*
57 * vc number 0 is treated specially by some servers, and should be the
58 * first one we request. After that we can use vcnumbers up to maxvcs,
59 * one for each smb session (some Windows versions set maxvcs incorrectly
60 * so maxvc=1 can be ignored). If we have too many vcs, we can reuse
61 * any vc but zero (some servers reset the connection on vcnum zero)
62 *
63 */
64static __le16 get_next_vcnum(struct cifs_ses *ses)
65{
66 __u16 vcnum = 0;
67 struct list_head *tmp;
68 struct cifs_ses *tmp_ses;
69 __u16 max_vcs = ses->server->max_vcs;
70 __u16 i;
71 int free_vc_found = 0;
72
73 /* Quoting the MS-SMB specification: "Windows-based SMB servers set this
74 field to one but do not enforce this limit, which allows an SMB client
75 to establish more virtual circuits than allowed by this value ... but
76 other server implementations can enforce this limit." */
77 if (max_vcs < 2)
78 max_vcs = 0xFFFF;
79
80 spin_lock(&cifs_tcp_ses_lock);
81 if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
82 goto get_vc_num_exit; /* vcnum will be zero */
83 for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
84 if (i == 0) /* this is the only connection, use vc 0 */
85 break;
86
87 free_vc_found = 1;
88
89 list_for_each(tmp, &ses->server->smb_ses_list) {
90 tmp_ses = list_entry(tmp, struct cifs_ses,
91 smb_ses_list);
92 if (tmp_ses->vcnum == i) {
93 free_vc_found = 0;
94 break; /* found duplicate, try next vcnum */
95 }
96 }
97 if (free_vc_found)
98 break; /* we found a vcnumber that will work - use it */
99 }
100
101 if (i == 0)
102 vcnum = 0; /* for most common case, ie if one smb session, use
103 vc zero. Also for case when no free vcnum, zero
104 is safest to send (some clients only send zero) */
105 else if (free_vc_found == 0)
106 vcnum = 1; /* we can not reuse vc=0 safely, since some servers
107 reset all uids on that, but 1 is ok. */
108 else
109 vcnum = i;
110 ses->vcnum = vcnum;
111get_vc_num_exit:
112 spin_unlock(&cifs_tcp_ses_lock);
113
114 return cpu_to_le16(vcnum);
115}
116
117static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) 35static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
118{ 36{
119 __u32 capabilities = 0; 37 __u32 capabilities = 0;
@@ -128,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
128 CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, 46 CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
129 USHRT_MAX)); 47 USHRT_MAX));
130 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); 48 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
131 pSMB->req.VcNumber = get_next_vcnum(ses); 49 pSMB->req.VcNumber = __constant_cpu_to_le16(1);
132 50
133 /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ 51 /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
134 52
@@ -226,7 +144,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
226 *(bcc_ptr+1) = 0; 144 *(bcc_ptr+1) = 0;
227 } else { 145 } else {
228 bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name, 146 bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
229 MAX_USERNAME_SIZE, nls_cp); 147 CIFS_MAX_USERNAME_LEN, nls_cp);
230 } 148 }
231 bcc_ptr += 2 * bytes_ret; 149 bcc_ptr += 2 * bytes_ret;
232 bcc_ptr += 2; /* account for null termination */ 150 bcc_ptr += 2; /* account for null termination */
@@ -246,8 +164,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
246 /* BB what about null user mounts - check that we do this BB */ 164 /* BB what about null user mounts - check that we do this BB */
247 /* copy user */ 165 /* copy user */
248 if (ses->user_name != NULL) { 166 if (ses->user_name != NULL) {
249 strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); 167 strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN);
250 bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); 168 bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN);
251 } 169 }
252 /* else null user mount */ 170 /* else null user mount */
253 *bcc_ptr = 0; 171 *bcc_ptr = 0;
@@ -428,7 +346,8 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
428 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; 346 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
429 if (ses->server->sign) { 347 if (ses->server->sign) {
430 flags |= NTLMSSP_NEGOTIATE_SIGN; 348 flags |= NTLMSSP_NEGOTIATE_SIGN;
431 if (!ses->server->session_estab) 349 if (!ses->server->session_estab ||
350 ses->ntlmssp->sesskey_per_smbsess)
432 flags |= NTLMSSP_NEGOTIATE_KEY_XCH; 351 flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
433 } 352 }
434 353
@@ -466,7 +385,8 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
466 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; 385 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
467 if (ses->server->sign) { 386 if (ses->server->sign) {
468 flags |= NTLMSSP_NEGOTIATE_SIGN; 387 flags |= NTLMSSP_NEGOTIATE_SIGN;
469 if (!ses->server->session_estab) 388 if (!ses->server->session_estab ||
389 ses->ntlmssp->sesskey_per_smbsess)
470 flags |= NTLMSSP_NEGOTIATE_KEY_XCH; 390 flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
471 } 391 }
472 392
@@ -501,7 +421,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
501 } else { 421 } else {
502 int len; 422 int len;
503 len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, 423 len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
504 MAX_USERNAME_SIZE, nls_cp); 424 CIFS_MAX_USERNAME_LEN, nls_cp);
505 len *= 2; /* unicode is 2 bytes each */ 425 len *= 2; /* unicode is 2 bytes each */
506 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 426 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
507 sec_blob->DomainName.Length = cpu_to_le16(len); 427 sec_blob->DomainName.Length = cpu_to_le16(len);
@@ -517,7 +437,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
517 } else { 437 } else {
518 int len; 438 int len;
519 len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, 439 len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
520 MAX_USERNAME_SIZE, nls_cp); 440 CIFS_MAX_USERNAME_LEN, nls_cp);
521 len *= 2; /* unicode is 2 bytes each */ 441 len *= 2; /* unicode is 2 bytes each */
522 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 442 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
523 sec_blob->UserName.Length = cpu_to_le16(len); 443 sec_blob->UserName.Length = cpu_to_le16(len);
@@ -629,7 +549,8 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
629 type = select_sectype(ses->server, ses->sectype); 549 type = select_sectype(ses->server, ses->sectype);
630 cifs_dbg(FYI, "sess setup type %d\n", type); 550 cifs_dbg(FYI, "sess setup type %d\n", type);
631 if (type == Unspecified) { 551 if (type == Unspecified) {
632 cifs_dbg(VFS, "Unable to select appropriate authentication method!"); 552 cifs_dbg(VFS,
553 "Unable to select appropriate authentication method!");
633 return -EINVAL; 554 return -EINVAL;
634 } 555 }
635 556
@@ -640,6 +561,8 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
640 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); 561 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
641 if (!ses->ntlmssp) 562 if (!ses->ntlmssp)
642 return -ENOMEM; 563 return -ENOMEM;
564 ses->ntlmssp->sesskey_per_smbsess = false;
565
643 } 566 }
644 567
645ssetup_ntlmssp_authenticate: 568ssetup_ntlmssp_authenticate:
@@ -815,8 +738,9 @@ ssetup_ntlmssp_authenticate:
815 ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, 738 ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
816 GFP_KERNEL); 739 GFP_KERNEL);
817 if (!ses->auth_key.response) { 740 if (!ses->auth_key.response) {
818 cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", 741 cifs_dbg(VFS,
819 msg->sesskey_len); 742 "Kerberos can't allocate (%u bytes) memory",
743 msg->sesskey_len);
820 rc = -ENOMEM; 744 rc = -ENOMEM;
821 goto ssetup_exit; 745 goto ssetup_exit;
822 } 746 }
@@ -1005,5 +929,37 @@ ssetup_exit:
1005 if ((phase == NtLmChallenge) && (rc == 0)) 929 if ((phase == NtLmChallenge) && (rc == 0))
1006 goto ssetup_ntlmssp_authenticate; 930 goto ssetup_ntlmssp_authenticate;
1007 931
932 if (!rc) {
933 mutex_lock(&ses->server->srv_mutex);
934 if (!ses->server->session_estab) {
935 if (ses->server->sign) {
936 ses->server->session_key.response =
937 kmemdup(ses->auth_key.response,
938 ses->auth_key.len, GFP_KERNEL);
939 if (!ses->server->session_key.response) {
940 rc = -ENOMEM;
941 mutex_unlock(&ses->server->srv_mutex);
942 goto keycp_exit;
943 }
944 ses->server->session_key.len =
945 ses->auth_key.len;
946 }
947 ses->server->sequence_number = 0x2;
948 ses->server->session_estab = true;
949 }
950 mutex_unlock(&ses->server->srv_mutex);
951
952 cifs_dbg(FYI, "CIFS session established successfully\n");
953 spin_lock(&GlobalMid_Lock);
954 ses->status = CifsGood;
955 ses->need_reconnect = false;
956 spin_unlock(&GlobalMid_Lock);
957 }
958
959keycp_exit:
960 kfree(ses->auth_key.response);
961 ses->auth_key.response = NULL;
962 kfree(ses->ntlmssp);
963
1008 return rc; 964 return rc;
1009} 965}
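The new tail of CIFS_SessSetup() makes the first fully authenticated session donate its key as the connection-wide SMB1 signing key; later sessions keep only their own auth_key, which is freed on the way out. A reduced userspace sketch of the ordering under the mutex, with stand-in types and pthreads instead of the kernel mutex; the comment on 0x2 is my reading of why the counter starts there:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct conn {
        pthread_mutex_t srv_mutex;
        bool sign, session_estab;
        unsigned char *session_key;
        size_t session_key_len;
        unsigned int sequence_number;
};

/* first authenticated session seeds the per-connection signing key */
static int seed_connection_key(struct conn *c,
                               const unsigned char *auth_key, size_t len)
{
        pthread_mutex_lock(&c->srv_mutex);
        if (!c->session_estab) {
                if (c->sign) {
                        c->session_key = malloc(len);   /* kmemdup() upstream */
                        if (!c->session_key) {
                                pthread_mutex_unlock(&c->srv_mutex);
                                return -1;              /* -ENOMEM */
                        }
                        memcpy(c->session_key, auth_key, len);
                        c->session_key_len = len;
                }
                /* setup request/response already consumed sequence 0 and 1 */
                c->sequence_number = 0x2;
                c->session_estab = true;
        }
        pthread_mutex_unlock(&c->srv_mutex);
        return 0;
}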
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 60943978aec3..8233b174de3d 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -700,7 +700,7 @@ cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
700 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); 700 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
701 cfile->fid.netfid = fid->netfid; 701 cfile->fid.netfid = fid->netfid;
702 cifs_set_oplock_level(cinode, oplock); 702 cifs_set_oplock_level(cinode, oplock);
703 cinode->can_cache_brlcks = cinode->clientCanCacheAll; 703 cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
704} 704}
705 705
706static void 706static void
@@ -837,7 +837,7 @@ cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
837{ 837{
838 return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0, 838 return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0,
839 LOCKING_ANDX_OPLOCK_RELEASE, false, 839 LOCKING_ANDX_OPLOCK_RELEASE, false,
840 cinode->clientCanCacheRead ? 1 : 0); 840 CIFS_CACHE_READ(cinode) ? 1 : 0);
841} 841}
842 842
843static int 843static int
@@ -881,6 +881,43 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
881 (__u8)type, wait, 0); 881 (__u8)type, wait, 0);
882} 882}
883 883
884static int
885cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
886 const char *full_path, char **target_path,
887 struct cifs_sb_info *cifs_sb)
888{
889 int rc;
890 int oplock = 0;
891 __u16 netfid;
892
893 cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
894
895 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
896 FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid,
897 &oplock, NULL, cifs_sb->local_nls,
898 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
899 if (rc)
900 return rc;
901
902 rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path,
903 cifs_sb->local_nls);
904 if (rc) {
905 CIFSSMBClose(xid, tcon, netfid);
906 return rc;
907 }
908
909 convert_delimiter(*target_path, '/');
910 CIFSSMBClose(xid, tcon, netfid);
911 cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
912 return rc;
913}
914
915static bool
916cifs_is_read_op(__u32 oplock)
917{
918 return oplock == OPLOCK_READ;
919}
920
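cifs_is_read_op() replaces the single oplock_read value that a later hunk removes from smb_version_values: for SMB1, read-only caching is an exact oplock level, while for SMB2.1+ leases (see smb21_is_read_op in smb2ops.c below) it is a bitmask test. A side-by-side sketch; the OPLOCK_READ value here is an assumption, only the shape of the two tests matters:

#include <stdbool.h>
#include <stdint.h>

#define OPLOCK_READ                 2   /* assumed SMB1 level II value */
#define SMB2_LEASE_READ_CACHING_HE  0x01
#define SMB2_LEASE_WRITE_CACHING_HE 0x04

static bool smb1_read_only(uint32_t oplock)
{
        return oplock == OPLOCK_READ;           /* exact level match */
}

static bool lease_read_only(uint32_t lease)
{
        /* read caching granted without write caching */
        return (lease & SMB2_LEASE_READ_CACHING_HE) &&
               !(lease & SMB2_LEASE_WRITE_CACHING_HE);
}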
884struct smb_version_operations smb1_operations = { 921struct smb_version_operations smb1_operations = {
885 .send_cancel = send_nt_cancel, 922 .send_cancel = send_nt_cancel,
886 .compare_fids = cifs_compare_fids, 923 .compare_fids = cifs_compare_fids,
@@ -927,6 +964,7 @@ struct smb_version_operations smb1_operations = {
927 .rename_pending_delete = cifs_rename_pending_delete, 964 .rename_pending_delete = cifs_rename_pending_delete,
928 .rename = CIFSSMBRename, 965 .rename = CIFSSMBRename,
929 .create_hardlink = CIFSCreateHardLink, 966 .create_hardlink = CIFSCreateHardLink,
967 .query_symlink = cifs_query_symlink,
930 .open = cifs_open_file, 968 .open = cifs_open_file,
931 .set_fid = cifs_set_fid, 969 .set_fid = cifs_set_fid,
932 .close = cifs_close_file, 970 .close = cifs_close_file,
@@ -945,6 +983,7 @@ struct smb_version_operations smb1_operations = {
945 .mand_unlock_range = cifs_unlock_range, 983 .mand_unlock_range = cifs_unlock_range,
946 .push_mand_locks = cifs_push_mandatory_locks, 984 .push_mand_locks = cifs_push_mandatory_locks,
947 .query_mf_symlink = open_query_close_cifs_symlink, 985 .query_mf_symlink = open_query_close_cifs_symlink,
986 .is_read_op = cifs_is_read_op,
948}; 987};
949 988
950struct smb_version_values smb1_values = { 989struct smb_version_values smb1_values = {
@@ -960,7 +999,6 @@ struct smb_version_values smb1_values = {
960 .cap_unix = CAP_UNIX, 999 .cap_unix = CAP_UNIX,
961 .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, 1000 .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
962 .cap_large_files = CAP_LARGE_FILES, 1001 .cap_large_files = CAP_LARGE_FILES,
963 .oplock_read = OPLOCK_READ,
964 .signing_enabled = SECMODE_SIGN_ENABLED, 1002 .signing_enabled = SECMODE_SIGN_ENABLED,
965 .signing_required = SECMODE_SIGN_REQUIRED, 1003 .signing_required = SECMODE_SIGN_REQUIRED,
966}; 1004};
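With query_symlink and is_read_op wired into smb1_operations, the VFS-facing code can stay dialect-agnostic and dispatch through the ops table. A hedged sketch of the call sites this enables; the surrounding code is not part of this hunk:

        /* dialect-neutral symlink resolution */
        if (server->ops->query_symlink)
                rc = server->ops->query_symlink(xid, tcon, full_path,
                                                &target_path, cifs_sb);
        else
                rc = -EOPNOTSUPP;

        /* the old "oplock == server->vals->oplock_read" test becomes */
        if (server->ops->is_read_op(oplock))
                cifs_dbg(FYI, "read-only caching granted\n");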
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 04a81a4142c3..3f17b4550831 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -34,29 +34,6 @@
34#include "fscache.h" 34#include "fscache.h"
35#include "smb2proto.h" 35#include "smb2proto.h"
36 36
37void
38smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
39{
40 oplock &= 0xFF;
41 if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
42 return;
43 if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
44 oplock == SMB2_OPLOCK_LEVEL_BATCH) {
45 cinode->clientCanCacheAll = true;
46 cinode->clientCanCacheRead = true;
47 cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
48 &cinode->vfs_inode);
49 } else if (oplock == SMB2_OPLOCK_LEVEL_II) {
50 cinode->clientCanCacheAll = false;
51 cinode->clientCanCacheRead = true;
52 cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
53 &cinode->vfs_inode);
54 } else {
55 cinode->clientCanCacheAll = false;
56 cinode->clientCanCacheRead = false;
57 }
58}
59
60int 37int
61smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, 38smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
62 __u32 *oplock, FILE_ALL_INFO *buf) 39 __u32 *oplock, FILE_ALL_INFO *buf)
@@ -86,7 +63,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
86 if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) 63 if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
87 memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE); 64 memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);
88 65
89 rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data); 66 rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data, NULL);
90 if (rc) 67 if (rc)
91 goto out; 68 goto out;
92 69
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index c6ec1633309a..78ff88c467b9 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -60,7 +60,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
60 oparms.fid = &fid; 60 oparms.fid = &fid;
61 oparms.reconnect = false; 61 oparms.reconnect = false;
62 62
63 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL); 63 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
64 if (rc) { 64 if (rc) {
65 kfree(utf16_path); 65 kfree(utf16_path);
66 return rc; 66 return rc;
@@ -136,7 +136,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
136 return -ENOMEM; 136 return -ENOMEM;
137 137
138 rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, 138 rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
139 FILE_READ_ATTRIBUTES, FILE_OPEN, 0, smb2_data, 139 FILE_READ_ATTRIBUTES, FILE_OPEN,
140 OPEN_REPARSE_POINT, smb2_data,
140 SMB2_OP_QUERY_INFO); 141 SMB2_OP_QUERY_INFO);
141 if (rc) 142 if (rc)
142 goto out; 143 goto out;
@@ -191,8 +192,8 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
191 struct cifs_sb_info *cifs_sb) 192 struct cifs_sb_info *cifs_sb)
192{ 193{
193 return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, 194 return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
194 CREATE_DELETE_ON_CLOSE, NULL, 195 CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
195 SMB2_OP_DELETE); 196 NULL, SMB2_OP_DELETE);
196} 197}
197 198
198static int 199static int
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index b0c43345cd98..fb3966265b6e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -171,6 +171,10 @@ smb2_check_message(char *buf, unsigned int length)
171 if (4 + len != clc_len) { 171 if (4 + len != clc_len) {
172 cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", 172 cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
173 clc_len, 4 + len, mid); 173 clc_len, 4 + len, mid);
174 /* create failed on symlink */
175 if (command == SMB2_CREATE_HE &&
176 hdr->Status == STATUS_STOPPED_ON_SYMLINK)
177 return 0;
174 /* Windows 7 server returns 24 bytes more */ 178 /* Windows 7 server returns 24 bytes more */
175 if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) 179 if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
176 return 0; 180 return 0;
@@ -376,23 +380,15 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
376__le32 380__le32
377smb2_get_lease_state(struct cifsInodeInfo *cinode) 381smb2_get_lease_state(struct cifsInodeInfo *cinode)
378{ 382{
379 if (cinode->clientCanCacheAll) 383 __le32 lease = 0;
380 return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING; 384
381 else if (cinode->clientCanCacheRead) 385 if (CIFS_CACHE_WRITE(cinode))
382 return SMB2_LEASE_READ_CACHING; 386 lease |= SMB2_LEASE_WRITE_CACHING;
383 return 0; 387 if (CIFS_CACHE_HANDLE(cinode))
384} 388 lease |= SMB2_LEASE_HANDLE_CACHING;
385 389 if (CIFS_CACHE_READ(cinode))
386__u8 smb2_map_lease_to_oplock(__le32 lease_state) 390 lease |= SMB2_LEASE_READ_CACHING;
387{ 391 return lease;
388 if (lease_state & SMB2_LEASE_WRITE_CACHING) {
389 if (lease_state & SMB2_LEASE_HANDLE_CACHING)
390 return SMB2_OPLOCK_LEVEL_BATCH;
391 else
392 return SMB2_OPLOCK_LEVEL_EXCLUSIVE;
393 } else if (lease_state & SMB2_LEASE_READ_CACHING)
394 return SMB2_OPLOCK_LEVEL_II;
395 return 0;
396} 392}
397 393
398struct smb2_lease_break_work { 394struct smb2_lease_break_work {
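smb2_get_lease_state() now derives the lease bits from the single cinode->oplock bitfield that replaces the old clientCanCache* booleans, and it can finally express handle caching. A standalone sketch; the CIFS_CACHE_*_FLG values are my reading of the companion cifsglob.h change, so treat them as assumptions:

#include <stdint.h>

/* cache granularity flags kept in cinode->oplock (assumed values) */
#define CIFS_CACHE_READ_FLG   0x1
#define CIFS_CACHE_HANDLE_FLG 0x2
#define CIFS_CACHE_WRITE_FLG  0x4

/* SMB2 lease state bits (wire values) */
#define SMB2_LEASE_READ_CACHING   0x01
#define SMB2_LEASE_HANDLE_CACHING 0x02
#define SMB2_LEASE_WRITE_CACHING  0x04

static uint32_t lease_from_cache(uint32_t oplock)
{
        uint32_t lease = 0;

        if (oplock & CIFS_CACHE_WRITE_FLG)
                lease |= SMB2_LEASE_WRITE_CACHING;
        if (oplock & CIFS_CACHE_HANDLE_FLG)
                lease |= SMB2_LEASE_HANDLE_CACHING;
        if (oplock & CIFS_CACHE_READ_FLG)
                lease |= SMB2_LEASE_READ_CACHING;
        return lease;
}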
@@ -417,96 +413,109 @@ cifs_ses_oplock_break(struct work_struct *work)
417} 413}
418 414
419static bool 415static bool
420smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) 416smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
417 struct smb2_lease_break_work *lw)
421{ 418{
422 struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; 419 bool found;
423 struct list_head *tmp, *tmp1, *tmp2; 420 __u8 lease_state;
424 struct cifs_ses *ses; 421 struct list_head *tmp;
425 struct cifs_tcon *tcon;
426 struct cifsInodeInfo *cinode;
427 struct cifsFileInfo *cfile; 422 struct cifsFileInfo *cfile;
423 struct TCP_Server_Info *server = tcon->ses->server;
428 struct cifs_pending_open *open; 424 struct cifs_pending_open *open;
429 struct smb2_lease_break_work *lw; 425 struct cifsInodeInfo *cinode;
430 bool found;
431 int ack_req = le32_to_cpu(rsp->Flags & 426 int ack_req = le32_to_cpu(rsp->Flags &
432 SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED); 427 SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
433 428
434 lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); 429 lease_state = le32_to_cpu(rsp->NewLeaseState);
435 if (!lw)
436 return false;
437 430
438 INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); 431 list_for_each(tmp, &tcon->openFileList) {
439 lw->lease_state = rsp->NewLeaseState; 432 cfile = list_entry(tmp, struct cifsFileInfo, tlist);
433 cinode = CIFS_I(cfile->dentry->d_inode);
440 434
441 cifs_dbg(FYI, "Checking for lease break\n"); 435 if (memcmp(cinode->lease_key, rsp->LeaseKey,
436 SMB2_LEASE_KEY_SIZE))
437 continue;
442 438
443 /* look up tcon based on tid & uid */ 439 cifs_dbg(FYI, "found in the open list\n");
444 spin_lock(&cifs_tcp_ses_lock); 440 cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
445 list_for_each(tmp, &server->smb_ses_list) { 441 le32_to_cpu(rsp->NewLeaseState));
446 ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
447 442
448 spin_lock(&cifs_file_list_lock); 443 server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
449 list_for_each(tmp1, &ses->tcon_list) {
450 tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
451 444
452 cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); 445 if (ack_req)
453 list_for_each(tmp2, &tcon->openFileList) { 446 cfile->oplock_break_cancelled = false;
454 cfile = list_entry(tmp2, struct cifsFileInfo, 447 else
455 tlist); 448 cfile->oplock_break_cancelled = true;
456 cinode = CIFS_I(cfile->dentry->d_inode);
457 449
458 if (memcmp(cinode->lease_key, rsp->LeaseKey, 450 queue_work(cifsiod_wq, &cfile->oplock_break);
459 SMB2_LEASE_KEY_SIZE)) 451 kfree(lw);
460 continue; 452 return true;
453 }
461 454
462 cifs_dbg(FYI, "found in the open list\n"); 455 found = false;
463 cifs_dbg(FYI, "lease key match, lease break 0x%d\n", 456 list_for_each_entry(open, &tcon->pending_opens, olist) {
464 le32_to_cpu(rsp->NewLeaseState)); 457 if (memcmp(open->lease_key, rsp->LeaseKey,
458 SMB2_LEASE_KEY_SIZE))
459 continue;
460
461 if (!found && ack_req) {
462 found = true;
463 memcpy(lw->lease_key, open->lease_key,
464 SMB2_LEASE_KEY_SIZE);
465 lw->tlink = cifs_get_tlink(open->tlink);
466 queue_work(cifsiod_wq, &lw->lease_break);
467 }
465 468
466 smb2_set_oplock_level(cinode, 469 cifs_dbg(FYI, "found in the pending open list\n");
467 smb2_map_lease_to_oplock(rsp->NewLeaseState)); 470 cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
471 le32_to_cpu(rsp->NewLeaseState));
468 472
469 if (ack_req) 473 open->oplock = lease_state;
470 cfile->oplock_break_cancelled = false; 474 }
471 else 475 return found;
472 cfile->oplock_break_cancelled = true; 476}
473 477
474 queue_work(cifsiod_wq, &cfile->oplock_break); 478static bool
479smb2_is_valid_lease_break(char *buffer)
480{
481 struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
482 struct list_head *tmp, *tmp1, *tmp2;
483 struct TCP_Server_Info *server;
484 struct cifs_ses *ses;
485 struct cifs_tcon *tcon;
486 struct smb2_lease_break_work *lw;
475 487
476 spin_unlock(&cifs_file_list_lock); 488 lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
477 spin_unlock(&cifs_tcp_ses_lock); 489 if (!lw)
478 return true; 490 return false;
479 }
480 491
481 found = false; 492 INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
482 list_for_each_entry(open, &tcon->pending_opens, olist) { 493 lw->lease_state = rsp->NewLeaseState;
483 if (memcmp(open->lease_key, rsp->LeaseKey,
484 SMB2_LEASE_KEY_SIZE))
485 continue;
486 494
487 if (!found && ack_req) { 495 cifs_dbg(FYI, "Checking for lease break\n");
488 found = true;
489 memcpy(lw->lease_key, open->lease_key,
490 SMB2_LEASE_KEY_SIZE);
491 lw->tlink = cifs_get_tlink(open->tlink);
492 queue_work(cifsiod_wq,
493 &lw->lease_break);
494 }
495 496
496 cifs_dbg(FYI, "found in the pending open list\n"); 497 /* look up tcon based on tid & uid */
497 cifs_dbg(FYI, "lease key match, lease break 0x%d\n", 498 spin_lock(&cifs_tcp_ses_lock);
498 le32_to_cpu(rsp->NewLeaseState)); 499 list_for_each(tmp, &cifs_tcp_ses_list) {
500 server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list);
499 501
500 open->oplock = 502 list_for_each(tmp1, &server->smb_ses_list) {
501 smb2_map_lease_to_oplock(rsp->NewLeaseState); 503 ses = list_entry(tmp1, struct cifs_ses, smb_ses_list);
502 } 504
503 if (found) { 505 spin_lock(&cifs_file_list_lock);
504 spin_unlock(&cifs_file_list_lock); 506 list_for_each(tmp2, &ses->tcon_list) {
505 spin_unlock(&cifs_tcp_ses_lock); 507 tcon = list_entry(tmp2, struct cifs_tcon,
506 return true; 508 tcon_list);
509 cifs_stats_inc(
510 &tcon->stats.cifs_stats.num_oplock_brks);
511 if (smb2_tcon_has_lease(tcon, rsp, lw)) {
512 spin_unlock(&cifs_file_list_lock);
513 spin_unlock(&cifs_tcp_ses_lock);
514 return true;
515 }
507 } 516 }
517 spin_unlock(&cifs_file_list_lock);
508 } 518 }
509 spin_unlock(&cifs_file_list_lock);
510 } 519 }
511 spin_unlock(&cifs_tcp_ses_lock); 520 spin_unlock(&cifs_tcp_ses_lock);
512 kfree(lw); 521 kfree(lw);
@@ -532,7 +541,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
532 if (rsp->StructureSize != 541 if (rsp->StructureSize !=
533 smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { 542 smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
534 if (le16_to_cpu(rsp->StructureSize) == 44) 543 if (le16_to_cpu(rsp->StructureSize) == 44)
535 return smb2_is_valid_lease_break(buffer, server); 544 return smb2_is_valid_lease_break(buffer);
536 else 545 else
537 return false; 546 return false;
538 } 547 }
@@ -560,14 +569,15 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
560 cifs_dbg(FYI, "file id match, oplock break\n"); 569 cifs_dbg(FYI, "file id match, oplock break\n");
561 cinode = CIFS_I(cfile->dentry->d_inode); 570 cinode = CIFS_I(cfile->dentry->d_inode);
562 571
563 if (!cinode->clientCanCacheAll && 572 if (!CIFS_CACHE_WRITE(cinode) &&
564 rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) 573 rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE)
565 cfile->oplock_break_cancelled = true; 574 cfile->oplock_break_cancelled = true;
566 else 575 else
567 cfile->oplock_break_cancelled = false; 576 cfile->oplock_break_cancelled = false;
568 577
569 smb2_set_oplock_level(cinode, 578 server->ops->set_oplock_level(cinode,
570 rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0); 579 rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0,
580 0, NULL);
571 581
572 queue_work(cifsiod_wq, &cfile->oplock_break); 582 queue_work(cifsiod_wq, &cfile->oplock_break);
573 583
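The lease-break refactor above pulls the per-tcon search out into smb2_tcon_has_lease() so that the outer walk can cover every connection on cifs_tcp_ses_list rather than only the server the break arrived on, while keeping the cifs_tcp_ses_lock then cifs_file_list_lock ordering. The same traversal in the more compact list_for_each_entry() form, for reference:

        spin_lock(&cifs_tcp_ses_lock);
        list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
                list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
                        spin_lock(&cifs_file_list_lock);
                        list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
                                cifs_stats_inc(
                                    &tcon->stats.cifs_stats.num_oplock_brks);
                                if (smb2_tcon_has_lease(tcon, rsp, lw)) {
                                        spin_unlock(&cifs_file_list_lock);
                                        spin_unlock(&cifs_tcp_ses_lock);
                                        /* lw consumed (freed or queued) */
                                        return true;
                                }
                        }
                        spin_unlock(&cifs_file_list_lock);
                }
        }
        spin_unlock(&cifs_tcp_ses_lock);
        kfree(lw);      /* no open or pending handle matched this key */
        return false;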
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f259e6cc8357..861b33214144 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -24,6 +24,7 @@
24#include "smb2proto.h" 24#include "smb2proto.h"
25#include "cifsproto.h" 25#include "cifsproto.h"
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27#include "cifs_unicode.h"
27#include "smb2status.h" 28#include "smb2status.h"
28#include "smb2glob.h" 29#include "smb2glob.h"
29 30
@@ -229,7 +230,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
229 oparms.fid = &fid; 230 oparms.fid = &fid;
230 oparms.reconnect = false; 231 oparms.reconnect = false;
231 232
232 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL); 233 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
233 if (rc) { 234 if (rc) {
234 kfree(utf16_path); 235 kfree(utf16_path);
235 return rc; 236 return rc;
@@ -376,10 +377,13 @@ static void
376smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) 377smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
377{ 378{
378 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); 379 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
380 struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
381
379 cfile->fid.persistent_fid = fid->persistent_fid; 382 cfile->fid.persistent_fid = fid->persistent_fid;
380 cfile->fid.volatile_fid = fid->volatile_fid; 383 cfile->fid.volatile_fid = fid->volatile_fid;
381 smb2_set_oplock_level(cinode, oplock); 384 server->ops->set_oplock_level(cinode, oplock, fid->epoch,
382 cinode->can_cache_brlcks = cinode->clientCanCacheAll; 385 &fid->purge_cache);
386 cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
383} 387}
384 388
385static void 389static void
@@ -463,7 +467,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
463 oparms.fid = fid; 467 oparms.fid = fid;
464 oparms.reconnect = false; 468 oparms.reconnect = false;
465 469
466 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL); 470 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
467 kfree(utf16_path); 471 kfree(utf16_path);
468 if (rc) { 472 if (rc) {
469 cifs_dbg(VFS, "open dir failed\n"); 473 cifs_dbg(VFS, "open dir failed\n");
@@ -530,7 +534,7 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
530 534
531 return SMB2_oplock_break(0, tcon, fid->persistent_fid, 535 return SMB2_oplock_break(0, tcon, fid->persistent_fid,
532 fid->volatile_fid, 536 fid->volatile_fid,
533 cinode->clientCanCacheRead ? 1 : 0); 537 CIFS_CACHE_READ(cinode) ? 1 : 0);
534} 538}
535 539
536static int 540static int
@@ -550,7 +554,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
550 oparms.fid = &fid; 554 oparms.fid = &fid;
551 oparms.reconnect = false; 555 oparms.reconnect = false;
552 556
553 rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL); 557 rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
554 if (rc) 558 if (rc)
555 return rc; 559 return rc;
556 buf->f_type = SMB2_MAGIC_NUMBER; 560 buf->f_type = SMB2_MAGIC_NUMBER;
@@ -596,7 +600,245 @@ smb2_new_lease_key(struct cifs_fid *fid)
596 get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); 600 get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE);
597} 601}
598 602
599struct smb_version_operations smb21_operations = { 603static int
604smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
605 const char *full_path, char **target_path,
606 struct cifs_sb_info *cifs_sb)
607{
608 int rc;
609 __le16 *utf16_path;
610 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
611 struct cifs_open_parms oparms;
612 struct cifs_fid fid;
613 struct smb2_err_rsp *err_buf = NULL;
614 struct smb2_symlink_err_rsp *symlink;
615 unsigned int sub_len, sub_offset;
616
617 cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
618
619 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
620 if (!utf16_path)
621 return -ENOMEM;
622
623 oparms.tcon = tcon;
624 oparms.desired_access = FILE_READ_ATTRIBUTES;
625 oparms.disposition = FILE_OPEN;
626 oparms.create_options = 0;
627 oparms.fid = &fid;
628 oparms.reconnect = false;
629
630 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf);
631
632 if (!rc || !err_buf) {
633 kfree(utf16_path);
634 return -ENOENT;
635 }
636 /* the open is expected to fail on a symlink - reset rc */
637 rc = 0;
638 symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
639 sub_len = le16_to_cpu(symlink->SubstituteNameLength);
640 sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
641 *target_path = cifs_strndup_from_utf16(
642 (char *)symlink->PathBuffer + sub_offset,
643 sub_len, true, cifs_sb->local_nls);
644 if (!(*target_path)) {
645 kfree(utf16_path);
646 return -ENOMEM;
647 }
648 convert_delimiter(*target_path, '/');
649 cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
650 kfree(utf16_path);
651 return rc;
652}
653
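smb2_query_symlink() leans on the new sixth argument of SMB2_open(): on failure the raw error response is kmemdup()ed into *err_buf (see the smb2pdu.c hunk further down), and smb2_check_message() above was taught to accept the oversized STATUS_STOPPED_ON_SYMLINK create response. A sketch of the contract at a call site; note the copy is caller-owned, and the function above as written never frees it, which looks like a small leak worth flagging:

        struct smb2_err_rsp *err_buf = NULL;

        rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf);
        if (rc && err_buf) {
                /* ErrorData carries a struct smb2_symlink_err_rsp */
                symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
                /* ... pull SubstituteName out as above ... */
                kfree(err_buf);         /* caller owns the kmemdup()ed copy */
        }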
654static void
655smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
656 unsigned int epoch, bool *purge_cache)
657{
658 oplock &= 0xFF;
659 if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
660 return;
661 if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {
662 cinode->oplock = CIFS_CACHE_RHW_FLG;
663 cifs_dbg(FYI, "Batch Oplock granted on inode %p\n",
664 &cinode->vfs_inode);
665 } else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
666 cinode->oplock = CIFS_CACHE_RW_FLG;
667 cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
668 &cinode->vfs_inode);
669 } else if (oplock == SMB2_OPLOCK_LEVEL_II) {
670 cinode->oplock = CIFS_CACHE_READ_FLG;
671 cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
672 &cinode->vfs_inode);
673 } else
674 cinode->oplock = 0;
675}
676
677static void
678smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
679 unsigned int epoch, bool *purge_cache)
680{
681 char message[5] = {0};
682
683 oplock &= 0xFF;
684 if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
685 return;
686
687 cinode->oplock = 0;
688 if (oplock & SMB2_LEASE_READ_CACHING_HE) {
689 cinode->oplock |= CIFS_CACHE_READ_FLG;
690 strcat(message, "R");
691 }
692 if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) {
693 cinode->oplock |= CIFS_CACHE_HANDLE_FLG;
694 strcat(message, "H");
695 }
696 if (oplock & SMB2_LEASE_WRITE_CACHING_HE) {
697 cinode->oplock |= CIFS_CACHE_WRITE_FLG;
698 strcat(message, "W");
699 }
700 if (!cinode->oplock)
701 strcat(message, "None");
702 cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
703 &cinode->vfs_inode);
704}
705
706static void
707smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
708 unsigned int epoch, bool *purge_cache)
709{
710 unsigned int old_oplock = cinode->oplock;
711
712 smb21_set_oplock_level(cinode, oplock, epoch, purge_cache);
713
714 if (purge_cache) {
715 *purge_cache = false;
716 if (old_oplock == CIFS_CACHE_READ_FLG) {
717 if (cinode->oplock == CIFS_CACHE_READ_FLG &&
718 (epoch - cinode->epoch > 0))
719 *purge_cache = true;
720 else if (cinode->oplock == CIFS_CACHE_RH_FLG &&
721 (epoch - cinode->epoch > 1))
722 *purge_cache = true;
723 else if (cinode->oplock == CIFS_CACHE_RHW_FLG &&
724 (epoch - cinode->epoch > 1))
725 *purge_cache = true;
726 else if (cinode->oplock == 0 &&
727 (epoch - cinode->epoch > 0))
728 *purge_cache = true;
729 } else if (old_oplock == CIFS_CACHE_RH_FLG) {
730 if (cinode->oplock == CIFS_CACHE_RH_FLG &&
731 (epoch - cinode->epoch > 0))
732 *purge_cache = true;
733 else if (cinode->oplock == CIFS_CACHE_RHW_FLG &&
734 (epoch - cinode->epoch > 1))
735 *purge_cache = true;
736 }
737 cinode->epoch = epoch;
738 }
739}
740
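smb3_set_oplock_level() layers the SMB3 lease epoch on top of the SMB2.1 logic: the server bumps the epoch on every lease state change, so if the delta since the cached epoch exceeds the transitions we can account for, an unseen intermediate state happened and cached data must be purged. The decision table as a standalone predicate; flag values assumed, matching the sketch after the smb2_get_lease_state hunk:

#include <stdbool.h>

#define R  0x1          /* CIFS_CACHE_READ_FLG   (assumed values) */
#define H  0x2          /* CIFS_CACHE_HANDLE_FLG */
#define W  0x4          /* CIFS_CACHE_WRITE_FLG  */

/*
 * old/now are cache-flag states, delta is (new_epoch - cached_epoch).
 * One tick per visible transition is expected; any extra tick means an
 * intermediate state was missed.
 */
static bool must_purge(unsigned old, unsigned now, unsigned delta)
{
        if (old == R) {
                if (now == R && delta > 0)              return true;
                if (now == (R|H) && delta > 1)          return true;
                if (now == (R|H|W) && delta > 1)        return true;
                if (now == 0 && delta > 0)              return true;
        } else if (old == (R|H)) {
                if (now == (R|H) && delta > 0)          return true;
                if (now == (R|H|W) && delta > 1)        return true;
        }
        return false;
}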
741static bool
742smb2_is_read_op(__u32 oplock)
743{
744 return oplock == SMB2_OPLOCK_LEVEL_II;
745}
746
747static bool
748smb21_is_read_op(__u32 oplock)
749{
750 return (oplock & SMB2_LEASE_READ_CACHING_HE) &&
751 !(oplock & SMB2_LEASE_WRITE_CACHING_HE);
752}
753
754static __le32
755map_oplock_to_lease(u8 oplock)
756{
757 if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
758 return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING;
759 else if (oplock == SMB2_OPLOCK_LEVEL_II)
760 return SMB2_LEASE_READ_CACHING;
761 else if (oplock == SMB2_OPLOCK_LEVEL_BATCH)
762 return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING |
763 SMB2_LEASE_WRITE_CACHING;
764 return 0;
765}
766
767static char *
768smb2_create_lease_buf(u8 *lease_key, u8 oplock)
769{
770 struct create_lease *buf;
771
772 buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL);
773 if (!buf)
774 return NULL;
775
776 buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
777 buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
778 buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
779
780 buf->ccontext.DataOffset = cpu_to_le16(offsetof
781 (struct create_lease, lcontext));
782 buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context));
783 buf->ccontext.NameOffset = cpu_to_le16(offsetof
784 (struct create_lease, Name));
785 buf->ccontext.NameLength = cpu_to_le16(4);
786 buf->Name[0] = 'R';
787 buf->Name[1] = 'q';
788 buf->Name[2] = 'L';
789 buf->Name[3] = 's';
790 return (char *)buf;
791}
792
793static char *
794smb3_create_lease_buf(u8 *lease_key, u8 oplock)
795{
796 struct create_lease_v2 *buf;
797
798 buf = kzalloc(sizeof(struct create_lease_v2), GFP_KERNEL);
799 if (!buf)
800 return NULL;
801
802 buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
803 buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
804 buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
805
806 buf->ccontext.DataOffset = cpu_to_le16(offsetof
807 (struct create_lease_v2, lcontext));
808 buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2));
809 buf->ccontext.NameOffset = cpu_to_le16(offsetof
810 (struct create_lease_v2, Name));
811 buf->ccontext.NameLength = cpu_to_le16(4);
812 buf->Name[0] = 'R';
813 buf->Name[1] = 'q';
814 buf->Name[2] = 'L';
815 buf->Name[3] = 's';
816 return (char *)buf;
817}
818
819static __u8
820smb2_parse_lease_buf(void *buf, unsigned int *epoch)
821{
822 struct create_lease *lc = (struct create_lease *)buf;
823
824 *epoch = 0; /* not used */
825 if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
826 return SMB2_OPLOCK_LEVEL_NOCHANGE;
827 return le32_to_cpu(lc->lcontext.LeaseState);
828}
829
830static __u8
831smb3_parse_lease_buf(void *buf, unsigned int *epoch)
832{
833 struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
834
835 *epoch = le16_to_cpu(lc->lcontext.Epoch);
836 if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
837 return SMB2_OPLOCK_LEVEL_NOCHANGE;
838 return le32_to_cpu(lc->lcontext.LeaseState);
839}
840
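The two parse helpers differ only in the epoch: the V1 lease context has none, so *epoch is zeroed, while V2 carries a 16-bit Epoch that later feeds smb3_set_oplock_level(). Condensed flow, as wired up in the smb2pdu.c hunks below; the real code routes the value through fid->epoch rather than a local:

        /* in SMB2_open(), once parse_lease_state() finds the "RqLs" context */
        *oplock = server->ops->parse_lease_buf(cc, &oparms->fid->epoch);

        /* later, in smb2_set_fid(), when the handle is installed */
        server->ops->set_oplock_level(cinode, oplock, fid->epoch,
                                      &fid->purge_cache);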
841struct smb_version_operations smb20_operations = {
600 .compare_fids = smb2_compare_fids, 842 .compare_fids = smb2_compare_fids,
601 .setup_request = smb2_setup_request, 843 .setup_request = smb2_setup_request,
602 .setup_async_request = smb2_setup_async_request, 844 .setup_async_request = smb2_setup_async_request,
@@ -638,6 +880,7 @@ struct smb_version_operations smb21_operations = {
638 .unlink = smb2_unlink, 880 .unlink = smb2_unlink,
639 .rename = smb2_rename_path, 881 .rename = smb2_rename_path,
640 .create_hardlink = smb2_create_hardlink, 882 .create_hardlink = smb2_create_hardlink,
883 .query_symlink = smb2_query_symlink,
641 .open = smb2_open_file, 884 .open = smb2_open_file,
642 .set_fid = smb2_set_fid, 885 .set_fid = smb2_set_fid,
643 .close = smb2_close_file, 886 .close = smb2_close_file,
@@ -660,8 +903,82 @@ struct smb_version_operations smb21_operations = {
660 .set_lease_key = smb2_set_lease_key, 903 .set_lease_key = smb2_set_lease_key,
661 .new_lease_key = smb2_new_lease_key, 904 .new_lease_key = smb2_new_lease_key,
662 .calc_signature = smb2_calc_signature, 905 .calc_signature = smb2_calc_signature,
906 .is_read_op = smb2_is_read_op,
907 .set_oplock_level = smb2_set_oplock_level,
908 .create_lease_buf = smb2_create_lease_buf,
909 .parse_lease_buf = smb2_parse_lease_buf,
663}; 910};
664 911
912struct smb_version_operations smb21_operations = {
913 .compare_fids = smb2_compare_fids,
914 .setup_request = smb2_setup_request,
915 .setup_async_request = smb2_setup_async_request,
916 .check_receive = smb2_check_receive,
917 .add_credits = smb2_add_credits,
918 .set_credits = smb2_set_credits,
919 .get_credits_field = smb2_get_credits_field,
920 .get_credits = smb2_get_credits,
921 .get_next_mid = smb2_get_next_mid,
922 .read_data_offset = smb2_read_data_offset,
923 .read_data_length = smb2_read_data_length,
924 .map_error = map_smb2_to_linux_error,
925 .find_mid = smb2_find_mid,
926 .check_message = smb2_check_message,
927 .dump_detail = smb2_dump_detail,
928 .clear_stats = smb2_clear_stats,
929 .print_stats = smb2_print_stats,
930 .is_oplock_break = smb2_is_valid_oplock_break,
931 .need_neg = smb2_need_neg,
932 .negotiate = smb2_negotiate,
933 .negotiate_wsize = smb2_negotiate_wsize,
934 .negotiate_rsize = smb2_negotiate_rsize,
935 .sess_setup = SMB2_sess_setup,
936 .logoff = SMB2_logoff,
937 .tree_connect = SMB2_tcon,
938 .tree_disconnect = SMB2_tdis,
939 .is_path_accessible = smb2_is_path_accessible,
940 .can_echo = smb2_can_echo,
941 .echo = SMB2_echo,
942 .query_path_info = smb2_query_path_info,
943 .get_srv_inum = smb2_get_srv_inum,
944 .query_file_info = smb2_query_file_info,
945 .set_path_size = smb2_set_path_size,
946 .set_file_size = smb2_set_file_size,
947 .set_file_info = smb2_set_file_info,
948 .mkdir = smb2_mkdir,
949 .mkdir_setinfo = smb2_mkdir_setinfo,
950 .rmdir = smb2_rmdir,
951 .unlink = smb2_unlink,
952 .rename = smb2_rename_path,
953 .create_hardlink = smb2_create_hardlink,
954 .query_symlink = smb2_query_symlink,
955 .open = smb2_open_file,
956 .set_fid = smb2_set_fid,
957 .close = smb2_close_file,
958 .flush = smb2_flush_file,
959 .async_readv = smb2_async_readv,
960 .async_writev = smb2_async_writev,
961 .sync_read = smb2_sync_read,
962 .sync_write = smb2_sync_write,
963 .query_dir_first = smb2_query_dir_first,
964 .query_dir_next = smb2_query_dir_next,
965 .close_dir = smb2_close_dir,
966 .calc_smb_size = smb2_calc_size,
967 .is_status_pending = smb2_is_status_pending,
968 .oplock_response = smb2_oplock_response,
969 .queryfs = smb2_queryfs,
970 .mand_lock = smb2_mand_lock,
971 .mand_unlock_range = smb2_unlock_range,
972 .push_mand_locks = smb2_push_mandatory_locks,
973 .get_lease_key = smb2_get_lease_key,
974 .set_lease_key = smb2_set_lease_key,
975 .new_lease_key = smb2_new_lease_key,
976 .calc_signature = smb2_calc_signature,
977 .is_read_op = smb21_is_read_op,
978 .set_oplock_level = smb21_set_oplock_level,
979 .create_lease_buf = smb2_create_lease_buf,
980 .parse_lease_buf = smb2_parse_lease_buf,
981};
665 982
666struct smb_version_operations smb30_operations = { 983struct smb_version_operations smb30_operations = {
667 .compare_fids = smb2_compare_fids, 984 .compare_fids = smb2_compare_fids,
@@ -706,6 +1023,7 @@ struct smb_version_operations smb30_operations = {
706 .unlink = smb2_unlink, 1023 .unlink = smb2_unlink,
707 .rename = smb2_rename_path, 1024 .rename = smb2_rename_path,
708 .create_hardlink = smb2_create_hardlink, 1025 .create_hardlink = smb2_create_hardlink,
1026 .query_symlink = smb2_query_symlink,
709 .open = smb2_open_file, 1027 .open = smb2_open_file,
710 .set_fid = smb2_set_fid, 1028 .set_fid = smb2_set_fid,
711 .close = smb2_close_file, 1029 .close = smb2_close_file,
@@ -729,6 +1047,10 @@ struct smb_version_operations smb30_operations = {
729 .new_lease_key = smb2_new_lease_key, 1047 .new_lease_key = smb2_new_lease_key,
730 .generate_signingkey = generate_smb3signingkey, 1048 .generate_signingkey = generate_smb3signingkey,
731 .calc_signature = smb3_calc_signature, 1049 .calc_signature = smb3_calc_signature,
1050 .is_read_op = smb21_is_read_op,
1051 .set_oplock_level = smb3_set_oplock_level,
1052 .create_lease_buf = smb3_create_lease_buf,
1053 .parse_lease_buf = smb3_parse_lease_buf,
732}; 1054};
733 1055
734struct smb_version_values smb20_values = { 1056struct smb_version_values smb20_values = {
@@ -746,9 +1068,9 @@ struct smb_version_values smb20_values = {
746 .cap_unix = 0, 1068 .cap_unix = 0,
747 .cap_nt_find = SMB2_NT_FIND, 1069 .cap_nt_find = SMB2_NT_FIND,
748 .cap_large_files = SMB2_LARGE_FILES, 1070 .cap_large_files = SMB2_LARGE_FILES,
749 .oplock_read = SMB2_OPLOCK_LEVEL_II,
750 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, 1071 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
751 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, 1072 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
1073 .create_lease_size = sizeof(struct create_lease),
752}; 1074};
753 1075
754struct smb_version_values smb21_values = { 1076struct smb_version_values smb21_values = {
@@ -766,9 +1088,9 @@ struct smb_version_values smb21_values = {
766 .cap_unix = 0, 1088 .cap_unix = 0,
767 .cap_nt_find = SMB2_NT_FIND, 1089 .cap_nt_find = SMB2_NT_FIND,
768 .cap_large_files = SMB2_LARGE_FILES, 1090 .cap_large_files = SMB2_LARGE_FILES,
769 .oplock_read = SMB2_OPLOCK_LEVEL_II,
770 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, 1091 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
771 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, 1092 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
1093 .create_lease_size = sizeof(struct create_lease),
772}; 1094};
773 1095
774struct smb_version_values smb30_values = { 1096struct smb_version_values smb30_values = {
@@ -786,9 +1108,9 @@ struct smb_version_values smb30_values = {
786 .cap_unix = 0, 1108 .cap_unix = 0,
787 .cap_nt_find = SMB2_NT_FIND, 1109 .cap_nt_find = SMB2_NT_FIND,
788 .cap_large_files = SMB2_LARGE_FILES, 1110 .cap_large_files = SMB2_LARGE_FILES,
789 .oplock_read = SMB2_OPLOCK_LEVEL_II,
790 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, 1111 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
791 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, 1112 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
1113 .create_lease_size = sizeof(struct create_lease_v2),
792}; 1114};
793 1115
794struct smb_version_values smb302_values = { 1116struct smb_version_values smb302_values = {
@@ -806,7 +1128,7 @@ struct smb_version_values smb302_values = {
806 .cap_unix = 0, 1128 .cap_unix = 0,
807 .cap_nt_find = SMB2_NT_FIND, 1129 .cap_nt_find = SMB2_NT_FIND,
808 .cap_large_files = SMB2_LARGE_FILES, 1130 .cap_large_files = SMB2_LARGE_FILES,
809 .oplock_read = SMB2_OPLOCK_LEVEL_II,
810 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, 1131 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
811 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, 1132 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
1133 .create_lease_size = sizeof(struct create_lease_v2),
812}; 1134};
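Net effect of the smb2ops.c changes: SMB2.0 gets its own operations table (it keeps classic oplock levels), SMB2.1 switches to lease bitmaps, and SMB3 adds the epoch and the V2 lease context, while create_lease_size in smb_version_values lets shared PDU code size the create context without knowing the dialect:

        /* in add_lease_context(), the dialect is picked purely by data */
        iov[num].iov_base = server->ops->create_lease_buf(oplock + 1, *oplock);
        /* sizeof(struct create_lease) for 2.0/2.1, create_lease_v2 for 3.x */
        iov[num].iov_len = server->vals->create_lease_size;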
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index abc9c2809b51..eba0efde66d7 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -478,12 +478,20 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
478 } 478 }
479 479
480 /* 480 /*
481 * If we are here due to a reconnect, free the per-SMB-session key
482 * in case signing was required.
483 */
484 kfree(ses->auth_key.response);
485 ses->auth_key.response = NULL;
486
487 /*
481 * If memory allocation is successful, caller of this function 488 * If memory allocation is successful, caller of this function
482 * frees it. 489 * frees it.
483 */ 490 */
484 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); 491 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
485 if (!ses->ntlmssp) 492 if (!ses->ntlmssp)
486 return -ENOMEM; 493 return -ENOMEM;
494 ses->ntlmssp->sesskey_per_smbsess = true;
487 495
488 /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ 496 /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
489 ses->sectype = RawNTLMSSP; 497 ses->sectype = RawNTLMSSP;
@@ -628,6 +636,40 @@ ssetup_exit:
628 /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ 636 /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
629 if ((phase == NtLmChallenge) && (rc == 0)) 637 if ((phase == NtLmChallenge) && (rc == 0))
630 goto ssetup_ntlmssp_authenticate; 638 goto ssetup_ntlmssp_authenticate;
639
640 if (!rc) {
641 mutex_lock(&server->srv_mutex);
642 if (server->sign && server->ops->generate_signingkey) {
643 rc = server->ops->generate_signingkey(ses);
644 kfree(ses->auth_key.response);
645 ses->auth_key.response = NULL;
646 if (rc) {
647 cifs_dbg(FYI,
648 "SMB3 session key generation failed\n");
649 mutex_unlock(&server->srv_mutex);
650 goto keygen_exit;
651 }
652 }
653 if (!server->session_estab) {
654 server->sequence_number = 0x2;
655 server->session_estab = true;
656 }
657 mutex_unlock(&server->srv_mutex);
658
659 cifs_dbg(FYI, "SMB2/3 session established successfully\n");
660 spin_lock(&GlobalMid_Lock);
661 ses->status = CifsGood;
662 ses->need_reconnect = false;
663 spin_unlock(&GlobalMid_Lock);
664 }
665
666keygen_exit:
667 if (!server->sign) {
668 kfree(ses->auth_key.response);
669 ses->auth_key.response = NULL;
670 }
671 kfree(ses->ntlmssp);
672
631 return rc; 673 return rc;
632} 674}
633 675
@@ -813,39 +855,6 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
813 return rc; 855 return rc;
814} 856}
815 857
816static struct create_lease *
817create_lease_buf(u8 *lease_key, u8 oplock)
818{
819 struct create_lease *buf;
820
821 buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL);
822 if (!buf)
823 return NULL;
824
825 buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
826 buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
827 if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
828 buf->lcontext.LeaseState = SMB2_LEASE_WRITE_CACHING |
829 SMB2_LEASE_READ_CACHING;
830 else if (oplock == SMB2_OPLOCK_LEVEL_II)
831 buf->lcontext.LeaseState = SMB2_LEASE_READ_CACHING;
832 else if (oplock == SMB2_OPLOCK_LEVEL_BATCH)
833 buf->lcontext.LeaseState = SMB2_LEASE_HANDLE_CACHING |
834 SMB2_LEASE_READ_CACHING |
835 SMB2_LEASE_WRITE_CACHING;
836
837 buf->ccontext.DataOffset = cpu_to_le16(offsetof
838 (struct create_lease, lcontext));
839 buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context));
840 buf->ccontext.NameOffset = cpu_to_le16(offsetof
841 (struct create_lease, Name));
842 buf->ccontext.NameLength = cpu_to_le16(4);
843 buf->Name[0] = 'R';
844 buf->Name[1] = 'q';
845 buf->Name[2] = 'L';
846 buf->Name[3] = 's';
847 return buf;
848}
849 858
850static struct create_durable * 859static struct create_durable *
851create_durable_buf(void) 860create_durable_buf(void)
@@ -894,55 +903,49 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
894} 903}
895 904
896static __u8 905static __u8
897parse_lease_state(struct smb2_create_rsp *rsp) 906parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
907 unsigned int *epoch)
898{ 908{
899 char *data_offset; 909 char *data_offset;
900 struct create_lease *lc; 910 struct create_context *cc;
901 bool found = false;
902 unsigned int next = 0; 911 unsigned int next = 0;
903 char *name; 912 char *name;
904 913
905 data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset); 914 data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
906 lc = (struct create_lease *)data_offset; 915 cc = (struct create_context *)data_offset;
907 do { 916 do {
908 lc = (struct create_lease *)((char *)lc + next); 917 cc = (struct create_context *)((char *)cc + next);
909 name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc; 918 name = le16_to_cpu(cc->NameOffset) + (char *)cc;
910 if (le16_to_cpu(lc->ccontext.NameLength) != 4 || 919 if (le16_to_cpu(cc->NameLength) != 4 ||
911 strncmp(name, "RqLs", 4)) { 920 strncmp(name, "RqLs", 4)) {
912 next = le32_to_cpu(lc->ccontext.Next); 921 next = le32_to_cpu(cc->Next);
913 continue; 922 continue;
914 } 923 }
915 if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) 924 return server->ops->parse_lease_buf(cc, epoch);
916 return SMB2_OPLOCK_LEVEL_NOCHANGE;
917 found = true;
918 break;
919 } while (next != 0); 925 } while (next != 0);
920 926
921 if (!found) 927 return 0;
922 return 0;
923
924 return smb2_map_lease_to_oplock(lc->lcontext.LeaseState);
925} 928}
926 929
927static int 930static int
928add_lease_context(struct kvec *iov, unsigned int *num_iovec, __u8 *oplock) 931add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
932 unsigned int *num_iovec, __u8 *oplock)
929{ 933{
930 struct smb2_create_req *req = iov[0].iov_base; 934 struct smb2_create_req *req = iov[0].iov_base;
931 unsigned int num = *num_iovec; 935 unsigned int num = *num_iovec;
932 936
933 iov[num].iov_base = create_lease_buf(oplock+1, *oplock); 937 iov[num].iov_base = server->ops->create_lease_buf(oplock+1, *oplock);
934 if (iov[num].iov_base == NULL) 938 if (iov[num].iov_base == NULL)
935 return -ENOMEM; 939 return -ENOMEM;
936 iov[num].iov_len = sizeof(struct create_lease); 940 iov[num].iov_len = server->vals->create_lease_size;
937 req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE; 941 req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
938 if (!req->CreateContextsOffset) 942 if (!req->CreateContextsOffset)
939 req->CreateContextsOffset = cpu_to_le32( 943 req->CreateContextsOffset = cpu_to_le32(
940 sizeof(struct smb2_create_req) - 4 + 944 sizeof(struct smb2_create_req) - 4 +
941 iov[num - 1].iov_len); 945 iov[num - 1].iov_len);
942 req->CreateContextsLength = cpu_to_le32( 946 le32_add_cpu(&req->CreateContextsLength,
943 le32_to_cpu(req->CreateContextsLength) + 947 server->vals->create_lease_size);
944 sizeof(struct create_lease)); 948 inc_rfc1001_len(&req->hdr, server->vals->create_lease_size);
945 inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
946 *num_iovec = num + 1; 949 *num_iovec = num + 1;
947 return 0; 950 return 0;
948} 951}
@@ -967,9 +970,7 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec,
967 req->CreateContextsOffset = 970 req->CreateContextsOffset =
968 cpu_to_le32(sizeof(struct smb2_create_req) - 4 + 971 cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
969 iov[1].iov_len); 972 iov[1].iov_len);
970 req->CreateContextsLength = 973 le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable));
971 cpu_to_le32(le32_to_cpu(req->CreateContextsLength) +
972 sizeof(struct create_durable));
973 inc_rfc1001_len(&req->hdr, sizeof(struct create_durable)); 974 inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
974 *num_iovec = num + 1; 975 *num_iovec = num + 1;
975 return 0; 976 return 0;
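le32_add_cpu() collapses the open-coded read-modify-write on CreateContextsLength. In kernel terms it is exactly the endian-safe increment:

        /* le32_add_cpu(&v, n) is shorthand for */
        v = cpu_to_le32(le32_to_cpu(v) + n);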
@@ -977,7 +978,8 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec,
977 978
978int 979int
979SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, 980SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
980 __u8 *oplock, struct smb2_file_all_info *buf) 981 __u8 *oplock, struct smb2_file_all_info *buf,
982 struct smb2_err_rsp **err_buf)
981{ 983{
982 struct smb2_create_req *req; 984 struct smb2_create_req *req;
983 struct smb2_create_rsp *rsp; 985 struct smb2_create_rsp *rsp;
@@ -1048,11 +1050,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
1048 if (!server->oplocks) 1050 if (!server->oplocks)
1049 *oplock = SMB2_OPLOCK_LEVEL_NONE; 1051 *oplock = SMB2_OPLOCK_LEVEL_NONE;
1050 1052
1051 if (!(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) || 1053 if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) ||
1052 *oplock == SMB2_OPLOCK_LEVEL_NONE) 1054 *oplock == SMB2_OPLOCK_LEVEL_NONE)
1053 req->RequestedOplockLevel = *oplock; 1055 req->RequestedOplockLevel = *oplock;
1054 else { 1056 else {
1055 rc = add_lease_context(iov, &num_iovecs, oplock); 1057 rc = add_lease_context(server, iov, &num_iovecs, oplock);
1056 if (rc) { 1058 if (rc) {
1057 cifs_small_buf_release(req); 1059 cifs_small_buf_release(req);
1058 kfree(copy_path); 1060 kfree(copy_path);
@@ -1062,11 +1064,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
1062 1064
1063 if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) { 1065 if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
1064 /* need to set Next field of lease context if we request it */ 1066 /* need to set Next field of lease context if we request it */
1065 if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) { 1067 if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
1066 struct create_context *ccontext = 1068 struct create_context *ccontext =
1067 (struct create_context *)iov[num_iovecs-1].iov_base; 1069 (struct create_context *)iov[num_iovecs-1].iov_base;
1068 ccontext->Next = 1070 ccontext->Next =
1069 cpu_to_le32(sizeof(struct create_lease)); 1071 cpu_to_le32(server->vals->create_lease_size);
1070 } 1072 }
1071 rc = add_durable_context(iov, &num_iovecs, oparms); 1073 rc = add_durable_context(iov, &num_iovecs, oparms);
1072 if (rc) { 1074 if (rc) {
@@ -1082,6 +1084,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
1082 1084
1083 if (rc != 0) { 1085 if (rc != 0) {
1084 cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); 1086 cifs_stats_fail_inc(tcon, SMB2_CREATE_HE);
1087 if (err_buf)
1088 *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4,
1089 GFP_KERNEL);
1085 goto creat_exit; 1090 goto creat_exit;
1086 } 1091 }
1087 1092
@@ -1098,7 +1103,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
1098 } 1103 }
1099 1104
1100 if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) 1105 if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
1101 *oplock = parse_lease_state(rsp); 1106 *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch);
1102 else 1107 else
1103 *oplock = rsp->OplockLevel; 1108 *oplock = rsp->OplockLevel;
1104creat_exit: 1109creat_exit:
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 36b0d37ea69b..b83d0118a757 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -150,6 +150,20 @@ struct smb2_err_rsp {
150 __u8 ErrorData[1]; /* variable length */ 150 __u8 ErrorData[1]; /* variable length */
151} __packed; 151} __packed;
152 152
153struct smb2_symlink_err_rsp {
154 __le32 SymLinkLength;
155 __le32 SymLinkErrorTag;
156 __le32 ReparseTag;
157 __le16 ReparseDataLength;
158 __le16 UnparsedPathLength;
159 __le16 SubstituteNameOffset;
160 __le16 SubstituteNameLength;
161 __le16 PrintNameOffset;
162 __le16 PrintNameLength;
163 __le32 Flags;
164 __u8 PathBuffer[0];
165} __packed;
166
153#define SMB2_CLIENT_GUID_SIZE 16 167#define SMB2_CLIENT_GUID_SIZE 16
154 168
155extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; 169extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
@@ -462,6 +476,10 @@ struct create_context {
462 __u8 Buffer[0]; 476 __u8 Buffer[0];
463} __packed; 477} __packed;
464 478
479#define SMB2_LEASE_READ_CACHING_HE 0x01
480#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
481#define SMB2_LEASE_WRITE_CACHING_HE 0x04
482
465#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00) 483#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00)
466#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01) 484#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01)
467#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02) 485#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02)
@@ -479,12 +497,31 @@ struct lease_context {
479 __le64 LeaseDuration; 497 __le64 LeaseDuration;
480} __packed; 498} __packed;
481 499
500struct lease_context_v2 {
501 __le64 LeaseKeyLow;
502 __le64 LeaseKeyHigh;
503 __le32 LeaseState;
504 __le32 LeaseFlags;
505 __le64 LeaseDuration;
506 __le64 ParentLeaseKeyLow;
507 __le64 ParentLeaseKeyHigh;
508 __le16 Epoch;
509 __le16 Reserved;
510} __packed;
511
482struct create_lease { 512struct create_lease {
483 struct create_context ccontext; 513 struct create_context ccontext;
484 __u8 Name[8]; 514 __u8 Name[8];
485 struct lease_context lcontext; 515 struct lease_context lcontext;
486} __packed; 516} __packed;
487 517
518struct create_lease_v2 {
519 struct create_context ccontext;
520 __u8 Name[8];
521 struct lease_context_v2 lcontext;
522 __u8 Pad[4];
523} __packed;
524
488struct create_durable { 525struct create_durable {
489 struct create_context ccontext; 526 struct create_context ccontext;
490 __u8 Name[8]; 527 __u8 Name[8];
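The new wire structures have to stay bit-exact. Quick compile-time checks, to drop into any function; the sizes are my own arithmetic (assuming the usual 16-byte create_context header), not from the patch:

        BUILD_BUG_ON(sizeof(struct lease_context) != 32);
        BUILD_BUG_ON(sizeof(struct lease_context_v2) != 52);
        BUILD_BUG_ON(sizeof(struct create_lease) != 56);    /* 16 + 8 + 32 */
        BUILD_BUG_ON(sizeof(struct create_lease_v2) != 80); /* 16 + 8 + 52 + 4 */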
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 1a5ecbed40ed..e3fb4801ee96 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -53,7 +53,6 @@ extern int smb3_calc_signature(struct smb_rqst *rqst,
53 struct TCP_Server_Info *server); 53 struct TCP_Server_Info *server);
54extern void smb2_echo_request(struct work_struct *work); 54extern void smb2_echo_request(struct work_struct *work);
55extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); 55extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
56extern __u8 smb2_map_lease_to_oplock(__le32 lease_state);
57extern bool smb2_is_valid_oplock_break(char *buffer, 56extern bool smb2_is_valid_oplock_break(char *buffer,
58 struct TCP_Server_Info *srv); 57 struct TCP_Server_Info *srv);
59 58
@@ -87,7 +86,6 @@ extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
87extern int smb2_open_file(const unsigned int xid, 86extern int smb2_open_file(const unsigned int xid,
88 struct cifs_open_parms *oparms, 87 struct cifs_open_parms *oparms,
89 __u32 *oplock, FILE_ALL_INFO *buf); 88 __u32 *oplock, FILE_ALL_INFO *buf);
90extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
91extern int smb2_unlock_range(struct cifsFileInfo *cfile, 89extern int smb2_unlock_range(struct cifsFileInfo *cfile,
92 struct file_lock *flock, const unsigned int xid); 90 struct file_lock *flock, const unsigned int xid);
93extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); 91extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile);
@@ -106,7 +104,8 @@ extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
106extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); 104extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
107extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, 105extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
108 __le16 *path, __u8 *oplock, 106 __le16 *path, __u8 *oplock,
109 struct smb2_file_all_info *buf); 107 struct smb2_file_all_info *buf,
108 struct smb2_err_rsp **err_buf);
110extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, 109extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
111 u64 persistent_fid, u64 volatile_fid, u32 opcode, 110 u64 persistent_fid, u64 volatile_fid, u32 opcode,
112 bool is_fsctl, char *in_data, u32 indatalen, 111 bool is_fsctl, char *in_data, u32 indatalen,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 4f2300d020c7..340abca3aa52 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -114,6 +114,23 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
114 return 0; 114 return 0;
115} 115}
116 116
117static struct cifs_ses *
118smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server)
119{
120 struct cifs_ses *ses;
121
122 spin_lock(&cifs_tcp_ses_lock);
123 list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
124 if (ses->Suid != smb2hdr->SessionId)
125 continue;
126 spin_unlock(&cifs_tcp_ses_lock);
127 return ses;
128 }
129 spin_unlock(&cifs_tcp_ses_lock);
130
131 return NULL;
132}
133
117 134
118int 135int
119smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 136smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
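Because SMB2 signatures now use the per-session auth_key instead of the connection-wide session_key (see the setkey change below), the signer first has to map the PDU back to its session; smb2_find_smb_ses() does that by SessionId under cifs_tcp_ses_lock. Note the pointer is returned after the lock is dropped, so this relies on the session outliving the send, as the callers guarantee. The lookup reduced to its core:

        /* simplified: match the PDU's SessionId against each session's Suid */
        list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list)
                if (ses->Suid == smb2hdr->SessionId)
                        return ses;
        return NULL;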
@@ -124,6 +141,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
124 struct kvec *iov = rqst->rq_iov; 141 struct kvec *iov = rqst->rq_iov;
125 int n_vec = rqst->rq_nvec; 142 int n_vec = rqst->rq_nvec;
126 struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; 143 struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
144 struct cifs_ses *ses;
145
146 ses = smb2_find_smb_ses(smb2_pdu, server);
147 if (!ses) {
148 cifs_dbg(VFS, "%s: Could not find session\n", __func__);
149 return 0;
150 }
127 151
128 memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); 152 memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
129 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); 153 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
@@ -135,7 +159,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
135 } 159 }
136 160
137 rc = crypto_shash_setkey(server->secmech.hmacsha256, 161 rc = crypto_shash_setkey(server->secmech.hmacsha256,
138 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); 162 ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
139 if (rc) { 163 if (rc) {
140 cifs_dbg(VFS, "%s: Could not update with response\n", __func__); 164 cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
141 return rc; 165 return rc;
@@ -198,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
198 return rc; 222 return rc;
199} 223}
200 224
201void 225int
202generate_smb3signingkey(struct TCP_Server_Info *server) 226generate_smb3signingkey(struct cifs_ses *ses)
203{ 227{
204 unsigned char zero = 0x0; 228 unsigned char zero = 0x0;
205 __u8 i[4] = {0, 0, 0, 1}; 229 __u8 i[4] = {0, 0, 0, 1};
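The rewritten generate_smb3signingkey() in the hunk below is the SP800-108 counter-mode KDF that SMB3 mandates, now keyed per session and returning an error instead of void: signing key = HMAC-SHA256(session key, i || label || 0x00 || context || L), with i = 1, label = "SMB2AESCMAC" (12 bytes with its NUL, hence the length passed above), context = "SmbSign" (8 bytes with its NUL), and L = the output length in bits. A userspace OpenSSL harness for cross-checking, not kernel code; the L value 0x00000080 is an assumption, since its definition sits just above this hunk:

#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <stdint.h>
#include <string.h>

/* derive the 16-byte SMB3 signing key from the 16-byte session key */
static void smb3_signing_key(const uint8_t sess_key[16], uint8_t out[16])
{
        uint8_t msg[4 + 12 + 1 + 8 + 4];
        uint8_t hash[32];
        unsigned int hlen = sizeof(hash);
        uint8_t *p = msg;

        memcpy(p, "\x00\x00\x00\x01", 4);  p += 4;   /* counter i = 1 */
        memcpy(p, "SMB2AESCMAC", 12);      p += 12;  /* label incl. NUL */
        *p++ = 0x00;                                 /* separator byte */
        memcpy(p, "SmbSign", 8);           p += 8;   /* context incl. NUL */
        memcpy(p, "\x00\x00\x00\x80", 4);            /* L = 128 bits (assumed) */

        HMAC(EVP_sha256(), sess_key, 16, msg, sizeof(msg), hash, &hlen);
        memcpy(out, hash, 16);                       /* keep first 128 bits */
}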
@@ -209,90 +233,99 @@ generate_smb3signingkey(struct TCP_Server_Info *server)
 	unsigned char *hashptr = prfhash;
 
 	memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
-	memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+	memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
 
-	rc = smb3_crypto_shash_allocate(server);
+	rc = smb3_crypto_shash_allocate(ses->server);
 	if (rc) {
 		cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_setkey(server->secmech.hmacsha256,
-		server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+	rc = crypto_shash_setkey(ses->server->secmech.hmacsha256,
+		ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacsha256->shash);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				i, 4);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				"SMB2AESCMAC", 12);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				&zero, 1);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				"SmbSign", 8);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				L, 4);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacsha256->shash,
 				hashptr);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
+	memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
 
 smb3signkey_ret:
-	return;
+	return rc;
 }
 
 int
 smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
-	int i, rc;
+	int i;
+	int rc = 0;
 	unsigned char smb3_signature[SMB2_CMACAES_SIZE];
 	unsigned char *sigptr = smb3_signature;
 	struct kvec *iov = rqst->rq_iov;
 	int n_vec = rqst->rq_nvec;
 	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+	struct cifs_ses *ses;
+
+	ses = smb2_find_smb_ses(smb2_pdu, server);
+	if (!ses) {
+		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+		return 0;
+	}
 
 	memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
 	memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
 
 	rc = crypto_shash_setkey(server->secmech.cmacaes,
-				 server->smb3signingkey, SMB2_CMACAES_SIZE);
+				 ses->smb3signingkey, SMB2_CMACAES_SIZE);
+
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
 		return rc;
@@ -389,6 +422,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
 
 	if ((smb2_pdu->Command == SMB2_NEGOTIATE) ||
+	    (smb2_pdu->Command == SMB2_SESSION_SETUP) ||
 	    (smb2_pdu->Command == SMB2_OPLOCK_BREAK) ||
 	    (!server->session_estab))
 		return 0;
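For reference, generate_smb3signingkey() above is a single-block key derivation in the style of NIST SP800-108 counter mode. A sketch of the HMAC-SHA256 input it assembles (this is a reading of the code, not driver source; L is defined in context not shown here and is assumed to hold the derived key length in bits):

	/*
	 * sketch only:
	 *
	 *   smb3signingkey = HMAC-SHA256(session_key,
	 *           be32(1) || "SMB2AESCMAC\0" || 0x00 || "SmbSign\0" || L)
	 *
	 * i[4] = {0, 0, 0, 1} is the big-endian block counter; the label and
	 * context strings are fed NUL-terminated, matching the lengths (12
	 * and 8) passed to crypto_shash_update() above, with a lone 0x00
	 * byte separating them.
	 */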
diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c
new file mode 100644
index 000000000000..1506d4fddb2c
--- /dev/null
+++ b/fs/cifs/winucase.c
@@ -0,0 +1,663 @@
1/*
2 * fs/cifs/winucase.c
3 *
4 * Copyright (c) Jeffrey Layton <jlayton@redhat.com>, 2013
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14 * the GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * The const tables in this file were converted from the following info
21 * provided by Microsoft:
22 *
23 * 3.1.5.3 Mapping UTF-16 Strings to Upper Case:
24 *
25 * http://msdn.microsoft.com/en-us/library/hh877830.aspx
26 * http://www.microsoft.com/en-us/download/details.aspx?displaylang=en&id=10921
27 *
28 * In particular, the table in "Windows 8 Upper Case Mapping Table.txt" was
29 * post-processed using the winucase_convert.pl script.
30 */
31
32#include <linux/nls.h>
33
34wchar_t cifs_toupper(wchar_t in); /* quiet sparse */
35
36static const wchar_t t2_00[256] = {
37 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
38 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
39 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
40 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
41 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
42 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
43 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
44 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
45 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
46 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
47 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
48 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
49 0x0000, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
50 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
51 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
52 0x0058, 0x0059, 0x005a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
53 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
54 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
55 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
56 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
57 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
58 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
59 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
60 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
61 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
62 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
63 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
64 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
65 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
66 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
67 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0000,
68 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178,
69};
70
71static const wchar_t t2_01[256] = {
72 0x0000, 0x0100, 0x0000, 0x0102, 0x0000, 0x0104, 0x0000, 0x0106,
73 0x0000, 0x0108, 0x0000, 0x010a, 0x0000, 0x010c, 0x0000, 0x010e,
74 0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0000, 0x0116,
75 0x0000, 0x0118, 0x0000, 0x011a, 0x0000, 0x011c, 0x0000, 0x011e,
76 0x0000, 0x0120, 0x0000, 0x0122, 0x0000, 0x0124, 0x0000, 0x0126,
77 0x0000, 0x0128, 0x0000, 0x012a, 0x0000, 0x012c, 0x0000, 0x012e,
78 0x0000, 0x0000, 0x0000, 0x0132, 0x0000, 0x0134, 0x0000, 0x0136,
79 0x0000, 0x0000, 0x0139, 0x0000, 0x013b, 0x0000, 0x013d, 0x0000,
80 0x013f, 0x0000, 0x0141, 0x0000, 0x0143, 0x0000, 0x0145, 0x0000,
81 0x0147, 0x0000, 0x0000, 0x014a, 0x0000, 0x014c, 0x0000, 0x014e,
82 0x0000, 0x0150, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156,
83 0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x0000, 0x015e,
84 0x0000, 0x0160, 0x0000, 0x0162, 0x0000, 0x0164, 0x0000, 0x0166,
85 0x0000, 0x0168, 0x0000, 0x016a, 0x0000, 0x016c, 0x0000, 0x016e,
86 0x0000, 0x0170, 0x0000, 0x0172, 0x0000, 0x0174, 0x0000, 0x0176,
87 0x0000, 0x0000, 0x0179, 0x0000, 0x017b, 0x0000, 0x017d, 0x0000,
88 0x0243, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0000, 0x0000,
89 0x0187, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x0000,
90 0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x01f6, 0x0000, 0x0000,
91 0x0000, 0x0198, 0x023d, 0x0000, 0x0000, 0x0000, 0x0220, 0x0000,
92 0x0000, 0x01a0, 0x0000, 0x01a2, 0x0000, 0x01a4, 0x0000, 0x0000,
93 0x01a7, 0x0000, 0x0000, 0x0000, 0x0000, 0x01ac, 0x0000, 0x0000,
94 0x01af, 0x0000, 0x0000, 0x0000, 0x01b3, 0x0000, 0x01b5, 0x0000,
95 0x0000, 0x01b8, 0x0000, 0x0000, 0x0000, 0x01bc, 0x0000, 0x01f7,
96 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x01c4, 0x0000,
97 0x0000, 0x01c7, 0x0000, 0x0000, 0x01ca, 0x0000, 0x01cd, 0x0000,
98 0x01cf, 0x0000, 0x01d1, 0x0000, 0x01d3, 0x0000, 0x01d5, 0x0000,
99 0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x018e, 0x0000, 0x01de,
100 0x0000, 0x01e0, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6,
101 0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee,
102 0x0000, 0x0000, 0x0000, 0x01f1, 0x0000, 0x01f4, 0x0000, 0x0000,
103 0x0000, 0x01f8, 0x0000, 0x01fa, 0x0000, 0x01fc, 0x0000, 0x01fe,
104};
105
106static const wchar_t t2_02[256] = {
107 0x0000, 0x0200, 0x0000, 0x0202, 0x0000, 0x0204, 0x0000, 0x0206,
108 0x0000, 0x0208, 0x0000, 0x020a, 0x0000, 0x020c, 0x0000, 0x020e,
109 0x0000, 0x0210, 0x0000, 0x0212, 0x0000, 0x0214, 0x0000, 0x0216,
110 0x0000, 0x0218, 0x0000, 0x021a, 0x0000, 0x021c, 0x0000, 0x021e,
111 0x0000, 0x0000, 0x0000, 0x0222, 0x0000, 0x0224, 0x0000, 0x0226,
112 0x0000, 0x0228, 0x0000, 0x022a, 0x0000, 0x022c, 0x0000, 0x022e,
113 0x0000, 0x0230, 0x0000, 0x0232, 0x0000, 0x0000, 0x0000, 0x0000,
114 0x0000, 0x0000, 0x0000, 0x0000, 0x023b, 0x0000, 0x0000, 0x0000,
115 0x0000, 0x0000, 0x0241, 0x0000, 0x0000, 0x0000, 0x0000, 0x0246,
116 0x0000, 0x0248, 0x0000, 0x024a, 0x0000, 0x024c, 0x0000, 0x024e,
117 0x2c6f, 0x2c6d, 0x0000, 0x0181, 0x0186, 0x0000, 0x0189, 0x018a,
118 0x0000, 0x018f, 0x0000, 0x0190, 0x0000, 0x0000, 0x0000, 0x0000,
119 0x0193, 0x0000, 0x0000, 0x0194, 0x0000, 0x0000, 0x0000, 0x0000,
120 0x0197, 0x0196, 0x0000, 0x2c62, 0x0000, 0x0000, 0x0000, 0x019c,
121 0x0000, 0x2c6e, 0x019d, 0x0000, 0x0000, 0x019f, 0x0000, 0x0000,
122 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c64, 0x0000, 0x0000,
123 0x01a6, 0x0000, 0x0000, 0x01a9, 0x0000, 0x0000, 0x0000, 0x0000,
124 0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x0000, 0x0000, 0x0000,
125 0x0000, 0x0000, 0x01b7, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
126 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
127 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
128 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
129 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
130 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
131 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
132 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
133 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
134 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
135 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
136 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
137 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
138 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
139};
140
141static const wchar_t t2_03[256] = {
142 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
143 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
144 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
145 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
146 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
147 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
148 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
149 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
150 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
151 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
152 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
153 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
154 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
155 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
156 0x0000, 0x0370, 0x0000, 0x0372, 0x0000, 0x0000, 0x0000, 0x0376,
157 0x0000, 0x0000, 0x0000, 0x03fd, 0x03fe, 0x03ff, 0x0000, 0x0000,
158 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
159 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
160 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
161 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
162 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
163 0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0388, 0x0389, 0x038a,
164 0x0000, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
165 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
166 0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
167 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x0000,
168 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x03cf,
169 0x0000, 0x03d8, 0x0000, 0x03da, 0x0000, 0x03dc, 0x0000, 0x03de,
170 0x0000, 0x03e0, 0x0000, 0x03e2, 0x0000, 0x03e4, 0x0000, 0x03e6,
171 0x0000, 0x03e8, 0x0000, 0x03ea, 0x0000, 0x03ec, 0x0000, 0x03ee,
172 0x0000, 0x0000, 0x03f9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
173 0x03f7, 0x0000, 0x0000, 0x03fa, 0x0000, 0x0000, 0x0000, 0x0000,
174};
175
176static const wchar_t t2_04[256] = {
177 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
178 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
179 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
180 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
181 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
182 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
183 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
184 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
185 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
186 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
187 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
188 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f,
189 0x0000, 0x0460, 0x0000, 0x0462, 0x0000, 0x0464, 0x0000, 0x0466,
190 0x0000, 0x0468, 0x0000, 0x046a, 0x0000, 0x046c, 0x0000, 0x046e,
191 0x0000, 0x0470, 0x0000, 0x0472, 0x0000, 0x0474, 0x0000, 0x0476,
192 0x0000, 0x0478, 0x0000, 0x047a, 0x0000, 0x047c, 0x0000, 0x047e,
193 0x0000, 0x0480, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
194 0x0000, 0x0000, 0x0000, 0x048a, 0x0000, 0x048c, 0x0000, 0x048e,
195 0x0000, 0x0490, 0x0000, 0x0492, 0x0000, 0x0494, 0x0000, 0x0496,
196 0x0000, 0x0498, 0x0000, 0x049a, 0x0000, 0x049c, 0x0000, 0x049e,
197 0x0000, 0x04a0, 0x0000, 0x04a2, 0x0000, 0x04a4, 0x0000, 0x04a6,
198 0x0000, 0x04a8, 0x0000, 0x04aa, 0x0000, 0x04ac, 0x0000, 0x04ae,
199 0x0000, 0x04b0, 0x0000, 0x04b2, 0x0000, 0x04b4, 0x0000, 0x04b6,
200 0x0000, 0x04b8, 0x0000, 0x04ba, 0x0000, 0x04bc, 0x0000, 0x04be,
201 0x0000, 0x0000, 0x04c1, 0x0000, 0x04c3, 0x0000, 0x04c5, 0x0000,
202 0x04c7, 0x0000, 0x04c9, 0x0000, 0x04cb, 0x0000, 0x04cd, 0x04c0,
203 0x0000, 0x04d0, 0x0000, 0x04d2, 0x0000, 0x04d4, 0x0000, 0x04d6,
204 0x0000, 0x04d8, 0x0000, 0x04da, 0x0000, 0x04dc, 0x0000, 0x04de,
205 0x0000, 0x04e0, 0x0000, 0x04e2, 0x0000, 0x04e4, 0x0000, 0x04e6,
206 0x0000, 0x04e8, 0x0000, 0x04ea, 0x0000, 0x04ec, 0x0000, 0x04ee,
207 0x0000, 0x04f0, 0x0000, 0x04f2, 0x0000, 0x04f4, 0x0000, 0x04f6,
208 0x0000, 0x04f8, 0x0000, 0x04fa, 0x0000, 0x04fc, 0x0000, 0x04fe,
209};
210
211static const wchar_t t2_05[256] = {
212 0x0000, 0x0500, 0x0000, 0x0502, 0x0000, 0x0504, 0x0000, 0x0506,
213 0x0000, 0x0508, 0x0000, 0x050a, 0x0000, 0x050c, 0x0000, 0x050e,
214 0x0000, 0x0510, 0x0000, 0x0512, 0x0000, 0x0514, 0x0000, 0x0516,
215 0x0000, 0x0518, 0x0000, 0x051a, 0x0000, 0x051c, 0x0000, 0x051e,
216 0x0000, 0x0520, 0x0000, 0x0522, 0x0000, 0x0000, 0x0000, 0x0000,
217 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
218 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
219 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
220 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
221 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
222 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
223 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
224 0x0000, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537,
225 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f,
226 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547,
227 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f,
228 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0000,
229 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
230 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
231 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
232 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
233 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
234 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
235 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
236 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
237 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
238 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
239 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
240 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
241 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
242 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
243 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
244};
245
246static const wchar_t t2_1d[256] = {
247 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
248 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
249 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
250 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
251 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
252 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
253 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
254 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
255 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
256 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
257 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
258 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
259 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
260 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
261 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
262 0x0000, 0xa77d, 0x0000, 0x0000, 0x0000, 0x2c63, 0x0000, 0x0000,
263 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
264 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
265 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
266 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
267 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
268 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
269 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
270 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
271 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
272 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
273 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
274 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
275 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
276 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
277 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
278 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
279};
280
281static const wchar_t t2_1e[256] = {
282 0x0000, 0x1e00, 0x0000, 0x1e02, 0x0000, 0x1e04, 0x0000, 0x1e06,
283 0x0000, 0x1e08, 0x0000, 0x1e0a, 0x0000, 0x1e0c, 0x0000, 0x1e0e,
284 0x0000, 0x1e10, 0x0000, 0x1e12, 0x0000, 0x1e14, 0x0000, 0x1e16,
285 0x0000, 0x1e18, 0x0000, 0x1e1a, 0x0000, 0x1e1c, 0x0000, 0x1e1e,
286 0x0000, 0x1e20, 0x0000, 0x1e22, 0x0000, 0x1e24, 0x0000, 0x1e26,
287 0x0000, 0x1e28, 0x0000, 0x1e2a, 0x0000, 0x1e2c, 0x0000, 0x1e2e,
288 0x0000, 0x1e30, 0x0000, 0x1e32, 0x0000, 0x1e34, 0x0000, 0x1e36,
289 0x0000, 0x1e38, 0x0000, 0x1e3a, 0x0000, 0x1e3c, 0x0000, 0x1e3e,
290 0x0000, 0x1e40, 0x0000, 0x1e42, 0x0000, 0x1e44, 0x0000, 0x1e46,
291 0x0000, 0x1e48, 0x0000, 0x1e4a, 0x0000, 0x1e4c, 0x0000, 0x1e4e,
292 0x0000, 0x1e50, 0x0000, 0x1e52, 0x0000, 0x1e54, 0x0000, 0x1e56,
293 0x0000, 0x1e58, 0x0000, 0x1e5a, 0x0000, 0x1e5c, 0x0000, 0x1e5e,
294 0x0000, 0x1e60, 0x0000, 0x1e62, 0x0000, 0x1e64, 0x0000, 0x1e66,
295 0x0000, 0x1e68, 0x0000, 0x1e6a, 0x0000, 0x1e6c, 0x0000, 0x1e6e,
296 0x0000, 0x1e70, 0x0000, 0x1e72, 0x0000, 0x1e74, 0x0000, 0x1e76,
297 0x0000, 0x1e78, 0x0000, 0x1e7a, 0x0000, 0x1e7c, 0x0000, 0x1e7e,
298 0x0000, 0x1e80, 0x0000, 0x1e82, 0x0000, 0x1e84, 0x0000, 0x1e86,
299 0x0000, 0x1e88, 0x0000, 0x1e8a, 0x0000, 0x1e8c, 0x0000, 0x1e8e,
300 0x0000, 0x1e90, 0x0000, 0x1e92, 0x0000, 0x1e94, 0x0000, 0x0000,
301 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
302 0x0000, 0x1ea0, 0x0000, 0x1ea2, 0x0000, 0x1ea4, 0x0000, 0x1ea6,
303 0x0000, 0x1ea8, 0x0000, 0x1eaa, 0x0000, 0x1eac, 0x0000, 0x1eae,
304 0x0000, 0x1eb0, 0x0000, 0x1eb2, 0x0000, 0x1eb4, 0x0000, 0x1eb6,
305 0x0000, 0x1eb8, 0x0000, 0x1eba, 0x0000, 0x1ebc, 0x0000, 0x1ebe,
306 0x0000, 0x1ec0, 0x0000, 0x1ec2, 0x0000, 0x1ec4, 0x0000, 0x1ec6,
307 0x0000, 0x1ec8, 0x0000, 0x1eca, 0x0000, 0x1ecc, 0x0000, 0x1ece,
308 0x0000, 0x1ed0, 0x0000, 0x1ed2, 0x0000, 0x1ed4, 0x0000, 0x1ed6,
309 0x0000, 0x1ed8, 0x0000, 0x1eda, 0x0000, 0x1edc, 0x0000, 0x1ede,
310 0x0000, 0x1ee0, 0x0000, 0x1ee2, 0x0000, 0x1ee4, 0x0000, 0x1ee6,
311 0x0000, 0x1ee8, 0x0000, 0x1eea, 0x0000, 0x1eec, 0x0000, 0x1eee,
312 0x0000, 0x1ef0, 0x0000, 0x1ef2, 0x0000, 0x1ef4, 0x0000, 0x1ef6,
313 0x0000, 0x1ef8, 0x0000, 0x1efa, 0x0000, 0x1efc, 0x0000, 0x1efe,
314};
315
316static const wchar_t t2_1f[256] = {
317 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f,
318 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
319 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x0000, 0x0000,
320 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
321 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f,
322 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
323 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f,
324 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
325 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, 0x1f4c, 0x1f4d, 0x0000, 0x0000,
326 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
327 0x0000, 0x1f59, 0x0000, 0x1f5b, 0x0000, 0x1f5d, 0x0000, 0x1f5f,
328 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
329 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f,
330 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
331 0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, 0x1fda, 0x1fdb,
332 0x1ff8, 0x1ff9, 0x1fea, 0x1feb, 0x1ffa, 0x1ffb, 0x0000, 0x0000,
333 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f,
334 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
335 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f,
336 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
337 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf,
338 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
339 0x1fb8, 0x1fb9, 0x0000, 0x1fbc, 0x0000, 0x0000, 0x0000, 0x0000,
340 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
341 0x0000, 0x0000, 0x0000, 0x1fcc, 0x0000, 0x0000, 0x0000, 0x0000,
342 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
343 0x1fd8, 0x1fd9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
344 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
345 0x1fe8, 0x1fe9, 0x0000, 0x0000, 0x0000, 0x1fec, 0x0000, 0x0000,
346 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
347 0x0000, 0x0000, 0x0000, 0x1ffc, 0x0000, 0x0000, 0x0000, 0x0000,
348 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
349};
350
351static const wchar_t t2_21[256] = {
352 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
353 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
354 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
355 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
356 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
357 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
358 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
359 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
360 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
361 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2132, 0x0000,
362 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
363 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
364 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
365 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
366 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167,
367 0x2168, 0x2169, 0x216a, 0x216b, 0x216c, 0x216d, 0x216e, 0x216f,
368 0x0000, 0x0000, 0x0000, 0x0000, 0x2183, 0x0000, 0x0000, 0x0000,
369 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
370 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
371 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
372 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
373 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
374 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
375 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
376 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
377 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
378 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
379 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
380 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
381 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
382 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
383 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
384};
385
386static const wchar_t t2_24[256] = {
387 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
388 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
389 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
390 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
391 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
392 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
393 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
394 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
395 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
396 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
397 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
398 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
399 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
400 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
401 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
402 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
403 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
404 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
405 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
406 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
407 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
408 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
409 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
410 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
411 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
412 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
413 0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, 0x24bb, 0x24bc, 0x24bd,
414 0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, 0x24c3, 0x24c4, 0x24c5,
415 0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, 0x24cb, 0x24cc, 0x24cd,
416 0x24ce, 0x24cf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
417 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
418 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
419};
420
421static const wchar_t t2_2c[256] = {
422 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
423 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
424 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
425 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
426 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
427 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
428 0x2c00, 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07,
429 0x2c08, 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f,
430 0x2c10, 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17,
431 0x2c18, 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f,
432 0x2c20, 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27,
433 0x2c28, 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x0000,
434 0x0000, 0x2c60, 0x0000, 0x0000, 0x0000, 0x023a, 0x023e, 0x0000,
435 0x2c67, 0x0000, 0x2c69, 0x0000, 0x2c6b, 0x0000, 0x0000, 0x0000,
436 0x0000, 0x0000, 0x0000, 0x2c72, 0x0000, 0x0000, 0x2c75, 0x0000,
437 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
438 0x0000, 0x2c80, 0x0000, 0x2c82, 0x0000, 0x2c84, 0x0000, 0x2c86,
439 0x0000, 0x2c88, 0x0000, 0x2c8a, 0x0000, 0x2c8c, 0x0000, 0x2c8e,
440 0x0000, 0x2c90, 0x0000, 0x2c92, 0x0000, 0x2c94, 0x0000, 0x2c96,
441 0x0000, 0x2c98, 0x0000, 0x2c9a, 0x0000, 0x2c9c, 0x0000, 0x2c9e,
442 0x0000, 0x2ca0, 0x0000, 0x2ca2, 0x0000, 0x2ca4, 0x0000, 0x2ca6,
443 0x0000, 0x2ca8, 0x0000, 0x2caa, 0x0000, 0x2cac, 0x0000, 0x2cae,
444 0x0000, 0x2cb0, 0x0000, 0x2cb2, 0x0000, 0x2cb4, 0x0000, 0x2cb6,
445 0x0000, 0x2cb8, 0x0000, 0x2cba, 0x0000, 0x2cbc, 0x0000, 0x2cbe,
446 0x0000, 0x2cc0, 0x0000, 0x2cc2, 0x0000, 0x2cc4, 0x0000, 0x2cc6,
447 0x0000, 0x2cc8, 0x0000, 0x2cca, 0x0000, 0x2ccc, 0x0000, 0x2cce,
448 0x0000, 0x2cd0, 0x0000, 0x2cd2, 0x0000, 0x2cd4, 0x0000, 0x2cd6,
449 0x0000, 0x2cd8, 0x0000, 0x2cda, 0x0000, 0x2cdc, 0x0000, 0x2cde,
450 0x0000, 0x2ce0, 0x0000, 0x2ce2, 0x0000, 0x0000, 0x0000, 0x0000,
451 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
452 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
453 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
454};
455
456static const wchar_t t2_2d[256] = {
457 0x10a0, 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7,
458 0x10a8, 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af,
459 0x10b0, 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7,
460 0x10b8, 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf,
461 0x10c0, 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0x0000, 0x0000,
462 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
463 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
464 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
465 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
466 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
467 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
468 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
469 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
470 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
471 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
472 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
473 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
474 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
475 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
476 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
477 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
478 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
479 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
480 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
481 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
482 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
483 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
484 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
485 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
486 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
487 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
488 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
489};
490
491static const wchar_t t2_a6[256] = {
492 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
493 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
494 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
495 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
496 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
497 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
498 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
499 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
500 0x0000, 0xa640, 0x0000, 0xa642, 0x0000, 0xa644, 0x0000, 0xa646,
501 0x0000, 0xa648, 0x0000, 0xa64a, 0x0000, 0xa64c, 0x0000, 0xa64e,
502 0x0000, 0xa650, 0x0000, 0xa652, 0x0000, 0xa654, 0x0000, 0xa656,
503 0x0000, 0xa658, 0x0000, 0xa65a, 0x0000, 0xa65c, 0x0000, 0xa65e,
504 0x0000, 0x0000, 0x0000, 0xa662, 0x0000, 0xa664, 0x0000, 0xa666,
505 0x0000, 0xa668, 0x0000, 0xa66a, 0x0000, 0xa66c, 0x0000, 0x0000,
506 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
507 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
508 0x0000, 0xa680, 0x0000, 0xa682, 0x0000, 0xa684, 0x0000, 0xa686,
509 0x0000, 0xa688, 0x0000, 0xa68a, 0x0000, 0xa68c, 0x0000, 0xa68e,
510 0x0000, 0xa690, 0x0000, 0xa692, 0x0000, 0xa694, 0x0000, 0xa696,
511 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
512 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
513 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
514 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
515 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
516 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
517 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
518 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
519 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
520 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
521 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
522 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
523 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
524};
525
526static const wchar_t t2_a7[256] = {
527 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
528 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
529 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
530 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
531 0x0000, 0x0000, 0x0000, 0xa722, 0x0000, 0xa724, 0x0000, 0xa726,
532 0x0000, 0xa728, 0x0000, 0xa72a, 0x0000, 0xa72c, 0x0000, 0xa72e,
533 0x0000, 0x0000, 0x0000, 0xa732, 0x0000, 0xa734, 0x0000, 0xa736,
534 0x0000, 0xa738, 0x0000, 0xa73a, 0x0000, 0xa73c, 0x0000, 0xa73e,
535 0x0000, 0xa740, 0x0000, 0xa742, 0x0000, 0xa744, 0x0000, 0xa746,
536 0x0000, 0xa748, 0x0000, 0xa74a, 0x0000, 0xa74c, 0x0000, 0xa74e,
537 0x0000, 0xa750, 0x0000, 0xa752, 0x0000, 0xa754, 0x0000, 0xa756,
538 0x0000, 0xa758, 0x0000, 0xa75a, 0x0000, 0xa75c, 0x0000, 0xa75e,
539 0x0000, 0xa760, 0x0000, 0xa762, 0x0000, 0xa764, 0x0000, 0xa766,
540 0x0000, 0xa768, 0x0000, 0xa76a, 0x0000, 0xa76c, 0x0000, 0xa76e,
541 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
542 0x0000, 0x0000, 0xa779, 0x0000, 0xa77b, 0x0000, 0x0000, 0xa77e,
543 0x0000, 0xa780, 0x0000, 0xa782, 0x0000, 0xa784, 0x0000, 0xa786,
544 0x0000, 0x0000, 0x0000, 0x0000, 0xa78b, 0x0000, 0x0000, 0x0000,
545 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
546 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
547 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
548 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
549 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
550 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
551 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
552 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
553 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
554 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
555 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
556 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
557 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
558 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
559};
560
561static const wchar_t t2_ff[256] = {
562 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
563 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
564 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
565 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
566 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
567 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
568 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
569 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
570 0x0000, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27,
571 0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f,
572 0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37,
573 0xff38, 0xff39, 0xff3a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
574 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
575 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
576 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
577 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
578 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
579 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
580 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
581 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
582 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
583 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
584 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
585 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
586 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
587 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
588 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
589 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
590 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
591 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
592 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
593 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
594};
595
596static const wchar_t *const toplevel[256] = {
597 t2_00, t2_01, t2_02, t2_03, t2_04, t2_05, NULL, NULL,
598 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
599 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
600 NULL, NULL, NULL, NULL, NULL, t2_1d, t2_1e, t2_1f,
601 NULL, t2_21, NULL, NULL, t2_24, NULL, NULL, NULL,
602 NULL, NULL, NULL, NULL, t2_2c, t2_2d, NULL, NULL,
603 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
604 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
605 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
606 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
607 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
608 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
609 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
610 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
611 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
612 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
613 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
614 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
615 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
616 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
617 NULL, NULL, NULL, NULL, NULL, NULL, t2_a6, t2_a7,
618 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
619 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
620 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
621 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
622 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
623 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
624 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
625 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
626 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
627 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
628 NULL, NULL, NULL, NULL, NULL, NULL, NULL, t2_ff,
629};
630
631/**
632 * cifs_toupper - convert a wchar_t from lower to uppercase
633 * @in: character to convert from lower to uppercase
634 *
635 * This function consults the static tables above to convert a wchar_t from
636 * lower to uppercase. In the event that there is no mapping, the original
637 * "in" character is returned.
638 */
639wchar_t
640cifs_toupper(wchar_t in)
641{
642 unsigned char idx;
643 const wchar_t *tbl;
644 wchar_t out;
645
646 /* grab upper byte */
647 idx = (in & 0xff00) >> 8;
648
649 /* find pointer to 2nd layer table */
650 tbl = toplevel[idx];
651 if (!tbl)
652 return in;
653
654 /* grab lower byte */
655 idx = in & 0xff;
656
657 /* look up character in table */
658 out = tbl[idx];
659 if (out)
660 return out;
661
662 return in;
663}
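The lookup above costs at most two table dereferences: the high byte of the code unit selects a second-level page (NULL when no character in that 256-slot range has an uppercase mapping), and the low byte indexes into it, with 0x0000 meaning "maps to itself". A minimal usage sketch; the wrapper is hypothetical and not part of this file:

	/* case-insensitive comparison of two UTF-16 code units */
	static bool wchar_equal_nocase(wchar_t a, wchar_t b)
	{
		return cifs_toupper(a) == cifs_toupper(b);
	}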
diff --git a/fs/coredump.c b/fs/coredump.c
index 72f816d6cad9..9bdeca12ae0e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -190,6 +190,11 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 			err = cn_printf(cn, "%d",
 					task_tgid_vnr(current));
 			break;
+		/* global pid */
+		case 'P':
+			err = cn_printf(cn, "%d",
+					task_tgid_nr(current));
+			break;
 		/* uid */
 		case 'u':
 			err = cn_printf(cn, "%d", cred->uid);
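The new 'P' specifier emits task_tgid_nr(), the thread-group id as seen from the initial pid namespace, whereas the existing 'p' (task_tgid_vnr()) reports the id inside the dumping task's own namespace. A hedged userspace sketch of wiring it up; the pattern string is only an example:

	#include <stdio.h>

	int main(void)
	{
		/* name cores after the init-namespace tgid, e.g. core.2187 */
		FILE *f = fopen("/proc/sys/kernel/core_pattern", "w");
		if (!f)
			return 1;
		fputs("core.%P", f);
		fclose(f);
		return 0;
	}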
diff --git a/fs/dcache.c b/fs/dcache.c
index b949af850cd6..41000305d716 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -37,6 +37,7 @@
 #include <linux/rculist_bl.h>
 #include <linux/prefetch.h>
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
 #include "internal.h"
 #include "mount.h"
 
@@ -48,7 +49,7 @@
  * - the dcache hash table
  * s_anon bl list spinlock protects:
  * - the s_anon list (see __d_drop)
- * dcache_lru_lock protects:
+ * dentry->d_sb->s_dentry_lru_lock protects:
  * - the dcache lru lists and counters
  * d_lock protects:
  * - d_flags
@@ -63,7 +64,7 @@
  * Ordering:
  * dentry->d_inode->i_lock
  * dentry->d_lock
- *   dcache_lru_lock
+ *   dentry->d_sb->s_dentry_lru_lock
  *   dcache_hash_bucket lock
  *   s_anon lock
  *
@@ -81,13 +82,41 @@
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * @lock: sequence lock
+ * @seq : sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a reader (even) or writer (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+	if (!(*seq & 1))	/* Even */
+		*seq = read_seqbegin(lock);
+	else			/* Odd */
+		read_seqlock_excl(lock);
+}
+
+static inline int need_seqretry(seqlock_t *lock, int seq)
+{
+	return !(seq & 1) && read_seqretry(lock, seq);
+}
+
+static inline void done_seqretry(seqlock_t *lock, int seq)
+{
+	if (seq & 1)
+		read_sequnlock_excl(lock);
+}
+
 /*
  * This is the single most critical data structure when it comes
  * to the dcache: the hashtable for lookups. Somebody should try
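These helpers implement a "try lockless, then lock" retry discipline. A caller sketch follows; the function is hypothetical, but the seq handling mirrors how the dcache walkers use it: start with an even seq for the optimistic pass, and if that pass raced with a writer, force seq odd so the retry takes the lock outright:

	static void walk_consistent(seqlock_t *lock)
	{
		int seq = 0;
	again:
		read_seqbegin_or_lock(lock, &seq);
		/* ... traversal that must observe a stable structure ... */
		if (need_seqretry(lock, seq)) {
			seq = 1;	/* odd: retry as an exclusive reader */
			goto again;
		}
		done_seqretry(lock, seq);
	}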
@@ -117,23 +146,47 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-static int get_nr_dentry(void)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We are expected to harvest
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
+static long get_nr_dentry(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_dentry, i);
 	return sum < 0 ? 0 : sum;
 }
 
+static long get_nr_dentry_unused(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = get_nr_dentry();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	dentry_stat.nr_unused = get_nr_dentry_unused();
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -229,7 +282,7 @@ static void __d_free(struct rcu_head *head)
  */
 static void d_free(struct dentry *dentry)
 {
-	BUG_ON(dentry->d_lockref.count);
+	BUG_ON((int)dentry->d_lockref.count > 0);
 	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
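A note on the relaxed assertion, as a sketch (the sentinel value is lockref's implementation detail, not a contract):

	/*
	 * why the cast and "> 0": dentry_kill() now marks the lockref dead
	 * before the dentry can reach d_free(), and a dead lockref holds a
	 * negative sentinel in ->count (lockref_mark_dead() stores -128),
	 * so zero is no longer the only legal value here; only a still-
	 * positive refcount is a bug.
	 */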
@@ -304,50 +357,96 @@ static void dentry_unlink_inode(struct dentry * dentry)
 }
 
 /*
- * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
+ * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
+ * is in use - which includes both the "real" per-superblock
+ * LRU list _and_ the DCACHE_SHRINK_LIST use.
+ *
+ * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
+ * on the shrink list (ie not on the superblock LRU list).
+ *
+ * The per-cpu "nr_dentry_unused" counters are updated with
+ * the DCACHE_LRU_LIST bit.
+ *
+ * These helper functions make sure we always follow the
+ * rules. d_lock must be held by the caller.
  */
-static void dentry_lru_add(struct dentry *dentry)
+#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
+static void d_lru_add(struct dentry *dentry)
 {
-	if (list_empty(&dentry->d_lru)) {
-		spin_lock(&dcache_lru_lock);
-		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, 0);
+	dentry->d_flags |= DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
-static void __dentry_lru_del(struct dentry *dentry)
+static void d_lru_del(struct dentry *dentry)
 {
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+}
+
+static void d_shrink_del(struct dentry *dentry)
+{
+	D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
 	list_del_init(&dentry->d_lru);
-	dentry->d_flags &= ~DCACHE_SHRINK_LIST;
-	dentry->d_sb->s_nr_dentry_unused--;
-	dentry_stat.nr_unused--;
+	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
+	this_cpu_dec(nr_dentry_unused);
+}
+
+static void d_shrink_add(struct dentry *dentry, struct list_head *list)
+{
+	D_FLAG_VERIFY(dentry, 0);
+	list_add(&dentry->d_lru, list);
+	dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
 }
 
 /*
- * Remove a dentry with references from the LRU.
+ * These can only be called under the global LRU lock, ie during the
+ * callback for freeing the LRU list. "isolate" removes it from the
+ * LRU lists entirely, while shrink_move moves it to the indicated
+ * private list.
  */
-static void dentry_lru_del(struct dentry *dentry)
+static void d_lru_isolate(struct dentry *dentry)
 {
-	if (!list_empty(&dentry->d_lru)) {
-		spin_lock(&dcache_lru_lock);
-		__dentry_lru_del(dentry);
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	list_del_init(&dentry->d_lru);
 }
 
-static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
 {
-	spin_lock(&dcache_lru_lock);
-	if (list_empty(&dentry->d_lru)) {
-		list_add_tail(&dentry->d_lru, list);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-	} else {
-		list_move_tail(&dentry->d_lru, list);
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags |= DCACHE_SHRINK_LIST;
+	list_move_tail(&dentry->d_lru, list);
+}
+
+/*
+ * dentry_lru_(add|del)_list) must be called with d_lock held.
+ */
+static void dentry_lru_add(struct dentry *dentry)
+{
+	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+		d_lru_add(dentry);
+}
+
+/*
+ * Remove a dentry with references from the LRU.
+ *
+ * If we are on the shrink list, then we can get to try_prune_one_dentry() and
+ * lose our last reference through the parent walk. In this case, we need to
+ * remove ourselves from the shrink list, not the LRU.
+ */
+static void dentry_lru_del(struct dentry *dentry)
+{
+	if (dentry->d_flags & DCACHE_LRU_LIST) {
+		if (dentry->d_flags & DCACHE_SHRINK_LIST)
+			return d_shrink_del(dentry);
+		d_lru_del(dentry);
 	}
-	spin_unlock(&dcache_lru_lock);
 }
 
@@ -443,7 +542,8 @@ EXPORT_SYMBOL(d_drop);
  * If ref is non-zero, then decrement the refcount too.
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
-static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
+static inline struct dentry *
+dentry_kill(struct dentry *dentry, int unlock_on_failure)
 	__releases(dentry->d_lock)
 {
 	struct inode *inode;
@@ -452,8 +552,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
452 inode = dentry->d_inode; 552 inode = dentry->d_inode;
453 if (inode && !spin_trylock(&inode->i_lock)) { 553 if (inode && !spin_trylock(&inode->i_lock)) {
454relock: 554relock:
455 spin_unlock(&dentry->d_lock); 555 if (unlock_on_failure) {
456 cpu_relax(); 556 spin_unlock(&dentry->d_lock);
557 cpu_relax();
558 }
457 return dentry; /* try again with same dentry */ 559 return dentry; /* try again with same dentry */
458 } 560 }
459 if (IS_ROOT(dentry)) 561 if (IS_ROOT(dentry))
@@ -466,13 +568,16 @@ relock:
466 goto relock; 568 goto relock;
467 } 569 }
468 570
469 if (ref) 571 /*
470 dentry->d_lockref.count--; 572 * The dentry is now unrecoverably dead to the world.
573 */
574 lockref_mark_dead(&dentry->d_lockref);
575
471 /* 576 /*
472 * inform the fs via d_prune that this dentry is about to be 577 * inform the fs via d_prune that this dentry is about to be
473 * unhashed and destroyed. 578 * unhashed and destroyed.
474 */ 579 */
475 if (dentry->d_flags & DCACHE_OP_PRUNE) 580 if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
476 dentry->d_op->d_prune(dentry); 581 dentry->d_op->d_prune(dentry);
477 582
478 dentry_lru_del(dentry); 583 dentry_lru_del(dentry);
@@ -509,24 +614,22 @@ relock:
509 */ 614 */
510void dput(struct dentry *dentry) 615void dput(struct dentry *dentry)
511{ 616{
512 if (!dentry) 617 if (unlikely(!dentry))
513 return; 618 return;
514 619
515repeat: 620repeat:
516 if (dentry->d_lockref.count == 1)
517 might_sleep();
518 if (lockref_put_or_lock(&dentry->d_lockref)) 621 if (lockref_put_or_lock(&dentry->d_lockref))
519 return; 622 return;
520 623
521 if (dentry->d_flags & DCACHE_OP_DELETE) { 624 /* Unreachable? Get rid of it */
625 if (unlikely(d_unhashed(dentry)))
626 goto kill_it;
627
628 if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
522 if (dentry->d_op->d_delete(dentry)) 629 if (dentry->d_op->d_delete(dentry))
523 goto kill_it; 630 goto kill_it;
524 } 631 }
525 632
526 /* Unreachable? Get rid of it */
527 if (d_unhashed(dentry))
528 goto kill_it;
529
530 dentry->d_flags |= DCACHE_REFERENCED; 633 dentry->d_flags |= DCACHE_REFERENCED;
531 dentry_lru_add(dentry); 634 dentry_lru_add(dentry);
532 635
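
The reordered dput() fast path leans on lockref_put_or_lock(): when the dentry is unlocked and holds more than one reference, the count is dropped with a single compare-exchange and no spinlock traffic; only the final reference falls through to the locked path, where dentry_kill() now also calls lockref_mark_dead() so concurrent lockless users back off. A hypothetical userspace sketch of the put-or-lock idea, packing a lock bit and a count into one atomic word (not the kernel's actual lockref layout):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* bit 0 = lock, bits 1..63 = reference count (illustrative layout) */
	typedef _Atomic uint64_t lockref_t;

	#define LOCK_BIT 1ull
	#define ONE_REF  2ull

	/*
	 * Returns true if the count was dropped without touching the lock;
	 * false means the caller now holds the lock with the count still at
	 * one and must finish the teardown itself.
	 */
	static bool put_or_lock(lockref_t *lr)
	{
		uint64_t old = atomic_load(lr);

		for (;;) {
			if (old & LOCK_BIT) {		/* locked: wait, reload */
				old = atomic_load(lr);
				continue;
			}
			if ((old >> 1) > 1) {		/* fast path: not last ref */
				if (atomic_compare_exchange_weak(lr, &old,
								 old - ONE_REF))
					return true;
				continue;		/* raced: old reloaded */
			}
			/* last reference: take the lock instead of going to zero */
			if (atomic_compare_exchange_weak(lr, &old, old | LOCK_BIT))
				return false;
		}
	}
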
@@ -611,8 +714,23 @@ static inline void __dget(struct dentry *dentry)
611 714
612struct dentry *dget_parent(struct dentry *dentry) 715struct dentry *dget_parent(struct dentry *dentry)
613{ 716{
717 int gotref;
614 struct dentry *ret; 718 struct dentry *ret;
615 719
720 /*
721 * Do optimistic parent lookup without any
722 * locking.
723 */
724 rcu_read_lock();
725 ret = ACCESS_ONCE(dentry->d_parent);
726 gotref = lockref_get_not_zero(&ret->d_lockref);
727 rcu_read_unlock();
728 if (likely(gotref)) {
729 if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
730 return ret;
731 dput(ret);
732 }
733
616repeat: 734repeat:
617 /* 735 /*
618 * Don't need rcu_dereference because we re-check it was correct under 736 * Don't need rcu_dereference because we re-check it was correct under
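
The new fast path in dget_parent() is the classic optimistic-read pattern: sample d_parent without locks, take a reference only if the count is already nonzero, then re-check that the parent pointer did not change in the meantime; any failure falls back to the locked retry loop. A sketch of one optimistic attempt, in userspace C, with the simplifying assumption that objects are never freed (which is what the RCU read section guarantees in the kernel):

	#include <stdatomic.h>
	#include <stddef.h>

	struct obj {
		_Atomic long refcount;
		struct obj *_Atomic parent;
	};

	/* take a reference only if one is already held by someone else */
	static int get_not_zero(struct obj *o)
	{
		long c = atomic_load(&o->refcount);

		while (c > 0)
			if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
				return 1;
		return 0;	/* count hit zero: object is being torn down */
	}

	static void put_ref(struct obj *o)
	{
		atomic_fetch_sub(&o->refcount, 1);
	}

	/* one optimistic attempt; NULL means fall back to the locked path */
	static struct obj *try_get_parent(struct obj *o)
	{
		struct obj *p = atomic_load(&o->parent);

		if (!get_not_zero(p))
			return NULL;
		if (p == atomic_load(&o->parent))
			return p;	/* still the parent: success */
		put_ref(p);		/* renamed away underneath us */
		return NULL;
	}
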
@@ -712,6 +830,14 @@ restart:
712 hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { 830 hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
713 spin_lock(&dentry->d_lock); 831 spin_lock(&dentry->d_lock);
714 if (!dentry->d_lockref.count) { 832 if (!dentry->d_lockref.count) {
833 /*
834 * inform the fs via d_prune that this dentry
835 * is about to be unhashed and destroyed.
836 */
837 if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
838 !d_unhashed(dentry))
839 dentry->d_op->d_prune(dentry);
840
715 __dget_dlock(dentry); 841 __dget_dlock(dentry);
716 __d_drop(dentry); 842 __d_drop(dentry);
717 spin_unlock(&dentry->d_lock); 843 spin_unlock(&dentry->d_lock);
@@ -732,7 +858,7 @@ EXPORT_SYMBOL(d_prune_aliases);
732 * 858 *
733 * This may fail if locks cannot be acquired no problem, just try again. 859 * This may fail if locks cannot be acquired no problem, just try again.
734 */ 860 */
735static void try_prune_one_dentry(struct dentry *dentry) 861static struct dentry * try_prune_one_dentry(struct dentry *dentry)
736 __releases(dentry->d_lock) 862 __releases(dentry->d_lock)
737{ 863{
738 struct dentry *parent; 864 struct dentry *parent;
@@ -749,17 +875,18 @@ static void try_prune_one_dentry(struct dentry *dentry)
749 * fragmentation. 875 * fragmentation.
750 */ 876 */
751 if (!parent) 877 if (!parent)
752 return; 878 return NULL;
753 if (parent == dentry) 879 if (parent == dentry)
754 return; 880 return dentry;
755 881
756 /* Prune ancestors. */ 882 /* Prune ancestors. */
757 dentry = parent; 883 dentry = parent;
758 while (dentry) { 884 while (dentry) {
759 if (lockref_put_or_lock(&dentry->d_lockref)) 885 if (lockref_put_or_lock(&dentry->d_lockref))
760 return; 886 return NULL;
761 dentry = dentry_kill(dentry, 1); 887 dentry = dentry_kill(dentry, 1);
762 } 888 }
889 return NULL;
763} 890}
764 891
765static void shrink_dentry_list(struct list_head *list) 892static void shrink_dentry_list(struct list_head *list)
@@ -771,6 +898,12 @@ static void shrink_dentry_list(struct list_head *list)
771 dentry = list_entry_rcu(list->prev, struct dentry, d_lru); 898 dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
772 if (&dentry->d_lru == list) 899 if (&dentry->d_lru == list)
773 break; /* empty */ 900 break; /* empty */
901
902 /*
903 * Get the dentry lock, and re-verify that the dentry is
904 * still on the shrinking list. If it is, we know that
905 * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
906 */
774 spin_lock(&dentry->d_lock); 907 spin_lock(&dentry->d_lock);
775 if (dentry != list_entry(list->prev, struct dentry, d_lru)) { 908 if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
776 spin_unlock(&dentry->d_lock); 909 spin_unlock(&dentry->d_lock);
@@ -778,76 +911,146 @@ static void shrink_dentry_list(struct list_head *list)
778 } 911 }
779 912
780 /* 913 /*
914 * The dispose list is isolated and dentries are not accounted
915 * to the LRU here, so we can simply remove them from the list
916 * regardless of whether they are referenced or not.
917 */
918 d_shrink_del(dentry);
919
920 /*
781 * We found an inuse dentry which was not removed from 921 * We found an inuse dentry which was not removed from
782 * the LRU because of laziness during lookup. Do not free 922 * the LRU because of laziness during lookup. Do not free it.
783 * it - just keep it off the LRU list.
784 */ 923 */
785 if (dentry->d_lockref.count) { 924 if (dentry->d_lockref.count) {
786 dentry_lru_del(dentry);
787 spin_unlock(&dentry->d_lock); 925 spin_unlock(&dentry->d_lock);
788 continue; 926 continue;
789 } 927 }
790
791 rcu_read_unlock(); 928 rcu_read_unlock();
792 929
793 try_prune_one_dentry(dentry); 930 /*
931 * If 'try_prune_one_dentry()' returns a dentry, it will
932 * be the same one we passed in, and d_lock will
933 * have been held the whole time, so it will not
934 * have been added to any other lists. We failed
935 * to get the inode lock.
936 *
937 * We just add it back to the shrink list.
938 */
939 dentry = try_prune_one_dentry(dentry);
794 940
795 rcu_read_lock(); 941 rcu_read_lock();
942 if (dentry) {
943 d_shrink_add(dentry, list);
944 spin_unlock(&dentry->d_lock);
945 }
796 } 946 }
797 rcu_read_unlock(); 947 rcu_read_unlock();
798} 948}
799 949
950static enum lru_status
951dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
952{
953 struct list_head *freeable = arg;
954 struct dentry *dentry = container_of(item, struct dentry, d_lru);
955
956
957 /*
958 * we are inverting the lru lock/dentry->d_lock here,
959 * so use a trylock. If we fail to get the lock, just skip
960 * it
961 */
962 if (!spin_trylock(&dentry->d_lock))
963 return LRU_SKIP;
964
965 /*
966 * Referenced dentries are still in use. If they have active
967 * counts, just remove them from the LRU. Otherwise give them
968 * another pass through the LRU.
969 */
970 if (dentry->d_lockref.count) {
971 d_lru_isolate(dentry);
972 spin_unlock(&dentry->d_lock);
973 return LRU_REMOVED;
974 }
975
976 if (dentry->d_flags & DCACHE_REFERENCED) {
977 dentry->d_flags &= ~DCACHE_REFERENCED;
978 spin_unlock(&dentry->d_lock);
979
980 /*
981 * The list move itself will be made by the common LRU code. At
982 * this point, we've dropped the dentry->d_lock but keep the
983 * lru lock. This is safe to do, since every list movement is
984 * protected by the lru lock even if both locks are held.
985 *
986 * This is guaranteed by the fact that all LRU management
987 * functions are intermediated by the LRU API calls like
988 * list_lru_add and list_lru_del. List movement in this file
989 * only ever occurs through these functions or through callbacks
990 * like this one, that are called from the LRU API.
991 *
992 * The only exceptions to this are functions like
993 * shrink_dentry_list, and code that first checks for the
994 * DCACHE_SHRINK_LIST flag. Those are guaranteed to be
995 * operating only with stack-provided lists after they are
996 * properly isolated from the main list. It is thus always a
997 * local access.
998 */
999 return LRU_ROTATE;
1000 }
1001
1002 d_lru_shrink_move(dentry, freeable);
1003 spin_unlock(&dentry->d_lock);
1004
1005 return LRU_REMOVED;
1006}
1007
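
dentry_lru_isolate() above is written against the list_lru walker contract: the callback runs with the LRU lock held, so it may only trylock the per-object lock (the normal order is object lock first), and it answers LRU_SKIP, LRU_ROTATE or LRU_REMOVED to tell the walker what happened. A compact pthread-based sketch of that walker/callback split, with illustrative names:

	#include <pthread.h>

	enum lru_status { LRU_REMOVED, LRU_ROTATE, LRU_SKIP };

	struct item {
		struct item *prev, *next;
		pthread_mutex_t lock;
		int refs;		/* active references */
		int referenced;		/* touched since the last scan */
	};

	static void list_del_init(struct item *it)
	{
		it->prev->next = it->next; it->next->prev = it->prev;
		it->prev = it->next = it;
	}

	static void list_move(struct item *it, struct item *head)
	{
		list_del_init(it);
		it->next = head->next; it->prev = head;
		head->next->prev = it; head->next = it;
	}

	/* called with the LRU lock held: only trylock the item (inversion) */
	static enum lru_status isolate(struct item *it, struct item *dispose)
	{
		if (pthread_mutex_trylock(&it->lock))
			return LRU_SKIP;		/* contended: next pass */
		if (it->refs) {
			list_del_init(it);		/* in use: drop from LRU */
			pthread_mutex_unlock(&it->lock);
			return LRU_REMOVED;
		}
		if (it->referenced) {
			it->referenced = 0;		/* give it one more pass */
			pthread_mutex_unlock(&it->lock);
			return LRU_ROTATE;
		}
		list_move(it, dispose);			/* freeable: hand it over */
		pthread_mutex_unlock(&it->lock);
		return LRU_REMOVED;
	}

	static unsigned long walk(struct item *lru, pthread_mutex_t *lru_lock,
				  struct item *dispose, unsigned long nr)
	{
		unsigned long removed = 0;

		pthread_mutex_lock(lru_lock);
		for (struct item *it = lru->prev; it != lru && nr--; ) {
			struct item *prev = it->prev;

			switch (isolate(it, dispose)) {
			case LRU_REMOVED: removed++; break;
			case LRU_ROTATE:  list_move(it, lru); break;
			case LRU_SKIP:    break;
			}
			it = prev;
		}
		pthread_mutex_unlock(lru_lock);
		return removed;
	}
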
800/** 1008/**
801 * prune_dcache_sb - shrink the dcache 1009 * prune_dcache_sb - shrink the dcache
802 * @sb: superblock 1010 * @sb: superblock
803 * @count: number of entries to try to free 1011 * @nr_to_scan: number of entries to try to free
1012 * @nid: which node to scan for freeable entities
804 * 1013 *
805 * Attempt to shrink the superblock dcache LRU by @count entries. This is 1014 * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
806 * done when we need more memory and called from the superblock shrinker 1015 * done when we need more memory and called from the superblock shrinker
807 * function. 1016 * function.
808 * 1017 *
809 * This function may fail to free any resources if all the dentries are in 1018 * This function may fail to free any resources if all the dentries are in
810 * use. 1019 * use.
811 */ 1020 */
812void prune_dcache_sb(struct super_block *sb, int count) 1021long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
1022 int nid)
813{ 1023{
814 struct dentry *dentry; 1024 LIST_HEAD(dispose);
815 LIST_HEAD(referenced); 1025 long freed;
816 LIST_HEAD(tmp);
817 1026
818relock: 1027 freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
819 spin_lock(&dcache_lru_lock); 1028 &dispose, &nr_to_scan);
820 while (!list_empty(&sb->s_dentry_lru)) { 1029 shrink_dentry_list(&dispose);
821 dentry = list_entry(sb->s_dentry_lru.prev, 1030 return freed;
822 struct dentry, d_lru); 1031}
823 BUG_ON(dentry->d_sb != sb);
824
825 if (!spin_trylock(&dentry->d_lock)) {
826 spin_unlock(&dcache_lru_lock);
827 cpu_relax();
828 goto relock;
829 }
830 1032
831 if (dentry->d_flags & DCACHE_REFERENCED) { 1033static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
832 dentry->d_flags &= ~DCACHE_REFERENCED; 1034 spinlock_t *lru_lock, void *arg)
833 list_move(&dentry->d_lru, &referenced); 1035{
834 spin_unlock(&dentry->d_lock); 1036 struct list_head *freeable = arg;
835 } else { 1037 struct dentry *dentry = container_of(item, struct dentry, d_lru);
836 list_move_tail(&dentry->d_lru, &tmp); 1038
837 dentry->d_flags |= DCACHE_SHRINK_LIST; 1039 /*
838 spin_unlock(&dentry->d_lock); 1040 * we are inverting the lru lock/dentry->d_lock here,
839 if (!--count) 1041 * so use a trylock. If we fail to get the lock, just skip
840 break; 1042 * it
841 } 1043 */
842 cond_resched_lock(&dcache_lru_lock); 1044 if (!spin_trylock(&dentry->d_lock))
843 } 1045 return LRU_SKIP;
844 if (!list_empty(&referenced)) 1046
845 list_splice(&referenced, &sb->s_dentry_lru); 1047 d_lru_shrink_move(dentry, freeable);
846 spin_unlock(&dcache_lru_lock); 1048 spin_unlock(&dentry->d_lock);
847 1049
848 shrink_dentry_list(&tmp); 1050 return LRU_REMOVED;
849} 1051}
850 1052
1053
851/** 1054/**
852 * shrink_dcache_sb - shrink dcache for a superblock 1055 * shrink_dcache_sb - shrink dcache for a superblock
853 * @sb: superblock 1056 * @sb: superblock
@@ -857,16 +1060,17 @@ relock:
857 */ 1060 */
858void shrink_dcache_sb(struct super_block *sb) 1061void shrink_dcache_sb(struct super_block *sb)
859{ 1062{
860 LIST_HEAD(tmp); 1063 long freed;
861 1064
862 spin_lock(&dcache_lru_lock); 1065 do {
863 while (!list_empty(&sb->s_dentry_lru)) { 1066 LIST_HEAD(dispose);
864 list_splice_init(&sb->s_dentry_lru, &tmp); 1067
865 spin_unlock(&dcache_lru_lock); 1068 freed = list_lru_walk(&sb->s_dentry_lru,
866 shrink_dentry_list(&tmp); 1069 dentry_lru_isolate_shrink, &dispose, UINT_MAX);
867 spin_lock(&dcache_lru_lock); 1070
868 } 1071 this_cpu_sub(nr_dentry_unused, freed);
869 spin_unlock(&dcache_lru_lock); 1072 shrink_dentry_list(&dispose);
1073 } while (freed > 0);
870} 1074}
871EXPORT_SYMBOL(shrink_dcache_sb); 1075EXPORT_SYMBOL(shrink_dcache_sb);
872 1076
@@ -896,7 +1100,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
896 * inform the fs that this dentry is about to be 1100 * inform the fs that this dentry is about to be
897 * unhashed and destroyed. 1101 * unhashed and destroyed.
898 */ 1102 */
899 if (dentry->d_flags & DCACHE_OP_PRUNE) 1103 if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
1104 !d_unhashed(dentry))
900 dentry->d_op->d_prune(dentry); 1105 dentry->d_op->d_prune(dentry);
901 1106
902 dentry_lru_del(dentry); 1107 dentry_lru_del(dentry);
@@ -985,7 +1190,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
985 * the parenthood after dropping the lock and check 1190 * the parenthood after dropping the lock and check
986 * that the sequence number still matches. 1191 * that the sequence number still matches.
987 */ 1192 */
988static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) 1193static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
989{ 1194{
990 struct dentry *new = old->d_parent; 1195 struct dentry *new = old->d_parent;
991 1196
@@ -999,7 +1204,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
999 */ 1204 */
1000 if (new != old->d_parent || 1205 if (new != old->d_parent ||
1001 (old->d_flags & DCACHE_DENTRY_KILLED) || 1206 (old->d_flags & DCACHE_DENTRY_KILLED) ||
1002 (!locked && read_seqretry(&rename_lock, seq))) { 1207 need_seqretry(&rename_lock, seq)) {
1003 spin_unlock(&new->d_lock); 1208 spin_unlock(&new->d_lock);
1004 new = NULL; 1209 new = NULL;
1005 } 1210 }
@@ -1007,34 +1212,55 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
1007 return new; 1212 return new;
1008} 1213}
1009 1214
1215/**
1216 * enum d_walk_ret - action to take during tree walk
1217 * @D_WALK_CONTINUE: continue walk
1218 * @D_WALK_QUIT: quit walk
1219 * @D_WALK_NORETRY: quit when retry is needed
1220 * @D_WALK_SKIP: skip this dentry and its children
1221 */
1222enum d_walk_ret {
1223 D_WALK_CONTINUE,
1224 D_WALK_QUIT,
1225 D_WALK_NORETRY,
1226 D_WALK_SKIP,
1227};
1010 1228
1011/*
1012 * Search for at least 1 mount point in the dentry's subdirs.
1013 * We descend to the next level whenever the d_subdirs
1014 * list is non-empty and continue searching.
1015 */
1016
1017/** 1229/**
1018 * have_submounts - check for mounts over a dentry 1230 * d_walk - walk the dentry tree
1019 * @parent: dentry to check. 1231 * @parent: start of walk
1232 * @data: data passed to @enter() and @finish()
1233 * @enter: callback when first entering the dentry
1234 * @finish: callback when successfully finished the walk
1020 * 1235 *
1021 * Return true if the parent or its subdirectories contain 1236 * The @enter() and @finish() callbacks are called with d_lock held.
1022 * a mount point
1023 */ 1237 */
1024int have_submounts(struct dentry *parent) 1238static void d_walk(struct dentry *parent, void *data,
1239 enum d_walk_ret (*enter)(void *, struct dentry *),
1240 void (*finish)(void *))
1025{ 1241{
1026 struct dentry *this_parent; 1242 struct dentry *this_parent;
1027 struct list_head *next; 1243 struct list_head *next;
1028 unsigned seq; 1244 unsigned seq = 0;
1029 int locked = 0; 1245 enum d_walk_ret ret;
1246 bool retry = true;
1030 1247
1031 seq = read_seqbegin(&rename_lock);
1032again: 1248again:
1249 read_seqbegin_or_lock(&rename_lock, &seq);
1033 this_parent = parent; 1250 this_parent = parent;
1034
1035 if (d_mountpoint(parent))
1036 goto positive;
1037 spin_lock(&this_parent->d_lock); 1251 spin_lock(&this_parent->d_lock);
1252
1253 ret = enter(data, this_parent);
1254 switch (ret) {
1255 case D_WALK_CONTINUE:
1256 break;
1257 case D_WALK_QUIT:
1258 case D_WALK_SKIP:
1259 goto out_unlock;
1260 case D_WALK_NORETRY:
1261 retry = false;
1262 break;
1263 }
1038repeat: 1264repeat:
1039 next = this_parent->d_subdirs.next; 1265 next = this_parent->d_subdirs.next;
1040resume: 1266resume:
@@ -1044,12 +1270,22 @@ resume:
1044 next = tmp->next; 1270 next = tmp->next;
1045 1271
1046 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 1272 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1047 /* Have we found a mount point ? */ 1273
1048 if (d_mountpoint(dentry)) { 1274 ret = enter(data, dentry);
1275 switch (ret) {
1276 case D_WALK_CONTINUE:
1277 break;
1278 case D_WALK_QUIT:
1049 spin_unlock(&dentry->d_lock); 1279 spin_unlock(&dentry->d_lock);
1050 spin_unlock(&this_parent->d_lock); 1280 goto out_unlock;
1051 goto positive; 1281 case D_WALK_NORETRY:
1282 retry = false;
1283 break;
1284 case D_WALK_SKIP:
1285 spin_unlock(&dentry->d_lock);
1286 continue;
1052 } 1287 }
1288
1053 if (!list_empty(&dentry->d_subdirs)) { 1289 if (!list_empty(&dentry->d_subdirs)) {
1054 spin_unlock(&this_parent->d_lock); 1290 spin_unlock(&this_parent->d_lock);
1055 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); 1291 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
@@ -1064,35 +1300,99 @@ resume:
1064 */ 1300 */
1065 if (this_parent != parent) { 1301 if (this_parent != parent) {
1066 struct dentry *child = this_parent; 1302 struct dentry *child = this_parent;
1067 this_parent = try_to_ascend(this_parent, locked, seq); 1303 this_parent = try_to_ascend(this_parent, seq);
1068 if (!this_parent) 1304 if (!this_parent)
1069 goto rename_retry; 1305 goto rename_retry;
1070 next = child->d_u.d_child.next; 1306 next = child->d_u.d_child.next;
1071 goto resume; 1307 goto resume;
1072 } 1308 }
1073 spin_unlock(&this_parent->d_lock); 1309 if (need_seqretry(&rename_lock, seq)) {
1074 if (!locked && read_seqretry(&rename_lock, seq)) 1310 spin_unlock(&this_parent->d_lock);
1075 goto rename_retry; 1311 goto rename_retry;
1076 if (locked) 1312 }
1077 write_sequnlock(&rename_lock); 1313 if (finish)
1078 return 0; /* No mount points found in tree */ 1314 finish(data);
1079positive: 1315
1080 if (!locked && read_seqretry(&rename_lock, seq)) 1316out_unlock:
1081 goto rename_retry; 1317 spin_unlock(&this_parent->d_lock);
1082 if (locked) 1318 done_seqretry(&rename_lock, seq);
1083 write_sequnlock(&rename_lock); 1319 return;
1084 return 1;
1085 1320
1086rename_retry: 1321rename_retry:
1087 if (locked) 1322 if (!retry)
1088 goto again; 1323 return;
1089 locked = 1; 1324 seq = 1;
1090 write_seqlock(&rename_lock);
1091 goto again; 1325 goto again;
1092} 1326}
1327
1328/*
1329 * Search for at least 1 mount point in the dentry's subdirs.
1330 * We descend to the next level whenever the d_subdirs
1331 * list is non-empty and continue searching.
1332 */
1333
1334/**
1335 * have_submounts - check for mounts over a dentry
1336 * @parent: dentry to check.
1337 *
1338 * Return true if the parent or its subdirectories contain
1339 * a mount point
1340 */
1341
1342static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
1343{
1344 int *ret = data;
1345 if (d_mountpoint(dentry)) {
1346 *ret = 1;
1347 return D_WALK_QUIT;
1348 }
1349 return D_WALK_CONTINUE;
1350}
1351
1352int have_submounts(struct dentry *parent)
1353{
1354 int ret = 0;
1355
1356 d_walk(parent, &ret, check_mount, NULL);
1357
1358 return ret;
1359}
1093EXPORT_SYMBOL(have_submounts); 1360EXPORT_SYMBOL(have_submounts);
1094 1361
1095/* 1362/*
1363 * Called by mount code to set a mountpoint and check if the mountpoint is
1364 * reachable (e.g. NFS can unhash a directory dentry and then the complete
1365 * subtree can become unreachable).
1366 *
1367 * Only one of check_submounts_and_drop() and d_set_mounted() may succeed. For
1368 * this reason, take rename_lock and d_lock on the dentry and its ancestors.
1369 */
1370int d_set_mounted(struct dentry *dentry)
1371{
1372 struct dentry *p;
1373 int ret = -ENOENT;
1374 write_seqlock(&rename_lock);
1375 for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
1376 /* Need exclusion wrt. check_submounts_and_drop() */
1377 spin_lock(&p->d_lock);
1378 if (unlikely(d_unhashed(p))) {
1379 spin_unlock(&p->d_lock);
1380 goto out;
1381 }
1382 spin_unlock(&p->d_lock);
1383 }
1384 spin_lock(&dentry->d_lock);
1385 if (!d_unlinked(dentry)) {
1386 dentry->d_flags |= DCACHE_MOUNTED;
1387 ret = 0;
1388 }
1389 spin_unlock(&dentry->d_lock);
1390out:
1391 write_sequnlock(&rename_lock);
1392 return ret;
1393}
1394
1395/*
1096 * Search the dentry child list of the specified parent, 1396 * Search the dentry child list of the specified parent,
1097 * and move any unused dentries to the end of the unused 1397 * and move any unused dentries to the end of the unused
1098 * list for prune_dcache(). We descend to the next level 1398 * list for prune_dcache(). We descend to the next level
@@ -1106,93 +1406,51 @@ EXPORT_SYMBOL(have_submounts);
1106 * drop the lock and return early due to latency 1406 * drop the lock and return early due to latency
1107 * constraints. 1407 * constraints.
1108 */ 1408 */
1109static int select_parent(struct dentry *parent, struct list_head *dispose)
1110{
1111 struct dentry *this_parent;
1112 struct list_head *next;
1113 unsigned seq;
1114 int found = 0;
1115 int locked = 0;
1116 1409
1117 seq = read_seqbegin(&rename_lock); 1410struct select_data {
1118again: 1411 struct dentry *start;
1119 this_parent = parent; 1412 struct list_head dispose;
1120 spin_lock(&this_parent->d_lock); 1413 int found;
1121repeat: 1414};
1122 next = this_parent->d_subdirs.next;
1123resume:
1124 while (next != &this_parent->d_subdirs) {
1125 struct list_head *tmp = next;
1126 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
1127 next = tmp->next;
1128 1415
1129 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 1416static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
1417{
1418 struct select_data *data = _data;
1419 enum d_walk_ret ret = D_WALK_CONTINUE;
1130 1420
1131 /* 1421 if (data->start == dentry)
1132 * move only zero ref count dentries to the dispose list. 1422 goto out;
1133 *
1134 * Those which are presently on the shrink list, being processed
1135 * by shrink_dentry_list(), shouldn't be moved. Otherwise the
1136 * loop in shrink_dcache_parent() might not make any progress
1137 * and loop forever.
1138 */
1139 if (dentry->d_lockref.count) {
1140 dentry_lru_del(dentry);
1141 } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
1142 dentry_lru_move_list(dentry, dispose);
1143 dentry->d_flags |= DCACHE_SHRINK_LIST;
1144 found++;
1145 }
1146 /*
1147 * We can return to the caller if we have found some (this
1148 * ensures forward progress). We'll be coming back to find
1149 * the rest.
1150 */
1151 if (found && need_resched()) {
1152 spin_unlock(&dentry->d_lock);
1153 goto out;
1154 }
1155 1423
1424 /*
1425 * move only zero ref count dentries to the dispose list.
1426 *
1427 * Those which are presently on the shrink list, being processed
1428 * by shrink_dentry_list(), shouldn't be moved. Otherwise the
1429 * loop in shrink_dcache_parent() might not make any progress
1430 * and loop forever.
1431 */
1432 if (dentry->d_lockref.count) {
1433 dentry_lru_del(dentry);
1434 } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
1156 /* 1435 /*
1157 * Descend a level if the d_subdirs list is non-empty. 1436 * We can't use d_lru_shrink_move() because we
1437 * need to get the global LRU lock and do the
1438 * LRU accounting.
1158 */ 1439 */
1159 if (!list_empty(&dentry->d_subdirs)) { 1440 d_lru_del(dentry);
1160 spin_unlock(&this_parent->d_lock); 1441 d_shrink_add(dentry, &data->dispose);
1161 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); 1442 data->found++;
1162 this_parent = dentry; 1443 ret = D_WALK_NORETRY;
1163 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
1164 goto repeat;
1165 }
1166
1167 spin_unlock(&dentry->d_lock);
1168 } 1444 }
1169 /* 1445 /*
1170 * All done at this level ... ascend and resume the search. 1446 * We can return to the caller if we have found some (this
1447 * ensures forward progress). We'll be coming back to find
1448 * the rest.
1171 */ 1449 */
1172 if (this_parent != parent) { 1450 if (data->found && need_resched())
1173 struct dentry *child = this_parent; 1451 ret = D_WALK_QUIT;
1174 this_parent = try_to_ascend(this_parent, locked, seq);
1175 if (!this_parent)
1176 goto rename_retry;
1177 next = child->d_u.d_child.next;
1178 goto resume;
1179 }
1180out: 1452out:
1181 spin_unlock(&this_parent->d_lock); 1453 return ret;
1182 if (!locked && read_seqretry(&rename_lock, seq))
1183 goto rename_retry;
1184 if (locked)
1185 write_sequnlock(&rename_lock);
1186 return found;
1187
1188rename_retry:
1189 if (found)
1190 return found;
1191 if (locked)
1192 goto again;
1193 locked = 1;
1194 write_seqlock(&rename_lock);
1195 goto again;
1196} 1454}
1197 1455
1198/** 1456/**
@@ -1201,18 +1459,90 @@ rename_retry:
1201 * 1459 *
1202 * Prune the dcache to remove unused children of the parent dentry. 1460 * Prune the dcache to remove unused children of the parent dentry.
1203 */ 1461 */
1204void shrink_dcache_parent(struct dentry * parent) 1462void shrink_dcache_parent(struct dentry *parent)
1205{ 1463{
1206 LIST_HEAD(dispose); 1464 for (;;) {
1207 int found; 1465 struct select_data data;
1208 1466
1209 while ((found = select_parent(parent, &dispose)) != 0) { 1467 INIT_LIST_HEAD(&data.dispose);
1210 shrink_dentry_list(&dispose); 1468 data.start = parent;
1469 data.found = 0;
1470
1471 d_walk(parent, &data, select_collect, NULL);
1472 if (!data.found)
1473 break;
1474
1475 shrink_dentry_list(&data.dispose);
1211 cond_resched(); 1476 cond_resched();
1212 } 1477 }
1213} 1478}
1214EXPORT_SYMBOL(shrink_dcache_parent); 1479EXPORT_SYMBOL(shrink_dcache_parent);
1215 1480
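
shrink_dcache_parent() now runs as repeated collect-then-dispose passes: each d_walk() moves freeable children onto a private dispose list, the list is shrunk, and the loop stops once a pass finds nothing. A small sketch of that batching loop, under the assumption that disposal is what makes progress; names are illustrative, and the shrink flag is what stops a node from being collected twice:

	#include <stddef.h>

	struct node {
		struct node *child, *sibling;
		struct node *next_dispose;	/* private dispose list linkage */
		int refs;
		int on_shrink;			/* cf. DCACHE_SHRINK_LIST */
	};

	static int collect(struct node *n, struct node **dispose,
			   struct node *start)
	{
		int found = 0;

		if (n != start && n->refs == 0 && !n->on_shrink) {
			n->on_shrink = 1;
			n->next_dispose = *dispose;	/* push onto private list */
			*dispose = n;
			found++;
		}
		for (struct node *c = n->child; c; c = c->sibling)
			found += collect(c, dispose, start);
		return found;
	}

	static void shrink_parent(struct node *parent)
	{
		for (;;) {
			struct node *dispose = NULL;

			if (!collect(parent, &dispose, parent))
				break;			/* nothing left to free */
			while (dispose) {
				struct node *n = dispose;

				dispose = n->next_dispose;
				/*
				 * The kernel frees the dentry here; the sketch
				 * leaves on_shrink set so each node is only
				 * collected (and "disposed") once.
				 */
			}
		}
	}
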
1481static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
1482{
1483 struct select_data *data = _data;
1484
1485 if (d_mountpoint(dentry)) {
1486 data->found = -EBUSY;
1487 return D_WALK_QUIT;
1488 }
1489
1490 return select_collect(_data, dentry);
1491}
1492
1493static void check_and_drop(void *_data)
1494{
1495 struct select_data *data = _data;
1496
1497 if (d_mountpoint(data->start))
1498 data->found = -EBUSY;
1499 if (!data->found)
1500 __d_drop(data->start);
1501}
1502
1503/**
1504 * check_submounts_and_drop - prune dcache, check for submounts and drop
1505 *
1506 * All done as a single atomic operation relative to has_unlinked_ancestor().
1507 * Returns 0 if successfully unhashed @dentry. If there were submounts then
1508 * return -EBUSY.
1509 *
1510 * @dentry: dentry to prune and drop
1511 */
1512int check_submounts_and_drop(struct dentry *dentry)
1513{
1514 int ret = 0;
1515
1516 /* Negative dentries can be dropped without further checks */
1517 if (!dentry->d_inode) {
1518 d_drop(dentry);
1519 goto out;
1520 }
1521
1522 for (;;) {
1523 struct select_data data;
1524
1525 INIT_LIST_HEAD(&data.dispose);
1526 data.start = dentry;
1527 data.found = 0;
1528
1529 d_walk(dentry, &data, check_and_collect, check_and_drop);
1530 ret = data.found;
1531
1532 if (!list_empty(&data.dispose))
1533 shrink_dentry_list(&data.dispose);
1534
1535 if (ret <= 0)
1536 break;
1537
1538 cond_resched();
1539 }
1540
1541out:
1542 return ret;
1543}
1544EXPORT_SYMBOL(check_submounts_and_drop);
1545
1216/** 1546/**
1217 * __d_alloc - allocate a dcache entry 1547 * __d_alloc - allocate a dcache entry
1218 * @sb: filesystem it will belong to 1548 * @sb: filesystem it will belong to
@@ -1771,7 +2101,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
1771 * without taking d_lock and checking d_seq sequence count against @seq 2101 * without taking d_lock and checking d_seq sequence count against @seq
1772 * returned here. 2102 * returned here.
1773 * 2103 *
1774 * A refcount may be taken on the found dentry with the __d_rcu_to_refcount 2104 * A refcount may be taken on the found dentry with the d_rcu_to_refcount
1775 * function. 2105 * function.
1776 * 2106 *
1777 * Alternatively, __d_lookup_rcu may be called again to look up the child of 2107 * Alternatively, __d_lookup_rcu may be called again to look up the child of
@@ -2495,9 +2825,39 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
2495 return 0; 2825 return 0;
2496} 2826}
2497 2827
2828/**
2829 * prepend_name - prepend a pathname in front of current buffer pointer
2830 * @buffer: buffer pointer
2831 * @buflen: allocated length of the buffer
2832 * @name: name string and length qstr structure
2833 *
2834 * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
2835 * make sure that either the old or the new name pointer and length are
2836 * fetched. However, there may be a mismatch between length and pointer.
2837 * The length cannot be trusted; we need to copy it byte-by-byte until
2838 * the length is reached or a null byte is found. It also prepends "/" at
2839 * the beginning of the name. The sequence number check at the caller will
2840 * retry when a d_move() does happen. So any garbage in the buffer
2841 * due to mismatched pointer and length will be discarded.
2842 */
2498static int prepend_name(char **buffer, int *buflen, struct qstr *name) 2843static int prepend_name(char **buffer, int *buflen, struct qstr *name)
2499{ 2844{
2500 return prepend(buffer, buflen, name->name, name->len); 2845 const char *dname = ACCESS_ONCE(name->name);
2846 u32 dlen = ACCESS_ONCE(name->len);
2847 char *p;
2848
2849 if (*buflen < dlen + 1)
2850 return -ENAMETOOLONG;
2851 *buflen -= dlen + 1;
2852 p = *buffer -= dlen + 1;
2853 *p++ = '/';
2854 while (dlen--) {
2855 char c = *dname++;
2856 if (!c)
2857 break;
2858 *p++ = c;
2859 }
2860 return 0;
2501} 2861}
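
prepend_name() must treat name->len as a hint only: under RCU it can observe the length of one name together with the pointer of another, so it copies byte-by-byte and stops early at a NUL, relying on the caller's sequence-count check to throw mismatched output away. A hypothetical standalone version of that defensive copy, for illustration:

	#include <stddef.h>

	/*
	 * Prepend '/' plus at most len bytes of src in front of *bufp.  len
	 * may disagree with the actual string under a racing rename, so also
	 * stop at the first NUL; the caller's sequence-count check discards
	 * any mismatched result.
	 */
	static int prepend_name(char **bufp, int *buflen,
				const char *src, size_t len)
	{
		char *p;

		if (*buflen < (int)len + 1)
			return -1;		/* -ENAMETOOLONG */
		*buflen -= len + 1;
		p = *bufp -= len + 1;
		*p++ = '/';
		while (len--) {
			char c = *src++;

			if (!c)
				break;		/* shorter than claimed */
			*p++ = c;
		}
		return 0;
	}
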
2502 2862
2503/** 2863/**
@@ -2507,7 +2867,15 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
2507 * @buffer: pointer to the end of the buffer 2867 * @buffer: pointer to the end of the buffer
2508 * @buflen: pointer to buffer length 2868 * @buflen: pointer to buffer length
2509 * 2869 *
2510 * Caller holds the rename_lock. 2870 * The function will first try to write out the pathname without taking any
2871 * lock other than the RCU read lock to make sure that dentries won't go away.
2872 * It only checks the sequence number of the global rename_lock as any change
2873 * in the dentry's d_seq will be preceded by changes in the rename_lock
2874 * sequence number. If the sequence number has changed, it will restart
2875 * the whole pathname back-tracing sequence again by taking the rename_lock.
2876 * In this case, there is no need to take the RCU read lock as the recursive
2877 * parent pointer references will keep the dentry chain alive as long as no
2878 * rename operation is performed.
2511 */ 2879 */
2512static int prepend_path(const struct path *path, 2880static int prepend_path(const struct path *path,
2513 const struct path *root, 2881 const struct path *root,
@@ -2516,54 +2884,66 @@ static int prepend_path(const struct path *path,
2516 struct dentry *dentry = path->dentry; 2884 struct dentry *dentry = path->dentry;
2517 struct vfsmount *vfsmnt = path->mnt; 2885 struct vfsmount *vfsmnt = path->mnt;
2518 struct mount *mnt = real_mount(vfsmnt); 2886 struct mount *mnt = real_mount(vfsmnt);
2519 bool slash = false;
2520 int error = 0; 2887 int error = 0;
2888 unsigned seq = 0;
2889 char *bptr;
2890 int blen;
2521 2891
2892 rcu_read_lock();
2893restart:
2894 bptr = *buffer;
2895 blen = *buflen;
2896 read_seqbegin_or_lock(&rename_lock, &seq);
2522 while (dentry != root->dentry || vfsmnt != root->mnt) { 2897 while (dentry != root->dentry || vfsmnt != root->mnt) {
2523 struct dentry * parent; 2898 struct dentry * parent;
2524 2899
2525 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { 2900 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
2526 /* Global root? */ 2901 /* Global root? */
2527 if (!mnt_has_parent(mnt)) 2902 if (mnt_has_parent(mnt)) {
2528 goto global_root; 2903 dentry = mnt->mnt_mountpoint;
2529 dentry = mnt->mnt_mountpoint; 2904 mnt = mnt->mnt_parent;
2530 mnt = mnt->mnt_parent; 2905 vfsmnt = &mnt->mnt;
2531 vfsmnt = &mnt->mnt; 2906 continue;
2532 continue; 2907 }
2908 /*
2909 * Filesystems needing to implement special "root names"
2910 * should do so with ->d_dname()
2911 */
2912 if (IS_ROOT(dentry) &&
2913 (dentry->d_name.len != 1 ||
2914 dentry->d_name.name[0] != '/')) {
2915 WARN(1, "Root dentry has weird name <%.*s>\n",
2916 (int) dentry->d_name.len,
2917 dentry->d_name.name);
2918 }
2919 if (!error)
2920 error = is_mounted(vfsmnt) ? 1 : 2;
2921 break;
2533 } 2922 }
2534 parent = dentry->d_parent; 2923 parent = dentry->d_parent;
2535 prefetch(parent); 2924 prefetch(parent);
2536 spin_lock(&dentry->d_lock); 2925 error = prepend_name(&bptr, &blen, &dentry->d_name);
2537 error = prepend_name(buffer, buflen, &dentry->d_name);
2538 spin_unlock(&dentry->d_lock);
2539 if (!error)
2540 error = prepend(buffer, buflen, "/", 1);
2541 if (error) 2926 if (error)
2542 break; 2927 break;
2543 2928
2544 slash = true;
2545 dentry = parent; 2929 dentry = parent;
2546 } 2930 }
2931 if (!(seq & 1))
2932 rcu_read_unlock();
2933 if (need_seqretry(&rename_lock, seq)) {
2934 seq = 1;
2935 goto restart;
2936 }
2937 done_seqretry(&rename_lock, seq);
2547 2938
2548 if (!error && !slash) 2939 if (error >= 0 && bptr == *buffer) {
2549 error = prepend(buffer, buflen, "/", 1); 2940 if (--blen < 0)
2550 2941 error = -ENAMETOOLONG;
2551 return error; 2942 else
2552 2943 *--bptr = '/';
2553global_root: 2944 }
2554 /* 2945 *buffer = bptr;
2555 * Filesystems needing to implement special "root names" 2946 *buflen = blen;
2556 * should do so with ->d_dname()
2557 */
2558 if (IS_ROOT(dentry) &&
2559 (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
2560 WARN(1, "Root dentry has weird name <%.*s>\n",
2561 (int) dentry->d_name.len, dentry->d_name.name);
2562 }
2563 if (!slash)
2564 error = prepend(buffer, buflen, "/", 1);
2565 if (!error)
2566 error = is_mounted(vfsmnt) ? 1 : 2;
2567 return error; 2947 return error;
2568} 2948}
2569 2949
@@ -2592,9 +2972,7 @@ char *__d_path(const struct path *path,
2592 2972
2593 prepend(&res, &buflen, "\0", 1); 2973 prepend(&res, &buflen, "\0", 1);
2594 br_read_lock(&vfsmount_lock); 2974 br_read_lock(&vfsmount_lock);
2595 write_seqlock(&rename_lock);
2596 error = prepend_path(path, root, &res, &buflen); 2975 error = prepend_path(path, root, &res, &buflen);
2597 write_sequnlock(&rename_lock);
2598 br_read_unlock(&vfsmount_lock); 2976 br_read_unlock(&vfsmount_lock);
2599 2977
2600 if (error < 0) 2978 if (error < 0)
@@ -2613,9 +2991,7 @@ char *d_absolute_path(const struct path *path,
2613 2991
2614 prepend(&res, &buflen, "\0", 1); 2992 prepend(&res, &buflen, "\0", 1);
2615 br_read_lock(&vfsmount_lock); 2993 br_read_lock(&vfsmount_lock);
2616 write_seqlock(&rename_lock);
2617 error = prepend_path(path, &root, &res, &buflen); 2994 error = prepend_path(path, &root, &res, &buflen);
2618 write_sequnlock(&rename_lock);
2619 br_read_unlock(&vfsmount_lock); 2995 br_read_unlock(&vfsmount_lock);
2620 2996
2621 if (error > 1) 2997 if (error > 1)
@@ -2647,6 +3023,16 @@ static int prepend_unreachable(char **buffer, int *buflen)
2647 return prepend(buffer, buflen, "(unreachable)", 13); 3023 return prepend(buffer, buflen, "(unreachable)", 13);
2648} 3024}
2649 3025
3026static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
3027{
3028 unsigned seq;
3029
3030 do {
3031 seq = read_seqcount_begin(&fs->seq);
3032 *root = fs->root;
3033 } while (read_seqcount_retry(&fs->seq, seq));
3034}
3035
2650/** 3036/**
2651 * d_path - return the path of a dentry 3037 * d_path - return the path of a dentry
2652 * @path: path to report 3038 * @path: path to report
@@ -2679,15 +3065,15 @@ char *d_path(const struct path *path, char *buf, int buflen)
2679 if (path->dentry->d_op && path->dentry->d_op->d_dname) 3065 if (path->dentry->d_op && path->dentry->d_op->d_dname)
2680 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 3066 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2681 3067
2682 get_fs_root(current->fs, &root); 3068 rcu_read_lock();
3069 get_fs_root_rcu(current->fs, &root);
2683 br_read_lock(&vfsmount_lock); 3070 br_read_lock(&vfsmount_lock);
2684 write_seqlock(&rename_lock);
2685 error = path_with_deleted(path, &root, &res, &buflen); 3071 error = path_with_deleted(path, &root, &res, &buflen);
2686 write_sequnlock(&rename_lock);
2687 br_read_unlock(&vfsmount_lock); 3072 br_read_unlock(&vfsmount_lock);
3073 rcu_read_unlock();
3074
2688 if (error < 0) 3075 if (error < 0)
2689 res = ERR_PTR(error); 3076 res = ERR_PTR(error);
2690 path_put(&root);
2691 return res; 3077 return res;
2692} 3078}
2693EXPORT_SYMBOL(d_path); 3079EXPORT_SYMBOL(d_path);
@@ -2718,10 +3104,10 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
2718 char *end = buffer + buflen; 3104 char *end = buffer + buflen;
2719 /* these dentries are never renamed, so d_lock is not needed */ 3105 /* these dentries are never renamed, so d_lock is not needed */
2720 if (prepend(&end, &buflen, " (deleted)", 11) || 3106 if (prepend(&end, &buflen, " (deleted)", 11) ||
2721 prepend_name(&end, &buflen, &dentry->d_name) || 3107 prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
2722 prepend(&end, &buflen, "/", 1)) 3108 prepend(&end, &buflen, "/", 1))
2723 end = ERR_PTR(-ENAMETOOLONG); 3109 end = ERR_PTR(-ENAMETOOLONG);
2724 return end; 3110 return end;
2725} 3111}
2726 3112
2727/* 3113/*
@@ -2729,30 +3115,42 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
2729 */ 3115 */
2730static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) 3116static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2731{ 3117{
2732 char *end = buf + buflen; 3118 char *end, *retval;
2733 char *retval; 3119 int len, seq = 0;
3120 int error = 0;
2734 3121
2735 prepend(&end, &buflen, "\0", 1); 3122 rcu_read_lock();
3123restart:
3124 end = buf + buflen;
3125 len = buflen;
3126 prepend(&end, &len, "\0", 1);
2736 if (buflen < 1) 3127 if (buflen < 1)
2737 goto Elong; 3128 goto Elong;
2738 /* Get '/' right */ 3129 /* Get '/' right */
2739 retval = end-1; 3130 retval = end-1;
2740 *retval = '/'; 3131 *retval = '/';
2741 3132 read_seqbegin_or_lock(&rename_lock, &seq);
2742 while (!IS_ROOT(dentry)) { 3133 while (!IS_ROOT(dentry)) {
2743 struct dentry *parent = dentry->d_parent; 3134 struct dentry *parent = dentry->d_parent;
2744 int error; 3135 int error;
2745 3136
2746 prefetch(parent); 3137 prefetch(parent);
2747 spin_lock(&dentry->d_lock); 3138 error = prepend_name(&end, &len, &dentry->d_name);
2748 error = prepend_name(&end, &buflen, &dentry->d_name); 3139 if (error)
2749 spin_unlock(&dentry->d_lock); 3140 break;
2750 if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
2751 goto Elong;
2752 3141
2753 retval = end; 3142 retval = end;
2754 dentry = parent; 3143 dentry = parent;
2755 } 3144 }
3145 if (!(seq & 1))
3146 rcu_read_unlock();
3147 if (need_seqretry(&rename_lock, seq)) {
3148 seq = 1;
3149 goto restart;
3150 }
3151 done_seqretry(&rename_lock, seq);
3152 if (error)
3153 goto Elong;
2756 return retval; 3154 return retval;
2757Elong: 3155Elong:
2758 return ERR_PTR(-ENAMETOOLONG); 3156 return ERR_PTR(-ENAMETOOLONG);
@@ -2760,13 +3158,7 @@ Elong:
2760 3158
2761char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) 3159char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
2762{ 3160{
2763 char *retval; 3161 return __dentry_path(dentry, buf, buflen);
2764
2765 write_seqlock(&rename_lock);
2766 retval = __dentry_path(dentry, buf, buflen);
2767 write_sequnlock(&rename_lock);
2768
2769 return retval;
2770} 3162}
2771EXPORT_SYMBOL(dentry_path_raw); 3163EXPORT_SYMBOL(dentry_path_raw);
2772 3164
@@ -2775,7 +3167,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2775 char *p = NULL; 3167 char *p = NULL;
2776 char *retval; 3168 char *retval;
2777 3169
2778 write_seqlock(&rename_lock);
2779 if (d_unlinked(dentry)) { 3170 if (d_unlinked(dentry)) {
2780 p = buf + buflen; 3171 p = buf + buflen;
2781 if (prepend(&p, &buflen, "//deleted", 10) != 0) 3172 if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2783,7 +3174,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2783 buflen++; 3174 buflen++;
2784 } 3175 }
2785 retval = __dentry_path(dentry, buf, buflen); 3176 retval = __dentry_path(dentry, buf, buflen);
2786 write_sequnlock(&rename_lock);
2787 if (!IS_ERR(retval) && p) 3177 if (!IS_ERR(retval) && p)
2788 *p = '/'; /* restore '/' overriden with '\0' */ 3178 *p = '/'; /* restore '/' overriden with '\0' */
2789 return retval; 3179 return retval;
@@ -2791,6 +3181,18 @@ Elong:
2791 return ERR_PTR(-ENAMETOOLONG); 3181 return ERR_PTR(-ENAMETOOLONG);
2792} 3182}
2793 3183
3184static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
3185 struct path *pwd)
3186{
3187 unsigned seq;
3188
3189 do {
3190 seq = read_seqcount_begin(&fs->seq);
3191 *root = fs->root;
3192 *pwd = fs->pwd;
3193 } while (read_seqcount_retry(&fs->seq, seq));
3194}
3195
2794/* 3196/*
2795 * NOTE! The user-level library version returns a 3197 * NOTE! The user-level library version returns a
2796 * character pointer. The kernel system call just 3198 * character pointer. The kernel system call just
@@ -2813,25 +3215,25 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2813{ 3215{
2814 int error; 3216 int error;
2815 struct path pwd, root; 3217 struct path pwd, root;
2816 char *page = (char *) __get_free_page(GFP_USER); 3218 char *page = __getname();
2817 3219
2818 if (!page) 3220 if (!page)
2819 return -ENOMEM; 3221 return -ENOMEM;
2820 3222
2821 get_fs_root_and_pwd(current->fs, &root, &pwd); 3223 rcu_read_lock();
3224 get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
2822 3225
2823 error = -ENOENT; 3226 error = -ENOENT;
2824 br_read_lock(&vfsmount_lock); 3227 br_read_lock(&vfsmount_lock);
2825 write_seqlock(&rename_lock);
2826 if (!d_unlinked(pwd.dentry)) { 3228 if (!d_unlinked(pwd.dentry)) {
2827 unsigned long len; 3229 unsigned long len;
2828 char *cwd = page + PAGE_SIZE; 3230 char *cwd = page + PATH_MAX;
2829 int buflen = PAGE_SIZE; 3231 int buflen = PATH_MAX;
2830 3232
2831 prepend(&cwd, &buflen, "\0", 1); 3233 prepend(&cwd, &buflen, "\0", 1);
2832 error = prepend_path(&pwd, &root, &cwd, &buflen); 3234 error = prepend_path(&pwd, &root, &cwd, &buflen);
2833 write_sequnlock(&rename_lock);
2834 br_read_unlock(&vfsmount_lock); 3235 br_read_unlock(&vfsmount_lock);
3236 rcu_read_unlock();
2835 3237
2836 if (error < 0) 3238 if (error < 0)
2837 goto out; 3239 goto out;
@@ -2844,21 +3246,19 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2844 } 3246 }
2845 3247
2846 error = -ERANGE; 3248 error = -ERANGE;
2847 len = PAGE_SIZE + page - cwd; 3249 len = PATH_MAX + page - cwd;
2848 if (len <= size) { 3250 if (len <= size) {
2849 error = len; 3251 error = len;
2850 if (copy_to_user(buf, cwd, len)) 3252 if (copy_to_user(buf, cwd, len))
2851 error = -EFAULT; 3253 error = -EFAULT;
2852 } 3254 }
2853 } else { 3255 } else {
2854 write_sequnlock(&rename_lock);
2855 br_read_unlock(&vfsmount_lock); 3256 br_read_unlock(&vfsmount_lock);
3257 rcu_read_unlock();
2856 } 3258 }
2857 3259
2858out: 3260out:
2859 path_put(&pwd); 3261 __putname(page);
2860 path_put(&root);
2861 free_page((unsigned long) page);
2862 return error; 3262 return error;
2863} 3263}
2864 3264
@@ -2904,68 +3304,24 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2904 return result; 3304 return result;
2905} 3305}
2906 3306
2907void d_genocide(struct dentry *root) 3307static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
2908{ 3308{
2909 struct dentry *this_parent; 3309 struct dentry *root = data;
2910 struct list_head *next; 3310 if (dentry != root) {
2911 unsigned seq; 3311 if (d_unhashed(dentry) || !dentry->d_inode)
2912 int locked = 0; 3312 return D_WALK_SKIP;
2913 3313
2914 seq = read_seqbegin(&rename_lock);
2915again:
2916 this_parent = root;
2917 spin_lock(&this_parent->d_lock);
2918repeat:
2919 next = this_parent->d_subdirs.next;
2920resume:
2921 while (next != &this_parent->d_subdirs) {
2922 struct list_head *tmp = next;
2923 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
2924 next = tmp->next;
2925
2926 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
2927 if (d_unhashed(dentry) || !dentry->d_inode) {
2928 spin_unlock(&dentry->d_lock);
2929 continue;
2930 }
2931 if (!list_empty(&dentry->d_subdirs)) {
2932 spin_unlock(&this_parent->d_lock);
2933 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
2934 this_parent = dentry;
2935 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
2936 goto repeat;
2937 }
2938 if (!(dentry->d_flags & DCACHE_GENOCIDE)) { 3314 if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
2939 dentry->d_flags |= DCACHE_GENOCIDE; 3315 dentry->d_flags |= DCACHE_GENOCIDE;
2940 dentry->d_lockref.count--; 3316 dentry->d_lockref.count--;
2941 } 3317 }
2942 spin_unlock(&dentry->d_lock);
2943 } 3318 }
2944 if (this_parent != root) { 3319 return D_WALK_CONTINUE;
2945 struct dentry *child = this_parent; 3320}
2946 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2947 this_parent->d_flags |= DCACHE_GENOCIDE;
2948 this_parent->d_lockref.count--;
2949 }
2950 this_parent = try_to_ascend(this_parent, locked, seq);
2951 if (!this_parent)
2952 goto rename_retry;
2953 next = child->d_u.d_child.next;
2954 goto resume;
2955 }
2956 spin_unlock(&this_parent->d_lock);
2957 if (!locked && read_seqretry(&rename_lock, seq))
2958 goto rename_retry;
2959 if (locked)
2960 write_sequnlock(&rename_lock);
2961 return;
2962 3321
2963rename_retry: 3322void d_genocide(struct dentry *parent)
2964 if (locked) 3323{
2965 goto again; 3324 d_walk(parent, parent, d_genocide_kill, NULL);
2966 locked = 1;
2967 write_seqlock(&rename_lock);
2968 goto again;
2969} 3325}
2970 3326
2971void d_tmpfile(struct dentry *dentry, struct inode *inode) 3327void d_tmpfile(struct dentry *dentry, struct inode *inode)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7ab90f5081ee..0e04142d5962 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -127,6 +127,7 @@ struct dio {
127 spinlock_t bio_lock; /* protects BIO fields below */ 127 spinlock_t bio_lock; /* protects BIO fields below */
128 int page_errors; /* errno from get_user_pages() */ 128 int page_errors; /* errno from get_user_pages() */
129 int is_async; /* is IO async ? */ 129 int is_async; /* is IO async ? */
130 bool defer_completion; /* defer AIO completion to workqueue? */
130 int io_error; /* IO error in completion path */ 131 int io_error; /* IO error in completion path */
131 unsigned long refcount; /* direct_io_worker() and bios */ 132 unsigned long refcount; /* direct_io_worker() and bios */
132 struct bio *bio_list; /* singly linked via bi_private */ 133 struct bio *bio_list; /* singly linked via bi_private */
@@ -141,7 +142,10 @@ struct dio {
141 * allocation time. Don't add new fields after pages[] unless you 142 * allocation time. Don't add new fields after pages[] unless you
142 * wish that they not be zeroed. 143 * wish that they not be zeroed.
143 */ 144 */
144 struct page *pages[DIO_PAGES]; /* page buffer */ 145 union {
146 struct page *pages[DIO_PAGES]; /* page buffer */
147 struct work_struct complete_work;/* deferred AIO completion */
148 };
145} ____cacheline_aligned_in_smp; 149} ____cacheline_aligned_in_smp;
146 150
147static struct kmem_cache *dio_cache __read_mostly; 151static struct kmem_cache *dio_cache __read_mostly;
@@ -221,16 +225,16 @@ static inline struct page *dio_get_page(struct dio *dio,
221 * dio_complete() - called when all DIO BIO I/O has been completed 225 * dio_complete() - called when all DIO BIO I/O has been completed
222 * @offset: the byte offset in the file of the completed operation 226 * @offset: the byte offset in the file of the completed operation
223 * 227 *
224 * This releases locks as dictated by the locking type, lets interested parties 228 * This drops i_dio_count, lets interested parties know that a DIO operation
225 * know that a DIO operation has completed, and calculates the resulting return 229 * has completed, and calculates the resulting return code for the operation.
226 * code for the operation.
227 * 230 *
228 * It lets the filesystem know if it registered an interest earlier via 231 * It lets the filesystem know if it registered an interest earlier via
229 * get_block. Pass the private field of the map buffer_head so that 232 * get_block. Pass the private field of the map buffer_head so that
230 * filesystems can use it to hold additional state between get_block calls and 233 * filesystems can use it to hold additional state between get_block calls and
231 * dio_complete. 234 * dio_complete.
232 */ 235 */
233static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) 236static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
237 bool is_async)
234{ 238{
235 ssize_t transferred = 0; 239 ssize_t transferred = 0;
236 240
@@ -258,19 +262,36 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
258 if (ret == 0) 262 if (ret == 0)
259 ret = transferred; 263 ret = transferred;
260 264
261 if (dio->end_io && dio->result) { 265 if (dio->end_io && dio->result)
262 dio->end_io(dio->iocb, offset, transferred, 266 dio->end_io(dio->iocb, offset, transferred, dio->private);
263 dio->private, ret, is_async); 267
264 } else { 268 inode_dio_done(dio->inode);
265 inode_dio_done(dio->inode); 269 if (is_async) {
266 if (is_async) 270 if (dio->rw & WRITE) {
267 aio_complete(dio->iocb, ret, 0); 271 int err;
272
273 err = generic_write_sync(dio->iocb->ki_filp, offset,
274 transferred);
275 if (err < 0 && ret > 0)
276 ret = err;
277 }
278
279 aio_complete(dio->iocb, ret, 0);
268 } 280 }
269 281
282 kmem_cache_free(dio_cache, dio);
270 return ret; 283 return ret;
271} 284}
272 285
286static void dio_aio_complete_work(struct work_struct *work)
287{
288 struct dio *dio = container_of(work, struct dio, complete_work);
289
290 dio_complete(dio, dio->iocb->ki_pos, 0, true);
291}
292
273static int dio_bio_complete(struct dio *dio, struct bio *bio); 293static int dio_bio_complete(struct dio *dio, struct bio *bio);
294
274/* 295/*
275 * Asynchronous IO callback. 296 * Asynchronous IO callback.
276 */ 297 */
@@ -290,8 +311,13 @@ static void dio_bio_end_aio(struct bio *bio, int error)
290 spin_unlock_irqrestore(&dio->bio_lock, flags); 311 spin_unlock_irqrestore(&dio->bio_lock, flags);
291 312
292 if (remaining == 0) { 313 if (remaining == 0) {
293 dio_complete(dio, dio->iocb->ki_pos, 0, true); 314 if (dio->result && dio->defer_completion) {
294 kmem_cache_free(dio_cache, dio); 315 INIT_WORK(&dio->complete_work, dio_aio_complete_work);
316 queue_work(dio->inode->i_sb->s_dio_done_wq,
317 &dio->complete_work);
318 } else {
319 dio_complete(dio, dio->iocb->ki_pos, 0, true);
320 }
295 } 321 }
296} 322}
297 323
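
dio_bio_end_aio() runs in bio completion context, where sleeping is forbidden, yet finishing an AIO O_(D)SYNC write now requires a generic_write_sync() that can sleep; hence the punt to s_dio_done_wq via the work_struct that shares space with the no-longer-needed pages[] array. A rough userspace analogue of "complete inline unless completion must sleep, else hand off to a worker"; all names here are hypothetical:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct request {
		struct request *next;
		bool needs_sync;	/* cf. dio->defer_completion */
	};

	static struct request *work_list;
	static pthread_mutex_t work_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t work_cv = PTHREAD_COND_INITIALIZER;

	static void complete_request(struct request *r)
	{
		(void)r;	/* ack the submitter, free the request, ... */
	}

	/* completion context: must not block */
	static void on_io_done(struct request *r)
	{
		if (r->needs_sync) {	/* finishing would sleep: punt */
			pthread_mutex_lock(&work_lock);
			r->next = work_list;
			work_list = r;
			pthread_cond_signal(&work_cv);
			pthread_mutex_unlock(&work_lock);
		} else {
			complete_request(r);
		}
	}

	/* worker thread: free to sleep, e.g. in an fsync()-like flush */
	static void *worker(void *arg)
	{
		(void)arg;
		for (;;) {
			struct request *r;

			pthread_mutex_lock(&work_lock);
			while (!work_list)
				pthread_cond_wait(&work_cv, &work_lock);
			r = work_list;
			work_list = r->next;
			pthread_mutex_unlock(&work_lock);
			/* blocking sync would happen here, then: */
			complete_request(r);
		}
		return NULL;
	}
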
@@ -511,6 +537,42 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
511} 537}
512 538
513/* 539/*
540 * Create workqueue for deferred direct IO completions. We allocate the
541 * workqueue when it's first needed. This avoids creating the workqueue for
542 * filesystems that don't need it and also allows us to create the workqueue
543 * late enough so that we can include s_id in the name of the workqueue.
544 */
545static int sb_init_dio_done_wq(struct super_block *sb)
546{
547 struct workqueue_struct *old;
548 struct workqueue_struct *wq = alloc_workqueue("dio/%s",
549 WQ_MEM_RECLAIM, 0,
550 sb->s_id);
551 if (!wq)
552 return -ENOMEM;
553 /*
554 * This has to be atomic as more DIOs can race to create the workqueue
555 */
556 old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
557 /* Someone created workqueue before us? Free ours... */
558 if (old)
559 destroy_workqueue(wq);
560 return 0;
561}
562
563static int dio_set_defer_completion(struct dio *dio)
564{
565 struct super_block *sb = dio->inode->i_sb;
566
567 if (dio->defer_completion)
568 return 0;
569 dio->defer_completion = true;
570 if (!sb->s_dio_done_wq)
571 return sb_init_dio_done_wq(sb);
572 return 0;
573}
574
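
sb_init_dio_done_wq() uses the standard lazy, lock-free publish idiom: any number of racers may build a candidate object, exactly one cmpxchg(NULL, new) wins, and every loser destroys its own copy. The same shape in portable C11 atomics, with a hypothetical stand-in object rather than the kernel workqueue API:

	#include <stdatomic.h>
	#include <stdlib.h>

	struct wq { int unused; };

	static struct wq *_Atomic done_wq;	/* cf. sb->s_dio_done_wq */

	static struct wq *make_wq(void)
	{
		return malloc(sizeof(struct wq));
	}

	static int init_done_wq(void)
	{
		struct wq *wq, *expected = NULL;

		if (atomic_load(&done_wq))
			return 0;		/* common case: already set up */

		wq = make_wq();
		if (!wq)
			return -1;		/* -ENOMEM */

		/* one winner installs; every loser frees its own candidate */
		if (!atomic_compare_exchange_strong(&done_wq, &expected, wq))
			free(wq);
		return 0;
	}
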
575/*
514 * Call into the fs to map some more disk blocks. We record the current number 576 * Call into the fs to map some more disk blocks. We record the current number
515 * of available blocks at sdio->blocks_available. These are in units of the 577 * of available blocks at sdio->blocks_available. These are in units of the
516 * fs blocksize, (1 << inode->i_blkbits). 578 * fs blocksize, (1 << inode->i_blkbits).
@@ -581,6 +643,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
581 643
582 /* Store for completion */ 644 /* Store for completion */
583 dio->private = map_bh->b_private; 645 dio->private = map_bh->b_private;
646
647 if (ret == 0 && buffer_defer_completion(map_bh))
648 ret = dio_set_defer_completion(dio);
584 } 649 }
585 return ret; 650 return ret;
586} 651}
@@ -1129,11 +1194,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1129 } 1194 }
1130 1195
1131 /* 1196 /*
1132 * Will be decremented at I/O completion time.
1133 */
1134 atomic_inc(&inode->i_dio_count);
1135
1136 /*
1137 * For file extending writes updating i_size before data 1197 * For file extending writes updating i_size before data
1138 * writeouts complete can expose uninitialized blocks. So 1198 * writeouts complete can expose uninitialized blocks. So
1139 * even for AIO, we need to wait for i/o to complete before 1199 * even for AIO, we need to wait for i/o to complete before
@@ -1141,11 +1201,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1141 */ 1201 */
1142 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && 1202 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
1143 (end > i_size_read(inode))); 1203 (end > i_size_read(inode)));
1144
1145 retval = 0;
1146
1147 dio->inode = inode; 1204 dio->inode = inode;
1148 dio->rw = rw; 1205 dio->rw = rw;
1206
1207 /*
1208 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
1209 * so that we can call ->fsync.
1210 */
1211 if (dio->is_async && (rw & WRITE) &&
1212 ((iocb->ki_filp->f_flags & O_DSYNC) ||
1213 IS_SYNC(iocb->ki_filp->f_mapping->host))) {
1214 retval = dio_set_defer_completion(dio);
1215 if (retval) {
1216 /*
1217 * We grab i_mutex only for reads so we don't have
1218 * to release it here
1219 */
1220 kmem_cache_free(dio_cache, dio);
1221 goto out;
1222 }
1223 }
1224
1225 /*
1226 * Will be decremented at I/O completion time.
1227 */
1228 atomic_inc(&inode->i_dio_count);
1229
1230 retval = 0;
1149 sdio.blkbits = blkbits; 1231 sdio.blkbits = blkbits;
1150 sdio.blkfactor = i_blkbits - blkbits; 1232 sdio.blkfactor = i_blkbits - blkbits;
1151 sdio.block_in_file = offset >> blkbits; 1233 sdio.block_in_file = offset >> blkbits;
@@ -1269,7 +1351,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1269 1351
1270 if (drop_refcount(dio) == 0) { 1352 if (drop_refcount(dio) == 0) {
1271 retval = dio_complete(dio, offset, retval, false); 1353 retval = dio_complete(dio, offset, retval, false);
1272 kmem_cache_free(dio_cache, dio);
1273 } else 1354 } else
1274 BUG_ON(retval != -EIOCBQUEUED); 1355 BUG_ON(retval != -EIOCBQUEUED);
1275 1356
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 27a6ba9aaeec..0e90f0c91b93 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -267,10 +267,7 @@ void dlm_callback_work(struct work_struct *work)
267int dlm_callback_start(struct dlm_ls *ls) 267int dlm_callback_start(struct dlm_ls *ls)
268{ 268{
269 ls->ls_callback_wq = alloc_workqueue("dlm_callback", 269 ls->ls_callback_wq = alloc_workqueue("dlm_callback",
270 WQ_UNBOUND | 270 WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
271 WQ_MEM_RECLAIM |
272 WQ_NON_REENTRANT,
273 0);
274 if (!ls->ls_callback_wq) { 271 if (!ls->ls_callback_wq) {
275 log_print("can't start dlm_callback workqueue"); 272 log_print("can't start dlm_callback workqueue");
276 return -ENOMEM; 273 return -ENOMEM;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 812149119fa3..142e21655eed 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -493,7 +493,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
493{ 493{
494 struct dlm_user_proc *proc = file->private_data; 494 struct dlm_user_proc *proc = file->private_data;
495 struct dlm_write_request *kbuf; 495 struct dlm_write_request *kbuf;
496 sigset_t tmpsig, allsigs;
497 int error; 496 int error;
498 497
499#ifdef CONFIG_COMPAT 498#ifdef CONFIG_COMPAT
@@ -557,9 +556,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
557 goto out_free; 556 goto out_free;
558 } 557 }
559 558
560 sigfillset(&allsigs);
561 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
562
563 error = -EINVAL; 559 error = -EINVAL;
564 560
565 switch (kbuf->cmd) 561 switch (kbuf->cmd)
@@ -567,7 +563,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
567 case DLM_USER_LOCK: 563 case DLM_USER_LOCK:
568 if (!proc) { 564 if (!proc) {
569 log_print("no locking on control device"); 565 log_print("no locking on control device");
570 goto out_sig; 566 goto out_free;
571 } 567 }
572 error = device_user_lock(proc, &kbuf->i.lock); 568 error = device_user_lock(proc, &kbuf->i.lock);
573 break; 569 break;
@@ -575,7 +571,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
575 case DLM_USER_UNLOCK: 571 case DLM_USER_UNLOCK:
576 if (!proc) { 572 if (!proc) {
577 log_print("no locking on control device"); 573 log_print("no locking on control device");
578 goto out_sig; 574 goto out_free;
579 } 575 }
580 error = device_user_unlock(proc, &kbuf->i.lock); 576 error = device_user_unlock(proc, &kbuf->i.lock);
581 break; 577 break;
@@ -583,7 +579,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
583 case DLM_USER_DEADLOCK: 579 case DLM_USER_DEADLOCK:
584 if (!proc) { 580 if (!proc) {
585 log_print("no locking on control device"); 581 log_print("no locking on control device");
586 goto out_sig; 582 goto out_free;
587 } 583 }
588 error = device_user_deadlock(proc, &kbuf->i.lock); 584 error = device_user_deadlock(proc, &kbuf->i.lock);
589 break; 585 break;
@@ -591,7 +587,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
591 case DLM_USER_CREATE_LOCKSPACE: 587 case DLM_USER_CREATE_LOCKSPACE:
592 if (proc) { 588 if (proc) {
593 log_print("create/remove only on control device"); 589 log_print("create/remove only on control device");
594 goto out_sig; 590 goto out_free;
595 } 591 }
596 error = device_create_lockspace(&kbuf->i.lspace); 592 error = device_create_lockspace(&kbuf->i.lspace);
597 break; 593 break;
@@ -599,7 +595,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
599 case DLM_USER_REMOVE_LOCKSPACE: 595 case DLM_USER_REMOVE_LOCKSPACE:
600 if (proc) { 596 if (proc) {
601 log_print("create/remove only on control device"); 597 log_print("create/remove only on control device");
602 goto out_sig; 598 goto out_free;
603 } 599 }
604 error = device_remove_lockspace(&kbuf->i.lspace); 600 error = device_remove_lockspace(&kbuf->i.lspace);
605 break; 601 break;
@@ -607,7 +603,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
607 case DLM_USER_PURGE: 603 case DLM_USER_PURGE:
608 if (!proc) { 604 if (!proc) {
609 log_print("no locking on control device"); 605 log_print("no locking on control device");
610 goto out_sig; 606 goto out_free;
611 } 607 }
612 error = device_user_purge(proc, &kbuf->i.purge); 608 error = device_user_purge(proc, &kbuf->i.purge);
613 break; 609 break;
@@ -617,8 +613,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
617 kbuf->cmd); 613 kbuf->cmd);
618 } 614 }
619 615
620 out_sig:
621 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
622 out_free: 616 out_free:
623 kfree(kbuf); 617 kfree(kbuf);
624 return error; 618 return error;
@@ -659,15 +653,11 @@ static int device_close(struct inode *inode, struct file *file)
659{ 653{
660 struct dlm_user_proc *proc = file->private_data; 654 struct dlm_user_proc *proc = file->private_data;
661 struct dlm_ls *ls; 655 struct dlm_ls *ls;
662 sigset_t tmpsig, allsigs;
663 656
664 ls = dlm_find_lockspace_local(proc->lockspace); 657 ls = dlm_find_lockspace_local(proc->lockspace);
665 if (!ls) 658 if (!ls)
666 return -ENOENT; 659 return -ENOENT;
667 660
668 sigfillset(&allsigs);
669 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
670
671 set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags); 661 set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
672 662
673 dlm_clear_proc_locks(ls, proc); 663 dlm_clear_proc_locks(ls, proc);
@@ -685,8 +675,6 @@ static int device_close(struct inode *inode, struct file *file)
685 /* FIXME: AUTOFREE: if this ls is no longer used do 675 /* FIXME: AUTOFREE: if this ls is no longer used do
686 device_remove_lockspace() */ 676 device_remove_lockspace() */
687 677
688 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
689
690 return 0; 678 return 0;
691} 679}
692 680
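
The dlm/user.c hunks delete a block-all-signals bracket around
device_write() and device_close(). The bracket itself is plain POSIX;
this userspace sketch shows the save/block/restore idiom that was
removed, with do_work() as a placeholder for the protected region.

#include <signal.h>
#include <stdio.h>

static void do_work(void)
{
        puts("working with all catchable signals blocked");
}

int main(void)
{
        sigset_t allsigs, tmpsig;

        sigfillset(&allsigs);
        if (sigprocmask(SIG_BLOCK, &allsigs, &tmpsig) != 0)
                return 1;

        do_work();

        /* SIG_SETMASK restores the exact saved mask, not a subset. */
        sigprocmask(SIG_SETMASK, &tmpsig, NULL);
        return 0;
}
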
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index c00e055b6282..9fd702f5bfb2 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -44,6 +44,7 @@ static void drop_slab(void)
44 .gfp_mask = GFP_KERNEL, 44 .gfp_mask = GFP_KERNEL,
45 }; 45 };
46 46
47 nodes_setall(shrink.nodes_to_scan);
47 do { 48 do {
48 nr_objects = shrink_slab(&shrink, 1000, 1000); 49 nr_objects = shrink_slab(&shrink, 1000, 1000);
49 } while (nr_objects > 10); 50 } while (nr_objects > 10);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index d10757635b9c..c88e355f7635 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -609,39 +609,35 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
609 char *full_alg_name; 609 char *full_alg_name;
610 int rc = -EINVAL; 610 int rc = -EINVAL;
611 611
612 if (!crypt_stat->cipher) {
613 ecryptfs_printk(KERN_ERR, "No cipher specified\n");
614 goto out;
615 }
616 ecryptfs_printk(KERN_DEBUG, 612 ecryptfs_printk(KERN_DEBUG,
617 "Initializing cipher [%s]; strlen = [%d]; " 613 "Initializing cipher [%s]; strlen = [%d]; "
618 "key_size_bits = [%zd]\n", 614 "key_size_bits = [%zd]\n",
619 crypt_stat->cipher, (int)strlen(crypt_stat->cipher), 615 crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
620 crypt_stat->key_size << 3); 616 crypt_stat->key_size << 3);
617 mutex_lock(&crypt_stat->cs_tfm_mutex);
621 if (crypt_stat->tfm) { 618 if (crypt_stat->tfm) {
622 rc = 0; 619 rc = 0;
623 goto out; 620 goto out_unlock;
624 } 621 }
625 mutex_lock(&crypt_stat->cs_tfm_mutex);
626 rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, 622 rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name,
627 crypt_stat->cipher, "cbc"); 623 crypt_stat->cipher, "cbc");
628 if (rc) 624 if (rc)
629 goto out_unlock; 625 goto out_unlock;
630 crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0); 626 crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0);
631 kfree(full_alg_name);
632 if (IS_ERR(crypt_stat->tfm)) { 627 if (IS_ERR(crypt_stat->tfm)) {
633 rc = PTR_ERR(crypt_stat->tfm); 628 rc = PTR_ERR(crypt_stat->tfm);
634 crypt_stat->tfm = NULL; 629 crypt_stat->tfm = NULL;
635 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " 630 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
636 "Error initializing cipher [%s]\n", 631 "Error initializing cipher [%s]\n",
637 crypt_stat->cipher); 632 full_alg_name);
638 goto out_unlock; 633 goto out_free;
639 } 634 }
640 crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); 635 crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
641 rc = 0; 636 rc = 0;
637out_free:
638 kfree(full_alg_name);
642out_unlock: 639out_unlock:
643 mutex_unlock(&crypt_stat->cs_tfm_mutex); 640 mutex_unlock(&crypt_stat->cs_tfm_mutex);
644out:
645 return rc; 641 return rc;
646} 642}
647 643
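
The ecryptfs fix above does two things: it takes cs_tfm_mutex before
testing crypt_stat->tfm instead of after, and it keeps full_alg_name
alive until the error message that wants to print it, freeing it on
every path through one label. A userspace sketch of the second part,
using GNU asprintf() as a stand-in allocator:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Keep the allocated name valid until after the message that prints
 * it, then free it on every path through a single exit point. */
static int init_ctx(const char *cipher)
{
        char *full_alg_name;
        int rc;

        if (asprintf(&full_alg_name, "cbc(%s)", cipher) < 0)
                return -1;

        rc = strcmp(full_alg_name, "cbc(aes)") == 0 ? 0 : -1;
        if (rc)
                fprintf(stderr, "error initializing cipher [%s]\n",
                        full_alg_name);         /* name still valid */

        free(full_alg_name);
        return rc;
}

int main(void)
{
        return init_ctx("aes");
}
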
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9ad17b15b454..473e09da7d02 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -740,6 +740,7 @@ static void ep_free(struct eventpoll *ep)
740 epi = rb_entry(rbp, struct epitem, rbn); 740 epi = rb_entry(rbp, struct epitem, rbn);
741 741
742 ep_unregister_pollwait(ep, epi); 742 ep_unregister_pollwait(ep, epi);
743 cond_resched();
743 } 744 }
744 745
745 /* 746 /*
@@ -754,6 +755,7 @@ static void ep_free(struct eventpoll *ep)
754 while ((rbp = rb_first(&ep->rbr)) != NULL) { 755 while ((rbp = rb_first(&ep->rbr)) != NULL) {
755 epi = rb_entry(rbp, struct epitem, rbn); 756 epi = rb_entry(rbp, struct epitem, rbn);
756 ep_remove(ep, epi); 757 ep_remove(ep, epi);
758 cond_resched();
757 } 759 }
758 mutex_unlock(&ep->mtx); 760 mutex_unlock(&ep->mtx);
759 761
@@ -1792,7 +1794,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1792{ 1794{
1793 int error; 1795 int error;
1794 int did_lock_epmutex = 0; 1796 int did_lock_epmutex = 0;
1795 struct file *file, *tfile; 1797 struct fd f, tf;
1796 struct eventpoll *ep; 1798 struct eventpoll *ep;
1797 struct epitem *epi; 1799 struct epitem *epi;
1798 struct epoll_event epds; 1800 struct epoll_event epds;
@@ -1802,20 +1804,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1802 copy_from_user(&epds, event, sizeof(struct epoll_event))) 1804 copy_from_user(&epds, event, sizeof(struct epoll_event)))
1803 goto error_return; 1805 goto error_return;
1804 1806
1805 /* Get the "struct file *" for the eventpoll file */
1806 error = -EBADF; 1807 error = -EBADF;
1807 file = fget(epfd); 1808 f = fdget(epfd);
1808 if (!file) 1809 if (!f.file)
1809 goto error_return; 1810 goto error_return;
1810 1811
1811 /* Get the "struct file *" for the target file */ 1812 /* Get the "struct file *" for the target file */
1812 tfile = fget(fd); 1813 tf = fdget(fd);
1813 if (!tfile) 1814 if (!tf.file)
1814 goto error_fput; 1815 goto error_fput;
1815 1816
1816 /* The target file descriptor must support poll */ 1817 /* The target file descriptor must support poll */
1817 error = -EPERM; 1818 error = -EPERM;
1818 if (!tfile->f_op || !tfile->f_op->poll) 1819 if (!tf.file->f_op || !tf.file->f_op->poll)
1819 goto error_tgt_fput; 1820 goto error_tgt_fput;
1820 1821
1821 /* Check if EPOLLWAKEUP is allowed */ 1822 /* Check if EPOLLWAKEUP is allowed */
@@ -1828,14 +1829,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1828 * adding an epoll file descriptor inside itself. 1829 * adding an epoll file descriptor inside itself.
1829 */ 1830 */
1830 error = -EINVAL; 1831 error = -EINVAL;
1831 if (file == tfile || !is_file_epoll(file)) 1832 if (f.file == tf.file || !is_file_epoll(f.file))
1832 goto error_tgt_fput; 1833 goto error_tgt_fput;
1833 1834
1834 /* 1835 /*
1835 * At this point it is safe to assume that the "private_data" contains 1836 * At this point it is safe to assume that the "private_data" contains
1836 * our own data structure. 1837 * our own data structure.
1837 */ 1838 */
1838 ep = file->private_data; 1839 ep = f.file->private_data;
1839 1840
1840 /* 1841 /*
1841 * When we insert an epoll file descriptor, inside another epoll file 1842 * When we insert an epoll file descriptor, inside another epoll file
@@ -1854,14 +1855,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1854 did_lock_epmutex = 1; 1855 did_lock_epmutex = 1;
1855 } 1856 }
1856 if (op == EPOLL_CTL_ADD) { 1857 if (op == EPOLL_CTL_ADD) {
1857 if (is_file_epoll(tfile)) { 1858 if (is_file_epoll(tf.file)) {
1858 error = -ELOOP; 1859 error = -ELOOP;
1859 if (ep_loop_check(ep, tfile) != 0) { 1860 if (ep_loop_check(ep, tf.file) != 0) {
1860 clear_tfile_check_list(); 1861 clear_tfile_check_list();
1861 goto error_tgt_fput; 1862 goto error_tgt_fput;
1862 } 1863 }
1863 } else 1864 } else
1864 list_add(&tfile->f_tfile_llink, &tfile_check_list); 1865 list_add(&tf.file->f_tfile_llink, &tfile_check_list);
1865 } 1866 }
1866 1867
1867 mutex_lock_nested(&ep->mtx, 0); 1868 mutex_lock_nested(&ep->mtx, 0);
@@ -1871,14 +1872,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1871 * above, we can be sure to be able to use the item looked up by 1872 * above, we can be sure to be able to use the item looked up by
1872 * ep_find() till we release the mutex. 1873 * ep_find() till we release the mutex.
1873 */ 1874 */
1874 epi = ep_find(ep, tfile, fd); 1875 epi = ep_find(ep, tf.file, fd);
1875 1876
1876 error = -EINVAL; 1877 error = -EINVAL;
1877 switch (op) { 1878 switch (op) {
1878 case EPOLL_CTL_ADD: 1879 case EPOLL_CTL_ADD:
1879 if (!epi) { 1880 if (!epi) {
1880 epds.events |= POLLERR | POLLHUP; 1881 epds.events |= POLLERR | POLLHUP;
1881 error = ep_insert(ep, &epds, tfile, fd); 1882 error = ep_insert(ep, &epds, tf.file, fd);
1882 } else 1883 } else
1883 error = -EEXIST; 1884 error = -EEXIST;
1884 clear_tfile_check_list(); 1885 clear_tfile_check_list();
@@ -1903,9 +1904,9 @@ error_tgt_fput:
1903 if (did_lock_epmutex) 1904 if (did_lock_epmutex)
1904 mutex_unlock(&epmutex); 1905 mutex_unlock(&epmutex);
1905 1906
1906 fput(tfile); 1907 fdput(tf);
1907error_fput: 1908error_fput:
1908 fput(file); 1909 fdput(f);
1909error_return: 1910error_return:
1910 1911
1911 return error; 1912 return error;
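
The epoll_ctl() hunks convert fget()/fput() to fdget()/fdput(). The
point of struct fd is that the lookup only takes a file reference when
it has to (shared descriptor table) and records whether one is owed, so
fdput() knows whether to drop it. The standalone model below mimics
that ownership rule only; struct toy_fd is not the kernel's definition.

#include <stdbool.h>
#include <stdio.h>

struct toy_file { int refcount; };
struct toy_fd { struct toy_file *file; bool need_put; };

static struct toy_fd toy_fdget(struct toy_file *f, bool table_shared)
{
        struct toy_fd fd = { f, false };

        if (f && table_shared) {        /* only then take a reference */
                f->refcount++;
                fd.need_put = true;
        }
        return fd;
}

static void toy_fdput(struct toy_fd fd)
{
        if (fd.need_put)
                fd.file->refcount--;
}

int main(void)
{
        struct toy_file file = { .refcount = 1 };
        struct toy_fd fd = toy_fdget(&file, true);

        printf("refcount while held: %d\n", file.refcount);
        toy_fdput(fd);
        printf("refcount after put: %d\n", file.refcount);
        return 0;
}
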
diff --git a/fs/exec.c b/fs/exec.c
index fd774c7cb483..8875dd10ae7a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -74,6 +74,8 @@ static DEFINE_RWLOCK(binfmt_lock);
74void __register_binfmt(struct linux_binfmt * fmt, int insert) 74void __register_binfmt(struct linux_binfmt * fmt, int insert)
75{ 75{
76 BUG_ON(!fmt); 76 BUG_ON(!fmt);
77 if (WARN_ON(!fmt->load_binary))
78 return;
77 write_lock(&binfmt_lock); 79 write_lock(&binfmt_lock);
78 insert ? list_add(&fmt->lh, &formats) : 80 insert ? list_add(&fmt->lh, &formats) :
79 list_add_tail(&fmt->lh, &formats); 81 list_add_tail(&fmt->lh, &formats);
@@ -266,7 +268,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
266 BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); 268 BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
267 vma->vm_end = STACK_TOP_MAX; 269 vma->vm_end = STACK_TOP_MAX;
268 vma->vm_start = vma->vm_end - PAGE_SIZE; 270 vma->vm_start = vma->vm_end - PAGE_SIZE;
269 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; 271 vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
270 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 272 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
271 INIT_LIST_HEAD(&vma->anon_vma_chain); 273 INIT_LIST_HEAD(&vma->anon_vma_chain);
272 274
@@ -1365,18 +1367,18 @@ out:
1365} 1367}
1366EXPORT_SYMBOL(remove_arg_zero); 1368EXPORT_SYMBOL(remove_arg_zero);
1367 1369
1370#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1368/* 1371/*
1369 * cycle the list of binary formats handler, until one recognizes the image 1372 * cycle the list of binary formats handler, until one recognizes the image
1370 */ 1373 */
1371int search_binary_handler(struct linux_binprm *bprm) 1374int search_binary_handler(struct linux_binprm *bprm)
1372{ 1375{
1373 unsigned int depth = bprm->recursion_depth; 1376 bool need_retry = IS_ENABLED(CONFIG_MODULES);
1374 int try,retval;
1375 struct linux_binfmt *fmt; 1377 struct linux_binfmt *fmt;
1376 pid_t old_pid, old_vpid; 1378 int retval;
1377 1379
1378 /* This allows 4 levels of binfmt rewrites before failing hard. */ 1380 /* This allows 4 levels of binfmt rewrites before failing hard. */
1379 if (depth > 5) 1381 if (bprm->recursion_depth > 5)
1380 return -ELOOP; 1382 return -ELOOP;
1381 1383
1382 retval = security_bprm_check(bprm); 1384 retval = security_bprm_check(bprm);
@@ -1387,71 +1389,67 @@ int search_binary_handler(struct linux_binprm *bprm)
1387 if (retval) 1389 if (retval)
1388 return retval; 1390 return retval;
1389 1391
1392 retval = -ENOENT;
1393 retry:
1394 read_lock(&binfmt_lock);
1395 list_for_each_entry(fmt, &formats, lh) {
1396 if (!try_module_get(fmt->module))
1397 continue;
1398 read_unlock(&binfmt_lock);
1399 bprm->recursion_depth++;
1400 retval = fmt->load_binary(bprm);
1401 bprm->recursion_depth--;
1402 if (retval >= 0 || retval != -ENOEXEC ||
1403 bprm->mm == NULL || bprm->file == NULL) {
1404 put_binfmt(fmt);
1405 return retval;
1406 }
1407 read_lock(&binfmt_lock);
1408 put_binfmt(fmt);
1409 }
1410 read_unlock(&binfmt_lock);
1411
1412 if (need_retry && retval == -ENOEXEC) {
1413 if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
1414 printable(bprm->buf[2]) && printable(bprm->buf[3]))
1415 return retval;
1416 if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
1417 return retval;
1418 need_retry = false;
1419 goto retry;
1420 }
1421
1422 return retval;
1423}
1424EXPORT_SYMBOL(search_binary_handler);
1425
1426static int exec_binprm(struct linux_binprm *bprm)
1427{
1428 pid_t old_pid, old_vpid;
1429 int ret;
1430
1390 /* Need to fetch pid before load_binary changes it */ 1431 /* Need to fetch pid before load_binary changes it */
1391 old_pid = current->pid; 1432 old_pid = current->pid;
1392 rcu_read_lock(); 1433 rcu_read_lock();
1393 old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); 1434 old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1394 rcu_read_unlock(); 1435 rcu_read_unlock();
1395 1436
1396 retval = -ENOENT; 1437 ret = search_binary_handler(bprm);
1397 for (try=0; try<2; try++) { 1438 if (ret >= 0) {
1398 read_lock(&binfmt_lock); 1439 trace_sched_process_exec(current, old_pid, bprm);
1399 list_for_each_entry(fmt, &formats, lh) { 1440 ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1400 int (*fn)(struct linux_binprm *) = fmt->load_binary; 1441 current->did_exec = 1;
1401 if (!fn) 1442 proc_exec_connector(current);
1402 continue; 1443
1403 if (!try_module_get(fmt->module)) 1444 if (bprm->file) {
1404 continue; 1445 allow_write_access(bprm->file);
1405 read_unlock(&binfmt_lock); 1446 fput(bprm->file);
1406 bprm->recursion_depth = depth + 1; 1447 bprm->file = NULL; /* to catch use-after-free */
1407 retval = fn(bprm);
1408 bprm->recursion_depth = depth;
1409 if (retval >= 0) {
1410 if (depth == 0) {
1411 trace_sched_process_exec(current, old_pid, bprm);
1412 ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1413 }
1414 put_binfmt(fmt);
1415 allow_write_access(bprm->file);
1416 if (bprm->file)
1417 fput(bprm->file);
1418 bprm->file = NULL;
1419 current->did_exec = 1;
1420 proc_exec_connector(current);
1421 return retval;
1422 }
1423 read_lock(&binfmt_lock);
1424 put_binfmt(fmt);
1425 if (retval != -ENOEXEC || bprm->mm == NULL)
1426 break;
1427 if (!bprm->file) {
1428 read_unlock(&binfmt_lock);
1429 return retval;
1430 }
1431 } 1448 }
1432 read_unlock(&binfmt_lock);
1433#ifdef CONFIG_MODULES
1434 if (retval != -ENOEXEC || bprm->mm == NULL) {
1435 break;
1436 } else {
1437#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1438 if (printable(bprm->buf[0]) &&
1439 printable(bprm->buf[1]) &&
1440 printable(bprm->buf[2]) &&
1441 printable(bprm->buf[3]))
1442 break; /* -ENOEXEC */
1443 if (try)
1444 break; /* -ENOEXEC */
1445 request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
1446 }
1447#else
1448 break;
1449#endif
1450 } 1449 }
1451 return retval;
1452}
1453 1450
1454EXPORT_SYMBOL(search_binary_handler); 1451 return ret;
1452}
1455 1453
1456/* 1454/*
1457 * sys_execve() executes a new program. 1455 * sys_execve() executes a new program.
@@ -1541,7 +1539,7 @@ static int do_execve_common(const char *filename,
1541 if (retval < 0) 1539 if (retval < 0)
1542 goto out; 1540 goto out;
1543 1541
1544 retval = search_binary_handler(bprm); 1542 retval = exec_binprm(bprm);
1545 if (retval < 0) 1543 if (retval < 0)
1546 goto out; 1544 goto out;
1547 1545
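
After the refactor above, search_binary_handler() is a single walk over
the registered handlers plus at most one retry, taken only when the
walk ended in -ENOEXEC and loading a module for the magic number might
help; the pid/ptrace bookkeeping moves out into exec_binprm(). A
standalone sketch of just that retry shape, with toy stand-ins for the
handler list and request_module():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int try_handlers(const unsigned char *buf)
{
        /* Pretend the only understood magic is 0x7f 'E' (ELF-ish). */
        if (buf[0] == 0x7f && buf[1] == 'E')
                return 0;
        return -ENOEXEC;
}

static bool request_handler_module(void)
{
        /* Stand-in for request_module("binfmt-%04x", ...). */
        return false;
}

static int search_handlers(const unsigned char *buf)
{
        bool need_retry = true; /* IS_ENABLED(CONFIG_MODULES) analogue */
        int retval;

retry:
        retval = try_handlers(buf);
        if (retval != -ENOEXEC)
                return retval;

        if (need_retry && request_handler_module()) {
                need_retry = false;     /* at most one extra pass */
                goto retry;
        }
        return retval;
}

int main(void)
{
        const unsigned char elfish[4] = { 0x7f, 'E', 'L', 'F' };

        printf("result: %d\n", search_handlers(elfish));
        return 0;
}
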
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2ec8eb1ab269..a52a5d23c30b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
861static void _write_failed(struct inode *inode, loff_t to) 861static void _write_failed(struct inode *inode, loff_t to)
862{ 862{
863 if (to > inode->i_size) 863 if (to > inode->i_size)
864 truncate_pagecache(inode, to, inode->i_size); 864 truncate_pagecache(inode, inode->i_size);
865} 865}
866 866
867int exofs_write_begin(struct file *file, struct address_space *mapping, 867int exofs_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 293bc2e47a73..a235f0016889 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -231,7 +231,7 @@ static int filldir_one(void * __buf, const char * name, int len,
231 int result = 0; 231 int result = 0;
232 232
233 buf->sequence++; 233 buf->sequence++;
234 if (buf->ino == ino) { 234 if (buf->ino == ino && len <= NAME_MAX) {
235 memcpy(buf->name, name, len); 235 memcpy(buf->name, name, len);
236 buf->name[len] = '\0'; 236 buf->name[len] = '\0';
237 buf->found = 1; 237 buf->found = 1;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0a87bb10998d..c260de6d7b6d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to)
58 struct inode *inode = mapping->host; 58 struct inode *inode = mapping->host;
59 59
60 if (to > inode->i_size) { 60 if (to > inode->i_size) {
61 truncate_pagecache(inode, to, inode->i_size); 61 truncate_pagecache(inode, inode->i_size);
62 ext2_truncate_blocks(inode, inode->i_size); 62 ext2_truncate_blocks(inode, inode->i_size);
63 } 63 }
64} 64}
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index f522425aaa24..bafdd48eefde 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -41,7 +41,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
41 41
42/** 42/**
43 * Check if the given dir-inode refers to an htree-indexed directory 43 * Check if the given dir-inode refers to an htree-indexed directory
44 * (or a directory which chould potentially get coverted to use htree 44 * (or a directory which could potentially get converted to use htree
45 * indexing). 45 * indexing).
46 * 46 *
47 * Return 1 if it is a dx dir, 0 if not 47 * Return 1 if it is a dx dir, 0 if not
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index c47f14750722..c50c76190373 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,6 +27,7 @@
27#include <linux/seq_file.h> 27#include <linux/seq_file.h>
28#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/cleancache.h> 29#include <linux/cleancache.h>
30#include <linux/namei.h>
30 31
31#include <asm/uaccess.h> 32#include <asm/uaccess.h>
32 33
@@ -819,6 +820,7 @@ enum {
819 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 820 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
820 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 821 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
821 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 822 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
823 Opt_journal_path,
822 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 824 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
823 Opt_data_err_abort, Opt_data_err_ignore, 825 Opt_data_err_abort, Opt_data_err_ignore,
824 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 826 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -860,6 +862,7 @@ static const match_table_t tokens = {
860 {Opt_journal_update, "journal=update"}, 862 {Opt_journal_update, "journal=update"},
861 {Opt_journal_inum, "journal=%u"}, 863 {Opt_journal_inum, "journal=%u"},
862 {Opt_journal_dev, "journal_dev=%u"}, 864 {Opt_journal_dev, "journal_dev=%u"},
865 {Opt_journal_path, "journal_path=%s"},
863 {Opt_abort, "abort"}, 866 {Opt_abort, "abort"},
864 {Opt_data_journal, "data=journal"}, 867 {Opt_data_journal, "data=journal"},
865 {Opt_data_ordered, "data=ordered"}, 868 {Opt_data_ordered, "data=ordered"},
@@ -975,6 +978,11 @@ static int parse_options (char *options, struct super_block *sb,
975 int option; 978 int option;
976 kuid_t uid; 979 kuid_t uid;
977 kgid_t gid; 980 kgid_t gid;
981 char *journal_path;
982 struct inode *journal_inode;
983 struct path path;
984 int error;
985
978#ifdef CONFIG_QUOTA 986#ifdef CONFIG_QUOTA
979 int qfmt; 987 int qfmt;
980#endif 988#endif
@@ -1129,6 +1137,41 @@ static int parse_options (char *options, struct super_block *sb,
1129 return 0; 1137 return 0;
1130 *journal_devnum = option; 1138 *journal_devnum = option;
1131 break; 1139 break;
1140 case Opt_journal_path:
1141 if (is_remount) {
1142 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1143 "journal on remount");
1144 return 0;
1145 }
1146
1147 journal_path = match_strdup(&args[0]);
1148 if (!journal_path) {
1149 ext3_msg(sb, KERN_ERR, "error: could not dup "
1150 "journal device string");
1151 return 0;
1152 }
1153
1154 error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
1155 if (error) {
1156 ext3_msg(sb, KERN_ERR, "error: could not find "
1157 "journal device path: error %d", error);
1158 kfree(journal_path);
1159 return 0;
1160 }
1161
1162 journal_inode = path.dentry->d_inode;
1163 if (!S_ISBLK(journal_inode->i_mode)) {
1164 ext3_msg(sb, KERN_ERR, "error: journal path %s "
1165 "is not a block device", journal_path);
1166 path_put(&path);
1167 kfree(journal_path);
1168 return 0;
1169 }
1170
1171 *journal_devnum = new_encode_dev(journal_inode->i_rdev);
1172 path_put(&path);
1173 kfree(journal_path);
1174 break;
1132 case Opt_noload: 1175 case Opt_noload:
1133 set_opt (sbi->s_mount_opt, NOLOAD); 1176 set_opt (sbi->s_mount_opt, NOLOAD);
1134 break; 1177 break;
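
The new journal_path=%s option reduces to: resolve the path, require
that it names a block device, and hand the device number back through
*journal_devnum. A userspace analogue, with stat(2) standing in for
kern_path(); the kernel packs i_rdev with new_encode_dev(), which is
only noted here, not reimplemented.

#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <stdio.h>

static int check_journal_path(const char *path)
{
        struct stat st;

        if (stat(path, &st) != 0) {
                perror("could not find journal device path");
                return -1;
        }
        if (!S_ISBLK(st.st_mode)) {
                fprintf(stderr, "journal path %s is not a block device\n",
                        path);
                return -1;
        }
        printf("journal device %u:%u\n",
               major(st.st_rdev), minor(st.st_rdev));
        return 0;
}

int main(int argc, char **argv)
{
        return argc > 1 ? -check_journal_path(argv[1]) : 2;
}
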
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index ddd715e42a5c..dc5d572ebd6a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -184,6 +184,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
184 struct ext4_sb_info *sbi = EXT4_SB(sb); 184 struct ext4_sb_info *sbi = EXT4_SB(sb);
185 ext4_fsblk_t start, tmp; 185 ext4_fsblk_t start, tmp;
186 int flex_bg = 0; 186 int flex_bg = 0;
187 struct ext4_group_info *grp;
187 188
188 J_ASSERT_BH(bh, buffer_locked(bh)); 189 J_ASSERT_BH(bh, buffer_locked(bh));
189 190
@@ -191,11 +192,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
191 * essentially implementing a per-group read-only flag. */ 192 * essentially implementing a per-group read-only flag. */
192 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { 193 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
193 ext4_error(sb, "Checksum bad for group %u", block_group); 194 ext4_error(sb, "Checksum bad for group %u", block_group);
194 ext4_free_group_clusters_set(sb, gdp, 0); 195 grp = ext4_get_group_info(sb, block_group);
195 ext4_free_inodes_set(sb, gdp, 0); 196 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
196 ext4_itable_unused_set(sb, gdp, 0); 197 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
197 memset(bh->b_data, 0xff, sb->s_blocksize);
198 ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
199 return; 198 return;
200 } 199 }
201 memset(bh->b_data, 0, sb->s_blocksize); 200 memset(bh->b_data, 0, sb->s_blocksize);
@@ -305,7 +304,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
305 */ 304 */
306static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, 305static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
307 struct ext4_group_desc *desc, 306 struct ext4_group_desc *desc,
308 unsigned int block_group, 307 ext4_group_t block_group,
309 struct buffer_head *bh) 308 struct buffer_head *bh)
310{ 309{
311 ext4_grpblk_t offset; 310 ext4_grpblk_t offset;
@@ -352,10 +351,11 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
352 351
353void ext4_validate_block_bitmap(struct super_block *sb, 352void ext4_validate_block_bitmap(struct super_block *sb,
354 struct ext4_group_desc *desc, 353 struct ext4_group_desc *desc,
355 unsigned int block_group, 354 ext4_group_t block_group,
356 struct buffer_head *bh) 355 struct buffer_head *bh)
357{ 356{
358 ext4_fsblk_t blk; 357 ext4_fsblk_t blk;
358 struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
359 359
360 if (buffer_verified(bh)) 360 if (buffer_verified(bh))
361 return; 361 return;
@@ -366,12 +366,14 @@ void ext4_validate_block_bitmap(struct super_block *sb,
366 ext4_unlock_group(sb, block_group); 366 ext4_unlock_group(sb, block_group);
367 ext4_error(sb, "bg %u: block %llu: invalid block bitmap", 367 ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
368 block_group, blk); 368 block_group, blk);
369 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
369 return; 370 return;
370 } 371 }
371 if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, 372 if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
372 desc, bh))) { 373 desc, bh))) {
373 ext4_unlock_group(sb, block_group); 374 ext4_unlock_group(sb, block_group);
374 ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); 375 ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
376 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
375 return; 377 return;
376 } 378 }
377 set_buffer_verified(bh); 379 set_buffer_verified(bh);
@@ -445,7 +447,10 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
445 return bh; 447 return bh;
446verify: 448verify:
447 ext4_validate_block_bitmap(sb, desc, block_group, bh); 449 ext4_validate_block_bitmap(sb, desc, block_group, bh);
448 return bh; 450 if (buffer_verified(bh))
451 return bh;
452 put_bh(bh);
453 return NULL;
449} 454}
450 455
451/* Returns 0 on success, 1 on error */ 456/* Returns 0 on success, 1 on error */
@@ -469,7 +474,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
469 clear_buffer_new(bh); 474 clear_buffer_new(bh);
470 /* Panic or remount fs read-only if block bitmap is invalid */ 475 /* Panic or remount fs read-only if block bitmap is invalid */
471 ext4_validate_block_bitmap(sb, desc, block_group, bh); 476 ext4_validate_block_bitmap(sb, desc, block_group, bh);
472 return 0; 477 /* ...but check for error just in case errors=continue. */
478 return !buffer_verified(bh);
473} 479}
474 480
475struct buffer_head * 481struct buffer_head *
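
The balloc.c change switches the response to a bad group checksum from
zeroing the on-disk counters to setting per-group corrupt bits that the
allocators can test later. A toy sketch of that flag scheme; the bit
numbers match the ext4.h hunk further down, but the helpers here are
simplified and non-atomic, unlike the kernel's set_bit()/test_bit().

#include <stdio.h>

#define BBITMAP_CORRUPT_BIT     2UL     /* block bitmap is bad */
#define IBITMAP_CORRUPT_BIT     3UL     /* inode bitmap is bad */

struct toy_group_info { unsigned long bb_state; };

static void toy_set_bit(unsigned long nr, unsigned long *addr)
{
        *addr |= 1UL << nr;
}

static int toy_test_bit(unsigned long nr, const unsigned long *addr)
{
        return (*addr >> nr) & 1UL;
}

int main(void)
{
        struct toy_group_info grp = { 0 };

        /* On a bad checksum, mark the group instead of zeroing it. */
        toy_set_bit(BBITMAP_CORRUPT_BIT, &grp.bb_state);

        printf("block bitmap corrupt: %d\n",
               toy_test_bit(BBITMAP_CORRUPT_BIT, &grp.bb_state));
        return 0;
}
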
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3c7d288ae94c..680bb3388919 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,7 +33,7 @@ static int ext4_dx_readdir(struct file *, struct dir_context *);
33 33
34/** 34/**
35 * Check if the given dir-inode refers to an htree-indexed directory 35 * Check if the given dir-inode refers to an htree-indexed directory
36 * (or a directory which chould potentially get coverted to use htree 36 * (or a directory which could potentially get converted to use htree
37 * indexing). 37 * indexing).
38 * 38 *
39 * Return 1 if it is a dx dir, 0 if not 39 * Return 1 if it is a dx dir, 0 if not
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0ab26fbf3380..af815ea9d7cc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -180,7 +180,6 @@ struct ext4_map_blocks {
180 * Flags for ext4_io_end->flags 180 * Flags for ext4_io_end->flags
181 */ 181 */
182#define EXT4_IO_END_UNWRITTEN 0x0001 182#define EXT4_IO_END_UNWRITTEN 0x0001
183#define EXT4_IO_END_DIRECT 0x0002
184 183
185/* 184/*
186 * For converting uninitialized extents on a work queue. 'handle' is used for 185 * For converting uninitialized extents on a work queue. 'handle' is used for
@@ -196,8 +195,6 @@ typedef struct ext4_io_end {
196 unsigned int flag; /* unwritten or not */ 195 unsigned int flag; /* unwritten or not */
197 loff_t offset; /* offset in the file */ 196 loff_t offset; /* offset in the file */
198 ssize_t size; /* size of the extent */ 197 ssize_t size; /* size of the extent */
199 struct kiocb *iocb; /* iocb struct for AIO */
200 int result; /* error value for AIO */
201 atomic_t count; /* reference counter */ 198 atomic_t count; /* reference counter */
202} ext4_io_end_t; 199} ext4_io_end_t;
203 200
@@ -561,6 +558,18 @@ enum {
561#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 558#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
562 559
563/* 560/*
561 * The bit position of these flags must not overlap with any of the
562 * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
563 * read_extent_tree_block(), ext4_split_extent_at(),
564 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
565 * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
566 * caching the extents when reading from the extent tree while a
567 * truncate or punch hole operation is in progress.
568 */
569#define EXT4_EX_NOCACHE 0x0400
570#define EXT4_EX_FORCE_CACHE 0x0800
571
572/*
564 * Flags used by ext4_free_blocks 573 * Flags used by ext4_free_blocks
565 */ 574 */
566#define EXT4_FREE_BLOCKS_METADATA 0x0001 575#define EXT4_FREE_BLOCKS_METADATA 0x0001
@@ -569,6 +578,7 @@ enum {
569#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 578#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
570#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 579#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
571#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 580#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
581#define EXT4_FREE_BLOCKS_RESERVE 0x0040
572 582
573/* 583/*
574 * ioctl commands 584 * ioctl commands
@@ -590,6 +600,7 @@ enum {
590#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 600#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
591#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) 601#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
592#define EXT4_IOC_SWAP_BOOT _IO('f', 17) 602#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
603#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
593 604
594#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 605#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
595/* 606/*
@@ -900,11 +911,9 @@ struct ext4_inode_info {
900 * Completed IOs that need unwritten extents handling and don't have 911 * Completed IOs that need unwritten extents handling and don't have
901 * transaction reserved 912 * transaction reserved
902 */ 913 */
903 struct list_head i_unrsv_conversion_list;
904 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 914 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
905 atomic_t i_unwritten; /* Nr. of inflight conversions pending */ 915 atomic_t i_unwritten; /* Nr. of inflight conversions pending */
906 struct work_struct i_rsv_conversion_work; 916 struct work_struct i_rsv_conversion_work;
907 struct work_struct i_unrsv_conversion_work;
908 917
909 spinlock_t i_block_reservation_lock; 918 spinlock_t i_block_reservation_lock;
910 919
@@ -1276,8 +1285,6 @@ struct ext4_sb_info {
1276 struct flex_groups *s_flex_groups; 1285 struct flex_groups *s_flex_groups;
1277 ext4_group_t s_flex_groups_allocated; 1286 ext4_group_t s_flex_groups_allocated;
1278 1287
1279 /* workqueue for unreserved extent convertions (dio) */
1280 struct workqueue_struct *unrsv_conversion_wq;
1281 /* workqueue for reserved extent conversions (buffered io) */ 1288 /* workqueue for reserved extent conversions (buffered io) */
1282 struct workqueue_struct *rsv_conversion_wq; 1289 struct workqueue_struct *rsv_conversion_wq;
1283 1290
@@ -1340,9 +1347,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
1340 struct ext4_io_end *io_end) 1347 struct ext4_io_end *io_end)
1341{ 1348{
1342 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 1349 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
1343 /* Writeback has to have coversion transaction reserved */
1344 WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
1345 !(io_end->flag & EXT4_IO_END_DIRECT));
1346 io_end->flag |= EXT4_IO_END_UNWRITTEN; 1350 io_end->flag |= EXT4_IO_END_UNWRITTEN;
1347 atomic_inc(&EXT4_I(inode)->i_unwritten); 1351 atomic_inc(&EXT4_I(inode)->i_unwritten);
1348 } 1352 }
@@ -1375,6 +1379,7 @@ enum {
1375 nolocking */ 1379 nolocking */
1376 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ 1380 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
1377 EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ 1381 EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
1382 EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
1378}; 1383};
1379 1384
1380#define EXT4_INODE_BIT_FNS(name, field, offset) \ 1385#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1915,7 +1920,7 @@ extern ext4_group_t ext4_get_group_number(struct super_block *sb,
1915 1920
1916extern void ext4_validate_block_bitmap(struct super_block *sb, 1921extern void ext4_validate_block_bitmap(struct super_block *sb,
1917 struct ext4_group_desc *desc, 1922 struct ext4_group_desc *desc,
1918 unsigned int block_group, 1923 ext4_group_t block_group,
1919 struct buffer_head *bh); 1924 struct buffer_head *bh);
1920extern unsigned int ext4_block_group(struct super_block *sb, 1925extern unsigned int ext4_block_group(struct super_block *sb,
1921 ext4_fsblk_t blocknr); 1926 ext4_fsblk_t blocknr);
@@ -2417,16 +2422,32 @@ do { \
2417#define EXT4_FREECLUSTERS_WATERMARK 0 2422#define EXT4_FREECLUSTERS_WATERMARK 0
2418#endif 2423#endif
2419 2424
2425/* Update i_disksize. Requires i_mutex to avoid races with truncate */
2420static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) 2426static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
2421{ 2427{
2422 /* 2428 WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
2423 * XXX: replace with spinlock if seen contended -bzzz 2429 !mutex_is_locked(&inode->i_mutex));
2424 */
2425 down_write(&EXT4_I(inode)->i_data_sem); 2430 down_write(&EXT4_I(inode)->i_data_sem);
2426 if (newsize > EXT4_I(inode)->i_disksize) 2431 if (newsize > EXT4_I(inode)->i_disksize)
2427 EXT4_I(inode)->i_disksize = newsize; 2432 EXT4_I(inode)->i_disksize = newsize;
2428 up_write(&EXT4_I(inode)->i_data_sem); 2433 up_write(&EXT4_I(inode)->i_data_sem);
2429 return ; 2434}
2435
2436/*
2437 * Update i_disksize after writeback has been started. Races with truncate
2438 * are avoided by checking i_size under i_data_sem.
2439 */
2440static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
2441{
2442 loff_t i_size;
2443
2444 down_write(&EXT4_I(inode)->i_data_sem);
2445 i_size = i_size_read(inode);
2446 if (newsize > i_size)
2447 newsize = i_size;
2448 if (newsize > EXT4_I(inode)->i_disksize)
2449 EXT4_I(inode)->i_disksize = newsize;
2450 up_write(&EXT4_I(inode)->i_data_sem);
2430} 2451}
2431 2452
2432struct ext4_group_info { 2453struct ext4_group_info {
@@ -2449,9 +2470,15 @@ struct ext4_group_info {
2449 2470
2450#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 2471#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
2451#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 2472#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
2473#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2
2474#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3
2452 2475
2453#define EXT4_MB_GRP_NEED_INIT(grp) \ 2476#define EXT4_MB_GRP_NEED_INIT(grp) \
2454 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) 2477 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
2478#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \
2479 (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
2480#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \
2481 (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
2455 2482
2456#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ 2483#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
2457 (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) 2484 (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
@@ -2655,6 +2682,12 @@ extern int ext4_check_blockref(const char *, unsigned int,
2655struct ext4_ext_path; 2682struct ext4_ext_path;
2656struct ext4_extent; 2683struct ext4_extent;
2657 2684
2685/*
2686 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
2687 * __le32.
2688 */
2689#define EXT_MAX_BLOCKS 0xffffffff
2690
2658extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 2691extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
2659extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 2692extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
2660extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); 2693extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
@@ -2684,7 +2717,8 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *,
2684 struct ext4_ext_path *, 2717 struct ext4_ext_path *,
2685 struct ext4_extent *, int); 2718 struct ext4_extent *, int);
2686extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 2719extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
2687 struct ext4_ext_path *); 2720 struct ext4_ext_path *,
2721 int flags);
2688extern void ext4_ext_drop_refs(struct ext4_ext_path *); 2722extern void ext4_ext_drop_refs(struct ext4_ext_path *);
2689extern int ext4_ext_check_inode(struct inode *inode); 2723extern int ext4_ext_check_inode(struct inode *inode);
2690extern int ext4_find_delalloc_range(struct inode *inode, 2724extern int ext4_find_delalloc_range(struct inode *inode,
@@ -2693,7 +2727,7 @@ extern int ext4_find_delalloc_range(struct inode *inode,
2693extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); 2727extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
2694extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2728extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2695 __u64 start, __u64 len); 2729 __u64 start, __u64 len);
2696 2730extern int ext4_ext_precache(struct inode *inode);
2697 2731
2698/* move_extent.c */ 2732/* move_extent.c */
2699extern void ext4_double_down_write_data_sem(struct inode *first, 2733extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2716,7 +2750,6 @@ extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
2716extern void ext4_io_submit_init(struct ext4_io_submit *io, 2750extern void ext4_io_submit_init(struct ext4_io_submit *io,
2717 struct writeback_control *wbc); 2751 struct writeback_control *wbc);
2718extern void ext4_end_io_rsv_work(struct work_struct *work); 2752extern void ext4_end_io_rsv_work(struct work_struct *work);
2719extern void ext4_end_io_unrsv_work(struct work_struct *work);
2720extern void ext4_io_submit(struct ext4_io_submit *io); 2753extern void ext4_io_submit(struct ext4_io_submit *io);
2721extern int ext4_bio_write_page(struct ext4_io_submit *io, 2754extern int ext4_bio_write_page(struct ext4_io_submit *io,
2722 struct page *page, 2755 struct page *page,
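
The new ext4_wb_update_i_disksize() above differs from the i_mutex
variant in one way: writeback cannot hold i_mutex, so it re-reads
i_size under i_data_sem and clamps the update to it, letting a racing
truncate win. A toy sketch of that clamp, with a pthread mutex
standing in for i_data_sem:

#include <pthread.h>
#include <stdio.h>

struct toy_inode {
        pthread_mutex_t lock;           /* stands in for i_data_sem */
        long long i_size;
        long long i_disksize;
};

static void wb_update_disksize(struct toy_inode *inode, long long newsize)
{
        pthread_mutex_lock(&inode->lock);
        if (newsize > inode->i_size)
                newsize = inode->i_size;  /* truncate already shrank us */
        if (newsize > inode->i_disksize)
                inode->i_disksize = newsize;
        pthread_mutex_unlock(&inode->lock);
}

int main(void)
{
        struct toy_inode ino = { PTHREAD_MUTEX_INITIALIZER, 4096, 0 };

        wb_update_disksize(&ino, 8192); /* clamped to i_size */
        printf("i_disksize = %lld\n", ino.i_disksize);
        return 0;
}
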
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 51bc821ade90..5074fe23f19e 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -134,12 +134,6 @@ struct ext4_ext_path {
134 */ 134 */
135 135
136/* 136/*
137 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
138 * __le32.
139 */
140#define EXT_MAX_BLOCKS 0xffffffff
141
142/*
143 * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an 137 * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
144 * initialized extent. This is 2^15 and not (2^16 - 1), since we use the 138 * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
145 * MSB of ee_len field in the extent datastructure to signify if this 139 * MSB of ee_len field in the extent datastructure to signify if this
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 2877258d9497..81cfefa9dc0c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -197,7 +197,7 @@ static inline void ext4_journal_callback_add(handle_t *handle,
197 * ext4_journal_callback_del: delete a registered callback 197 * ext4_journal_callback_del: delete a registered callback
198 * @handle: active journal transaction handle on which callback was registered 198 * @handle: active journal transaction handle on which callback was registered
199 * @jce: registered journal callback entry to unregister 199 * @jce: registered journal callback entry to unregister
200 * Return true if object was sucessfully removed 200 * Return true if object was successfully removed
201 */ 201 */
202static inline bool ext4_journal_callback_try_del(handle_t *handle, 202static inline bool ext4_journal_callback_try_del(handle_t *handle,
203 struct ext4_journal_cb_entry *jce) 203 struct ext4_journal_cb_entry *jce)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 72ba4705d4fa..54d52afcdb19 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -407,7 +407,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
407 407
408static int __ext4_ext_check(const char *function, unsigned int line, 408static int __ext4_ext_check(const char *function, unsigned int line,
409 struct inode *inode, struct ext4_extent_header *eh, 409 struct inode *inode, struct ext4_extent_header *eh,
410 int depth) 410 int depth, ext4_fsblk_t pblk)
411{ 411{
412 const char *error_msg; 412 const char *error_msg;
413 int max = 0; 413 int max = 0;
@@ -447,42 +447,149 @@ static int __ext4_ext_check(const char *function, unsigned int line,
447 447
448corrupted: 448corrupted:
449 ext4_error_inode(inode, function, line, 0, 449 ext4_error_inode(inode, function, line, 0,
450 "bad header/extent: %s - magic %x, " 450 "pblk %llu bad header/extent: %s - magic %x, "
451 "entries %u, max %u(%u), depth %u(%u)", 451 "entries %u, max %u(%u), depth %u(%u)",
452 error_msg, le16_to_cpu(eh->eh_magic), 452 (unsigned long long) pblk, error_msg,
453 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 453 le16_to_cpu(eh->eh_magic),
454 max, le16_to_cpu(eh->eh_depth), depth); 454 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
455 455 max, le16_to_cpu(eh->eh_depth), depth);
456 return -EIO; 456 return -EIO;
457} 457}
458 458
459#define ext4_ext_check(inode, eh, depth) \ 459#define ext4_ext_check(inode, eh, depth, pblk) \
460 __ext4_ext_check(__func__, __LINE__, inode, eh, depth) 460 __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
461 461
462int ext4_ext_check_inode(struct inode *inode) 462int ext4_ext_check_inode(struct inode *inode)
463{ 463{
464 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); 464 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
465} 465}
466 466
467static int __ext4_ext_check_block(const char *function, unsigned int line, 467static struct buffer_head *
468 struct inode *inode, 468__read_extent_tree_block(const char *function, unsigned int line,
469 struct ext4_extent_header *eh, 469 struct inode *inode, ext4_fsblk_t pblk, int depth,
470 int depth, 470 int flags)
471 struct buffer_head *bh)
472{ 471{
473 int ret; 472 struct buffer_head *bh;
473 int err;
474 474
475 if (buffer_verified(bh)) 475 bh = sb_getblk(inode->i_sb, pblk);
476 return 0; 476 if (unlikely(!bh))
477 ret = ext4_ext_check(inode, eh, depth); 477 return ERR_PTR(-ENOMEM);
478 if (ret) 478
479 return ret; 479 if (!bh_uptodate_or_lock(bh)) {
480 trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
481 err = bh_submit_read(bh);
482 if (err < 0)
483 goto errout;
484 }
485 if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
486 return bh;
487 err = __ext4_ext_check(function, line, inode,
488 ext_block_hdr(bh), depth, pblk);
489 if (err)
490 goto errout;
480 set_buffer_verified(bh); 491 set_buffer_verified(bh);
481 return ret; 492 /*
493 * If this is a leaf block, cache all of its entries
494 */
495 if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
496 struct ext4_extent_header *eh = ext_block_hdr(bh);
497 struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
498 ext4_lblk_t prev = 0;
499 int i;
500
501 for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
502 unsigned int status = EXTENT_STATUS_WRITTEN;
503 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
504 int len = ext4_ext_get_actual_len(ex);
505
506 if (prev && (prev != lblk))
507 ext4_es_cache_extent(inode, prev,
508 lblk - prev, ~0,
509 EXTENT_STATUS_HOLE);
510
511 if (ext4_ext_is_uninitialized(ex))
512 status = EXTENT_STATUS_UNWRITTEN;
513 ext4_es_cache_extent(inode, lblk, len,
514 ext4_ext_pblock(ex), status);
515 prev = lblk + len;
516 }
517 }
518 return bh;
519errout:
520 put_bh(bh);
521 return ERR_PTR(err);
522
482} 523}
483 524
484#define ext4_ext_check_block(inode, eh, depth, bh) \ 525#define read_extent_tree_block(inode, pblk, depth, flags) \
485 __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh) 526 __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
527 (depth), (flags))
528
529/*
530 * This function is called to cache a file's extent information in the
531 * extent status tree
532 */
533int ext4_ext_precache(struct inode *inode)
534{
535 struct ext4_inode_info *ei = EXT4_I(inode);
536 struct ext4_ext_path *path = NULL;
537 struct buffer_head *bh;
538 int i = 0, depth, ret = 0;
539
540 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
541 return 0; /* not an extent-mapped inode */
542
543 down_read(&ei->i_data_sem);
544 depth = ext_depth(inode);
545
546 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
547 GFP_NOFS);
548 if (path == NULL) {
549 up_read(&ei->i_data_sem);
550 return -ENOMEM;
551 }
552
553 /* Don't cache anything if there are no external extent blocks */
554 if (depth == 0)
555 goto out;
556 path[0].p_hdr = ext_inode_hdr(inode);
557 ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
558 if (ret)
559 goto out;
560 path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
561 while (i >= 0) {
562 /*
563 * If this is a leaf block or we've reached the end of
564 * the index block, go up
565 */
566 if ((i == depth) ||
567 path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
568 brelse(path[i].p_bh);
569 path[i].p_bh = NULL;
570 i--;
571 continue;
572 }
573 bh = read_extent_tree_block(inode,
574 ext4_idx_pblock(path[i].p_idx++),
575 depth - i - 1,
576 EXT4_EX_FORCE_CACHE);
577 if (IS_ERR(bh)) {
578 ret = PTR_ERR(bh);
579 break;
580 }
581 i++;
582 path[i].p_bh = bh;
583 path[i].p_hdr = ext_block_hdr(bh);
584 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
585 }
586 ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
587out:
588 up_read(&ei->i_data_sem);
589 ext4_ext_drop_refs(path);
590 kfree(path);
591 return ret;
592}
486 593
487#ifdef EXT_DEBUG 594#ifdef EXT_DEBUG
488static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 595static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -716,7 +823,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
716 823
717struct ext4_ext_path * 824struct ext4_ext_path *
718ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, 825ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
719 struct ext4_ext_path *path) 826 struct ext4_ext_path *path, int flags)
720{ 827{
721 struct ext4_extent_header *eh; 828 struct ext4_extent_header *eh;
722 struct buffer_head *bh; 829 struct buffer_head *bh;
@@ -748,20 +855,13 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
748 path[ppos].p_depth = i; 855 path[ppos].p_depth = i;
749 path[ppos].p_ext = NULL; 856 path[ppos].p_ext = NULL;
750 857
751 bh = sb_getblk(inode->i_sb, path[ppos].p_block); 858 bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
752 if (unlikely(!bh)) { 859 flags);
753 ret = -ENOMEM; 860 if (IS_ERR(bh)) {
861 ret = PTR_ERR(bh);
754 goto err; 862 goto err;
755 } 863 }
756 if (!bh_uptodate_or_lock(bh)) { 864
757 trace_ext4_ext_load_extent(inode, block,
758 path[ppos].p_block);
759 ret = bh_submit_read(bh);
760 if (ret < 0) {
761 put_bh(bh);
762 goto err;
763 }
764 }
765 eh = ext_block_hdr(bh); 865 eh = ext_block_hdr(bh);
766 ppos++; 866 ppos++;
767 if (unlikely(ppos > depth)) { 867 if (unlikely(ppos > depth)) {
@@ -773,11 +873,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
773 } 873 }
774 path[ppos].p_bh = bh; 874 path[ppos].p_bh = bh;
775 path[ppos].p_hdr = eh; 875 path[ppos].p_hdr = eh;
776 i--;
777
778 ret = ext4_ext_check_block(inode, eh, i, bh);
779 if (ret < 0)
780 goto err;
781 } 876 }
782 877
783 path[ppos].p_depth = i; 878 path[ppos].p_depth = i;
@@ -1198,7 +1293,8 @@ out:
1198 * if no free index is found, then it requests in-depth growing. 1293 * if no free index is found, then it requests in-depth growing.
1199 */ 1294 */
1200static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1295static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1201 unsigned int flags, 1296 unsigned int mb_flags,
1297 unsigned int gb_flags,
1202 struct ext4_ext_path *path, 1298 struct ext4_ext_path *path,
1203 struct ext4_extent *newext) 1299 struct ext4_extent *newext)
1204{ 1300{
@@ -1220,7 +1316,7 @@ repeat:
1220 if (EXT_HAS_FREE_INDEX(curp)) { 1316 if (EXT_HAS_FREE_INDEX(curp)) {
1221 /* if we found index with free entry, then use that 1317 /* if we found index with free entry, then use that
1222 * entry: create all needed subtree and add new leaf */ 1318 * entry: create all needed subtree and add new leaf */
1223 err = ext4_ext_split(handle, inode, flags, path, newext, i); 1319 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1224 if (err) 1320 if (err)
1225 goto out; 1321 goto out;
1226 1322
@@ -1228,12 +1324,12 @@ repeat:
1228 ext4_ext_drop_refs(path); 1324 ext4_ext_drop_refs(path);
1229 path = ext4_ext_find_extent(inode, 1325 path = ext4_ext_find_extent(inode,
1230 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1326 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1231 path); 1327 path, gb_flags);
1232 if (IS_ERR(path)) 1328 if (IS_ERR(path))
1233 err = PTR_ERR(path); 1329 err = PTR_ERR(path);
1234 } else { 1330 } else {
1235 /* tree is full, time to grow in depth */ 1331 /* tree is full, time to grow in depth */
1236 err = ext4_ext_grow_indepth(handle, inode, flags, newext); 1332 err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
1237 if (err) 1333 if (err)
1238 goto out; 1334 goto out;
1239 1335
@@ -1241,7 +1337,7 @@ repeat:
1241 ext4_ext_drop_refs(path); 1337 ext4_ext_drop_refs(path);
1242 path = ext4_ext_find_extent(inode, 1338 path = ext4_ext_find_extent(inode,
1243 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1339 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1244 path); 1340 path, gb_flags);
1245 if (IS_ERR(path)) { 1341 if (IS_ERR(path)) {
1246 err = PTR_ERR(path); 1342 err = PTR_ERR(path);
1247 goto out; 1343 goto out;
@@ -1412,29 +1508,21 @@ got_index:
1412 ix++; 1508 ix++;
1413 block = ext4_idx_pblock(ix); 1509 block = ext4_idx_pblock(ix);
1414 while (++depth < path->p_depth) { 1510 while (++depth < path->p_depth) {
1415 bh = sb_bread(inode->i_sb, block);
1416 if (bh == NULL)
1417 return -EIO;
1418 eh = ext_block_hdr(bh);
1419 /* subtract from p_depth to get proper eh_depth */ 1511 /* subtract from p_depth to get proper eh_depth */
1420 if (ext4_ext_check_block(inode, eh, 1512 bh = read_extent_tree_block(inode, block,
1421 path->p_depth - depth, bh)) { 1513 path->p_depth - depth, 0);
1422 put_bh(bh); 1514 if (IS_ERR(bh))
1423 return -EIO; 1515 return PTR_ERR(bh);
1424 } 1516 eh = ext_block_hdr(bh);
1425 ix = EXT_FIRST_INDEX(eh); 1517 ix = EXT_FIRST_INDEX(eh);
1426 block = ext4_idx_pblock(ix); 1518 block = ext4_idx_pblock(ix);
1427 put_bh(bh); 1519 put_bh(bh);
1428 } 1520 }
1429 1521
1430 bh = sb_bread(inode->i_sb, block); 1522 bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1431 if (bh == NULL) 1523 if (IS_ERR(bh))
1432 return -EIO; 1524 return PTR_ERR(bh);
1433 eh = ext_block_hdr(bh); 1525 eh = ext_block_hdr(bh);
1434 if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
1435 put_bh(bh);
1436 return -EIO;
1437 }
1438 ex = EXT_FIRST_EXTENT(eh); 1526 ex = EXT_FIRST_EXTENT(eh);
1439found_extent: 1527found_extent:
1440 *logical = le32_to_cpu(ex->ee_block); 1528 *logical = le32_to_cpu(ex->ee_block);
@@ -1705,7 +1793,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
1705 1793
1706 brelse(path[1].p_bh); 1794 brelse(path[1].p_bh);
1707 ext4_free_blocks(handle, inode, NULL, blk, 1, 1795 ext4_free_blocks(handle, inode, NULL, blk, 1,
1708 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 1796 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET |
1797 EXT4_FREE_BLOCKS_RESERVE);
1709} 1798}
1710 1799
1711/* 1800/*
@@ -1793,7 +1882,7 @@ out:
1793 */ 1882 */
1794int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1883int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1795 struct ext4_ext_path *path, 1884 struct ext4_ext_path *path,
1796 struct ext4_extent *newext, int flag) 1885 struct ext4_extent *newext, int gb_flags)
1797{ 1886{
1798 struct ext4_extent_header *eh; 1887 struct ext4_extent_header *eh;
1799 struct ext4_extent *ex, *fex; 1888 struct ext4_extent *ex, *fex;
@@ -1802,7 +1891,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1802 int depth, len, err; 1891 int depth, len, err;
1803 ext4_lblk_t next; 1892 ext4_lblk_t next;
1804 unsigned uninitialized = 0; 1893 unsigned uninitialized = 0;
1805 int flags = 0; 1894 int mb_flags = 0;
1806 1895
1807 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1896 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1808 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1897 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1817,7 +1906,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1817 } 1906 }
1818 1907
1819 /* try to insert block into found extent and return */ 1908 /* try to insert block into found extent and return */
1820 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { 1909 if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1821 1910
1822 /* 1911 /*
1823 * Try to see whether we should rather test the extent on 1912 * Try to see whether we should rather test the extent on
@@ -1920,7 +2009,7 @@ prepend:
1920 if (next != EXT_MAX_BLOCKS) { 2009 if (next != EXT_MAX_BLOCKS) {
1921 ext_debug("next leaf block - %u\n", next); 2010 ext_debug("next leaf block - %u\n", next);
1922 BUG_ON(npath != NULL); 2011 BUG_ON(npath != NULL);
1923 npath = ext4_ext_find_extent(inode, next, NULL); 2012 npath = ext4_ext_find_extent(inode, next, NULL, 0);
1924 if (IS_ERR(npath)) 2013 if (IS_ERR(npath))
1925 return PTR_ERR(npath); 2014 return PTR_ERR(npath);
1926 BUG_ON(npath->p_depth != path->p_depth); 2015 BUG_ON(npath->p_depth != path->p_depth);
@@ -1939,9 +2028,10 @@ prepend:
1939 * There is no free space in the found leaf. 2028 * There is no free space in the found leaf.
1940 * We're gonna add a new leaf in the tree. 2029 * We're gonna add a new leaf in the tree.
1941 */ 2030 */
1942 if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) 2031 if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
1943 flags = EXT4_MB_USE_RESERVED; 2032 mb_flags = EXT4_MB_USE_RESERVED;
1944 err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); 2033 err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2034 path, newext);
1945 if (err) 2035 if (err)
1946 goto cleanup; 2036 goto cleanup;
1947 depth = ext_depth(inode); 2037 depth = ext_depth(inode);
@@ -2007,7 +2097,7 @@ has_space:
2007 2097
2008merge: 2098merge:
2009 /* try to merge extents */ 2099 /* try to merge extents */
2010 if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) 2100 if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2011 ext4_ext_try_to_merge(handle, inode, path, nearex); 2101 ext4_ext_try_to_merge(handle, inode, path, nearex);
2012 2102
2013 2103
@@ -2050,7 +2140,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
2050 path = NULL; 2140 path = NULL;
2051 } 2141 }
2052 2142
2053 path = ext4_ext_find_extent(inode, block, path); 2143 path = ext4_ext_find_extent(inode, block, path, 0);
2054 if (IS_ERR(path)) { 2144 if (IS_ERR(path)) {
2055 up_read(&EXT4_I(inode)->i_data_sem); 2145 up_read(&EXT4_I(inode)->i_data_sem);
2056 err = PTR_ERR(path); 2146 err = PTR_ERR(path);
@@ -2195,8 +2285,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2195 ext4_lblk_t block) 2285 ext4_lblk_t block)
2196{ 2286{
2197 int depth = ext_depth(inode); 2287 int depth = ext_depth(inode);
2198 unsigned long len; 2288 unsigned long len = 0;
2199 ext4_lblk_t lblock; 2289 ext4_lblk_t lblock = 0;
2200 struct ext4_extent *ex; 2290 struct ext4_extent *ex;
2201 2291
2202 ex = path[depth].p_ext; 2292 ex = path[depth].p_ext;
@@ -2233,7 +2323,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2233 ext4_es_insert_extent(inode, lblock, len, ~0, 2323 ext4_es_insert_extent(inode, lblock, len, ~0,
2234 EXTENT_STATUS_HOLE); 2324 EXTENT_STATUS_HOLE);
2235 } else { 2325 } else {
2236 lblock = len = 0;
2237 BUG(); 2326 BUG();
2238 } 2327 }
2239 2328
@@ -2712,7 +2801,7 @@ again:
2712 ext4_lblk_t ee_block; 2801 ext4_lblk_t ee_block;
2713 2802
2714 /* find extent for this block */ 2803 /* find extent for this block */
2715 path = ext4_ext_find_extent(inode, end, NULL); 2804 path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2716 if (IS_ERR(path)) { 2805 if (IS_ERR(path)) {
2717 ext4_journal_stop(handle); 2806 ext4_journal_stop(handle);
2718 return PTR_ERR(path); 2807 return PTR_ERR(path);
@@ -2754,6 +2843,7 @@ again:
2754 */ 2843 */
2755 err = ext4_split_extent_at(handle, inode, path, 2844 err = ext4_split_extent_at(handle, inode, path,
2756 end + 1, split_flag, 2845 end + 1, split_flag,
2846 EXT4_EX_NOCACHE |
2757 EXT4_GET_BLOCKS_PRE_IO | 2847 EXT4_GET_BLOCKS_PRE_IO |
2758 EXT4_GET_BLOCKS_METADATA_NOFAIL); 2848 EXT4_GET_BLOCKS_METADATA_NOFAIL);
2759 2849
@@ -2782,7 +2872,7 @@ again:
2782 path[0].p_hdr = ext_inode_hdr(inode); 2872 path[0].p_hdr = ext_inode_hdr(inode);
2783 i = 0; 2873 i = 0;
2784 2874
2785 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2875 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2786 err = -EIO; 2876 err = -EIO;
2787 goto out; 2877 goto out;
2788 } 2878 }
@@ -2829,10 +2919,12 @@ again:
2829 ext_debug("move to level %d (block %llu)\n", 2919 ext_debug("move to level %d (block %llu)\n",
2830 i + 1, ext4_idx_pblock(path[i].p_idx)); 2920 i + 1, ext4_idx_pblock(path[i].p_idx));
2831 memset(path + i + 1, 0, sizeof(*path)); 2921 memset(path + i + 1, 0, sizeof(*path));
2832 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); 2922 bh = read_extent_tree_block(inode,
2833 if (!bh) { 2923 ext4_idx_pblock(path[i].p_idx), depth - i - 1,
2924 EXT4_EX_NOCACHE);
2925 if (IS_ERR(bh)) {
2834 /* should we reset i_size? */ 2926 /* should we reset i_size? */
2835 err = -EIO; 2927 err = PTR_ERR(bh);
2836 break; 2928 break;
2837 } 2929 }
2838 /* Yield here to deal with large extent trees. 2930 /* Yield here to deal with large extent trees.
@@ -2842,11 +2934,6 @@ again:
2842 err = -EIO; 2934 err = -EIO;
2843 break; 2935 break;
2844 } 2936 }
2845 if (ext4_ext_check_block(inode, ext_block_hdr(bh),
2846 depth - i - 1, bh)) {
2847 err = -EIO;
2848 break;
2849 }
2850 path[i + 1].p_bh = bh; 2937 path[i + 1].p_bh = bh;
2851 2938
2852 /* save actual number of indexes since this 2939 /* save actual number of indexes since this
@@ -2961,6 +3048,23 @@ void ext4_ext_release(struct super_block *sb)
2961#endif 3048#endif
2962} 3049}
2963 3050
3051static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3052{
3053 ext4_lblk_t ee_block;
3054 ext4_fsblk_t ee_pblock;
3055 unsigned int ee_len;
3056
3057 ee_block = le32_to_cpu(ex->ee_block);
3058 ee_len = ext4_ext_get_actual_len(ex);
3059 ee_pblock = ext4_ext_pblock(ex);
3060
3061 if (ee_len == 0)
3062 return 0;
3063
3064 return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3065 EXTENT_STATUS_WRITTEN);
3066}
3067
2964/* FIXME!! we need to try to merge to left or right after zero-out */ 3068/* FIXME!! we need to try to merge to left or right after zero-out */
2965static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 3069static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2966{ 3070{
@@ -3113,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle,
3113 goto fix_extent_len; 3217 goto fix_extent_len;
3114 3218
3115 /* update extent status tree */ 3219 /* update extent status tree */
3116 err = ext4_es_zeroout(inode, &zero_ex); 3220 err = ext4_zeroout_es(inode, &zero_ex);
3117 3221
3118 goto out; 3222 goto out;
3119 } else if (err) 3223 } else if (err)
@@ -3133,7 +3237,7 @@ fix_extent_len:
3133 * ext4_split_extents() splits an extent and mark extent which is covered 3237 * ext4_split_extents() splits an extent and mark extent which is covered
3134 * by @map as split_flags indicates 3238 * by @map as split_flags indicates
3135 * 3239 *
3136 * It may result in splitting the extent into multiple extents (upto three) 3240 * It may result in splitting the extent into multiple extents (up to three)
3137 * There are three possibilities: 3241 * There are three possibilities:
3138 * a> There is no split required 3242 * a> There is no split required
3139 * b> Splits in two extents: Split is happening at either end of the extent 3243 * b> Splits in two extents: Split is happening at either end of the extent
@@ -3181,7 +3285,7 @@ static int ext4_split_extent(handle_t *handle,
3181 * result in split of original leaf or extent zeroout. 3285 * result in split of original leaf or extent zeroout.
3182 */ 3286 */
3183 ext4_ext_drop_refs(path); 3287 ext4_ext_drop_refs(path);
3184 path = ext4_ext_find_extent(inode, map->m_lblk, path); 3288 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3185 if (IS_ERR(path)) 3289 if (IS_ERR(path))
3186 return PTR_ERR(path); 3290 return PTR_ERR(path);
3187 depth = ext_depth(inode); 3291 depth = ext_depth(inode);
@@ -3464,7 +3568,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
3464out: 3568out:
3465 /* If we have gotten a failure, don't zero out status tree */ 3569 /* If we have gotten a failure, don't zero out status tree */
3466 if (!err) 3570 if (!err)
3467 err = ext4_es_zeroout(inode, &zero_ex); 3571 err = ext4_zeroout_es(inode, &zero_ex);
3468 return err ? err : allocated; 3572 return err ? err : allocated;
3469} 3573}
3470 3574
@@ -3565,7 +3669,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3565 if (err < 0) 3669 if (err < 0)
3566 goto out; 3670 goto out;
3567 ext4_ext_drop_refs(path); 3671 ext4_ext_drop_refs(path);
3568 path = ext4_ext_find_extent(inode, map->m_lblk, path); 3672 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3569 if (IS_ERR(path)) { 3673 if (IS_ERR(path)) {
3570 err = PTR_ERR(path); 3674 err = PTR_ERR(path);
3571 goto out; 3675 goto out;
@@ -4052,7 +4156,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4052 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 4156 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4053 4157
4054 /* find extent for this block */ 4158 /* find extent for this block */
4055 path = ext4_ext_find_extent(inode, map->m_lblk, NULL); 4159 path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
4056 if (IS_ERR(path)) { 4160 if (IS_ERR(path)) {
4057 err = PTR_ERR(path); 4161 err = PTR_ERR(path);
4058 path = NULL; 4162 path = NULL;
@@ -4744,6 +4848,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4744 return error; 4848 return error;
4745 } 4849 }
4746 4850
4851 if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
4852 error = ext4_ext_precache(inode);
4853 if (error)
4854 return error;
4855 }
4856
4747 /* fallback to generic here if not in extents fmt */ 4857 /* fallback to generic here if not in extents fmt */
4748 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4858 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4749 return generic_block_fiemap(inode, fieinfo, start, len, 4859 return generic_block_fiemap(inode, fieinfo, start, len,
@@ -4771,6 +4881,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4771 error = ext4_fill_fiemap_extents(inode, start_blk, 4881 error = ext4_fill_fiemap_extents(inode, start_blk,
4772 len_blks, fieinfo); 4882 len_blks, fieinfo);
4773 } 4883 }
4774 4884 ext4_es_lru_add(inode);
4775 return error; 4885 return error;
4776} 4886}
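[annotation] The new FIEMAP_FLAG_CACHE branch is the user-visible half of extent precaching: the flag routes through ext4_ext_precache() before the normal fiemap walk, pulling the inode's extent tree into the extent status cache, and the ext4_es_lru_add() call at the end keeps the inode on the shrinker's LRU. A small userspace sketch of triggering it; note that kernels predating this series reject the unknown flag with EBADR:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	static int precache_extents(const char *path)
	{
		struct fiemap fm;
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			return -1;
		memset(&fm, 0, sizeof(fm));
		fm.fm_length = FIEMAP_MAX_OFFSET;	/* whole file */
		fm.fm_flags = FIEMAP_FLAG_CACHE;	/* precache request */
		fm.fm_extent_count = 0;	/* side effect only, no extent data back */
		if (ioctl(fd, FS_IOC_FIEMAP, &fm) < 0)
			perror("FS_IOC_FIEMAP");
		close(fd);
		return 0;
	}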
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 91cb110da1b4..3981ff783950 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -13,7 +13,6 @@
13#include <linux/list_sort.h> 13#include <linux/list_sort.h>
14#include "ext4.h" 14#include "ext4.h"
15#include "extents_status.h" 15#include "extents_status.h"
16#include "ext4_extents.h"
17 16
18#include <trace/events/ext4.h> 17#include <trace/events/ext4.h>
19 18
@@ -263,7 +262,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
263 if (tree->cache_es) { 262 if (tree->cache_es) {
264 es1 = tree->cache_es; 263 es1 = tree->cache_es;
265 if (in_range(lblk, es1->es_lblk, es1->es_len)) { 264 if (in_range(lblk, es1->es_lblk, es1->es_len)) {
266 es_debug("%u cached by [%u/%u) %llu %llx\n", 265 es_debug("%u cached by [%u/%u) %llu %x\n",
267 lblk, es1->es_lblk, es1->es_len, 266 lblk, es1->es_lblk, es1->es_len,
268 ext4_es_pblock(es1), ext4_es_status(es1)); 267 ext4_es_pblock(es1), ext4_es_status(es1));
269 goto out; 268 goto out;
@@ -409,6 +408,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
409} 408}
410 409
411#ifdef ES_AGGRESSIVE_TEST 410#ifdef ES_AGGRESSIVE_TEST
411#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */
412
412static void ext4_es_insert_extent_ext_check(struct inode *inode, 413static void ext4_es_insert_extent_ext_check(struct inode *inode,
413 struct extent_status *es) 414 struct extent_status *es)
414{ 415{
@@ -419,7 +420,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
419 unsigned short ee_len; 420 unsigned short ee_len;
420 int depth, ee_status, es_status; 421 int depth, ee_status, es_status;
421 422
422 path = ext4_ext_find_extent(inode, es->es_lblk, NULL); 423 path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
423 if (IS_ERR(path)) 424 if (IS_ERR(path))
424 return; 425 return;
425 426
@@ -641,13 +642,13 @@ out:
641 */ 642 */
642int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, 643int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
643 ext4_lblk_t len, ext4_fsblk_t pblk, 644 ext4_lblk_t len, ext4_fsblk_t pblk,
644 unsigned long long status) 645 unsigned int status)
645{ 646{
646 struct extent_status newes; 647 struct extent_status newes;
647 ext4_lblk_t end = lblk + len - 1; 648 ext4_lblk_t end = lblk + len - 1;
648 int err = 0; 649 int err = 0;
649 650
650 es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n", 651 es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
651 lblk, len, pblk, status, inode->i_ino); 652 lblk, len, pblk, status, inode->i_ino);
652 653
653 if (!len) 654 if (!len)
@@ -684,6 +685,38 @@ error:
684} 685}
685 686
686/* 687/*
688 * ext4_es_cache_extent() inserts information into the extent status
689 * tree if and only if there isn't information about the range in
690 * question already.
691 */
692void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
693 ext4_lblk_t len, ext4_fsblk_t pblk,
694 unsigned int status)
695{
696 struct extent_status *es;
697 struct extent_status newes;
698 ext4_lblk_t end = lblk + len - 1;
699
700 newes.es_lblk = lblk;
701 newes.es_len = len;
702 ext4_es_store_pblock(&newes, pblk);
703 ext4_es_store_status(&newes, status);
704 trace_ext4_es_cache_extent(inode, &newes);
705
706 if (!len)
707 return;
708
709 BUG_ON(end < lblk);
710
711 write_lock(&EXT4_I(inode)->i_es_lock);
712
713 es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
714 if (!es || es->es_lblk > end)
715 __es_insert_extent(inode, &newes);
716 write_unlock(&EXT4_I(inode)->i_es_lock);
717}
718
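[annotation] Note the contrast with ext4_es_insert_extent() above it: insert replaces whatever overlaps the range, while ext4_es_cache_extent() backs off whenever any existing entry touches [lblk, end]. Reduced to its decision (a paraphrase, not the kernel source):

	/* insert-if-absent: an existing entry always wins over a cached one */
	es = __es_tree_search(&tree->root, lblk); /* first entry ending >= lblk */
	if (!es || es->es_lblk > end)	/* nothing overlaps [lblk, end] */
		__es_insert_extent(inode, &newes);
	/* else: keep whatever is already there */

That asymmetry is what lets a precache pass bulk-load the tree without clobbering newer state.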
719/*
687 * ext4_es_lookup_extent() looks up an extent in extent status tree. 720 * ext4_es_lookup_extent() looks up an extent in extent status tree.
688 * 721 *
689 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. 722 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
@@ -871,23 +904,6 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
871 return err; 904 return err;
872} 905}
873 906
874int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
875{
876 ext4_lblk_t ee_block;
877 ext4_fsblk_t ee_pblock;
878 unsigned int ee_len;
879
880 ee_block = le32_to_cpu(ex->ee_block);
881 ee_len = ext4_ext_get_actual_len(ex);
882 ee_pblock = ext4_ext_pblock(ex);
883
884 if (ee_len == 0)
885 return 0;
886
887 return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
888 EXTENT_STATUS_WRITTEN);
889}
890
891static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, 907static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
892 struct list_head *b) 908 struct list_head *b)
893{ 909{
@@ -895,6 +911,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
895 eia = list_entry(a, struct ext4_inode_info, i_es_lru); 911 eia = list_entry(a, struct ext4_inode_info, i_es_lru);
896 eib = list_entry(b, struct ext4_inode_info, i_es_lru); 912 eib = list_entry(b, struct ext4_inode_info, i_es_lru);
897 913
914 if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
915 !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
916 return 1;
917 if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
918 ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
919 return -1;
898 if (eia->i_touch_when == eib->i_touch_when) 920 if (eia->i_touch_when == eib->i_touch_when)
899 return 0; 921 return 0;
900 if (time_after(eia->i_touch_when, eib->i_touch_when)) 922 if (time_after(eia->i_touch_when, eib->i_touch_when))
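[annotation] The two new tests turn the sort key into a pair (precached, touch time): any precached inode compares greater than any non-precached one, so list_sort() moves every precached inode behind the plain ones before timestamps are even consulted. As a generic comparator shape, with is_precached() and touch_cmp() as stand-ins:

	static int cmp(void *priv, struct list_head *a, struct list_head *b)
	{
		bool pa = is_precached(a), pb = is_precached(b);

		if (pa != pb)
			return pa ? 1 : -1;	/* precached sorts to the tail */
		return touch_cmp(a, b);		/* then least-recently-touched first */
	}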
@@ -908,22 +930,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
908{ 930{
909 struct ext4_inode_info *ei; 931 struct ext4_inode_info *ei;
910 struct list_head *cur, *tmp; 932 struct list_head *cur, *tmp;
911 LIST_HEAD(skiped); 933 LIST_HEAD(skipped);
912 int ret, nr_shrunk = 0; 934 int nr_shrunk = 0;
935 int retried = 0, skip_precached = 1, nr_skipped = 0;
913 936
914 spin_lock(&sbi->s_es_lru_lock); 937 spin_lock(&sbi->s_es_lru_lock);
915 938
916 /* 939retry:
917 * If the inode that is at the head of LRU list is newer than
918 * last_sorted time, that means that we need to sort this list.
919 */
920 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
921 if (sbi->s_es_last_sorted < ei->i_touch_when) {
922 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
923 sbi->s_es_last_sorted = jiffies;
924 }
925
926 list_for_each_safe(cur, tmp, &sbi->s_es_lru) { 940 list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
941 int shrunk;
942
927 /* 943 /*
928 * If we have already reclaimed all extents from extent 944 * If we have already reclaimed all extents from extent
929 * status tree, just stop the loop immediately. 945 * status tree, just stop the loop immediately.
@@ -933,9 +949,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
933 949
934 ei = list_entry(cur, struct ext4_inode_info, i_es_lru); 950 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
935 951
936 /* Skip the inode that is newer than the last_sorted time */ 952 /*
937 if (sbi->s_es_last_sorted < ei->i_touch_when) { 953 * Skip the inode that is newer than the last_sorted
938 list_move_tail(cur, &skiped); 954 * time. Normally we try hard to avoid shrinking
955 * precached inodes, but we will as a last resort.
956 */
957 if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
958 (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
959 EXT4_STATE_EXT_PRECACHED))) {
960 nr_skipped++;
961 list_move_tail(cur, &skipped);
939 continue; 962 continue;
940 } 963 }
941 964
@@ -943,28 +966,63 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
943 continue; 966 continue;
944 967
945 write_lock(&ei->i_es_lock); 968 write_lock(&ei->i_es_lock);
946 ret = __es_try_to_reclaim_extents(ei, nr_to_scan); 969 shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
947 if (ei->i_es_lru_nr == 0) 970 if (ei->i_es_lru_nr == 0)
948 list_del_init(&ei->i_es_lru); 971 list_del_init(&ei->i_es_lru);
949 write_unlock(&ei->i_es_lock); 972 write_unlock(&ei->i_es_lock);
950 973
951 nr_shrunk += ret; 974 nr_shrunk += shrunk;
952 nr_to_scan -= ret; 975 nr_to_scan -= shrunk;
953 if (nr_to_scan == 0) 976 if (nr_to_scan == 0)
954 break; 977 break;
955 } 978 }
956 979
957 /* Move the newer inodes into the tail of the LRU list. */ 980 /* Move the newer inodes into the tail of the LRU list. */
958 list_splice_tail(&skiped, &sbi->s_es_lru); 981 list_splice_tail(&skipped, &sbi->s_es_lru);
982 INIT_LIST_HEAD(&skipped);
983
984 /*
985 * If we skipped any inodes, and we weren't able to make any
986 * forward progress, sort the list and try again.
987 */
988 if ((nr_shrunk == 0) && nr_skipped && !retried) {
989 retried++;
990 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
991 sbi->s_es_last_sorted = jiffies;
992 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
993 i_es_lru);
994 /*
995 * If there are no non-precached inodes left on the
996 * list, start releasing precached extents.
997 */
998 if (ext4_test_inode_state(&ei->vfs_inode,
999 EXT4_STATE_EXT_PRECACHED))
1000 skip_precached = 0;
1001 goto retry;
1002 }
1003
959 spin_unlock(&sbi->s_es_lru_lock); 1004 spin_unlock(&sbi->s_es_lru_lock);
960 1005
961 if (locked_ei && nr_shrunk == 0) 1006 if (locked_ei && nr_shrunk == 0)
962 nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); 1007 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
963 1008
964 return nr_shrunk; 1009 return nr_shrunk;
965} 1010}
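[annotation] Two behavioral points in this rewrite. First, the unconditional up-front sort is gone: the LRU is sorted only when a full pass skipped inodes yet reclaimed nothing, and if even the head of the freshly sorted list is precached, skip_precached is dropped so the retry may eat precached extents as a last resort. Second, the final fallback now scans locked_ei instead of ei — the old code reads like a plain bug, since after the loop ei points at whichever inode was examined last, not at the caller's already-locked inode. The policy, condensed (helper names are stand-ins, not kernel functions):

	skip_precached = 1;
	retry:
		nr_shrunk += scan_lru(skip_precached, &nr_skipped);
		if (nr_shrunk == 0 && nr_skipped && !retried++) {
			sort_lru_by_touch_time();
			if (lru_head_is_precached())
				skip_precached = 0;	/* last resort */
			goto retry;
		}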
966 1011
967static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) 1012static unsigned long ext4_es_count(struct shrinker *shrink,
1013 struct shrink_control *sc)
1014{
1015 unsigned long nr;
1016 struct ext4_sb_info *sbi;
1017
1018 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1019 nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
1020 trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
1021 return nr;
1022}
1023
1024static unsigned long ext4_es_scan(struct shrinker *shrink,
1025 struct shrink_control *sc)
968{ 1026{
969 struct ext4_sb_info *sbi = container_of(shrink, 1027 struct ext4_sb_info *sbi = container_of(shrink,
970 struct ext4_sb_info, s_es_shrinker); 1028 struct ext4_sb_info, s_es_shrinker);
@@ -979,9 +1037,8 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
979 1037
980 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); 1038 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
981 1039
982 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
983 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); 1040 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
984 return ret; 1041 return nr_shrunk;
985} 1042}
986 1043
987void ext4_es_register_shrinker(struct ext4_sb_info *sbi) 1044void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
@@ -989,7 +1046,8 @@ void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
989 INIT_LIST_HEAD(&sbi->s_es_lru); 1046 INIT_LIST_HEAD(&sbi->s_es_lru);
990 spin_lock_init(&sbi->s_es_lru_lock); 1047 spin_lock_init(&sbi->s_es_lru_lock);
991 sbi->s_es_last_sorted = 0; 1048 sbi->s_es_last_sorted = 0;
992 sbi->s_es_shrinker.shrink = ext4_es_shrink; 1049 sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1050 sbi->s_es_shrinker.count_objects = ext4_es_count;
993 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; 1051 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
994 register_shrinker(&sbi->s_es_shrinker); 1052 register_shrinker(&sbi->s_es_shrinker);
995} 1053}
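[annotation] This is the mechanical part of the 3.12 shrinker conversion: the single .shrink callback is replaced by a cheap count_objects() plus a scan_objects() that returns how much it actually freed, or SHRINK_STOP when it cannot make progress. The registration pattern in isolation — my_count_cached() and my_reclaim() are stand-ins:

	#include <linux/shrinker.h>

	static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
	{
		return my_count_cached();	/* cheap, may be approximate */
	}

	static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
	{
		unsigned long freed = my_reclaim(sc->nr_to_scan);

		return freed ? freed : SHRINK_STOP;
	}

	static struct shrinker my_shrinker = {
		.count_objects	= my_count,
		.scan_objects	= my_scan,
		.seeks		= DEFAULT_SEEKS,
	};
	/* register_shrinker(&my_shrinker) in setup, unregister on teardown */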
@@ -1033,11 +1091,17 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
1033 struct ext4_es_tree *tree = &ei->i_es_tree; 1091 struct ext4_es_tree *tree = &ei->i_es_tree;
1034 struct rb_node *node; 1092 struct rb_node *node;
1035 struct extent_status *es; 1093 struct extent_status *es;
1036 int nr_shrunk = 0; 1094 unsigned long nr_shrunk = 0;
1095 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1096 DEFAULT_RATELIMIT_BURST);
1037 1097
1038 if (ei->i_es_lru_nr == 0) 1098 if (ei->i_es_lru_nr == 0)
1039 return 0; 1099 return 0;
1040 1100
1101 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1102 __ratelimit(&_rs))
1103 ext4_warning(inode->i_sb, "forced shrink of precached extents");
1104
1041 node = rb_first(&tree->root); 1105 node = rb_first(&tree->root);
1042 while (node != NULL) { 1106 while (node != NULL) {
1043 es = rb_entry(node, struct extent_status, rb_node); 1107 es = rb_entry(node, struct extent_status, rb_node);
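[annotation] The DEFINE_RATELIMIT_STATE/__ratelimit pair above is the stock kernel idiom for a warning that can fire from a reclaim loop: at most DEFAULT_RATELIMIT_BURST (10) messages per DEFAULT_RATELIMIT_INTERVAL (5*HZ), then silence until the window rolls over. In isolation:

	#include <linux/ratelimit.h>

	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	/* true while budget remains in the current interval, false once throttled */
	if (__ratelimit(&rs))
		pr_warn("forced shrink of precached extents\n");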
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index e936730cc5b0..167f4ab8ecc3 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -29,16 +29,26 @@
29/* 29/*
30 * These flags live in the high bits of extent_status.es_pblk 30 * These flags live in the high bits of extent_status.es_pblk
31 */ 31 */
32#define EXTENT_STATUS_WRITTEN (1ULL << 63) 32#define ES_SHIFT 60
33#define EXTENT_STATUS_UNWRITTEN (1ULL << 62) 33
34#define EXTENT_STATUS_DELAYED (1ULL << 61) 34#define EXTENT_STATUS_WRITTEN (1 << 3)
35#define EXTENT_STATUS_HOLE (1ULL << 60) 35#define EXTENT_STATUS_UNWRITTEN (1 << 2)
36#define EXTENT_STATUS_DELAYED (1 << 1)
37#define EXTENT_STATUS_HOLE (1 << 0)
36 38
37#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ 39#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
38 EXTENT_STATUS_UNWRITTEN | \ 40 EXTENT_STATUS_UNWRITTEN | \
39 EXTENT_STATUS_DELAYED | \ 41 EXTENT_STATUS_DELAYED | \
40 EXTENT_STATUS_HOLE) 42 EXTENT_STATUS_HOLE)
41 43
44#define ES_WRITTEN (1ULL << 63)
45#define ES_UNWRITTEN (1ULL << 62)
46#define ES_DELAYED (1ULL << 61)
47#define ES_HOLE (1ULL << 60)
48
49#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
50 ES_DELAYED | ES_HOLE)
51
42struct ext4_sb_info; 52struct ext4_sb_info;
43struct ext4_extent; 53struct ext4_extent;
44 54
@@ -60,7 +70,10 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
60 70
61extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, 71extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
62 ext4_lblk_t len, ext4_fsblk_t pblk, 72 ext4_lblk_t len, ext4_fsblk_t pblk,
63 unsigned long long status); 73 unsigned int status);
74extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
75 ext4_lblk_t len, ext4_fsblk_t pblk,
76 unsigned int status);
64extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 77extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
65 ext4_lblk_t len); 78 ext4_lblk_t len);
66extern void ext4_es_find_delayed_extent_range(struct inode *inode, 79extern void ext4_es_find_delayed_extent_range(struct inode *inode,
@@ -68,36 +81,35 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
68 struct extent_status *es); 81 struct extent_status *es);
69extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 82extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
70 struct extent_status *es); 83 struct extent_status *es);
71extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);
72 84
73static inline int ext4_es_is_written(struct extent_status *es) 85static inline int ext4_es_is_written(struct extent_status *es)
74{ 86{
75 return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0; 87 return (es->es_pblk & ES_WRITTEN) != 0;
76} 88}
77 89
78static inline int ext4_es_is_unwritten(struct extent_status *es) 90static inline int ext4_es_is_unwritten(struct extent_status *es)
79{ 91{
80 return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0; 92 return (es->es_pblk & ES_UNWRITTEN) != 0;
81} 93}
82 94
83static inline int ext4_es_is_delayed(struct extent_status *es) 95static inline int ext4_es_is_delayed(struct extent_status *es)
84{ 96{
85 return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0; 97 return (es->es_pblk & ES_DELAYED) != 0;
86} 98}
87 99
88static inline int ext4_es_is_hole(struct extent_status *es) 100static inline int ext4_es_is_hole(struct extent_status *es)
89{ 101{
90 return (es->es_pblk & EXTENT_STATUS_HOLE) != 0; 102 return (es->es_pblk & ES_HOLE) != 0;
91} 103}
92 104
93static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) 105static inline unsigned int ext4_es_status(struct extent_status *es)
94{ 106{
95 return (es->es_pblk & EXTENT_STATUS_FLAGS); 107 return es->es_pblk >> ES_SHIFT;
96} 108}
97 109
98static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) 110static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
99{ 111{
100 return (es->es_pblk & ~EXTENT_STATUS_FLAGS); 112 return es->es_pblk & ~ES_MASK;
101} 113}
102 114
103static inline void ext4_es_store_pblock(struct extent_status *es, 115static inline void ext4_es_store_pblock(struct extent_status *es,
@@ -105,19 +117,16 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
105{ 117{
106 ext4_fsblk_t block; 118 ext4_fsblk_t block;
107 119
108 block = (pb & ~EXTENT_STATUS_FLAGS) | 120 block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
109 (es->es_pblk & EXTENT_STATUS_FLAGS);
110 es->es_pblk = block; 121 es->es_pblk = block;
111} 122}
112 123
113static inline void ext4_es_store_status(struct extent_status *es, 124static inline void ext4_es_store_status(struct extent_status *es,
114 unsigned long long status) 125 unsigned int status)
115{ 126{
116 ext4_fsblk_t block; 127 es->es_pblk = (((ext4_fsblk_t)
117 128 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
118 block = (status & EXTENT_STATUS_FLAGS) | 129 (es->es_pblk & ~ES_MASK));
119 (es->es_pblk & ~EXTENT_STATUS_FLAGS);
120 es->es_pblk = block;
121} 130}
122 131
123extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); 132extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
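[annotation] The header rework splits the user-visible flag values (EXTENT_STATUS_*, now a dense group in bits 0-3) from their storage location (ES_*, the top four bits of the 64-bit es_pblk), with ES_SHIFT = 60 translating between the two on store and lookup. The round trip as a self-contained userspace check, with uint64_t standing in for ext4_fsblk_t:

	#include <assert.h>
	#include <stdint.h>

	#define ES_SHIFT		60
	#define EXTENT_STATUS_WRITTEN	(1 << 3)
	#define ES_MASK			(0xfULL << ES_SHIFT)

	int main(void)
	{
		uint64_t pblk = 0x12345678ULL;
		unsigned int status = EXTENT_STATUS_WRITTEN;
		uint64_t es_pblk;

		/* ext4_es_store_status() + ext4_es_store_pblock(), combined */
		es_pblk = ((uint64_t)(status & 0xf) << ES_SHIFT) | (pblk & ~ES_MASK);

		assert((unsigned int)(es_pblk >> ES_SHIFT) == status); /* ext4_es_status() */
		assert((es_pblk & ~ES_MASK) == pblk);		       /* ext4_es_pblock() */
		return 0;
	}

This is also why the status type handed around the API shrinks from unsigned long long to unsigned int, which the extents_status.c and inode.c hunks adjust to match.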
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 319c9d26279a..3da21945ff1f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -149,7 +149,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
149 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 149 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
150 mutex_unlock(&inode->i_mutex); 150 mutex_unlock(&inode->i_mutex);
151 151
152 if (ret > 0 || ret == -EIOCBQUEUED) { 152 if (ret > 0) {
153 ssize_t err; 153 ssize_t err;
154 154
155 err = generic_write_sync(file, pos, ret); 155 err = generic_write_sync(file, pos, ret);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8bf5999875ee..137193ff389b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -70,18 +70,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
70 ext4_group_t block_group, 70 ext4_group_t block_group,
71 struct ext4_group_desc *gdp) 71 struct ext4_group_desc *gdp)
72{ 72{
73 struct ext4_group_info *grp;
73 J_ASSERT_BH(bh, buffer_locked(bh)); 74 J_ASSERT_BH(bh, buffer_locked(bh));
74 75
75 /* If checksum is bad mark all blocks and inodes use to prevent 76 /* If checksum is bad mark all blocks and inodes use to prevent
76 * allocation, essentially implementing a per-group read-only flag. */ 77 * allocation, essentially implementing a per-group read-only flag. */
77 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { 78 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
78 ext4_error(sb, "Checksum bad for group %u", block_group); 79 ext4_error(sb, "Checksum bad for group %u", block_group);
79 ext4_free_group_clusters_set(sb, gdp, 0); 80 grp = ext4_get_group_info(sb, block_group);
80 ext4_free_inodes_set(sb, gdp, 0); 81 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
81 ext4_itable_unused_set(sb, gdp, 0); 82 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
82 memset(bh->b_data, 0xff, sb->s_blocksize);
83 ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
84 EXT4_INODES_PER_GROUP(sb) / 8);
85 return 0; 83 return 0;
86 } 84 }
87 85
@@ -117,6 +115,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
117 struct ext4_group_desc *desc; 115 struct ext4_group_desc *desc;
118 struct buffer_head *bh = NULL; 116 struct buffer_head *bh = NULL;
119 ext4_fsblk_t bitmap_blk; 117 ext4_fsblk_t bitmap_blk;
118 struct ext4_group_info *grp;
120 119
121 desc = ext4_get_group_desc(sb, block_group, NULL); 120 desc = ext4_get_group_desc(sb, block_group, NULL);
122 if (!desc) 121 if (!desc)
@@ -185,6 +184,8 @@ verify:
185 put_bh(bh); 184 put_bh(bh);
186 ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " 185 ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
187 "inode_bitmap = %llu", block_group, bitmap_blk); 186 "inode_bitmap = %llu", block_group, bitmap_blk);
187 grp = ext4_get_group_info(sb, block_group);
188 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
188 return NULL; 189 return NULL;
189 } 190 }
190 ext4_unlock_group(sb, block_group); 191 ext4_unlock_group(sb, block_group);
@@ -221,6 +222,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
221 struct ext4_super_block *es; 222 struct ext4_super_block *es;
222 struct ext4_sb_info *sbi; 223 struct ext4_sb_info *sbi;
223 int fatal = 0, err, count, cleared; 224 int fatal = 0, err, count, cleared;
225 struct ext4_group_info *grp;
224 226
225 if (!sb) { 227 if (!sb) {
226 printk(KERN_ERR "EXT4-fs: %s:%d: inode on " 228 printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
@@ -266,7 +268,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
266 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 268 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
267 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); 269 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
268 bitmap_bh = ext4_read_inode_bitmap(sb, block_group); 270 bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
269 if (!bitmap_bh) 271 /* Don't bother if the inode bitmap is corrupt. */
272 grp = ext4_get_group_info(sb, block_group);
273 if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
270 goto error_return; 274 goto error_return;
271 275
272 BUFFER_TRACE(bitmap_bh, "get_write_access"); 276 BUFFER_TRACE(bitmap_bh, "get_write_access");
@@ -315,8 +319,10 @@ out:
315 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 319 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
316 if (!fatal) 320 if (!fatal)
317 fatal = err; 321 fatal = err;
318 } else 322 } else {
319 ext4_error(sb, "bit already cleared for inode %lu", ino); 323 ext4_error(sb, "bit already cleared for inode %lu", ino);
324 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
325 }
320 326
321error_return: 327error_return:
322 brelse(bitmap_bh); 328 brelse(bitmap_bh);
@@ -625,6 +631,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
625} 631}
626 632
627/* 633/*
634 * In no journal mode, if an inode has recently been deleted, we want
635 * to avoid reusing it until we're reasonably sure the inode table
636 * block has been written back to disk. (Yes, these values are
637 * somewhat arbitrary...)
638 */
639#define RECENTCY_MIN 5
640#define RECENTCY_DIRTY 30
641
642static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
643{
644 struct ext4_group_desc *gdp;
645 struct ext4_inode *raw_inode;
646 struct buffer_head *bh;
647 unsigned long dtime, now;
648 int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
649 int offset, ret = 0, recentcy = RECENTCY_MIN;
650
651 gdp = ext4_get_group_desc(sb, group, NULL);
652 if (unlikely(!gdp))
653 return 0;
654
655 bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
656 (ino / inodes_per_block));
657 if (unlikely(!bh) || !buffer_uptodate(bh))
658 /*
659 * If the block is not in the buffer cache, then it
660 * must have been written out.
661 */
662 goto out;
663
664 offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
665 raw_inode = (struct ext4_inode *) (bh->b_data + offset);
666 dtime = le32_to_cpu(raw_inode->i_dtime);
667 now = get_seconds();
668 if (buffer_dirty(bh))
669 recentcy += RECENTCY_DIRTY;
670
671 if (dtime && (dtime < now) && (now < dtime + recentcy))
672 ret = 1;
673out:
674 brelse(bh);
675 return ret;
676}
677
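[annotation] Concretely: with RECENTCY_MIN = 5 and RECENTCY_DIRTY = 30, an inode whose dtime is 100 becomes reusable once get_seconds() passes 105 if its inode table block is clean or no longer in the buffer cache, but not until 135 while that block is still dirty. The veto reduces to this (sketch of the test above, not new logic):

	/* reuse is blocked only inside the window (dtime, dtime + recentcy) */
	int recentcy = RECENTCY_MIN;		/* 5 s once the itable block is clean */
	if (buffer_dirty(bh))
		recentcy += RECENTCY_DIRTY;	/* 35 s while it is still dirty */
	busy = dtime && dtime < now && now < dtime + recentcy;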
678/*
628 * There are two policies for allocating an inode. If the new inode is 679 * There are two policies for allocating an inode. If the new inode is
629 * a directory, then a forward search is made for a block group with both 680 * a directory, then a forward search is made for a block group with both
630 * free space and a low directory-to-inode ratio; if that fails, then of 681 * free space and a low directory-to-inode ratio; if that fails, then of
@@ -652,6 +703,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
652 struct inode *ret; 703 struct inode *ret;
653 ext4_group_t i; 704 ext4_group_t i;
654 ext4_group_t flex_group; 705 ext4_group_t flex_group;
706 struct ext4_group_info *grp;
655 707
656 /* Cannot create files in a deleted directory */ 708 /* Cannot create files in a deleted directory */
657 if (!dir || !dir->i_nlink) 709 if (!dir || !dir->i_nlink)
@@ -725,10 +777,22 @@ got_group:
725 continue; 777 continue;
726 } 778 }
727 779
780 grp = ext4_get_group_info(sb, group);
781 /* Skip groups with already-known suspicious inode tables */
782 if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
783 if (++group == ngroups)
784 group = 0;
785 continue;
786 }
787
728 brelse(inode_bitmap_bh); 788 brelse(inode_bitmap_bh);
729 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); 789 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
730 if (!inode_bitmap_bh) 790 /* Skip groups with suspicious inode tables */
731 goto out; 791 if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
792 if (++group == ngroups)
793 group = 0;
794 continue;
795 }
732 796
733repeat_in_this_group: 797repeat_in_this_group:
734 ino = ext4_find_next_zero_bit((unsigned long *) 798 ino = ext4_find_next_zero_bit((unsigned long *)
@@ -741,6 +805,11 @@ repeat_in_this_group:
741 "inode=%lu", ino + 1); 805 "inode=%lu", ino + 1);
742 continue; 806 continue;
743 } 807 }
808 if ((EXT4_SB(sb)->s_journal == NULL) &&
809 recently_deleted(sb, group, ino)) {
810 ino++;
811 goto next_inode;
812 }
744 if (!handle) { 813 if (!handle) {
745 BUG_ON(nblocks <= 0); 814 BUG_ON(nblocks <= 0);
746 handle = __ext4_journal_start_sb(dir->i_sb, line_no, 815 handle = __ext4_journal_start_sb(dir->i_sb, line_no,
@@ -764,6 +833,7 @@ repeat_in_this_group:
764 ino++; /* the inode bitmap is zero-based */ 833 ino++; /* the inode bitmap is zero-based */
765 if (!ret2) 834 if (!ret2)
766 goto got; /* we grabbed the inode! */ 835 goto got; /* we grabbed the inode! */
836next_inode:
767 if (ino < EXT4_INODES_PER_GROUP(sb)) 837 if (ino < EXT4_INODES_PER_GROUP(sb))
768 goto repeat_in_this_group; 838 goto repeat_in_this_group;
769next_group: 839next_group:
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 87b30cd357e7..594009f5f523 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -23,7 +23,6 @@
23#include <linux/aio.h> 23#include <linux/aio.h>
24#include "ext4_jbd2.h" 24#include "ext4_jbd2.h"
25#include "truncate.h" 25#include "truncate.h"
26#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
27 26
28#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
29 28
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2ca04e67a4f..0d424d7ac02b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -553,7 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
553 } 553 }
554 if (retval > 0) { 554 if (retval > 0) {
555 int ret; 555 int ret;
556 unsigned long long status; 556 unsigned int status;
557 557
558 if (unlikely(retval != map->m_len)) { 558 if (unlikely(retval != map->m_len)) {
559 ext4_warning(inode->i_sb, 559 ext4_warning(inode->i_sb,
@@ -653,7 +653,7 @@ found:
653 653
654 if (retval > 0) { 654 if (retval > 0) {
655 int ret; 655 int ret;
656 unsigned long long status; 656 unsigned int status;
657 657
658 if (unlikely(retval != map->m_len)) { 658 if (unlikely(retval != map->m_len)) {
659 ext4_warning(inode->i_sb, 659 ext4_warning(inode->i_sb,
@@ -727,8 +727,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
727 727
728 ret = ext4_map_blocks(handle, inode, &map, flags); 728 ret = ext4_map_blocks(handle, inode, &map, flags);
729 if (ret > 0) { 729 if (ret > 0) {
730 ext4_io_end_t *io_end = ext4_inode_aio(inode);
731
730 map_bh(bh, inode->i_sb, map.m_pblk); 732 map_bh(bh, inode->i_sb, map.m_pblk);
731 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 733 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
734 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
735 set_buffer_defer_completion(bh);
732 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 736 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
733 ret = 0; 737 ret = 0;
734 } 738 }
@@ -969,7 +973,8 @@ retry_journal:
969 ext4_journal_stop(handle); 973 ext4_journal_stop(handle);
970 goto retry_grab; 974 goto retry_grab;
971 } 975 }
972 wait_on_page_writeback(page); 976 /* In case writeback began while the page was unlocked */
977 wait_for_stable_page(page);
973 978
974 if (ext4_should_dioread_nolock(inode)) 979 if (ext4_should_dioread_nolock(inode))
975 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 980 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -1633,7 +1638,7 @@ add_delayed:
1633 set_buffer_delay(bh); 1638 set_buffer_delay(bh);
1634 } else if (retval > 0) { 1639 } else if (retval > 0) {
1635 int ret; 1640 int ret;
1636 unsigned long long status; 1641 unsigned int status;
1637 1642
1638 if (unlikely(retval != map->m_len)) { 1643 if (unlikely(retval != map->m_len)) {
1639 ext4_warning(inode->i_sb, 1644 ext4_warning(inode->i_sb,
@@ -1890,12 +1895,32 @@ static int ext4_writepage(struct page *page,
1890 return ret; 1895 return ret;
1891} 1896}
1892 1897
1898static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
1899{
1900 int len;
1901 loff_t size = i_size_read(mpd->inode);
1902 int err;
1903
1904 BUG_ON(page->index != mpd->first_page);
1905 if (page->index == size >> PAGE_CACHE_SHIFT)
1906 len = size & ~PAGE_CACHE_MASK;
1907 else
1908 len = PAGE_CACHE_SIZE;
1909 clear_page_dirty_for_io(page);
1910 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
1911 if (!err)
1912 mpd->wbc->nr_to_write--;
1913 mpd->first_page++;
1914
1915 return err;
1916}
1917
1893#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) 1918#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
1894 1919
1895/* 1920/*
1896 * mballoc gives us at most this number of blocks... 1921 * mballoc gives us at most this number of blocks...
1897 * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). 1922 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
1898 * The rest of mballoc seems to handle chunks upto full group size. 1923 * The rest of mballoc seems to handle chunks up to full group size.
1899 */ 1924 */
1900#define MAX_WRITEPAGES_EXTENT_LEN 2048 1925#define MAX_WRITEPAGES_EXTENT_LEN 2048
1901 1926
@@ -1904,82 +1929,94 @@ static int ext4_writepage(struct page *page,
1904 * 1929 *
1905 * @mpd - extent of blocks 1930 * @mpd - extent of blocks
1906 * @lblk - logical number of the block in the file 1931 * @lblk - logical number of the block in the file
1907 * @b_state - b_state of the buffer head added 1932 * @bh - buffer head we want to add to the extent
1908 * 1933 *
1909 * the function is used to collect contig. blocks in same state 1934 * The function is used to collect contig. blocks in the same state. If the
1935 * buffer doesn't require mapping for writeback and we haven't started the
1936 * extent of buffers to map yet, the function returns 'true' immediately - the
1937 * caller can write the buffer right away. Otherwise the function returns true
1938 * if the block has been added to the extent, false if the block couldn't be
1939 * added.
1910 */ 1940 */
1911static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, 1941static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1912 unsigned long b_state) 1942 struct buffer_head *bh)
1913{ 1943{
1914 struct ext4_map_blocks *map = &mpd->map; 1944 struct ext4_map_blocks *map = &mpd->map;
1915 1945
1916 /* Don't go larger than mballoc is willing to allocate */ 1946 /* Buffer that doesn't need mapping for writeback? */
1917 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) 1947 if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1918 return 0; 1948 (!buffer_delay(bh) && !buffer_unwritten(bh))) {
1949 /* So far no extent to map => we write the buffer right away */
1950 if (map->m_len == 0)
1951 return true;
1952 return false;
1953 }
1919 1954
1920 /* First block in the extent? */ 1955 /* First block in the extent? */
1921 if (map->m_len == 0) { 1956 if (map->m_len == 0) {
1922 map->m_lblk = lblk; 1957 map->m_lblk = lblk;
1923 map->m_len = 1; 1958 map->m_len = 1;
1924 map->m_flags = b_state & BH_FLAGS; 1959 map->m_flags = bh->b_state & BH_FLAGS;
1925 return 1; 1960 return true;
1926 } 1961 }
1927 1962
1963 /* Don't go larger than mballoc is willing to allocate */
1964 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1965 return false;
1966
1928 /* Can we merge the block to our big extent? */ 1967 /* Can we merge the block to our big extent? */
1929 if (lblk == map->m_lblk + map->m_len && 1968 if (lblk == map->m_lblk + map->m_len &&
1930 (b_state & BH_FLAGS) == map->m_flags) { 1969 (bh->b_state & BH_FLAGS) == map->m_flags) {
1931 map->m_len++; 1970 map->m_len++;
1932 return 1; 1971 return true;
1933 } 1972 }
1934 return 0; 1973 return false;
1935} 1974}
1936 1975
1937static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, 1976/*
1938 struct buffer_head *head, 1977 * mpage_process_page_bufs - submit page buffers for IO or add them to extent
1939 struct buffer_head *bh, 1978 *
1940 ext4_lblk_t lblk) 1979 * @mpd - extent of blocks for mapping
1980 * @head - the first buffer in the page
1981 * @bh - buffer we should start processing from
1982 * @lblk - logical number of the block in the file corresponding to @bh
1983 *
1984 * Walk through page buffers from @bh upto @head (exclusive) and either submit
1985 * the page for IO if all buffers in this page were mapped and there's no
1986 * accumulated extent of buffers to map or add buffers in the page to the
1987 * extent of buffers to map. The function returns 1 if the caller can continue
1988 * by processing the next page, 0 if it should stop adding buffers to the
1989 * extent to map because we cannot extend it anymore. It can also return value
1990 * < 0 in case of error during IO submission.
1991 */
1992static int mpage_process_page_bufs(struct mpage_da_data *mpd,
1993 struct buffer_head *head,
1994 struct buffer_head *bh,
1995 ext4_lblk_t lblk)
1941{ 1996{
1942 struct inode *inode = mpd->inode; 1997 struct inode *inode = mpd->inode;
1998 int err;
1943 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) 1999 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
1944 >> inode->i_blkbits; 2000 >> inode->i_blkbits;
1945 2001
1946 do { 2002 do {
1947 BUG_ON(buffer_locked(bh)); 2003 BUG_ON(buffer_locked(bh));
1948 2004
1949 if (!buffer_dirty(bh) || !buffer_mapped(bh) || 2005 if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
1950 (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
1951 lblk >= blocks) {
1952 /* Found extent to map? */ 2006 /* Found extent to map? */
1953 if (mpd->map.m_len) 2007 if (mpd->map.m_len)
1954 return false; 2008 return 0;
1955 if (lblk >= blocks) 2009 /* Everything mapped so far and we hit EOF */
1956 return true; 2010 break;
1957 continue;
1958 } 2011 }
1959 if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
1960 return false;
1961 } while (lblk++, (bh = bh->b_this_page) != head); 2012 } while (lblk++, (bh = bh->b_this_page) != head);
1962 return true; 2013 /* So far everything mapped? Submit the page for IO. */
1963} 2014 if (mpd->map.m_len == 0) {
1964 2015 err = mpage_submit_page(mpd, head->b_page);
1965static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) 2016 if (err < 0)
1966{ 2017 return err;
1967 int len; 2018 }
1968 loff_t size = i_size_read(mpd->inode); 2019 return lblk < blocks;
1969 int err;
1970
1971 BUG_ON(page->index != mpd->first_page);
1972 if (page->index == size >> PAGE_CACHE_SHIFT)
1973 len = size & ~PAGE_CACHE_MASK;
1974 else
1975 len = PAGE_CACHE_SIZE;
1976 clear_page_dirty_for_io(page);
1977 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
1978 if (!err)
1979 mpd->wbc->nr_to_write--;
1980 mpd->first_page++;
1981
1982 return err;
1983} 2020}
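[annotation] Callers of the consolidated helper must honor a tri-state return, and both call sites below do: negative means page submission failed, 0 means stop collecting (the extent to map cannot be extended, or everything up to EOF was handled), positive means the page was fully consumed and scanning may continue. As a caller sketch:

	err = mpage_process_page_bufs(mpd, head, head, lblk);
	if (err < 0)
		goto out;	/* submission error: propagate */
	if (err == 0)
		goto map;	/* stop collecting; map what we have */
	err = 0;		/* page consumed: move to the next one */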
1984 2021
1985/* 2022/*
@@ -2003,8 +2040,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2003 struct inode *inode = mpd->inode; 2040 struct inode *inode = mpd->inode;
2004 struct buffer_head *head, *bh; 2041 struct buffer_head *head, *bh;
2005 int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; 2042 int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
2006 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
2007 >> inode->i_blkbits;
2008 pgoff_t start, end; 2043 pgoff_t start, end;
2009 ext4_lblk_t lblk; 2044 ext4_lblk_t lblk;
2010 sector_t pblock; 2045 sector_t pblock;
@@ -2026,7 +2061,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2026 2061
2027 if (page->index > end) 2062 if (page->index > end)
2028 break; 2063 break;
2029 /* Upto 'end' pages must be contiguous */ 2064 /* Up to 'end' pages must be contiguous */
2030 BUG_ON(page->index != start); 2065 BUG_ON(page->index != start);
2031 bh = head = page_buffers(page); 2066 bh = head = page_buffers(page);
2032 do { 2067 do {
@@ -2039,18 +2074,26 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2039 */ 2074 */
2040 mpd->map.m_len = 0; 2075 mpd->map.m_len = 0;
2041 mpd->map.m_flags = 0; 2076 mpd->map.m_flags = 0;
2042 add_page_bufs_to_extent(mpd, head, bh, 2077 /*
2043 lblk); 2078 * FIXME: If dioread_nolock supports
2079 * blocksize < pagesize, we need to make
2080 * sure we add size mapped so far to
2081 * io_end->size as the following call
2082 * can submit the page for IO.
2083 */
2084 err = mpage_process_page_bufs(mpd, head,
2085 bh, lblk);
2044 pagevec_release(&pvec); 2086 pagevec_release(&pvec);
2045 return 0; 2087 if (err > 0)
2088 err = 0;
2089 return err;
2046 } 2090 }
2047 if (buffer_delay(bh)) { 2091 if (buffer_delay(bh)) {
2048 clear_buffer_delay(bh); 2092 clear_buffer_delay(bh);
2049 bh->b_blocknr = pblock++; 2093 bh->b_blocknr = pblock++;
2050 } 2094 }
2051 clear_buffer_unwritten(bh); 2095 clear_buffer_unwritten(bh);
2052 } while (++lblk < blocks && 2096 } while (lblk++, (bh = bh->b_this_page) != head);
2053 (bh = bh->b_this_page) != head);
2054 2097
2055 /* 2098 /*
2056 * FIXME: This is going to break if dioread_nolock 2099 * FIXME: This is going to break if dioread_nolock
@@ -2199,12 +2242,10 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2199 2242
2200 /* Update on-disk size after IO is submitted */ 2243 /* Update on-disk size after IO is submitted */
2201 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; 2244 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
2202 if (disksize > i_size_read(inode))
2203 disksize = i_size_read(inode);
2204 if (disksize > EXT4_I(inode)->i_disksize) { 2245 if (disksize > EXT4_I(inode)->i_disksize) {
2205 int err2; 2246 int err2;
2206 2247
2207 ext4_update_i_disksize(inode, disksize); 2248 ext4_wb_update_i_disksize(inode, disksize);
2208 err2 = ext4_mark_inode_dirty(handle, inode); 2249 err2 = ext4_mark_inode_dirty(handle, inode);
2209 if (err2) 2250 if (err2)
2210 ext4_error(inode->i_sb, 2251 ext4_error(inode->i_sb,
@@ -2219,7 +2260,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2219/* 2260/*
2220 * Calculate the total number of credits to reserve for one writepages 2261 * Calculate the total number of credits to reserve for one writepages
2221 * iteration. This is called from ext4_writepages(). We map an extent of 2262 * iteration. This is called from ext4_writepages(). We map an extent of
2222 * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping 2263 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2223 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + 2264 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2224 * bpp - 1 blocks in bpp different extents. 2265 * bpp - 1 blocks in bpp different extents.
2225 */ 2266 */
@@ -2319,14 +2360,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2319 lblk = ((ext4_lblk_t)page->index) << 2360 lblk = ((ext4_lblk_t)page->index) <<
2320 (PAGE_CACHE_SHIFT - blkbits); 2361 (PAGE_CACHE_SHIFT - blkbits);
2321 head = page_buffers(page); 2362 head = page_buffers(page);
2322 if (!add_page_bufs_to_extent(mpd, head, head, lblk)) 2363 err = mpage_process_page_bufs(mpd, head, head, lblk);
2364 if (err <= 0)
2323 goto out; 2365 goto out;
2324 /* So far everything mapped? Submit the page for IO. */ 2366 err = 0;
2325 if (mpd->map.m_len == 0) {
2326 err = mpage_submit_page(mpd, page);
2327 if (err < 0)
2328 goto out;
2329 }
2330 2367
2331 /* 2368 /*
2332 * Accumulated enough dirty pages? This doesn't apply 2369 * Accumulated enough dirty pages? This doesn't apply
@@ -2410,7 +2447,7 @@ static int ext4_writepages(struct address_space *mapping,
2410 2447
2411 if (ext4_should_dioread_nolock(inode)) { 2448 if (ext4_should_dioread_nolock(inode)) {
2412 /* 2449 /*
2413 * We may need to convert upto one extent per block in 2450 * We may need to convert up to one extent per block in
2414 * the page and we may dirty the inode. 2451 * the page and we may dirty the inode.
2415 */ 2452 */
2416 rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); 2453 rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
@@ -2646,7 +2683,7 @@ retry_journal:
2646 goto retry_grab; 2683 goto retry_grab;
2647 } 2684 }
2648 /* In case writeback began while the page was unlocked */ 2685 /* In case writeback began while the page was unlocked */
2649 wait_on_page_writeback(page); 2686 wait_for_stable_page(page);
2650 2687
2651 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); 2688 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
2652 if (ret < 0) { 2689 if (ret < 0) {
@@ -2991,19 +3028,13 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
2991} 3028}
2992 3029
2993static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3030static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2994 ssize_t size, void *private, int ret, 3031 ssize_t size, void *private)
2995 bool is_async)
2996{ 3032{
2997 struct inode *inode = file_inode(iocb->ki_filp);
2998 ext4_io_end_t *io_end = iocb->private; 3033 ext4_io_end_t *io_end = iocb->private;
2999 3034
3000 /* if not async direct IO just return */ 3035 /* if not async direct IO just return */
3001 if (!io_end) { 3036 if (!io_end)
3002 inode_dio_done(inode);
3003 if (is_async)
3004 aio_complete(iocb, ret, 0);
3005 return; 3037 return;
3006 }
3007 3038
3008 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3039 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3009 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3040 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3013,11 +3044,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3013 iocb->private = NULL; 3044 iocb->private = NULL;
3014 io_end->offset = offset; 3045 io_end->offset = offset;
3015 io_end->size = size; 3046 io_end->size = size;
3016 if (is_async) { 3047 ext4_put_io_end(io_end);
3017 io_end->iocb = iocb;
3018 io_end->result = ret;
3019 }
3020 ext4_put_io_end_defer(io_end);
3021} 3048}
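[annotation] This tracks the generic deferred-AIO-completion rework that landed in the same window: the direct-IO core now runs aio_complete() and inode_dio_done() itself, so a filesystem's end_io callback drops the is_async/ret arguments and keeps only its own bookkeeping. The resulting callback shape — my_io_end and my_queue_completion() are stand-ins for the ext4 types:

	/* dio_iodone_t as of this series: fs-specific work only */
	static void my_end_io_dio(struct kiocb *iocb, loff_t offset,
				  ssize_t size, void *private)
	{
		struct my_io_end *io = iocb->private;

		if (!io)		/* not async direct IO: nothing to do */
			return;
		iocb->private = NULL;
		io->offset = offset;
		io->size = size;
		my_queue_completion(io);	/* cf. ext4_put_io_end() above */
	}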
3022 3049
3023/* 3050/*
@@ -3102,7 +3129,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3102 ret = -ENOMEM; 3129 ret = -ENOMEM;
3103 goto retake_lock; 3130 goto retake_lock;
3104 } 3131 }
3105 io_end->flag |= EXT4_IO_END_DIRECT;
3106 /* 3132 /*
3107 * Grab reference for DIO. Will be dropped in ext4_end_io_dio() 3133 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
3108 */ 3134 */
@@ -3147,13 +3173,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3147 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { 3173 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3148 WARN_ON(iocb->private != io_end); 3174 WARN_ON(iocb->private != io_end);
3149 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 3175 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3150 WARN_ON(io_end->iocb);
3151 /*
3152 * Generic code already did inode_dio_done() so we
3153 * have to clear EXT4_IO_END_DIRECT to not do it for
3154 * the second time.
3155 */
3156 io_end->flag = 0;
3157 ext4_put_io_end(io_end); 3176 ext4_put_io_end(io_end);
3158 iocb->private = NULL; 3177 iocb->private = NULL;
3159 } 3178 }
@@ -4566,7 +4585,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4566 ext4_journal_stop(handle); 4585 ext4_journal_stop(handle);
4567 } 4586 }
4568 4587
4569 if (attr->ia_valid & ATTR_SIZE) { 4588 if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
4589 handle_t *handle;
4570 4590
4571 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4591 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4572 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4592 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4574,73 +4594,69 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4574 if (attr->ia_size > sbi->s_bitmap_maxbytes) 4594 if (attr->ia_size > sbi->s_bitmap_maxbytes)
4575 return -EFBIG; 4595 return -EFBIG;
4576 } 4596 }
4577 } 4597 if (S_ISREG(inode->i_mode) &&
4578 4598 (attr->ia_size < inode->i_size)) {
4579 if (S_ISREG(inode->i_mode) && 4599 if (ext4_should_order_data(inode)) {
4580 attr->ia_valid & ATTR_SIZE && 4600 error = ext4_begin_ordered_truncate(inode,
4581 (attr->ia_size < inode->i_size)) {
4582 handle_t *handle;
4583
4584 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
4585 if (IS_ERR(handle)) {
4586 error = PTR_ERR(handle);
4587 goto err_out;
4588 }
4589 if (ext4_handle_valid(handle)) {
4590 error = ext4_orphan_add(handle, inode);
4591 orphan = 1;
4592 }
4593 EXT4_I(inode)->i_disksize = attr->ia_size;
4594 rc = ext4_mark_inode_dirty(handle, inode);
4595 if (!error)
4596 error = rc;
4597 ext4_journal_stop(handle);
4598
4599 if (ext4_should_order_data(inode)) {
4600 error = ext4_begin_ordered_truncate(inode,
4601 attr->ia_size); 4601 attr->ia_size);
4602 if (error) { 4602 if (error)
4603 /* Do as much error cleanup as possible */
4604 handle = ext4_journal_start(inode,
4605 EXT4_HT_INODE, 3);
4606 if (IS_ERR(handle)) {
4607 ext4_orphan_del(NULL, inode);
4608 goto err_out; 4603 goto err_out;
4609 } 4604 }
4610 ext4_orphan_del(handle, inode); 4605 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
4611 orphan = 0; 4606 if (IS_ERR(handle)) {
4612 ext4_journal_stop(handle); 4607 error = PTR_ERR(handle);
4613 goto err_out; 4608 goto err_out;
4614 } 4609 }
4615 } 4610 if (ext4_handle_valid(handle)) {
4616 } 4611 error = ext4_orphan_add(handle, inode);
4617 4612 orphan = 1;
4618 if (attr->ia_valid & ATTR_SIZE) {
4619 if (attr->ia_size != inode->i_size) {
4620 loff_t oldsize = inode->i_size;
4621
4622 i_size_write(inode, attr->ia_size);
4623 /*
4624 * Blocks are going to be removed from the inode. Wait
4625 * for dio in flight. Temporarily disable
4626 * dioread_nolock to prevent livelock.
4627 */
4628 if (orphan) {
4629 if (!ext4_should_journal_data(inode)) {
4630 ext4_inode_block_unlocked_dio(inode);
4631 inode_dio_wait(inode);
4632 ext4_inode_resume_unlocked_dio(inode);
4633 } else
4634 ext4_wait_for_tail_page_commit(inode);
4635 } 4613 }
4614 down_write(&EXT4_I(inode)->i_data_sem);
4615 EXT4_I(inode)->i_disksize = attr->ia_size;
4616 rc = ext4_mark_inode_dirty(handle, inode);
4617 if (!error)
4618 error = rc;
4636 /* 4619 /*
4637 * Truncate pagecache after we've waited for commit 4620 * We have to update i_size under i_data_sem together
4638 * in data=journal mode to make pages freeable. 4621 * with i_disksize to avoid races with writeback code
4622 * running ext4_wb_update_i_disksize().
4639 */ 4623 */
4640 truncate_pagecache(inode, oldsize, inode->i_size); 4624 if (!error)
4625 i_size_write(inode, attr->ia_size);
4626 up_write(&EXT4_I(inode)->i_data_sem);
4627 ext4_journal_stop(handle);
4628 if (error) {
4629 ext4_orphan_del(NULL, inode);
4630 goto err_out;
4631 }
4632 } else
4633 i_size_write(inode, attr->ia_size);
4634
4635 /*
4636 * Blocks are going to be removed from the inode. Wait
4637 * for dio in flight. Temporarily disable
4638 * dioread_nolock to prevent livelock.
4639 */
4640 if (orphan) {
4641 if (!ext4_should_journal_data(inode)) {
4642 ext4_inode_block_unlocked_dio(inode);
4643 inode_dio_wait(inode);
4644 ext4_inode_resume_unlocked_dio(inode);
4645 } else
4646 ext4_wait_for_tail_page_commit(inode);
4641 } 4647 }
4642 ext4_truncate(inode); 4648 /*
4649 * Truncate pagecache after we've waited for commit
4650 * in data=journal mode to make pages freeable.
4651 */
4652 truncate_pagecache(inode, inode->i_size);
4643 } 4653 }
4654 /*
4655 * We want to call ext4_truncate() even if attr->ia_size ==
4656 * inode->i_size for cases like truncation of fallocated space
4657 */
4658 if (attr->ia_valid & ATTR_SIZE)
4659 ext4_truncate(inode);
4644 4660
4645 if (!rc) { 4661 if (!rc) {
4646 setattr_copy(inode, attr); 4662 setattr_copy(inode, attr);
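The core of the ext4_setattr() rewrite is easier to see consolidated: i_disksize and i_size now change together under i_data_sem, which the new comment justifies as avoiding a race with writeback running ext4_wb_update_i_disksize(). A sketch of that window, assembled from the right-hand column above with error handling trimmed:

        down_write(&EXT4_I(inode)->i_data_sem);
        EXT4_I(inode)->i_disksize = attr->ia_size;
        rc = ext4_mark_inode_dirty(handle, inode);
        if (!error)
                error = rc;
        /* i_size must move under the same lock as i_disksize */
        if (!error)
                i_size_write(inode, attr->ia_size);
        up_write(&EXT4_I(inode)->i_data_sem);
        ext4_journal_stop(handle);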
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c0427e2f6648..a569d335f804 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -17,7 +17,6 @@
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "ext4_jbd2.h" 18#include "ext4_jbd2.h"
19#include "ext4.h" 19#include "ext4.h"
20#include "ext4_extents.h"
21 20
22#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) 21#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
23 22
@@ -624,6 +623,8 @@ resizefs_out:
624 623
625 return 0; 624 return 0;
626 } 625 }
626 case EXT4_IOC_PRECACHE_EXTENTS:
627 return ext4_ext_precache(inode);
627 628
628 default: 629 default:
629 return -ENOTTY; 630 return -ENOTTY;
@@ -688,6 +689,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
688 case EXT4_IOC_MOVE_EXT: 689 case EXT4_IOC_MOVE_EXT:
689 case FITRIM: 690 case FITRIM:
690 case EXT4_IOC_RESIZE_FS: 691 case EXT4_IOC_RESIZE_FS:
692 case EXT4_IOC_PRECACHE_EXTENTS:
691 break; 693 break;
692 default: 694 default:
693 return -ENOIOCTLCMD; 695 return -ENOIOCTLCMD;
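A hypothetical userspace caller of the new ioctl might look as follows. The request definition mirrors the one this series adds to fs/ext4/ext4.h (an assumption here, since that hunk is not shown); everything else is standard ioctl(2) usage.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#ifndef EXT4_IOC_PRECACHE_EXTENTS
#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)  /* assumed value from the series */
#endif

int main(int argc, char **argv)
{
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file-on-ext4>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Ask ext4 to pre-read the file's extent tree into the cache. */
        if (ioctl(fd, EXT4_IOC_PRECACHE_EXTENTS) < 0)
                perror("EXT4_IOC_PRECACHE_EXTENTS");
        close(fd);
        return 0;
}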
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4bbbf13bd743..a41e3ba8cfaa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -751,13 +751,15 @@ void ext4_mb_generate_buddy(struct super_block *sb,
751 751
752 if (free != grp->bb_free) { 752 if (free != grp->bb_free) {
753 ext4_grp_locked_error(sb, group, 0, 0, 753 ext4_grp_locked_error(sb, group, 0, 0,
754 "%u clusters in bitmap, %u in gd", 754 "%u clusters in bitmap, %u in gd; "
755 "block bitmap corrupt.",
755 free, grp->bb_free); 756 free, grp->bb_free);
756 /* 757 /*
757 * If we intent to continue, we consider group descritor 758 * If we intend to continue, we consider group descriptor
758 * corrupt and update bb_free using bitmap value 759 * corrupt and update bb_free using bitmap value
759 */ 760 */
760 grp->bb_free = free; 761 grp->bb_free = free;
762 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
761 } 763 }
762 mb_set_largest_free_order(sb, grp); 764 mb_set_largest_free_order(sb, grp);
763 765
@@ -1398,6 +1400,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1398 1400
1399 BUG_ON(last >= (sb->s_blocksize << 3)); 1401 BUG_ON(last >= (sb->s_blocksize << 3));
1400 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1402 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1403 /* Don't bother if the block group is corrupt. */
1404 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1405 return;
1406
1401 mb_check_buddy(e4b); 1407 mb_check_buddy(e4b);
1402 mb_free_blocks_double(inode, e4b, first, count); 1408 mb_free_blocks_double(inode, e4b, first, count);
1403 1409
@@ -1423,7 +1429,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1423 inode ? inode->i_ino : 0, 1429 inode ? inode->i_ino : 0,
1424 blocknr, 1430 blocknr,
1425 "freeing already freed block " 1431 "freeing already freed block "
1426 "(bit %u)", block); 1432 "(bit %u); block bitmap corrupt.",
1433 block);
1434 /* Mark the block group as corrupt. */
1435 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1436 &e4b->bd_info->bb_state);
1427 mb_regenerate_buddy(e4b); 1437 mb_regenerate_buddy(e4b);
1428 goto done; 1438 goto done;
1429 } 1439 }
@@ -1790,6 +1800,11 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1790 if (err) 1800 if (err)
1791 return err; 1801 return err;
1792 1802
1803 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
1804 ext4_mb_unload_buddy(e4b);
1805 return 0;
1806 }
1807
1793 ext4_lock_group(ac->ac_sb, group); 1808 ext4_lock_group(ac->ac_sb, group);
1794 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 1809 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1795 ac->ac_g_ex.fe_len, &ex); 1810 ac->ac_g_ex.fe_len, &ex);
@@ -1987,6 +2002,9 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1987 if (cr <= 2 && free < ac->ac_g_ex.fe_len) 2002 if (cr <= 2 && free < ac->ac_g_ex.fe_len)
1988 return 0; 2003 return 0;
1989 2004
2005 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2006 return 0;
2007
1990 /* We only do this if the grp has never been initialized */ 2008 /* We only do this if the grp has never been initialized */
1991 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 2009 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1992 int ret = ext4_mb_init_group(ac->ac_sb, group); 2010 int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -4585,6 +4603,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4585 struct buffer_head *gd_bh; 4603 struct buffer_head *gd_bh;
4586 ext4_group_t block_group; 4604 ext4_group_t block_group;
4587 struct ext4_sb_info *sbi; 4605 struct ext4_sb_info *sbi;
4606 struct ext4_inode_info *ei = EXT4_I(inode);
4588 struct ext4_buddy e4b; 4607 struct ext4_buddy e4b;
4589 unsigned int count_clusters; 4608 unsigned int count_clusters;
4590 int err = 0; 4609 int err = 0;
@@ -4673,6 +4692,10 @@ do_more:
4673 overflow = 0; 4692 overflow = 0;
4674 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4693 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4675 4694
4695 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
4696 ext4_get_group_info(sb, block_group))))
4697 return;
4698
4676 /* 4699 /*
4677 * Check to see if we are freeing blocks across a group 4700 * Check to see if we are freeing blocks across a group
4678 * boundary. 4701 * boundary.
@@ -4784,7 +4807,6 @@ do_more:
4784 ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); 4807 ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
4785 ext4_group_desc_csum_set(sb, block_group, gdp); 4808 ext4_group_desc_csum_set(sb, block_group, gdp);
4786 ext4_unlock_group(sb, block_group); 4809 ext4_unlock_group(sb, block_group);
4787 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4788 4810
4789 if (sbi->s_log_groups_per_flex) { 4811 if (sbi->s_log_groups_per_flex) {
4790 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4812 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
@@ -4792,10 +4814,23 @@ do_more:
4792 &sbi->s_flex_groups[flex_group].free_clusters); 4814 &sbi->s_flex_groups[flex_group].free_clusters);
4793 } 4815 }
4794 4816
4795 ext4_mb_unload_buddy(&e4b); 4817 if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) {
4796 4818 percpu_counter_add(&sbi->s_dirtyclusters_counter,
4797 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) 4819 count_clusters);
4820 spin_lock(&ei->i_block_reservation_lock);
4821 if (flags & EXT4_FREE_BLOCKS_METADATA)
4822 ei->i_reserved_meta_blocks += count_clusters;
4823 else
4824 ei->i_reserved_data_blocks += count_clusters;
4825 spin_unlock(&ei->i_block_reservation_lock);
4826 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4827 dquot_reclaim_block(inode,
4828 EXT4_C2B(sbi, count_clusters));
4829 } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4798 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); 4830 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4831 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4832
4833 ext4_mb_unload_buddy(&e4b);
4799 4834
4800 /* We dirtied the bitmap block */ 4835 /* We dirtied the bitmap block */
4801 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4836 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
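The mballoc hunks above all add instances of one pattern; condensed into one place (names are taken from the hunks, the surrounding code is schematic):

        /* On detecting an inconsistency, mark the group once... */
        set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);

        /* ...and have allocation/free paths refuse to touch it afterwards. */
        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
                return 0;       /* skip the group rather than trust its bitmap */

Relatedly, ext4_free_blocks() now bumps s_freeclusters_counter only after the new EXT4_FREE_BLOCKS_RESERVE branch, which accounts reserved frees back to s_dirtyclusters_counter and the inode's reservation, reclaiming quota rather than freeing it.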
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 49e8bdff9163..2ae73a80c19b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
39 newext.ee_block = cpu_to_le32(lb->first_block); 39 newext.ee_block = cpu_to_le32(lb->first_block);
40 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); 40 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
41 ext4_ext_store_pblock(&newext, lb->first_pblock); 41 ext4_ext_store_pblock(&newext, lb->first_pblock);
42 path = ext4_ext_find_extent(inode, lb->first_block, NULL); 42 path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
43 43
44 if (IS_ERR(path)) { 44 if (IS_ERR(path)) {
45 retval = PTR_ERR(path); 45 retval = PTR_ERR(path);
@@ -494,7 +494,7 @@ int ext4_ext_migrate(struct inode *inode)
494 * superblock modification. 494 * superblock modification.
495 * 495 *
496 * For the tmp_inode we already have committed the 496 * For the tmp_inode we already have committed the
497 * trascation that created the inode. Later as and 497 * transaction that created the inode. Later as and
498 * when we add extents we extent the journal 498 * when we add extents we extent the journal
499 */ 499 */
500 /* 500 /*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e86dddbd8296..7fa4d855dbd5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
37 int ret = 0; 37 int ret = 0;
38 struct ext4_ext_path *path; 38 struct ext4_ext_path *path;
39 39
40 path = ext4_ext_find_extent(inode, lblock, *orig_path); 40 path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
41 if (IS_ERR(path)) 41 if (IS_ERR(path))
42 ret = PTR_ERR(path); 42 ret = PTR_ERR(path);
43 else if (path[ext_depth(inode)].p_ext == NULL) 43 else if (path[ext_depth(inode)].p_ext == NULL)
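Both call sites reflect a widened ext4_ext_find_extent() signature; the new fourth argument carries lookup flags. A sketch of the post-series call shape, where EXT4_EX_NOCACHE asks the lookup not to populate the extent status cache on the way down:

        path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                ret = PTR_ERR(path);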
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 35f55a0dbc4b..1bec5a5c1e45 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3005,15 +3005,19 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
3005/* 3005/*
3006 * Anybody can rename anything with this: the permission checks are left to the 3006 * Anybody can rename anything with this: the permission checks are left to the
3007 * higher-level routines. 3007 * higher-level routines.
3008 *
3009 * n.b. old_{dentry,inode} refers to the source dentry/inode
3010 * while new_{dentry,inode} refers to the destination dentry/inode
3011 * This comes from rename(const char *oldpath, const char *newpath)
3008 */ 3012 */
3009static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, 3013static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3010 struct inode *new_dir, struct dentry *new_dentry) 3014 struct inode *new_dir, struct dentry *new_dentry)
3011{ 3015{
3012 handle_t *handle; 3016 handle_t *handle = NULL;
3013 struct inode *old_inode, *new_inode; 3017 struct inode *old_inode, *new_inode;
3014 struct buffer_head *old_bh, *new_bh, *dir_bh; 3018 struct buffer_head *old_bh, *new_bh, *dir_bh;
3015 struct ext4_dir_entry_2 *old_de, *new_de; 3019 struct ext4_dir_entry_2 *old_de, *new_de;
3016 int retval, force_da_alloc = 0; 3020 int retval;
3017 int inlined = 0, new_inlined = 0; 3021 int inlined = 0, new_inlined = 0;
3018 struct ext4_dir_entry_2 *parent_de; 3022 struct ext4_dir_entry_2 *parent_de;
3019 3023
@@ -3026,14 +3030,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3026 * in separate transaction */ 3030 * in separate transaction */
3027 if (new_dentry->d_inode) 3031 if (new_dentry->d_inode)
3028 dquot_initialize(new_dentry->d_inode); 3032 dquot_initialize(new_dentry->d_inode);
3029 handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
3030 (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
3031 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3032 if (IS_ERR(handle))
3033 return PTR_ERR(handle);
3034
3035 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
3036 ext4_handle_sync(handle);
3037 3033
3038 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); 3034 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
3039 /* 3035 /*
@@ -3056,6 +3052,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3056 new_bh = NULL; 3052 new_bh = NULL;
3057 } 3053 }
3058 } 3054 }
3055 if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
3056 ext4_alloc_da_blocks(old_inode);
3057
3058 handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
3059 (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
3060 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3061 if (IS_ERR(handle))
3062 return PTR_ERR(handle);
3063
3064 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
3065 ext4_handle_sync(handle);
3066
3059 if (S_ISDIR(old_inode->i_mode)) { 3067 if (S_ISDIR(old_inode->i_mode)) {
3060 if (new_inode) { 3068 if (new_inode) {
3061 retval = -ENOTEMPTY; 3069 retval = -ENOTEMPTY;
@@ -3186,8 +3194,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3186 ext4_mark_inode_dirty(handle, new_inode); 3194 ext4_mark_inode_dirty(handle, new_inode);
3187 if (!new_inode->i_nlink) 3195 if (!new_inode->i_nlink)
3188 ext4_orphan_add(handle, new_inode); 3196 ext4_orphan_add(handle, new_inode);
3189 if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
3190 force_da_alloc = 1;
3191 } 3197 }
3192 retval = 0; 3198 retval = 0;
3193 3199
@@ -3195,9 +3201,8 @@ end_rename:
3195 brelse(dir_bh); 3201 brelse(dir_bh);
3196 brelse(old_bh); 3202 brelse(old_bh);
3197 brelse(new_bh); 3203 brelse(new_bh);
3198 ext4_journal_stop(handle); 3204 if (handle)
3199 if (retval == 0 && force_da_alloc) 3205 ext4_journal_stop(handle);
3200 ext4_alloc_da_blocks(old_inode);
3201 return retval; 3206 return retval;
3202} 3207}
3203 3208
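The net effect of the ext4_rename() churn is an ordering change, sketched below: the delayed-allocation flush that auto_da_alloc wants now runs before any transaction handle exists, so writeback can no longer stall a running handle (and handle is NULL-initialized so the error path can skip ext4_journal_stop()).

        if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
                ext4_alloc_da_blocks(old_inode);        /* flush first... */

        handle = ext4_journal_start(old_dir, EXT4_HT_DIR,       /* ...then journal */
                        (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
                         EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
        if (IS_ERR(handle))
                return PTR_ERR(handle);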
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 6625d210fb45..d7d0c7b46ed4 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -123,10 +123,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
123 ext4_finish_bio(bio); 123 ext4_finish_bio(bio);
124 bio_put(bio); 124 bio_put(bio);
125 } 125 }
126 if (io_end->flag & EXT4_IO_END_DIRECT)
127 inode_dio_done(io_end->inode);
128 if (io_end->iocb)
129 aio_complete(io_end->iocb, io_end->result, 0);
130 kmem_cache_free(io_end_cachep, io_end); 126 kmem_cache_free(io_end_cachep, io_end);
131} 127}
132 128
@@ -204,19 +200,14 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
204 struct workqueue_struct *wq; 200 struct workqueue_struct *wq;
205 unsigned long flags; 201 unsigned long flags;
206 202
207 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 203 /* Only reserved conversions from writeback should enter here */
204 WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
205 WARN_ON(!io_end->handle);
208 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 206 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
209 if (io_end->handle) { 207 wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
210 wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; 208 if (list_empty(&ei->i_rsv_conversion_list))
211 if (list_empty(&ei->i_rsv_conversion_list)) 209 queue_work(wq, &ei->i_rsv_conversion_work);
212 queue_work(wq, &ei->i_rsv_conversion_work); 210 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
213 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
214 } else {
215 wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
216 if (list_empty(&ei->i_unrsv_conversion_list))
217 queue_work(wq, &ei->i_unrsv_conversion_work);
218 list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
219 }
220 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 211 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
221} 212}
222 213
@@ -256,13 +247,6 @@ void ext4_end_io_rsv_work(struct work_struct *work)
256 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); 247 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
257} 248}
258 249
259void ext4_end_io_unrsv_work(struct work_struct *work)
260{
261 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
262 i_unrsv_conversion_work);
263 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
264}
265
266ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 250ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
267{ 251{
268 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); 252 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b59373b625e9..2c2e6cbc6bed 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -162,7 +162,7 @@ void *ext4_kvmalloc(size_t size, gfp_t flags)
162{ 162{
163 void *ret; 163 void *ret;
164 164
165 ret = kmalloc(size, flags); 165 ret = kmalloc(size, flags | __GFP_NOWARN);
166 if (!ret) 166 if (!ret)
167 ret = __vmalloc(size, flags, PAGE_KERNEL); 167 ret = __vmalloc(size, flags, PAGE_KERNEL);
168 return ret; 168 return ret;
@@ -172,7 +172,7 @@ void *ext4_kvzalloc(size_t size, gfp_t flags)
172{ 172{
173 void *ret; 173 void *ret;
174 174
175 ret = kzalloc(size, flags); 175 ret = kzalloc(size, flags | __GFP_NOWARN);
176 if (!ret) 176 if (!ret)
177 ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); 177 ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
178 return ret; 178 return ret;
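Assembled from the hunk above, the post-patch allocator reads as the classic try-small-then-fall-back pattern; __GFP_NOWARN keeps an expected high-order kmalloc() failure from spamming the log before __vmalloc() gets its turn:

void *ext4_kvmalloc(size_t size, gfp_t flags)
{
        void *ret;

        ret = kmalloc(size, flags | __GFP_NOWARN);      /* quiet first attempt */
        if (!ret)
                ret = __vmalloc(size, flags, PAGE_KERNEL);      /* fallback */
        return ret;
}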
@@ -762,9 +762,7 @@ static void ext4_put_super(struct super_block *sb)
762 ext4_unregister_li_request(sb); 762 ext4_unregister_li_request(sb);
763 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 763 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
764 764
765 flush_workqueue(sbi->unrsv_conversion_wq);
766 flush_workqueue(sbi->rsv_conversion_wq); 765 flush_workqueue(sbi->rsv_conversion_wq);
767 destroy_workqueue(sbi->unrsv_conversion_wq);
768 destroy_workqueue(sbi->rsv_conversion_wq); 766 destroy_workqueue(sbi->rsv_conversion_wq);
769 767
770 if (sbi->s_journal) { 768 if (sbi->s_journal) {
@@ -875,14 +873,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
875#endif 873#endif
876 ei->jinode = NULL; 874 ei->jinode = NULL;
877 INIT_LIST_HEAD(&ei->i_rsv_conversion_list); 875 INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
878 INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
879 spin_lock_init(&ei->i_completed_io_lock); 876 spin_lock_init(&ei->i_completed_io_lock);
880 ei->i_sync_tid = 0; 877 ei->i_sync_tid = 0;
881 ei->i_datasync_tid = 0; 878 ei->i_datasync_tid = 0;
882 atomic_set(&ei->i_ioend_count, 0); 879 atomic_set(&ei->i_ioend_count, 0);
883 atomic_set(&ei->i_unwritten, 0); 880 atomic_set(&ei->i_unwritten, 0);
884 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); 881 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
885 INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
886 882
887 return &ei->vfs_inode; 883 return &ei->vfs_inode;
888} 884}
@@ -1134,8 +1130,8 @@ enum {
1134 Opt_nouid32, Opt_debug, Opt_removed, 1130 Opt_nouid32, Opt_debug, Opt_removed,
1135 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1131 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1136 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, 1132 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
1137 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1133 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
1138 Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, 1134 Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
1139 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1135 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1140 Opt_data_err_abort, Opt_data_err_ignore, 1136 Opt_data_err_abort, Opt_data_err_ignore,
1141 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1137 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -1179,6 +1175,7 @@ static const match_table_t tokens = {
1179 {Opt_min_batch_time, "min_batch_time=%u"}, 1175 {Opt_min_batch_time, "min_batch_time=%u"},
1180 {Opt_max_batch_time, "max_batch_time=%u"}, 1176 {Opt_max_batch_time, "max_batch_time=%u"},
1181 {Opt_journal_dev, "journal_dev=%u"}, 1177 {Opt_journal_dev, "journal_dev=%u"},
1178 {Opt_journal_path, "journal_path=%s"},
1182 {Opt_journal_checksum, "journal_checksum"}, 1179 {Opt_journal_checksum, "journal_checksum"},
1183 {Opt_journal_async_commit, "journal_async_commit"}, 1180 {Opt_journal_async_commit, "journal_async_commit"},
1184 {Opt_abort, "abort"}, 1181 {Opt_abort, "abort"},
@@ -1338,6 +1335,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
1338#define MOPT_NO_EXT2 0x0100 1335#define MOPT_NO_EXT2 0x0100
1339#define MOPT_NO_EXT3 0x0200 1336#define MOPT_NO_EXT3 0x0200
1340#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) 1337#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
1338#define MOPT_STRING 0x0400
1341 1339
1342static const struct mount_opts { 1340static const struct mount_opts {
1343 int token; 1341 int token;
@@ -1387,6 +1385,7 @@ static const struct mount_opts {
1387 {Opt_resuid, 0, MOPT_GTE0}, 1385 {Opt_resuid, 0, MOPT_GTE0},
1388 {Opt_resgid, 0, MOPT_GTE0}, 1386 {Opt_resgid, 0, MOPT_GTE0},
1389 {Opt_journal_dev, 0, MOPT_GTE0}, 1387 {Opt_journal_dev, 0, MOPT_GTE0},
1388 {Opt_journal_path, 0, MOPT_STRING},
1390 {Opt_journal_ioprio, 0, MOPT_GTE0}, 1389 {Opt_journal_ioprio, 0, MOPT_GTE0},
1391 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, 1390 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
1392 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, 1391 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
@@ -1480,7 +1479,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1480 return -1; 1479 return -1;
1481 } 1480 }
1482 1481
1483 if (args->from && match_int(args, &arg)) 1482 if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
1484 return -1; 1483 return -1;
1485 if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) 1484 if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
1486 return -1; 1485 return -1;
@@ -1544,6 +1543,44 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1544 return -1; 1543 return -1;
1545 } 1544 }
1546 *journal_devnum = arg; 1545 *journal_devnum = arg;
1546 } else if (token == Opt_journal_path) {
1547 char *journal_path;
1548 struct inode *journal_inode;
1549 struct path path;
1550 int error;
1551
1552 if (is_remount) {
1553 ext4_msg(sb, KERN_ERR,
1554 "Cannot specify journal on remount");
1555 return -1;
1556 }
1557 journal_path = match_strdup(&args[0]);
1558 if (!journal_path) {
1559 ext4_msg(sb, KERN_ERR, "error: could not dup "
1560 "journal device string");
1561 return -1;
1562 }
1563
1564 error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
1565 if (error) {
1566 ext4_msg(sb, KERN_ERR, "error: could not find "
1567 "journal device path: error %d", error);
1568 kfree(journal_path);
1569 return -1;
1570 }
1571
1572 journal_inode = path.dentry->d_inode;
1573 if (!S_ISBLK(journal_inode->i_mode)) {
1574 ext4_msg(sb, KERN_ERR, "error: journal path %s "
1575 "is not a block device", journal_path);
1576 path_put(&path);
1577 kfree(journal_path);
1578 return -1;
1579 }
1580
1581 *journal_devnum = new_encode_dev(journal_inode->i_rdev);
1582 path_put(&path);
1583 kfree(journal_path);
1547 } else if (token == Opt_journal_ioprio) { 1584 } else if (token == Opt_journal_ioprio) {
1548 if (arg > 7) { 1585 if (arg > 7) {
1549 ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" 1586 ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
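A hypothetical mount using the new option (the device paths here are made up; any block device holding the external journal works). journal_path resolves the path with kern_path(), insists on a block device, and is translated into the same devnum that journal_dev=%u would take:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Equivalent to: mount -o journal_path=/dev/sdb1 /dev/sda1 /mnt */
        if (mount("/dev/sda1", "/mnt", "ext4", 0,
                  "journal_path=/dev/sdb1") != 0) {
                perror("mount");
                return 1;
        }
        return 0;
}

Compared to journal_dev=%u, the path form avoids hard-coding a device number that can change across reboots.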
@@ -3954,14 +3991,6 @@ no_journal:
3954 goto failed_mount4; 3991 goto failed_mount4;
3955 } 3992 }
3956 3993
3957 EXT4_SB(sb)->unrsv_conversion_wq =
3958 alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3959 if (!EXT4_SB(sb)->unrsv_conversion_wq) {
3960 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3961 ret = -ENOMEM;
3962 goto failed_mount4;
3963 }
3964
3965 /* 3994 /*
3966 * The jbd2_journal_load will have done any necessary log recovery, 3995 * The jbd2_journal_load will have done any necessary log recovery,
3967 * so we can safely mount the rest of the filesystem now. 3996 * so we can safely mount the rest of the filesystem now.
@@ -4115,8 +4144,6 @@ failed_mount4:
4115 ext4_msg(sb, KERN_ERR, "mount failed"); 4144 ext4_msg(sb, KERN_ERR, "mount failed");
4116 if (EXT4_SB(sb)->rsv_conversion_wq) 4145 if (EXT4_SB(sb)->rsv_conversion_wq)
4117 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); 4146 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4118 if (EXT4_SB(sb)->unrsv_conversion_wq)
4119 destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4120failed_mount_wq: 4147failed_mount_wq:
4121 if (sbi->s_journal) { 4148 if (sbi->s_journal) {
4122 jbd2_journal_destroy(sbi->s_journal); 4149 jbd2_journal_destroy(sbi->s_journal);
@@ -4564,7 +4591,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4564 4591
4565 trace_ext4_sync_fs(sb, wait); 4592 trace_ext4_sync_fs(sb, wait);
4566 flush_workqueue(sbi->rsv_conversion_wq); 4593 flush_workqueue(sbi->rsv_conversion_wq);
4567 flush_workqueue(sbi->unrsv_conversion_wq);
4568 /* 4594 /*
4569 * Writeback quota in non-journalled quota case - journalled quota has 4595 * Writeback quota in non-journalled quota case - journalled quota has
4570 * no dirty dquots 4596 * no dirty dquots
@@ -4600,7 +4626,6 @@ static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
4600 4626
4601 trace_ext4_sync_fs(sb, wait); 4627 trace_ext4_sync_fs(sb, wait);
4602 flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); 4628 flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4603 flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4604 dquot_writeback_dquots(sb, -1); 4629 dquot_writeback_dquots(sb, -1);
4605 if (wait && test_opt(sb, BARRIER)) 4630 if (wait && test_opt(sb, BARRIER))
4606 ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); 4631 ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 66a6b85a51d8..bb312201ca95 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -182,7 +182,7 @@ const struct address_space_operations f2fs_meta_aops = {
182 .set_page_dirty = f2fs_set_meta_page_dirty, 182 .set_page_dirty = f2fs_set_meta_page_dirty,
183}; 183};
184 184
185int check_orphan_space(struct f2fs_sb_info *sbi) 185int acquire_orphan_inode(struct f2fs_sb_info *sbi)
186{ 186{
187 unsigned int max_orphans; 187 unsigned int max_orphans;
188 int err = 0; 188 int err = 0;
@@ -197,10 +197,19 @@ int check_orphan_space(struct f2fs_sb_info *sbi)
197 mutex_lock(&sbi->orphan_inode_mutex); 197 mutex_lock(&sbi->orphan_inode_mutex);
198 if (sbi->n_orphans >= max_orphans) 198 if (sbi->n_orphans >= max_orphans)
199 err = -ENOSPC; 199 err = -ENOSPC;
200 else
201 sbi->n_orphans++;
200 mutex_unlock(&sbi->orphan_inode_mutex); 202 mutex_unlock(&sbi->orphan_inode_mutex);
201 return err; 203 return err;
202} 204}
203 205
206void release_orphan_inode(struct f2fs_sb_info *sbi)
207{
208 mutex_lock(&sbi->orphan_inode_mutex);
209 sbi->n_orphans--;
210 mutex_unlock(&sbi->orphan_inode_mutex);
211}
212
204void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 213void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
205{ 214{
206 struct list_head *head, *this; 215 struct list_head *head, *this;
@@ -229,21 +238,18 @@ retry:
229 list_add(&new->list, this->prev); 238 list_add(&new->list, this->prev);
230 else 239 else
231 list_add_tail(&new->list, head); 240 list_add_tail(&new->list, head);
232
233 sbi->n_orphans++;
234out: 241out:
235 mutex_unlock(&sbi->orphan_inode_mutex); 242 mutex_unlock(&sbi->orphan_inode_mutex);
236} 243}
237 244
238void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 245void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
239{ 246{
240 struct list_head *this, *next, *head; 247 struct list_head *head;
241 struct orphan_inode_entry *orphan; 248 struct orphan_inode_entry *orphan;
242 249
243 mutex_lock(&sbi->orphan_inode_mutex); 250 mutex_lock(&sbi->orphan_inode_mutex);
244 head = &sbi->orphan_inode_list; 251 head = &sbi->orphan_inode_list;
245 list_for_each_safe(this, next, head) { 252 list_for_each_entry(orphan, head, list) {
246 orphan = list_entry(this, struct orphan_inode_entry, list);
247 if (orphan->ino == ino) { 253 if (orphan->ino == ino) {
248 list_del(&orphan->list); 254 list_del(&orphan->list);
249 kmem_cache_free(orphan_entry_slab, orphan); 255 kmem_cache_free(orphan_entry_slab, orphan);
@@ -373,7 +379,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
373 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 379 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
374 goto invalid_cp1; 380 goto invalid_cp1;
375 381
376 pre_version = le64_to_cpu(cp_block->checkpoint_ver); 382 pre_version = cur_cp_version(cp_block);
377 383
378 /* Read the 2nd cp block in this CP pack */ 384 /* Read the 2nd cp block in this CP pack */
379 cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; 385 cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
@@ -388,7 +394,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
388 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 394 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
389 goto invalid_cp2; 395 goto invalid_cp2;
390 396
391 cur_version = le64_to_cpu(cp_block->checkpoint_ver); 397 cur_version = cur_cp_version(cp_block);
392 398
393 if (cur_version == pre_version) { 399 if (cur_version == pre_version) {
394 *version = cur_version; 400 *version = cur_version;
@@ -793,7 +799,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
793 * Increase the version number so that 799 * Increase the version number so that
794 * SIT entries and seg summaries are written at correct place 800 * SIT entries and seg summaries are written at correct place
795 */ 801 */
796 ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver); 802 ckpt_ver = cur_cp_version(ckpt);
797 ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); 803 ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
798 804
799 /* write cached NAT/SIT entries to NAT/SIT area */ 805 /* write cached NAT/SIT entries to NAT/SIT area */
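The split of check_orphan_space() into acquire/release turns orphan slots into reservations. A sketch of a caller, assembled from the checkpoint.c hunk here and the dir.c hunk further down:

        err = acquire_orphan_inode(sbi);        /* reserve one slot, n_orphans++ */
        if (err)
                return err;                     /* orphan area is full (-ENOSPC) */

        if (inode->i_nlink == 0)
                add_orphan_inode(sbi, inode->i_ino);    /* consume the reservation */
        else
                release_orphan_inode(sbi);              /* hand it back, n_orphans-- */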
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 035f9a345cdf..941f9b9ca3a5 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -37,9 +37,9 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
37 struct page *node_page = dn->node_page; 37 struct page *node_page = dn->node_page;
38 unsigned int ofs_in_node = dn->ofs_in_node; 38 unsigned int ofs_in_node = dn->ofs_in_node;
39 39
40 wait_on_page_writeback(node_page); 40 f2fs_wait_on_page_writeback(node_page, NODE, false);
41 41
42 rn = (struct f2fs_node *)page_address(node_page); 42 rn = F2FS_NODE(node_page);
43 43
44 /* Get physical address of data block */ 44 /* Get physical address of data block */
45 addr_array = blkaddr_in_node(rn); 45 addr_array = blkaddr_in_node(rn);
@@ -117,7 +117,8 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
117 block_t start_blkaddr, end_blkaddr; 117 block_t start_blkaddr, end_blkaddr;
118 118
119 BUG_ON(blk_addr == NEW_ADDR); 119 BUG_ON(blk_addr == NEW_ADDR);
120 fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node; 120 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
121 dn->ofs_in_node;
121 122
122 /* Update the page address in the parent node */ 123 /* Update the page address in the parent node */
123 __set_data_blkaddr(dn, blk_addr); 124 __set_data_blkaddr(dn, blk_addr);
@@ -176,7 +177,6 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
176end_update: 177end_update:
177 write_unlock(&fi->ext.ext_lock); 178 write_unlock(&fi->ext.ext_lock);
178 sync_inode_page(dn); 179 sync_inode_page(dn);
179 return;
180} 180}
181 181
182struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) 182struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
@@ -260,8 +260,17 @@ repeat:
260 if (PageUptodate(page)) 260 if (PageUptodate(page))
261 return page; 261 return page;
262 262
263 BUG_ON(dn.data_blkaddr == NEW_ADDR); 263 /*
264 BUG_ON(dn.data_blkaddr == NULL_ADDR); 264 * A new dentry page is allocated but not able to be written, since its
265 * new inode page couldn't be allocated due to -ENOSPC.
266 * In such the case, its blkaddr can be remained as NEW_ADDR.
267 * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata.
268 */
269 if (dn.data_blkaddr == NEW_ADDR) {
270 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
271 SetPageUptodate(page);
272 return page;
273 }
265 274
266 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 275 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
267 if (err) 276 if (err)
@@ -365,7 +374,6 @@ static void read_end_io(struct bio *bio, int err)
365 } 374 }
366 unlock_page(page); 375 unlock_page(page);
367 } while (bvec >= bio->bi_io_vec); 376 } while (bvec >= bio->bi_io_vec);
368 kfree(bio->bi_private);
369 bio_put(bio); 377 bio_put(bio);
370} 378}
371 379
@@ -391,7 +399,6 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
391 bio->bi_end_io = read_end_io; 399 bio->bi_end_io = read_end_io;
392 400
393 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 401 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
394 kfree(bio->bi_private);
395 bio_put(bio); 402 bio_put(bio);
396 up_read(&sbi->bio_sem); 403 up_read(&sbi->bio_sem);
397 f2fs_put_page(page, 1); 404 f2fs_put_page(page, 1);
@@ -442,7 +449,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock,
442 unsigned int end_offset; 449 unsigned int end_offset;
443 450
444 end_offset = IS_INODE(dn.node_page) ? 451 end_offset = IS_INODE(dn.node_page) ?
445 ADDRS_PER_INODE : 452 ADDRS_PER_INODE(F2FS_I(inode)) :
446 ADDRS_PER_BLOCK; 453 ADDRS_PER_BLOCK;
447 454
448 clear_buffer_new(bh_result); 455 clear_buffer_new(bh_result);
@@ -636,9 +643,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
636 int err = 0; 643 int err = 0;
637 int ilock; 644 int ilock;
638 645
639 /* for nobh_write_end */
640 *fsdata = NULL;
641
642 f2fs_balance_fs(sbi); 646 f2fs_balance_fs(sbi);
643repeat: 647repeat:
644 page = grab_cache_page_write_begin(mapping, index, flags); 648 page = grab_cache_page_write_begin(mapping, index, flags);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 0d6c6aafb235..a84b0a8e6854 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -29,7 +29,7 @@ static DEFINE_MUTEX(f2fs_stat_mutex);
29 29
30static void update_general_status(struct f2fs_sb_info *sbi) 30static void update_general_status(struct f2fs_sb_info *sbi)
31{ 31{
32 struct f2fs_stat_info *si = sbi->stat_info; 32 struct f2fs_stat_info *si = F2FS_STAT(sbi);
33 int i; 33 int i;
34 34
35 /* valid check of the segment numbers */ 35 /* valid check of the segment numbers */
@@ -83,7 +83,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
83 */ 83 */
84static void update_sit_info(struct f2fs_sb_info *sbi) 84static void update_sit_info(struct f2fs_sb_info *sbi)
85{ 85{
86 struct f2fs_stat_info *si = sbi->stat_info; 86 struct f2fs_stat_info *si = F2FS_STAT(sbi);
87 unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; 87 unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
88 struct sit_info *sit_i = SIT_I(sbi); 88 struct sit_info *sit_i = SIT_I(sbi);
89 unsigned int segno, vblocks; 89 unsigned int segno, vblocks;
@@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
118 */ 118 */
119static void update_mem_info(struct f2fs_sb_info *sbi) 119static void update_mem_info(struct f2fs_sb_info *sbi)
120{ 120{
121 struct f2fs_stat_info *si = sbi->stat_info; 121 struct f2fs_stat_info *si = F2FS_STAT(sbi);
122 unsigned npages; 122 unsigned npages;
123 123
124 if (si->base_mem) 124 if (si->base_mem)
@@ -253,21 +253,21 @@ static int stat_show(struct seq_file *s, void *v)
253 si->nats, NM_WOUT_THRESHOLD); 253 si->nats, NM_WOUT_THRESHOLD);
254 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", 254 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
255 si->sits, si->fnids); 255 si->sits, si->fnids);
256 seq_printf(s, "\nDistribution of User Blocks:"); 256 seq_puts(s, "\nDistribution of User Blocks:");
257 seq_printf(s, " [ valid | invalid | free ]\n"); 257 seq_puts(s, " [ valid | invalid | free ]\n");
258 seq_printf(s, " ["); 258 seq_puts(s, " [");
259 259
260 for (j = 0; j < si->util_valid; j++) 260 for (j = 0; j < si->util_valid; j++)
261 seq_printf(s, "-"); 261 seq_putc(s, '-');
262 seq_printf(s, "|"); 262 seq_putc(s, '|');
263 263
264 for (j = 0; j < si->util_invalid; j++) 264 for (j = 0; j < si->util_invalid; j++)
265 seq_printf(s, "-"); 265 seq_putc(s, '-');
266 seq_printf(s, "|"); 266 seq_putc(s, '|');
267 267
268 for (j = 0; j < si->util_free; j++) 268 for (j = 0; j < si->util_free; j++)
269 seq_printf(s, "-"); 269 seq_putc(s, '-');
270 seq_printf(s, "]\n\n"); 270 seq_puts(s, "]\n\n");
271 seq_printf(s, "SSR: %u blocks in %u segments\n", 271 seq_printf(s, "SSR: %u blocks in %u segments\n",
272 si->block_count[SSR], si->segment_count[SSR]); 272 si->block_count[SSR], si->segment_count[SSR]);
273 seq_printf(s, "LFS: %u blocks in %u segments\n", 273 seq_printf(s, "LFS: %u blocks in %u segments\n",
@@ -305,11 +305,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
305 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); 305 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
306 struct f2fs_stat_info *si; 306 struct f2fs_stat_info *si;
307 307
308 sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); 308 si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
309 if (!sbi->stat_info) 309 if (!si)
310 return -ENOMEM; 310 return -ENOMEM;
311 311
312 si = sbi->stat_info;
313 si->all_area_segs = le32_to_cpu(raw_super->segment_count); 312 si->all_area_segs = le32_to_cpu(raw_super->segment_count);
314 si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); 313 si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
315 si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); 314 si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
@@ -319,6 +318,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
319 si->main_area_zones = si->main_area_sections / 318 si->main_area_zones = si->main_area_sections /
320 le32_to_cpu(raw_super->secs_per_zone); 319 le32_to_cpu(raw_super->secs_per_zone);
321 si->sbi = sbi; 320 si->sbi = sbi;
321 sbi->stat_info = si;
322 322
323 mutex_lock(&f2fs_stat_mutex); 323 mutex_lock(&f2fs_stat_mutex);
324 list_add_tail(&si->stat_list, &f2fs_stat_list); 324 list_add_tail(&si->stat_list, &f2fs_stat_list);
@@ -329,13 +329,13 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
329 329
330void f2fs_destroy_stats(struct f2fs_sb_info *sbi) 330void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
331{ 331{
332 struct f2fs_stat_info *si = sbi->stat_info; 332 struct f2fs_stat_info *si = F2FS_STAT(sbi);
333 333
334 mutex_lock(&f2fs_stat_mutex); 334 mutex_lock(&f2fs_stat_mutex);
335 list_del(&si->stat_list); 335 list_del(&si->stat_list);
336 mutex_unlock(&f2fs_stat_mutex); 336 mutex_unlock(&f2fs_stat_mutex);
337 337
338 kfree(sbi->stat_info); 338 kfree(si);
339} 339}
340 340
341void __init f2fs_create_root_stats(void) 341void __init f2fs_create_root_stats(void)
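The seq_file conversions in stat_show() are a routine cleanup: fixed strings and single characters need no format parsing. The pattern, in brief:

        seq_puts(s, "\nDistribution of User Blocks:");  /* constant string */
        seq_putc(s, '|');                               /* single character */
        seq_printf(s, "SSR: %u blocks in %u segments\n",        /* printf only when formatting */
                   si->block_count[SSR], si->segment_count[SSR]);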
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 62f0d5977c64..384c6daf9a89 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -270,12 +270,27 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
270 struct f2fs_node *rn; 270 struct f2fs_node *rn;
271 271
272 /* copy name info. to this inode page */ 272 /* copy name info. to this inode page */
273 rn = (struct f2fs_node *)page_address(ipage); 273 rn = F2FS_NODE(ipage);
274 rn->i.i_namelen = cpu_to_le32(name->len); 274 rn->i.i_namelen = cpu_to_le32(name->len);
275 memcpy(rn->i.i_name, name->name, name->len); 275 memcpy(rn->i.i_name, name->name, name->len);
276 set_page_dirty(ipage); 276 set_page_dirty(ipage);
277} 277}
278 278
279int update_dent_inode(struct inode *inode, const struct qstr *name)
280{
281 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
282 struct page *page;
283
284 page = get_node_page(sbi, inode->i_ino);
285 if (IS_ERR(page))
286 return PTR_ERR(page);
287
288 init_dent_inode(name, page);
289 f2fs_put_page(page, 1);
290
291 return 0;
292}
293
279static int make_empty_dir(struct inode *inode, 294static int make_empty_dir(struct inode *inode,
280 struct inode *parent, struct page *page) 295 struct inode *parent, struct page *page)
281{ 296{
@@ -557,6 +572,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
557 572
558 if (inode->i_nlink == 0) 573 if (inode->i_nlink == 0)
559 add_orphan_inode(sbi, inode->i_ino); 574 add_orphan_inode(sbi, inode->i_ino);
575 else
576 release_orphan_inode(sbi);
560 } 577 }
561 578
562 if (bit_pos == NR_DENTRY_IN_BLOCK) { 579 if (bit_pos == NR_DENTRY_IN_BLOCK) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 467d42d65c48..608f0df5b919 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -17,6 +17,7 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/crc32.h> 18#include <linux/crc32.h>
19#include <linux/magic.h> 19#include <linux/magic.h>
20#include <linux/kobject.h>
20 21
21/* 22/*
22 * For mount options 23 * For mount options
@@ -28,6 +29,7 @@
28#define F2FS_MOUNT_XATTR_USER 0x00000010 29#define F2FS_MOUNT_XATTR_USER 0x00000010
29#define F2FS_MOUNT_POSIX_ACL 0x00000020 30#define F2FS_MOUNT_POSIX_ACL 0x00000020
30#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 31#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
32#define F2FS_MOUNT_INLINE_XATTR 0x00000080
31 33
32#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) 34#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
33#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) 35#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -134,11 +136,13 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
134/* 136/*
135 * For INODE and NODE manager 137 * For INODE and NODE manager
136 */ 138 */
137#define XATTR_NODE_OFFSET (-1) /* 139/*
138 * store xattrs to one node block per 140 * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1
139 * file keeping -1 as its node offset to 141 * as its node offset to distinguish from index node blocks.
140 * distinguish from index node blocks. 142 * But some bits are used to mark the node block.
141 */ 143 */
144#define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \
145 >> OFFSET_BIT_SHIFT)
142enum { 146enum {
143 ALLOC_NODE, /* allocate a new node page if needed */ 147 ALLOC_NODE, /* allocate a new node page if needed */
144 LOOKUP_NODE, /* look up a node without readahead */ 148 LOOKUP_NODE, /* look up a node without readahead */
@@ -178,6 +182,7 @@ struct f2fs_inode_info {
178 f2fs_hash_t chash; /* hash value of given file name */ 182 f2fs_hash_t chash; /* hash value of given file name */
179 unsigned int clevel; /* maximum level of given file name */ 183 unsigned int clevel; /* maximum level of given file name */
180 nid_t i_xattr_nid; /* node id that contains xattrs */ 184 nid_t i_xattr_nid; /* node id that contains xattrs */
185 unsigned long long xattr_ver; /* cp version of xattr modification */
181 struct extent_info ext; /* in-memory extent cache entry */ 186 struct extent_info ext; /* in-memory extent cache entry */
182}; 187};
183 188
@@ -296,15 +301,6 @@ struct f2fs_sm_info {
296}; 301};
297 302
298/* 303/*
299 * For directory operation
300 */
301#define NODE_DIR1_BLOCK (ADDRS_PER_INODE + 1)
302#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2)
303#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3)
304#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4)
305#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5)
306
307/*
308 * For superblock 304 * For superblock
309 */ 305 */
310/* 306/*
@@ -350,6 +346,7 @@ enum page_type {
350 346
351struct f2fs_sb_info { 347struct f2fs_sb_info {
352 struct super_block *sb; /* pointer to VFS super block */ 348 struct super_block *sb; /* pointer to VFS super block */
349 struct proc_dir_entry *s_proc; /* proc entry */
353 struct buffer_head *raw_super_buf; /* buffer head of raw sb */ 350 struct buffer_head *raw_super_buf; /* buffer head of raw sb */
354 struct f2fs_super_block *raw_super; /* raw super block pointer */ 351 struct f2fs_super_block *raw_super; /* raw super block pointer */
355 int s_dirty; /* dirty flag for checkpoint */ 352 int s_dirty; /* dirty flag for checkpoint */
@@ -429,6 +426,10 @@ struct f2fs_sb_info {
429#endif 426#endif
430 unsigned int last_victim[2]; /* last victim segment # */ 427 unsigned int last_victim[2]; /* last victim segment # */
431 spinlock_t stat_lock; /* lock for stat operations */ 428 spinlock_t stat_lock; /* lock for stat operations */
429
430 /* For sysfs support */
431 struct kobject s_kobj;
432 struct completion s_kobj_unregister;
432}; 433};
433 434
434/* 435/*
@@ -454,6 +455,11 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
454 return (struct f2fs_checkpoint *)(sbi->ckpt); 455 return (struct f2fs_checkpoint *)(sbi->ckpt);
455} 456}
456 457
458static inline struct f2fs_node *F2FS_NODE(struct page *page)
459{
460 return (struct f2fs_node *)page_address(page);
461}
462
457static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) 463static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
458{ 464{
459 return (struct f2fs_nm_info *)(sbi->nm_info); 465 return (struct f2fs_nm_info *)(sbi->nm_info);
@@ -489,6 +495,11 @@ static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
489 sbi->s_dirty = 0; 495 sbi->s_dirty = 0;
490} 496}
491 497
498static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
499{
500 return le64_to_cpu(cp->checkpoint_ver);
501}
502
492static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) 503static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
493{ 504{
494 unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); 505 unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
@@ -677,7 +688,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
677{ 688{
678 block_t start_addr; 689 block_t start_addr;
679 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 690 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
680 unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver); 691 unsigned long long ckpt_version = cur_cp_version(ckpt);
681 692
682 start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); 693 start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
683 694
@@ -812,7 +823,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
812 823
813static inline bool IS_INODE(struct page *page) 824static inline bool IS_INODE(struct page *page)
814{ 825{
815 struct f2fs_node *p = (struct f2fs_node *)page_address(page); 826 struct f2fs_node *p = F2FS_NODE(page);
816 return RAW_IS_INODE(p); 827 return RAW_IS_INODE(p);
817} 828}
818 829
@@ -826,7 +837,7 @@ static inline block_t datablock_addr(struct page *node_page,
826{ 837{
827 struct f2fs_node *raw_node; 838 struct f2fs_node *raw_node;
828 __le32 *addr_array; 839 __le32 *addr_array;
829 raw_node = (struct f2fs_node *)page_address(node_page); 840 raw_node = F2FS_NODE(node_page);
830 addr_array = blkaddr_in_node(raw_node); 841 addr_array = blkaddr_in_node(raw_node);
831 return le32_to_cpu(addr_array[offset]); 842 return le32_to_cpu(addr_array[offset]);
832} 843}
@@ -873,6 +884,7 @@ enum {
873 FI_NO_ALLOC, /* should not allocate any blocks */ 884 FI_NO_ALLOC, /* should not allocate any blocks */
874 FI_UPDATE_DIR, /* should update inode block for consistency */ 885 FI_UPDATE_DIR, /* should update inode block for consistency */
875 FI_DELAY_IPUT, /* used for the recovery */ 886 FI_DELAY_IPUT, /* used for the recovery */
887 FI_INLINE_XATTR, /* used for inline xattr */
876}; 888};
877 889
878static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 890static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -905,6 +917,45 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
905 return 0; 917 return 0;
906} 918}
907 919
920static inline void get_inline_info(struct f2fs_inode_info *fi,
921 struct f2fs_inode *ri)
922{
923 if (ri->i_inline & F2FS_INLINE_XATTR)
924 set_inode_flag(fi, FI_INLINE_XATTR);
925}
926
927static inline void set_raw_inline(struct f2fs_inode_info *fi,
928 struct f2fs_inode *ri)
929{
930 ri->i_inline = 0;
931
932 if (is_inode_flag_set(fi, FI_INLINE_XATTR))
933 ri->i_inline |= F2FS_INLINE_XATTR;
934}
935
936static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
937{
938 if (is_inode_flag_set(fi, FI_INLINE_XATTR))
939 return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
940 return DEF_ADDRS_PER_INODE;
941}
942
943static inline void *inline_xattr_addr(struct page *page)
944{
945 struct f2fs_inode *ri;
946 ri = (struct f2fs_inode *)page_address(page);
947 return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
948 F2FS_INLINE_XATTR_ADDRS]);
949}
950
951static inline int inline_xattr_size(struct inode *inode)
952{
953 if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR))
954 return F2FS_INLINE_XATTR_ADDRS << 2;
955 else
956 return 0;
957}
958
908static inline int f2fs_readonly(struct super_block *sb) 959static inline int f2fs_readonly(struct super_block *sb)
909{ 960{
910 return sb->s_flags & MS_RDONLY; 961 return sb->s_flags & MS_RDONLY;
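A worked example of the space trade, assuming the constants this series uses elsewhere (DEF_ADDRS_PER_INODE = 923 and F2FS_INLINE_XATTR_ADDRS = 50; neither value appears in these hunks):

/* Assumed constants, not visible in the hunks above. */
#define DEF_ADDRS_PER_INODE     923
#define F2FS_INLINE_XATTR_ADDRS 50

/*
 * With FI_INLINE_XATTR set, addrs_per_inode() returns 923 - 50 = 873
 * data-block slots, and inline_xattr_size() reports 50 << 2 = 200
 * bytes carved out of the tail of i_addr[] for inline xattrs.
 */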
@@ -947,6 +998,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
947ino_t f2fs_inode_by_name(struct inode *, struct qstr *); 998ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
948void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, 999void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
949 struct page *, struct inode *); 1000 struct page *, struct inode *);
1001int update_dent_inode(struct inode *, const struct qstr *);
950int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); 1002int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
951void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); 1003void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
952int f2fs_make_empty(struct inode *, struct inode *); 1004int f2fs_make_empty(struct inode *, struct inode *);
@@ -980,6 +1032,7 @@ int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
980void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); 1032void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
981int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 1033int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
982int truncate_inode_blocks(struct inode *, pgoff_t); 1034int truncate_inode_blocks(struct inode *, pgoff_t);
1035int truncate_xattr_node(struct inode *, struct page *);
983int remove_inode_page(struct inode *); 1036int remove_inode_page(struct inode *);
984struct page *new_inode_page(struct inode *, const struct qstr *); 1037struct page *new_inode_page(struct inode *, const struct qstr *);
985struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); 1038struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
@@ -1012,7 +1065,8 @@ int npages_for_summary_flush(struct f2fs_sb_info *);
1012void allocate_new_segments(struct f2fs_sb_info *); 1065void allocate_new_segments(struct f2fs_sb_info *);
1013struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); 1066struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
1014struct bio *f2fs_bio_alloc(struct block_device *, int); 1067struct bio *f2fs_bio_alloc(struct block_device *, int);
1015void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); 1068void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool);
1069void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
1016void write_meta_page(struct f2fs_sb_info *, struct page *); 1070void write_meta_page(struct f2fs_sb_info *, struct page *);
1017void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, 1071void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
1018 block_t, block_t *); 1072 block_t, block_t *);
@@ -1037,7 +1091,8 @@ void destroy_segment_manager(struct f2fs_sb_info *);
1037struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); 1091struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
1038struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); 1092struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
1039long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); 1093long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
1040int check_orphan_space(struct f2fs_sb_info *); 1094int acquire_orphan_inode(struct f2fs_sb_info *);
1095void release_orphan_inode(struct f2fs_sb_info *);
1041void add_orphan_inode(struct f2fs_sb_info *, nid_t); 1096void add_orphan_inode(struct f2fs_sb_info *, nid_t);
1042void remove_orphan_inode(struct f2fs_sb_info *, nid_t); 1097void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1043int recover_orphan_inodes(struct f2fs_sb_info *); 1098int recover_orphan_inodes(struct f2fs_sb_info *);
@@ -1068,7 +1123,7 @@ int do_write_data_page(struct page *);
1068 */ 1123 */
1069int start_gc_thread(struct f2fs_sb_info *); 1124int start_gc_thread(struct f2fs_sb_info *);
1070void stop_gc_thread(struct f2fs_sb_info *); 1125void stop_gc_thread(struct f2fs_sb_info *);
1071block_t start_bidx_of_node(unsigned int); 1126block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
1072int f2fs_gc(struct f2fs_sb_info *); 1127int f2fs_gc(struct f2fs_sb_info *);
1073void build_gc_manager(struct f2fs_sb_info *); 1128void build_gc_manager(struct f2fs_sb_info *);
1074int __init create_gc_caches(void); 1129int __init create_gc_caches(void);
@@ -1112,11 +1167,16 @@ struct f2fs_stat_info {
1112 unsigned base_mem, cache_mem; 1167 unsigned base_mem, cache_mem;
1113}; 1168};
1114 1169
1170static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1171{
1172 return (struct f2fs_stat_info *)sbi->stat_info;
1173}
1174
1115#define stat_inc_call_count(si) ((si)->call_count++) 1175#define stat_inc_call_count(si) ((si)->call_count++)
1116 1176
1117#define stat_inc_seg_count(sbi, type) \ 1177#define stat_inc_seg_count(sbi, type) \
1118 do { \ 1178 do { \
1119 struct f2fs_stat_info *si = sbi->stat_info; \ 1179 struct f2fs_stat_info *si = F2FS_STAT(sbi); \
1120 (si)->tot_segs++; \ 1180 (si)->tot_segs++; \
1121 if (type == SUM_TYPE_DATA) \ 1181 if (type == SUM_TYPE_DATA) \
1122 si->data_segs++; \ 1182 si->data_segs++; \
@@ -1129,14 +1189,14 @@ struct f2fs_stat_info {
1129 1189
1130#define stat_inc_data_blk_count(sbi, blks) \ 1190#define stat_inc_data_blk_count(sbi, blks) \
1131 do { \ 1191 do { \
1132 struct f2fs_stat_info *si = sbi->stat_info; \ 1192 struct f2fs_stat_info *si = F2FS_STAT(sbi); \
1133 stat_inc_tot_blk_count(si, blks); \ 1193 stat_inc_tot_blk_count(si, blks); \
1134 si->data_blks += (blks); \ 1194 si->data_blks += (blks); \
1135 } while (0) 1195 } while (0)
1136 1196
1137#define stat_inc_node_blk_count(sbi, blks) \ 1197#define stat_inc_node_blk_count(sbi, blks) \
1138 do { \ 1198 do { \
1139 struct f2fs_stat_info *si = sbi->stat_info; \ 1199 struct f2fs_stat_info *si = F2FS_STAT(sbi); \
1140 stat_inc_tot_blk_count(si, blks); \ 1200 stat_inc_tot_blk_count(si, blks); \
1141 si->node_blks += (blks); \ 1201 si->node_blks += (blks); \
1142 } while (0) 1202 } while (0)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d2d2b7dbdcc1..02c906971cc6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -112,11 +112,13 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
112 if (!dentry) 112 if (!dentry)
113 return 0; 113 return 0;
114 114
115 inode = igrab(dentry->d_parent->d_inode); 115 if (update_dent_inode(inode, &dentry->d_name)) {
116 dput(dentry); 116 dput(dentry);
117 return 0;
118 }
117 119
118 *pino = inode->i_ino; 120 *pino = parent_ino(dentry);
119 iput(inode); 121 dput(dentry);
120 return 1; 122 return 1;
121} 123}
122 124
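The rewrite above drops the igrab()/iput() round trip on the parent inode: update_dent_inode() first refreshes the inode's name in its on-disk dentry, and the parent ino is then read under the dentry lock. For reference, parent_ino() from include/linux/fs.h of this era is roughly the following (a sketch, not part of this patch):

	static inline ino_t parent_ino(struct dentry *dentry)
	{
		ino_t res;

		/* d_lock keeps d_parent stable while it is dereferenced */
		spin_lock(&dentry->d_lock);
		res = dentry->d_parent->d_inode->i_ino;
		spin_unlock(&dentry->d_lock);
		return res;
	}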
@@ -147,9 +149,10 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
147 149
148 mutex_lock(&inode->i_mutex); 150 mutex_lock(&inode->i_mutex);
149 151
150 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 152 /*
151 goto out; 153 * Both fdatasync() and fsync() can be recovered from a
152 154 * sudden power-off.
155 */
153 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) 156 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
154 need_cp = true; 157 need_cp = true;
155 else if (file_wrong_pino(inode)) 158 else if (file_wrong_pino(inode))
@@ -158,10 +161,14 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
158 need_cp = true; 161 need_cp = true;
159 else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) 162 else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
160 need_cp = true; 163 need_cp = true;
164 else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
165 need_cp = true;
161 166
162 if (need_cp) { 167 if (need_cp) {
163 nid_t pino; 168 nid_t pino;
164 169
170 F2FS_I(inode)->xattr_ver = 0;
171
165 /* all the dirty node pages should be flushed for POR */ 172 /* all the dirty node pages should be flushed for POR */
166 ret = f2fs_sync_fs(inode->i_sb, 1); 173 ret = f2fs_sync_fs(inode->i_sb, 1);
167 if (file_wrong_pino(inode) && inode->i_nlink == 1 && 174 if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
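With the datasync shortcut gone, every fsync() and fdatasync() walks the same checkpoint decision. A user-space sketch of the conditions visible in these hunks (the diff context elides a few others, such as the roll-forward space check); all inputs are precomputed flags that the kernel derives from the inode and checkpoint state:

	#include <stdbool.h>
	#include <stdio.h>

	static bool need_cp_for_fsync(bool is_regular, unsigned int nlink,
				      bool wrong_pino, bool parent_checkpointed,
				      unsigned long long xattr_ver,
				      unsigned long long cur_cp_ver)
	{
		if (!is_regular || nlink != 1)
			return true;	/* roll-forward covers only regular, singly-linked files */
		if (wrong_pino)
			return true;	/* recorded parent ino is stale */
		if (!parent_checkpointed)
			return true;	/* parent node block not yet checkpointed */
		if (xattr_ver == cur_cp_ver)
			return true;	/* xattrs changed since the last checkpoint (new here) */
		return false;
	}

	int main(void)
	{
		/* regular file, single link, clean parent: prints 0, no checkpoint */
		printf("%d\n", need_cp_for_fsync(true, 1, false, true, 7, 8));
		return 0;
	}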
@@ -205,7 +212,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
205 struct f2fs_node *raw_node; 212 struct f2fs_node *raw_node;
206 __le32 *addr; 213 __le32 *addr;
207 214
208 raw_node = page_address(dn->node_page); 215 raw_node = F2FS_NODE(dn->node_page);
209 addr = blkaddr_in_node(raw_node) + ofs; 216 addr = blkaddr_in_node(raw_node) + ofs;
210 217
211 for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { 218 for ( ; count > 0; count--, addr++, dn->ofs_in_node++) {
@@ -283,7 +290,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
283 } 290 }
284 291
285 if (IS_INODE(dn.node_page)) 292 if (IS_INODE(dn.node_page))
286 count = ADDRS_PER_INODE; 293 count = ADDRS_PER_INODE(F2FS_I(inode));
287 else 294 else
288 count = ADDRS_PER_BLOCK; 295 count = ADDRS_PER_BLOCK;
289 296
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 35f9b1a196aa..2f157e883687 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -29,10 +29,11 @@ static struct kmem_cache *winode_slab;
29static int gc_thread_func(void *data) 29static int gc_thread_func(void *data)
30{ 30{
31 struct f2fs_sb_info *sbi = data; 31 struct f2fs_sb_info *sbi = data;
32 struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
32 wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; 33 wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
33 long wait_ms; 34 long wait_ms;
34 35
35 wait_ms = GC_THREAD_MIN_SLEEP_TIME; 36 wait_ms = gc_th->min_sleep_time;
36 37
37 do { 38 do {
38 if (try_to_freeze()) 39 if (try_to_freeze())
@@ -45,7 +46,7 @@ static int gc_thread_func(void *data)
45 break; 46 break;
46 47
47 if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { 48 if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
48 wait_ms = GC_THREAD_MAX_SLEEP_TIME; 49 wait_ms = increase_sleep_time(gc_th, wait_ms);
49 continue; 50 continue;
50 } 51 }
51 52
@@ -66,15 +67,15 @@ static int gc_thread_func(void *data)
66 continue; 67 continue;
67 68
68 if (!is_idle(sbi)) { 69 if (!is_idle(sbi)) {
69 wait_ms = increase_sleep_time(wait_ms); 70 wait_ms = increase_sleep_time(gc_th, wait_ms);
70 mutex_unlock(&sbi->gc_mutex); 71 mutex_unlock(&sbi->gc_mutex);
71 continue; 72 continue;
72 } 73 }
73 74
74 if (has_enough_invalid_blocks(sbi)) 75 if (has_enough_invalid_blocks(sbi))
75 wait_ms = decrease_sleep_time(wait_ms); 76 wait_ms = decrease_sleep_time(gc_th, wait_ms);
76 else 77 else
77 wait_ms = increase_sleep_time(wait_ms); 78 wait_ms = increase_sleep_time(gc_th, wait_ms);
78 79
79#ifdef CONFIG_F2FS_STAT_FS 80#ifdef CONFIG_F2FS_STAT_FS
80 sbi->bg_gc++; 81 sbi->bg_gc++;
@@ -82,7 +83,7 @@ static int gc_thread_func(void *data)
82 83
83 /* if return value is not zero, no victim was selected */ 84 /* if return value is not zero, no victim was selected */
84 if (f2fs_gc(sbi)) 85 if (f2fs_gc(sbi))
85 wait_ms = GC_THREAD_NOGC_SLEEP_TIME; 86 wait_ms = gc_th->no_gc_sleep_time;
86 } while (!kthread_should_stop()); 87 } while (!kthread_should_stop());
87 return 0; 88 return 0;
88} 89}
@@ -101,6 +102,12 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
101 goto out; 102 goto out;
102 } 103 }
103 104
105 gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
106 gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
107 gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
108
109 gc_th->gc_idle = 0;
110
104 sbi->gc_thread = gc_th; 111 sbi->gc_thread = gc_th;
105 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); 112 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
106 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, 113 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
@@ -125,9 +132,17 @@ void stop_gc_thread(struct f2fs_sb_info *sbi)
125 sbi->gc_thread = NULL; 132 sbi->gc_thread = NULL;
126} 133}
127 134
128static int select_gc_type(int gc_type) 135static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type)
129{ 136{
130 return (gc_type == BG_GC) ? GC_CB : GC_GREEDY; 137 int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
138
139 if (gc_th && gc_th->gc_idle) {
140 if (gc_th->gc_idle == 1)
141 gc_mode = GC_CB;
142 else if (gc_th->gc_idle == 2)
143 gc_mode = GC_GREEDY;
144 }
145 return gc_mode;
131} 146}
132 147
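The gc_idle knob (exported through sysfs later in this patch) overrides the default mapping from GC type to victim policy. A runnable sketch of exactly the mapping above; the enums are local stand-ins for the f2fs definitions:

	#include <stdio.h>

	enum { GC_CB, GC_GREEDY };	/* cost-benefit vs. greedy selection */
	enum { BG_GC, FG_GC };

	static int select_gc_mode(int gc_idle, int gc_type)
	{
		int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;

		if (gc_idle == 1)
			gc_mode = GC_CB;	/* idle: optimize long-term cost */
		else if (gc_idle == 2)
			gc_mode = GC_GREEDY;	/* idle: reclaim fastest */
		return gc_mode;
	}

	int main(void)
	{
		printf("%d\n", select_gc_mode(2, BG_GC));	/* prints 1 (GC_GREEDY) */
		return 0;
	}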
133static void select_policy(struct f2fs_sb_info *sbi, int gc_type, 148static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
@@ -138,12 +153,18 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
138 if (p->alloc_mode == SSR) { 153 if (p->alloc_mode == SSR) {
139 p->gc_mode = GC_GREEDY; 154 p->gc_mode = GC_GREEDY;
140 p->dirty_segmap = dirty_i->dirty_segmap[type]; 155 p->dirty_segmap = dirty_i->dirty_segmap[type];
156 p->max_search = dirty_i->nr_dirty[type];
141 p->ofs_unit = 1; 157 p->ofs_unit = 1;
142 } else { 158 } else {
143 p->gc_mode = select_gc_type(gc_type); 159 p->gc_mode = select_gc_type(sbi->gc_thread, gc_type);
144 p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; 160 p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
161 p->max_search = dirty_i->nr_dirty[DIRTY];
145 p->ofs_unit = sbi->segs_per_sec; 162 p->ofs_unit = sbi->segs_per_sec;
146 } 163 }
164
165 if (p->max_search > MAX_VICTIM_SEARCH)
166 p->max_search = MAX_VICTIM_SEARCH;
167
147 p->offset = sbi->last_victim[p->gc_mode]; 168 p->offset = sbi->last_victim[p->gc_mode];
148} 169}
149 170
@@ -290,7 +311,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
290 if (cost == max_cost) 311 if (cost == max_cost)
291 continue; 312 continue;
292 313
293 if (nsearched++ >= MAX_VICTIM_SEARCH) { 314 if (nsearched++ >= p.max_search) {
294 sbi->last_victim[p.gc_mode] = segno; 315 sbi->last_victim[p.gc_mode] = segno;
295 break; 316 break;
296 } 317 }
@@ -407,8 +428,7 @@ next_step:
407 428
408 /* set page dirty and write it */ 429 /* set page dirty and write it */
409 if (gc_type == FG_GC) { 430 if (gc_type == FG_GC) {
410 f2fs_submit_bio(sbi, NODE, true); 431 f2fs_wait_on_page_writeback(node_page, NODE, true);
411 wait_on_page_writeback(node_page);
412 set_page_dirty(node_page); 432 set_page_dirty(node_page);
413 } else { 433 } else {
414 if (!PageWriteback(node_page)) 434 if (!PageWriteback(node_page))
@@ -447,7 +467,7 @@ next_step:
447 * as indirect or double indirect node blocks, are given, it must be a caller's 467 * as indirect or double indirect node blocks, are given, it must be a caller's
448 * bug. 468 * bug.
449 */ 469 */
450block_t start_bidx_of_node(unsigned int node_ofs) 470block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
451{ 471{
452 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; 472 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
453 unsigned int bidx; 473 unsigned int bidx;
@@ -464,7 +484,7 @@ block_t start_bidx_of_node(unsigned int node_ofs)
464 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); 484 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
465 bidx = node_ofs - 5 - dec; 485 bidx = node_ofs - 5 - dec;
466 } 486 }
467 return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; 487 return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi);
468} 488}
469 489
470static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, 490static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
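start_bidx_of_node() maps a node block's offset within its inode to the index of the first data block it addresses; the new inode argument exists because the number of addresses held in the inode block itself now varies. A user-space sketch with the middle branches reconstructed from mainline f2fs (this hunk only shows the double-indirect branch) and the usual on-disk constants; treat both as illustrative:

	#include <stdio.h>

	#define NIDS_PER_BLOCK	1018	/* node ids per indirect block */
	#define ADDRS_PER_BLOCK	1018	/* data addresses per direct block */

	static unsigned int start_bidx(unsigned int node_ofs,
				       unsigned int addrs_per_inode)
	{
		unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
		unsigned int bidx;

		if (node_ofs == 0)		/* the inode block itself */
			return 0;
		if (node_ofs <= 2) {		/* direct node blocks */
			bidx = node_ofs - 1;
		} else if (node_ofs <= indirect_blks) {	/* singly indirect area */
			int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
			bidx = node_ofs - 2 - dec;
		} else {			/* doubly indirect area */
			int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
			bidx = node_ofs - 5 - dec;
		}
		return bidx * ADDRS_PER_BLOCK + addrs_per_inode;
	}

	int main(void)
	{
		/* first direct node block, no inline xattrs: prints 923 */
		printf("%u\n", start_bidx(1, 923));
		return 0;
	}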
@@ -508,10 +528,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
508 } else { 528 } else {
509 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 529 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
510 530
511 if (PageWriteback(page)) { 531 f2fs_wait_on_page_writeback(page, DATA, true);
512 f2fs_submit_bio(sbi, DATA, true);
513 wait_on_page_writeback(page);
514 }
515 532
516 if (clear_page_dirty_for_io(page) && 533 if (clear_page_dirty_for_io(page) &&
517 S_ISDIR(inode->i_mode)) { 534 S_ISDIR(inode->i_mode)) {
@@ -575,7 +592,6 @@ next_step:
575 continue; 592 continue;
576 } 593 }
577 594
578 start_bidx = start_bidx_of_node(nofs);
579 ofs_in_node = le16_to_cpu(entry->ofs_in_node); 595 ofs_in_node = le16_to_cpu(entry->ofs_in_node);
580 596
581 if (phase == 2) { 597 if (phase == 2) {
@@ -583,6 +599,8 @@ next_step:
583 if (IS_ERR(inode)) 599 if (IS_ERR(inode))
584 continue; 600 continue;
585 601
602 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
603
586 data_page = find_data_page(inode, 604 data_page = find_data_page(inode,
587 start_bidx + ofs_in_node, false); 605 start_bidx + ofs_in_node, false);
588 if (IS_ERR(data_page)) 606 if (IS_ERR(data_page))
@@ -593,6 +611,8 @@ next_step:
593 } else { 611 } else {
594 inode = find_gc_inode(dni.ino, ilist); 612 inode = find_gc_inode(dni.ino, ilist);
595 if (inode) { 613 if (inode) {
614 start_bidx = start_bidx_of_node(nofs,
615 F2FS_I(inode));
596 data_page = get_lock_data_page(inode, 616 data_page = get_lock_data_page(inode,
597 start_bidx + ofs_in_node); 617 start_bidx + ofs_in_node);
598 if (IS_ERR(data_page)) 618 if (IS_ERR(data_page))
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 2c6a6bd08322..507056d22205 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -13,18 +13,26 @@
13 * whether IO subsystem is idle 13 * whether IO subsystem is idle
14 * or not 14 * or not
15 */ 15 */
16#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ 16#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
17#define GC_THREAD_MAX_SLEEP_TIME 60000 17#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000
18#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ 18#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
19#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ 19#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ 20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
21 21
22/* Search max. number of dirty segments to select a victim segment */ 22/* Search max. number of dirty segments to select a victim segment */
23#define MAX_VICTIM_SEARCH 20 23#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */
24 24
25struct f2fs_gc_kthread { 25struct f2fs_gc_kthread {
26 struct task_struct *f2fs_gc_task; 26 struct task_struct *f2fs_gc_task;
27 wait_queue_head_t gc_wait_queue_head; 27 wait_queue_head_t gc_wait_queue_head;
28
29 /* for gc sleep time */
30 unsigned int min_sleep_time;
31 unsigned int max_sleep_time;
32 unsigned int no_gc_sleep_time;
33
34 /* for changing gc mode */
35 unsigned int gc_idle;
28}; 36};
29 37
30struct inode_entry { 38struct inode_entry {
@@ -56,25 +64,25 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
56 return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; 64 return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
57} 65}
58 66
59static inline long increase_sleep_time(long wait) 67static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
60{ 68{
61 if (wait == GC_THREAD_NOGC_SLEEP_TIME) 69 if (wait == gc_th->no_gc_sleep_time)
62 return wait; 70 return wait;
63 71
64 wait += GC_THREAD_MIN_SLEEP_TIME; 72 wait += gc_th->min_sleep_time;
65 if (wait > GC_THREAD_MAX_SLEEP_TIME) 73 if (wait > gc_th->max_sleep_time)
66 wait = GC_THREAD_MAX_SLEEP_TIME; 74 wait = gc_th->max_sleep_time;
67 return wait; 75 return wait;
68} 76}
69 77
70static inline long decrease_sleep_time(long wait) 78static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
71{ 79{
72 if (wait == GC_THREAD_NOGC_SLEEP_TIME) 80 if (wait == gc_th->no_gc_sleep_time)
73 wait = GC_THREAD_MAX_SLEEP_TIME; 81 wait = gc_th->max_sleep_time;
74 82
75 wait -= GC_THREAD_MIN_SLEEP_TIME; 83 wait -= gc_th->min_sleep_time;
76 if (wait <= GC_THREAD_MIN_SLEEP_TIME) 84 if (wait <= gc_th->min_sleep_time)
77 wait = GC_THREAD_MIN_SLEEP_TIME; 85 wait = gc_th->min_sleep_time;
78 return wait; 86 return wait;
79} 87}
80 88
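The back-off itself is unchanged; it now reads its bounds from the per-mount structure instead of compile-time constants. A runnable simulation with the default values above:

	#include <stdio.h>

	struct gc_tunables {
		long min_sleep_time;	/* ms, default 30000 */
		long max_sleep_time;	/* ms, default 60000 */
		long no_gc_sleep_time;	/* ms, default 300000 */
	};

	static long increase_sleep(const struct gc_tunables *t, long wait)
	{
		if (wait == t->no_gc_sleep_time)
			return wait;
		wait += t->min_sleep_time;
		if (wait > t->max_sleep_time)
			wait = t->max_sleep_time;
		return wait;
	}

	static long decrease_sleep(const struct gc_tunables *t, long wait)
	{
		if (wait == t->no_gc_sleep_time)
			wait = t->max_sleep_time;
		wait -= t->min_sleep_time;
		if (wait <= t->min_sleep_time)
			wait = t->min_sleep_time;
		return wait;
	}

	int main(void)
	{
		struct gc_tunables t = { 30000, 60000, 300000 };
		long w = t.min_sleep_time;

		w = increase_sleep(&t, w);	/* device busy: 60000 */
		w = decrease_sleep(&t, w);	/* much garbage: 30000 */
		printf("%ld\n", w);
		return 0;
	}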
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2b2d45d19e3e..9339cd292047 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -56,7 +56,7 @@ static int do_read_inode(struct inode *inode)
56 if (IS_ERR(node_page)) 56 if (IS_ERR(node_page))
57 return PTR_ERR(node_page); 57 return PTR_ERR(node_page);
58 58
59 rn = page_address(node_page); 59 rn = F2FS_NODE(node_page);
60 ri = &(rn->i); 60 ri = &(rn->i);
61 61
62 inode->i_mode = le16_to_cpu(ri->i_mode); 62 inode->i_mode = le16_to_cpu(ri->i_mode);
@@ -85,6 +85,7 @@ static int do_read_inode(struct inode *inode)
85 fi->i_advise = ri->i_advise; 85 fi->i_advise = ri->i_advise;
86 fi->i_pino = le32_to_cpu(ri->i_pino); 86 fi->i_pino = le32_to_cpu(ri->i_pino);
87 get_extent_info(&fi->ext, ri->i_ext); 87 get_extent_info(&fi->ext, ri->i_ext);
88 get_inline_info(fi, ri);
88 f2fs_put_page(node_page, 1); 89 f2fs_put_page(node_page, 1);
89 return 0; 90 return 0;
90} 91}
@@ -151,9 +152,9 @@ void update_inode(struct inode *inode, struct page *node_page)
151 struct f2fs_node *rn; 152 struct f2fs_node *rn;
152 struct f2fs_inode *ri; 153 struct f2fs_inode *ri;
153 154
154 wait_on_page_writeback(node_page); 155 f2fs_wait_on_page_writeback(node_page, NODE, false);
155 156
156 rn = page_address(node_page); 157 rn = F2FS_NODE(node_page);
157 ri = &(rn->i); 158 ri = &(rn->i);
158 159
159 ri->i_mode = cpu_to_le16(inode->i_mode); 160 ri->i_mode = cpu_to_le16(inode->i_mode);
@@ -164,6 +165,7 @@ void update_inode(struct inode *inode, struct page *node_page)
164 ri->i_size = cpu_to_le64(i_size_read(inode)); 165 ri->i_size = cpu_to_le64(i_size_read(inode));
165 ri->i_blocks = cpu_to_le64(inode->i_blocks); 166 ri->i_blocks = cpu_to_le64(inode->i_blocks);
166 set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); 167 set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
168 set_raw_inline(F2FS_I(inode), ri);
167 169
168 ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 170 ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
169 ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 171 ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
@@ -221,9 +223,6 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
221 if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) 223 if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
222 return 0; 224 return 0;
223 225
224 if (wbc)
225 f2fs_balance_fs(sbi);
226
227 /* 226 /*
228 * We need to lock here to prevent from producing dirty node pages 227 * We need to lock here to prevent from producing dirty node pages
229 * during the urgent cleaning time when running out of free sections. 228 * during the urgent cleaning time when running out of free sections.
@@ -231,6 +230,10 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
231 ilock = mutex_lock_op(sbi); 230 ilock = mutex_lock_op(sbi);
232 ret = update_inode_page(inode); 231 ret = update_inode_page(inode);
233 mutex_unlock_op(sbi, ilock); 232 mutex_unlock_op(sbi, ilock);
233
234 if (wbc)
235 f2fs_balance_fs(sbi);
236
234 return ret; 237 return ret;
235} 238}
236 239
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 64c07169df05..2a5359c990fc 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -83,21 +83,11 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
83{ 83{
84 size_t slen = strlen(s); 84 size_t slen = strlen(s);
85 size_t sublen = strlen(sub); 85 size_t sublen = strlen(sub);
86 int ret;
87 86
88 if (sublen > slen) 87 if (sublen > slen)
89 return 0; 88 return 0;
90 89
91 ret = memcmp(s + slen - sublen, sub, sublen); 90 return !strncasecmp(s + slen - sublen, sub, sublen);
92 if (ret) { /* compare upper case */
93 int i;
94 char upper_sub[8];
95 for (i = 0; i < sublen && i < sizeof(upper_sub); i++)
96 upper_sub[i] = toupper(sub[i]);
97 return !memcmp(s + slen - sublen, upper_sub, sublen);
98 }
99
100 return !ret;
101} 91}
102 92
103/* 93/*
@@ -239,7 +229,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
239 if (!de) 229 if (!de)
240 goto fail; 230 goto fail;
241 231
242 err = check_orphan_space(sbi); 232 err = acquire_orphan_inode(sbi);
243 if (err) { 233 if (err) {
244 kunmap(page); 234 kunmap(page);
245 f2fs_put_page(page, 0); 235 f2fs_put_page(page, 0);
@@ -393,7 +383,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
393 struct inode *old_inode = old_dentry->d_inode; 383 struct inode *old_inode = old_dentry->d_inode;
394 struct inode *new_inode = new_dentry->d_inode; 384 struct inode *new_inode = new_dentry->d_inode;
395 struct page *old_dir_page; 385 struct page *old_dir_page;
396 struct page *old_page; 386 struct page *old_page, *new_page;
397 struct f2fs_dir_entry *old_dir_entry = NULL; 387 struct f2fs_dir_entry *old_dir_entry = NULL;
398 struct f2fs_dir_entry *old_entry; 388 struct f2fs_dir_entry *old_entry;
399 struct f2fs_dir_entry *new_entry; 389 struct f2fs_dir_entry *new_entry;
@@ -415,7 +405,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
415 ilock = mutex_lock_op(sbi); 405 ilock = mutex_lock_op(sbi);
416 406
417 if (new_inode) { 407 if (new_inode) {
418 struct page *new_page;
419 408
420 err = -ENOTEMPTY; 409 err = -ENOTEMPTY;
421 if (old_dir_entry && !f2fs_empty_dir(new_inode)) 410 if (old_dir_entry && !f2fs_empty_dir(new_inode))
@@ -427,14 +416,28 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
427 if (!new_entry) 416 if (!new_entry)
428 goto out_dir; 417 goto out_dir;
429 418
419 err = acquire_orphan_inode(sbi);
420 if (err)
421 goto put_out_dir;
422
423 if (update_dent_inode(old_inode, &new_dentry->d_name)) {
424 release_orphan_inode(sbi);
425 goto put_out_dir;
426 }
427
430 f2fs_set_link(new_dir, new_entry, new_page, old_inode); 428 f2fs_set_link(new_dir, new_entry, new_page, old_inode);
431 429
432 new_inode->i_ctime = CURRENT_TIME; 430 new_inode->i_ctime = CURRENT_TIME;
433 if (old_dir_entry) 431 if (old_dir_entry)
434 drop_nlink(new_inode); 432 drop_nlink(new_inode);
435 drop_nlink(new_inode); 433 drop_nlink(new_inode);
434
436 if (!new_inode->i_nlink) 435 if (!new_inode->i_nlink)
437 add_orphan_inode(sbi, new_inode->i_ino); 436 add_orphan_inode(sbi, new_inode->i_ino);
437 else
438 release_orphan_inode(sbi);
439
440 update_inode_page(old_inode);
438 update_inode_page(new_inode); 441 update_inode_page(new_inode);
439 } else { 442 } else {
440 err = f2fs_add_link(new_dentry, old_inode); 443 err = f2fs_add_link(new_dentry, old_inode);
@@ -467,6 +470,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
467 mutex_unlock_op(sbi, ilock); 470 mutex_unlock_op(sbi, ilock);
468 return 0; 471 return 0;
469 472
473put_out_dir:
474 f2fs_put_page(new_page, 1);
470out_dir: 475out_dir:
471 if (old_dir_entry) { 476 if (old_dir_entry) {
472 kunmap(old_dir_page); 477 kunmap(old_dir_page);
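The rename path now reserves an orphan slot before any on-disk dentry is rewritten, so a crash between f2fs_set_link() and the nlink update cannot leak the overwritten inode. A user-space sketch of the pairing; every helper is a stand-in for the f2fs call named beside it:

	#include <errno.h>
	#include <stdbool.h>

	static int  reserve_slot(void)   { return 0; }	/* acquire_orphan_inode() */
	static void release_slot(void)   { }		/* release_orphan_inode() */
	static void record_orphan(void)  { }		/* add_orphan_inode() */
	static int  rewrite_dentry(void) { return 0; }	/* update_dent_inode() */

	static int rename_onto_existing(bool victim_still_linked)
	{
		if (reserve_slot())
			return -ENOSPC;		/* fail before touching dentries */

		if (rewrite_dentry()) {
			release_slot();		/* nothing committed yet */
			return -EIO;
		}

		/* f2fs_set_link() and drop_nlink() happen here in the kernel */

		if (victim_still_linked)
			release_slot();		/* reservation was not needed */
		else
			record_orphan();	/* reservation is consumed */
		return 0;
	}

	int main(void)
	{
		return rename_onto_existing(false);
	}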
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b418aee09573..51ef27894433 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -315,9 +315,10 @@ cache:
315 * The maximum depth is four. 315 * The maximum depth is four.
316 * Offset[0] will have raw inode offset. 316 * Offset[0] will have raw inode offset.
317 */ 317 */
318static int get_node_path(long block, int offset[4], unsigned int noffset[4]) 318static int get_node_path(struct f2fs_inode_info *fi, long block,
319 int offset[4], unsigned int noffset[4])
319{ 320{
320 const long direct_index = ADDRS_PER_INODE; 321 const long direct_index = ADDRS_PER_INODE(fi);
321 const long direct_blks = ADDRS_PER_BLOCK; 322 const long direct_blks = ADDRS_PER_BLOCK;
322 const long dptrs_per_blk = NIDS_PER_BLOCK; 323 const long dptrs_per_blk = NIDS_PER_BLOCK;
323 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; 324 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
@@ -405,7 +406,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
405 int level, i; 406 int level, i;
406 int err = 0; 407 int err = 0;
407 408
408 level = get_node_path(index, offset, noffset); 409 level = get_node_path(F2FS_I(dn->inode), index, offset, noffset);
409 410
410 nids[0] = dn->inode->i_ino; 411 nids[0] = dn->inode->i_ino;
411 npage[0] = dn->inode_page; 412 npage[0] = dn->inode_page;
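get_node_path() needs the f2fs_inode_info because direct_index, the number of data addresses stored in the inode block itself, shrinks when inline xattrs reserve slots there. A sketch of what ADDRS_PER_INODE() presumably expands to, with constants as in the mainline on-disk format header (illustrative, not quoted from this patch):

	#define DEF_ADDRS_PER_INODE	923	/* address slots in an inode block */
	#define INLINE_XATTR_ADDRS	50	/* slots ceded to inline xattrs */

	static unsigned int addrs_per_inode(int has_inline_xattr)
	{
		return has_inline_xattr ?
			DEF_ADDRS_PER_INODE - INLINE_XATTR_ADDRS :
			DEF_ADDRS_PER_INODE;
	}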
@@ -565,7 +566,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
565 return PTR_ERR(page); 566 return PTR_ERR(page);
566 } 567 }
567 568
568 rn = (struct f2fs_node *)page_address(page); 569 rn = F2FS_NODE(page);
569 if (depth < 3) { 570 if (depth < 3) {
570 for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { 571 for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
571 child_nid = le32_to_cpu(rn->in.nid[i]); 572 child_nid = le32_to_cpu(rn->in.nid[i]);
@@ -687,7 +688,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
687 688
688 trace_f2fs_truncate_inode_blocks_enter(inode, from); 689 trace_f2fs_truncate_inode_blocks_enter(inode, from);
689 690
690 level = get_node_path(from, offset, noffset); 691 level = get_node_path(F2FS_I(inode), from, offset, noffset);
691restart: 692restart:
692 page = get_node_page(sbi, inode->i_ino); 693 page = get_node_page(sbi, inode->i_ino);
693 if (IS_ERR(page)) { 694 if (IS_ERR(page)) {
@@ -698,7 +699,7 @@ restart:
698 set_new_dnode(&dn, inode, page, NULL, 0); 699 set_new_dnode(&dn, inode, page, NULL, 0);
699 unlock_page(page); 700 unlock_page(page);
700 701
701 rn = page_address(page); 702 rn = F2FS_NODE(page);
702 switch (level) { 703 switch (level) {
703 case 0: 704 case 0:
704 case 1: 705 case 1:
@@ -771,6 +772,33 @@ fail:
771 return err > 0 ? 0 : err; 772 return err > 0 ? 0 : err;
772} 773}
773 774
775int truncate_xattr_node(struct inode *inode, struct page *page)
776{
777 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
778 nid_t nid = F2FS_I(inode)->i_xattr_nid;
779 struct dnode_of_data dn;
780 struct page *npage;
781
782 if (!nid)
783 return 0;
784
785 npage = get_node_page(sbi, nid);
786 if (IS_ERR(npage))
787 return PTR_ERR(npage);
788
789 F2FS_I(inode)->i_xattr_nid = 0;
790
791 /* need to do checkpoint during fsync */
792 F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
793
794 set_new_dnode(&dn, inode, page, npage, nid);
795
796 if (page)
797 dn.inode_page_locked = 1;
798 truncate_node(&dn);
799 return 0;
800}
801
774/* 802/*
775 * Caller should grab and release a mutex by calling mutex_lock_op() and 803 * Caller should grab and release a mutex by calling mutex_lock_op() and
776 * mutex_unlock_op(). 804 * mutex_unlock_op().
@@ -781,22 +809,16 @@ int remove_inode_page(struct inode *inode)
781 struct page *page; 809 struct page *page;
782 nid_t ino = inode->i_ino; 810 nid_t ino = inode->i_ino;
783 struct dnode_of_data dn; 811 struct dnode_of_data dn;
812 int err;
784 813
785 page = get_node_page(sbi, ino); 814 page = get_node_page(sbi, ino);
786 if (IS_ERR(page)) 815 if (IS_ERR(page))
787 return PTR_ERR(page); 816 return PTR_ERR(page);
788 817
789 if (F2FS_I(inode)->i_xattr_nid) { 818 err = truncate_xattr_node(inode, page);
790 nid_t nid = F2FS_I(inode)->i_xattr_nid; 819 if (err) {
791 struct page *npage = get_node_page(sbi, nid); 820 f2fs_put_page(page, 1);
792 821 return err;
793 if (IS_ERR(npage))
794 return PTR_ERR(npage);
795
796 F2FS_I(inode)->i_xattr_nid = 0;
797 set_new_dnode(&dn, inode, page, npage, nid);
798 dn.inode_page_locked = 1;
799 truncate_node(&dn);
800 } 822 }
801 823
802 /* 0 is possible, after f2fs_new_inode() is failed */ 824 /* 0 is possible, after f2fs_new_inode() is failed */
@@ -833,29 +855,32 @@ struct page *new_node_page(struct dnode_of_data *dn,
833 if (!page) 855 if (!page)
834 return ERR_PTR(-ENOMEM); 856 return ERR_PTR(-ENOMEM);
835 857
836 get_node_info(sbi, dn->nid, &old_ni); 858 if (!inc_valid_node_count(sbi, dn->inode, 1)) {
859 err = -ENOSPC;
860 goto fail;
861 }
837 862
838 SetPageUptodate(page); 863 get_node_info(sbi, dn->nid, &old_ni);
839 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
840 864
841 /* Reinitialize old_ni with new node page */ 865 /* Reinitialize old_ni with new node page */
842 BUG_ON(old_ni.blk_addr != NULL_ADDR); 866 BUG_ON(old_ni.blk_addr != NULL_ADDR);
843 new_ni = old_ni; 867 new_ni = old_ni;
844 new_ni.ino = dn->inode->i_ino; 868 new_ni.ino = dn->inode->i_ino;
845
846 if (!inc_valid_node_count(sbi, dn->inode, 1)) {
847 err = -ENOSPC;
848 goto fail;
849 }
850 set_node_addr(sbi, &new_ni, NEW_ADDR); 869 set_node_addr(sbi, &new_ni, NEW_ADDR);
870
871 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
851 set_cold_node(dn->inode, page); 872 set_cold_node(dn->inode, page);
873 SetPageUptodate(page);
874 set_page_dirty(page);
875
876 if (ofs == XATTR_NODE_OFFSET)
877 F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
852 878
853 dn->node_page = page; 879 dn->node_page = page;
854 if (ipage) 880 if (ipage)
855 update_inode(dn->inode, ipage); 881 update_inode(dn->inode, ipage);
856 else 882 else
857 sync_inode_page(dn); 883 sync_inode_page(dn);
858 set_page_dirty(page);
859 if (ofs == 0) 884 if (ofs == 0)
860 inc_valid_inode_count(sbi); 885 inc_valid_inode_count(sbi);
861 886
@@ -916,7 +941,6 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
916 f2fs_put_page(apage, 0); 941 f2fs_put_page(apage, 0);
917 else if (err == LOCKED_PAGE) 942 else if (err == LOCKED_PAGE)
918 f2fs_put_page(apage, 1); 943 f2fs_put_page(apage, 1);
919 return;
920} 944}
921 945
922struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) 946struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
@@ -1167,9 +1191,9 @@ static int f2fs_write_node_page(struct page *page,
1167/* 1191/*
1168 * It is very important to gather dirty pages and write at once, so that we can 1192 * It is very important to gather dirty pages and write at once, so that we can
1169 * submit a big bio without interfering other data writes. 1193 * submit a big bio without interfering other data writes.
1170 * By default, 512 pages (2MB), a segment size, is quite reasonable. 1194 * By default, 512 pages (2MB) * 3 node types is more reasonable.
1171 */ 1195 */
1172#define COLLECT_DIRTY_NODES 512 1196#define COLLECT_DIRTY_NODES 1536
1173static int f2fs_write_node_pages(struct address_space *mapping, 1197static int f2fs_write_node_pages(struct address_space *mapping,
1174 struct writeback_control *wbc) 1198 struct writeback_control *wbc)
1175{ 1199{
@@ -1187,9 +1211,10 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1187 return 0; 1211 return 0;
1188 1212
1189 /* if mounting failed, skip writing node pages */ 1213 /* if mounting failed, skip writing node pages */
1190 wbc->nr_to_write = max_hw_blocks(sbi); 1214 wbc->nr_to_write = 3 * max_hw_blocks(sbi);
1191 sync_node_pages(sbi, 0, wbc); 1215 sync_node_pages(sbi, 0, wbc);
1192 wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write); 1216 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
1217 wbc->nr_to_write);
1193 return 0; 1218 return 0;
1194} 1219}
1195 1220
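The arithmetic behind the new batch size, spelled out (the two macro names are stand-ins):

	#define PAGES_PER_SEG	512	/* 2MB segment / 4KB page */
	#define NR_NODE_TYPES	3	/* hot, warm and cold node logs */

	#define COLLECT_DIRTY_NODES	(PAGES_PER_SEG * NR_NODE_TYPES)	/* 1536 */

The same factor of three scales the nr_to_write bookkeeping in this hunk, so the caller's quota is debited by what was actually written.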
@@ -1444,6 +1469,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1444 struct f2fs_nm_info *nm_i = NM_I(sbi); 1469 struct f2fs_nm_info *nm_i = NM_I(sbi);
1445 struct free_nid *i; 1470 struct free_nid *i;
1446 1471
1472 if (!nid)
1473 return;
1474
1447 spin_lock(&nm_i->free_nid_list_lock); 1475 spin_lock(&nm_i->free_nid_list_lock);
1448 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1476 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
1449 BUG_ON(!i || i->state != NID_ALLOC); 1477 BUG_ON(!i || i->state != NID_ALLOC);
@@ -1484,8 +1512,8 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1484 SetPageUptodate(ipage); 1512 SetPageUptodate(ipage);
1485 fill_node_footer(ipage, ino, ino, 0, true); 1513 fill_node_footer(ipage, ino, ino, 0, true);
1486 1514
1487 src = (struct f2fs_node *)page_address(page); 1515 src = F2FS_NODE(page);
1488 dst = (struct f2fs_node *)page_address(ipage); 1516 dst = F2FS_NODE(ipage);
1489 1517
1490 memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); 1518 memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
1491 dst->i.i_size = 0; 1519 dst->i.i_size = 0;
@@ -1515,8 +1543,8 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1515 1543
1516 /* alloc temporary page for reading the node */ 1544 /* alloc temporary page for reading the node */
1517 page = alloc_page(GFP_NOFS | __GFP_ZERO); 1545 page = alloc_page(GFP_NOFS | __GFP_ZERO);
1518 if (IS_ERR(page)) 1546 if (!page)
1519 return PTR_ERR(page); 1547 return -ENOMEM;
1520 lock_page(page); 1548 lock_page(page);
1521 1549
1522 /* scan the node segment */ 1550 /* scan the node segment */
@@ -1535,7 +1563,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1535 goto out; 1563 goto out;
1536 1564
1537 lock_page(page); 1565 lock_page(page);
1538 rn = (struct f2fs_node *)page_address(page); 1566 rn = F2FS_NODE(page);
1539 sum_entry->nid = rn->footer.nid; 1567 sum_entry->nid = rn->footer.nid;
1540 sum_entry->version = 0; 1568 sum_entry->version = 0;
1541 sum_entry->ofs_in_node = 0; 1569 sum_entry->ofs_in_node = 0;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index c65fb4f4230f..3496bb3e15dc 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -155,8 +155,7 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
155static inline void fill_node_footer(struct page *page, nid_t nid, 155static inline void fill_node_footer(struct page *page, nid_t nid,
156 nid_t ino, unsigned int ofs, bool reset) 156 nid_t ino, unsigned int ofs, bool reset)
157{ 157{
158 void *kaddr = page_address(page); 158 struct f2fs_node *rn = F2FS_NODE(page);
159 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
160 if (reset) 159 if (reset)
161 memset(rn, 0, sizeof(*rn)); 160 memset(rn, 0, sizeof(*rn));
162 rn->footer.nid = cpu_to_le32(nid); 161 rn->footer.nid = cpu_to_le32(nid);
@@ -166,10 +165,8 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
166 165
167static inline void copy_node_footer(struct page *dst, struct page *src) 166static inline void copy_node_footer(struct page *dst, struct page *src)
168{ 167{
169 void *src_addr = page_address(src); 168 struct f2fs_node *src_rn = F2FS_NODE(src);
170 void *dst_addr = page_address(dst); 169 struct f2fs_node *dst_rn = F2FS_NODE(dst);
171 struct f2fs_node *src_rn = (struct f2fs_node *)src_addr;
172 struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr;
173 memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); 170 memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
174} 171}
175 172
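F2FS_NODE() itself is defined outside this excerpt, but from these conversions it is evidently a typed wrapper around page_address(); one would expect roughly:

	static inline struct f2fs_node *F2FS_NODE(struct page *page)
	{
		return (struct f2fs_node *)page_address(page);
	}

Centralizing the cast removes a dozen kaddr temporaries across node.h, node.c, inode.c and recovery.c.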
@@ -177,45 +174,40 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
177{ 174{
178 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 175 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
179 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 176 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
180 void *kaddr = page_address(page); 177 struct f2fs_node *rn = F2FS_NODE(page);
181 struct f2fs_node *rn = (struct f2fs_node *)kaddr; 178
182 rn->footer.cp_ver = ckpt->checkpoint_ver; 179 rn->footer.cp_ver = ckpt->checkpoint_ver;
183 rn->footer.next_blkaddr = cpu_to_le32(blkaddr); 180 rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
184} 181}
185 182
186static inline nid_t ino_of_node(struct page *node_page) 183static inline nid_t ino_of_node(struct page *node_page)
187{ 184{
188 void *kaddr = page_address(node_page); 185 struct f2fs_node *rn = F2FS_NODE(node_page);
189 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
190 return le32_to_cpu(rn->footer.ino); 186 return le32_to_cpu(rn->footer.ino);
191} 187}
192 188
193static inline nid_t nid_of_node(struct page *node_page) 189static inline nid_t nid_of_node(struct page *node_page)
194{ 190{
195 void *kaddr = page_address(node_page); 191 struct f2fs_node *rn = F2FS_NODE(node_page);
196 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
197 return le32_to_cpu(rn->footer.nid); 192 return le32_to_cpu(rn->footer.nid);
198} 193}
199 194
200static inline unsigned int ofs_of_node(struct page *node_page) 195static inline unsigned int ofs_of_node(struct page *node_page)
201{ 196{
202 void *kaddr = page_address(node_page); 197 struct f2fs_node *rn = F2FS_NODE(node_page);
203 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
204 unsigned flag = le32_to_cpu(rn->footer.flag); 198 unsigned flag = le32_to_cpu(rn->footer.flag);
205 return flag >> OFFSET_BIT_SHIFT; 199 return flag >> OFFSET_BIT_SHIFT;
206} 200}
207 201
208static inline unsigned long long cpver_of_node(struct page *node_page) 202static inline unsigned long long cpver_of_node(struct page *node_page)
209{ 203{
210 void *kaddr = page_address(node_page); 204 struct f2fs_node *rn = F2FS_NODE(node_page);
211 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
212 return le64_to_cpu(rn->footer.cp_ver); 205 return le64_to_cpu(rn->footer.cp_ver);
213} 206}
214 207
215static inline block_t next_blkaddr_of_node(struct page *node_page) 208static inline block_t next_blkaddr_of_node(struct page *node_page)
216{ 209{
217 void *kaddr = page_address(node_page); 210 struct f2fs_node *rn = F2FS_NODE(node_page);
218 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
219 return le32_to_cpu(rn->footer.next_blkaddr); 211 return le32_to_cpu(rn->footer.next_blkaddr);
220} 212}
221 213
@@ -237,6 +229,10 @@ static inline block_t next_blkaddr_of_node(struct page *node_page)
237static inline bool IS_DNODE(struct page *node_page) 229static inline bool IS_DNODE(struct page *node_page)
238{ 230{
239 unsigned int ofs = ofs_of_node(node_page); 231 unsigned int ofs = ofs_of_node(node_page);
232
233 if (ofs == XATTR_NODE_OFFSET)
234 return false;
235
240 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || 236 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
241 ofs == 5 + 2 * NIDS_PER_BLOCK) 237 ofs == 5 + 2 * NIDS_PER_BLOCK)
242 return false; 238 return false;
@@ -250,7 +246,7 @@ static inline bool IS_DNODE(struct page *node_page)
250 246
251static inline void set_nid(struct page *p, int off, nid_t nid, bool i) 247static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
252{ 248{
253 struct f2fs_node *rn = (struct f2fs_node *)page_address(p); 249 struct f2fs_node *rn = F2FS_NODE(p);
254 250
255 wait_on_page_writeback(p); 251 wait_on_page_writeback(p);
256 252
@@ -263,7 +259,8 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
263 259
264static inline nid_t get_nid(struct page *p, int off, bool i) 260static inline nid_t get_nid(struct page *p, int off, bool i)
265{ 261{
266 struct f2fs_node *rn = (struct f2fs_node *)page_address(p); 262 struct f2fs_node *rn = F2FS_NODE(p);
263
267 if (i) 264 if (i)
268 return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); 265 return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]);
269 return le32_to_cpu(rn->in.nid[off]); 266 return le32_to_cpu(rn->in.nid[off]);
@@ -314,8 +311,7 @@ static inline void clear_cold_data(struct page *page)
314 311
315static inline int is_node(struct page *page, int type) 312static inline int is_node(struct page *page, int type)
316{ 313{
317 void *kaddr = page_address(page); 314 struct f2fs_node *rn = F2FS_NODE(page);
318 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
319 return le32_to_cpu(rn->footer.flag) & (1 << type); 315 return le32_to_cpu(rn->footer.flag) & (1 << type);
320} 316}
321 317
@@ -325,7 +321,7 @@ static inline int is_node(struct page *page, int type)
325 321
326static inline void set_cold_node(struct inode *inode, struct page *page) 322static inline void set_cold_node(struct inode *inode, struct page *page)
327{ 323{
328 struct f2fs_node *rn = (struct f2fs_node *)page_address(page); 324 struct f2fs_node *rn = F2FS_NODE(page);
329 unsigned int flag = le32_to_cpu(rn->footer.flag); 325 unsigned int flag = le32_to_cpu(rn->footer.flag);
330 326
331 if (S_ISDIR(inode->i_mode)) 327 if (S_ISDIR(inode->i_mode))
@@ -337,7 +333,7 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
337 333
338static inline void set_mark(struct page *page, int mark, int type) 334static inline void set_mark(struct page *page, int mark, int type)
339{ 335{
340 struct f2fs_node *rn = (struct f2fs_node *)page_address(page); 336 struct f2fs_node *rn = F2FS_NODE(page);
341 unsigned int flag = le32_to_cpu(rn->footer.flag); 337 unsigned int flag = le32_to_cpu(rn->footer.flag);
342 if (mark) 338 if (mark)
343 flag |= (0x1 << type); 339 flag |= (0x1 << type);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d56d951c2253..51ef5eec33d7 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
40 40
41static int recover_dentry(struct page *ipage, struct inode *inode) 41static int recover_dentry(struct page *ipage, struct inode *inode)
42{ 42{
43 void *kaddr = page_address(ipage); 43 struct f2fs_node *raw_node = F2FS_NODE(ipage);
44 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
45 struct f2fs_inode *raw_inode = &(raw_node->i); 44 struct f2fs_inode *raw_inode = &(raw_node->i);
46 nid_t pino = le32_to_cpu(raw_inode->i_pino); 45 nid_t pino = le32_to_cpu(raw_inode->i_pino);
47 struct f2fs_dir_entry *de; 46 struct f2fs_dir_entry *de;
@@ -93,8 +92,7 @@ out:
93 92
94static int recover_inode(struct inode *inode, struct page *node_page) 93static int recover_inode(struct inode *inode, struct page *node_page)
95{ 94{
96 void *kaddr = page_address(node_page); 95 struct f2fs_node *raw_node = F2FS_NODE(node_page);
97 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
98 struct f2fs_inode *raw_inode = &(raw_node->i); 96 struct f2fs_inode *raw_inode = &(raw_node->i);
99 97
100 if (!IS_INODE(node_page)) 98 if (!IS_INODE(node_page))
@@ -119,7 +117,7 @@ static int recover_inode(struct inode *inode, struct page *node_page)
119 117
120static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) 118static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
121{ 119{
122 unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); 120 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
123 struct curseg_info *curseg; 121 struct curseg_info *curseg;
124 struct page *page; 122 struct page *page;
125 block_t blkaddr; 123 block_t blkaddr;
@@ -131,8 +129,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
131 129
132 /* read node page */ 130 /* read node page */
133 page = alloc_page(GFP_F2FS_ZERO); 131 page = alloc_page(GFP_F2FS_ZERO);
134 if (IS_ERR(page)) 132 if (!page)
135 return PTR_ERR(page); 133 return -ENOMEM;
136 lock_page(page); 134 lock_page(page);
137 135
138 while (1) { 136 while (1) {
@@ -215,6 +213,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
215 void *kaddr; 213 void *kaddr;
216 struct inode *inode; 214 struct inode *inode;
217 struct page *node_page; 215 struct page *node_page;
216 unsigned int offset;
218 block_t bidx; 217 block_t bidx;
219 int i; 218 int i;
220 219
@@ -259,8 +258,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
259 node_page = get_node_page(sbi, nid); 258 node_page = get_node_page(sbi, nid);
260 if (IS_ERR(node_page)) 259 if (IS_ERR(node_page))
261 return PTR_ERR(node_page); 260 return PTR_ERR(node_page);
262 bidx = start_bidx_of_node(ofs_of_node(node_page)) + 261
263 le16_to_cpu(sum.ofs_in_node); 262 offset = ofs_of_node(node_page);
264 ino = ino_of_node(node_page); 263 ino = ino_of_node(node_page);
265 f2fs_put_page(node_page, 1); 264 f2fs_put_page(node_page, 1);
266 265
@@ -269,6 +268,9 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
269 if (IS_ERR(inode)) 268 if (IS_ERR(inode))
270 return PTR_ERR(inode); 269 return PTR_ERR(inode);
271 270
271 bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
272 le16_to_cpu(sum.ofs_in_node);
273
272 truncate_hole(inode, bidx, bidx + 1); 274 truncate_hole(inode, bidx, bidx + 1);
273 iput(inode); 275 iput(inode);
274 return 0; 276 return 0;
@@ -277,6 +279,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
277static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, 279static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
278 struct page *page, block_t blkaddr) 280 struct page *page, block_t blkaddr)
279{ 281{
282 struct f2fs_inode_info *fi = F2FS_I(inode);
280 unsigned int start, end; 283 unsigned int start, end;
281 struct dnode_of_data dn; 284 struct dnode_of_data dn;
282 struct f2fs_summary sum; 285 struct f2fs_summary sum;
@@ -284,9 +287,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
284 int err = 0, recovered = 0; 287 int err = 0, recovered = 0;
285 int ilock; 288 int ilock;
286 289
287 start = start_bidx_of_node(ofs_of_node(page)); 290 start = start_bidx_of_node(ofs_of_node(page), fi);
288 if (IS_INODE(page)) 291 if (IS_INODE(page))
289 end = start + ADDRS_PER_INODE; 292 end = start + ADDRS_PER_INODE(fi);
290 else 293 else
291 end = start + ADDRS_PER_BLOCK; 294 end = start + ADDRS_PER_BLOCK;
292 295
@@ -357,7 +360,7 @@ err:
357static int recover_data(struct f2fs_sb_info *sbi, 360static int recover_data(struct f2fs_sb_info *sbi,
358 struct list_head *head, int type) 361 struct list_head *head, int type)
359{ 362{
360 unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); 363 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
361 struct curseg_info *curseg; 364 struct curseg_info *curseg;
362 struct page *page; 365 struct page *page;
363 int err = 0; 366 int err = 0;
@@ -369,7 +372,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
369 372
370 /* read node page */ 373 /* read node page */
371 page = alloc_page(GFP_NOFS | __GFP_ZERO); 374 page = alloc_page(GFP_NOFS | __GFP_ZERO);
372 if (IS_ERR(page)) 375 if (!page)
373 return -ENOMEM; 376 return -ENOMEM;
374 377
375 lock_page(page); 378 lock_page(page);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index a86d125a9885..09af9c7b0f52 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -117,7 +117,6 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
117 } 117 }
118 118
119 mutex_unlock(&dirty_i->seglist_lock); 119 mutex_unlock(&dirty_i->seglist_lock);
120 return;
121} 120}
122 121
123/* 122/*
@@ -261,7 +260,6 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
261 void *addr = curseg->sum_blk; 260 void *addr = curseg->sum_blk;
262 addr += curseg->next_blkoff * sizeof(struct f2fs_summary); 261 addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
263 memcpy(addr, sum, sizeof(struct f2fs_summary)); 262 memcpy(addr, sum, sizeof(struct f2fs_summary));
264 return;
265} 263}
266 264
267/* 265/*
@@ -542,12 +540,9 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
542{ 540{
543 struct curseg_info *curseg = CURSEG_I(sbi, type); 541 struct curseg_info *curseg = CURSEG_I(sbi, type);
544 542
545 if (force) { 543 if (force)
546 new_curseg(sbi, type, true); 544 new_curseg(sbi, type, true);
547 goto out; 545 else if (type == CURSEG_WARM_NODE)
548 }
549
550 if (type == CURSEG_WARM_NODE)
551 new_curseg(sbi, type, false); 546 new_curseg(sbi, type, false);
552 else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) 547 else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
553 new_curseg(sbi, type, false); 548 new_curseg(sbi, type, false);
@@ -555,11 +550,9 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
555 change_curseg(sbi, type, true); 550 change_curseg(sbi, type, true);
556 else 551 else
557 new_curseg(sbi, type, false); 552 new_curseg(sbi, type, false);
558out:
559#ifdef CONFIG_F2FS_STAT_FS 553#ifdef CONFIG_F2FS_STAT_FS
560 sbi->segment_count[curseg->alloc_type]++; 554 sbi->segment_count[curseg->alloc_type]++;
561#endif 555#endif
562 return;
563} 556}
564 557
565void allocate_new_segments(struct f2fs_sb_info *sbi) 558void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -611,18 +604,12 @@ static void f2fs_end_io_write(struct bio *bio, int err)
611struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) 604struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
612{ 605{
613 struct bio *bio; 606 struct bio *bio;
614 struct bio_private *priv;
615retry:
616 priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
617 if (!priv) {
618 cond_resched();
619 goto retry;
620 }
621 607
622 /* No failure on bio allocation */ 608 /* No failure on bio allocation */
623 bio = bio_alloc(GFP_NOIO, npages); 609 bio = bio_alloc(GFP_NOIO, npages);
624 bio->bi_bdev = bdev; 610 bio->bi_bdev = bdev;
625 bio->bi_private = priv; 611 bio->bi_private = NULL;
612
626 return bio; 613 return bio;
627} 614}
628 615
@@ -681,8 +668,17 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
681 do_submit_bio(sbi, type, false); 668 do_submit_bio(sbi, type, false);
682alloc_new: 669alloc_new:
683 if (sbi->bio[type] == NULL) { 670 if (sbi->bio[type] == NULL) {
671 struct bio_private *priv;
672retry:
673 priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
674 if (!priv) {
675 cond_resched();
676 goto retry;
677 }
678
684 sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi)); 679 sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi));
685 sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); 680 sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
681 sbi->bio[type]->bi_private = priv;
686 /* 682 /*
687 * The end_io will be assigned at the submission phase. 683 * The end_io will be assigned at the submission phase.
688 * Until then, let bio_add_page() merge consecutive IOs as much 684 * Until then, let bio_add_page() merge consecutive IOs as much
@@ -702,6 +698,16 @@ alloc_new:
702 trace_f2fs_submit_write_page(page, blk_addr, type); 698 trace_f2fs_submit_write_page(page, blk_addr, type);
703} 699}
704 700
701void f2fs_wait_on_page_writeback(struct page *page,
702 enum page_type type, bool sync)
703{
704 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
705 if (PageWriteback(page)) {
706 f2fs_submit_bio(sbi, type, sync);
707 wait_on_page_writeback(page);
708 }
709}
710
705static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) 711static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
706{ 712{
707 struct curseg_info *curseg = CURSEG_I(sbi, type); 713 struct curseg_info *curseg = CURSEG_I(sbi, type);
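The helper folds the flush-then-wait pair that gc.c and inode.c used to open-code, deriving sbi from the page's mapping. The call-site change in move_data_page() earlier in this patch shows the intent:

	/* before: each caller rolled its own */
	if (PageWriteback(page)) {
		f2fs_submit_bio(sbi, DATA, true);
		wait_on_page_writeback(page);
	}

	/* after: one line, sync flush selectable per call */
	f2fs_wait_on_page_writeback(page, DATA, true);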
@@ -1179,7 +1185,6 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1179{ 1185{
1180 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) 1186 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
1181 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); 1187 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
1182 return;
1183} 1188}
1184 1189
1185int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, 1190int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 062424a0e4c3..bdd10eab8c40 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -142,6 +142,7 @@ struct victim_sel_policy {
142 int alloc_mode; /* LFS or SSR */ 142 int alloc_mode; /* LFS or SSR */
143 int gc_mode; /* GC_CB or GC_GREEDY */ 143 int gc_mode; /* GC_CB or GC_GREEDY */
144 unsigned long *dirty_segmap; /* dirty segment bitmap */ 144 unsigned long *dirty_segmap; /* dirty segment bitmap */
145 unsigned int max_search; /* maximum # of segments to search */
145 unsigned int offset; /* last scanned bitmap offset */ 146 unsigned int offset; /* last scanned bitmap offset */
146 unsigned int ofs_unit; /* bitmap search unit */ 147 unsigned int ofs_unit; /* bitmap search unit */
147 unsigned int min_cost; /* minimum cost */ 148 unsigned int min_cost; /* minimum cost */
@@ -453,7 +454,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
453 454
454static inline bool need_SSR(struct f2fs_sb_info *sbi) 455static inline bool need_SSR(struct f2fs_sb_info *sbi)
455{ 456{
456 return (free_sections(sbi) < overprovision_sections(sbi)); 457 return ((prefree_segments(sbi) / sbi->segs_per_sec)
458 + free_sections(sbi) < overprovision_sections(sbi));
457} 459}
458 460
459static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) 461static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -470,7 +472,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
470 472
471static inline int utilization(struct f2fs_sb_info *sbi) 473static inline int utilization(struct f2fs_sb_info *sbi)
472{ 474{
473 return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count); 475 return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count);
474} 476}
475 477
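valid_user_blocks() returns a 32-bit block count, so on volumes past roughly 160GB the old product wrapped before div_u64() ever saw it. A runnable demonstration of the wrap the cast prevents:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* ~43M valid 4KB blocks (~164GB); the 32-bit product wraps */
		uint32_t valid = 43000000, total = 50000000;

		uint32_t bad  = valid * 100 / total;		/* 0 after wrap */
		uint64_t good = (uint64_t)valid * 100 / total;	/* 86 */

		printf("bad=%u good=%llu\n", bad, (unsigned long long)good);
		return 0;
	}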
476/* 478/*
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 75c7dc363e92..13d0a0fe49dd 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -18,20 +18,25 @@
18#include <linux/parser.h> 18#include <linux/parser.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/seq_file.h> 20#include <linux/seq_file.h>
21#include <linux/proc_fs.h>
21#include <linux/random.h> 22#include <linux/random.h>
22#include <linux/exportfs.h> 23#include <linux/exportfs.h>
23#include <linux/blkdev.h> 24#include <linux/blkdev.h>
24#include <linux/f2fs_fs.h> 25#include <linux/f2fs_fs.h>
26#include <linux/sysfs.h>
25 27
26#include "f2fs.h" 28#include "f2fs.h"
27#include "node.h" 29#include "node.h"
28#include "segment.h" 30#include "segment.h"
29#include "xattr.h" 31#include "xattr.h"
32#include "gc.h"
30 33
31#define CREATE_TRACE_POINTS 34#define CREATE_TRACE_POINTS
32#include <trace/events/f2fs.h> 35#include <trace/events/f2fs.h>
33 36
37static struct proc_dir_entry *f2fs_proc_root;
34static struct kmem_cache *f2fs_inode_cachep; 38static struct kmem_cache *f2fs_inode_cachep;
39static struct kset *f2fs_kset;
35 40
36enum { 41enum {
37 Opt_gc_background, 42 Opt_gc_background,
@@ -42,6 +47,7 @@ enum {
42 Opt_noacl, 47 Opt_noacl,
43 Opt_active_logs, 48 Opt_active_logs,
44 Opt_disable_ext_identify, 49 Opt_disable_ext_identify,
50 Opt_inline_xattr,
45 Opt_err, 51 Opt_err,
46}; 52};
47 53
@@ -54,9 +60,117 @@ static match_table_t f2fs_tokens = {
54 {Opt_noacl, "noacl"}, 60 {Opt_noacl, "noacl"},
55 {Opt_active_logs, "active_logs=%u"}, 61 {Opt_active_logs, "active_logs=%u"},
56 {Opt_disable_ext_identify, "disable_ext_identify"}, 62 {Opt_disable_ext_identify, "disable_ext_identify"},
63 {Opt_inline_xattr, "inline_xattr"},
57 {Opt_err, NULL}, 64 {Opt_err, NULL},
58}; 65};
59 66
67/* Sysfs support for f2fs */
68struct f2fs_attr {
69 struct attribute attr;
70 ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
71 ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
72 const char *, size_t);
73 int offset;
74};
75
76static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
77 struct f2fs_sb_info *sbi, char *buf)
78{
79 struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
80 unsigned int *ui;
81
82 if (!gc_kth)
83 return -EINVAL;
84
85 ui = (unsigned int *)(((char *)gc_kth) + a->offset);
86
87 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
88}
89
90static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
91 struct f2fs_sb_info *sbi,
92 const char *buf, size_t count)
93{
94 struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
95 unsigned long t;
96 unsigned int *ui;
97 ssize_t ret;
98
99 if (!gc_kth)
100 return -EINVAL;
101
102 ui = (unsigned int *)(((char *)gc_kth) + a->offset);
103
104 ret = kstrtoul(skip_spaces(buf), 0, &t);
105 if (ret < 0)
106 return ret;
107 *ui = t;
108 return count;
109}
110
111static ssize_t f2fs_attr_show(struct kobject *kobj,
112 struct attribute *attr, char *buf)
113{
114 struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
115 s_kobj);
116 struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
117
118 return a->show ? a->show(a, sbi, buf) : 0;
119}
120
121static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr,
122 const char *buf, size_t len)
123{
124 struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
125 s_kobj);
126 struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
127
128 return a->store ? a->store(a, sbi, buf, len) : 0;
129}
130
131static void f2fs_sb_release(struct kobject *kobj)
132{
133 struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
134 s_kobj);
135 complete(&sbi->s_kobj_unregister);
136}
137
138#define F2FS_ATTR_OFFSET(_name, _mode, _show, _store, _elname) \
139static struct f2fs_attr f2fs_attr_##_name = { \
140 .attr = {.name = __stringify(_name), .mode = _mode }, \
141 .show = _show, \
142 .store = _store, \
143 .offset = offsetof(struct f2fs_gc_kthread, _elname), \
144}
145
146#define F2FS_RW_ATTR(name, elname) \
147 F2FS_ATTR_OFFSET(name, 0644, f2fs_sbi_show, f2fs_sbi_store, elname)
148
149F2FS_RW_ATTR(gc_min_sleep_time, min_sleep_time);
150F2FS_RW_ATTR(gc_max_sleep_time, max_sleep_time);
151F2FS_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time);
152F2FS_RW_ATTR(gc_idle, gc_idle);
153
154#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
155static struct attribute *f2fs_attrs[] = {
156 ATTR_LIST(gc_min_sleep_time),
157 ATTR_LIST(gc_max_sleep_time),
158 ATTR_LIST(gc_no_gc_sleep_time),
159 ATTR_LIST(gc_idle),
160 NULL,
161};
162
163static const struct sysfs_ops f2fs_attr_ops = {
164 .show = f2fs_attr_show,
165 .store = f2fs_attr_store,
166};
167
168static struct kobj_type f2fs_ktype = {
169 .default_attrs = f2fs_attrs,
170 .sysfs_ops = &f2fs_attr_ops,
171 .release = f2fs_sb_release,
172};
173
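One show/store pair serves every tunable because each f2fs_attr records the field's byte offset inside f2fs_gc_kthread. A runnable user-space sketch of that offsetof() dispatch (struct and attribute names are stand-ins):

	#include <stddef.h>
	#include <stdio.h>

	struct gc_kthread { unsigned int min_sleep_time, max_sleep_time; };
	struct attr { const char *name; size_t offset; };

	#define RW_ATTR(field) { #field, offsetof(struct gc_kthread, field) }

	static unsigned int *attr_slot(struct gc_kthread *gc, const struct attr *a)
	{
		return (unsigned int *)((char *)gc + a->offset);
	}

	int main(void)
	{
		struct gc_kthread gc = { 30000, 60000 };
		struct attr a = RW_ATTR(max_sleep_time);

		*attr_slot(&gc, &a) = 90000;		/* what a sysfs store does */
		printf("%u\n", gc.max_sleep_time);	/* prints 90000 */
		return 0;
	}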
60void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) 174void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
61{ 175{
62 struct va_format vaf; 176 struct va_format vaf;
@@ -126,11 +240,18 @@ static int parse_options(struct super_block *sb, char *options)
126 case Opt_nouser_xattr: 240 case Opt_nouser_xattr:
127 clear_opt(sbi, XATTR_USER); 241 clear_opt(sbi, XATTR_USER);
128 break; 242 break;
243 case Opt_inline_xattr:
244 set_opt(sbi, INLINE_XATTR);
245 break;
129#else 246#else
130 case Opt_nouser_xattr: 247 case Opt_nouser_xattr:
131 f2fs_msg(sb, KERN_INFO, 248 f2fs_msg(sb, KERN_INFO,
132 "nouser_xattr options not supported"); 249 "nouser_xattr options not supported");
133 break; 250 break;
251 case Opt_inline_xattr:
252 f2fs_msg(sb, KERN_INFO,
253 "inline_xattr options not supported");
254 break;
134#endif 255#endif
135#ifdef CONFIG_F2FS_FS_POSIX_ACL 256#ifdef CONFIG_F2FS_FS_POSIX_ACL
136 case Opt_noacl: 257 case Opt_noacl:
@@ -180,6 +301,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
180 301
181 set_inode_flag(fi, FI_NEW_INODE); 302 set_inode_flag(fi, FI_NEW_INODE);
182 303
304 if (test_opt(F2FS_SB(sb), INLINE_XATTR))
305 set_inode_flag(fi, FI_INLINE_XATTR);
306
183 return &fi->vfs_inode; 307 return &fi->vfs_inode;
184} 308}
185 309
@@ -205,7 +329,6 @@ static int f2fs_drop_inode(struct inode *inode)
205static void f2fs_dirty_inode(struct inode *inode, int flags) 329static void f2fs_dirty_inode(struct inode *inode, int flags)
206{ 330{
207 set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); 331 set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
208 return;
209} 332}
210 333
211static void f2fs_i_callback(struct rcu_head *head) 334static void f2fs_i_callback(struct rcu_head *head)
@@ -223,6 +346,12 @@ static void f2fs_put_super(struct super_block *sb)
223{ 346{
224 struct f2fs_sb_info *sbi = F2FS_SB(sb); 347 struct f2fs_sb_info *sbi = F2FS_SB(sb);
225 348
349 if (sbi->s_proc) {
350 remove_proc_entry("segment_info", sbi->s_proc);
351 remove_proc_entry(sb->s_id, f2fs_proc_root);
352 }
353 kobject_del(&sbi->s_kobj);
354
226 f2fs_destroy_stats(sbi); 355 f2fs_destroy_stats(sbi);
227 stop_gc_thread(sbi); 356 stop_gc_thread(sbi);
228 357
@@ -236,6 +365,8 @@ static void f2fs_put_super(struct super_block *sb)
236 destroy_segment_manager(sbi); 365 destroy_segment_manager(sbi);
237 366
238 kfree(sbi->ckpt); 367 kfree(sbi->ckpt);
368 kobject_put(&sbi->s_kobj);
369 wait_for_completion(&sbi->s_kobj_unregister);
239 370
240 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
241 brelse(sbi->raw_super_buf); 372 brelse(sbi->raw_super_buf);
@@ -325,6 +456,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
325 seq_puts(seq, ",user_xattr"); 456 seq_puts(seq, ",user_xattr");
326 else 457 else
327 seq_puts(seq, ",nouser_xattr"); 458 seq_puts(seq, ",nouser_xattr");
459 if (test_opt(sbi, INLINE_XATTR))
460 seq_puts(seq, ",inline_xattr");
328#endif 461#endif
329#ifdef CONFIG_F2FS_FS_POSIX_ACL 462#ifdef CONFIG_F2FS_FS_POSIX_ACL
330 if (test_opt(sbi, POSIX_ACL)) 463 if (test_opt(sbi, POSIX_ACL))
@@ -340,6 +473,36 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
340 return 0; 473 return 0;
341} 474}
342 475
476static int segment_info_seq_show(struct seq_file *seq, void *offset)
477{
478 struct super_block *sb = seq->private;
479 struct f2fs_sb_info *sbi = F2FS_SB(sb);
480 unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main);
481 int i;
482
483 for (i = 0; i < total_segs; i++) {
484 seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1));
485 if (i != 0 && (i % 10) == 0)
486 seq_puts(seq, "\n");
487 else
488 seq_puts(seq, " ");
489 }
490 return 0;
491}
492
493static int segment_info_open_fs(struct inode *inode, struct file *file)
494{
495 return single_open(file, segment_info_seq_show, PDE_DATA(inode));
496}
497
498static const struct file_operations f2fs_seq_segment_info_fops = {
499 .owner = THIS_MODULE,
500 .open = segment_info_open_fs,
501 .read = seq_read,
502 .llseek = seq_lseek,
503 .release = single_release,
504};
505
343static int f2fs_remount(struct super_block *sb, int *flags, char *data) 506static int f2fs_remount(struct super_block *sb, int *flags, char *data)
344{ 507{
345 struct f2fs_sb_info *sbi = F2FS_SB(sb); 508 struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -455,7 +618,7 @@ static const struct export_operations f2fs_export_ops = {
455 618
456static loff_t max_file_size(unsigned bits) 619static loff_t max_file_size(unsigned bits)
457{ 620{
458 loff_t result = ADDRS_PER_INODE; 621 loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
459 loff_t leaf_count = ADDRS_PER_BLOCK; 622 loff_t leaf_count = ADDRS_PER_BLOCK;
460 623
461 /* two direct node blocks */ 624 /* two direct node blocks */
@@ -766,6 +929,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
766 if (err) 929 if (err)
767 goto fail; 930 goto fail;
768 931
932 if (f2fs_proc_root)
933 sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
934
935 if (sbi->s_proc)
936 proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
937 &f2fs_seq_segment_info_fops, sb);
938
769 if (test_opt(sbi, DISCARD)) { 939 if (test_opt(sbi, DISCARD)) {
770 struct request_queue *q = bdev_get_queue(sb->s_bdev); 940 struct request_queue *q = bdev_get_queue(sb->s_bdev);
771 if (!blk_queue_discard(q)) 941 if (!blk_queue_discard(q))
@@ -774,6 +944,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
774 "the device does not support discard"); 944 "the device does not support discard");
775 } 945 }
776 946
947 sbi->s_kobj.kset = f2fs_kset;
948 init_completion(&sbi->s_kobj_unregister);
949 err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
950 "%s", sb->s_id);
951 if (err)
952 goto fail;
953
777 return 0; 954 return 0;
778fail: 955fail:
779 stop_gc_thread(sbi); 956 stop_gc_thread(sbi);
@@ -841,29 +1018,49 @@ static int __init init_f2fs_fs(void)
841 goto fail; 1018 goto fail;
842 err = create_node_manager_caches(); 1019 err = create_node_manager_caches();
843 if (err) 1020 if (err)
844 goto fail; 1021 goto free_inodecache;
845 err = create_gc_caches(); 1022 err = create_gc_caches();
846 if (err) 1023 if (err)
847 goto fail; 1024 goto free_node_manager_caches;
848 err = create_checkpoint_caches(); 1025 err = create_checkpoint_caches();
849 if (err) 1026 if (err)
850 goto fail; 1027 goto free_gc_caches;
1028 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
1029 if (!f2fs_kset) {
1030 err = -ENOMEM;
1031 goto free_checkpoint_caches;
1032 }
851 err = register_filesystem(&f2fs_fs_type); 1033 err = register_filesystem(&f2fs_fs_type);
852 if (err) 1034 if (err)
853 goto fail; 1035 goto free_kset;
854 f2fs_create_root_stats(); 1036 f2fs_create_root_stats();
1037 f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
1038 return 0;
1039
1040free_kset:
1041 kset_unregister(f2fs_kset);
1042free_checkpoint_caches:
1043 destroy_checkpoint_caches();
1044free_gc_caches:
1045 destroy_gc_caches();
1046free_node_manager_caches:
1047 destroy_node_manager_caches();
1048free_inodecache:
1049 destroy_inodecache();
855fail: 1050fail:
856 return err; 1051 return err;
857} 1052}
858 1053
859static void __exit exit_f2fs_fs(void) 1054static void __exit exit_f2fs_fs(void)
860{ 1055{
1056 remove_proc_entry("fs/f2fs", NULL);
861 f2fs_destroy_root_stats(); 1057 f2fs_destroy_root_stats();
862 unregister_filesystem(&f2fs_fs_type); 1058 unregister_filesystem(&f2fs_fs_type);
863 destroy_checkpoint_caches(); 1059 destroy_checkpoint_caches();
864 destroy_gc_caches(); 1060 destroy_gc_caches();
865 destroy_node_manager_caches(); 1061 destroy_node_manager_caches();
866 destroy_inodecache(); 1062 destroy_inodecache();
1063 kset_unregister(f2fs_kset);
867} 1064}
868 1065
869module_init(init_f2fs_fs) 1066module_init(init_f2fs_fs)
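
The sysfs hooks added above all funnel through one pair of show/store helpers: each f2fs_attr records, via offsetof(), where its tunable lives inside struct f2fs_gc_kthread, and f2fs_sbi_show()/f2fs_sbi_store() turn that byte offset back into a pointer at runtime. A minimal userspace sketch of the same pattern (the struct and field names here are hypothetical, purely for illustration):

#include <stddef.h>
#include <stdio.h>

struct my_tunables {
	unsigned int min_sleep_time;
	unsigned int max_sleep_time;
};

/* same arithmetic as f2fs_sbi_show(): base pointer + recorded byte offset */
static unsigned int *field_at(struct my_tunables *t, size_t offset)
{
	return (unsigned int *)(((char *)t) + offset);
}

int main(void)
{
	struct my_tunables t = { .min_sleep_time = 30000, .max_sleep_time = 60000 };

	/* offsetof() is what F2FS_ATTR_OFFSET() captures at compile time */
	printf("%u\n", *field_at(&t, offsetof(struct my_tunables, max_sleep_time)));
	return 0;
}
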
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 3ab07ecd86ca..1ac8a5f6e380 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -246,40 +246,170 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int name_index)
246 return handler; 246 return handler;
247} 247}
248 248
249static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int name_index,
250 size_t name_len, const char *name)
251{
252 struct f2fs_xattr_entry *entry;
253
254 list_for_each_xattr(entry, base_addr) {
255 if (entry->e_name_index != name_index)
256 continue;
257 if (entry->e_name_len != name_len)
258 continue;
259 if (!memcmp(entry->e_name, name, name_len))
260 break;
261 }
262 return entry;
263}
264
265static void *read_all_xattrs(struct inode *inode, struct page *ipage)
266{
267 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
268 struct f2fs_xattr_header *header;
269 size_t size = PAGE_SIZE, inline_size = 0;
270 void *txattr_addr;
271
272 inline_size = inline_xattr_size(inode);
273
274 txattr_addr = kzalloc(inline_size + size, GFP_KERNEL);
275 if (!txattr_addr)
276 return NULL;
277
278 /* read from inline xattr */
279 if (inline_size) {
280 struct page *page = NULL;
281 void *inline_addr;
282
283 if (ipage) {
284 inline_addr = inline_xattr_addr(ipage);
285 } else {
286 page = get_node_page(sbi, inode->i_ino);
287 if (IS_ERR(page))
288 goto fail;
289 inline_addr = inline_xattr_addr(page);
290 }
291 memcpy(txattr_addr, inline_addr, inline_size);
292 f2fs_put_page(page, 1);
293 }
294
295 /* read from xattr node block */
296 if (F2FS_I(inode)->i_xattr_nid) {
297 struct page *xpage;
298 void *xattr_addr;
299
300 /* The inode already has an extended attribute block. */
301 xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
302 if (IS_ERR(xpage))
303 goto fail;
304
305 xattr_addr = page_address(xpage);
306 memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
307 f2fs_put_page(xpage, 1);
308 }
309
310 header = XATTR_HDR(txattr_addr);
311
312 /* no xattrs have been allocated yet */
313 if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
314 header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
315 header->h_refcount = cpu_to_le32(1);
316 }
317 return txattr_addr;
318fail:
319 kzfree(txattr_addr);
320 return NULL;
321}
322
323static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
324 void *txattr_addr, struct page *ipage)
325{
326 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
327 size_t inline_size = 0;
328 void *xattr_addr;
329 struct page *xpage;
330 nid_t new_nid = 0;
331 int err;
332
333 inline_size = inline_xattr_size(inode);
334
335 if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
336 if (!alloc_nid(sbi, &new_nid))
337 return -ENOSPC;
338
339 /* write to inline xattr */
340 if (inline_size) {
341 struct page *page = NULL;
342 void *inline_addr;
343
344 if (ipage) {
345 inline_addr = inline_xattr_addr(ipage);
346 } else {
347 page = get_node_page(sbi, inode->i_ino);
348 if (IS_ERR(page)) {
349 alloc_nid_failed(sbi, new_nid);
350 return PTR_ERR(page);
351 }
352 inline_addr = inline_xattr_addr(page);
353 }
354 memcpy(inline_addr, txattr_addr, inline_size);
355 f2fs_put_page(page, 1);
356
357 /* no need to use xattr node block */
358 if (hsize <= inline_size) {
359 err = truncate_xattr_node(inode, ipage);
360 alloc_nid_failed(sbi, new_nid);
361 return err;
362 }
363 }
364
365 /* write to xattr node block */
366 if (F2FS_I(inode)->i_xattr_nid) {
367 xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
368 if (IS_ERR(xpage)) {
369 alloc_nid_failed(sbi, new_nid);
370 return PTR_ERR(xpage);
371 }
372 BUG_ON(new_nid);
373 } else {
374 struct dnode_of_data dn;
375 set_new_dnode(&dn, inode, NULL, NULL, new_nid);
376 xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
377 if (IS_ERR(xpage)) {
378 alloc_nid_failed(sbi, new_nid);
379 return PTR_ERR(xpage);
380 }
381 alloc_nid_done(sbi, new_nid);
382 }
383
384 xattr_addr = page_address(xpage);
385 memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE -
386 sizeof(struct node_footer));
387 set_page_dirty(xpage);
388 f2fs_put_page(xpage, 1);
389
390 /* need to checkpoint during fsync */
391 F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
392 return 0;
393}
394
249int f2fs_getxattr(struct inode *inode, int name_index, const char *name, 395int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
250 void *buffer, size_t buffer_size) 396 void *buffer, size_t buffer_size)
251{ 397{
252 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
253 struct f2fs_inode_info *fi = F2FS_I(inode);
254 struct f2fs_xattr_entry *entry; 398 struct f2fs_xattr_entry *entry;
255 struct page *page;
256 void *base_addr; 399 void *base_addr;
257 int error = 0, found = 0; 400 int error = 0;
258 size_t value_len, name_len; 401 size_t value_len, name_len;
259 402
260 if (name == NULL) 403 if (name == NULL)
261 return -EINVAL; 404 return -EINVAL;
262 name_len = strlen(name); 405 name_len = strlen(name);
263 406
264 if (!fi->i_xattr_nid) 407 base_addr = read_all_xattrs(inode, NULL);
265 return -ENODATA; 408 if (!base_addr)
409 return -ENOMEM;
266 410
267 page = get_node_page(sbi, fi->i_xattr_nid); 411 entry = __find_xattr(base_addr, name_index, name_len, name);
268 if (IS_ERR(page)) 412 if (IS_XATTR_LAST_ENTRY(entry)) {
269 return PTR_ERR(page);
270 base_addr = page_address(page);
271
272 list_for_each_xattr(entry, base_addr) {
273 if (entry->e_name_index != name_index)
274 continue;
275 if (entry->e_name_len != name_len)
276 continue;
277 if (!memcmp(entry->e_name, name, name_len)) {
278 found = 1;
279 break;
280 }
281 }
282 if (!found) {
283 error = -ENODATA; 413 error = -ENODATA;
284 goto cleanup; 414 goto cleanup;
285 } 415 }
@@ -298,28 +428,21 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
298 error = value_len; 428 error = value_len;
299 429
300cleanup: 430cleanup:
301 f2fs_put_page(page, 1); 431 kzfree(base_addr);
302 return error; 432 return error;
303} 433}
304 434
305ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 435ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
306{ 436{
307 struct inode *inode = dentry->d_inode; 437 struct inode *inode = dentry->d_inode;
308 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
309 struct f2fs_inode_info *fi = F2FS_I(inode);
310 struct f2fs_xattr_entry *entry; 438 struct f2fs_xattr_entry *entry;
311 struct page *page;
312 void *base_addr; 439 void *base_addr;
313 int error = 0; 440 int error = 0;
314 size_t rest = buffer_size; 441 size_t rest = buffer_size;
315 442
316 if (!fi->i_xattr_nid) 443 base_addr = read_all_xattrs(inode, NULL);
317 return 0; 444 if (!base_addr)
318 445 return -ENOMEM;
319 page = get_node_page(sbi, fi->i_xattr_nid);
320 if (IS_ERR(page))
321 return PTR_ERR(page);
322 base_addr = page_address(page);
323 446
324 list_for_each_xattr(entry, base_addr) { 447 list_for_each_xattr(entry, base_addr) {
325 const struct xattr_handler *handler = 448 const struct xattr_handler *handler =
@@ -342,7 +465,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
342 } 465 }
343 error = buffer_size - rest; 466 error = buffer_size - rest;
344cleanup: 467cleanup:
345 f2fs_put_page(page, 1); 468 kzfree(base_addr);
346 return error; 469 return error;
347} 470}
348 471
@@ -351,14 +474,13 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
351{ 474{
352 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 475 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
353 struct f2fs_inode_info *fi = F2FS_I(inode); 476 struct f2fs_inode_info *fi = F2FS_I(inode);
354 struct f2fs_xattr_header *header = NULL;
355 struct f2fs_xattr_entry *here, *last; 477 struct f2fs_xattr_entry *here, *last;
356 struct page *page;
357 void *base_addr; 478 void *base_addr;
358 int error, found, free, newsize; 479 int found, newsize;
359 size_t name_len; 480 size_t name_len;
360 char *pval;
361 int ilock; 481 int ilock;
482 __u32 new_hsize;
483 int error = -ENOMEM;
362 484
363 if (name == NULL) 485 if (name == NULL)
364 return -EINVAL; 486 return -EINVAL;
@@ -368,67 +490,21 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
368 490
369 name_len = strlen(name); 491 name_len = strlen(name);
370 492
371 if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN) 493 if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN(inode))
372 return -ERANGE; 494 return -ERANGE;
373 495
374 f2fs_balance_fs(sbi); 496 f2fs_balance_fs(sbi);
375 497
376 ilock = mutex_lock_op(sbi); 498 ilock = mutex_lock_op(sbi);
377 499
378 if (!fi->i_xattr_nid) { 500 base_addr = read_all_xattrs(inode, ipage);
379 /* Allocate new attribute block */ 501 if (!base_addr)
380 struct dnode_of_data dn; 502 goto exit;
381
382 if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
383 error = -ENOSPC;
384 goto exit;
385 }
386 set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
387 mark_inode_dirty(inode);
388
389 page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
390 if (IS_ERR(page)) {
391 alloc_nid_failed(sbi, fi->i_xattr_nid);
392 fi->i_xattr_nid = 0;
393 error = PTR_ERR(page);
394 goto exit;
395 }
396
397 alloc_nid_done(sbi, fi->i_xattr_nid);
398 base_addr = page_address(page);
399 header = XATTR_HDR(base_addr);
400 header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
401 header->h_refcount = cpu_to_le32(1);
402 } else {
403 /* The inode already has an extended attribute block. */
404 page = get_node_page(sbi, fi->i_xattr_nid);
405 if (IS_ERR(page)) {
406 error = PTR_ERR(page);
407 goto exit;
408 }
409
410 base_addr = page_address(page);
411 header = XATTR_HDR(base_addr);
412 }
413
414 if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
415 error = -EIO;
416 goto cleanup;
417 }
418 503
419 /* find entry with wanted name. */ 504 /* find entry with wanted name. */
420 found = 0; 505 here = __find_xattr(base_addr, name_index, name_len, name);
421 list_for_each_xattr(here, base_addr) {
422 if (here->e_name_index != name_index)
423 continue;
424 if (here->e_name_len != name_len)
425 continue;
426 if (!memcmp(here->e_name, name, name_len)) {
427 found = 1;
428 break;
429 }
430 }
431 506
507 found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1;
432 last = here; 508 last = here;
433 509
434 while (!IS_XATTR_LAST_ENTRY(last)) 510 while (!IS_XATTR_LAST_ENTRY(last))
@@ -439,22 +515,25 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
439 515
440 /* 1. Check space */ 516 /* 1. Check space */
441 if (value) { 517 if (value) {
442 /* If value is NULL, it is remove operation. 518 int free;
519 /*
520 * If value is NULL, it is remove operation.
443 * In case of update operation, we calculate free. 521 * In case of update operation, we calculate free.
444 */ 522 */
445 free = MIN_OFFSET - ((char *)last - (char *)header); 523 free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
446 if (found) 524 if (found)
447 free = free - ENTRY_SIZE(here); 525 free = free - ENTRY_SIZE(here);
448 526
449 if (free < newsize) { 527 if (free < newsize) {
450 error = -ENOSPC; 528 error = -ENOSPC;
451 goto cleanup; 529 goto exit;
452 } 530 }
453 } 531 }
454 532
455 /* 2. Remove old entry */ 533 /* 2. Remove old entry */
456 if (found) { 534 if (found) {
457 /* If entry is found, remove old entry. 535 /*
536 * If entry is found, remove old entry.
458 * If not found, remove operation is not needed. 537 * If not found, remove operation is not needed.
459 */ 538 */
460 struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); 539 struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here);
@@ -465,10 +544,15 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
465 memset(last, 0, oldsize); 544 memset(last, 0, oldsize);
466 } 545 }
467 546
547 new_hsize = (char *)last - (char *)base_addr;
548
468 /* 3. Write new entry */ 549 /* 3. Write new entry */
469 if (value) { 550 if (value) {
470 /* Before we come here, old entry is removed. 551 char *pval;
471 * We just write new entry. */ 552 /*
553 * Before we come here, old entry is removed.
554 * We just write new entry.
555 */
472 memset(last, 0, newsize); 556 memset(last, 0, newsize);
473 last->e_name_index = name_index; 557 last->e_name_index = name_index;
474 last->e_name_len = name_len; 558 last->e_name_len = name_len;
@@ -476,26 +560,25 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
476 pval = last->e_name + name_len; 560 pval = last->e_name + name_len;
477 memcpy(pval, value, value_len); 561 memcpy(pval, value, value_len);
478 last->e_value_size = cpu_to_le16(value_len); 562 last->e_value_size = cpu_to_le16(value_len);
563 new_hsize += newsize;
479 } 564 }
480 565
481 set_page_dirty(page); 566 error = write_all_xattrs(inode, new_hsize, base_addr, ipage);
482 f2fs_put_page(page, 1); 567 if (error)
568 goto exit;
483 569
484 if (is_inode_flag_set(fi, FI_ACL_MODE)) { 570 if (is_inode_flag_set(fi, FI_ACL_MODE)) {
485 inode->i_mode = fi->i_acl_mode; 571 inode->i_mode = fi->i_acl_mode;
486 inode->i_ctime = CURRENT_TIME; 572 inode->i_ctime = CURRENT_TIME;
487 clear_inode_flag(fi, FI_ACL_MODE); 573 clear_inode_flag(fi, FI_ACL_MODE);
488 } 574 }
575
489 if (ipage) 576 if (ipage)
490 update_inode(inode, ipage); 577 update_inode(inode, ipage);
491 else 578 else
492 update_inode_page(inode); 579 update_inode_page(inode);
493 mutex_unlock_op(sbi, ilock);
494
495 return 0;
496cleanup:
497 f2fs_put_page(page, 1);
498exit: 580exit:
499 mutex_unlock_op(sbi, ilock); 581 mutex_unlock_op(sbi, ilock);
582 kzfree(base_addr);
500 return error; 583 return error;
501} 584}
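
The rework above hinges on read_all_xattrs() presenting one flat buffer — inline xattr area first, xattr node block appended — so the lookup and modify code can stay layout-agnostic; write_all_xattrs() then splits the buffer back according to new_hsize. A userspace sketch of that concatenation (sizes are stand-ins; the real inline size comes from inline_xattr_size(inode)):

#include <stdlib.h>
#include <string.h>

#define INLINE_SIZE 200		/* stand-in for inline_xattr_size(inode) */
#define BLOCK_SIZE  4096	/* stand-in for PAGE_SIZE */

/* mirror of read_all_xattrs(): inline area at the front, block behind it */
static char *build_flat_xattrs(const char *inline_area, const char *xattr_block)
{
	char *buf = calloc(1, INLINE_SIZE + BLOCK_SIZE);

	if (!buf)
		return NULL;
	memcpy(buf, inline_area, INLINE_SIZE);
	memcpy(buf + INLINE_SIZE, xattr_block, BLOCK_SIZE);
	return buf;
}

/* mirror of write_all_xattrs(): only spill past the inline area to the block */
static void split_flat_xattrs(const char *buf, char *inline_area, char *xattr_block)
{
	memcpy(inline_area, buf, INLINE_SIZE);
	memcpy(xattr_block, buf + INLINE_SIZE, BLOCK_SIZE);
}
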
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 3c0817bef25d..02a08fb88a15 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -51,7 +51,7 @@ struct f2fs_xattr_entry {
51 51
52#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) 52#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr))
53#define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr)) 53#define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr))
54#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1)) 54#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1))
55#define XATTR_ROUND (3) 55#define XATTR_ROUND (3)
56 56
57#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) 57#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND)
@@ -69,17 +69,16 @@ struct f2fs_xattr_entry {
69 !IS_XATTR_LAST_ENTRY(entry);\ 69 !IS_XATTR_LAST_ENTRY(entry);\
70 entry = XATTR_NEXT_ENTRY(entry)) 70 entry = XATTR_NEXT_ENTRY(entry))
71 71
72#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \
73 sizeof(struct node_footer) - sizeof(__u32))
72 74
73#define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \ 75#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \
74 sizeof(struct node_footer) - \ 76 sizeof(struct f2fs_xattr_header) - \
75 sizeof(__u32)) 77 sizeof(struct f2fs_xattr_entry))
76
77#define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \
78 sizeof(struct f2fs_xattr_entry))
79 78
80/* 79/*
81 * On-disk structure of f2fs_xattr 80 * On-disk structure of f2fs_xattr
82 * We use only 1 block for xattr. 81 * We use inline xattrs space + 1 block for xattr.
83 * 82 *
84 * +--------------------+ 83 * +--------------------+
85 * | f2fs_xattr_header | 84 * | f2fs_xattr_header |
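
Making MIN_OFFSET()/MAX_VALUE_LEN() functions of the inode is what lets inline xattr space raise the per-inode capacity. A userspace sketch of the arithmetic with stand-in sizes (the real values depend on sizeof(struct node_footer) and the other structures, which are not fixed here):

#include <stdio.h>

#define PAGE_SZ    4096u
#define FOOTER_SZ    24u	/* stand-in for sizeof(struct node_footer) */
#define HDR_SZ        8u	/* stand-in for sizeof(struct f2fs_xattr_header) */
#define ENTRY_SZ      8u	/* stand-in for sizeof(struct f2fs_xattr_entry) */
#define ALIGN4(x)  (((x) + 3u) & ~3u)	/* XATTR_ALIGN with XATTR_ROUND == 3 */

int main(void)
{
	unsigned int inline_sz = 200u;	/* stand-in for inline_xattr_size(inode) */
	unsigned int min_off = ALIGN4(inline_sz + PAGE_SZ - FOOTER_SZ - 4u /* __u32 */);
	unsigned int max_val = min_off - HDR_SZ - ENTRY_SZ;

	/* a non-zero inline_sz raises both limits for this inode */
	printf("MIN_OFFSET=%u MAX_VALUE_LEN=%u\n", min_off, max_val);
	return 0;
}
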
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 11b51bb55b42..0062da21dd8b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -147,7 +147,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
147 struct inode *inode = mapping->host; 147 struct inode *inode = mapping->host;
148 148
149 if (to > inode->i_size) { 149 if (to > inode->i_size) {
150 truncate_pagecache(inode, to, inode->i_size); 150 truncate_pagecache(inode, inode->i_size);
151 fat_truncate_blocks(inode, inode->i_size); 151 fat_truncate_blocks(inode, inode->i_size);
152 } 152 }
153} 153}
diff --git a/fs/file_table.c b/fs/file_table.c
index b44e4c559786..abdd15ad13c9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -311,8 +311,7 @@ void fput(struct file *file)
311 return; 311 return;
312 /* 312 /*
313 * After this task has run exit_task_work(), 313 * After this task has run exit_task_work(),
314 * task_work_add() will fail. free_ipc_ns()-> 314 * task_work_add() will fail. Fall through to delayed
315 * shm_destroy() can do this. Fall through to delayed
316 * fput to avoid leaking *file. 315 * fput to avoid leaking *file.
317 */ 316 */
318 } 317 }
@@ -385,6 +384,10 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
385 */ 384 */
386void file_sb_list_add(struct file *file, struct super_block *sb) 385void file_sb_list_add(struct file *file, struct super_block *sb)
387{ 386{
387 if (likely(!(file->f_mode & FMODE_WRITE)))
388 return;
389 if (!S_ISREG(file_inode(file)->i_mode))
390 return;
388 lg_local_lock(&files_lglock); 391 lg_local_lock(&files_lglock);
389 __file_sb_list_add(file, sb); 392 __file_sb_list_add(file, sb);
390 lg_local_unlock(&files_lglock); 393 lg_local_unlock(&files_lglock);
@@ -450,8 +453,6 @@ void mark_files_ro(struct super_block *sb)
450 453
451 lg_global_lock(&files_lglock); 454 lg_global_lock(&files_lglock);
452 do_file_list_for_each_entry(sb, f) { 455 do_file_list_for_each_entry(sb, f) {
453 if (!S_ISREG(file_inode(f)->i_mode))
454 continue;
455 if (!file_count(f)) 456 if (!file_count(f))
456 continue; 457 continue;
457 if (!(f->f_mode & FMODE_WRITE)) 458 if (!(f->f_mode & FMODE_WRITE))
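
The two file_table hunks are one optimization: filter at insertion time rather than at scan time, so only writable regular files ever land on the per-sb list and mark_files_ro() no longer re-tests S_ISREG() on every open file. A userspace analogue of the idea (all names here are illustrative):

#include <stdbool.h>
#include <stddef.h>

struct tracked_file {
	bool writable;
	bool regular;
	struct tracked_file *next;
};

static struct tracked_file *writable_list;

/* cheap test once at open time, mirroring the new file_sb_list_add() */
static void track_if_interesting(struct tracked_file *f)
{
	if (!f->writable || !f->regular)
		return;
	f->next = writable_list;
	writable_list = f;
	/* the remount-ro scan can now walk writable_list with no per-item checks */
}
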
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 68851ff2fd41..9f4935b8f208 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -69,7 +69,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
69{ 69{
70 struct super_block *sb = inode->i_sb; 70 struct super_block *sb = inode->i_sb;
71 71
72 if (strcmp(sb->s_type->name, "bdev") == 0) 72 if (sb_is_blkdev_sb(sb))
73 return inode->i_mapping->backing_dev_info; 73 return inode->i_mapping->backing_dev_info;
74 74
75 return sb->s_bdi; 75 return sb->s_bdi;
@@ -251,11 +251,13 @@ static int move_expired_inodes(struct list_head *delaying_queue,
251 if (work->older_than_this && 251 if (work->older_than_this &&
252 inode_dirtied_after(inode, *work->older_than_this)) 252 inode_dirtied_after(inode, *work->older_than_this))
253 break; 253 break;
254 list_move(&inode->i_wb_list, &tmp);
255 moved++;
256 if (sb_is_blkdev_sb(inode->i_sb))
257 continue;
254 if (sb && sb != inode->i_sb) 258 if (sb && sb != inode->i_sb)
255 do_sb_sort = 1; 259 do_sb_sort = 1;
256 sb = inode->i_sb; 260 sb = inode->i_sb;
257 list_move(&inode->i_wb_list, &tmp);
258 moved++;
259 } 261 }
260 262
261 /* just one sb in list, splice to dispatch_queue and we're done */ 263 /* just one sb in list, splice to dispatch_queue and we're done */
@@ -723,7 +725,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
723 return wrote; 725 return wrote;
724} 726}
725 727
726long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, 728static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
727 enum wb_reason reason) 729 enum wb_reason reason)
728{ 730{
729 struct wb_writeback_work work = { 731 struct wb_writeback_work work = {
@@ -1049,10 +1051,8 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
1049{ 1051{
1050 struct backing_dev_info *bdi; 1052 struct backing_dev_info *bdi;
1051 1053
1052 if (!nr_pages) { 1054 if (!nr_pages)
1053 nr_pages = global_page_state(NR_FILE_DIRTY) + 1055 nr_pages = get_nr_dirty_pages();
1054 global_page_state(NR_UNSTABLE_NFS);
1055 }
1056 1056
1057 rcu_read_lock(); 1057 rcu_read_lock();
1058 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1058 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
@@ -1173,6 +1173,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1173 bool wakeup_bdi = false; 1173 bool wakeup_bdi = false;
1174 bdi = inode_to_bdi(inode); 1174 bdi = inode_to_bdi(inode);
1175 1175
1176 spin_unlock(&inode->i_lock);
1177 spin_lock(&bdi->wb.list_lock);
1176 if (bdi_cap_writeback_dirty(bdi)) { 1178 if (bdi_cap_writeback_dirty(bdi)) {
1177 WARN(!test_bit(BDI_registered, &bdi->state), 1179 WARN(!test_bit(BDI_registered, &bdi->state),
1178 "bdi-%s not registered\n", bdi->name); 1180 "bdi-%s not registered\n", bdi->name);
@@ -1187,8 +1189,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1187 wakeup_bdi = true; 1189 wakeup_bdi = true;
1188 } 1190 }
1189 1191
1190 spin_unlock(&inode->i_lock);
1191 spin_lock(&bdi->wb.list_lock);
1192 inode->dirtied_when = jiffies; 1192 inode->dirtied_when = jiffies;
1193 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1193 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1194 spin_unlock(&bdi->wb.list_lock); 1194 spin_unlock(&bdi->wb.list_lock);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 0e91a3c9fdb2..b2a86e324aac 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -558,3 +558,75 @@ void __fscache_cookie_put(struct fscache_cookie *cookie)
558 558
559 _leave(""); 559 _leave("");
560} 560}
561
562/*
563 * check the consistency between the netfs inode and the backing cache
564 *
565 * NOTE: it only serves the no-index (data file) type
566 */
567int __fscache_check_consistency(struct fscache_cookie *cookie)
568{
569 struct fscache_operation *op;
570 struct fscache_object *object;
571 int ret;
572
573 _enter("%p,", cookie);
574
575 ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
576
577 if (fscache_wait_for_deferred_lookup(cookie) < 0)
578 return -ERESTARTSYS;
579
580 if (hlist_empty(&cookie->backing_objects))
581 return 0;
582
583 op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
584 if (!op)
585 return -ENOMEM;
586
587 fscache_operation_init(op, NULL, NULL);
588 op->flags = FSCACHE_OP_MYTHREAD |
589 (1 << FSCACHE_OP_WAITING) |
590 (1 << FSCACHE_OP_UNUSE_COOKIE);
591
592 spin_lock(&cookie->lock);
593
594 if (hlist_empty(&cookie->backing_objects))
595 goto inconsistent;
596 object = hlist_entry(cookie->backing_objects.first,
597 struct fscache_object, cookie_link);
598 if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
599 goto inconsistent;
600
601 op->debug_id = atomic_inc_return(&fscache_op_debug_id);
602
603 atomic_inc(&cookie->n_active);
604 if (fscache_submit_op(object, op) < 0)
605 goto submit_failed;
606
607 /* the work queue now carries its own ref on the object */
608 spin_unlock(&cookie->lock);
609
610 ret = fscache_wait_for_operation_activation(object, op,
611 NULL, NULL, NULL);
612 if (ret == 0) {
613 /* ask the cache to honour the operation */
614 ret = object->cache->ops->check_consistency(op);
615 fscache_op_complete(op, false);
616 } else if (ret == -ENOBUFS) {
617 ret = 0;
618 }
619
620 fscache_put_operation(op);
621 _leave(" = %d", ret);
622 return ret;
623
624submit_failed:
625 atomic_dec(&cookie->n_active);
626inconsistent:
627 spin_unlock(&cookie->lock);
628 kfree(op);
629 _leave(" = -ESTALE");
630 return -ESTALE;
631}
632EXPORT_SYMBOL(__fscache_check_consistency);
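
A sketch of how a netfs might consume the new check from, say, its file-open path. This assumes the usual fscache pattern of a fscache_check_consistency() wrapper around the exported __fscache_check_consistency(); the wrapper name and the error handling below are assumptions, not taken from this diff:

static int netfs_validate_cache(struct fscache_cookie *cookie)
{
	int ret;

	ret = fscache_check_consistency(cookie);	/* assumed wrapper */
	if (ret == -ESTALE) {
		/* backing object disagrees with the netfs inode: retire it */
		fscache_relinquish_cookie(cookie, 1);
	}
	return ret;
}
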
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 12d505bedb5c..4226f6680b06 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -130,6 +130,12 @@ extern void fscache_operation_gc(struct work_struct *);
130/* 130/*
131 * page.c 131 * page.c
132 */ 132 */
133extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *);
134extern int fscache_wait_for_operation_activation(struct fscache_object *,
135 struct fscache_operation *,
136 atomic_t *,
137 atomic_t *,
138 void (*)(struct fscache_operation *));
133extern void fscache_invalidate_writes(struct fscache_cookie *); 139extern void fscache_invalidate_writes(struct fscache_cookie *);
134 140
135/* 141/*
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index d479ab3c63e4..73899c1c3449 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -278,7 +278,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
278/* 278/*
279 * wait for a deferred lookup to complete 279 * wait for a deferred lookup to complete
280 */ 280 */
281static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) 281int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
282{ 282{
283 unsigned long jif; 283 unsigned long jif;
284 284
@@ -322,42 +322,46 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
322/* 322/*
323 * wait for an object to become active (or dead) 323 * wait for an object to become active (or dead)
324 */ 324 */
325static int fscache_wait_for_retrieval_activation(struct fscache_object *object, 325int fscache_wait_for_operation_activation(struct fscache_object *object,
326 struct fscache_retrieval *op, 326 struct fscache_operation *op,
327 atomic_t *stat_op_waits, 327 atomic_t *stat_op_waits,
328 atomic_t *stat_object_dead) 328 atomic_t *stat_object_dead,
329 void (*do_cancel)(struct fscache_operation *))
329{ 330{
330 int ret; 331 int ret;
331 332
332 if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags)) 333 if (!test_bit(FSCACHE_OP_WAITING, &op->flags))
333 goto check_if_dead; 334 goto check_if_dead;
334 335
335 _debug(">>> WT"); 336 _debug(">>> WT");
336 fscache_stat(stat_op_waits); 337 if (stat_op_waits)
337 if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, 338 fscache_stat(stat_op_waits);
339 if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
338 fscache_wait_bit_interruptible, 340 fscache_wait_bit_interruptible,
339 TASK_INTERRUPTIBLE) != 0) { 341 TASK_INTERRUPTIBLE) != 0) {
340 ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval); 342 ret = fscache_cancel_op(op, do_cancel);
341 if (ret == 0) 343 if (ret == 0)
342 return -ERESTARTSYS; 344 return -ERESTARTSYS;
343 345
344 /* it's been removed from the pending queue by another party, 346 /* it's been removed from the pending queue by another party,
345 * so we should get to run shortly */ 347 * so we should get to run shortly */
346 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, 348 wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
347 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 349 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
348 } 350 }
349 _debug("<<< GO"); 351 _debug("<<< GO");
350 352
351check_if_dead: 353check_if_dead:
352 if (op->op.state == FSCACHE_OP_ST_CANCELLED) { 354 if (op->state == FSCACHE_OP_ST_CANCELLED) {
353 fscache_stat(stat_object_dead); 355 if (stat_object_dead)
356 fscache_stat(stat_object_dead);
354 _leave(" = -ENOBUFS [cancelled]"); 357 _leave(" = -ENOBUFS [cancelled]");
355 return -ENOBUFS; 358 return -ENOBUFS;
356 } 359 }
357 if (unlikely(fscache_object_is_dead(object))) { 360 if (unlikely(fscache_object_is_dead(object))) {
358 pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state); 361 pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state);
359 fscache_cancel_op(&op->op, fscache_do_cancel_retrieval); 362 fscache_cancel_op(op, do_cancel);
360 fscache_stat(stat_object_dead); 363 if (stat_object_dead)
364 fscache_stat(stat_object_dead);
361 return -ENOBUFS; 365 return -ENOBUFS;
362 } 366 }
363 return 0; 367 return 0;
@@ -432,10 +436,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
432 436
433 /* we wait for the operation to become active, and then process it 437 /* we wait for the operation to become active, and then process it
434 * *here*, in this thread, and not in the thread pool */ 438 * *here*, in this thread, and not in the thread pool */
435 ret = fscache_wait_for_retrieval_activation( 439 ret = fscache_wait_for_operation_activation(
436 object, op, 440 object, &op->op,
437 __fscache_stat(&fscache_n_retrieval_op_waits), 441 __fscache_stat(&fscache_n_retrieval_op_waits),
438 __fscache_stat(&fscache_n_retrievals_object_dead)); 442 __fscache_stat(&fscache_n_retrievals_object_dead),
443 fscache_do_cancel_retrieval);
439 if (ret < 0) 444 if (ret < 0)
440 goto error; 445 goto error;
441 446
@@ -557,10 +562,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
557 562
558 /* we wait for the operation to become active, and then process it 563 /* we wait for the operation to become active, and then process it
559 * *here*, in this thread, and not in the thread pool */ 564 * *here*, in this thread, and not in the thread pool */
560 ret = fscache_wait_for_retrieval_activation( 565 ret = fscache_wait_for_operation_activation(
561 object, op, 566 object, &op->op,
562 __fscache_stat(&fscache_n_retrieval_op_waits), 567 __fscache_stat(&fscache_n_retrieval_op_waits),
563 __fscache_stat(&fscache_n_retrievals_object_dead)); 568 __fscache_stat(&fscache_n_retrievals_object_dead),
569 fscache_do_cancel_retrieval);
564 if (ret < 0) 570 if (ret < 0)
565 goto error; 571 goto error;
566 572
@@ -658,10 +664,11 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
658 664
659 fscache_stat(&fscache_n_alloc_ops); 665 fscache_stat(&fscache_n_alloc_ops);
660 666
661 ret = fscache_wait_for_retrieval_activation( 667 ret = fscache_wait_for_operation_activation(
662 object, op, 668 object, &op->op,
663 __fscache_stat(&fscache_n_alloc_op_waits), 669 __fscache_stat(&fscache_n_alloc_op_waits),
664 __fscache_stat(&fscache_n_allocs_object_dead)); 670 __fscache_stat(&fscache_n_allocs_object_dead),
671 fscache_do_cancel_retrieval);
665 if (ret < 0) 672 if (ret < 0)
666 goto error; 673 goto error;
667 674
@@ -694,6 +701,22 @@ nobufs:
694EXPORT_SYMBOL(__fscache_alloc_page); 701EXPORT_SYMBOL(__fscache_alloc_page);
695 702
696/* 703/*
704 * Unmark pages allocated in the readahead code path (via
705 * fscache_read_or_alloc_pages) after delegating to the base filesystem
706 */
707void __fscache_readpages_cancel(struct fscache_cookie *cookie,
708 struct list_head *pages)
709{
710 struct page *page;
711
712 list_for_each_entry(page, pages, lru) {
713 if (PageFsCache(page))
714 __fscache_uncache_page(cookie, page);
715 }
716}
717EXPORT_SYMBOL(__fscache_readpages_cancel);
718
719/*
697 * release a write op reference 720 * release a write op reference
698 */ 721 */
699static void fscache_release_write_op(struct fscache_operation *_op) 722static void fscache_release_write_op(struct fscache_operation *_op)
@@ -890,7 +913,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
890 (1 << FSCACHE_OP_WAITING) | 913 (1 << FSCACHE_OP_WAITING) |
891 (1 << FSCACHE_OP_UNUSE_COOKIE); 914 (1 << FSCACHE_OP_UNUSE_COOKIE);
892 915
893 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 916 ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM);
894 if (ret < 0) 917 if (ret < 0)
895 goto nomem_free; 918 goto nomem_free;
896 919
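
The cancel helper pairs with the read path: when fscache_read_or_alloc_pages() leaves pages marked PG_fscache but the netfs decides not to wait for the cache, those marks must be undone before the pages go back to the VM. A sketch of the intended call site (netfs_i_cookie(), the completion callback, and the server fallback are hypothetical; the two fscache calls are the real API):

static int netfs_readpages(struct file *file, struct address_space *mapping,
			   struct list_head *pages, unsigned nr_pages)
{
	struct fscache_cookie *cookie = netfs_i_cookie(mapping->host);	/* hypothetical */
	int ret;

	ret = fscache_read_or_alloc_pages(cookie, mapping, pages, &nr_pages,
					  netfs_cache_read_done,	/* hypothetical */
					  NULL, mapping_gfp_mask(mapping));
	if (ret == -ENOBUFS) {
		/* cache withdrew: strip PG_fscache before reading from the server */
		fscache_readpages_cancel(cookie, pages);
		ret = netfs_read_from_server(mapping, pages);	/* hypothetical */
	}
	return ret;
}
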
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index aef34b1e635e..adbfd66b380f 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -568,6 +568,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
568 568
569 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); 569 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
570} 570}
571static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL);
571 572
572static ssize_t cuse_class_abort_store(struct device *dev, 573static ssize_t cuse_class_abort_store(struct device *dev,
573 struct device_attribute *attr, 574 struct device_attribute *attr,
@@ -578,12 +579,14 @@ static ssize_t cuse_class_abort_store(struct device *dev,
578 fuse_abort_conn(&cc->fc); 579 fuse_abort_conn(&cc->fc);
579 return count; 580 return count;
580} 581}
582static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store);
581 583
582static struct device_attribute cuse_class_dev_attrs[] = { 584static struct attribute *cuse_class_dev_attrs[] = {
583 __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), 585 &dev_attr_waiting.attr,
584 __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), 586 &dev_attr_abort.attr,
585 { } 587 NULL,
586}; 588};
589ATTRIBUTE_GROUPS(cuse_class_dev);
587 590
588static struct miscdevice cuse_miscdev = { 591static struct miscdevice cuse_miscdev = {
589 .minor = MISC_DYNAMIC_MINOR, 592 .minor = MISC_DYNAMIC_MINOR,
@@ -609,7 +612,7 @@ static int __init cuse_init(void)
609 if (IS_ERR(cuse_class)) 612 if (IS_ERR(cuse_class))
610 return PTR_ERR(cuse_class); 613 return PTR_ERR(cuse_class);
611 614
612 cuse_class->dev_attrs = cuse_class_dev_attrs; 615 cuse_class->dev_groups = cuse_class_dev_groups;
613 616
614 rc = misc_register(&cuse_miscdev); 617 rc = misc_register(&cuse_miscdev);
615 if (rc) { 618 if (rc) {
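
The cuse conversion above follows the tree-wide move from open-coded dev_attrs arrays to attribute groups: DEVICE_ATTR() names each attribute, ATTRIBUTE_GROUPS() generates the *_groups table, and the class points at dev_groups. The same pattern in miniature, with a hypothetical 'demo' attribute:

#include <linux/device.h>

static ssize_t demo_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%d\n", 42);	/* illustrative payload */
}
static DEVICE_ATTR(demo, 0444, demo_show, NULL);	/* defines dev_attr_demo */

static struct attribute *demo_class_dev_attrs[] = {
	&dev_attr_demo.attr,
	NULL,
};
ATTRIBUTE_GROUPS(demo_class_dev);	/* defines demo_class_dev_groups */

/* then: some_class->dev_groups = demo_class_dev_groups; */
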
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1d55f9465400..ef74ad5fd362 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1765,11 +1765,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1765/* Look up request on processing list by unique ID */ 1765/* Look up request on processing list by unique ID */
1766static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) 1766static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
1767{ 1767{
1768 struct list_head *entry; 1768 struct fuse_req *req;
1769 1769
1770 list_for_each(entry, &fc->processing) { 1770 list_for_each_entry(req, &fc->processing, list) {
1771 struct fuse_req *req;
1772 req = list_entry(entry, struct fuse_req, list);
1773 if (req->in.h.unique == unique || req->intr_unique == unique) 1771 if (req->in.h.unique == unique || req->intr_unique == unique)
1774 return req; 1772 return req;
1775 } 1773 }
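
The dev.c hunk is a pure idiom cleanup: list_for_each_entry() folds the list_entry() container lookup into the loop header. The same shape in miniature (struct item is hypothetical):

#include <linux/list.h>
#include <linux/types.h>

struct item {
	struct list_head list;
	u64 unique;
};

static struct item *item_find(struct list_head *head, u64 unique)
{
	struct item *it;

	/* iterator variable is already the containing struct, not the node */
	list_for_each_entry(it, head, list)
		if (it->unique == unique)
			return it;
	return NULL;
}
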
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 72a5d5b04494..b7989f2ab4c4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -182,10 +182,12 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
182 struct inode *inode; 182 struct inode *inode;
183 struct dentry *parent; 183 struct dentry *parent;
184 struct fuse_conn *fc; 184 struct fuse_conn *fc;
185 struct fuse_inode *fi;
186 int ret;
185 187
186 inode = ACCESS_ONCE(entry->d_inode); 188 inode = ACCESS_ONCE(entry->d_inode);
187 if (inode && is_bad_inode(inode)) 189 if (inode && is_bad_inode(inode))
188 return 0; 190 goto invalid;
189 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 191 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
190 int err; 192 int err;
191 struct fuse_entry_out outarg; 193 struct fuse_entry_out outarg;
@@ -195,20 +197,23 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
195 197
196 /* For negative dentries, always do a fresh lookup */ 198 /* For negative dentries, always do a fresh lookup */
197 if (!inode) 199 if (!inode)
198 return 0; 200 goto invalid;
199 201
202 ret = -ECHILD;
200 if (flags & LOOKUP_RCU) 203 if (flags & LOOKUP_RCU)
201 return -ECHILD; 204 goto out;
202 205
203 fc = get_fuse_conn(inode); 206 fc = get_fuse_conn(inode);
204 req = fuse_get_req_nopages(fc); 207 req = fuse_get_req_nopages(fc);
208 ret = PTR_ERR(req);
205 if (IS_ERR(req)) 209 if (IS_ERR(req))
206 return 0; 210 goto out;
207 211
208 forget = fuse_alloc_forget(); 212 forget = fuse_alloc_forget();
209 if (!forget) { 213 if (!forget) {
210 fuse_put_request(fc, req); 214 fuse_put_request(fc, req);
211 return 0; 215 ret = -ENOMEM;
216 goto out;
212 } 217 }
213 218
214 attr_version = fuse_get_attr_version(fc); 219 attr_version = fuse_get_attr_version(fc);
@@ -224,10 +229,10 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
224 if (!err && !outarg.nodeid) 229 if (!err && !outarg.nodeid)
225 err = -ENOENT; 230 err = -ENOENT;
226 if (!err) { 231 if (!err) {
227 struct fuse_inode *fi = get_fuse_inode(inode); 232 fi = get_fuse_inode(inode);
228 if (outarg.nodeid != get_node_id(inode)) { 233 if (outarg.nodeid != get_node_id(inode)) {
229 fuse_queue_forget(fc, forget, outarg.nodeid, 1); 234 fuse_queue_forget(fc, forget, outarg.nodeid, 1);
230 return 0; 235 goto invalid;
231 } 236 }
232 spin_lock(&fc->lock); 237 spin_lock(&fc->lock);
233 fi->nlookup++; 238 fi->nlookup++;
@@ -235,21 +240,33 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
235 } 240 }
236 kfree(forget); 241 kfree(forget);
237 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 242 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
238 return 0; 243 goto invalid;
239 244
240 fuse_change_attributes(inode, &outarg.attr, 245 fuse_change_attributes(inode, &outarg.attr,
241 entry_attr_timeout(&outarg), 246 entry_attr_timeout(&outarg),
242 attr_version); 247 attr_version);
243 fuse_change_entry_timeout(entry, &outarg); 248 fuse_change_entry_timeout(entry, &outarg);
244 } else if (inode) { 249 } else if (inode) {
245 fc = get_fuse_conn(inode); 250 fi = get_fuse_inode(inode);
246 if (fc->readdirplus_auto) { 251 if (flags & LOOKUP_RCU) {
252 if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state))
253 return -ECHILD;
254 } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) {
247 parent = dget_parent(entry); 255 parent = dget_parent(entry);
248 fuse_advise_use_readdirplus(parent->d_inode); 256 fuse_advise_use_readdirplus(parent->d_inode);
249 dput(parent); 257 dput(parent);
250 } 258 }
251 } 259 }
252 return 1; 260 ret = 1;
261out:
262 return ret;
263
264invalid:
265 ret = 0;
266
267 if (!(flags & LOOKUP_RCU) && check_submounts_and_drop(entry) != 0)
268 ret = 1;
269 goto out;
253} 270}
254 271
255static int invalid_nodeid(u64 nodeid) 272static int invalid_nodeid(u64 nodeid)
@@ -267,26 +284,6 @@ int fuse_valid_type(int m)
267 S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); 284 S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
268} 285}
269 286
270/*
271 * Add a directory inode to a dentry, ensuring that no other dentry
272 * refers to this inode. Called with fc->inst_mutex.
273 */
274static struct dentry *fuse_d_add_directory(struct dentry *entry,
275 struct inode *inode)
276{
277 struct dentry *alias = d_find_alias(inode);
278 if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
279 /* This tries to shrink the subtree below alias */
280 fuse_invalidate_entry(alias);
281 dput(alias);
282 if (!hlist_empty(&inode->i_dentry))
283 return ERR_PTR(-EBUSY);
284 } else {
285 dput(alias);
286 }
287 return d_splice_alias(inode, entry);
288}
289
290int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, 287int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
291 struct fuse_entry_out *outarg, struct inode **inode) 288 struct fuse_entry_out *outarg, struct inode **inode)
292{ 289{
@@ -345,6 +342,24 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
345 return err; 342 return err;
346} 343}
347 344
345static struct dentry *fuse_materialise_dentry(struct dentry *dentry,
346 struct inode *inode)
347{
348 struct dentry *newent;
349
350 if (inode && S_ISDIR(inode->i_mode)) {
351 struct fuse_conn *fc = get_fuse_conn(inode);
352
353 mutex_lock(&fc->inst_mutex);
354 newent = d_materialise_unique(dentry, inode);
355 mutex_unlock(&fc->inst_mutex);
356 } else {
357 newent = d_materialise_unique(dentry, inode);
358 }
359
360 return newent;
361}
362
348static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, 363static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
349 unsigned int flags) 364 unsigned int flags)
350{ 365{
@@ -352,7 +367,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
352 struct fuse_entry_out outarg; 367 struct fuse_entry_out outarg;
353 struct inode *inode; 368 struct inode *inode;
354 struct dentry *newent; 369 struct dentry *newent;
355 struct fuse_conn *fc = get_fuse_conn(dir);
356 bool outarg_valid = true; 370 bool outarg_valid = true;
357 371
358 err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, 372 err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
@@ -368,16 +382,10 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
368 if (inode && get_node_id(inode) == FUSE_ROOT_ID) 382 if (inode && get_node_id(inode) == FUSE_ROOT_ID)
369 goto out_iput; 383 goto out_iput;
370 384
371 if (inode && S_ISDIR(inode->i_mode)) { 385 newent = fuse_materialise_dentry(entry, inode);
372 mutex_lock(&fc->inst_mutex); 386 err = PTR_ERR(newent);
373 newent = fuse_d_add_directory(entry, inode); 387 if (IS_ERR(newent))
374 mutex_unlock(&fc->inst_mutex); 388 goto out_err;
375 err = PTR_ERR(newent);
376 if (IS_ERR(newent))
377 goto out_iput;
378 } else {
379 newent = d_splice_alias(inode, entry);
380 }
381 389
382 entry = newent ? newent : entry; 390 entry = newent ? newent : entry;
383 if (outarg_valid) 391 if (outarg_valid)
@@ -1060,6 +1068,8 @@ static int fuse_access(struct inode *inode, int mask)
1060 struct fuse_access_in inarg; 1068 struct fuse_access_in inarg;
1061 int err; 1069 int err;
1062 1070
1071 BUG_ON(mask & MAY_NOT_BLOCK);
1072
1063 if (fc->no_access) 1073 if (fc->no_access)
1064 return 0; 1074 return 0;
1065 1075
@@ -1147,9 +1157,6 @@ static int fuse_permission(struct inode *inode, int mask)
1147 noticed immediately, only after the attribute 1157 noticed immediately, only after the attribute
1148 timeout has expired */ 1158 timeout has expired */
1149 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { 1159 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
1150 if (mask & MAY_NOT_BLOCK)
1151 return -ECHILD;
1152
1153 err = fuse_access(inode, mask); 1160 err = fuse_access(inode, mask);
1154 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { 1161 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
1155 if (!(inode->i_mode & S_IXUGO)) { 1162 if (!(inode->i_mode & S_IXUGO)) {
@@ -1174,6 +1181,8 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
1174 return -EIO; 1181 return -EIO;
1175 if (reclen > nbytes) 1182 if (reclen > nbytes)
1176 break; 1183 break;
1184 if (memchr(dirent->name, '/', dirent->namelen) != NULL)
1185 return -EIO;
1177 1186
1178 if (!dir_emit(ctx, dirent->name, dirent->namelen, 1187 if (!dir_emit(ctx, dirent->name, dirent->namelen,
1179 dirent->ino, dirent->type)) 1188 dirent->ino, dirent->type))
@@ -1275,18 +1284,10 @@ static int fuse_direntplus_link(struct file *file,
1275 if (!inode) 1284 if (!inode)
1276 goto out; 1285 goto out;
1277 1286
1278 if (S_ISDIR(inode->i_mode)) { 1287 alias = fuse_materialise_dentry(dentry, inode);
1279 mutex_lock(&fc->inst_mutex); 1288 err = PTR_ERR(alias);
1280 alias = fuse_d_add_directory(dentry, inode); 1289 if (IS_ERR(alias))
1281 mutex_unlock(&fc->inst_mutex); 1290 goto out;
1282 err = PTR_ERR(alias);
1283 if (IS_ERR(alias)) {
1284 iput(inode);
1285 goto out;
1286 }
1287 } else {
1288 alias = d_splice_alias(inode, dentry);
1289 }
1290 1291
1291 if (alias) { 1292 if (alias) {
1292 dput(dentry); 1293 dput(dentry);
@@ -1294,6 +1295,8 @@ static int fuse_direntplus_link(struct file *file,
1294 } 1295 }
1295 1296
1296found: 1297found:
1298 if (fc->readdirplus_auto)
1299 set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
1297 fuse_change_entry_timeout(dentry, o); 1300 fuse_change_entry_timeout(dentry, o);
1298 1301
1299 err = 0; 1302 err = 0;
@@ -1320,6 +1323,8 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1320 return -EIO; 1323 return -EIO;
1321 if (reclen > nbytes) 1324 if (reclen > nbytes)
1322 break; 1325 break;
1326 if (memchr(dirent->name, '/', dirent->namelen) != NULL)
1327 return -EIO;
1323 1328
1324 if (!over) { 1329 if (!over) {
1325 /* We fill entries into dstbuf only as much as 1330 /* We fill entries into dstbuf only as much as
@@ -1590,6 +1595,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1590 struct file *file) 1595 struct file *file)
1591{ 1596{
1592 struct fuse_conn *fc = get_fuse_conn(inode); 1597 struct fuse_conn *fc = get_fuse_conn(inode);
1598 struct fuse_inode *fi = get_fuse_inode(inode);
1593 struct fuse_req *req; 1599 struct fuse_req *req;
1594 struct fuse_setattr_in inarg; 1600 struct fuse_setattr_in inarg;
1595 struct fuse_attr_out outarg; 1601 struct fuse_attr_out outarg;
@@ -1617,8 +1623,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1617 if (IS_ERR(req)) 1623 if (IS_ERR(req))
1618 return PTR_ERR(req); 1624 return PTR_ERR(req);
1619 1625
1620 if (is_truncate) 1626 if (is_truncate) {
1621 fuse_set_nowrite(inode); 1627 fuse_set_nowrite(inode);
1628 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1629 }
1622 1630
1623 memset(&inarg, 0, sizeof(inarg)); 1631 memset(&inarg, 0, sizeof(inarg));
1624 memset(&outarg, 0, sizeof(outarg)); 1632 memset(&outarg, 0, sizeof(outarg));
@@ -1676,16 +1684,18 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1676 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 1684 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
1677 */ 1685 */
1678 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { 1686 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
1679 truncate_pagecache(inode, oldsize, outarg.attr.size); 1687 truncate_pagecache(inode, outarg.attr.size);
1680 invalidate_inode_pages2(inode->i_mapping); 1688 invalidate_inode_pages2(inode->i_mapping);
1681 } 1689 }
1682 1690
1691 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1683 return 0; 1692 return 0;
1684 1693
1685error: 1694error:
1686 if (is_truncate) 1695 if (is_truncate)
1687 fuse_release_nowrite(inode); 1696 fuse_release_nowrite(inode);
1688 1697
1698 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1689 return err; 1699 return err;
1690} 1700}
1691 1701
@@ -1749,6 +1759,8 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
1749 fc->no_setxattr = 1; 1759 fc->no_setxattr = 1;
1750 err = -EOPNOTSUPP; 1760 err = -EOPNOTSUPP;
1751 } 1761 }
1762 if (!err)
1763 fuse_invalidate_attr(inode);
1752 return err; 1764 return err;
1753} 1765}
1754 1766
@@ -1878,6 +1890,8 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1878 fc->no_removexattr = 1; 1890 fc->no_removexattr = 1;
1879 err = -EOPNOTSUPP; 1891 err = -EOPNOTSUPP;
1880 } 1892 }
1893 if (!err)
1894 fuse_invalidate_attr(inode);
1881 return err; 1895 return err;
1882} 1896}
1883 1897
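
All the revalidate return paths above now converge on one invalidation tail: before reporting a dentry invalid, a non-RCU caller first tries check_submounts_and_drop(), and if that fails (something is mounted underneath) the dentry is kept valid instead. Condensed from the diff above, not a verbatim kernel excerpt:

static int revalidate_tail(struct dentry *entry, unsigned int flags, int ret)
{
	if (ret)
		return ret;				/* still valid */
	if (!(flags & LOOKUP_RCU) && check_submounts_and_drop(entry) != 0)
		return 1;				/* busy mount point: keep it */
	return 0;					/* safe to invalidate */
}
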
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5c121fe19c5f..4598345ab87d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -629,7 +629,8 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
629 struct fuse_inode *fi = get_fuse_inode(inode); 629 struct fuse_inode *fi = get_fuse_inode(inode);
630 630
631 spin_lock(&fc->lock); 631 spin_lock(&fc->lock);
632 if (attr_ver == fi->attr_version && size < inode->i_size) { 632 if (attr_ver == fi->attr_version && size < inode->i_size &&
633 !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
633 fi->attr_version = ++fc->attr_version; 634 fi->attr_version = ++fc->attr_version;
634 i_size_write(inode, size); 635 i_size_write(inode, size);
635 } 636 }
@@ -1032,12 +1033,16 @@ static ssize_t fuse_perform_write(struct file *file,
1032{ 1033{
1033 struct inode *inode = mapping->host; 1034 struct inode *inode = mapping->host;
1034 struct fuse_conn *fc = get_fuse_conn(inode); 1035 struct fuse_conn *fc = get_fuse_conn(inode);
1036 struct fuse_inode *fi = get_fuse_inode(inode);
1035 int err = 0; 1037 int err = 0;
1036 ssize_t res = 0; 1038 ssize_t res = 0;
1037 1039
1038 if (is_bad_inode(inode)) 1040 if (is_bad_inode(inode))
1039 return -EIO; 1041 return -EIO;
1040 1042
1043 if (inode->i_size < pos + iov_iter_count(ii))
1044 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1045
1041 do { 1046 do {
1042 struct fuse_req *req; 1047 struct fuse_req *req;
1043 ssize_t count; 1048 ssize_t count;
@@ -1073,6 +1078,7 @@ static ssize_t fuse_perform_write(struct file *file,
1073 if (res > 0) 1078 if (res > 0)
1074 fuse_write_update_size(inode, pos); 1079 fuse_write_update_size(inode, pos);
1075 1080
1081 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1076 fuse_invalidate_attr(inode); 1082 fuse_invalidate_attr(inode);
1077 1083
1078 return res > 0 ? res : err; 1084 return res > 0 ? res : err;
@@ -1529,7 +1535,6 @@ static int fuse_writepage_locked(struct page *page)
1529 1535
1530 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); 1536 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
1531 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1537 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1532 end_page_writeback(page);
1533 1538
1534 spin_lock(&fc->lock); 1539 spin_lock(&fc->lock);
1535 list_add(&req->writepages_entry, &fi->writepages); 1540 list_add(&req->writepages_entry, &fi->writepages);
@@ -1537,6 +1542,8 @@ static int fuse_writepage_locked(struct page *page)
1537 fuse_flush_writepages(inode); 1542 fuse_flush_writepages(inode);
1538 spin_unlock(&fc->lock); 1543 spin_unlock(&fc->lock);
1539 1544
1545 end_page_writeback(page);
1546
1540 return 0; 1547 return 0;
1541 1548
1542err_free: 1549err_free:
@@ -2460,6 +2467,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2460{ 2467{
2461 struct fuse_file *ff = file->private_data; 2468 struct fuse_file *ff = file->private_data;
2462 struct inode *inode = file->f_inode; 2469 struct inode *inode = file->f_inode;
2470 struct fuse_inode *fi = get_fuse_inode(inode);
2463 struct fuse_conn *fc = ff->fc; 2471 struct fuse_conn *fc = ff->fc;
2464 struct fuse_req *req; 2472 struct fuse_req *req;
2465 struct fuse_fallocate_in inarg = { 2473 struct fuse_fallocate_in inarg = {
@@ -2477,10 +2485,20 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 
 	if (lock_inode) {
 		mutex_lock(&inode->i_mutex);
-		if (mode & FALLOC_FL_PUNCH_HOLE)
-			fuse_set_nowrite(inode);
+		if (mode & FALLOC_FL_PUNCH_HOLE) {
+			loff_t endbyte = offset + length - 1;
+			err = filemap_write_and_wait_range(inode->i_mapping,
+							   offset, endbyte);
+			if (err)
+				goto out;
+
+			fuse_sync_writes(inode);
+		}
 	}
 
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
 	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
@@ -2513,11 +2531,11 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	fuse_invalidate_attr(inode);
 
 out:
-	if (lock_inode) {
-		if (mode & FALLOC_FL_PUNCH_HOLE)
-			fuse_release_nowrite(inode);
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+	if (lock_inode)
 		mutex_unlock(&inode->i_mutex);
-	}
 
 	return err;
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fde7249a3a96..5b9e6f3b6aef 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -115,6 +115,10 @@ struct fuse_inode {
 enum {
 	/** Advise readdirplus */
 	FUSE_I_ADVISE_RDPLUS,
+	/** Initialized with readdirplus */
+	FUSE_I_INIT_RDPLUS,
+	/** An operation changing file size is in progress */
+	FUSE_I_SIZE_UNSTABLE,
 };
 
 struct fuse_conn;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0b578598c6ac..a8ce6dab60a0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -201,7 +201,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 	struct timespec old_mtime;
 
 	spin_lock(&fc->lock);
-	if (attr_version != 0 && fi->attr_version > attr_version) {
+	if ((attr_version != 0 && fi->attr_version > attr_version) ||
+	    test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
 		spin_unlock(&fc->lock);
 		return;
 	}
@@ -217,7 +218,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 		bool inval = false;
 
 		if (oldsize != attr->size) {
-			truncate_pagecache(inode, oldsize, attr->size);
+			truncate_pagecache(inode, attr->size);
 			inval = true;
 		} else if (fc->auto_inval_data) {
 			struct timespec new_mtime = {
@@ -929,7 +930,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 	fc->bdi.name = "fuse";
 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	/* fuse does it's own writeback accounting */
-	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
+	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
 
 	err = bdi_init(&fc->bdi);
 	if (err)
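The fuse hunks above cooperate through a single flag: every path that is about to change i_size (an extending write in fuse_perform_write(), a size-changing fallocate) sets FUSE_I_SIZE_UNSTABLE first, and fuse_change_attributes() discards server-supplied attributes while the bit is set, so a racing attribute reply cannot truncate the page cache mid-operation. A minimal sketch of the pattern, not a literal copy of any one function above:

	struct fuse_inode *fi = get_fuse_inode(inode);

	set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
	/* ... perform the operation that will move i_size ... */
	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	/* and on the attribute-update side, under fc->lock: */
	if (test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state))
		return;	/* drop the racing update; attrs are refreshed later */

BDI_CAP_STRICTLIMIT in fuse_bdi_init() is a related writeback tweak: it confines fuse's dirty pages to the per-bdi dirty limit rather than only the global one.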
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ee48ad37d9c0..1f7d8057ea68 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -122,14 +122,13 @@ out:
 }
 
 /**
- * gfs2_writeback_writepage - Write page for writeback mappings
+ * gfs2_writepage - Write page for writeback mappings
  * @page: The page
  * @wbc: The writeback control
  *
  */
 
-static int gfs2_writeback_writepage(struct page *page,
-				    struct writeback_control *wbc)
+static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
 
@@ -141,32 +140,6 @@ static int gfs2_writeback_writepage(struct page *page,
 }
 
 /**
- * gfs2_ordered_writepage - Write page for ordered data files
- * @page: The page to write
- * @wbc: The writeback control
- *
- */
-
-static int gfs2_ordered_writepage(struct page *page,
-				  struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	int ret;
-
-	ret = gfs2_writepage_common(page, wbc);
-	if (ret <= 0)
-		return ret;
-
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, inode->i_sb->s_blocksize,
-				     (1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-	gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1);
-	return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
-}
-
-/**
  * __gfs2_jdata_writepage - The core of jdata writepage
  * @page: The page to write
  * @wbc: The writeback control
@@ -842,6 +815,8 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned int to = from + len;
 	int ret;
+	struct gfs2_trans *tr = current->journal_info;
+	BUG_ON(!tr);
 
 	BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
 
@@ -852,8 +827,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 		goto failed;
 	}
 
-	gfs2_trans_add_meta(ip->i_gl, dibh);
-
 	if (gfs2_is_stuffed(ip))
 		return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
 
@@ -861,6 +834,11 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 		gfs2_page_add_databufs(ip, page, from, to);
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (tr->tr_num_buf_new)
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	else
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+
 
 	if (inode == sdp->sd_rindex) {
 		adjust_fs_space(inode);
@@ -1107,7 +1085,7 @@ cannot_release:
 }
 
 static const struct address_space_operations gfs2_writeback_aops = {
-	.writepage = gfs2_writeback_writepage,
+	.writepage = gfs2_writepage,
 	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
@@ -1123,7 +1101,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
 };
 
 static const struct address_space_operations gfs2_ordered_aops = {
-	.writepage = gfs2_ordered_writepage,
+	.writepage = gfs2_writepage,
 	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5e2f56fccf6b..62a65fc448dc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1016,7 +1016,7 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize
 		chunk = oldsize - newsize;
 		if (chunk > max_chunk)
 			chunk = max_chunk;
-		truncate_pagecache(inode, oldsize, oldsize - chunk);
+		truncate_pagecache(inode, oldsize - chunk);
 		oldsize -= chunk;
 		gfs2_trans_end(sdp);
 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
@@ -1067,7 +1067,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 	if (journaled)
 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
 	else
-		truncate_pagecache(inode, oldsize, newsize);
+		truncate_pagecache(inode, newsize);
 
 	if (error) {
 		brelse(dibh);
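These two call sites track a kernel-wide API change: truncate_pagecache() no longer takes the old file size, since the helper can rely on the inode itself. Before and after, sketched:

	/* old: truncate_pagecache(struct inode *inode, loff_t old, loff_t new) */
	truncate_pagecache(inode, oldsize, newsize);

	/* new: the old size is no longer passed by the caller */
	truncate_pagecache(inode, newsize);

The same mechanical conversion appears in the fuse, hfs, hfsplus and hpfs hunks of this diff.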
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index f2448ab2aac5..d3a5d4e29ba5 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -93,12 +93,9 @@ invalid_gunlock:
 	if (!had_lock)
 		gfs2_glock_dq_uninit(&d_gh);
 invalid:
-	if (inode && S_ISDIR(inode->i_mode)) {
-		if (have_submounts(dentry))
-			goto valid;
-		shrink_dcache_parent(dentry);
-	}
-	d_drop(dentry);
+	if (check_submounts_and_drop(dentry) != 0)
+		goto valid;
+
 	dput(parent);
 	return 0;
 
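check_submounts_and_drop() packages the open-coded have_submounts()/shrink_dcache_parent()/d_drop() sequence into one helper that prunes the subtree and drops the dentry, returning non-zero when something is mounted beneath it. A sketch of the expected use from a ->d_revalidate implementation (the function name here is hypothetical):

	static int example_d_revalidate(struct dentry *dentry, unsigned int flags)
	{
		/* ... dentry found to be stale ... */
		if (check_submounts_and_drop(dentry) != 0)
			return 1;	/* busy with submounts: keep it for now */
		return 0;		/* dropped: report the dentry invalid */
	}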
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 0cb4c1557f20..2e5fc268d324 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1859,7 +1859,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
-	ht = kzalloc(size, GFP_NOFS);
+	ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
 	if (ht == NULL)
 		ht = vzalloc(size);
 	if (!ht)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 72c3866a7320..0621b46d474d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -650,7 +650,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 {
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
+	int sync_state = inode->i_state & I_DIRTY;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	int ret = 0, ret1 = 0;
 
@@ -660,6 +660,8 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 		return ret1;
 	}
 
+	if (!gfs2_is_jdata(ip))
+		sync_state &= ~I_DIRTY_PAGES;
 	if (datasync)
 		sync_state &= ~I_DIRTY_SYNC;
 
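The rewritten sync_state computation leans on I_DIRTY being the union of the three dirty bits, so the jdata special case can be expressed as masking. Sketched with the masks as defined in include/linux/fs.h of this era:

	/* #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) */
	int sync_state = inode->i_state & I_DIRTY;

	if (!gfs2_is_jdata(ip))
		sync_state &= ~I_DIRTY_PAGES;	/* ordinary data is synced via the mapping */
	if (datasync)
		sync_state &= ~I_DIRTY_SYNC;	/* fdatasync may skip pure timestamp dirt */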
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 544a809819c3..c2f41b4d00b9 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1411,7 +1411,6 @@ __acquires(&lru_lock)
 		if (demote_ok(gl))
 			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 		WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
-		smp_mb__after_clear_bit();
 		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 			gfs2_glock_put_nolock(gl);
 		spin_unlock(&gl->gl_spin);
@@ -1428,21 +1427,22 @@ __acquires(&lru_lock)
  * gfs2_dispose_glock_lru() above.
  */
 
-static void gfs2_scan_glock_lru(int nr)
+static long gfs2_scan_glock_lru(int nr)
 {
 	struct gfs2_glock *gl;
 	LIST_HEAD(skipped);
 	LIST_HEAD(dispose);
+	long freed = 0;
 
 	spin_lock(&lru_lock);
-	while(nr && !list_empty(&lru_list)) {
+	while ((nr-- >= 0) && !list_empty(&lru_list)) {
 		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
 
 		/* Test for being demotable */
 		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 			list_move(&gl->gl_lru, &dispose);
 			atomic_dec(&lru_count);
-			nr--;
+			freed++;
 			continue;
 		}
 
@@ -1452,23 +1452,28 @@ static void gfs2_scan_glock_lru(int nr)
 	if (!list_empty(&dispose))
 		gfs2_dispose_glock_lru(&dispose);
 	spin_unlock(&lru_lock);
+
+	return freed;
 }
 
-static int gfs2_shrink_glock_memory(struct shrinker *shrink,
-				    struct shrink_control *sc)
+static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink,
+					    struct shrink_control *sc)
 {
-	if (sc->nr_to_scan) {
-		if (!(sc->gfp_mask & __GFP_FS))
-			return -1;
-		gfs2_scan_glock_lru(sc->nr_to_scan);
-	}
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+	return gfs2_scan_glock_lru(sc->nr_to_scan);
+}
 
-	return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
+static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink,
+					     struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_read(&lru_count));
 }
 
 static struct shrinker glock_shrinker = {
-	.shrink = gfs2_shrink_glock_memory,
 	.seeks = DEFAULT_SEEKS,
+	.count_objects = gfs2_glock_shrink_count,
+	.scan_objects = gfs2_glock_shrink_scan,
 };
 
 /**
@@ -1488,7 +1493,7 @@ static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
 
 	rcu_read_lock();
 	hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
-		if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref))
+		if ((gl->gl_sbd == sdp) && atomic_inc_not_zero(&gl->gl_ref))
 			examiner(gl);
 	}
 	rcu_read_unlock();
@@ -1508,18 +1513,17 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
  * thaw_glock - thaw out a glock which has an unprocessed reply waiting
  * @gl: The glock to thaw
  *
- * N.B. When we freeze a glock, we leave a ref to the glock outstanding,
- * so this has to result in the ref count being dropped by one.
  */
 
 static void thaw_glock(struct gfs2_glock *gl)
 {
 	if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
-		return;
+		goto out;
 	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
-	gfs2_glock_hold(gl);
-	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) {
+out:
 		gfs2_glock_put(gl);
+	}
 }
 
 /**
@@ -1536,7 +1540,6 @@ static void clear_glock(struct gfs2_glock *gl)
 	if (gl->gl_state != LM_ST_UNLOCKED)
 		handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 	spin_unlock(&gl->gl_spin);
-	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put(gl);
 }
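The glock shrinker rework above (and the matching quota shrinker change further down) follows the split of struct shrinker into separate callbacks: ->count_objects reports how many objects are freeable, and ->scan_objects frees up to sc->nr_to_scan of them, returning the number actually freed or SHRINK_STOP when reclaim cannot proceed in the current context. A minimal sketch; the example_* names are hypothetical:

	static unsigned long example_shrink_count(struct shrinker *shrink,
						  struct shrink_control *sc)
	{
		return vfs_pressure_ratio(atomic_read(&example_lru_count));
	}

	static unsigned long example_shrink_scan(struct shrinker *shrink,
						 struct shrink_control *sc)
	{
		if (!(sc->gfp_mask & __GFP_FS))
			return SHRINK_STOP;	/* avoid recursing into the fs */
		return example_free_objects(sc->nr_to_scan);
	}

	static struct shrinker example_shrinker = {
		.count_objects	= example_shrink_count,
		.scan_objects	= example_shrink_scan,
		.seeks		= DEFAULT_SEEKS,
	};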
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 64915eeae5a7..ced3257f06e8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -694,8 +694,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 
 	mark_inode_dirty(inode);
 	d_instantiate(dentry, inode);
-	if (file)
+	if (file) {
+		*opened |= FILE_CREATED;
 		error = finish_open(file, dentry, gfs2_open_common, opened);
+	}
 	gfs2_glock_dq_uninit(ghs);
 	gfs2_glock_dq_uninit(ghs + 1);
 	return error;
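Setting FILE_CREATED follows the finish_open() contract: a filesystem that actually created the inode during an atomic open must record that in *opened before calling finish_open(), so the VFS can tell creation apart from a plain open. Reduced to its core (example_open_common is a hypothetical stand-in for the fs-specific open helper):

	if (file) {
		*opened |= FILE_CREATED;	/* creation happened above */
		error = finish_open(file, dentry, example_open_common, opened);
	}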
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 17c5b5d7dc88..010b9fb9fec6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -579,6 +579,24 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
 	return error;
 }
 
+/**
+ * gfs2_meta_sync - Sync all buffers associated with a glock
+ * @gl: The glock
+ *
+ */
+
+static void gfs2_meta_sync(struct gfs2_glock *gl)
+{
+	struct address_space *mapping = gfs2_glock2aspace(gl);
+	int error;
+
+	filemap_fdatawrite(mapping);
+	error = filemap_fdatawait(mapping);
+
+	if (error)
+		gfs2_io_error(gl->gl_sbd);
+}
+
 static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7b0f5043cf24..351586e24e30 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -32,7 +32,8 @@
 struct workqueue_struct *gfs2_control_wq;
 
 static struct shrinker qd_shrinker = {
-	.shrink = gfs2_shrink_qd_memory,
+	.count_objects = gfs2_qd_shrink_count,
+	.scan_objects = gfs2_qd_shrink_scan,
 	.seeks = DEFAULT_SEEKS,
 };
 
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0da390686c08..932415050540 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -98,24 +98,6 @@ const struct address_space_operations gfs2_meta_aops = {
 };
 
 /**
- * gfs2_meta_sync - Sync all buffers associated with a glock
- * @gl: The glock
- *
- */
-
-void gfs2_meta_sync(struct gfs2_glock *gl)
-{
-	struct address_space *mapping = gfs2_glock2aspace(gl);
-	int error;
-
-	filemap_fdatawrite(mapping);
-	error = filemap_fdatawait(mapping);
-
-	if (error)
-		gfs2_io_error(gl->gl_sbd);
-}
-
-/**
  * gfs2_getbuf - Get a buffer with a given address space
  * @gl: the glock
  * @blkno: the block number (filesystem scope)
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 0d4c843b6f8e..4823b934208a 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -48,21 +48,17 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
 	return inode->i_sb->s_fs_info;
 }
 
-void gfs2_meta_sync(struct gfs2_glock *gl);
-
-struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
-int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
-		   int flags, struct buffer_head **bhp);
-int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
-struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
-
-void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
-			      int meta);
-
-void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
-
-int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
-			      struct buffer_head **bhp);
+extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+			  struct buffer_head **bhp);
+extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
+				       int create);
+extern void gfs2_remove_from_journal(struct buffer_head *bh,
+				     struct gfs2_trans *tr, int meta);
+extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+				     struct buffer_head **bhp);
 
 static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
 					 struct buffer_head **bhp)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 0262c190b6f9..19ff5e8c285c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -646,6 +646,48 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 	return error;
 }
 
+/**
+ * check_journal_clean - Make sure a journal is clean for a spectator mount
+ * @sdp: The GFS2 superblock
+ * @jd: The journal descriptor
+ *
+ * Returns: 0 if the journal is clean or locked, else an error
+ */
+static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
+{
+	int error;
+	struct gfs2_holder j_gh;
+	struct gfs2_log_header_host head;
+	struct gfs2_inode *ip;
+
+	ip = GFS2_I(jd->jd_inode);
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP |
+				   GL_EXACT | GL_NOCACHE, &j_gh);
+	if (error) {
+		fs_err(sdp, "Error locking journal for spectator mount.\n");
+		return -EPERM;
+	}
+	error = gfs2_jdesc_check(jd);
+	if (error) {
+		fs_err(sdp, "Error checking journal for spectator mount.\n");
+		goto out_unlock;
+	}
+	error = gfs2_find_jhead(jd, &head);
+	if (error) {
+		fs_err(sdp, "Error parsing journal for spectator mount.\n");
+		goto out_unlock;
+	}
+	if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+		error = -EPERM;
+		fs_err(sdp, "jid=%u: Journal is dirty, so the first mounter "
+		       "must not be a spectator.\n", jd->jd_jid);
+	}
+
+out_unlock:
+	gfs2_glock_dq_uninit(&j_gh);
+	return error;
+}
+
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
 	struct inode *master = sdp->sd_master_dir->d_inode;
@@ -732,8 +774,15 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	if (sdp->sd_lockstruct.ls_first) {
 		unsigned int x;
 		for (x = 0; x < sdp->sd_journals; x++) {
-			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x),
-						     true);
+			struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x);
+
+			if (sdp->sd_args.ar_spectator) {
+				error = check_journal_clean(sdp, jd);
+				if (error)
+					goto fail_jinode_gh;
+				continue;
+			}
+			error = gfs2_recover_journal(jd, true);
 			if (error) {
 				fs_err(sdp, "error recovering journal %u: %d\n",
 				       x, error);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3768c2f40e43..db441359ee8c 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -75,17 +75,16 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
 
-int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
+unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+				  struct shrink_control *sc)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_sbd *sdp;
 	int nr_to_scan = sc->nr_to_scan;
-
-	if (nr_to_scan == 0)
-		goto out;
+	long freed = 0;
 
 	if (!(sc->gfp_mask & __GFP_FS))
-		return -1;
+		return SHRINK_STOP;
 
 	spin_lock(&qd_lru_lock);
 	while (nr_to_scan && !list_empty(&qd_lru_list)) {
@@ -110,11 +109,16 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
 		kmem_cache_free(gfs2_quotad_cachep, qd);
 		spin_lock(&qd_lru_lock);
 		nr_to_scan--;
+		freed++;
 	}
 	spin_unlock(&qd_lru_lock);
+	return freed;
+}
 
-out:
-	return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;
+unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+				   struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_read(&qd_lru_count));
 }
 
 static u64 qd2index(struct gfs2_quota_data *qd)
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 4f5e6e44ed83..0f64d9deb1b0 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -53,8 +53,10 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
-extern int gfs2_shrink_qd_memory(struct shrinker *shrink,
-				 struct shrink_control *sc);
+extern unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+					  struct shrink_control *sc);
+extern unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+					 struct shrink_control *sc);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index f9299d8a64e3..380ab31b5e0f 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -41,7 +41,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hfs_file_truncate(inode);
 	}
 }
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
index a63371815aab..24bc20fd42f7 100644
--- a/fs/hfsplus/Kconfig
+++ b/fs/hfsplus/Kconfig
@@ -11,3 +11,21 @@ config HFSPLUS_FS
 	  MacOS 8. It includes all Mac specific filesystem data such as
 	  data forks and creator codes, but it also has several UNIX
 	  style features such as file ownership and permissions.
+
+config HFSPLUS_FS_POSIX_ACL
+	bool "HFS+ POSIX Access Control Lists"
+	depends on HFSPLUS_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  Note that these POSIX ACLs are interpreted only under Linux; they
+	  mean nothing under Mac OS X. Mac OS X, beginning with version 10.4
+	  ("Tiger"), supports NFSv4 ACLs, which are part of the NFSv4
+	  standard.
+
+	  If you don't know what Access Control Lists are, say N.
diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile
index 09d278bb7b91..683fca2e5e65 100644
--- a/fs/hfsplus/Makefile
+++ b/fs/hfsplus/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o
 hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \
 		bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \
 		attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o
+
+hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL)	+= posix_acl.o
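With the Kconfig entry and the conditional object rule above, posix_acl.o is built only when the new option is enabled; an illustrative .config fragment selecting it would look like:

	CONFIG_HFSPLUS_FS=y
	CONFIG_HFSPLUS_FS_POSIX_ACL=y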
diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h
new file mode 100644
index 000000000000..07c0d4947527
--- /dev/null
+++ b/fs/hfsplus/acl.h
@@ -0,0 +1,30 @@
+/*
+ * linux/fs/hfsplus/acl.h
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for Posix Access Control Lists (ACLs) support.
+ */
+
+#include <linux/posix_acl_xattr.h>
+
+#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
+
+/* posix_acl.c */
+struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type);
+extern int hfsplus_posix_acl_chmod(struct inode *);
+extern int hfsplus_init_posix_acl(struct inode *, struct inode *);
+
+#else  /* CONFIG_HFSPLUS_FS_POSIX_ACL */
+#define hfsplus_get_posix_acl NULL
+
+static inline int hfsplus_posix_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
+static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif  /* CONFIG_HFSPLUS_FS_POSIX_ACL */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d8ce4bd17fc5..4a4fea002673 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -16,6 +16,7 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
 #include "xattr.h"
+#include "acl.h"
 
 static inline void hfsplus_instantiate(struct dentry *dentry,
 				       struct inode *inode, u32 cnid)
@@ -529,6 +530,9 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 	.getxattr	= generic_getxattr,
 	.listxattr	= hfsplus_listxattr,
 	.removexattr	= hfsplus_removexattr,
+#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
+	.get_acl	= hfsplus_get_posix_acl,
+#endif
 };
 
 const struct file_operations hfsplus_dir_operations = {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index ede79317cfb8..2b9cd01696e2 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -30,6 +30,7 @@
 #define DBG_EXTENT	0x00000020
 #define DBG_BITMAP	0x00000040
 #define DBG_ATTR_MOD	0x00000080
+#define DBG_ACL_MOD	0x00000100
 
 #if 0
 #define DBG_MASK	(DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index f833d35630ab..37213d075f3c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -19,6 +19,7 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
 #include "xattr.h"
+#include "acl.h"
 
 static int hfsplus_readpage(struct file *file, struct page *page)
 {
@@ -35,7 +36,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hfsplus_file_truncate(inode);
 	}
 }
@@ -316,6 +317,13 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
+
+	if (attr->ia_valid & ATTR_MODE) {
+		error = hfsplus_posix_acl_chmod(inode);
+		if (unlikely(error))
+			return error;
+	}
+
 	return 0;
 }
 
@@ -383,6 +391,9 @@ static const struct inode_operations hfsplus_file_inode_operations = {
 	.getxattr	= generic_getxattr,
 	.listxattr	= hfsplus_listxattr,
 	.removexattr	= hfsplus_removexattr,
+#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
+	.get_acl	= hfsplus_get_posix_acl,
+#endif
 };
 
 static const struct file_operations hfsplus_file_operations = {
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
new file mode 100644
index 000000000000..b609cc14c72e
--- /dev/null
+++ b/fs/hfsplus/posix_acl.c
@@ -0,0 +1,274 @@
+/*
+ * linux/fs/hfsplus/posix_acl.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for Posix Access Control Lists (ACLs) support.
+ */
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	char *xattr_name;
+	char *value = NULL;
+	ssize_t size;
+
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xattr_name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		xattr_name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	size = __hfsplus_getxattr(inode, xattr_name, NULL, 0);
+
+	if (size > 0) {
+		value = (char *)hfsplus_alloc_attr_entry();
+		if (unlikely(!value))
+			return ERR_PTR(-ENOMEM);
+		size = __hfsplus_getxattr(inode, xattr_name, value, size);
+	}
+
+	if (size > 0)
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+	else if (size == -ENODATA)
+		acl = NULL;
+	else
+		acl = ERR_PTR(size);
+
+	hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+static int hfsplus_set_posix_acl(struct inode *inode,
+				 int type,
+				 struct posix_acl *acl)
+{
+	int err;
+	char *xattr_name;
+	size_t size = 0;
+	char *value = NULL;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xattr_name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			err = posix_acl_equiv_mode(acl, &inode->i_mode);
+			if (err < 0)
+				return err;
+		}
+		err = 0;
+		break;
+
+	case ACL_TYPE_DEFAULT:
+		xattr_name = POSIX_ACL_XATTR_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE))
+			return -ENOMEM;
+		value = (char *)hfsplus_alloc_attr_entry();
+		if (unlikely(!value))
+			return -ENOMEM;
+		err = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+		if (unlikely(err < 0))
+			goto end_set_acl;
+	}
+
+	err = __hfsplus_setxattr(inode, xattr_name, value, size, 0);
+
+end_set_acl:
+	hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
+
+	if (!err)
+		set_cached_acl(inode, type, acl);
+
+	return err;
+}
+
+int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
+{
+	int err = 0;
+	struct posix_acl *acl = NULL;
+
+	hfs_dbg(ACL_MOD,
+		"[%s]: ino %lu, dir->ino %lu\n",
+		__func__, inode->i_ino, dir->i_ino);
+
+	if (S_ISLNK(inode->i_mode))
+		return 0;
+
+	acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+
+	if (acl) {
+		if (S_ISDIR(inode->i_mode)) {
+			err = hfsplus_set_posix_acl(inode,
+						    ACL_TYPE_DEFAULT,
+						    acl);
+			if (unlikely(err))
+				goto init_acl_cleanup;
+		}
+
+		err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
+		if (unlikely(err < 0))
+			return err;
+
+		if (err > 0)
+			err = hfsplus_set_posix_acl(inode,
+						    ACL_TYPE_ACCESS,
+						    acl);
+	} else
+		inode->i_mode &= ~current_umask();
+
+init_acl_cleanup:
+	posix_acl_release(acl);
+	return err;
+}
+
+int hfsplus_posix_acl_chmod(struct inode *inode)
+{
+	int err;
+	struct posix_acl *acl;
+
+	hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+
+	err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+	if (unlikely(err))
+		return err;
+
+	err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl);
+	posix_acl_release(acl);
+	return err;
+}
+
+static int hfsplus_xattr_get_posix_acl(struct dentry *dentry,
+					const char *name,
+					void *buffer,
+					size_t size,
+					int type)
+{
+	int err = 0;
+	struct posix_acl *acl;
+
+	hfs_dbg(ACL_MOD,
+		"[%s]: ino %lu, buffer %p, size %zu, type %#x\n",
+		__func__, dentry->d_inode->i_ino, buffer, size, type);
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	acl = hfsplus_get_posix_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+
+	err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+	posix_acl_release(acl);
+
+	return err;
+}
+
+static int hfsplus_xattr_set_posix_acl(struct dentry *dentry,
+					const char *name,
+					const void *value,
+					size_t size,
+					int flags,
+					int type)
+{
+	int err = 0;
+	struct inode *inode = dentry->d_inode;
+	struct posix_acl *acl = NULL;
+
+	hfs_dbg(ACL_MOD,
+		"[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n",
+		__func__, inode->i_ino, value, size, flags, type);
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			err = posix_acl_valid(acl);
+			if (err)
+				goto end_xattr_set_acl;
+		}
+	}
+
+	err = hfsplus_set_posix_acl(inode, type, acl);
+
+end_xattr_set_acl:
+	posix_acl_release(acl);
+	return err;
+}
+
+static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry,
+					   char *list,
+					   size_t list_size,
+					   const char *name,
+					   size_t name_len,
+					   int type)
+{
+	/*
+	 * This method is not used.
+	 * hfsplus_listxattr() is used instead of generic_listxattr().
+	 */
+	return -EOPNOTSUPP;
+}
+
+const struct xattr_handler hfsplus_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
+	.list	= hfsplus_xattr_list_posix_acl,
+	.get	= hfsplus_xattr_get_posix_acl,
+	.set	= hfsplus_xattr_set_posix_acl,
+};
+
+const struct xattr_handler hfsplus_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
+	.list	= hfsplus_xattr_list_posix_acl,
+	.get	= hfsplus_xattr_get_posix_acl,
+	.set	= hfsplus_xattr_set_posix_acl,
+};
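The get/set pair above follows the usual VFS ACL caching discipline: readers consult the per-inode cache before touching the on-disk xattr, successful reads and writes repopulate the cache, and writers that bypass the helpers must invalidate it. The skeleton, with read_acl_from_xattr() as a hypothetical stand-in for the __hfsplus_getxattr()-based body above:

	struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);

	if (acl == ACL_NOT_CACHED) {
		acl = read_acl_from_xattr(inode);	/* hypothetical helper */
		if (!IS_ERR(acl))
			set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
	}
	/* a writer changing the xattr directly drops the stale copy: */
	forget_cached_acl(inode, ACL_TYPE_ACCESS);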
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index f66346155df5..bd8471fb9a6a 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -8,11 +8,16 @@
 
 #include "hfsplus_fs.h"
 #include "xattr.h"
+#include "acl.h"
 
 const struct xattr_handler *hfsplus_xattr_handlers[] = {
 	&hfsplus_xattr_osx_handler,
 	&hfsplus_xattr_user_handler,
 	&hfsplus_xattr_trusted_handler,
+#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
+	&hfsplus_xattr_acl_access_handler,
+	&hfsplus_xattr_acl_default_handler,
+#endif
 	&hfsplus_xattr_security_handler,
 	NULL
 };
@@ -46,11 +51,58 @@ static inline int is_known_namespace(const char *name)
 	return true;
 }
 
+static int can_set_system_xattr(struct inode *inode, const char *name,
+				const void *value, size_t size)
+{
+#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
+	struct posix_acl *acl;
+	int err;
+
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	/*
+	 * POSIX_ACL_XATTR_ACCESS is tied to i_mode
+	 */
+	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (acl) {
+			err = posix_acl_equiv_mode(acl, &inode->i_mode);
+			posix_acl_release(acl);
+			if (err < 0)
+				return err;
+			mark_inode_dirty(inode);
+		}
+		/*
+		 * We're changing the ACL. Get rid of the cached one
+		 */
+		forget_cached_acl(inode, ACL_TYPE_ACCESS);
+
+		return 0;
+	} else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		posix_acl_release(acl);
+
+		/*
+		 * We're changing the default ACL. Get rid of the cached one
+		 */
+		forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+
+		return 0;
+	}
+#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */
+	return -EOPNOTSUPP;
+}
+
 static int can_set_xattr(struct inode *inode, const char *name,
 			 const void *value, size_t value_len)
 {
 	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		return -EOPNOTSUPP; /* TODO: implement ACL support */
+		return can_set_system_xattr(inode, name, value, value_len);
 
 	if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) {
 		/*
@@ -253,11 +305,10 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
 	return len;
 }
 
-static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry,
+static ssize_t hfsplus_getxattr_finder_info(struct inode *inode,
 					    void *value, size_t size)
 {
 	ssize_t res = 0;
-	struct inode *inode = dentry->d_inode;
 	struct hfs_find_data fd;
 	u16 entry_type;
 	u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo);
@@ -304,10 +355,9 @@ end_getxattr_finder_info:
 	return res;
 }
 
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
-			 void *value, size_t size)
+ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
+			   void *value, size_t size)
 {
-	struct inode *inode = dentry->d_inode;
 	struct hfs_find_data fd;
 	hfsplus_attr_entry *entry;
 	__be32 xattr_record_type;
@@ -333,7 +383,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 	}
 
 	if (!strcmp_xattr_finder_info(name))
-		return hfsplus_getxattr_finder_info(dentry, value, size);
+		return hfsplus_getxattr_finder_info(inode, value, size);
 
 	if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
 		return -EOPNOTSUPP;
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index 847b695b984d..841b5698c0fc 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -14,8 +14,8 @@
 extern const struct xattr_handler hfsplus_xattr_osx_handler;
 extern const struct xattr_handler hfsplus_xattr_user_handler;
 extern const struct xattr_handler hfsplus_xattr_trusted_handler;
-/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/
-/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/
+extern const struct xattr_handler hfsplus_xattr_acl_access_handler;
+extern const struct xattr_handler hfsplus_xattr_acl_default_handler;
 extern const struct xattr_handler hfsplus_xattr_security_handler;
 
 extern const struct xattr_handler *hfsplus_xattr_handlers[];
@@ -29,9 +29,17 @@ static inline int hfsplus_setxattr(struct dentry *dentry, const char *name,
 	return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags);
 }
 
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
-			 void *value, size_t size);
+ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
+			   void *value, size_t size);
+
+static inline ssize_t hfsplus_getxattr(struct dentry *dentry,
+				       const char *name,
+				       void *value,
+				       size_t size)
+{
+	return __hfsplus_getxattr(dentry->d_inode, name, value, size);
+}
 
 ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
@@ -39,22 +47,7 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name);
 int hfsplus_init_security(struct inode *inode, struct inode *dir,
 			  const struct qstr *qstr);
 
-static inline int hfsplus_init_acl(struct inode *inode, struct inode *dir)
-{
-	/*TODO: implement*/
-	return 0;
-}
-
-static inline int hfsplus_init_inode_security(struct inode *inode,
-					      struct inode *dir,
-					      const struct qstr *qstr)
-{
-	int err;
-
-	err = hfsplus_init_acl(inode, dir);
-	if (!err)
-		err = hfsplus_init_security(inode, dir, qstr);
-	return err;
-}
+int hfsplus_init_inode_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr);
 
 #endif
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 83b842f113c5..00722765ea79 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -9,6 +9,7 @@
 #include <linux/security.h>
 #include "hfsplus_fs.h"
 #include "xattr.h"
+#include "acl.h"
 
 static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
 				     void *buffer, size_t size, int type)
@@ -96,6 +97,18 @@ int hfsplus_init_security(struct inode *inode, struct inode *dir,
 					  &hfsplus_initxattrs, NULL);
 }
 
+int hfsplus_init_inode_security(struct inode *inode,
+				struct inode *dir,
+				const struct qstr *qstr)
+{
+	int err;
+
+	err = hfsplus_init_posix_acl(inode, dir);
+	if (!err)
+		err = hfsplus_init_security(inode, dir, qstr);
+	return err;
+}
+
 const struct xattr_handler hfsplus_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= hfsplus_security_listxattr,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index cddb05217512..25437280a207 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -361,6 +361,13 @@ retry:
 	return 0;
 }
 
+static int hostfs_file_release(struct inode *inode, struct file *file)
+{
+	filemap_write_and_wait(inode->i_mapping);
+
+	return 0;
+}
+
 int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
@@ -386,7 +393,7 @@ static const struct file_operations hostfs_file_fops = {
 	.write		= do_sync_write,
 	.mmap		= generic_file_mmap,
 	.open		= hostfs_file_open,
-	.release	= NULL,
+	.release	= hostfs_file_release,
 	.fsync		= hostfs_fsync,
 };
 
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 4e9dabcf1f4c..67c1a61e0955 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -138,7 +138,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
 	hpfs_lock(inode->i_sb);
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hpfs_truncate(inode);
 	}
 
diff --git a/fs/inode.c b/fs/inode.c
index d6dfb09c8280..b33ba8e021cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,7 @@
 #include <linux/prefetch.h>
 #include <linux/buffer_head.h> /* for inode_has_buffers */
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
 #include "internal.h"
 
 /*
@@ -24,7 +25,7 @@
  *
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
- * inode->i_sb->s_inode_lru_lock protects:
+ * Inode LRU list locks protect:
  *   inode->i_sb->s_inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
@@ -37,7 +38,7 @@
  *
  * inode_sb_list_lock
  *   inode->i_lock
- *     inode->i_sb->s_inode_lru_lock
+ *     Inode LRU list locks
  *
  * bdi->wb.list_lock
  *   inode->i_lock
@@ -70,33 +71,33 @@ EXPORT_SYMBOL(empty_aops);
  */
 struct inodes_stat_t inodes_stat;
 
-static DEFINE_PER_CPU(unsigned int, nr_inodes);
-static DEFINE_PER_CPU(unsigned int, nr_unused);
+static DEFINE_PER_CPU(unsigned long, nr_inodes);
+static DEFINE_PER_CPU(unsigned long, nr_unused);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static int get_nr_inodes(void)
+static long get_nr_inodes(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_inodes, i);
 	return sum < 0 ? 0 : sum;
 }
 
-static inline int get_nr_inodes_unused(void)
+static inline long get_nr_inodes_unused(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_unused, i);
 	return sum < 0 ? 0 : sum;
 }
 
-int get_nr_dirty_inodes(void)
+long get_nr_dirty_inodes(void)
 {
 	/* not actually dirty inodes, but a wild approximation */
-	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
 }
 
@@ -109,7 +110,7 @@ int proc_nr_inodes(ctl_table *table, int write,
 {
 	inodes_stat.nr_inodes = get_nr_inodes();
 	inodes_stat.nr_unused = get_nr_inodes_unused();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -401,13 +402,8 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_lru_lock);
-	if (list_empty(&inode->i_lru)) {
-		list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
-		inode->i_sb->s_nr_inodes_unused++;
+	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_inc(nr_unused);
-	}
-	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /*
@@ -425,13 +421,9 @@ void inode_add_lru(struct inode *inode)
 
 static void inode_lru_list_del(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_lru_lock);
-	if (!list_empty(&inode->i_lru)) {
-		list_del_init(&inode->i_lru);
-		inode->i_sb->s_nr_inodes_unused--;
+
+	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_dec(nr_unused);
-	}
-	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /**
@@ -675,24 +667,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	return busy;
 }
 
-static int can_unuse(struct inode *inode)
-{
-	if (inode->i_state & ~I_REFERENCED)
-		return 0;
-	if (inode_has_buffers(inode))
-		return 0;
-	if (atomic_read(&inode->i_count))
-		return 0;
-	if (inode->i_data.nrpages)
-		return 0;
-	return 1;
-}
-
 /*
- * Walk the superblock inode LRU for freeable inodes and attempt to free them.
- * This is called from the superblock shrinker function with a number of inodes
- * to trim from the LRU. Inodes to be freed are moved to a temporary list and
- * then are freed outside inode_lock by dispose_list().
+ * Isolate the inode from the LRU in preparation for freeing it.
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed. If the inode has metadata buffers attached to
@@ -706,89 +682,82 @@ static int can_unuse(struct inode *inode)
706 * LRU does not have strict ordering. Hence we don't want to reclaim inodes 682 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
707 * with this flag set because they are the inodes that are out of order. 683 * with this flag set because they are the inodes that are out of order.
708 */ 684 */
709void prune_icache_sb(struct super_block *sb, int nr_to_scan) 685static enum lru_status
686inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
710{ 687{
711 LIST_HEAD(freeable); 688 struct list_head *freeable = arg;
712 int nr_scanned; 689 struct inode *inode = container_of(item, struct inode, i_lru);
713 unsigned long reap = 0;
714 690
715 spin_lock(&sb->s_inode_lru_lock); 691 /*
716 for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { 692 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
717 struct inode *inode; 693 * If we fail to get the lock, just skip it.
694 */
695 if (!spin_trylock(&inode->i_lock))
696 return LRU_SKIP;
718 697
719 if (list_empty(&sb->s_inode_lru)) 698 /*
720 break; 699 * Referenced or dirty inodes are still in use. Give them another pass
700 * through the LRU as we canot reclaim them now.
701 */
702 if (atomic_read(&inode->i_count) ||
703 (inode->i_state & ~I_REFERENCED)) {
704 list_del_init(&inode->i_lru);
705 spin_unlock(&inode->i_lock);
706 this_cpu_dec(nr_unused);
707 return LRU_REMOVED;
708 }
721 709
722 inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); 710 /* recently referenced inodes get one more pass */
711 if (inode->i_state & I_REFERENCED) {
712 inode->i_state &= ~I_REFERENCED;
713 spin_unlock(&inode->i_lock);
714 return LRU_ROTATE;
715 }
723 716
724 /* 717 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
725 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here, 718 __iget(inode);
726 * so use a trylock. If we fail to get the lock, just move the 719 spin_unlock(&inode->i_lock);
727 * inode to the back of the list so we don't spin on it. 720 spin_unlock(lru_lock);
728 */ 721 if (remove_inode_buffers(inode)) {
729 if (!spin_trylock(&inode->i_lock)) { 722 unsigned long reap;
730 list_move(&inode->i_lru, &sb->s_inode_lru); 723 reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
731 continue; 724 if (current_is_kswapd())
725 __count_vm_events(KSWAPD_INODESTEAL, reap);
726 else
727 __count_vm_events(PGINODESTEAL, reap);
728 if (current->reclaim_state)
729 current->reclaim_state->reclaimed_slab += reap;
732 } 730 }
731 iput(inode);
732 spin_lock(lru_lock);
733 return LRU_RETRY;
734 }
733 735
734 /* 736 WARN_ON(inode->i_state & I_NEW);
735 * Referenced or dirty inodes are still in use. Give them 737 inode->i_state |= I_FREEING;
 736 * another pass through the LRU as we cannot reclaim them now. 738 list_move(&inode->i_lru, freeable);
737 */ 739 spin_unlock(&inode->i_lock);
738 if (atomic_read(&inode->i_count) ||
739 (inode->i_state & ~I_REFERENCED)) {
740 list_del_init(&inode->i_lru);
741 spin_unlock(&inode->i_lock);
742 sb->s_nr_inodes_unused--;
743 this_cpu_dec(nr_unused);
744 continue;
745 }
746 740
747 /* recently referenced inodes get one more pass */ 741 this_cpu_dec(nr_unused);
748 if (inode->i_state & I_REFERENCED) { 742 return LRU_REMOVED;
749 inode->i_state &= ~I_REFERENCED; 743}
750 list_move(&inode->i_lru, &sb->s_inode_lru);
751 spin_unlock(&inode->i_lock);
752 continue;
753 }
754 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
755 __iget(inode);
756 spin_unlock(&inode->i_lock);
757 spin_unlock(&sb->s_inode_lru_lock);
758 if (remove_inode_buffers(inode))
759 reap += invalidate_mapping_pages(&inode->i_data,
760 0, -1);
761 iput(inode);
762 spin_lock(&sb->s_inode_lru_lock);
763
764 if (inode != list_entry(sb->s_inode_lru.next,
765 struct inode, i_lru))
766 continue; /* wrong inode or list_empty */
767 /* avoid lock inversions with trylock */
768 if (!spin_trylock(&inode->i_lock))
769 continue;
770 if (!can_unuse(inode)) {
771 spin_unlock(&inode->i_lock);
772 continue;
773 }
774 }
775 WARN_ON(inode->i_state & I_NEW);
776 inode->i_state |= I_FREEING;
777 spin_unlock(&inode->i_lock);
778 744
779 list_move(&inode->i_lru, &freeable); 745/*
780 sb->s_nr_inodes_unused--; 746 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
781 this_cpu_dec(nr_unused); 747 * This is called from the superblock shrinker function with a number of inodes
782 } 748 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
783 if (current_is_kswapd()) 749 * then are freed outside inode_lock by dispose_list().
784 __count_vm_events(KSWAPD_INODESTEAL, reap); 750 */
785 else 751long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
786 __count_vm_events(PGINODESTEAL, reap); 752 int nid)
787 spin_unlock(&sb->s_inode_lru_lock); 753{
788 if (current->reclaim_state) 754 LIST_HEAD(freeable);
789 current->reclaim_state->reclaimed_slab += reap; 755 long freed;
790 756
757 freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate,
758 &freeable, &nr_to_scan);
791 dispose_list(&freeable); 759 dispose_list(&freeable);
760 return freed;
792} 761}
793 762
794static void __wait_on_freeing_inode(struct inode *inode); 763static void __wait_on_freeing_inode(struct inode *inode);
@@ -1525,7 +1494,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
1525 * This function automatically handles read only file systems and media, 1494 * This function automatically handles read only file systems and media,
1526 * as well as the "noatime" flag and inode specific "noatime" markers. 1495 * as well as the "noatime" flag and inode specific "noatime" markers.
1527 */ 1496 */
1528void touch_atime(struct path *path) 1497void touch_atime(const struct path *path)
1529{ 1498{
1530 struct vfsmount *mnt = path->mnt; 1499 struct vfsmount *mnt = path->mnt;
1531 struct inode *inode = path->dentry->d_inode; 1500 struct inode *inode = path->dentry->d_inode;
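
The inode.c hunks above replace an open-coded LRU scan with the generic list_lru walker plus a per-item isolate callback returning an lru_status verdict. Below is a minimal, self-contained model of that walk/isolate split; the names and the single-threaded list handling are illustrative assumptions, not the kernel's list_lru implementation.

/* Sketch of the walk/isolate contract: the walker owns the list, the
 * callback judges one entry at a time (assumed names, plain C). */
enum lru_status { LRU_REMOVED, LRU_ROTATE, LRU_SKIP };

struct entry {
	int busy;		/* stands in for i_count/dirty checks */
	int referenced;		/* stands in for the I_REFERENCED bit */
	struct entry *next;
};

static enum lru_status isolate_one(struct entry *e)
{
	if (e->busy)
		return LRU_SKIP;	/* in use: leave it alone */
	if (e->referenced) {
		e->referenced = 0;	/* clear the bit, give it a pass */
		return LRU_ROTATE;
	}
	return LRU_REMOVED;		/* freeable: walker unlinks it */
}

static unsigned long walk(struct entry **head, unsigned long nr_to_scan)
{
	unsigned long freed = 0;
	struct entry **pp = head;

	while (*pp && nr_to_scan--) {
		struct entry *e = *pp;

		if (isolate_one(e) == LRU_REMOVED) {
			*pp = e->next;	/* unlink; the real code moves it
					 * to a dispose list instead */
			freed++;
		} else {
			pp = &e->next;	/* SKIP and ROTATE both just step
					 * on in this simplified model */
		}
	}
	return freed;
}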
diff --git a/fs/internal.h b/fs/internal.h
index 7c5f01cf619d..513e0d859a6c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -45,6 +45,9 @@ extern void __init chrdev_init(void);
45 * namei.c 45 * namei.c
46 */ 46 */
47extern int __inode_permission(struct inode *, int); 47extern int __inode_permission(struct inode *, int);
48extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
49extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
50 const char *, unsigned int, struct path *);
48 51
49/* 52/*
50 * namespace.c 53 * namespace.c
@@ -111,6 +114,8 @@ extern int open_check_o_direct(struct file *f);
111 * inode.c 114 * inode.c
112 */ 115 */
113extern spinlock_t inode_sb_list_lock; 116extern spinlock_t inode_sb_list_lock;
117extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
118 int nid);
114extern void inode_add_lru(struct inode *inode); 119extern void inode_add_lru(struct inode *inode);
115 120
116/* 121/*
@@ -118,7 +123,7 @@ extern void inode_add_lru(struct inode *inode);
118 */ 123 */
119extern void inode_wb_list_del(struct inode *inode); 124extern void inode_wb_list_del(struct inode *inode);
120 125
121extern int get_nr_dirty_inodes(void); 126extern long get_nr_dirty_inodes(void);
122extern void evict_inodes(struct super_block *); 127extern void evict_inodes(struct super_block *);
123extern int invalidate_inodes(struct super_block *, bool); 128extern int invalidate_inodes(struct super_block *, bool);
124 129
@@ -126,6 +131,9 @@ extern int invalidate_inodes(struct super_block *, bool);
126 * dcache.c 131 * dcache.c
127 */ 132 */
128extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); 133extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
134extern int d_set_mounted(struct dentry *dentry);
135extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
136 int nid);
129 137
130/* 138/*
131 * read_write.c 139 * read_write.c
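
With prune_dcache_sb() and prune_icache_sb() both taking an explicit scan budget and a NUMA node id, a per-superblock shrinker can divide one request across both caches. A hedged sketch of such a caller follows; the even split and the function name are illustrative assumptions (the real superblock shrinker would typically proportion the budget by the relative size of each LRU).

/* Hypothetical caller dividing one scan request between the two LRUs
 * declared above; the 50/50 split is an illustrative simplification. */
static long sb_shrink_sketch(struct super_block *sb,
			     unsigned long nr_to_scan, int nid)
{
	unsigned long dentries = nr_to_scan / 2;
	unsigned long inodes = nr_to_scan - dentries;
	long freed = 0;

	freed += prune_dcache_sb(sb, dentries, nid);
	freed += prune_icache_sb(sb, inodes, nid);
	return freed;
}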
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index c348d6d88624..e5d408a7ea4a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -117,8 +117,8 @@ static void destroy_inodecache(void)
117 117
118static int isofs_remount(struct super_block *sb, int *flags, char *data) 118static int isofs_remount(struct super_block *sb, int *flags, char *data)
119{ 119{
120 /* we probably want a lot more here */ 120 if (!(*flags & MS_RDONLY))
121 *flags |= MS_RDONLY; 121 return -EROFS;
122 return 0; 122 return 0;
123} 123}
124 124
@@ -763,15 +763,6 @@ root_found:
763 */ 763 */
764 s->s_maxbytes = 0x80000000000LL; 764 s->s_maxbytes = 0x80000000000LL;
765 765
766 /*
767 * The CDROM is read-only, has no nodes (devices) on it, and since
768 * all of the files appear to be owned by root, we really do not want
769 * to allow suid. (suid or devices will not show up unless we have
770 * Rock Ridge extensions)
771 */
772
773 s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */;
774
775 /* Set this for reference. Its not currently used except on write 766 /* Set this for reference. Its not currently used except on write
776 which we don't have .. */ 767 which we don't have .. */
777 768
@@ -1530,6 +1521,9 @@ struct inode *isofs_iget(struct super_block *sb,
1530static struct dentry *isofs_mount(struct file_system_type *fs_type, 1521static struct dentry *isofs_mount(struct file_system_type *fs_type,
1531 int flags, const char *dev_name, void *data) 1522 int flags, const char *dev_name, void *data)
1532{ 1523{
1524 /* We don't support read-write mounts */
1525 if (!(flags & MS_RDONLY))
1526 return ERR_PTR(-EACCES);
1533 return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); 1527 return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
1534} 1528}
1535 1529
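
Taken together, the two isofs hunks replace the old silent behaviour (forcing MS_RDONLY on the superblock) with explicit refusal at both entry points. In isolation the pattern looks like this; it is a sketch restating the hunks above, not new code:

/* Read-only-only filesystem: reject rw at mount and at remount. */
static struct dentry *ro_only_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name,
				    void *data)
{
	if (!(flags & MS_RDONLY))	/* rw mounts are never supported */
		return ERR_PTR(-EACCES);
	return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
}

static int ro_only_remount(struct super_block *sb, int *flags, char *data)
{
	if (!(*flags & MS_RDONLY))	/* refuse remounting read-write */
		return -EROFS;
	return 0;
}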
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 11bb11f48b3a..bb217dcb41af 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -340,13 +340,13 @@ void journal_commit_transaction(journal_t *journal)
340 J_ASSERT(journal->j_committing_transaction == NULL); 340 J_ASSERT(journal->j_committing_transaction == NULL);
341 341
342 commit_transaction = journal->j_running_transaction; 342 commit_transaction = journal->j_running_transaction;
343 J_ASSERT(commit_transaction->t_state == T_RUNNING);
344 343
345 trace_jbd_start_commit(journal, commit_transaction); 344 trace_jbd_start_commit(journal, commit_transaction);
346 jbd_debug(1, "JBD: starting commit of transaction %d\n", 345 jbd_debug(1, "JBD: starting commit of transaction %d\n",
347 commit_transaction->t_tid); 346 commit_transaction->t_tid);
348 347
349 spin_lock(&journal->j_state_lock); 348 spin_lock(&journal->j_state_lock);
349 J_ASSERT(commit_transaction->t_state == T_RUNNING);
350 commit_transaction->t_state = T_LOCKED; 350 commit_transaction->t_state = T_LOCKED;
351 351
352 trace_jbd_commit_locking(journal, commit_transaction); 352 trace_jbd_commit_locking(journal, commit_transaction);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 6510d6355729..2d04f9afafd7 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -90,6 +90,24 @@ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
90static void __journal_abort_soft (journal_t *journal, int errno); 90static void __journal_abort_soft (journal_t *journal, int errno);
91static const char *journal_dev_name(journal_t *journal, char *buffer); 91static const char *journal_dev_name(journal_t *journal, char *buffer);
92 92
93#ifdef CONFIG_JBD_DEBUG
94void __jbd_debug(int level, const char *file, const char *func,
95 unsigned int line, const char *fmt, ...)
96{
97 struct va_format vaf;
98 va_list args;
99
100 if (level > journal_enable_debug)
101 return;
102 va_start(args, fmt);
103 vaf.fmt = fmt;
104 vaf.va = &args;
105 printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
106 va_end(args);
107}
108EXPORT_SYMBOL(__jbd_debug);
109#endif
110
93/* 111/*
94 * Helper function used to manage commit timeouts 112 * Helper function used to manage commit timeouts
95 */ 113 */
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 559bec1a37b4..cf2fc0594063 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -343,14 +343,14 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
343 struct page *page = bh->b_page; 343 struct page *page = bh->b_page;
344 __u8 *addr; 344 __u8 *addr;
345 __u32 csum32; 345 __u32 csum32;
346 __be32 seq;
346 347
347 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
348 return; 349 return;
349 350
350 sequence = cpu_to_be32(sequence); 351 seq = cpu_to_be32(sequence);
351 addr = kmap_atomic(page); 352 addr = kmap_atomic(page);
352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 353 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
353 sizeof(sequence));
354 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), 354 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
355 bh->b_size); 355 bh->b_size);
356 kunmap_atomic(addr); 356 kunmap_atomic(addr);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 02c7ad9d7a41..52032647dd4a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -130,9 +130,10 @@ int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
130 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; 130 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
131} 131}
132 132
133static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) 133static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
134{ 134{
135 __u32 csum, old_csum; 135 __u32 csum;
136 __be32 old_csum;
136 137
137 old_csum = sb->s_checksum; 138 old_csum = sb->s_checksum;
138 sb->s_checksum = 0; 139 sb->s_checksum = 0;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d4851464b57e..3929c50428b1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -178,7 +178,8 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
178 void *buf) 178 void *buf)
179{ 179{
180 struct jbd2_journal_block_tail *tail; 180 struct jbd2_journal_block_tail *tail;
181 __u32 provided, calculated; 181 __be32 provided;
182 __u32 calculated;
182 183
183 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 184 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
184 return 1; 185 return 1;
@@ -190,8 +191,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
190 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); 191 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
191 tail->t_checksum = provided; 192 tail->t_checksum = provided;
192 193
193 provided = be32_to_cpu(provided); 194 return provided == cpu_to_be32(calculated);
194 return provided == calculated;
195} 195}
196 196
197/* 197/*
@@ -381,7 +381,8 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
381static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) 381static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
382{ 382{
383 struct commit_header *h; 383 struct commit_header *h;
384 __u32 provided, calculated; 384 __be32 provided;
385 __u32 calculated;
385 386
386 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 387 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
387 return 1; 388 return 1;
@@ -392,21 +393,20 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
392 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); 393 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
393 h->h_chksum[0] = provided; 394 h->h_chksum[0] = provided;
394 395
395 provided = be32_to_cpu(provided); 396 return provided == cpu_to_be32(calculated);
396 return provided == calculated;
397} 397}
398 398
399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
400 void *buf, __u32 sequence) 400 void *buf, __u32 sequence)
401{ 401{
402 __u32 csum32; 402 __u32 csum32;
403 __be32 seq;
403 404
404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 405 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
405 return 1; 406 return 1;
406 407
407 sequence = cpu_to_be32(sequence); 408 seq = cpu_to_be32(sequence);
408 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 409 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
409 sizeof(sequence));
410 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); 410 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
411 411
412 return tag->t_checksum == cpu_to_be16(csum32); 412 return tag->t_checksum == cpu_to_be16(csum32);
@@ -808,7 +808,8 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
808 void *buf) 808 void *buf)
809{ 809{
810 struct jbd2_journal_revoke_tail *tail; 810 struct jbd2_journal_revoke_tail *tail;
811 __u32 provided, calculated; 811 __be32 provided;
812 __u32 calculated;
812 813
813 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 814 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
814 return 1; 815 return 1;
@@ -820,8 +821,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
820 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); 821 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
821 tail->r_checksum = provided; 822 tail->r_checksum = provided;
822 823
823 provided = be32_to_cpu(provided); 824 return provided == cpu_to_be32(calculated);
824 return provided == calculated;
825} 825}
826 826
827/* Scan a revoke record, marking all blocks mentioned as revoked. */ 827/* Scan a revoke record, marking all blocks mentioned as revoked. */
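
All of the jbd2 hunks above apply the same sparse endianness discipline: values read from disk keep a __be32 type, and instead of byte-swapping the disk value before comparing, the computed CPU-order checksum is converted once with cpu_to_be32(). A tiny userspace model of the before/after, with htonl()/ntohl() standing in for cpu_to_be32()/be32_to_cpu() and uint32_t for the annotated types:

#include <stdint.h>
#include <arpa/inet.h>		/* htonl() models cpu_to_be32() here */

/* before: swap the disk value, compare in CPU order */
static int csum_ok_old(uint32_t provided_be, uint32_t calculated)
{
	return ntohl(provided_be) == calculated;
}

/* after: convert the computed value once, compare in disk order --
 * 'provided_be' never leaves its big-endian representation */
static int csum_ok_new(uint32_t provided_be, uint32_t calculated)
{
	return provided_be == htonl(calculated);
}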
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 730f24e282a6..f4aab719add5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -306,7 +306,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
306 struct inode *inode = mapping->host; 306 struct inode *inode = mapping->host;
307 307
308 if (to > inode->i_size) { 308 if (to > inode->i_size) {
309 truncate_pagecache(inode, to, inode->i_size); 309 truncate_pagecache(inode, inode->i_size);
310 jfs_truncate(inode); 310 jfs_truncate(inode);
311 } 311 }
312} 312}
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 8c32ef3ba88e..e519e45bf673 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -86,18 +86,6 @@ static LIST_HEAD(mb_cache_list);
86static LIST_HEAD(mb_cache_lru_list); 86static LIST_HEAD(mb_cache_lru_list);
87static DEFINE_SPINLOCK(mb_cache_spinlock); 87static DEFINE_SPINLOCK(mb_cache_spinlock);
88 88
89/*
90 * What the mbcache registers as to get shrunk dynamically.
91 */
92
93static int mb_cache_shrink_fn(struct shrinker *shrink,
94 struct shrink_control *sc);
95
96static struct shrinker mb_cache_shrinker = {
97 .shrink = mb_cache_shrink_fn,
98 .seeks = DEFAULT_SEEKS,
99};
100
101static inline int 89static inline int
102__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) 90__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
103{ 91{
@@ -151,7 +139,7 @@ forget:
151 139
152 140
153/* 141/*
154 * mb_cache_shrink_fn() memory pressure callback 142 * mb_cache_shrink_scan() memory pressure callback
155 * 143 *
156 * This function is called by the kernel memory management when memory 144 * This function is called by the kernel memory management when memory
157 * gets low. 145 * gets low.
@@ -159,17 +147,16 @@ forget:
159 * @shrink: (ignored) 147 * @shrink: (ignored)
160 * @sc: shrink_control passed from reclaim 148 * @sc: shrink_control passed from reclaim
161 * 149 *
162 * Returns the number of objects which are present in the cache. 150 * Returns the number of objects freed.
163 */ 151 */
164static int 152static unsigned long
165mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) 153mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
166{ 154{
167 LIST_HEAD(free_list); 155 LIST_HEAD(free_list);
168 struct mb_cache *cache;
169 struct mb_cache_entry *entry, *tmp; 156 struct mb_cache_entry *entry, *tmp;
170 int count = 0;
171 int nr_to_scan = sc->nr_to_scan; 157 int nr_to_scan = sc->nr_to_scan;
172 gfp_t gfp_mask = sc->gfp_mask; 158 gfp_t gfp_mask = sc->gfp_mask;
159 unsigned long freed = 0;
173 160
174 mb_debug("trying to free %d entries", nr_to_scan); 161 mb_debug("trying to free %d entries", nr_to_scan);
175 spin_lock(&mb_cache_spinlock); 162 spin_lock(&mb_cache_spinlock);
@@ -179,19 +166,37 @@ mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
179 struct mb_cache_entry, e_lru_list); 166 struct mb_cache_entry, e_lru_list);
180 list_move_tail(&ce->e_lru_list, &free_list); 167 list_move_tail(&ce->e_lru_list, &free_list);
181 __mb_cache_entry_unhash(ce); 168 __mb_cache_entry_unhash(ce);
169 freed++;
170 }
171 spin_unlock(&mb_cache_spinlock);
172 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
173 __mb_cache_entry_forget(entry, gfp_mask);
182 } 174 }
175 return freed;
176}
177
178static unsigned long
179mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
180{
181 struct mb_cache *cache;
182 unsigned long count = 0;
183
184 spin_lock(&mb_cache_spinlock);
183 list_for_each_entry(cache, &mb_cache_list, c_cache_list) { 185 list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
184 mb_debug("cache %s (%d)", cache->c_name, 186 mb_debug("cache %s (%d)", cache->c_name,
185 atomic_read(&cache->c_entry_count)); 187 atomic_read(&cache->c_entry_count));
186 count += atomic_read(&cache->c_entry_count); 188 count += atomic_read(&cache->c_entry_count);
187 } 189 }
188 spin_unlock(&mb_cache_spinlock); 190 spin_unlock(&mb_cache_spinlock);
189 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { 191
190 __mb_cache_entry_forget(entry, gfp_mask); 192 return vfs_pressure_ratio(count);
191 }
192 return (count / 100) * sysctl_vfs_cache_pressure;
193} 193}
194 194
195static struct shrinker mb_cache_shrinker = {
196 .count_objects = mb_cache_shrink_count,
197 .scan_objects = mb_cache_shrink_scan,
198 .seeks = DEFAULT_SEEKS,
199};
195 200
196/* 201/*
197 * mb_cache_create() create a new cache 202 * mb_cache_create() create a new cache
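
The mbcache conversion is an instance of the general count/scan shrinker split: .count_objects gives a cheap population estimate while .scan_objects does the actual freeing and reports how many objects went away. A skeleton of that shape with a hypothetical cache (the demo_* names are placeholders, not a real API):

unsigned long demo_nr_entries(void);	/* hypothetical cache helpers */
unsigned long demo_evict(unsigned long nr, gfp_t gfp);

static unsigned long demo_count(struct shrinker *shrink,
				struct shrink_control *sc)
{
	/* cheap: report pressure-scaled population, free nothing */
	return vfs_pressure_ratio(demo_nr_entries());
}

static unsigned long demo_scan(struct shrinker *shrink,
			       struct shrink_control *sc)
{
	/* expensive: evict up to nr_to_scan entries, report freed count */
	return demo_evict(sc->nr_to_scan, sc->gfp_mask);
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count,
	.scan_objects	= demo_scan,
	.seeks		= DEFAULT_SEEKS,
};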
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df122496f328..0332109162a5 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -400,7 +400,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
400 struct inode *inode = mapping->host; 400 struct inode *inode = mapping->host;
401 401
402 if (to > inode->i_size) { 402 if (to > inode->i_size) {
403 truncate_pagecache(inode, to, inode->i_size); 403 truncate_pagecache(inode, inode->i_size);
404 minix_truncate(inode); 404 minix_truncate(inode);
405 } 405 }
406} 406}
diff --git a/fs/namei.c b/fs/namei.c
index 7720fbd5277b..645268f23eb6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -508,56 +508,78 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
508{ 508{
509 struct fs_struct *fs = current->fs; 509 struct fs_struct *fs = current->fs;
510 struct dentry *parent = nd->path.dentry; 510 struct dentry *parent = nd->path.dentry;
511 int want_root = 0;
512 511
513 BUG_ON(!(nd->flags & LOOKUP_RCU)); 512 BUG_ON(!(nd->flags & LOOKUP_RCU));
514 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { 513
515 want_root = 1; 514 /*
516 spin_lock(&fs->lock); 515 * Get a reference to the parent first: we're
517 if (nd->root.mnt != fs->root.mnt || 516 * going to make "path_put(nd->path)" valid in
518 nd->root.dentry != fs->root.dentry) 517 * non-RCU context for "terminate_walk()".
519 goto err_root; 518 *
520 } 519 * If this doesn't work, return immediately with
521 spin_lock(&parent->d_lock); 520 * RCU walking still active (and then we will do
521 * the RCU walk cleanup in terminate_walk()).
522 */
523 if (!lockref_get_not_dead(&parent->d_lockref))
524 return -ECHILD;
525
526 /*
 527 * After the mntget(), terminate_walk() will do
528 * the right thing for non-RCU mode, and all our
529 * subsequent exit cases should unlock_rcu_walk()
530 * before returning.
531 */
532 mntget(nd->path.mnt);
533 nd->flags &= ~LOOKUP_RCU;
534
535 /*
 536 * For a negative lookup, the lookup sequence point is the parent's
537 * sequence point, and it only needs to revalidate the parent dentry.
538 *
539 * For a positive lookup, we need to move both the parent and the
540 * dentry from the RCU domain to be properly refcounted. And the
541 * sequence number in the dentry validates *both* dentry counters,
542 * since we checked the sequence number of the parent after we got
543 * the child sequence number. So we know the parent must still
544 * be valid if the child sequence number is still valid.
545 */
522 if (!dentry) { 546 if (!dentry) {
523 if (!__d_rcu_to_refcount(parent, nd->seq)) 547 if (read_seqcount_retry(&parent->d_seq, nd->seq))
524 goto err_parent; 548 goto out;
525 BUG_ON(nd->inode != parent->d_inode); 549 BUG_ON(nd->inode != parent->d_inode);
526 } else { 550 } else {
527 if (dentry->d_parent != parent) 551 if (!lockref_get_not_dead(&dentry->d_lockref))
528 goto err_parent; 552 goto out;
529 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 553 if (read_seqcount_retry(&dentry->d_seq, nd->seq))
530 if (!__d_rcu_to_refcount(dentry, nd->seq)) 554 goto drop_dentry;
531 goto err_child;
532 /*
533 * If the sequence check on the child dentry passed, then
534 * the child has not been removed from its parent. This
535 * means the parent dentry must be valid and able to take
536 * a reference at this point.
537 */
538 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
539 BUG_ON(!parent->d_lockref.count);
540 parent->d_lockref.count++;
541 spin_unlock(&dentry->d_lock);
542 } 555 }
543 spin_unlock(&parent->d_lock); 556
544 if (want_root) { 557 /*
558 * Sequence counts matched. Now make sure that the root is
559 * still valid and get it if required.
560 */
561 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
562 spin_lock(&fs->lock);
563 if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
564 goto unlock_and_drop_dentry;
545 path_get(&nd->root); 565 path_get(&nd->root);
546 spin_unlock(&fs->lock); 566 spin_unlock(&fs->lock);
547 } 567 }
548 mntget(nd->path.mnt);
549 568
550 unlock_rcu_walk(); 569 unlock_rcu_walk();
551 nd->flags &= ~LOOKUP_RCU;
552 return 0; 570 return 0;
553 571
554err_child: 572unlock_and_drop_dentry:
555 spin_unlock(&dentry->d_lock); 573 spin_unlock(&fs->lock);
556err_parent: 574drop_dentry:
557 spin_unlock(&parent->d_lock); 575 unlock_rcu_walk();
558err_root: 576 dput(dentry);
559 if (want_root) 577 goto drop_root_mnt;
560 spin_unlock(&fs->lock); 578out:
579 unlock_rcu_walk();
580drop_root_mnt:
581 if (!(nd->flags & LOOKUP_ROOT))
582 nd->root.mnt = NULL;
561 return -ECHILD; 583 return -ECHILD;
562} 584}
563 585
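
The rewritten unlazy_walk() centres on one pattern: lockref_get_not_dead() takes a reference only if the dentry is not already on its way out, and read_seqcount_retry() then confirms the dentry did not change after the sequence number was sampled. Distilled to its core, using the same calls the hunk introduces (a sketch, not the full function):

/* Legitimize one RCU-sampled dentry: reference it, then revalidate. */
static int legitimize_sketch(struct dentry *dentry, unsigned seq)
{
	if (!lockref_get_not_dead(&dentry->d_lockref))
		return -ECHILD;		/* dead: fall back to ref-walk
					 * from scratch */
	if (read_seqcount_retry(&dentry->d_seq, seq)) {
		dput(dentry);		/* raced with a rename/unlink */
		return -ECHILD;
	}
	return 0;			/* refcounted and validated */
}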
@@ -585,14 +607,16 @@ static int complete_walk(struct nameidata *nd)
585 nd->flags &= ~LOOKUP_RCU; 607 nd->flags &= ~LOOKUP_RCU;
586 if (!(nd->flags & LOOKUP_ROOT)) 608 if (!(nd->flags & LOOKUP_ROOT))
587 nd->root.mnt = NULL; 609 nd->root.mnt = NULL;
588 spin_lock(&dentry->d_lock); 610
589 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { 611 if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
590 spin_unlock(&dentry->d_lock); 612 unlock_rcu_walk();
613 return -ECHILD;
614 }
615 if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
591 unlock_rcu_walk(); 616 unlock_rcu_walk();
617 dput(dentry);
592 return -ECHILD; 618 return -ECHILD;
593 } 619 }
594 BUG_ON(nd->inode != dentry->d_inode);
595 spin_unlock(&dentry->d_lock);
596 mntget(nd->path.mnt); 620 mntget(nd->path.mnt);
597 unlock_rcu_walk(); 621 unlock_rcu_walk();
598 } 622 }
@@ -636,29 +660,6 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
636 } 660 }
637} 661}
638 662
639static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
640{
641 int ret;
642
643 if (IS_ERR(link))
644 goto fail;
645
646 if (*link == '/') {
647 set_root(nd);
648 path_put(&nd->path);
649 nd->path = nd->root;
650 path_get(&nd->root);
651 nd->flags |= LOOKUP_JUMPED;
652 }
653 nd->inode = nd->path.dentry->d_inode;
654
655 ret = link_path_walk(link, nd);
656 return ret;
657fail:
658 path_put(&nd->path);
659 return PTR_ERR(link);
660}
661
662static void path_put_conditional(struct path *path, struct nameidata *nd) 663static void path_put_conditional(struct path *path, struct nameidata *nd)
663{ 664{
664 dput(path->dentry); 665 dput(path->dentry);
@@ -850,7 +851,20 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
850 error = 0; 851 error = 0;
851 s = nd_get_link(nd); 852 s = nd_get_link(nd);
852 if (s) { 853 if (s) {
853 error = __vfs_follow_link(nd, s); 854 if (unlikely(IS_ERR(s))) {
855 path_put(&nd->path);
856 put_link(nd, link, *p);
857 return PTR_ERR(s);
858 }
859 if (*s == '/') {
860 set_root(nd);
861 path_put(&nd->path);
862 nd->path = nd->root;
863 path_get(&nd->root);
864 nd->flags |= LOOKUP_JUMPED;
865 }
866 nd->inode = nd->path.dentry->d_inode;
867 error = link_path_walk(s, nd);
854 if (unlikely(error)) 868 if (unlikely(error))
855 put_link(nd, link, *p); 869 put_link(nd, link, *p);
856 } 870 }
@@ -2184,6 +2198,197 @@ user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
2184 return s; 2198 return s;
2185} 2199}
2186 2200
2201/**
2202 * mountpoint_last - look up last component for umount
2203 * @nd: pathwalk nameidata - currently pointing at parent directory of "last"
2204 * @path: pointer to container for result
2205 *
2206 * This is a special lookup_last function just for umount. In this case, we
2207 * need to resolve the path without doing any revalidation.
2208 *
2209 * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
2210 * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
2211 * in almost all cases, this lookup will be served out of the dcache. The only
2212 * cases where it won't are if nd->last refers to a symlink or the path is
2213 * bogus and it doesn't exist.
2214 *
2215 * Returns:
2216 * -error: if there was an error during lookup. This includes -ENOENT if the
2217 * lookup found a negative dentry. The nd->path reference will also be
2218 * put in this case.
2219 *
 2220 * 0: if we successfully resolved nd->path and found it not to be a
2221 * symlink that needs to be followed. "path" will also be populated.
2222 * The nd->path reference will also be put.
2223 *
2224 * 1: if we successfully resolved nd->last and found it to be a symlink
2225 * that needs to be followed. "path" will be populated with the path
2226 * to the link, and nd->path will *not* be put.
2227 */
2228static int
2229mountpoint_last(struct nameidata *nd, struct path *path)
2230{
2231 int error = 0;
2232 struct dentry *dentry;
2233 struct dentry *dir = nd->path.dentry;
2234
2235 /* If we're in rcuwalk, drop out of it to handle last component */
2236 if (nd->flags & LOOKUP_RCU) {
2237 if (unlazy_walk(nd, NULL)) {
2238 error = -ECHILD;
2239 goto out;
2240 }
2241 }
2242
2243 nd->flags &= ~LOOKUP_PARENT;
2244
2245 if (unlikely(nd->last_type != LAST_NORM)) {
2246 error = handle_dots(nd, nd->last_type);
2247 if (error)
2248 goto out;
2249 dentry = dget(nd->path.dentry);
2250 goto done;
2251 }
2252
2253 mutex_lock(&dir->d_inode->i_mutex);
2254 dentry = d_lookup(dir, &nd->last);
2255 if (!dentry) {
2256 /*
2257 * No cached dentry. Mounted dentries are pinned in the cache,
2258 * so that means that this dentry is probably a symlink or the
2259 * path doesn't actually point to a mounted dentry.
2260 */
2261 dentry = d_alloc(dir, &nd->last);
2262 if (!dentry) {
2263 error = -ENOMEM;
2264 mutex_unlock(&dir->d_inode->i_mutex);
2265 goto out;
2266 }
2267 dentry = lookup_real(dir->d_inode, dentry, nd->flags);
2268 error = PTR_ERR(dentry);
2269 if (IS_ERR(dentry)) {
2270 mutex_unlock(&dir->d_inode->i_mutex);
2271 goto out;
2272 }
2273 }
2274 mutex_unlock(&dir->d_inode->i_mutex);
2275
2276done:
2277 if (!dentry->d_inode) {
2278 error = -ENOENT;
2279 dput(dentry);
2280 goto out;
2281 }
2282 path->dentry = dentry;
2283 path->mnt = mntget(nd->path.mnt);
2284 if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW))
2285 return 1;
2286 follow_mount(path);
2287 error = 0;
2288out:
2289 terminate_walk(nd);
2290 return error;
2291}
2292
2293/**
2294 * path_mountpoint - look up a path to be umounted
2295 * @dfd: directory file descriptor to start walk from
2296 * @name: full pathname to walk
2297 * @flags: lookup flags
2298 *
2299 * Look up the given name, but don't attempt to revalidate the last component.
 2300 * Returns 0 and "path" will be valid on success; returns an error otherwise.
2301 */
2302static int
2303path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
2304{
2305 struct file *base = NULL;
2306 struct nameidata nd;
2307 int err;
2308
2309 err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
2310 if (unlikely(err))
2311 return err;
2312
2313 current->total_link_count = 0;
2314 err = link_path_walk(name, &nd);
2315 if (err)
2316 goto out;
2317
2318 err = mountpoint_last(&nd, path);
2319 while (err > 0) {
2320 void *cookie;
2321 struct path link = *path;
2322 err = may_follow_link(&link, &nd);
2323 if (unlikely(err))
2324 break;
2325 nd.flags |= LOOKUP_PARENT;
2326 err = follow_link(&link, &nd, &cookie);
2327 if (err)
2328 break;
2329 err = mountpoint_last(&nd, path);
2330 put_link(&nd, &link, cookie);
2331 }
2332out:
2333 if (base)
2334 fput(base);
2335
2336 if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
2337 path_put(&nd.root);
2338
2339 return err;
2340}
2341
2342static int
2343filename_mountpoint(int dfd, struct filename *s, struct path *path,
2344 unsigned int flags)
2345{
2346 int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
2347 if (unlikely(error == -ECHILD))
2348 error = path_mountpoint(dfd, s->name, path, flags);
2349 if (unlikely(error == -ESTALE))
2350 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
2351 if (likely(!error))
2352 audit_inode(s, path->dentry, 0);
2353 return error;
2354}
2355
2356/**
2357 * user_path_mountpoint_at - lookup a path from userland in order to umount it
2358 * @dfd: directory file descriptor
2359 * @name: pathname from userland
2360 * @flags: lookup flags
2361 * @path: pointer to container to hold result
2362 *
2363 * A umount is a special case for path walking. We're not actually interested
2364 * in the inode in this situation, and ESTALE errors can be a problem. We
2365 * simply want track down the dentry and vfsmount attached at the mountpoint
2366 * and avoid revalidating the last component.
2367 *
2368 * Returns 0 and populates "path" on success.
2369 */
2370int
2371user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2372 struct path *path)
2373{
2374 struct filename *s = getname(name);
2375 int error;
2376 if (IS_ERR(s))
2377 return PTR_ERR(s);
2378 error = filename_mountpoint(dfd, s, path, flags);
2379 putname(s);
2380 return error;
2381}
2382
2383int
2384kern_path_mountpoint(int dfd, const char *name, struct path *path,
2385 unsigned int flags)
2386{
2387 struct filename s = {.name = name};
2388 return filename_mountpoint(dfd, &s, path, flags);
2389}
2390EXPORT_SYMBOL(kern_path_mountpoint);
2391
2187/* 2392/*
2188 * It's inline, so penalty for filesystems that don't use sticky bit is 2393 * It's inline, so penalty for filesystems that don't use sticky bit is
2189 * minimal. 2394 * minimal.
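
Usage of the new helper is straightforward; the umount() conversion later in this diff does essentially the following (a sketch of the call, not the full syscall body):

static int umount_resolve_sketch(const char __user *name, struct path *path)
{
	/* resolves to the mounted dentry/vfsmount pair without
	 * revalidating or instantiating the final component */
	return user_path_mountpoint_at(AT_FDCWD, name, LOOKUP_FOLLOW, path);
}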
@@ -2451,6 +2656,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2451 int acc_mode; 2656 int acc_mode;
2452 int create_error = 0; 2657 int create_error = 0;
2453 struct dentry *const DENTRY_NOT_SET = (void *) -1UL; 2658 struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
2659 bool excl;
2454 2660
2455 BUG_ON(dentry->d_inode); 2661 BUG_ON(dentry->d_inode);
2456 2662
@@ -2464,10 +2670,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2464 if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) 2670 if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
2465 mode &= ~current_umask(); 2671 mode &= ~current_umask();
2466 2672
2467 if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { 2673 excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
2674 if (excl)
2468 open_flag &= ~O_TRUNC; 2675 open_flag &= ~O_TRUNC;
2469 *opened |= FILE_CREATED;
2470 }
2471 2676
2472 /* 2677 /*
 2473 * Checking write permission is tricky, because we don't know if we are 2678
@@ -2520,12 +2725,6 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2520 goto out; 2725 goto out;
2521 } 2726 }
2522 2727
2523 acc_mode = op->acc_mode;
2524 if (*opened & FILE_CREATED) {
2525 fsnotify_create(dir, dentry);
2526 acc_mode = MAY_OPEN;
2527 }
2528
2529 if (error) { /* returned 1, that is */ 2728 if (error) { /* returned 1, that is */
2530 if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { 2729 if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
2531 error = -EIO; 2730 error = -EIO;
@@ -2535,9 +2734,19 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2535 dput(dentry); 2734 dput(dentry);
2536 dentry = file->f_path.dentry; 2735 dentry = file->f_path.dentry;
2537 } 2736 }
2538 if (create_error && dentry->d_inode == NULL) { 2737 if (*opened & FILE_CREATED)
2539 error = create_error; 2738 fsnotify_create(dir, dentry);
2540 goto out; 2739 if (!dentry->d_inode) {
2740 WARN_ON(*opened & FILE_CREATED);
2741 if (create_error) {
2742 error = create_error;
2743 goto out;
2744 }
2745 } else {
2746 if (excl && !(*opened & FILE_CREATED)) {
2747 error = -EEXIST;
2748 goto out;
2749 }
2541 } 2750 }
2542 goto looked_up; 2751 goto looked_up;
2543 } 2752 }
@@ -2546,6 +2755,12 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2546 * We didn't have the inode before the open, so check open permission 2755 * We didn't have the inode before the open, so check open permission
2547 * here. 2756 * here.
2548 */ 2757 */
2758 acc_mode = op->acc_mode;
2759 if (*opened & FILE_CREATED) {
2760 WARN_ON(!(open_flag & O_CREAT));
2761 fsnotify_create(dir, dentry);
2762 acc_mode = MAY_OPEN;
2763 }
2549 error = may_open(&file->f_path, acc_mode, open_flag); 2764 error = may_open(&file->f_path, acc_mode, open_flag);
2550 if (error) 2765 if (error)
2551 fput(file); 2766 fput(file);
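
The atomic_open() reshuffle defers the FILE_CREATED bookkeeping until after ->atomic_open() returns, so the VFS can enforce O_EXCL itself instead of trusting the filesystem. The invariant the new code checks is roughly the following; this is an interpretive sketch, not the full function:

/* With O_CREAT|O_EXCL, the fs must actually have created the file:
 * an existing inode without FILE_CREATED set means -EEXIST.  (A set
 * FILE_CREATED with no inode is a fs bug; the real code WARNs.) */
static int check_excl_sketch(bool excl, int opened, struct inode *inode)
{
	if (inode && excl && !(opened & FILE_CREATED))
		return -EEXIST;		/* fs opened an existing file */
	return 0;
}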
@@ -4024,11 +4239,6 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
4024 return res; 4239 return res;
4025} 4240}
4026 4241
4027int vfs_follow_link(struct nameidata *nd, const char *link)
4028{
4029 return __vfs_follow_link(nd, link);
4030}
4031
4032/* get the link contents into pagecache */ 4242/* get the link contents into pagecache */
4033static char *page_getlink(struct dentry * dentry, struct page **ppage) 4243static char *page_getlink(struct dentry * dentry, struct page **ppage)
4034{ 4244{
@@ -4140,7 +4350,6 @@ EXPORT_SYMBOL(vfs_path_lookup);
4140EXPORT_SYMBOL(inode_permission); 4350EXPORT_SYMBOL(inode_permission);
4141EXPORT_SYMBOL(unlock_rename); 4351EXPORT_SYMBOL(unlock_rename);
4142EXPORT_SYMBOL(vfs_create); 4352EXPORT_SYMBOL(vfs_create);
4143EXPORT_SYMBOL(vfs_follow_link);
4144EXPORT_SYMBOL(vfs_link); 4353EXPORT_SYMBOL(vfs_link);
4145EXPORT_SYMBOL(vfs_mkdir); 4354EXPORT_SYMBOL(vfs_mkdir);
4146EXPORT_SYMBOL(vfs_mknod); 4355EXPORT_SYMBOL(vfs_mknod);
diff --git a/fs/namespace.c b/fs/namespace.c
index a45ba4f267fe..da5c49483430 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,7 +17,7 @@
17#include <linux/security.h> 17#include <linux/security.h>
18#include <linux/idr.h> 18#include <linux/idr.h>
19#include <linux/acct.h> /* acct_auto_close_mnt */ 19#include <linux/acct.h> /* acct_auto_close_mnt */
20#include <linux/ramfs.h> /* init_rootfs */ 20#include <linux/init.h> /* init_rootfs */
21#include <linux/fs_struct.h> /* get_fs_root et.al. */ 21#include <linux/fs_struct.h> /* get_fs_root et.al. */
22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
@@ -611,6 +611,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
611{ 611{
612 struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry); 612 struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
613 struct mountpoint *mp; 613 struct mountpoint *mp;
614 int ret;
614 615
615 list_for_each_entry(mp, chain, m_hash) { 616 list_for_each_entry(mp, chain, m_hash) {
616 if (mp->m_dentry == dentry) { 617 if (mp->m_dentry == dentry) {
@@ -626,14 +627,12 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
626 if (!mp) 627 if (!mp)
627 return ERR_PTR(-ENOMEM); 628 return ERR_PTR(-ENOMEM);
628 629
629 spin_lock(&dentry->d_lock); 630 ret = d_set_mounted(dentry);
630 if (d_unlinked(dentry)) { 631 if (ret) {
631 spin_unlock(&dentry->d_lock);
632 kfree(mp); 632 kfree(mp);
633 return ERR_PTR(-ENOENT); 633 return ERR_PTR(ret);
634 } 634 }
635 dentry->d_flags |= DCACHE_MOUNTED; 635
636 spin_unlock(&dentry->d_lock);
637 mp->m_dentry = dentry; 636 mp->m_dentry = dentry;
638 mp->m_count = 1; 637 mp->m_count = 1;
639 list_add(&mp->m_hash, chain); 638 list_add(&mp->m_hash, chain);
@@ -831,6 +830,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
831 if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) 830 if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
832 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; 831 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
833 832
833 /* Don't allow unprivileged users to reveal what is under a mount */
834 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
835 mnt->mnt.mnt_flags |= MNT_LOCKED;
836
834 atomic_inc(&sb->s_active); 837 atomic_inc(&sb->s_active);
835 mnt->mnt.mnt_sb = sb; 838 mnt->mnt.mnt_sb = sb;
836 mnt->mnt.mnt_root = dget(root); 839 mnt->mnt.mnt_root = dget(root);
@@ -1318,7 +1321,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1318 if (!(flags & UMOUNT_NOFOLLOW)) 1321 if (!(flags & UMOUNT_NOFOLLOW))
1319 lookup_flags |= LOOKUP_FOLLOW; 1322 lookup_flags |= LOOKUP_FOLLOW;
1320 1323
1321 retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); 1324 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
1322 if (retval) 1325 if (retval)
1323 goto out; 1326 goto out;
1324 mnt = real_mount(path.mnt); 1327 mnt = real_mount(path.mnt);
@@ -1327,6 +1330,8 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1327 goto dput_and_out; 1330 goto dput_and_out;
1328 if (!check_mnt(mnt)) 1331 if (!check_mnt(mnt))
1329 goto dput_and_out; 1332 goto dput_and_out;
1333 if (mnt->mnt.mnt_flags & MNT_LOCKED)
1334 goto dput_and_out;
1330 1335
1331 retval = do_umount(mnt, flags); 1336 retval = do_umount(mnt, flags);
1332dput_and_out: 1337dput_and_out:
@@ -1349,14 +1354,11 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
1349 1354
1350#endif 1355#endif
1351 1356
1352static bool mnt_ns_loop(struct path *path) 1357static bool is_mnt_ns_file(struct dentry *dentry)
1353{ 1358{
1354 /* Could bind mounting the mount namespace inode cause a 1359 /* Is this a proxy for a mount namespace? */
1355 * mount namespace loop? 1360 struct inode *inode = dentry->d_inode;
1356 */
1357 struct inode *inode = path->dentry->d_inode;
1358 struct proc_ns *ei; 1361 struct proc_ns *ei;
1359 struct mnt_namespace *mnt_ns;
1360 1362
1361 if (!proc_ns_inode(inode)) 1363 if (!proc_ns_inode(inode))
1362 return false; 1364 return false;
@@ -1365,7 +1367,19 @@ static bool mnt_ns_loop(struct path *path)
1365 if (ei->ns_ops != &mntns_operations) 1367 if (ei->ns_ops != &mntns_operations)
1366 return false; 1368 return false;
1367 1369
1368 mnt_ns = ei->ns; 1370 return true;
1371}
1372
1373static bool mnt_ns_loop(struct dentry *dentry)
1374{
1375 /* Could bind mounting the mount namespace inode cause a
1376 * mount namespace loop?
1377 */
1378 struct mnt_namespace *mnt_ns;
1379 if (!is_mnt_ns_file(dentry))
1380 return false;
1381
1382 mnt_ns = get_proc_ns(dentry->d_inode)->ns;
1369 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1383 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1370} 1384}
1371 1385
@@ -1374,13 +1388,17 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1374{ 1388{
1375 struct mount *res, *p, *q, *r, *parent; 1389 struct mount *res, *p, *q, *r, *parent;
1376 1390
1377 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) 1391 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1392 return ERR_PTR(-EINVAL);
1393
1394 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1378 return ERR_PTR(-EINVAL); 1395 return ERR_PTR(-EINVAL);
1379 1396
1380 res = q = clone_mnt(mnt, dentry, flag); 1397 res = q = clone_mnt(mnt, dentry, flag);
1381 if (IS_ERR(q)) 1398 if (IS_ERR(q))
1382 return q; 1399 return q;
1383 1400
1401 q->mnt.mnt_flags &= ~MNT_LOCKED;
1384 q->mnt_mountpoint = mnt->mnt_mountpoint; 1402 q->mnt_mountpoint = mnt->mnt_mountpoint;
1385 1403
1386 p = mnt; 1404 p = mnt;
@@ -1390,7 +1408,13 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1390 continue; 1408 continue;
1391 1409
1392 for (s = r; s; s = next_mnt(s, r)) { 1410 for (s = r; s; s = next_mnt(s, r)) {
1393 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { 1411 if (!(flag & CL_COPY_UNBINDABLE) &&
1412 IS_MNT_UNBINDABLE(s)) {
1413 s = skip_mnt_tree(s);
1414 continue;
1415 }
1416 if (!(flag & CL_COPY_MNT_NS_FILE) &&
1417 is_mnt_ns_file(s->mnt.mnt_root)) {
1394 s = skip_mnt_tree(s); 1418 s = skip_mnt_tree(s);
1395 continue; 1419 continue;
1396 } 1420 }
@@ -1696,6 +1720,19 @@ static int do_change_type(struct path *path, int flag)
1696 return err; 1720 return err;
1697} 1721}
1698 1722
1723static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
1724{
1725 struct mount *child;
1726 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
1727 if (!is_subdir(child->mnt_mountpoint, dentry))
1728 continue;
1729
1730 if (child->mnt.mnt_flags & MNT_LOCKED)
1731 return true;
1732 }
1733 return false;
1734}
1735
1699/* 1736/*
1700 * do loopback mount. 1737 * do loopback mount.
1701 */ 1738 */
@@ -1713,7 +1750,7 @@ static int do_loopback(struct path *path, const char *old_name,
1713 return err; 1750 return err;
1714 1751
1715 err = -EINVAL; 1752 err = -EINVAL;
1716 if (mnt_ns_loop(&old_path)) 1753 if (mnt_ns_loop(old_path.dentry))
1717 goto out; 1754 goto out;
1718 1755
1719 mp = lock_mount(path); 1756 mp = lock_mount(path);
@@ -1731,8 +1768,11 @@ static int do_loopback(struct path *path, const char *old_name,
1731 if (!check_mnt(parent) || !check_mnt(old)) 1768 if (!check_mnt(parent) || !check_mnt(old))
1732 goto out2; 1769 goto out2;
1733 1770
1771 if (!recurse && has_locked_children(old, old_path.dentry))
1772 goto out2;
1773
1734 if (recurse) 1774 if (recurse)
1735 mnt = copy_tree(old, old_path.dentry, 0); 1775 mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
1736 else 1776 else
1737 mnt = clone_mnt(old, old_path.dentry, 0); 1777 mnt = clone_mnt(old, old_path.dentry, 0);
1738 1778
@@ -1741,6 +1781,8 @@ static int do_loopback(struct path *path, const char *old_name,
1741 goto out2; 1781 goto out2;
1742 } 1782 }
1743 1783
1784 mnt->mnt.mnt_flags &= ~MNT_LOCKED;
1785
1744 err = graft_tree(mnt, parent, mp); 1786 err = graft_tree(mnt, parent, mp);
1745 if (err) { 1787 if (err) {
1746 br_write_lock(&vfsmount_lock); 1788 br_write_lock(&vfsmount_lock);
@@ -1853,6 +1895,9 @@ static int do_move_mount(struct path *path, const char *old_name)
1853 if (!check_mnt(p) || !check_mnt(old)) 1895 if (!check_mnt(p) || !check_mnt(old))
1854 goto out1; 1896 goto out1;
1855 1897
1898 if (old->mnt.mnt_flags & MNT_LOCKED)
1899 goto out1;
1900
1856 err = -EINVAL; 1901 err = -EINVAL;
1857 if (old_path.dentry != old_path.mnt->mnt_root) 1902 if (old_path.dentry != old_path.mnt->mnt_root)
1858 goto out1; 1903 goto out1;
@@ -2389,7 +2434,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2389 2434
2390 namespace_lock(); 2435 namespace_lock();
2391 /* First pass: copy the tree topology */ 2436 /* First pass: copy the tree topology */
2392 copy_flags = CL_COPY_ALL | CL_EXPIRE; 2437 copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
2393 if (user_ns != mnt_ns->user_ns) 2438 if (user_ns != mnt_ns->user_ns)
2394 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; 2439 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
2395 new = copy_tree(old, old->mnt.mnt_root, copy_flags); 2440 new = copy_tree(old, old->mnt.mnt_root, copy_flags);
@@ -2424,6 +2469,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2424 } 2469 }
2425 p = next_mnt(p, old); 2470 p = next_mnt(p, old);
2426 q = next_mnt(q, new); 2471 q = next_mnt(q, new);
2472 if (!q)
2473 break;
2474 while (p->mnt.mnt_root != q->mnt.mnt_root)
2475 p = next_mnt(p, old);
2427 } 2476 }
2428 namespace_unlock(); 2477 namespace_unlock();
2429 2478
@@ -2630,6 +2679,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2630 goto out4; 2679 goto out4;
2631 if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) 2680 if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
2632 goto out4; 2681 goto out4;
2682 if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
2683 goto out4;
2633 error = -ENOENT; 2684 error = -ENOENT;
2634 if (d_unlinked(new.dentry)) 2685 if (d_unlinked(new.dentry))
2635 goto out4; 2686 goto out4;
@@ -2653,6 +2704,10 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2653 br_write_lock(&vfsmount_lock); 2704 br_write_lock(&vfsmount_lock);
2654 detach_mnt(new_mnt, &parent_path); 2705 detach_mnt(new_mnt, &parent_path);
2655 detach_mnt(root_mnt, &root_parent); 2706 detach_mnt(root_mnt, &root_parent);
2707 if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
2708 new_mnt->mnt.mnt_flags |= MNT_LOCKED;
2709 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2710 }
2656 /* mount old root on put_old */ 2711 /* mount old root on put_old */
2657 attach_mnt(root_mnt, old_mnt, old_mp); 2712 attach_mnt(root_mnt, old_mnt, old_mp);
2658 /* mount new_root on / */ 2713 /* mount new_root on / */
@@ -2811,25 +2866,38 @@ bool current_chrooted(void)
2811 return chrooted; 2866 return chrooted;
2812} 2867}
2813 2868
2814void update_mnt_policy(struct user_namespace *userns) 2869bool fs_fully_visible(struct file_system_type *type)
2815{ 2870{
2816 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 2871 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
2817 struct mount *mnt; 2872 struct mount *mnt;
2873 bool visible = false;
2818 2874
2819 down_read(&namespace_sem); 2875 if (unlikely(!ns))
2876 return false;
2877
2878 namespace_lock();
2820 list_for_each_entry(mnt, &ns->list, mnt_list) { 2879 list_for_each_entry(mnt, &ns->list, mnt_list) {
2821 switch (mnt->mnt.mnt_sb->s_magic) { 2880 struct mount *child;
2822 case SYSFS_MAGIC: 2881 if (mnt->mnt.mnt_sb->s_type != type)
2823 userns->may_mount_sysfs = true; 2882 continue;
2824 break; 2883
2825 case PROC_SUPER_MAGIC: 2884 /* This mount is not fully visible if there are any child mounts
2826 userns->may_mount_proc = true; 2885 * that cover anything except for empty directories.
2827 break; 2886 */
2887 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2888 struct inode *inode = child->mnt_mountpoint->d_inode;
2889 if (!S_ISDIR(inode->i_mode))
2890 goto next;
2891 if (inode->i_nlink != 2)
2892 goto next;
2828 } 2893 }
2829 if (userns->may_mount_sysfs && userns->may_mount_proc) 2894 visible = true;
2830 break; 2895 goto found;
2896 next: ;
2831 } 2897 }
2832 up_read(&namespace_sem); 2898found:
2899 namespace_unlock();
2900 return visible;
2833} 2901}
2834 2902
2835static void *mntns_get(struct task_struct *task) 2903static void *mntns_get(struct task_struct *task)
@@ -2860,8 +2928,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns)
2860 struct path root; 2928 struct path root;
2861 2929
2862 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || 2930 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
2863 !nsown_capable(CAP_SYS_CHROOT) || 2931 !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
2864 !nsown_capable(CAP_SYS_ADMIN)) 2932 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
2865 return -EPERM; 2933 return -EPERM;
2866 2934
2867 if (fs->users != 1) 2935 if (fs->users != 1)
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index e0bb048e9576..03192a66c143 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,9 +4,10 @@
4 4
5obj-$(CONFIG_NFS_FS) += nfs.o 5obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7CFLAGS_nfstrace.o += -I$(src)
7nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ 8nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
8 direct.o pagelist.o read.o symlink.o unlink.o \ 9 direct.o pagelist.o read.o symlink.o unlink.o \
9 write.o namespace.o mount_clnt.o 10 write.o namespace.o mount_clnt.o nfstrace.o
10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
11nfs-$(CONFIG_SYSCTL) += sysctl.o 12nfs-$(CONFIG_SYSCTL) += sysctl.o
12nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 13nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -19,12 +20,14 @@ nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o
19nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o 20nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
20 21
21obj-$(CONFIG_NFS_V4) += nfsv4.o 22obj-$(CONFIG_NFS_V4) += nfsv4.o
23CFLAGS_nfs4trace.o += -I$(src)
22nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ 24nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
23 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ 25 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
24 nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o 26 nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \
27 dns_resolve.o nfs4trace.o
25nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o 28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o 30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
28 31
29obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o 32obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
30nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 33nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
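
Those Makefile hunks follow the standard tracepoint build recipe: the generated trace header sits in the source directory (hence CFLAGS_...=-I$(src)), and exactly one translation unit defines CREATE_TRACE_POINTS before including it so the tracepoint bodies are emitted once. The expected shape of nfstrace.c is shown below; the file itself is outside this diff, so its contents are an assumption:

/* nfstrace.c -- assumed contents: instantiate the tracepoints that
 * nfstrace.h declares as stubs everywhere else. */
#define CREATE_TRACE_POINTS
#include "nfstrace.h"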
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e6ebc4c38c81..ae2e87b95453 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -15,6 +15,7 @@
15#include "internal.h" 15#include "internal.h"
16#include "pnfs.h" 16#include "pnfs.h"
17#include "nfs4session.h" 17#include "nfs4session.h"
18#include "nfs4trace.h"
18 19
19#ifdef NFS_DEBUG 20#ifdef NFS_DEBUG
20#define NFSDBG_FACILITY NFSDBG_CALLBACK 21#define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -93,6 +94,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
93 default: 94 default:
94 res = htonl(NFS4ERR_RESOURCE); 95 res = htonl(NFS4ERR_RESOURCE);
95 } 96 }
97 trace_nfs4_recall_delegation(inode, -ntohl(res));
96 iput(inode); 98 iput(inode);
97out: 99out:
98 dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); 100 dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
@@ -301,14 +303,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
301{ 303{
302 struct nfs4_slot *slot; 304 struct nfs4_slot *slot;
303 305
304 dprintk("%s enter. slotid %d seqid %d\n", 306 dprintk("%s enter. slotid %u seqid %u\n",
305 __func__, args->csa_slotid, args->csa_sequenceid); 307 __func__, args->csa_slotid, args->csa_sequenceid);
306 308
307 if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) 309 if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
308 return htonl(NFS4ERR_BADSLOT); 310 return htonl(NFS4ERR_BADSLOT);
309 311
310 slot = tbl->slots + args->csa_slotid; 312 slot = tbl->slots + args->csa_slotid;
311 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); 313 dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
312 314
313 /* Normal */ 315 /* Normal */
314 if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { 316 if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
@@ -318,7 +320,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
318 320
319 /* Replay */ 321 /* Replay */
320 if (args->csa_sequenceid == slot->seq_nr) { 322 if (args->csa_sequenceid == slot->seq_nr) {
321 dprintk("%s seqid %d is a replay\n", 323 dprintk("%s seqid %u is a replay\n",
322 __func__, args->csa_sequenceid); 324 __func__, args->csa_sequenceid);
323 /* Signal process_op to set this error on next op */ 325 /* Signal process_op to set this error on next op */
324 if (args->csa_cachethis == 0) 326 if (args->csa_cachethis == 0)
@@ -462,6 +464,7 @@ out:
462 } else 464 } else
463 res->csr_status = status; 465 res->csr_status = status;
464 466
467 trace_nfs4_cb_sequence(args, res, status);
465 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, 468 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
466 ntohl(status), ntohl(res->csr_status)); 469 ntohl(status), ntohl(res->csr_status));
467 return status; 470 return status;
@@ -518,7 +521,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
518 if (!cps->clp) /* set in cb_sequence */ 521 if (!cps->clp) /* set in cb_sequence */
519 goto out; 522 goto out;
520 523
521 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %d\n", 524 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %u\n",
522 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), 525 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
523 args->crsa_target_highest_slotid); 526 args->crsa_target_highest_slotid);
524 527
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 340b1eff0267..2dceee4db076 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -501,8 +501,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
501 &nn->nfs_client_list); 501 &nn->nfs_client_list);
502 spin_unlock(&nn->nfs_client_lock); 502 spin_unlock(&nn->nfs_client_lock);
503 new->cl_flags = cl_init->init_flags; 503 new->cl_flags = cl_init->init_flags;
504 return rpc_ops->init_client(new, timeparms, ip_addr, 504 return rpc_ops->init_client(new, timeparms, ip_addr);
505 authflavour);
506 } 505 }
507 506
508 spin_unlock(&nn->nfs_client_lock); 507 spin_unlock(&nn->nfs_client_lock);
@@ -694,13 +693,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
694 * @clp: nfs_client to initialise 693 * @clp: nfs_client to initialise
695 * @timeparms: timeout parameters for underlying RPC transport 694 * @timeparms: timeout parameters for underlying RPC transport
696 * @ip_addr: IP presentation address (not used) 695 * @ip_addr: IP presentation address (not used)
697 * @authflavor: authentication flavor for underlying RPC transport
698 * 696 *
699 * Returns pointer to an NFS client, or an ERR_PTR value. 697 * Returns pointer to an NFS client, or an ERR_PTR value.
700 */ 698 */
701struct nfs_client *nfs_init_client(struct nfs_client *clp, 699struct nfs_client *nfs_init_client(struct nfs_client *clp,
702 const struct rpc_timeout *timeparms, 700 const struct rpc_timeout *timeparms,
703 const char *ip_addr, rpc_authflavor_t authflavour) 701 const char *ip_addr)
704{ 702{
705 int error; 703 int error;
706 704
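With the authflavour parameter dropped, nfs_init_client() and the rpc_ops->init_client() hook share the same three-argument shape; the flavour already travels in the client initdata, so passing it again was redundant. A hypothetical caller under that assumption (function name illustrative):

/* illustrative only: the flavour comes from cl_init, not a parameter */
static struct nfs_client *init_client_example(struct nfs_client *new,
                                const struct nfs_client_initdata *cl_init,
                                const struct rpc_timeout *timeparms,
                                const char *ip_addr)
{
        new->cl_flags = cl_init->init_flags;
        return new->rpc_ops->init_client(new, timeparms, ip_addr);
}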
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7ec4814e298d..ef792f29f831 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -20,6 +20,7 @@
20#include "nfs4_fs.h" 20#include "nfs4_fs.h"
21#include "delegation.h" 21#include "delegation.h"
22#include "internal.h" 22#include "internal.h"
23#include "nfs4trace.h"
23 24
24static void nfs_free_delegation(struct nfs_delegation *delegation) 25static void nfs_free_delegation(struct nfs_delegation *delegation)
25{ 26{
@@ -160,6 +161,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
160 spin_unlock(&delegation->lock); 161 spin_unlock(&delegation->lock);
161 put_rpccred(oldcred); 162 put_rpccred(oldcred);
162 rcu_read_unlock(); 163 rcu_read_unlock();
164 trace_nfs4_reclaim_delegation(inode, res->delegation_type);
163 } else { 165 } else {
164 /* We appear to have raced with a delegation return. */ 166 /* We appear to have raced with a delegation return. */
165 spin_unlock(&delegation->lock); 167 spin_unlock(&delegation->lock);
@@ -344,6 +346,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
344 spin_lock(&inode->i_lock); 346 spin_lock(&inode->i_lock);
345 nfsi->cache_validity |= NFS_INO_REVAL_FORCED; 347 nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
346 spin_unlock(&inode->i_lock); 348 spin_unlock(&inode->i_lock);
349 trace_nfs4_set_delegation(inode, res->delegation_type);
347 350
348out: 351out:
349 spin_unlock(&clp->cl_lock); 352 spin_unlock(&clp->cl_lock);
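The new trace_nfs4_reclaim_delegation()/trace_nfs4_set_delegation() calls assume tracepoints declared in nfs4trace.h. A cut-down sketch of what such a declaration looks like (event name and fields here are illustrative, not the file's actual definition; a real trace header also needs the CREATE_TRACE_POINTS machinery):

#include <linux/tracepoint.h>

TRACE_EVENT(nfs4_set_delegation_example,
        TP_PROTO(const struct inode *inode, fmode_t type),
        TP_ARGS(inode, type),
        TP_STRUCT__entry(
                __field(u64, fileid)
                __field(unsigned int, type)
        ),
        TP_fast_assign(
                __entry->fileid = NFS_FILEID(inode);
                __entry->type = type;
        ),
        TP_printk("fileid=%llu type=%u", __entry->fileid, __entry->type)
);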
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e474ca2b2bfe..02b0df769e2d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -43,6 +43,8 @@
43#include "internal.h" 43#include "internal.h"
44#include "fscache.h" 44#include "fscache.h"
45 45
46#include "nfstrace.h"
47
46/* #define NFS_DEBUG_VERBOSE 1 */ 48/* #define NFS_DEBUG_VERBOSE 1 */
47 49
48static int nfs_opendir(struct inode *, struct file *); 50static int nfs_opendir(struct inode *, struct file *);
@@ -1100,7 +1102,9 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1100 if (IS_ERR(label)) 1102 if (IS_ERR(label))
1101 goto out_error; 1103 goto out_error;
1102 1104
1105 trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
1103 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); 1106 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1107 trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
1104 if (error) 1108 if (error)
1105 goto out_bad; 1109 goto out_bad;
1106 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1110 if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1135,14 +1139,13 @@ out_zap_parent:
1135 if (inode && S_ISDIR(inode->i_mode)) { 1139 if (inode && S_ISDIR(inode->i_mode)) {
1136 /* Purge readdir caches. */ 1140 /* Purge readdir caches. */
1137 nfs_zap_caches(inode); 1141 nfs_zap_caches(inode);
1138 /* If we have submounts, don't unhash ! */
1139 if (have_submounts(dentry))
1140 goto out_valid;
1141 if (dentry->d_flags & DCACHE_DISCONNECTED) 1142 if (dentry->d_flags & DCACHE_DISCONNECTED)
1142 goto out_valid; 1143 goto out_valid;
1143 shrink_dcache_parent(dentry);
1144 } 1144 }
1145 d_drop(dentry); 1145 /* If we have submounts, don't unhash ! */
1146 if (check_submounts_and_drop(dentry) != 0)
1147 goto out_valid;
1148
1146 dput(parent); 1149 dput(parent);
1147 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 1150 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
1148 __func__, dentry->d_parent->d_name.name, 1151 __func__, dentry->d_parent->d_name.name,
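The old have_submounts()/d_drop() sequence in the hunk above was racy: a mount could appear under the dentry between the check and the unhash, and unhashing a dentry with a mounted subtree disconnects that mount from the namespace. check_submounts_and_drop() folds the check and the d_drop() into one locked walk and returns non-zero when the unhash is unsafe. Condensed restatement (not a verbatim extract):

static int revalidate_invalidate_example(struct dentry *dentry,
                                         struct inode *inode)
{
        if (inode && S_ISDIR(inode->i_mode)) {
                nfs_zap_caches(inode);          /* purge readdir caches */
                if (dentry->d_flags & DCACHE_DISCONNECTED)
                        return 1;               /* keep: disconnected root */
        }
        /* unhash only when nothing is mounted below this dentry */
        if (check_submounts_and_drop(dentry) != 0)
                return 1;                       /* keep: busy submounts */
        return 0;                               /* invalidated */
}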
@@ -1313,6 +1316,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1313 1316
1314 parent = dentry->d_parent; 1317 parent = dentry->d_parent;
1315 /* Protect against concurrent sillydeletes */ 1318 /* Protect against concurrent sillydeletes */
1319 trace_nfs_lookup_enter(dir, dentry, flags);
1316 nfs_block_sillyrename(parent); 1320 nfs_block_sillyrename(parent);
1317 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); 1321 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1318 if (error == -ENOENT) 1322 if (error == -ENOENT)
@@ -1339,6 +1343,7 @@ no_entry:
1339 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1343 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1340out_unblock_sillyrename: 1344out_unblock_sillyrename:
1341 nfs_unblock_sillyrename(parent); 1345 nfs_unblock_sillyrename(parent);
1346 trace_nfs_lookup_exit(dir, dentry, flags, error);
1342 nfs4_label_free(label); 1347 nfs4_label_free(label);
1343out: 1348out:
1344 nfs_free_fattr(fattr); 1349 nfs_free_fattr(fattr);
@@ -1387,13 +1392,15 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
1387{ 1392{
1388 int err; 1393 int err;
1389 1394
1395 if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
1396 *opened |= FILE_CREATED;
1397
1390 err = finish_open(file, dentry, do_open, opened); 1398 err = finish_open(file, dentry, do_open, opened);
1391 if (err) 1399 if (err)
1392 goto out; 1400 goto out;
1393 nfs_file_set_open_context(file, ctx); 1401 nfs_file_set_open_context(file, ctx);
1394 1402
1395out: 1403out:
1396 put_nfs_open_context(ctx);
1397 return err; 1404 return err;
1398} 1405}
1399 1406
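Setting FILE_CREATED up front is safe because an open with O_CREAT|O_EXCL can only succeed by creating the file; flagging it before finish_open() lets the VFS open path treat the dentry as newly created. The test in isolation (wrapper name illustrative):

/* O_CREAT|O_EXCL succeeding implies the file did not exist before */
static void mark_created_if_exclusive(int open_flags, int *opened)
{
        if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
                *opened |= FILE_CREATED;
}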
@@ -1405,6 +1412,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1405 struct dentry *res; 1412 struct dentry *res;
1406 struct iattr attr = { .ia_valid = ATTR_OPEN }; 1413 struct iattr attr = { .ia_valid = ATTR_OPEN };
1407 struct inode *inode; 1414 struct inode *inode;
1415 unsigned int lookup_flags = 0;
1408 int err; 1416 int err;
1409 1417
1410 /* Expect a negative dentry */ 1418 /* Expect a negative dentry */
@@ -1413,6 +1421,10 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1413 dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n", 1421 dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n",
1414 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1422 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1415 1423
1424 err = nfs_check_flags(open_flags);
1425 if (err)
1426 return err;
1427
1416 /* NFS only supports OPEN on regular files */ 1428 /* NFS only supports OPEN on regular files */
1417 if ((open_flags & O_DIRECTORY)) { 1429 if ((open_flags & O_DIRECTORY)) {
1418 if (!d_unhashed(dentry)) { 1430 if (!d_unhashed(dentry)) {
@@ -1423,6 +1435,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1423 */ 1435 */
1424 return -ENOENT; 1436 return -ENOENT;
1425 } 1437 }
1438 lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY;
1426 goto no_open; 1439 goto no_open;
1427 } 1440 }
1428 1441
@@ -1443,12 +1456,14 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1443 if (IS_ERR(ctx)) 1456 if (IS_ERR(ctx))
1444 goto out; 1457 goto out;
1445 1458
1459 trace_nfs_atomic_open_enter(dir, ctx, open_flags);
1446 nfs_block_sillyrename(dentry->d_parent); 1460 nfs_block_sillyrename(dentry->d_parent);
1447 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); 1461 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened);
1448 nfs_unblock_sillyrename(dentry->d_parent); 1462 nfs_unblock_sillyrename(dentry->d_parent);
1449 if (IS_ERR(inode)) { 1463 if (IS_ERR(inode)) {
1450 put_nfs_open_context(ctx);
1451 err = PTR_ERR(inode); 1464 err = PTR_ERR(inode);
1465 trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
1466 put_nfs_open_context(ctx);
1452 switch (err) { 1467 switch (err) {
1453 case -ENOENT: 1468 case -ENOENT:
1454 d_drop(dentry); 1469 d_drop(dentry);
@@ -1469,11 +1484,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1469 } 1484 }
1470 1485
1471 err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened); 1486 err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
1487 trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
1488 put_nfs_open_context(ctx);
1472out: 1489out:
1473 return err; 1490 return err;
1474 1491
1475no_open: 1492no_open:
1476 res = nfs_lookup(dir, dentry, 0); 1493 res = nfs_lookup(dir, dentry, lookup_flags);
1477 err = PTR_ERR(res); 1494 err = PTR_ERR(res);
1478 if (IS_ERR(res)) 1495 if (IS_ERR(res))
1479 goto out; 1496 goto out;
@@ -1597,7 +1614,9 @@ int nfs_create(struct inode *dir, struct dentry *dentry,
1597 attr.ia_mode = mode; 1614 attr.ia_mode = mode;
1598 attr.ia_valid = ATTR_MODE; 1615 attr.ia_valid = ATTR_MODE;
1599 1616
1617 trace_nfs_create_enter(dir, dentry, open_flags);
1600 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); 1618 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
1619 trace_nfs_create_exit(dir, dentry, open_flags, error);
1601 if (error != 0) 1620 if (error != 0)
1602 goto out_err; 1621 goto out_err;
1603 return 0; 1622 return 0;
@@ -1625,7 +1644,9 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
1625 attr.ia_mode = mode; 1644 attr.ia_mode = mode;
1626 attr.ia_valid = ATTR_MODE; 1645 attr.ia_valid = ATTR_MODE;
1627 1646
1647 trace_nfs_mknod_enter(dir, dentry);
1628 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); 1648 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
1649 trace_nfs_mknod_exit(dir, dentry, status);
1629 if (status != 0) 1650 if (status != 0)
1630 goto out_err; 1651 goto out_err;
1631 return 0; 1652 return 0;
@@ -1649,7 +1670,9 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1649 attr.ia_valid = ATTR_MODE; 1670 attr.ia_valid = ATTR_MODE;
1650 attr.ia_mode = mode | S_IFDIR; 1671 attr.ia_mode = mode | S_IFDIR;
1651 1672
1673 trace_nfs_mkdir_enter(dir, dentry);
1652 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); 1674 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
1675 trace_nfs_mkdir_exit(dir, dentry, error);
1653 if (error != 0) 1676 if (error != 0)
1654 goto out_err; 1677 goto out_err;
1655 return 0; 1678 return 0;
@@ -1672,12 +1695,21 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
1672 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", 1695 dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
1673 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1696 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
1674 1697
1675 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); 1698 trace_nfs_rmdir_enter(dir, dentry);
1676 /* Ensure the VFS deletes this inode */ 1699 if (dentry->d_inode) {
1677 if (error == 0 && dentry->d_inode != NULL) 1700 nfs_wait_on_sillyrename(dentry);
1678 clear_nlink(dentry->d_inode); 1701 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
1679 else if (error == -ENOENT) 1702 /* Ensure the VFS deletes this inode */
1680 nfs_dentry_handle_enoent(dentry); 1703 switch (error) {
1704 case 0:
1705 clear_nlink(dentry->d_inode);
1706 break;
1707 case -ENOENT:
1708 nfs_dentry_handle_enoent(dentry);
1709 }
1710 } else
1711 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
1712 trace_nfs_rmdir_exit(dir, dentry, error);
1681 1713
1682 return error; 1714 return error;
1683} 1715}
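nfs_rmdir() now waits out in-flight sillyrenames on the directory before sending RMDIR; otherwise a temporary .nfsXXXX file could still exist on the server and the remove would spuriously fail as not-empty. Assumed shape of the wait helper (the real nfs_wait_on_sillyrename() lives in fs/nfs/unlink.c and waits on NFS_I(inode)->silly_count):

static void wait_on_sillyrename_example(struct dentry *dentry)
{
        struct nfs_inode *nfsi = NFS_I(dentry->d_inode);

        /* sleep until all pending .nfsXXXX renames beneath us finish */
        wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1);
}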
@@ -1705,6 +1737,7 @@ static int nfs_safe_remove(struct dentry *dentry)
1705 goto out; 1737 goto out;
1706 } 1738 }
1707 1739
1740 trace_nfs_remove_enter(dir, dentry);
1708 if (inode != NULL) { 1741 if (inode != NULL) {
1709 NFS_PROTO(inode)->return_delegation(inode); 1742 NFS_PROTO(inode)->return_delegation(inode);
1710 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1743 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
@@ -1714,6 +1747,7 @@ static int nfs_safe_remove(struct dentry *dentry)
1714 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1747 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1715 if (error == -ENOENT) 1748 if (error == -ENOENT)
1716 nfs_dentry_handle_enoent(dentry); 1749 nfs_dentry_handle_enoent(dentry);
1750 trace_nfs_remove_exit(dir, dentry, error);
1717out: 1751out:
1718 return error; 1752 return error;
1719} 1753}
@@ -1731,13 +1765,14 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
1731 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, 1765 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
1732 dir->i_ino, dentry->d_name.name); 1766 dir->i_ino, dentry->d_name.name);
1733 1767
1768 trace_nfs_unlink_enter(dir, dentry);
1734 spin_lock(&dentry->d_lock); 1769 spin_lock(&dentry->d_lock);
1735 if (d_count(dentry) > 1) { 1770 if (d_count(dentry) > 1) {
1736 spin_unlock(&dentry->d_lock); 1771 spin_unlock(&dentry->d_lock);
1737 /* Start asynchronous writeout of the inode */ 1772 /* Start asynchronous writeout of the inode */
1738 write_inode_now(dentry->d_inode, 0); 1773 write_inode_now(dentry->d_inode, 0);
1739 error = nfs_sillyrename(dir, dentry); 1774 error = nfs_sillyrename(dir, dentry);
1740 return error; 1775 goto out;
1741 } 1776 }
1742 if (!d_unhashed(dentry)) { 1777 if (!d_unhashed(dentry)) {
1743 __d_drop(dentry); 1778 __d_drop(dentry);
@@ -1749,6 +1784,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
1749 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1784 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1750 } else if (need_rehash) 1785 } else if (need_rehash)
1751 d_rehash(dentry); 1786 d_rehash(dentry);
1787out:
1788 trace_nfs_unlink_exit(dir, dentry, error);
1752 return error; 1789 return error;
1753} 1790}
1754EXPORT_SYMBOL_GPL(nfs_unlink); 1791EXPORT_SYMBOL_GPL(nfs_unlink);
@@ -1795,7 +1832,9 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1795 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); 1832 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
1796 kunmap_atomic(kaddr); 1833 kunmap_atomic(kaddr);
1797 1834
1835 trace_nfs_symlink_enter(dir, dentry);
1798 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); 1836 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
1837 trace_nfs_symlink_exit(dir, dentry, error);
1799 if (error != 0) { 1838 if (error != 0) {
1800 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", 1839 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
1801 dir->i_sb->s_id, dir->i_ino, 1840 dir->i_sb->s_id, dir->i_ino,
@@ -1830,6 +1869,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1830 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1869 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1831 dentry->d_parent->d_name.name, dentry->d_name.name); 1870 dentry->d_parent->d_name.name, dentry->d_name.name);
1832 1871
1872 trace_nfs_link_enter(inode, dir, dentry);
1833 NFS_PROTO(inode)->return_delegation(inode); 1873 NFS_PROTO(inode)->return_delegation(inode);
1834 1874
1835 d_drop(dentry); 1875 d_drop(dentry);
@@ -1838,6 +1878,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1838 ihold(inode); 1878 ihold(inode);
1839 d_add(dentry, inode); 1879 d_add(dentry, inode);
1840 } 1880 }
1881 trace_nfs_link_exit(inode, dir, dentry, error);
1841 return error; 1882 return error;
1842} 1883}
1843EXPORT_SYMBOL_GPL(nfs_link); 1884EXPORT_SYMBOL_GPL(nfs_link);
@@ -1879,6 +1920,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1879 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1920 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1880 d_count(new_dentry)); 1921 d_count(new_dentry));
1881 1922
1923 trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry);
1882 /* 1924 /*
1883 * For non-directories, check whether the target is busy and if so, 1925 * For non-directories, check whether the target is busy and if so,
1884 * make a copy of the dentry and then do a silly-rename. If the 1926 * make a copy of the dentry and then do a silly-rename. If the
@@ -1925,6 +1967,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1925out: 1967out:
1926 if (rehash) 1968 if (rehash)
1927 d_rehash(rehash); 1969 d_rehash(rehash);
1970 trace_nfs_rename_exit(old_dir, old_dentry,
1971 new_dir, new_dentry, error);
1928 if (!error) { 1972 if (!error) {
1929 if (new_inode != NULL) 1973 if (new_inode != NULL)
1930 nfs_drop_nlink(new_inode); 1974 nfs_drop_nlink(new_inode);
@@ -1965,17 +2009,18 @@ static void nfs_access_free_list(struct list_head *head)
1965 } 2009 }
1966} 2010}
1967 2011
1968int nfs_access_cache_shrinker(struct shrinker *shrink, 2012unsigned long
1969 struct shrink_control *sc) 2013nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
1970{ 2014{
1971 LIST_HEAD(head); 2015 LIST_HEAD(head);
1972 struct nfs_inode *nfsi, *next; 2016 struct nfs_inode *nfsi, *next;
1973 struct nfs_access_entry *cache; 2017 struct nfs_access_entry *cache;
1974 int nr_to_scan = sc->nr_to_scan; 2018 int nr_to_scan = sc->nr_to_scan;
1975 gfp_t gfp_mask = sc->gfp_mask; 2019 gfp_t gfp_mask = sc->gfp_mask;
2020 long freed = 0;
1976 2021
1977 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 2022 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1978 return (nr_to_scan == 0) ? 0 : -1; 2023 return SHRINK_STOP;
1979 2024
1980 spin_lock(&nfs_access_lru_lock); 2025 spin_lock(&nfs_access_lru_lock);
1981 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { 2026 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
@@ -1991,6 +2036,7 @@ int nfs_access_cache_shrinker(struct shrinker *shrink,
1991 struct nfs_access_entry, lru); 2036 struct nfs_access_entry, lru);
1992 list_move(&cache->lru, &head); 2037 list_move(&cache->lru, &head);
1993 rb_erase(&cache->rb_node, &nfsi->access_cache); 2038 rb_erase(&cache->rb_node, &nfsi->access_cache);
2039 freed++;
1994 if (!list_empty(&nfsi->access_cache_entry_lru)) 2040 if (!list_empty(&nfsi->access_cache_entry_lru))
1995 list_move_tail(&nfsi->access_cache_inode_lru, 2041 list_move_tail(&nfsi->access_cache_inode_lru,
1996 &nfs_access_lru_list); 2042 &nfs_access_lru_list);
@@ -2005,7 +2051,13 @@ remove_lru_entry:
2005 } 2051 }
2006 spin_unlock(&nfs_access_lru_lock); 2052 spin_unlock(&nfs_access_lru_lock);
2007 nfs_access_free_list(&head); 2053 nfs_access_free_list(&head);
2008 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; 2054 return freed;
2055}
2056
2057unsigned long
2058nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
2059{
2060 return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
2009} 2061}
2010 2062
2011static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) 2063static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
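This converts the access cache to the split shrinker API: ->count_objects() reports how many entries are reclaimable (via vfs_pressure_ratio()) and ->scan_objects() frees up to nr_to_scan of them, returning the count freed, or SHRINK_STOP when no progress is possible (here: callers without GFP_KERNEL). Registration then wires up both halves, roughly:

static struct shrinker nfs_access_shrinker_example = {
        .count_objects  = nfs_access_cache_count,
        .scan_objects   = nfs_access_cache_scan,
        .seeks          = DEFAULT_SEEKS,
};

static int __init shrinker_example_init(void)
{
        /* illustrative placement; the actual registration is elsewhere */
        return register_shrinker(&nfs_access_shrinker_example);
}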
@@ -2174,9 +2226,11 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2174 struct nfs_access_entry cache; 2226 struct nfs_access_entry cache;
2175 int status; 2227 int status;
2176 2228
2229 trace_nfs_access_enter(inode);
2230
2177 status = nfs_access_get_cached(inode, cred, &cache); 2231 status = nfs_access_get_cached(inode, cred, &cache);
2178 if (status == 0) 2232 if (status == 0)
2179 goto out; 2233 goto out_cached;
2180 2234
2181 /* Be clever: ask server to check for all possible rights */ 2235 /* Be clever: ask server to check for all possible rights */
2182 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; 2236 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
@@ -2189,13 +2243,15 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2189 if (!S_ISDIR(inode->i_mode)) 2243 if (!S_ISDIR(inode->i_mode))
2190 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); 2244 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
2191 } 2245 }
2192 return status; 2246 goto out;
2193 } 2247 }
2194 nfs_access_add_cache(inode, &cache); 2248 nfs_access_add_cache(inode, &cache);
2249out_cached:
2250 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0)
2251 status = -EACCES;
2195out: 2252out:
2196 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2253 trace_nfs_access_exit(inode, status);
2197 return 0; 2254 return status;
2198 return -EACCES;
2199} 2255}
2200 2256
2201static int nfs_open_permission_mask(int openflags) 2257static int nfs_open_permission_mask(int openflags)
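The restructured nfs_do_access() now sends both the cached and the freshly fetched result through one mask check, and the exit tracepoint sees the real status: access is denied exactly when the caller requests a right the server did not grant. The check in isolation:

/* deny if any requested right is missing from the granted mask */
static int access_mask_check_example(int requested, int granted)
{
        if ((requested & ~granted & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0)
                return -EACCES;
        return 0;
}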
@@ -2241,11 +2297,6 @@ int nfs_permission(struct inode *inode, int mask)
2241 case S_IFLNK: 2297 case S_IFLNK:
2242 goto out; 2298 goto out;
2243 case S_IFREG: 2299 case S_IFREG:
2244 /* NFSv4 has atomic_open... */
2245 if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
2246 && (mask & MAY_OPEN)
2247 && !(mask & MAY_EXEC))
2248 goto out;
2249 break; 2300 break;
2250 case S_IFDIR: 2301 case S_IFDIR:
2251 /* 2302 /*
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0bd7a55a5f07..91ff089d3412 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -130,7 +130,6 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
130 130
131 return -EINVAL; 131 return -EINVAL;
132#else 132#else
133 VM_BUG_ON(iocb->ki_left != PAGE_SIZE);
134 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); 133 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
135 134
136 if (rw == READ || rw == KERNEL_READ) 135 if (rw == READ || rw == KERNEL_READ)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 94e94bd11aae..1e6bfdbc1aff 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -37,6 +37,8 @@
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39 39
40#include "nfstrace.h"
41
40#define NFSDBG_FACILITY NFSDBG_FILE 42#define NFSDBG_FACILITY NFSDBG_FILE
41 43
42static const struct vm_operations_struct nfs_file_vm_ops; 44static const struct vm_operations_struct nfs_file_vm_ops;
@@ -294,6 +296,8 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
294 int ret; 296 int ret;
295 struct inode *inode = file_inode(file); 297 struct inode *inode = file_inode(file);
296 298
299 trace_nfs_fsync_enter(inode);
300
297 do { 301 do {
298 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 302 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
299 if (ret != 0) 303 if (ret != 0)
@@ -310,6 +314,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
310 end = LLONG_MAX; 314 end = LLONG_MAX;
311 } while (ret == -EAGAIN); 315 } while (ret == -EAGAIN);
312 316
317 trace_nfs_fsync_exit(inode, ret);
313 return ret; 318 return ret;
314} 319}
315 320
@@ -406,6 +411,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
406 struct page *page, void *fsdata) 411 struct page *page, void *fsdata)
407{ 412{
408 unsigned offset = pos & (PAGE_CACHE_SIZE - 1); 413 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
414 struct nfs_open_context *ctx = nfs_file_open_context(file);
409 int status; 415 int status;
410 416
411 dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", 417 dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
@@ -441,6 +447,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
441 if (status < 0) 447 if (status < 0)
442 return status; 448 return status;
443 NFS_I(mapping->host)->write_io += copied; 449 NFS_I(mapping->host)->write_io += copied;
450
451 if (nfs_ctx_key_to_expire(ctx)) {
452 status = nfs_wb_all(mapping->host);
453 if (status < 0)
454 return status;
455 }
456
444 return copied; 457 return copied;
445} 458}
446 459
@@ -637,7 +650,8 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
637 if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) 650 if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
638 return 1; 651 return 1;
639 ctx = nfs_file_open_context(filp); 652 ctx = nfs_file_open_context(filp);
640 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) 653 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
654 nfs_ctx_key_to_expire(ctx))
641 return 1; 655 return 1;
642 return 0; 656 return 0;
643} 657}
@@ -651,6 +665,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
651 ssize_t result; 665 ssize_t result;
652 size_t count = iov_length(iov, nr_segs); 666 size_t count = iov_length(iov, nr_segs);
653 667
668 result = nfs_key_timeout_notify(iocb->ki_filp, inode);
669 if (result)
670 return result;
671
654 if (iocb->ki_filp->f_flags & O_DIRECT) 672 if (iocb->ki_filp->f_flags & O_DIRECT)
655 return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); 673 return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
656 674
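All three file.c hunks serve one policy: when an open context's GSS key is close to expiring, flush dirty data while the credential can still sign the RPCs (nfs_wb_all() in write_end, a sync-write hint in nfs_need_sync_write(), and an early nfs_key_timeout_notify() check before writing). Summarised under those assumptions:

/* illustrative summary of the key-expiry write policy added above */
static int key_expiry_flush_example(struct nfs_open_context *ctx,
                                    struct address_space *mapping)
{
        if (nfs_ctx_key_to_expire(ctx))         /* key about to expire? */
                return nfs_wb_all(mapping->host);       /* flush it all now */
        return 0;                               /* otherwise defer as usual */
}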
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c2c4163d5683..567983d2c0eb 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -49,6 +49,7 @@
49 49
50#include "internal.h" 50#include "internal.h"
51#include "netns.h" 51#include "netns.h"
52#include "nfs4trace.h"
52 53
53#define NFS_UINT_MAXLEN 11 54#define NFS_UINT_MAXLEN 11
54 55
@@ -63,6 +64,7 @@ struct idmap_legacy_upcalldata {
63}; 64};
64 65
65struct idmap { 66struct idmap {
67 struct rpc_pipe_dir_object idmap_pdo;
66 struct rpc_pipe *idmap_pipe; 68 struct rpc_pipe *idmap_pipe;
67 struct idmap_legacy_upcalldata *idmap_upcall_data; 69 struct idmap_legacy_upcalldata *idmap_upcall_data;
68 struct mutex idmap_mutex; 70 struct mutex idmap_mutex;
@@ -310,7 +312,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
310 if (ret < 0) 312 if (ret < 0)
311 goto out_up; 313 goto out_up;
312 314
313 payload = rcu_dereference(rkey->payload.data); 315 payload = rcu_dereference(rkey->payload.rcudata);
314 if (IS_ERR_OR_NULL(payload)) { 316 if (IS_ERR_OR_NULL(payload)) {
315 ret = PTR_ERR(payload); 317 ret = PTR_ERR(payload);
316 goto out_up; 318 goto out_up;
@@ -401,16 +403,23 @@ static struct key_type key_type_id_resolver_legacy = {
401 .request_key = nfs_idmap_legacy_upcall, 403 .request_key = nfs_idmap_legacy_upcall,
402}; 404};
403 405
404static void __nfs_idmap_unregister(struct rpc_pipe *pipe) 406static void nfs_idmap_pipe_destroy(struct dentry *dir,
407 struct rpc_pipe_dir_object *pdo)
405{ 408{
406 if (pipe->dentry) 409 struct idmap *idmap = pdo->pdo_data;
410 struct rpc_pipe *pipe = idmap->idmap_pipe;
411
412 if (pipe->dentry) {
407 rpc_unlink(pipe->dentry); 413 rpc_unlink(pipe->dentry);
414 pipe->dentry = NULL;
415 }
408} 416}
409 417
410static int __nfs_idmap_register(struct dentry *dir, 418static int nfs_idmap_pipe_create(struct dentry *dir,
411 struct idmap *idmap, 419 struct rpc_pipe_dir_object *pdo)
412 struct rpc_pipe *pipe)
413{ 420{
421 struct idmap *idmap = pdo->pdo_data;
422 struct rpc_pipe *pipe = idmap->idmap_pipe;
414 struct dentry *dentry; 423 struct dentry *dentry;
415 424
416 dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); 425 dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
@@ -420,36 +429,10 @@ static int __nfs_idmap_register(struct dentry *dir,
420 return 0; 429 return 0;
421} 430}
422 431
423static void nfs_idmap_unregister(struct nfs_client *clp, 432static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = {
424 struct rpc_pipe *pipe) 433 .create = nfs_idmap_pipe_create,
425{ 434 .destroy = nfs_idmap_pipe_destroy,
426 struct net *net = clp->cl_net; 435};
427 struct super_block *pipefs_sb;
428
429 pipefs_sb = rpc_get_sb_net(net);
430 if (pipefs_sb) {
431 __nfs_idmap_unregister(pipe);
432 rpc_put_sb_net(net);
433 }
434}
435
436static int nfs_idmap_register(struct nfs_client *clp,
437 struct idmap *idmap,
438 struct rpc_pipe *pipe)
439{
440 struct net *net = clp->cl_net;
441 struct super_block *pipefs_sb;
442 int err = 0;
443
444 pipefs_sb = rpc_get_sb_net(net);
445 if (pipefs_sb) {
446 if (clp->cl_rpcclient->cl_dentry)
447 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
448 idmap, pipe);
449 rpc_put_sb_net(net);
450 }
451 return err;
452}
453 436
454int 437int
455nfs_idmap_new(struct nfs_client *clp) 438nfs_idmap_new(struct nfs_client *clp)
@@ -462,23 +445,31 @@ nfs_idmap_new(struct nfs_client *clp)
462 if (idmap == NULL) 445 if (idmap == NULL)
463 return -ENOMEM; 446 return -ENOMEM;
464 447
448 rpc_init_pipe_dir_object(&idmap->idmap_pdo,
449 &nfs_idmap_pipe_dir_object_ops,
450 idmap);
451
465 pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); 452 pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);
466 if (IS_ERR(pipe)) { 453 if (IS_ERR(pipe)) {
467 error = PTR_ERR(pipe); 454 error = PTR_ERR(pipe);
468 kfree(idmap); 455 goto err;
469 return error;
470 }
471 error = nfs_idmap_register(clp, idmap, pipe);
472 if (error) {
473 rpc_destroy_pipe_data(pipe);
474 kfree(idmap);
475 return error;
476 } 456 }
477 idmap->idmap_pipe = pipe; 457 idmap->idmap_pipe = pipe;
478 mutex_init(&idmap->idmap_mutex); 458 mutex_init(&idmap->idmap_mutex);
479 459
460 error = rpc_add_pipe_dir_object(clp->cl_net,
461 &clp->cl_rpcclient->cl_pipedir_objects,
462 &idmap->idmap_pdo);
463 if (error)
464 goto err_destroy_pipe;
465
480 clp->cl_idmap = idmap; 466 clp->cl_idmap = idmap;
481 return 0; 467 return 0;
468err_destroy_pipe:
469 rpc_destroy_pipe_data(idmap->idmap_pipe);
470err:
471 kfree(idmap);
472 return error;
482} 473}
483 474
484void 475void
@@ -488,130 +479,26 @@ nfs_idmap_delete(struct nfs_client *clp)
488 479
489 if (!idmap) 480 if (!idmap)
490 return; 481 return;
491 nfs_idmap_unregister(clp, idmap->idmap_pipe);
492 rpc_destroy_pipe_data(idmap->idmap_pipe);
493 clp->cl_idmap = NULL; 482 clp->cl_idmap = NULL;
483 rpc_remove_pipe_dir_object(clp->cl_net,
484 &clp->cl_rpcclient->cl_pipedir_objects,
485 &idmap->idmap_pdo);
486 rpc_destroy_pipe_data(idmap->idmap_pipe);
494 kfree(idmap); 487 kfree(idmap);
495} 488}
496 489
497static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event,
498 struct super_block *sb)
499{
500 int err = 0;
501
502 switch (event) {
503 case RPC_PIPEFS_MOUNT:
504 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
505 clp->cl_idmap,
506 clp->cl_idmap->idmap_pipe);
507 break;
508 case RPC_PIPEFS_UMOUNT:
509 if (clp->cl_idmap->idmap_pipe) {
510 struct dentry *parent;
511
512 parent = clp->cl_idmap->idmap_pipe->dentry->d_parent;
513 __nfs_idmap_unregister(clp->cl_idmap->idmap_pipe);
514 /*
515 * Note: This is a dirty hack. SUNRPC hook has been
516 * called already but simple_rmdir() call for the
517 * directory returned with error because of idmap pipe
518 * inside. Thus now we have to remove this directory
519 * here.
520 */
521 if (rpc_rmdir(parent))
522 printk(KERN_ERR "NFS: %s: failed to remove "
523 "clnt dir!\n", __func__);
524 }
525 break;
526 default:
527 printk(KERN_ERR "NFS: %s: unknown event: %ld\n", __func__,
528 event);
529 return -ENOTSUPP;
530 }
531 return err;
532}
533
534static struct nfs_client *nfs_get_client_for_event(struct net *net, int event)
535{
536 struct nfs_net *nn = net_generic(net, nfs_net_id);
537 struct dentry *cl_dentry;
538 struct nfs_client *clp;
539 int err;
540
541restart:
542 spin_lock(&nn->nfs_client_lock);
543 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
544 /* Wait for initialisation to finish */
545 if (clp->cl_cons_state == NFS_CS_INITING) {
546 atomic_inc(&clp->cl_count);
547 spin_unlock(&nn->nfs_client_lock);
548 err = nfs_wait_client_init_complete(clp);
549 nfs_put_client(clp);
550 if (err)
551 return NULL;
552 goto restart;
553 }
554 /* Skip nfs_clients that failed to initialise */
555 if (clp->cl_cons_state < 0)
556 continue;
557 smp_rmb();
558 if (clp->rpc_ops != &nfs_v4_clientops)
559 continue;
560 cl_dentry = clp->cl_idmap->idmap_pipe->dentry;
561 if (((event == RPC_PIPEFS_MOUNT) && cl_dentry) ||
562 ((event == RPC_PIPEFS_UMOUNT) && !cl_dentry))
563 continue;
564 atomic_inc(&clp->cl_count);
565 spin_unlock(&nn->nfs_client_lock);
566 return clp;
567 }
568 spin_unlock(&nn->nfs_client_lock);
569 return NULL;
570}
571
572static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
573 void *ptr)
574{
575 struct super_block *sb = ptr;
576 struct nfs_client *clp;
577 int error = 0;
578
579 if (!try_module_get(THIS_MODULE))
580 return 0;
581
582 while ((clp = nfs_get_client_for_event(sb->s_fs_info, event))) {
583 error = __rpc_pipefs_event(clp, event, sb);
584 nfs_put_client(clp);
585 if (error)
586 break;
587 }
588 module_put(THIS_MODULE);
589 return error;
590}
591
592#define PIPEFS_NFS_PRIO 1
593
594static struct notifier_block nfs_idmap_block = {
595 .notifier_call = rpc_pipefs_event,
596 .priority = SUNRPC_PIPEFS_NFS_PRIO,
597};
598
599int nfs_idmap_init(void) 490int nfs_idmap_init(void)
600{ 491{
601 int ret; 492 int ret;
602 ret = nfs_idmap_init_keyring(); 493 ret = nfs_idmap_init_keyring();
603 if (ret != 0) 494 if (ret != 0)
604 goto out; 495 goto out;
605 ret = rpc_pipefs_notifier_register(&nfs_idmap_block);
606 if (ret != 0)
607 nfs_idmap_quit_keyring();
608out: 496out:
609 return ret; 497 return ret;
610} 498}
611 499
612void nfs_idmap_quit(void) 500void nfs_idmap_quit(void)
613{ 501{
614 rpc_pipefs_notifier_unregister(&nfs_idmap_block);
615 nfs_idmap_quit_keyring(); 502 nfs_idmap_quit_keyring();
616} 503}
617 504
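The idmap pipe no longer runs its own rpc_pipefs notifier (all of the removed code above); instead it hangs a struct rpc_pipe_dir_object off the client's rpc_clnt and lets SUNRPC invoke the ->create()/->destroy() hooks itself when the pipefs directory appears or vanishes. The consumer pattern, reduced to its bones:

static const struct rpc_pipe_dir_object_ops example_pdo_ops = {
        .create         = nfs_idmap_pipe_create,        /* make the pipe dentry */
        .destroy        = nfs_idmap_pipe_destroy,       /* and unlink it again */
};

static int attach_pdo_example(struct nfs_client *clp, struct idmap *idmap)
{
        rpc_init_pipe_dir_object(&idmap->idmap_pdo, &example_pdo_ops, idmap);
        return rpc_add_pipe_dir_object(clp->cl_net,
                                       &clp->cl_rpcclient->cl_pipedir_objects,
                                       &idmap->idmap_pdo);
}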
@@ -849,6 +736,7 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_
849 if (!uid_valid(*uid)) 736 if (!uid_valid(*uid))
850 ret = -ERANGE; 737 ret = -ERANGE;
851 } 738 }
739 trace_nfs4_map_name_to_uid(name, namelen, id, ret);
852 return ret; 740 return ret;
853} 741}
854 742
@@ -865,6 +753,7 @@ int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size
865 if (!gid_valid(*gid)) 753 if (!gid_valid(*gid))
866 ret = -ERANGE; 754 ret = -ERANGE;
867 } 755 }
756 trace_nfs4_map_group_to_gid(name, namelen, id, ret);
868 return ret; 757 return ret;
869} 758}
870 759
@@ -879,6 +768,7 @@ int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf,
879 ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); 768 ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap);
880 if (ret < 0) 769 if (ret < 0)
881 ret = nfs_map_numeric_to_string(id, buf, buflen); 770 ret = nfs_map_numeric_to_string(id, buf, buflen);
771 trace_nfs4_map_uid_to_name(buf, ret, id, ret);
882 return ret; 772 return ret;
883} 773}
884int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) 774int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen)
@@ -892,5 +782,6 @@ int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf,
892 ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); 782 ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap);
893 if (ret < 0) 783 if (ret < 0)
894 ret = nfs_map_numeric_to_string(id, buf, buflen); 784 ret = nfs_map_numeric_to_string(id, buf, buflen);
785 trace_nfs4_map_gid_to_group(buf, ret, id, ret);
895 return ret; 786 return ret;
896} 787}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 941246f2b43d..eda8879171c4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -38,7 +38,6 @@
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/compat.h> 39#include <linux/compat.h>
40#include <linux/freezer.h> 40#include <linux/freezer.h>
41#include <linux/crc32.h>
42 41
43#include <asm/uaccess.h> 42#include <asm/uaccess.h>
44 43
@@ -52,6 +51,8 @@
52#include "nfs.h" 51#include "nfs.h"
53#include "netns.h" 52#include "netns.h"
54 53
54#include "nfstrace.h"
55
55#define NFSDBG_FACILITY NFSDBG_VFS 56#define NFSDBG_FACILITY NFSDBG_VFS
56 57
57#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 58#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
@@ -503,6 +504,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
503 if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) 504 if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
504 return 0; 505 return 0;
505 506
507 trace_nfs_setattr_enter(inode);
508
506 /* Write all dirty data */ 509 /* Write all dirty data */
507 if (S_ISREG(inode->i_mode)) { 510 if (S_ISREG(inode->i_mode)) {
508 nfs_inode_dio_wait(inode); 511 nfs_inode_dio_wait(inode);
@@ -522,6 +525,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
522 error = nfs_refresh_inode(inode, fattr); 525 error = nfs_refresh_inode(inode, fattr);
523 nfs_free_fattr(fattr); 526 nfs_free_fattr(fattr);
524out: 527out:
528 trace_nfs_setattr_exit(inode, error);
525 return error; 529 return error;
526} 530}
527EXPORT_SYMBOL_GPL(nfs_setattr); 531EXPORT_SYMBOL_GPL(nfs_setattr);
@@ -537,7 +541,6 @@ EXPORT_SYMBOL_GPL(nfs_setattr);
537 */ 541 */
538static int nfs_vmtruncate(struct inode * inode, loff_t offset) 542static int nfs_vmtruncate(struct inode * inode, loff_t offset)
539{ 543{
540 loff_t oldsize;
541 int err; 544 int err;
542 545
543 err = inode_newsize_ok(inode, offset); 546 err = inode_newsize_ok(inode, offset);
@@ -545,11 +548,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
545 goto out; 548 goto out;
546 549
547 spin_lock(&inode->i_lock); 550 spin_lock(&inode->i_lock);
548 oldsize = inode->i_size;
549 i_size_write(inode, offset); 551 i_size_write(inode, offset);
550 spin_unlock(&inode->i_lock); 552 spin_unlock(&inode->i_lock);
551 553
552 truncate_pagecache(inode, oldsize, offset); 554 truncate_pagecache(inode, offset);
553out: 555out:
554 return err; 556 return err;
555} 557}
@@ -591,6 +593,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
591 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; 593 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
592 int err; 594 int err;
593 595
596 trace_nfs_getattr_enter(inode);
594 /* Flush out writes to the server in order to update c/mtime. */ 597 /* Flush out writes to the server in order to update c/mtime. */
595 if (S_ISREG(inode->i_mode)) { 598 if (S_ISREG(inode->i_mode)) {
596 nfs_inode_dio_wait(inode); 599 nfs_inode_dio_wait(inode);
@@ -621,6 +624,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
621 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); 624 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
622 } 625 }
623out: 626out:
627 trace_nfs_getattr_exit(inode, err);
624 return err; 628 return err;
625} 629}
626EXPORT_SYMBOL_GPL(nfs_getattr); 630EXPORT_SYMBOL_GPL(nfs_getattr);
@@ -875,6 +879,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
875 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 879 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
876 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 880 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
877 881
882 trace_nfs_revalidate_inode_enter(inode);
883
878 if (is_bad_inode(inode)) 884 if (is_bad_inode(inode))
879 goto out; 885 goto out;
880 if (NFS_STALE(inode)) 886 if (NFS_STALE(inode))
@@ -925,6 +931,7 @@ err_out:
925 nfs4_label_free(label); 931 nfs4_label_free(label);
926out: 932out:
927 nfs_free_fattr(fattr); 933 nfs_free_fattr(fattr);
934 trace_nfs_revalidate_inode_exit(inode, status);
928 return status; 935 return status;
929} 936}
930 937
@@ -981,6 +988,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
981 spin_unlock(&inode->i_lock); 988 spin_unlock(&inode->i_lock);
982 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 989 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
983 nfs_fscache_wait_on_invalidate(inode); 990 nfs_fscache_wait_on_invalidate(inode);
991
984 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 992 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
985 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 993 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
986 return 0; 994 return 0;
@@ -1014,8 +1022,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1014 if (ret < 0) 1022 if (ret < 0)
1015 goto out; 1023 goto out;
1016 } 1024 }
1017 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) 1025 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
1026 trace_nfs_invalidate_mapping_enter(inode);
1018 ret = nfs_invalidate_mapping(inode, mapping); 1027 ret = nfs_invalidate_mapping(inode, mapping);
1028 trace_nfs_invalidate_mapping_exit(inode, ret);
1029 }
1030
1019out: 1031out:
1020 return ret; 1032 return ret;
1021} 1033}
@@ -1195,7 +1207,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
1195{ 1207{
1196 /* wireshark uses 32-bit AUTODIN crc and does a bitwise 1208 /* wireshark uses 32-bit AUTODIN crc and does a bitwise
1197 * not on the result */ 1209 * not on the result */
1198 return ~crc32(0xFFFFFFFF, &fh->data[0], fh->size); 1210 return nfs_fhandle_hash(fh);
1199} 1211}
1200 1212
1201/* 1213/*
@@ -1274,9 +1286,17 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
1274 1286
1275static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) 1287static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
1276{ 1288{
1289 int ret;
1290
1291 trace_nfs_refresh_inode_enter(inode);
1292
1277 if (nfs_inode_attrs_need_update(inode, fattr)) 1293 if (nfs_inode_attrs_need_update(inode, fattr))
1278 return nfs_update_inode(inode, fattr); 1294 ret = nfs_update_inode(inode, fattr);
1279 return nfs_check_inode_attributes(inode, fattr); 1295 else
1296 ret = nfs_check_inode_attributes(inode, fattr);
1297
1298 trace_nfs_refresh_inode_exit(inode, ret);
1299 return ret;
1280} 1300}
1281 1301
1282/** 1302/**
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3c8373f90ab3..38da8c2b81ac 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,7 @@
5#include "nfs4_fs.h" 5#include "nfs4_fs.h"
6#include <linux/mount.h> 6#include <linux/mount.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/crc32.h>
8 9
9#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) 10#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
10 11
@@ -185,6 +186,8 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
185 int ds_addrlen, int ds_proto, 186 int ds_addrlen, int ds_proto,
186 unsigned int ds_timeo, 187 unsigned int ds_timeo,
187 unsigned int ds_retrans); 188 unsigned int ds_retrans);
189extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
190 struct inode *);
188#ifdef CONFIG_PROC_FS 191#ifdef CONFIG_PROC_FS
189extern int __init nfs_fs_proc_init(void); 192extern int __init nfs_fs_proc_init(void);
190extern void nfs_fs_proc_exit(void); 193extern void nfs_fs_proc_exit(void);
@@ -267,11 +270,13 @@ extern struct rpc_procinfo nfs4_procedures[];
267void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 270void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
268extern struct nfs_client *nfs_init_client(struct nfs_client *clp, 271extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
269 const struct rpc_timeout *timeparms, 272 const struct rpc_timeout *timeparms,
270 const char *ip_addr, rpc_authflavor_t authflavour); 273 const char *ip_addr);
271 274
272/* dir.c */ 275/* dir.c */
273extern int nfs_access_cache_shrinker(struct shrinker *shrink, 276extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
274 struct shrink_control *sc); 277 struct shrink_control *sc);
278extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
279 struct shrink_control *sc);
275struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); 280struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
276int nfs_create(struct inode *, struct dentry *, umode_t, bool); 281int nfs_create(struct inode *, struct dentry *, umode_t, bool);
277int nfs_mkdir(struct inode *, struct dentry *, umode_t); 282int nfs_mkdir(struct inode *, struct dentry *, umode_t);
@@ -355,7 +360,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
355extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, 360extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
356 const char *); 361 const char *);
357 362
358extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); 363extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool);
359#endif 364#endif
360 365
361struct nfs_pgio_completion_ops; 366struct nfs_pgio_completion_ops;
@@ -430,6 +435,8 @@ void nfs_request_remove_commit_list(struct nfs_page *req,
430void nfs_init_cinfo(struct nfs_commit_info *cinfo, 435void nfs_init_cinfo(struct nfs_commit_info *cinfo,
431 struct inode *inode, 436 struct inode *inode,
432 struct nfs_direct_req *dreq); 437 struct nfs_direct_req *dreq);
438int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
439bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
433 440
434#ifdef CONFIG_MIGRATION 441#ifdef CONFIG_MIGRATION
435extern int nfs_migrate_page(struct address_space *, 442extern int nfs_migrate_page(struct address_space *,
@@ -451,8 +458,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
451extern void __nfs4_read_done_cb(struct nfs_read_data *); 458extern void __nfs4_read_done_cb(struct nfs_read_data *);
452extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, 459extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
453 const struct rpc_timeout *timeparms, 460 const struct rpc_timeout *timeparms,
454 const char *ip_addr, 461 const char *ip_addr);
455 rpc_authflavor_t authflavour);
456extern int nfs40_walk_client_list(struct nfs_client *clp, 462extern int nfs40_walk_client_list(struct nfs_client *clp,
457 struct nfs_client **result, 463 struct nfs_client **result,
458 struct rpc_cred *cred); 464 struct rpc_cred *cred);
@@ -575,3 +581,22 @@ u64 nfs_timespec_to_change_attr(const struct timespec *ts)
575{ 581{
576 return ((u64)ts->tv_sec << 30) + ts->tv_nsec; 582 return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
577} 583}
584
585#ifdef CONFIG_CRC32
586/**
587 * nfs_fhandle_hash - calculate the crc32 hash for the filehandle
588 * @fh - pointer to filehandle
589 *
590 * returns a crc32 hash for the filehandle that is compatible with
591 * the one displayed by "wireshark".
592 */
593static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
594{
595 return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
596}
597#else
598static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
599{
600 return 0;
601}
602#endif
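nfs_fhandle_hash() centralises the wireshark-compatible CRC-32 of a filehandle and degrades to 0 when CONFIG_CRC32 is off, which is why inode.c could drop its own crc32.h include. A typical caller (illustrative):

static void log_fhandle_example(const struct nfs_fh *fh)
{
        printk(KERN_DEBUG "NFS: fh hash = 0x%08x\n", nfs_fhandle_hash(fh));
}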
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f5c84c3efbca..90cb10d7b693 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -336,8 +336,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
336 data->arg.create.createmode = NFS3_CREATE_UNCHECKED; 336 data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
337 if (flags & O_EXCL) { 337 if (flags & O_EXCL) {
338 data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; 338 data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE;
339 data->arg.create.verifier[0] = jiffies; 339 data->arg.create.verifier[0] = cpu_to_be32(jiffies);
340 data->arg.create.verifier[1] = current->pid; 340 data->arg.create.verifier[1] = cpu_to_be32(current->pid);
341 } 341 }
342 342
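The verifier change is an endianness fix: the exclusive-create verifier is a pair of __be32 words on the wire, so assigning raw jiffies/pid values mixed host and big-endian byte order (and tripped sparse). cpu_to_be32() makes the conversion explicit; the verifier only has to be unique, not meaningful:

/* __be32 on the wire: byte order explicit, content just needs uniqueness */
static void fill_create_verifier_example(__be32 verf[2])
{
        verf[0] = cpu_to_be32(jiffies);
        verf[1] = cpu_to_be32(current->pid);
}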
343 sattr->ia_mode &= ~current_umask(); 343 sattr->ia_mode &= ~current_umask();
@@ -826,9 +826,10 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message
826 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; 826 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
827} 827}
828 828
829static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) 829static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
830{ 830{
831 rpc_call_start(task); 831 rpc_call_start(task);
832 return 0;
832} 833}
833 834
834static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) 835static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -847,9 +848,10 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
847 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; 848 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
848} 849}
849 850
850static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) 851static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
851{ 852{
852 rpc_call_start(task); 853 rpc_call_start(task);
854 return 0;
853} 855}
854 856
855static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) 857static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
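Letting the ->rpc_prepare() hooks return int gives callers a way to abort an RPC before it is transmitted (v4 can fail here when no usable stateid exists); the v3 hooks simply start the call and report success. Assumed caller shape under that reading (error choice illustrative):

static void read_prepare_example(struct rpc_task *task,
                                 struct nfs_read_data *data)
{
        if (NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data))
                rpc_exit(task, -EIO);   /* abort instead of transmitting */
}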
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ee81e354bce7..28842abafab4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -38,17 +38,15 @@ struct nfs4_minor_version_ops {
38 u32 minor_version; 38 u32 minor_version;
39 unsigned init_caps; 39 unsigned init_caps;
40 40
41 int (*call_sync)(struct rpc_clnt *clnt, 41 int (*init_client)(struct nfs_client *);
42 struct nfs_server *server, 42 void (*shutdown_client)(struct nfs_client *);
43 struct rpc_message *msg,
44 struct nfs4_sequence_args *args,
45 struct nfs4_sequence_res *res);
46 bool (*match_stateid)(const nfs4_stateid *, 43 bool (*match_stateid)(const nfs4_stateid *,
47 const nfs4_stateid *); 44 const nfs4_stateid *);
48 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, 45 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
49 struct nfs_fsinfo *); 46 struct nfs_fsinfo *);
50 int (*free_lock_state)(struct nfs_server *, 47 int (*free_lock_state)(struct nfs_server *,
51 struct nfs4_lock_state *); 48 struct nfs4_lock_state *);
49 const struct rpc_call_ops *call_sync_ops;
52 const struct nfs4_state_recovery_ops *reboot_recovery_ops; 50 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
53 const struct nfs4_state_recovery_ops *nograce_recovery_ops; 51 const struct nfs4_state_recovery_ops *nograce_recovery_ops;
54 const struct nfs4_state_maintenance_ops *state_renewal_ops; 52 const struct nfs4_state_maintenance_ops *state_renewal_ops;
@@ -135,6 +133,7 @@ struct nfs4_lock_state {
135 struct list_head ls_locks; /* Other lock stateids */ 133 struct list_head ls_locks; /* Other lock stateids */
136 struct nfs4_state * ls_state; /* Pointer to open state */ 134 struct nfs4_state * ls_state; /* Pointer to open state */
137#define NFS_LOCK_INITIALIZED 0 135#define NFS_LOCK_INITIALIZED 0
136#define NFS_LOCK_LOST 1
138 unsigned long ls_flags; 137 unsigned long ls_flags;
139 struct nfs_seqid_counter ls_seqid; 138 struct nfs_seqid_counter ls_seqid;
140 nfs4_stateid ls_stateid; 139 nfs4_stateid ls_stateid;
@@ -193,7 +192,6 @@ struct nfs4_state_recovery_ops {
193 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); 192 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
194 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 193 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
195 int (*establish_clid)(struct nfs_client *, struct rpc_cred *); 194 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
196 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
197 int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *); 195 int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
198 int (*detect_trunking)(struct nfs_client *, struct nfs_client **, 196 int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
199 struct rpc_cred *); 197 struct rpc_cred *);
@@ -223,7 +221,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
223/* nfs4proc.c */ 221/* nfs4proc.c */
224extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 222extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
225extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 223extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
226extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 224extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
227extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred); 225extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred);
228extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 226extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
229extern int nfs4_destroy_clientid(struct nfs_client *clp); 227extern int nfs4_destroy_clientid(struct nfs_client *clp);
@@ -248,9 +246,6 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
248 return server->nfs_client->cl_session; 246 return server->nfs_client->cl_session;
249} 247}
250 248
251extern int nfs4_setup_sequence(const struct nfs_server *server,
252 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
253 struct rpc_task *task);
254extern int nfs41_setup_sequence(struct nfs4_session *session, 249extern int nfs41_setup_sequence(struct nfs4_session *session,
255 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 250 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
256 struct rpc_task *task); 251 struct rpc_task *task);
@@ -273,18 +268,63 @@ is_ds_client(struct nfs_client *clp)
273{ 268{
274 return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; 269 return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
275} 270}
276#else /* CONFIG_NFS_v4_1 */ 271
277static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 272static inline bool
273_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
274 struct rpc_clnt **clntp, struct rpc_message *msg)
278{ 275{
279 return NULL; 276 struct rpc_cred *newcred = NULL;
277 rpc_authflavor_t flavor;
278
279 if (test_bit(sp4_mode, &clp->cl_sp4_flags)) {
280 spin_lock(&clp->cl_lock);
281 if (clp->cl_machine_cred != NULL)
282 /* don't call get_rpccred on the machine cred -
283 * a reference will be held for life of clp */
284 newcred = clp->cl_machine_cred;
285 spin_unlock(&clp->cl_lock);
286 msg->rpc_cred = newcred;
287
288 flavor = clp->cl_rpcclient->cl_auth->au_flavor;
289 WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I &&
290 flavor != RPC_AUTH_GSS_KRB5P);
291 *clntp = clp->cl_rpcclient;
292
293 return true;
294 }
295 return false;
280} 296}
281 297
282static inline int nfs4_setup_sequence(const struct nfs_server *server, 298/*
283 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 299 * Function responsible for determining if an rpc_message should use the
284 struct rpc_task *task) 300 * machine cred under SP4_MACH_CRED and if so switching the credential and
301 * authflavor (using the nfs_client's rpc_clnt which will be krb5i/p).
302 * Should be called before rpc_call_sync/rpc_call_async.
303 */
304static inline void
305nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
306 struct rpc_clnt **clntp, struct rpc_message *msg)
285{ 307{
286 rpc_call_start(task); 308 _nfs4_state_protect(clp, sp4_mode, clntp, msg);
287 return 0; 309}
310
311/*
312 * Special wrapper to nfs4_state_protect for write.
313 * If WRITE can use machine cred but COMMIT cannot, make sure all writes
314 * that use machine cred use NFS_FILE_SYNC.
315 */
316static inline void
317nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
318 struct rpc_message *msg, struct nfs_write_data *wdata)
319{
320 if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
321 !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
322 wdata->args.stable = NFS_FILE_SYNC;
323}
324#else /* CONFIG_NFS_v4_1 */
325static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
326{
327 return NULL;
288} 328}
289 329
290static inline bool 330static inline bool
@@ -298,6 +338,18 @@ is_ds_client(struct nfs_client *clp)
298{ 338{
299 return false; 339 return false;
300} 340}
341
342static inline void
343nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
344 struct rpc_clnt **clntp, struct rpc_message *msg)
345{
346}
347
348static inline void
349nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
350 struct rpc_message *msg, struct nfs_write_data *wdata)
351{
352}
301#endif /* CONFIG_NFS_V4_1 */ 353#endif /* CONFIG_NFS_V4_1 */
302 354
303extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 355extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
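nfs4_state_protect() is the SP4_MACH_CRED switch point: when the server granted machine-credential protection for an operation class, the message is re-pointed at the machine cred and at the krb5i/krb5p rpc_clnt before the call goes out, and nfs4_state_protect_write() additionally forces NFS_FILE_SYNC when WRITE is protected but COMMIT is not. A call-site sketch (operation bit chosen for illustration):

static int protected_rpc_example(struct nfs_server *server,
                                 struct rpc_message *msg)
{
        struct rpc_clnt *clnt = server->client;

        /* may swap in the machine cred and the krb5i/p transport */
        nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
                           &clnt, msg);
        return rpc_call_sync(clnt, msg, 0);
}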
@@ -308,6 +360,10 @@ extern const u32 nfs4_pathconf_bitmap[3];
308extern const u32 nfs4_fsinfo_bitmap[3]; 360extern const u32 nfs4_fsinfo_bitmap[3];
309extern const u32 nfs4_fs_locations_bitmap[3]; 361extern const u32 nfs4_fs_locations_bitmap[3];
310 362
363void nfs40_shutdown_client(struct nfs_client *);
364void nfs41_shutdown_client(struct nfs_client *);
365int nfs40_init_client(struct nfs_client *);
366int nfs41_init_client(struct nfs_client *);
311void nfs4_free_client(struct nfs_client *); 367void nfs4_free_client(struct nfs_client *);
312 368
313struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); 369struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *);
@@ -319,7 +375,7 @@ extern void nfs4_kill_renewd(struct nfs_client *);
319extern void nfs4_renew_state(struct work_struct *); 375extern void nfs4_renew_state(struct work_struct *);
320 376
321/* nfs4state.c */ 377/* nfs4state.c */
322struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); 378struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
323struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); 379struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
324struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); 380struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
325int nfs4_discover_server_trunking(struct nfs_client *clp, 381int nfs4_discover_server_trunking(struct nfs_client *clp,
@@ -327,7 +383,6 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
327int nfs40_discover_server_trunking(struct nfs_client *clp, 383int nfs40_discover_server_trunking(struct nfs_client *clp,
328 struct nfs_client **, struct rpc_cred *); 384 struct nfs_client **, struct rpc_cred *);
329#if defined(CONFIG_NFS_V4_1) 385#if defined(CONFIG_NFS_V4_1)
330struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
331int nfs41_discover_server_trunking(struct nfs_client *clp, 386int nfs41_discover_server_trunking(struct nfs_client *clp,
332 struct nfs_client **, struct rpc_cred *); 387 struct nfs_client **, struct rpc_cred *);
333extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); 388extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
@@ -382,6 +437,7 @@ struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct
382extern bool nfs4_disable_idmapping; 437extern bool nfs4_disable_idmapping;
383extern unsigned short max_session_slots; 438extern unsigned short max_session_slots;
384extern unsigned short send_implementation_id; 439extern unsigned short send_implementation_id;
440extern bool recover_lost_locks;
385 441
386#define NFS4_CLIENT_ID_UNIQ_LEN (64) 442#define NFS4_CLIENT_ID_UNIQ_LEN (64)
387extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; 443extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
@@ -429,6 +485,8 @@ static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
429 485
430#define nfs4_close_state(a, b) do { } while (0) 486#define nfs4_close_state(a, b) do { } while (0)
431#define nfs4_close_sync(a, b) do { } while (0) 487#define nfs4_close_sync(a, b) do { } while (0)
488#define nfs4_state_protect(a, b, c, d) do { } while (0)
489#define nfs4_state_protect_write(a, b, c, d) do { } while (0)
432 490
433#endif /* CONFIG_NFS_V4 */ 491#endif /* CONFIG_NFS_V4 */
434#endif /* __LINUX_FS_NFS_NFS4_FS.H */ 492#endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 90dce91dd5b5..a860ab566d6e 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -41,19 +41,138 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
41} 41}
42 42
43#ifdef CONFIG_NFS_V4_1 43#ifdef CONFIG_NFS_V4_1
44static void nfs4_shutdown_session(struct nfs_client *clp) 44/**
45 * Per-auth-flavor data server rpc clients
46 */
47struct nfs4_ds_server {
48 struct list_head list; /* ds_clp->cl_ds_clients */
49 struct rpc_clnt *rpc_clnt;
50};
51
52/**
53 * Common lookup case for DS I/O
54 */
55static struct nfs4_ds_server *
56nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
57{
58 struct nfs4_ds_server *dss;
59
60 rcu_read_lock();
61 list_for_each_entry_rcu(dss, &ds_clp->cl_ds_clients, list) {
62 if (dss->rpc_clnt->cl_auth->au_flavor != flavor)
63 continue;
64 goto out;
65 }
66 dss = NULL;
67out:
68 rcu_read_unlock();
69 return dss;
70}
71
72static struct nfs4_ds_server *
73nfs4_add_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor,
74 struct nfs4_ds_server *new)
75{
76 struct nfs4_ds_server *dss;
77
78 spin_lock(&ds_clp->cl_lock);
79 list_for_each_entry(dss, &ds_clp->cl_ds_clients, list) {
80 if (dss->rpc_clnt->cl_auth->au_flavor != flavor)
81 continue;
82 goto out;
83 }
84 if (new)
85 list_add_rcu(&new->list, &ds_clp->cl_ds_clients);
86 dss = new;
87out:
88 spin_unlock(&ds_clp->cl_lock); /* cl_lock serializes insertions into the list */
89 return dss;
90}
91
92static struct nfs4_ds_server *
93nfs4_alloc_ds_server(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
94{
95 struct nfs4_ds_server *dss;
96
97 dss = kmalloc(sizeof(*dss), GFP_NOFS);
98 if (dss == NULL)
99 return ERR_PTR(-ENOMEM);
100
101 dss->rpc_clnt = rpc_clone_client_set_auth(ds_clp->cl_rpcclient, flavor);
102 if (IS_ERR(dss->rpc_clnt)) {
103 int err = PTR_ERR(dss->rpc_clnt);
104 kfree(dss);
105 return ERR_PTR(err);
106 }
107 INIT_LIST_HEAD(&dss->list);
108
109 return dss;
110}
111
112static void
113nfs4_free_ds_server(struct nfs4_ds_server *dss)
114{
115 rpc_release_client(dss->rpc_clnt);
116 kfree(dss);
117}
118
119/**
120 * Find or create a DS rpc client with the MDS server rpc client auth flavor
121 * in the nfs_client cl_ds_clients list.
122 */
123struct rpc_clnt *
124nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode)
125{
126 struct nfs4_ds_server *dss, *new;
127 rpc_authflavor_t flavor = NFS_SERVER(inode)->client->cl_auth->au_flavor;
128
129 dss = nfs4_find_ds_client(ds_clp, flavor);
130 if (dss != NULL)
131 goto out;
132 new = nfs4_alloc_ds_server(ds_clp, flavor);
133 if (IS_ERR(new))
134 return ERR_CAST(new);
135 dss = nfs4_add_ds_client(ds_clp, flavor, new);
136 if (dss != new)
137 nfs4_free_ds_server(new);
138out:
139 return dss->rpc_clnt;
140}
141EXPORT_SYMBOL_GPL(nfs4_find_or_create_ds_client);
142
143static void
144nfs4_shutdown_ds_clients(struct nfs_client *clp)
145{
146 struct nfs4_ds_server *dss;
147 LIST_HEAD(shutdown_list);
148
149 while (!list_empty(&clp->cl_ds_clients)) {
150 dss = list_entry(clp->cl_ds_clients.next,
151 struct nfs4_ds_server, list);
152 list_del(&dss->list);
153 rpc_shutdown_client(dss->rpc_clnt);
154 kfree(dss);
155 }
156}
157
158void nfs41_shutdown_client(struct nfs_client *clp)
45{ 159{
46 if (nfs4_has_session(clp)) { 160 if (nfs4_has_session(clp)) {
161 nfs4_shutdown_ds_clients(clp);
47 nfs4_destroy_session(clp->cl_session); 162 nfs4_destroy_session(clp->cl_session);
48 nfs4_destroy_clientid(clp); 163 nfs4_destroy_clientid(clp);
49 } 164 }
50 165
51} 166}
52#else /* CONFIG_NFS_V4_1 */ 167#endif /* CONFIG_NFS_V4_1 */
53static void nfs4_shutdown_session(struct nfs_client *clp) 168
169void nfs40_shutdown_client(struct nfs_client *clp)
54{ 170{
171 if (clp->cl_slot_tbl) {
172 nfs4_release_slot_table(clp->cl_slot_tbl);
173 kfree(clp->cl_slot_tbl);
174 }
55} 175}
56#endif /* CONFIG_NFS_V4_1 */
57 176
58struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) 177struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
59{ 178{
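
nfs4_find_or_create_ds_client() above is an instance of a common lockless-read cache idiom: an RCU lookup on the fast path, allocation outside any lock, then a re-scan under the spinlock so the losing allocator frees its never-published entry (which is why no RCU grace period is needed there). The idiom in isolation, with illustrative names rather than the NFS ones:

struct flavor_entry {
	struct list_head list;
	rpc_authflavor_t flavor;	/* lookup key */
};

/* Fast path: lockless lookup under RCU. */
static struct flavor_entry *find_entry(struct list_head *head,
				       rpc_authflavor_t flavor)
{
	struct flavor_entry *e, *found = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(e, head, list) {
		if (e->flavor == flavor) {
			found = e;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

/* Slow path: re-check under the lock; the caller frees 'new' if it lost. */
static struct flavor_entry *add_entry(spinlock_t *lock,
				      struct list_head *head,
				      struct flavor_entry *new)
{
	struct flavor_entry *e;

	spin_lock(lock);
	list_for_each_entry(e, head, list) {
		if (e->flavor == new->flavor)
			goto out;	/* raced: return the winner */
	}
	list_add_rcu(&new->list, head);
	e = new;
out:
	spin_unlock(lock);
	return e;
}
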
@@ -73,6 +192,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
73 192
74 spin_lock_init(&clp->cl_lock); 193 spin_lock_init(&clp->cl_lock);
75 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 194 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
195 INIT_LIST_HEAD(&clp->cl_ds_clients);
76 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 196 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
77 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; 197 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
78 clp->cl_minorversion = cl_init->minorversion; 198 clp->cl_minorversion = cl_init->minorversion;
@@ -97,7 +217,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
97{ 217{
98 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) 218 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
99 nfs4_kill_renewd(clp); 219 nfs4_kill_renewd(clp);
100 nfs4_shutdown_session(clp); 220 clp->cl_mvops->shutdown_client(clp);
101 nfs4_destroy_callback(clp); 221 nfs4_destroy_callback(clp);
102 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) 222 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
103 nfs_idmap_delete(clp); 223 nfs_idmap_delete(clp);
@@ -144,34 +264,77 @@ static int nfs4_init_callback(struct nfs_client *clp)
144 return 0; 264 return 0;
145} 265}
146 266
267/**
268 * nfs40_init_client - nfs_client initialization tasks for NFSv4.0
269 * @clp - nfs_client to initialize
270 *
271 * Returns zero on success, or a negative errno if some error occurred.
272 */
273int nfs40_init_client(struct nfs_client *clp)
274{
275 struct nfs4_slot_table *tbl;
276 int ret;
277
278 tbl = kzalloc(sizeof(*tbl), GFP_NOFS);
279 if (tbl == NULL)
280 return -ENOMEM;
281
282 ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE,
283 "NFSv4.0 transport Slot table");
284 if (ret) {
285 kfree(tbl);
286 return ret;
287 }
288
289 clp->cl_slot_tbl = tbl;
290 return 0;
291}
292
293#if defined(CONFIG_NFS_V4_1)
294
295/**
296 * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+
297 * @clp: nfs_client to initialize
298 *
299 * Returns zero on success, or a negative errno if some error occurred.
300 */
301int nfs41_init_client(struct nfs_client *clp)
302{
303 struct nfs4_session *session = NULL;
304
305 /*
306 * Create the session and mark it expired.
307 * When a SEQUENCE operation encounters the expired session
308 * it will do session recovery to initialize it.
309 */
310 session = nfs4_alloc_session(clp);
311 if (!session)
312 return -ENOMEM;
313
314 clp->cl_session = session;
315
316 /*
317 * The create session reply races with the server back
318 * channel probe. Mark the client NFS_CS_SESSION_INITING
319 * so that the client back channel can find the
320 * nfs_client struct
321 */
322 nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING);
323 return 0;
324}
325
326#endif /* CONFIG_NFS_V4_1 */
327
147/* 328/*
148 * Initialize the minor version specific parts of an NFS4 client record 329 * Initialize the minor version specific parts of an NFS4 client record
149 */ 330 */
150static int nfs4_init_client_minor_version(struct nfs_client *clp) 331static int nfs4_init_client_minor_version(struct nfs_client *clp)
151{ 332{
152#if defined(CONFIG_NFS_V4_1) 333 int ret;
153 if (clp->cl_mvops->minor_version) {
154 struct nfs4_session *session = NULL;
155 /*
156 * Create the session and mark it expired.
157 * When a SEQUENCE operation encounters the expired session
158 * it will do session recovery to initialize it.
159 */
160 session = nfs4_alloc_session(clp);
161 if (!session)
162 return -ENOMEM;
163
164 clp->cl_session = session;
165 /*
166 * The create session reply races with the server back
167 * channel probe. Mark the client NFS_CS_SESSION_INITING
168 * so that the client back channel can find the
169 * nfs_client struct
170 */
171 nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING);
172 }
173#endif /* CONFIG_NFS_V4_1 */
174 334
335 ret = clp->cl_mvops->init_client(clp);
336 if (ret)
337 return ret;
175 return nfs4_init_callback(clp); 338 return nfs4_init_callback(clp);
176} 339}
177 340
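
Both halves of the split are reached through the minor-version ops table declared in nfs4_fs.h above; the calls clp->cl_mvops->shutdown_client and clp->cl_mvops->init_client in this patch imply fields of those names. A sketch of the wiring, with every field not exercised by this patch elided (the real tables are defined elsewhere):

static const struct nfs4_minor_version_ops example_v4_0_ops = {
	.init_client	 = nfs40_init_client,
	.shutdown_client = nfs40_shutdown_client,
	/* ... remaining ops elided ... */
};

static const struct nfs4_minor_version_ops example_v4_1_ops = {
	.init_client	 = nfs41_init_client,
	.shutdown_client = nfs41_shutdown_client,
	/* ... */
};
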
@@ -187,8 +350,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
187 */ 350 */
188struct nfs_client *nfs4_init_client(struct nfs_client *clp, 351struct nfs_client *nfs4_init_client(struct nfs_client *clp,
189 const struct rpc_timeout *timeparms, 352 const struct rpc_timeout *timeparms,
190 const char *ip_addr, 353 const char *ip_addr)
191 rpc_authflavor_t authflavour)
192{ 354{
193 char buf[INET6_ADDRSTRLEN + 1]; 355 char buf[INET6_ADDRSTRLEN + 1];
194 struct nfs_client *old; 356 struct nfs_client *old;
@@ -723,7 +885,7 @@ static void nfs4_session_set_rwsize(struct nfs_server *server)
723} 885}
724 886
725static int nfs4_server_common_setup(struct nfs_server *server, 887static int nfs4_server_common_setup(struct nfs_server *server,
726 struct nfs_fh *mntfh) 888 struct nfs_fh *mntfh, bool auth_probe)
727{ 889{
728 struct nfs_fattr *fattr; 890 struct nfs_fattr *fattr;
729 int error; 891 int error;
@@ -755,7 +917,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
755 917
756 918
757 /* Probe the root fh to retrieve its FSID and filehandle */ 919 /* Probe the root fh to retrieve its FSID and filehandle */
758 error = nfs4_get_rootfh(server, mntfh); 920 error = nfs4_get_rootfh(server, mntfh, auth_probe);
759 if (error < 0) 921 if (error < 0)
760 goto out; 922 goto out;
761 923
@@ -787,6 +949,7 @@ out:
787static int nfs4_init_server(struct nfs_server *server, 949static int nfs4_init_server(struct nfs_server *server,
788 const struct nfs_parsed_mount_data *data) 950 const struct nfs_parsed_mount_data *data)
789{ 951{
952 rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
790 struct rpc_timeout timeparms; 953 struct rpc_timeout timeparms;
791 int error; 954 int error;
792 955
@@ -799,13 +962,16 @@ static int nfs4_init_server(struct nfs_server *server,
799 server->flags = data->flags; 962 server->flags = data->flags;
800 server->options = data->options; 963 server->options = data->options;
801 964
965 if (data->auth_flavor_len >= 1)
966 pseudoflavor = data->auth_flavors[0];
967
802 /* Get a client record */ 968 /* Get a client record */
803 error = nfs4_set_client(server, 969 error = nfs4_set_client(server,
804 data->nfs_server.hostname, 970 data->nfs_server.hostname,
805 (const struct sockaddr *)&data->nfs_server.address, 971 (const struct sockaddr *)&data->nfs_server.address,
806 data->nfs_server.addrlen, 972 data->nfs_server.addrlen,
807 data->client_address, 973 data->client_address,
808 data->auth_flavors[0], 974 pseudoflavor,
809 data->nfs_server.protocol, 975 data->nfs_server.protocol,
810 &timeparms, 976 &timeparms,
811 data->minorversion, 977 data->minorversion,
@@ -825,7 +991,7 @@ static int nfs4_init_server(struct nfs_server *server,
825 991
826 server->port = data->nfs_server.port; 992 server->port = data->nfs_server.port;
827 993
828 error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); 994 error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor);
829 995
830error: 996error:
831 /* Done */ 997 /* Done */
@@ -843,6 +1009,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
843 struct nfs_subversion *nfs_mod) 1009 struct nfs_subversion *nfs_mod)
844{ 1010{
845 struct nfs_server *server; 1011 struct nfs_server *server;
1012 bool auth_probe;
846 int error; 1013 int error;
847 1014
848 dprintk("--> nfs4_create_server()\n"); 1015 dprintk("--> nfs4_create_server()\n");
@@ -851,12 +1018,14 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
851 if (!server) 1018 if (!server)
852 return ERR_PTR(-ENOMEM); 1019 return ERR_PTR(-ENOMEM);
853 1020
1021 auth_probe = mount_info->parsed->auth_flavor_len < 1;
1022
854 /* set up the general RPC client */ 1023 /* set up the general RPC client */
855 error = nfs4_init_server(server, mount_info->parsed); 1024 error = nfs4_init_server(server, mount_info->parsed);
856 if (error < 0) 1025 if (error < 0)
857 goto error; 1026 goto error;
858 1027
859 error = nfs4_server_common_setup(server, mount_info->mntfh); 1028 error = nfs4_server_common_setup(server, mount_info->mntfh, auth_probe);
860 if (error < 0) 1029 if (error < 0)
861 goto error; 1030 goto error;
862 1031
@@ -909,7 +1078,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
909 if (error < 0) 1078 if (error < 0)
910 goto error; 1079 goto error;
911 1080
912 error = nfs4_server_common_setup(server, mntfh); 1081 error = nfs4_server_common_setup(server, mntfh,
1082 !(parent_server->flags & NFS_MOUNT_SECFLAVOUR));
913 if (error < 0) 1083 if (error < 0)
914 goto error; 1084 goto error;
915 1085
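
The auth_probe flag added to nfs4_server_common_setup() simply records whether a security flavor was pinned by the user, summarized from the two call sites above:

/* When does the root-fh probe negotiate flavors?
 *
 * nfs4_create_server(): no "sec=" option was parsed
 *	auth_probe = mount_info->parsed->auth_flavor_len < 1;
 *
 * nfs4_create_referral_server(): the parent mount did not force one
 *	auth_probe = !(parent_server->flags & NFS_MOUNT_SECFLAVOUR);
 */
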
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e5b804dd944c..77efaf15ec90 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -19,6 +19,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
19 struct inode *dir; 19 struct inode *dir;
20 unsigned openflags = filp->f_flags; 20 unsigned openflags = filp->f_flags;
21 struct iattr attr; 21 struct iattr attr;
22 int opened = 0;
22 int err; 23 int err;
23 24
24 /* 25 /*
@@ -55,7 +56,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
55 nfs_wb_all(inode); 56 nfs_wb_all(inode);
56 } 57 }
57 58
58 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr); 59 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
59 if (IS_ERR(inode)) { 60 if (IS_ERR(inode)) {
60 err = PTR_ERR(inode); 61 err = PTR_ERR(inode);
61 switch (err) { 62 switch (err) {
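
The int *opened added here threads the VFS atomic-open protocol through ->open_context(): later in this patch _nfs4_proc_open() decides whether OPEN really created the file, and _nfs4_do_open() reports it back with FILE_CREATED. The creation test, condensed into a sketch (the helper is illustrative; cinfo fields follow the nfs4proc.c hunk below):

/* Sketch: did an O_CREAT open actually create the file? */
static bool example_open_created(int open_flags,
				 const struct nfs4_change_info *cinfo)
{
	if (!(open_flags & O_CREAT))
		return false;
	/* O_EXCL succeeded, or the directory's change attribute moved */
	return (open_flags & O_EXCL) || cinfo->before != cinfo->after;
}
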
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 17ed87ef9de8..b86464ba25e1 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -39,6 +39,7 @@
39#include "internal.h" 39#include "internal.h"
40#include "delegation.h" 40#include "delegation.h"
41#include "nfs4filelayout.h" 41#include "nfs4filelayout.h"
42#include "nfs4trace.h"
42 43
43#define NFSDBG_FACILITY NFSDBG_PNFS_LD 44#define NFSDBG_FACILITY NFSDBG_PNFS_LD
44 45
@@ -247,6 +248,7 @@ static int filelayout_read_done_cb(struct rpc_task *task,
247 struct nfs_pgio_header *hdr = data->header; 248 struct nfs_pgio_header *hdr = data->header;
248 int err; 249 int err;
249 250
251 trace_nfs4_pnfs_read(data, task->tk_status);
250 err = filelayout_async_handle_error(task, data->args.context->state, 252 err = filelayout_async_handle_error(task, data->args.context->state,
251 data->ds_clp, hdr->lseg); 253 data->ds_clp, hdr->lseg);
252 254
@@ -363,6 +365,7 @@ static int filelayout_write_done_cb(struct rpc_task *task,
363 struct nfs_pgio_header *hdr = data->header; 365 struct nfs_pgio_header *hdr = data->header;
364 int err; 366 int err;
365 367
368 trace_nfs4_pnfs_write(data, task->tk_status);
366 err = filelayout_async_handle_error(task, data->args.context->state, 369 err = filelayout_async_handle_error(task, data->args.context->state,
367 data->ds_clp, hdr->lseg); 370 data->ds_clp, hdr->lseg);
368 371
@@ -395,6 +398,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
395{ 398{
396 int err; 399 int err;
397 400
401 trace_nfs4_pnfs_commit_ds(data, task->tk_status);
398 err = filelayout_async_handle_error(task, NULL, data->ds_clp, 402 err = filelayout_async_handle_error(task, NULL, data->ds_clp,
399 data->lseg); 403 data->lseg);
400 404
@@ -524,6 +528,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
524 struct nfs_pgio_header *hdr = data->header; 528 struct nfs_pgio_header *hdr = data->header;
525 struct pnfs_layout_segment *lseg = hdr->lseg; 529 struct pnfs_layout_segment *lseg = hdr->lseg;
526 struct nfs4_pnfs_ds *ds; 530 struct nfs4_pnfs_ds *ds;
531 struct rpc_clnt *ds_clnt;
527 loff_t offset = data->args.offset; 532 loff_t offset = data->args.offset;
528 u32 j, idx; 533 u32 j, idx;
529 struct nfs_fh *fh; 534 struct nfs_fh *fh;
@@ -538,6 +543,11 @@ filelayout_read_pagelist(struct nfs_read_data *data)
538 ds = nfs4_fl_prepare_ds(lseg, idx); 543 ds = nfs4_fl_prepare_ds(lseg, idx);
539 if (!ds) 544 if (!ds)
540 return PNFS_NOT_ATTEMPTED; 545 return PNFS_NOT_ATTEMPTED;
546
547 ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
548 if (IS_ERR(ds_clnt))
549 return PNFS_NOT_ATTEMPTED;
550
541 dprintk("%s USE DS: %s cl_count %d\n", __func__, 551 dprintk("%s USE DS: %s cl_count %d\n", __func__,
542 ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); 552 ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
543 553
@@ -552,7 +562,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
552 data->mds_offset = offset; 562 data->mds_offset = offset;
553 563
554 /* Perform an asynchronous read to ds */ 564 /* Perform an asynchronous read to ds */
555 nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, 565 nfs_initiate_read(ds_clnt, data,
556 &filelayout_read_call_ops, RPC_TASK_SOFTCONN); 566 &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
557 return PNFS_ATTEMPTED; 567 return PNFS_ATTEMPTED;
558} 568}
@@ -564,6 +574,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
564 struct nfs_pgio_header *hdr = data->header; 574 struct nfs_pgio_header *hdr = data->header;
565 struct pnfs_layout_segment *lseg = hdr->lseg; 575 struct pnfs_layout_segment *lseg = hdr->lseg;
566 struct nfs4_pnfs_ds *ds; 576 struct nfs4_pnfs_ds *ds;
577 struct rpc_clnt *ds_clnt;
567 loff_t offset = data->args.offset; 578 loff_t offset = data->args.offset;
568 u32 j, idx; 579 u32 j, idx;
569 struct nfs_fh *fh; 580 struct nfs_fh *fh;
@@ -574,6 +585,11 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
574 ds = nfs4_fl_prepare_ds(lseg, idx); 585 ds = nfs4_fl_prepare_ds(lseg, idx);
575 if (!ds) 586 if (!ds)
576 return PNFS_NOT_ATTEMPTED; 587 return PNFS_NOT_ATTEMPTED;
588
589 ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
590 if (IS_ERR(ds_clnt))
591 return PNFS_NOT_ATTEMPTED;
592
577 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", 593 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
578 __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, 594 __func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
579 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); 595 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
@@ -591,7 +607,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
591 data->args.offset = filelayout_get_dserver_offset(lseg, offset); 607 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
592 608
593 /* Perform an asynchronous write */ 609 /* Perform an asynchronous write */
594 nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, 610 nfs_initiate_write(ds_clnt, data,
595 &filelayout_write_call_ops, sync, 611 &filelayout_write_call_ops, sync,
596 RPC_TASK_SOFTCONN); 612 RPC_TASK_SOFTCONN);
597 return PNFS_ATTEMPTED; 613 return PNFS_ATTEMPTED;
@@ -1101,16 +1117,19 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
1101{ 1117{
1102 struct pnfs_layout_segment *lseg = data->lseg; 1118 struct pnfs_layout_segment *lseg = data->lseg;
1103 struct nfs4_pnfs_ds *ds; 1119 struct nfs4_pnfs_ds *ds;
1120 struct rpc_clnt *ds_clnt;
1104 u32 idx; 1121 u32 idx;
1105 struct nfs_fh *fh; 1122 struct nfs_fh *fh;
1106 1123
1107 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); 1124 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
1108 ds = nfs4_fl_prepare_ds(lseg, idx); 1125 ds = nfs4_fl_prepare_ds(lseg, idx);
1109 if (!ds) { 1126 if (!ds)
1110 prepare_to_resend_writes(data); 1127 goto out_err;
1111 filelayout_commit_release(data); 1128
1112 return -EAGAIN; 1129 ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode);
1113 } 1130 if (IS_ERR(ds_clnt))
1131 goto out_err;
1132
1114 dprintk("%s ino %lu, how %d cl_count %d\n", __func__, 1133 dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
1115 data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); 1134 data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count));
1116 data->commit_done_cb = filelayout_commit_done_cb; 1135 data->commit_done_cb = filelayout_commit_done_cb;
@@ -1119,9 +1138,13 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
1119 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); 1138 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1120 if (fh) 1139 if (fh)
1121 data->args.fh = fh; 1140 data->args.fh = fh;
1122 return nfs_initiate_commit(ds->ds_clp->cl_rpcclient, data, 1141 return nfs_initiate_commit(ds_clnt, data,
1123 &filelayout_commit_call_ops, how, 1142 &filelayout_commit_call_ops, how,
1124 RPC_TASK_SOFTCONN); 1143 RPC_TASK_SOFTCONN);
1144out_err:
1145 prepare_to_resend_writes(data);
1146 filelayout_commit_release(data);
1147 return -EAGAIN;
1125} 1148}
1126 1149
1127static int 1150static int
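
The trace_nfs4_pnfs_{read,write,commit_ds} calls rely on tracepoints declared in the new "nfs4trace.h", whose definitions are not part of this excerpt. A generic sketch of the kernel TRACE_EVENT idiom such a header would use (the event name and fields here are illustrative only):

/* Illustrative only: the real events live in fs/nfs/nfs4trace.h. */
TRACE_EVENT(example_pnfs_read,
	TP_PROTO(const struct nfs_read_data *data, int status),
	TP_ARGS(data, status),
	TP_STRUCT__entry(
		__field(int, status)
	),
	TP_fast_assign(
		__entry->status = status;
	),
	TP_printk("status=%d", __entry->status)
);
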
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 95604f64cab8..c7c295e556ed 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -185,6 +185,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
185 if (status) 185 if (status)
186 goto out_put; 186 goto out_put;
187 187
188 smp_wmb();
188 ds->ds_clp = clp; 189 ds->ds_clp = clp;
189 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); 190 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
190out: 191out:
@@ -801,34 +802,35 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
801 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; 802 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
802 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; 803 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
803 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); 804 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
804 805 struct nfs4_pnfs_ds *ret = ds;
805 if (filelayout_test_devid_unavailable(devid))
806 return NULL;
807 806
808 if (ds == NULL) { 807 if (ds == NULL) {
809 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", 808 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
810 __func__, ds_idx); 809 __func__, ds_idx);
811 filelayout_mark_devid_invalid(devid); 810 filelayout_mark_devid_invalid(devid);
812 return NULL; 811 goto out;
813 } 812 }
813 smp_rmb();
814 if (ds->ds_clp) 814 if (ds->ds_clp)
815 return ds; 815 goto out_test_devid;
816 816
817 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { 817 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
818 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); 818 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
819 int err; 819 int err;
820 820
821 err = nfs4_ds_connect(s, ds); 821 err = nfs4_ds_connect(s, ds);
822 if (err) { 822 if (err)
823 nfs4_mark_deviceid_unavailable(devid); 823 nfs4_mark_deviceid_unavailable(devid);
824 ds = NULL;
825 }
826 nfs4_clear_ds_conn_bit(ds); 824 nfs4_clear_ds_conn_bit(ds);
827 } else { 825 } else {
828 /* Either ds is connected, or ds is NULL */ 826 /* Either ds is connected, or ds is NULL */
829 nfs4_wait_ds_connect(ds); 827 nfs4_wait_ds_connect(ds);
830 } 828 }
831 return ds; 829out_test_devid:
830 if (filelayout_test_devid_unavailable(devid))
831 ret = NULL;
832out:
833 return ret;
832} 834}
833 835
834module_param(dataserver_retrans, uint, 0644); 836module_param(dataserver_retrans, uint, 0644);
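
The smp_wmb()/smp_rmb() pair added here implements a publish pattern: nfs4_ds_connect() completes the nfs_client's initialization before storing it in ds->ds_clp, and nfs4_fl_prepare_ds() issues the read barrier so that observing a non-NULL ds_clp also guarantees seeing that initialization. The pairing in isolation (types are stand-ins; current kernels would normally spell this smp_store_release()/smp_load_acquire()):

struct ds_example { struct nfs_client *ds_clp; };

/* Writer, cf. nfs4_ds_connect(): initialize fully, then publish. */
static void publish_ds_clp(struct ds_example *ds, struct nfs_client *clp)
{
	/* all stores initializing 'clp' precede this barrier */
	smp_wmb();
	ds->ds_clp = clp;
}

/* Reader, cf. nfs4_fl_prepare_ds(): barrier, then test the pointer. */
static struct nfs_client *read_ds_clp(struct ds_example *ds)
{
	smp_rmb();		/* pairs with smp_wmb() in the writer */
	return ds->ds_clp;	/* non-NULL implies init is visible */
}
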
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
index 549462e5b9b0..c0b3a16b4a00 100644
--- a/fs/nfs/nfs4getroot.c
+++ b/fs/nfs/nfs4getroot.c
@@ -9,7 +9,7 @@
9 9
10#define NFSDBG_FACILITY NFSDBG_CLIENT 10#define NFSDBG_FACILITY NFSDBG_CLIENT
11 11
12int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) 12int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe)
13{ 13{
14 struct nfs_fsinfo fsinfo; 14 struct nfs_fsinfo fsinfo;
15 int ret = -ENOMEM; 15 int ret = -ENOMEM;
@@ -21,7 +21,7 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
21 goto out; 21 goto out;
22 22
23 /* Start by getting the root filehandle from the server */ 23 /* Start by getting the root filehandle from the server */
24 ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo); 24 ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe);
25 if (ret < 0) { 25 if (ret < 0) {
26 dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); 26 dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
27 goto out; 27 goto out;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index cdb0b41a4810..2288cd3c9278 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
14#include <linux/nfs_mount.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/sunrpc/clnt.h> 17#include <linux/sunrpc/clnt.h>
@@ -369,21 +370,33 @@ out:
369struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, 370struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
370 struct nfs_fh *fh, struct nfs_fattr *fattr) 371 struct nfs_fh *fh, struct nfs_fattr *fattr)
371{ 372{
373 rpc_authflavor_t flavor = server->client->cl_auth->au_flavor;
372 struct dentry *parent = dget_parent(dentry); 374 struct dentry *parent = dget_parent(dentry);
375 struct inode *dir = parent->d_inode;
376 struct qstr *name = &dentry->d_name;
373 struct rpc_clnt *client; 377 struct rpc_clnt *client;
374 struct vfsmount *mnt; 378 struct vfsmount *mnt;
375 379
376 /* Look it up again to get its attributes and sec flavor */ 380 /* Look it up again to get its attributes and sec flavor */
377 client = nfs4_proc_lookup_mountpoint(parent->d_inode, &dentry->d_name, fh, fattr); 381 client = nfs4_proc_lookup_mountpoint(dir, name, fh, fattr);
378 dput(parent); 382 dput(parent);
379 if (IS_ERR(client)) 383 if (IS_ERR(client))
380 return ERR_CAST(client); 384 return ERR_CAST(client);
381 385
382 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 386 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
383 mnt = nfs_do_refmount(client, dentry); 387 mnt = nfs_do_refmount(client, dentry);
384 else 388 goto out;
385 mnt = nfs_do_submount(dentry, fh, fattr, client->cl_auth->au_flavor); 389 }
386 390
391 if (client->cl_auth->au_flavor != flavor)
392 flavor = client->cl_auth->au_flavor;
393 else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) {
394 rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
395 if ((int)new >= 0)
396 flavor = new;
397 }
398 mnt = nfs_do_submount(dentry, fh, fattr, flavor);
399out:
387 rpc_shutdown_client(client); 400 rpc_shutdown_client(client);
388 return mnt; 401 return mnt;
389} 402}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 108a774095f7..d53d6785cba2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -66,6 +66,8 @@
66#include "nfs4session.h" 66#include "nfs4session.h"
67#include "fscache.h" 67#include "fscache.h"
68 68
69#include "nfs4trace.h"
70
69#define NFSDBG_FACILITY NFSDBG_PROC 71#define NFSDBG_FACILITY NFSDBG_PROC
70 72
71#define NFS4_POLL_RETRY_MIN (HZ/10) 73#define NFS4_POLL_RETRY_MIN (HZ/10)
@@ -150,6 +152,7 @@ static int nfs4_map_errors(int err)
150 case -NFS4ERR_RECALLCONFLICT: 152 case -NFS4ERR_RECALLCONFLICT:
151 return -EREMOTEIO; 153 return -EREMOTEIO;
152 case -NFS4ERR_WRONGSEC: 154 case -NFS4ERR_WRONGSEC:
155 case -NFS4ERR_WRONG_CRED:
153 return -EPERM; 156 return -EPERM;
154 case -NFS4ERR_BADOWNER: 157 case -NFS4ERR_BADOWNER:
155 case -NFS4ERR_BADNAME: 158 case -NFS4ERR_BADNAME:
@@ -433,6 +436,20 @@ wait_on_recovery:
433 return ret; 436 return ret;
434} 437}
435 438
439/*
440 * Return 'true' if 'clp' is using an rpc_client that is integrity protected
441 * or 'false' otherwise.
442 */
443static bool _nfs4_is_integrity_protected(struct nfs_client *clp)
444{
445 rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor;
446
447 if (flavor == RPC_AUTH_GSS_KRB5I ||
448 flavor == RPC_AUTH_GSS_KRB5P)
449 return true;
450
451 return false;
452}
436 453
437static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) 454static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
438{ 455{
@@ -447,6 +464,88 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
447 do_renew_lease(server->nfs_client, timestamp); 464 do_renew_lease(server->nfs_client, timestamp);
448} 465}
449 466
467struct nfs4_call_sync_data {
468 const struct nfs_server *seq_server;
469 struct nfs4_sequence_args *seq_args;
470 struct nfs4_sequence_res *seq_res;
471};
472
473static void nfs4_init_sequence(struct nfs4_sequence_args *args,
474 struct nfs4_sequence_res *res, int cache_reply)
475{
476 args->sa_slot = NULL;
477 args->sa_cache_this = cache_reply;
478 args->sa_privileged = 0;
479
480 res->sr_slot = NULL;
481}
482
483static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
484{
485 args->sa_privileged = 1;
486}
487
488static int nfs40_setup_sequence(const struct nfs_server *server,
489 struct nfs4_sequence_args *args,
490 struct nfs4_sequence_res *res,
491 struct rpc_task *task)
492{
493 struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl;
494 struct nfs4_slot *slot;
495
496 /* slot already allocated? */
497 if (res->sr_slot != NULL)
498 goto out_start;
499
500 spin_lock(&tbl->slot_tbl_lock);
501 if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
502 goto out_sleep;
503
504 slot = nfs4_alloc_slot(tbl);
505 if (IS_ERR(slot)) {
506 if (slot == ERR_PTR(-ENOMEM))
507 task->tk_timeout = HZ >> 2;
508 goto out_sleep;
509 }
510 spin_unlock(&tbl->slot_tbl_lock);
511
512 args->sa_slot = slot;
513 res->sr_slot = slot;
514
515out_start:
516 rpc_call_start(task);
517 return 0;
518
519out_sleep:
520 if (args->sa_privileged)
521 rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
522 NULL, RPC_PRIORITY_PRIVILEGED);
523 else
524 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
525 spin_unlock(&tbl->slot_tbl_lock);
526 return -EAGAIN;
527}
528
529static int nfs40_sequence_done(struct rpc_task *task,
530 struct nfs4_sequence_res *res)
531{
532 struct nfs4_slot *slot = res->sr_slot;
533 struct nfs4_slot_table *tbl;
534
535 if (!RPC_WAS_SENT(task))
536 goto out;
537
538 tbl = slot->table;
539 spin_lock(&tbl->slot_tbl_lock);
540 if (!nfs41_wake_and_assign_slot(tbl, slot))
541 nfs4_free_slot(tbl, slot);
542 spin_unlock(&tbl->slot_tbl_lock);
543
544 res->sr_slot = NULL;
545out:
546 return 1;
547}
548
450#if defined(CONFIG_NFS_V4_1) 549#if defined(CONFIG_NFS_V4_1)
451 550
452static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) 551static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
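
nfs40_setup_sequence() and nfs40_sequence_done() give NFSv4.0 the same per-transport slot accounting that sessions give 4.1: take a slot before the RPC (or park the task on the table's wait queue, at privileged priority when the table is draining for recovery), then either hand the slot to a waiter or free it afterwards. One RPC's trip through the table, reduced to a sketch (locking and error detail are in the functions above):

/* Sketch: v4.0 slot lifecycle around a single RPC. */
static void example_slot_lifecycle(const struct nfs_server *server,
				   struct nfs4_sequence_args *args,
				   struct nfs4_sequence_res *res,
				   struct rpc_task *task)
{
	/* before: allocate a slot or sleep on slot_tbl_waitq (-EAGAIN) */
	if (nfs40_setup_sequence(server, args, res, task))
		return;		/* task re-runs when a slot is freed */

	/* ... the RPC runs with args->sa_slot == res->sr_slot held ... */

	/* after: wake a waiter onto the slot, else nfs4_free_slot() it */
	nfs40_sequence_done(task, res);
}
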
@@ -506,6 +605,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
506 interrupted = true; 605 interrupted = true;
507 } 606 }
508 607
608 trace_nfs4_sequence_done(session, res);
509 /* Check the SEQUENCE operation status */ 609 /* Check the SEQUENCE operation status */
510 switch (res->sr_status) { 610 switch (res->sr_status) {
511 case 0: 611 case 0:
@@ -591,25 +691,11 @@ static int nfs4_sequence_done(struct rpc_task *task,
591{ 691{
592 if (res->sr_slot == NULL) 692 if (res->sr_slot == NULL)
593 return 1; 693 return 1;
694 if (!res->sr_slot->table->session)
695 return nfs40_sequence_done(task, res);
594 return nfs41_sequence_done(task, res); 696 return nfs41_sequence_done(task, res);
595} 697}
596 698
597static void nfs41_init_sequence(struct nfs4_sequence_args *args,
598 struct nfs4_sequence_res *res, int cache_reply)
599{
600 args->sa_slot = NULL;
601 args->sa_cache_this = 0;
602 args->sa_privileged = 0;
603 if (cache_reply)
604 args->sa_cache_this = 1;
605 res->sr_slot = NULL;
606}
607
608static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
609{
610 args->sa_privileged = 1;
611}
612
613int nfs41_setup_sequence(struct nfs4_session *session, 699int nfs41_setup_sequence(struct nfs4_session *session,
614 struct nfs4_sequence_args *args, 700 struct nfs4_sequence_args *args,
615 struct nfs4_sequence_res *res, 701 struct nfs4_sequence_res *res,
@@ -647,7 +733,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
647 733
648 args->sa_slot = slot; 734 args->sa_slot = slot;
649 735
650 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, 736 dprintk("<-- %s slotid=%u seqid=%u\n", __func__,
651 slot->slot_nr, slot->seq_nr); 737 slot->slot_nr, slot->seq_nr);
652 738
653 res->sr_slot = slot; 739 res->sr_slot = slot;
@@ -658,6 +744,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
658 * set to 1 if an rpc level failure occurs. 744 * set to 1 if an rpc level failure occurs.
659 */ 745 */
660 res->sr_status = 1; 746 res->sr_status = 1;
747 trace_nfs4_setup_sequence(session, args);
661out_success: 748out_success:
662 rpc_call_start(task); 749 rpc_call_start(task);
663 return 0; 750 return 0;
@@ -673,38 +760,30 @@ out_sleep:
673} 760}
674EXPORT_SYMBOL_GPL(nfs41_setup_sequence); 761EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
675 762
676int nfs4_setup_sequence(const struct nfs_server *server, 763static int nfs4_setup_sequence(const struct nfs_server *server,
677 struct nfs4_sequence_args *args, 764 struct nfs4_sequence_args *args,
678 struct nfs4_sequence_res *res, 765 struct nfs4_sequence_res *res,
679 struct rpc_task *task) 766 struct rpc_task *task)
680{ 767{
681 struct nfs4_session *session = nfs4_get_session(server); 768 struct nfs4_session *session = nfs4_get_session(server);
682 int ret = 0; 769 int ret = 0;
683 770
684 if (session == NULL) { 771 if (!session)
685 rpc_call_start(task); 772 return nfs40_setup_sequence(server, args, res, task);
686 goto out;
687 }
688 773
689 dprintk("--> %s clp %p session %p sr_slot %d\n", 774 dprintk("--> %s clp %p session %p sr_slot %u\n",
690 __func__, session->clp, session, res->sr_slot ? 775 __func__, session->clp, session, res->sr_slot ?
691 res->sr_slot->slot_nr : -1); 776 res->sr_slot->slot_nr : NFS4_NO_SLOT);
692 777
693 ret = nfs41_setup_sequence(session, args, res, task); 778 ret = nfs41_setup_sequence(session, args, res, task);
694out: 779
695 dprintk("<-- %s status=%d\n", __func__, ret); 780 dprintk("<-- %s status=%d\n", __func__, ret);
696 return ret; 781 return ret;
697} 782}
698 783
699struct nfs41_call_sync_data {
700 const struct nfs_server *seq_server;
701 struct nfs4_sequence_args *seq_args;
702 struct nfs4_sequence_res *seq_res;
703};
704
705static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) 784static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
706{ 785{
707 struct nfs41_call_sync_data *data = calldata; 786 struct nfs4_call_sync_data *data = calldata;
708 struct nfs4_session *session = nfs4_get_session(data->seq_server); 787 struct nfs4_session *session = nfs4_get_session(data->seq_server);
709 788
710 dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); 789 dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
@@ -714,7 +793,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
714 793
715static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) 794static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
716{ 795{
717 struct nfs41_call_sync_data *data = calldata; 796 struct nfs4_call_sync_data *data = calldata;
718 797
719 nfs41_sequence_done(task, data->seq_res); 798 nfs41_sequence_done(task, data->seq_res);
720} 799}
@@ -724,6 +803,42 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
724 .rpc_call_done = nfs41_call_sync_done, 803 .rpc_call_done = nfs41_call_sync_done,
725}; 804};
726 805
806#else /* !CONFIG_NFS_V4_1 */
807
808static int nfs4_setup_sequence(const struct nfs_server *server,
809 struct nfs4_sequence_args *args,
810 struct nfs4_sequence_res *res,
811 struct rpc_task *task)
812{
813 return nfs40_setup_sequence(server, args, res, task);
814}
815
816static int nfs4_sequence_done(struct rpc_task *task,
817 struct nfs4_sequence_res *res)
818{
819 return nfs40_sequence_done(task, res);
820}
821
822#endif /* !CONFIG_NFS_V4_1 */
823
824static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata)
825{
826 struct nfs4_call_sync_data *data = calldata;
827 nfs4_setup_sequence(data->seq_server,
828 data->seq_args, data->seq_res, task);
829}
830
831static void nfs40_call_sync_done(struct rpc_task *task, void *calldata)
832{
833 struct nfs4_call_sync_data *data = calldata;
834 nfs4_sequence_done(task, data->seq_res);
835}
836
837static const struct rpc_call_ops nfs40_call_sync_ops = {
838 .rpc_call_prepare = nfs40_call_sync_prepare,
839 .rpc_call_done = nfs40_call_sync_done,
840};
841
727static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, 842static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
728 struct nfs_server *server, 843 struct nfs_server *server,
729 struct rpc_message *msg, 844 struct rpc_message *msg,
@@ -732,7 +847,8 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
732{ 847{
733 int ret; 848 int ret;
734 struct rpc_task *task; 849 struct rpc_task *task;
735 struct nfs41_call_sync_data data = { 850 struct nfs_client *clp = server->nfs_client;
851 struct nfs4_call_sync_data data = {
736 .seq_server = server, 852 .seq_server = server,
737 .seq_args = args, 853 .seq_args = args,
738 .seq_res = res, 854 .seq_res = res,
@@ -740,7 +856,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
740 struct rpc_task_setup task_setup = { 856 struct rpc_task_setup task_setup = {
741 .rpc_client = clnt, 857 .rpc_client = clnt,
742 .rpc_message = msg, 858 .rpc_message = msg,
743 .callback_ops = &nfs41_call_sync_ops, 859 .callback_ops = clp->cl_mvops->call_sync_ops,
744 .callback_data = &data 860 .callback_data = &data
745 }; 861 };
746 862
@@ -754,35 +870,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
754 return ret; 870 return ret;
755} 871}
756 872
757#else
758static
759void nfs41_init_sequence(struct nfs4_sequence_args *args,
760 struct nfs4_sequence_res *res, int cache_reply)
761{
762}
763
764static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
765{
766}
767
768
769static int nfs4_sequence_done(struct rpc_task *task,
770 struct nfs4_sequence_res *res)
771{
772 return 1;
773}
774#endif /* CONFIG_NFS_V4_1 */
775
776static
777int _nfs4_call_sync(struct rpc_clnt *clnt,
778 struct nfs_server *server,
779 struct rpc_message *msg,
780 struct nfs4_sequence_args *args,
781 struct nfs4_sequence_res *res)
782{
783 return rpc_call_sync(clnt, msg, 0);
784}
785
786static 873static
787int nfs4_call_sync(struct rpc_clnt *clnt, 874int nfs4_call_sync(struct rpc_clnt *clnt,
788 struct nfs_server *server, 875 struct nfs_server *server,
@@ -791,9 +878,8 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
791 struct nfs4_sequence_res *res, 878 struct nfs4_sequence_res *res,
792 int cache_reply) 879 int cache_reply)
793{ 880{
794 nfs41_init_sequence(args, res, cache_reply); 881 nfs4_init_sequence(args, res, cache_reply);
795 return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, 882 return nfs4_call_sync_sequence(clnt, server, msg, args, res);
796 args, res);
797} 883}
798 884
799static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 885static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
@@ -826,6 +912,7 @@ struct nfs4_opendata {
826 struct iattr attrs; 912 struct iattr attrs;
827 unsigned long timestamp; 913 unsigned long timestamp;
828 unsigned int rpc_done : 1; 914 unsigned int rpc_done : 1;
915 unsigned int file_created : 1;
829 unsigned int is_recover : 1; 916 unsigned int is_recover : 1;
830 int rpc_status; 917 int rpc_status;
831 int cancelled; 918 int cancelled;
@@ -933,7 +1020,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
933 p->o_arg.fh = NFS_FH(dentry->d_inode); 1020 p->o_arg.fh = NFS_FH(dentry->d_inode);
934 } 1021 }
935 if (attrs != NULL && attrs->ia_valid != 0) { 1022 if (attrs != NULL && attrs->ia_valid != 0) {
936 __be32 verf[2]; 1023 __u32 verf[2];
937 1024
938 p->o_arg.u.attrs = &p->attrs; 1025 p->o_arg.u.attrs = &p->attrs;
939 memcpy(&p->attrs, attrs, sizeof(p->attrs)); 1026 memcpy(&p->attrs, attrs, sizeof(p->attrs));
@@ -1103,7 +1190,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
1103 goto no_delegation; 1190 goto no_delegation;
1104 1191
1105 spin_lock(&deleg_cur->lock); 1192 spin_lock(&deleg_cur->lock);
1106 if (nfsi->delegation != deleg_cur || 1193 if (rcu_dereference(nfsi->delegation) != deleg_cur ||
1107 test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) || 1194 test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) ||
1108 (deleg_cur->type & fmode) != fmode) 1195 (deleg_cur->type & fmode) != fmode)
1109 goto no_delegation_unlock; 1196 goto no_delegation_unlock;
@@ -1440,6 +1527,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
1440 int err; 1527 int err;
1441 do { 1528 do {
1442 err = _nfs4_do_open_reclaim(ctx, state); 1529 err = _nfs4_do_open_reclaim(ctx, state);
1530 trace_nfs4_open_reclaim(ctx, 0, err);
1443 if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) 1531 if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
1444 continue; 1532 continue;
1445 if (err != -NFS4ERR_DELAY) 1533 if (err != -NFS4ERR_DELAY)
@@ -1524,10 +1612,20 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1524 return nfs4_handle_delegation_recall_error(server, state, stateid, err); 1612 return nfs4_handle_delegation_recall_error(server, state, stateid, err);
1525} 1613}
1526 1614
1615static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
1616{
1617 struct nfs4_opendata *data = calldata;
1618
1619 nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args,
1620 &data->o_res.seq_res, task);
1621}
1622
1527static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 1623static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
1528{ 1624{
1529 struct nfs4_opendata *data = calldata; 1625 struct nfs4_opendata *data = calldata;
1530 1626
1627 nfs40_sequence_done(task, &data->o_res.seq_res);
1628
1531 data->rpc_status = task->tk_status; 1629 data->rpc_status = task->tk_status;
1532 if (data->rpc_status == 0) { 1630 if (data->rpc_status == 0) {
1533 nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid); 1631 nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);
@@ -1556,6 +1654,7 @@ out_free:
1556} 1654}
1557 1655
1558static const struct rpc_call_ops nfs4_open_confirm_ops = { 1656static const struct rpc_call_ops nfs4_open_confirm_ops = {
1657 .rpc_call_prepare = nfs4_open_confirm_prepare,
1559 .rpc_call_done = nfs4_open_confirm_done, 1658 .rpc_call_done = nfs4_open_confirm_done,
1560 .rpc_release = nfs4_open_confirm_release, 1659 .rpc_release = nfs4_open_confirm_release,
1561}; 1660};
@@ -1583,6 +1682,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
1583 }; 1682 };
1584 int status; 1683 int status;
1585 1684
1685 nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1);
1586 kref_get(&data->kref); 1686 kref_get(&data->kref);
1587 data->rpc_done = 0; 1687 data->rpc_done = 0;
1588 data->rpc_status = 0; 1688 data->rpc_status = 0;
@@ -1742,7 +1842,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
1742 }; 1842 };
1743 int status; 1843 int status;
1744 1844
1745 nfs41_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1); 1845 nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1);
1746 kref_get(&data->kref); 1846 kref_get(&data->kref);
1747 data->rpc_done = 0; 1847 data->rpc_done = 0;
1748 data->rpc_status = 0; 1848 data->rpc_status = 0;
@@ -1847,8 +1947,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1847 1947
1848 nfs_fattr_map_and_free_names(server, &data->f_attr); 1948 nfs_fattr_map_and_free_names(server, &data->f_attr);
1849 1949
1850 if (o_arg->open_flags & O_CREAT) 1950 if (o_arg->open_flags & O_CREAT) {
1851 update_changeattr(dir, &o_res->cinfo); 1951 update_changeattr(dir, &o_res->cinfo);
1952 if (o_arg->open_flags & O_EXCL)
1953 data->file_created = 1;
1954 else if (o_res->cinfo.before != o_res->cinfo.after)
1955 data->file_created = 1;
1956 }
1852 if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) 1957 if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
1853 server->caps &= ~NFS_CAP_POSIX_LOCK; 1958 server->caps &= ~NFS_CAP_POSIX_LOCK;
1854 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1959 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
@@ -1895,6 +2000,7 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
1895 2000
1896 do { 2001 do {
1897 err = _nfs4_open_expired(ctx, state); 2002 err = _nfs4_open_expired(ctx, state);
2003 trace_nfs4_open_expired(ctx, 0, err);
1898 if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) 2004 if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
1899 continue; 2005 continue;
1900 switch (err) { 2006 switch (err) {
@@ -1944,6 +2050,7 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
1944 cred = get_rpccred(delegation->cred); 2050 cred = get_rpccred(delegation->cred);
1945 rcu_read_unlock(); 2051 rcu_read_unlock();
1946 status = nfs41_test_stateid(server, stateid, cred); 2052 status = nfs41_test_stateid(server, stateid, cred);
2053 trace_nfs4_test_delegation_stateid(state, NULL, status);
1947 } else 2054 } else
1948 rcu_read_unlock(); 2055 rcu_read_unlock();
1949 2056
@@ -1986,6 +2093,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
1986 return -NFS4ERR_BAD_STATEID; 2093 return -NFS4ERR_BAD_STATEID;
1987 2094
1988 status = nfs41_test_stateid(server, stateid, cred); 2095 status = nfs41_test_stateid(server, stateid, cred);
2096 trace_nfs4_test_open_stateid(state, NULL, status);
1989 if (status != NFS_OK) { 2097 if (status != NFS_OK) {
1990 /* Free the stateid unless the server explicitly 2098 /* Free the stateid unless the server explicitly
1991 * informs us the stateid is unrecognized. */ 2099 * informs us the stateid is unrecognized. */
@@ -2089,7 +2197,8 @@ static int _nfs4_do_open(struct inode *dir,
2089 struct nfs_open_context *ctx, 2197 struct nfs_open_context *ctx,
2090 int flags, 2198 int flags,
2091 struct iattr *sattr, 2199 struct iattr *sattr,
2092 struct nfs4_label *label) 2200 struct nfs4_label *label,
2201 int *opened)
2093{ 2202{
2094 struct nfs4_state_owner *sp; 2203 struct nfs4_state_owner *sp;
2095 struct nfs4_state *state = NULL; 2204 struct nfs4_state *state = NULL;
@@ -2159,6 +2268,8 @@ static int _nfs4_do_open(struct inode *dir,
2159 nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); 2268 nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
2160 } 2269 }
2161 } 2270 }
2271 if (opendata->file_created)
2272 *opened |= FILE_CREATED;
2162 2273
2163 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) 2274 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
2164 *ctx_th = opendata->f_attr.mdsthreshold; 2275 *ctx_th = opendata->f_attr.mdsthreshold;
@@ -2187,7 +2298,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
2187 struct nfs_open_context *ctx, 2298 struct nfs_open_context *ctx,
2188 int flags, 2299 int flags,
2189 struct iattr *sattr, 2300 struct iattr *sattr,
2190 struct nfs4_label *label) 2301 struct nfs4_label *label,
2302 int *opened)
2191{ 2303{
2192 struct nfs_server *server = NFS_SERVER(dir); 2304 struct nfs_server *server = NFS_SERVER(dir);
2193 struct nfs4_exception exception = { }; 2305 struct nfs4_exception exception = { };
@@ -2195,8 +2307,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
2195 int status; 2307 int status;
2196 2308
2197 do { 2309 do {
2198 status = _nfs4_do_open(dir, ctx, flags, sattr, label); 2310 status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened);
2199 res = ctx->state; 2311 res = ctx->state;
2312 trace_nfs4_open_file(ctx, flags, status);
2200 if (status == 0) 2313 if (status == 0)
2201 break; 2314 break;
2202 /* NOTE: BAD_SEQID means the server and client disagree about the 2315 /* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2310,6 +2423,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2310 int err; 2423 int err;
2311 do { 2424 do {
2312 err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); 2425 err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
2426 trace_nfs4_setattr(inode, err);
2313 switch (err) { 2427 switch (err) {
2314 case -NFS4ERR_OPENMODE: 2428 case -NFS4ERR_OPENMODE:
2315 if (!(sattr->ia_valid & ATTR_SIZE)) { 2429 if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2387,6 +2501,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2387 dprintk("%s: begin!\n", __func__); 2501 dprintk("%s: begin!\n", __func__);
2388 if (!nfs4_sequence_done(task, &calldata->res.seq_res)) 2502 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
2389 return; 2503 return;
2504 trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
2390 /* hmm. we are done with the inode, and in the process of freeing 2505 /* hmm. we are done with the inode, and in the process of freeing
2391 * the state_owner. we keep this around to process errors 2506 * the state_owner. we keep this around to process errors
2392 */ 2507 */
@@ -2511,10 +2626,13 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2511 }; 2626 };
2512 int status = -ENOMEM; 2627 int status = -ENOMEM;
2513 2628
2629 nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP,
2630 &task_setup_data.rpc_client, &msg);
2631
2514 calldata = kzalloc(sizeof(*calldata), gfp_mask); 2632 calldata = kzalloc(sizeof(*calldata), gfp_mask);
2515 if (calldata == NULL) 2633 if (calldata == NULL)
2516 goto out; 2634 goto out;
2517 nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1); 2635 nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1);
2518 calldata->inode = state->inode; 2636 calldata->inode = state->inode;
2519 calldata->state = state; 2637 calldata->state = state;
2520 calldata->arg.fh = NFS_FH(state->inode); 2638 calldata->arg.fh = NFS_FH(state->inode);
@@ -2551,7 +2669,8 @@ out:
2551} 2669}
2552 2670
2553static struct inode * 2671static struct inode *
2554nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) 2672nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
2673 int open_flags, struct iattr *attr, int *opened)
2555{ 2674{
2556 struct nfs4_state *state; 2675 struct nfs4_state *state;
2557 struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; 2676 struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
@@ -2559,7 +2678,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
2559 label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); 2678 label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
2560 2679
2561 /* Protect against concurrent sillydeletes */ 2680 /* Protect against concurrent sillydeletes */
2562 state = nfs4_do_open(dir, ctx, open_flags, attr, label); 2681 state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened);
2563 2682
2564 nfs4_label_release_security(label); 2683 nfs4_label_release_security(label);
2565 2684
@@ -2690,6 +2809,7 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2690 int err; 2809 int err;
2691 do { 2810 do {
2692 err = _nfs4_lookup_root(server, fhandle, info); 2811 err = _nfs4_lookup_root(server, fhandle, info);
2812 trace_nfs4_lookup_root(server, fhandle, info->fattr, err);
2693 switch (err) { 2813 switch (err) {
2694 case 0: 2814 case 0:
2695 case -NFS4ERR_WRONGSEC: 2815 case -NFS4ERR_WRONGSEC:
@@ -2705,10 +2825,13 @@ out:
2705static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, 2825static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
2706 struct nfs_fsinfo *info, rpc_authflavor_t flavor) 2826 struct nfs_fsinfo *info, rpc_authflavor_t flavor)
2707{ 2827{
2828 struct rpc_auth_create_args auth_args = {
2829 .pseudoflavor = flavor,
2830 };
2708 struct rpc_auth *auth; 2831 struct rpc_auth *auth;
2709 int ret; 2832 int ret;
2710 2833
2711 auth = rpcauth_create(flavor, server->client); 2834 auth = rpcauth_create(&auth_args, server->client);
2712 if (IS_ERR(auth)) { 2835 if (IS_ERR(auth)) {
2713 ret = -EACCES; 2836 ret = -EACCES;
2714 goto out; 2837 goto out;
@@ -2772,18 +2895,27 @@ static int nfs4_do_find_root_sec(struct nfs_server *server,
2772 * @server: initialized nfs_server handle 2895 * @server: initialized nfs_server handle
2773 * @fhandle: we fill in the pseudo-fs root file handle 2896 * @fhandle: we fill in the pseudo-fs root file handle
2774 * @info: we fill in an FSINFO struct 2897 * @info: we fill in an FSINFO struct
2898 * @auth_probe: probe the auth flavours
2775 * 2899 *
2776 * Returns zero on success, or a negative errno. 2900 * Returns zero on success, or a negative errno.
2777 */ 2901 */
2778int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, 2902int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
2779 struct nfs_fsinfo *info) 2903 struct nfs_fsinfo *info,
2904 bool auth_probe)
2780{ 2905{
2781 int status; 2906 int status;
2782 2907
2783 status = nfs4_lookup_root(server, fhandle, info); 2908 switch (auth_probe) {
2784 if ((status == -NFS4ERR_WRONGSEC) && 2909 case false:
2785 !(server->flags & NFS_MOUNT_SECFLAVOUR)) 2910 status = nfs4_lookup_root(server, fhandle, info);
2911 if (status != -NFS4ERR_WRONGSEC)
2912 break;
2913 /* Did user force a 'sec=' mount option? */
2914 if (server->flags & NFS_MOUNT_SECFLAVOUR)
2915 break;
2916 default:
2786 status = nfs4_do_find_root_sec(server, fhandle, info); 2917 status = nfs4_do_find_root_sec(server, fhandle, info);
2918 }
2787 2919
2788 if (status == 0) 2920 if (status == 0)
2789 status = nfs4_server_capabilities(server, fhandle); 2921 status = nfs4_server_capabilities(server, fhandle);
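
The switch on a bool above is deliberate: for auth_probe == false the client first tries the root lookup with the flavour it already has, and only falls through into the probe when the server answers NFS4ERR_WRONGSEC and no 'sec=' option pinned the flavour; auth_probe == true jumps straight to the probe. A minimal userspace sketch of that control flow, with hypothetical stubs (lookup_root_default, find_root_sec) standing in for the NFS calls:

#include <stdbool.h>
#include <stdio.h>

#define NFS4ERR_WRONGSEC     10016      /* from RFC 5661 */
#define NFS_MOUNT_SECFLAVOUR 0x1        /* placeholder flag bit */

/* Stand-ins for nfs4_lookup_root()/nfs4_do_find_root_sec():
 * this pretend server rejects the default flavour. */
static int lookup_root_default(void) { return -NFS4ERR_WRONGSEC; }
static int find_root_sec(void)       { return 0; }

static int get_rootfh(bool auth_probe, unsigned server_flags)
{
        int status;

        switch (auth_probe) {
        case false:
                status = lookup_root_default();
                if (status != -NFS4ERR_WRONGSEC)
                        break;
                /* a forced 'sec=' option disables negotiation */
                if (server_flags & NFS_MOUNT_SECFLAVOUR)
                        break;
                /* fall through to the probe */
        default:
                status = find_root_sec();
        }
        return status;
}

int main(void)
{
        printf("pinned sec=:  %d\n", get_rootfh(false, NFS_MOUNT_SECFLAVOUR));
        printf("negotiable:   %d\n", get_rootfh(false, 0));
        printf("probe forced: %d\n", get_rootfh(true, 0));
        return 0;
}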
@@ -2899,8 +3031,9 @@ static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2899 struct nfs4_exception exception = { }; 3031 struct nfs4_exception exception = { };
2900 int err; 3032 int err;
2901 do { 3033 do {
2902 err = nfs4_handle_exception(server, 3034 err = _nfs4_proc_getattr(server, fhandle, fattr, label);
2903 _nfs4_proc_getattr(server, fhandle, fattr, label), 3035 trace_nfs4_getattr(server, fhandle, fattr, err);
3036 err = nfs4_handle_exception(server, err,
2904 &exception); 3037 &exception);
2905 } while (exception.retry); 3038 } while (exception.retry);
2906 return err; 3039 return err;
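
Many hunks in this diff apply the same mechanical refactor: the inner _nfs4_* call is hoisted out of the nfs4_handle_exception() argument list so the new tracepoint sees the raw NFS status before the exception logic rewrites it into a retry or a mapped errno. A hedged model of the loop shape (trace_event and handle_exception are simplified stand-ins):

#include <stdio.h>

#define NFS4ERR_DELAY 10008

struct exception { int retry; };

static int attempt;
/* Stub op: fail once with a retryable error, then succeed. */
static int do_op(void) { return attempt++ ? 0 : -NFS4ERR_DELAY; }

static void trace_event(int err) { printf("trace: err=%d\n", err); }

/* Swallow retryable errors and ask the caller to loop, roughly
 * what nfs4_handle_exception() does. */
static int handle_exception(int err, struct exception *exc)
{
        exc->retry = (err == -NFS4ERR_DELAY);
        return exc->retry ? 0 : err;
}

int main(void)
{
        struct exception exception = { 0 };
        int err;

        do {
                err = do_op();          /* raw status... */
                trace_event(err);       /* ...visible to tracing... */
                err = handle_exception(err, &exception);
        } while (exception.retry);
        printf("final: %d\n", err);
        return 0;
}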
@@ -2940,10 +3073,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2940 3073
2941 /* Deal with open(O_TRUNC) */ 3074 /* Deal with open(O_TRUNC) */
2942 if (sattr->ia_valid & ATTR_OPEN) 3075 if (sattr->ia_valid & ATTR_OPEN)
2943 sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); 3076 sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME);
2944 3077
2945 /* Optimization: if the end result is no change, don't RPC */ 3078 /* Optimization: if the end result is no change, don't RPC */
2946 if ((sattr->ia_valid & ~(ATTR_FILE)) == 0) 3079 if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
2947 return 0; 3080 return 0;
2948 3081
2949 /* Search for an existing open(O_WRITE) file */ 3082 /* Search for an existing open(O_WRITE) file */
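
The setattr change is easy to misread: ATTR_OPEN used to be stripped together with the timestamps, so later code never saw it; now it survives in ia_valid and is instead excluded from the "no change, skip the RPC" test, keeping the two decisions independent. A small sketch of the mask arithmetic (flag values copied from <linux/fs.h>; treat them as assumptions):

#include <stdio.h>

#define ATTR_SIZE  0x0008
#define ATTR_MTIME 0x0020
#define ATTR_CTIME 0x0040
#define ATTR_FILE  0x2000
#define ATTR_OPEN  0x8000

int main(void)
{
        unsigned ia_valid = ATTR_OPEN | ATTR_SIZE | ATTR_MTIME | ATTR_CTIME;

        /* New behaviour: open(O_TRUNC) drops only the timestamps... */
        ia_valid &= ~(ATTR_MTIME | ATTR_CTIME);

        /* ...while ATTR_FILE and ATTR_OPEN are merely ignored by the
         * "skip the RPC" test, so ATTR_OPEN itself is preserved. */
        if ((ia_valid & ~(ATTR_FILE | ATTR_OPEN)) == 0)
                printf("no RPC needed\n");
        else
                printf("send SETATTR, ia_valid=0x%x\n", ia_valid);
        return 0;
}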
@@ -3020,6 +3153,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
3020 int err; 3153 int err;
3021 do { 3154 do {
3022 err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label); 3155 err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
3156 trace_nfs4_lookup(dir, name, err);
3023 switch (err) { 3157 switch (err) {
3024 case -NFS4ERR_BADNAME: 3158 case -NFS4ERR_BADNAME:
3025 err = -ENOENT; 3159 err = -ENOENT;
@@ -3031,7 +3165,9 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
3031 err = -EPERM; 3165 err = -EPERM;
3032 if (client != *clnt) 3166 if (client != *clnt)
3033 goto out; 3167 goto out;
3034 3168 /* No security negotiation if the user specified 'sec=' */
3169 if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR)
3170 goto out;
3035 client = nfs4_create_sec_client(client, dir, name); 3171 client = nfs4_create_sec_client(client, dir, name);
3036 if (IS_ERR(client)) 3172 if (IS_ERR(client))
3037 return PTR_ERR(client); 3173 return PTR_ERR(client);
@@ -3134,8 +3270,9 @@ static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
3134 struct nfs4_exception exception = { }; 3270 struct nfs4_exception exception = { };
3135 int err; 3271 int err;
3136 do { 3272 do {
3137 err = nfs4_handle_exception(NFS_SERVER(inode), 3273 err = _nfs4_proc_access(inode, entry);
3138 _nfs4_proc_access(inode, entry), 3274 trace_nfs4_access(inode, err);
3275 err = nfs4_handle_exception(NFS_SERVER(inode), err,
3139 &exception); 3276 &exception);
3140 } while (exception.retry); 3277 } while (exception.retry);
3141 return err; 3278 return err;
@@ -3188,8 +3325,9 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
3188 struct nfs4_exception exception = { }; 3325 struct nfs4_exception exception = { };
3189 int err; 3326 int err;
3190 do { 3327 do {
3191 err = nfs4_handle_exception(NFS_SERVER(inode), 3328 err = _nfs4_proc_readlink(inode, page, pgbase, pglen);
3192 _nfs4_proc_readlink(inode, page, pgbase, pglen), 3329 trace_nfs4_readlink(inode, err);
3330 err = nfs4_handle_exception(NFS_SERVER(inode), err,
3193 &exception); 3331 &exception);
3194 } while (exception.retry); 3332 } while (exception.retry);
3195 return err; 3333 return err;
@@ -3205,6 +3343,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
3205 struct nfs4_label l, *ilabel = NULL; 3343 struct nfs4_label l, *ilabel = NULL;
3206 struct nfs_open_context *ctx; 3344 struct nfs_open_context *ctx;
3207 struct nfs4_state *state; 3345 struct nfs4_state *state;
3346 int opened = 0;
3208 int status = 0; 3347 int status = 0;
3209 3348
3210 ctx = alloc_nfs_open_context(dentry, FMODE_READ); 3349 ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3214,7 +3353,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
3214 ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); 3353 ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
3215 3354
3216 sattr->ia_mode &= ~current_umask(); 3355 sattr->ia_mode &= ~current_umask();
3217 state = nfs4_do_open(dir, ctx, flags, sattr, ilabel); 3356 state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
3218 if (IS_ERR(state)) { 3357 if (IS_ERR(state)) {
3219 status = PTR_ERR(state); 3358 status = PTR_ERR(state);
3220 goto out; 3359 goto out;
@@ -3253,8 +3392,9 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
3253 struct nfs4_exception exception = { }; 3392 struct nfs4_exception exception = { };
3254 int err; 3393 int err;
3255 do { 3394 do {
3256 err = nfs4_handle_exception(NFS_SERVER(dir), 3395 err = _nfs4_proc_remove(dir, name);
3257 _nfs4_proc_remove(dir, name), 3396 trace_nfs4_remove(dir, name, err);
3397 err = nfs4_handle_exception(NFS_SERVER(dir), err,
3258 &exception); 3398 &exception);
3259 } while (exception.retry); 3399 } while (exception.retry);
3260 return err; 3400 return err;
@@ -3268,7 +3408,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
3268 3408
3269 res->server = server; 3409 res->server = server;
3270 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 3410 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
3271 nfs41_init_sequence(&args->seq_args, &res->seq_res, 1); 3411 nfs4_init_sequence(&args->seq_args, &res->seq_res, 1);
3272 3412
3273 nfs_fattr_init(res->dir_attr); 3413 nfs_fattr_init(res->dir_attr);
3274} 3414}
@@ -3283,7 +3423,8 @@ static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlin
3283 3423
3284static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) 3424static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
3285{ 3425{
3286 struct nfs_removeres *res = task->tk_msg.rpc_resp; 3426 struct nfs_unlinkdata *data = task->tk_calldata;
3427 struct nfs_removeres *res = &data->res;
3287 3428
3288 if (!nfs4_sequence_done(task, &res->seq_res)) 3429 if (!nfs4_sequence_done(task, &res->seq_res))
3289 return 0; 3430 return 0;
@@ -3301,7 +3442,7 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
3301 3442
3302 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; 3443 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
3303 res->server = server; 3444 res->server = server;
3304 nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1); 3445 nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1);
3305} 3446}
3306 3447
3307static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) 3448static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
@@ -3315,7 +3456,8 @@ static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renam
3315static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, 3456static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
3316 struct inode *new_dir) 3457 struct inode *new_dir)
3317{ 3458{
3318 struct nfs_renameres *res = task->tk_msg.rpc_resp; 3459 struct nfs_renamedata *data = task->tk_calldata;
3460 struct nfs_renameres *res = &data->res;
3319 3461
3320 if (!nfs4_sequence_done(task, &res->seq_res)) 3462 if (!nfs4_sequence_done(task, &res->seq_res))
3321 return 0; 3463 return 0;
@@ -3361,9 +3503,10 @@ static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
3361 struct nfs4_exception exception = { }; 3503 struct nfs4_exception exception = { };
3362 int err; 3504 int err;
3363 do { 3505 do {
3364 err = nfs4_handle_exception(NFS_SERVER(old_dir), 3506 err = _nfs4_proc_rename(old_dir, old_name,
3365 _nfs4_proc_rename(old_dir, old_name, 3507 new_dir, new_name);
3366 new_dir, new_name), 3508 trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err);
3509 err = nfs4_handle_exception(NFS_SERVER(old_dir), err,
3367 &exception); 3510 &exception);
3368 } while (exception.retry); 3511 } while (exception.retry);
3369 return err; 3512 return err;
@@ -3525,9 +3668,9 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
3525 label = nfs4_label_init_security(dir, dentry, sattr, &l); 3668 label = nfs4_label_init_security(dir, dentry, sattr, &l);
3526 3669
3527 do { 3670 do {
3528 err = nfs4_handle_exception(NFS_SERVER(dir), 3671 err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label);
3529 _nfs4_proc_symlink(dir, dentry, page, 3672 trace_nfs4_symlink(dir, &dentry->d_name, err);
3530 len, sattr, label), 3673 err = nfs4_handle_exception(NFS_SERVER(dir), err,
3531 &exception); 3674 &exception);
3532 } while (exception.retry); 3675 } while (exception.retry);
3533 3676
@@ -3564,8 +3707,9 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
3564 3707
3565 sattr->ia_mode &= ~current_umask(); 3708 sattr->ia_mode &= ~current_umask();
3566 do { 3709 do {
3567 err = nfs4_handle_exception(NFS_SERVER(dir), 3710 err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
3568 _nfs4_proc_mkdir(dir, dentry, sattr, label), 3711 trace_nfs4_mkdir(dir, &dentry->d_name, err);
3712 err = nfs4_handle_exception(NFS_SERVER(dir), err,
3569 &exception); 3713 &exception);
3570 } while (exception.retry); 3714 } while (exception.retry);
3571 nfs4_label_release_security(label); 3715 nfs4_label_release_security(label);
@@ -3618,9 +3762,10 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
3618 struct nfs4_exception exception = { }; 3762 struct nfs4_exception exception = { };
3619 int err; 3763 int err;
3620 do { 3764 do {
3621 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), 3765 err = _nfs4_proc_readdir(dentry, cred, cookie,
3622 _nfs4_proc_readdir(dentry, cred, cookie, 3766 pages, count, plus);
3623 pages, count, plus), 3767 trace_nfs4_readdir(dentry->d_inode, err);
3768 err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), err,
3624 &exception); 3769 &exception);
3625 } while (exception.retry); 3770 } while (exception.retry);
3626 return err; 3771 return err;
@@ -3672,8 +3817,9 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
3672 3817
3673 sattr->ia_mode &= ~current_umask(); 3818 sattr->ia_mode &= ~current_umask();
3674 do { 3819 do {
3675 err = nfs4_handle_exception(NFS_SERVER(dir), 3820 err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev);
3676 _nfs4_proc_mknod(dir, dentry, sattr, label, rdev), 3821 trace_nfs4_mknod(dir, &dentry->d_name, err);
3822 err = nfs4_handle_exception(NFS_SERVER(dir), err,
3677 &exception); 3823 &exception);
3678 } while (exception.retry); 3824 } while (exception.retry);
3679 3825
@@ -3741,6 +3887,7 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
3741 3887
3742 do { 3888 do {
3743 err = _nfs4_do_fsinfo(server, fhandle, fsinfo); 3889 err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
3890 trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
3744 if (err == 0) { 3891 if (err == 0) {
3745 struct nfs_client *clp = server->nfs_client; 3892 struct nfs_client *clp = server->nfs_client;
3746 3893
@@ -3859,6 +4006,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3859{ 4006{
3860 struct nfs_server *server = NFS_SERVER(data->header->inode); 4007 struct nfs_server *server = NFS_SERVER(data->header->inode);
3861 4008
4009 trace_nfs4_read(data, task->tk_status);
3862 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 4010 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3863 rpc_restart_call_prepare(task); 4011 rpc_restart_call_prepare(task);
3864 return -EAGAIN; 4012 return -EAGAIN;
@@ -3902,24 +4050,29 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
3902 data->timestamp = jiffies; 4050 data->timestamp = jiffies;
3903 data->read_done_cb = nfs4_read_done_cb; 4051 data->read_done_cb = nfs4_read_done_cb;
3904 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 4052 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
3905 nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); 4053 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
3906} 4054}
3907 4055
3908static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) 4056static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
3909{ 4057{
3910 if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), 4058 if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
3911 &data->args.seq_args, 4059 &data->args.seq_args,
3912 &data->res.seq_res, 4060 &data->res.seq_res,
3913 task)) 4061 task))
3914 return; 4062 return 0;
3915 nfs4_set_rw_stateid(&data->args.stateid, data->args.context, 4063 if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
3916 data->args.lock_context, FMODE_READ); 4064 data->args.lock_context, FMODE_READ) == -EIO)
4065 return -EIO;
4066 if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
4067 return -EIO;
4068 return 0;
3917} 4069}
3918 4070
3919static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) 4071static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3920{ 4072{
3921 struct inode *inode = data->header->inode; 4073 struct inode *inode = data->header->inode;
3922 4074
4075 trace_nfs4_write(data, task->tk_status);
3923 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 4076 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3924 rpc_restart_call_prepare(task); 4077 rpc_restart_call_prepare(task);
3925 return -EAGAIN; 4078 return -EAGAIN;
@@ -3985,18 +4138,22 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
3985 data->timestamp = jiffies; 4138 data->timestamp = jiffies;
3986 4139
3987 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; 4140 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
3988 nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 4141 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
3989} 4142}
3990 4143
3991static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) 4144static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
3992{ 4145{
3993 if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), 4146 if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
3994 &data->args.seq_args, 4147 &data->args.seq_args,
3995 &data->res.seq_res, 4148 &data->res.seq_res,
3996 task)) 4149 task))
3997 return; 4150 return 0;
3998 nfs4_set_rw_stateid(&data->args.stateid, data->args.context, 4151 if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
3999 data->args.lock_context, FMODE_WRITE); 4152 data->args.lock_context, FMODE_WRITE) == -EIO)
4153 return -EIO;
4154 if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
4155 return -EIO;
4156 return 0;
4000} 4157}
4001 4158
4002static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) 4159static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
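
nfs4_proc_read_rpc_prepare() and its write counterpart change from void to int here: ->rpc_call_prepare can now fail the task with -EIO when the stateid cannot be copied or the open context is already marked bad, instead of transmitting a doomed READ/WRITE. A sketch of that calling convention, with a hypothetical dispatcher standing in for the RPC state machine:

#include <errno.h>
#include <stdio.h>

struct task { int status; };

/* Modelled on the new int-returning ->rpc_call_prepare:
 * 0 means transmit, a negative errno aborts the task early. */
static int prepare(struct task *t, int stateid_ok, int context_bad)
{
        (void)t;
        if (!stateid_ok || context_bad)
                return -EIO;
        return 0;
}

static void run_task(struct task *t, int stateid_ok, int context_bad)
{
        t->status = prepare(t, stateid_ok, context_bad);
        if (t->status < 0) {
                printf("aborted in prepare: %d\n", t->status);
                return;
        }
        printf("transmitted\n");
}

int main(void)
{
        struct task t;

        run_task(&t, 1, 0);     /* healthy context: RPC goes out */
        run_task(&t, 0, 0);     /* unusable stateid: fail fast with -EIO */
        return 0;
}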
@@ -4011,6 +4168,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
4011{ 4168{
4012 struct inode *inode = data->inode; 4169 struct inode *inode = data->inode;
4013 4170
4171 trace_nfs4_commit(data, task->tk_status);
4014 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 4172 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
4015 rpc_restart_call_prepare(task); 4173 rpc_restart_call_prepare(task);
4016 return -EAGAIN; 4174 return -EAGAIN;
@@ -4033,7 +4191,7 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
4033 data->commit_done_cb = nfs4_commit_done_cb; 4191 data->commit_done_cb = nfs4_commit_done_cb;
4034 data->res.server = server; 4192 data->res.server = server;
4035 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 4193 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
4036 nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 4194 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
4037} 4195}
4038 4196
4039struct nfs4_renewdata { 4197struct nfs4_renewdata {
@@ -4062,6 +4220,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
4062 struct nfs_client *clp = data->client; 4220 struct nfs_client *clp = data->client;
4063 unsigned long timestamp = data->timestamp; 4221 unsigned long timestamp = data->timestamp;
4064 4222
4223 trace_nfs4_renew_async(clp, task->tk_status);
4065 if (task->tk_status < 0) { 4224 if (task->tk_status < 0) {
4066 /* Unless we're shutting down, schedule state recovery! */ 4225 /* Unless we're shutting down, schedule state recovery! */
4067 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) 4226 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
@@ -4319,6 +4478,7 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl
4319 ssize_t ret; 4478 ssize_t ret;
4320 do { 4479 do {
4321 ret = __nfs4_get_acl_uncached(inode, buf, buflen); 4480 ret = __nfs4_get_acl_uncached(inode, buf, buflen);
4481 trace_nfs4_get_acl(inode, ret);
4322 if (ret >= 0) 4482 if (ret >= 0)
4323 break; 4483 break;
4324 ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); 4484 ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception);
@@ -4398,8 +4558,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
4398 struct nfs4_exception exception = { }; 4558 struct nfs4_exception exception = { };
4399 int err; 4559 int err;
4400 do { 4560 do {
4401 err = nfs4_handle_exception(NFS_SERVER(inode), 4561 err = __nfs4_proc_set_acl(inode, buf, buflen);
4402 __nfs4_proc_set_acl(inode, buf, buflen), 4562 trace_nfs4_set_acl(inode, err);
4563 err = nfs4_handle_exception(NFS_SERVER(inode), err,
4403 &exception); 4564 &exception);
4404 } while (exception.retry); 4565 } while (exception.retry);
4405 return err; 4566 return err;
@@ -4452,8 +4613,9 @@ static int nfs4_get_security_label(struct inode *inode, void *buf,
4452 return -EOPNOTSUPP; 4613 return -EOPNOTSUPP;
4453 4614
4454 do { 4615 do {
4455 err = nfs4_handle_exception(NFS_SERVER(inode), 4616 err = _nfs4_get_security_label(inode, buf, buflen);
4456 _nfs4_get_security_label(inode, buf, buflen), 4617 trace_nfs4_get_security_label(inode, err);
4618 err = nfs4_handle_exception(NFS_SERVER(inode), err,
4457 &exception); 4619 &exception);
4458 } while (exception.retry); 4620 } while (exception.retry);
4459 return err; 4621 return err;
@@ -4505,9 +4667,10 @@ static int nfs4_do_set_security_label(struct inode *inode,
4505 int err; 4667 int err;
4506 4668
4507 do { 4669 do {
4508 err = nfs4_handle_exception(NFS_SERVER(inode), 4670 err = _nfs4_do_set_security_label(inode, ilabel,
4509 _nfs4_do_set_security_label(inode, ilabel, 4671 fattr, olabel);
4510 fattr, olabel), 4672 trace_nfs4_set_security_label(inode, err);
4673 err = nfs4_handle_exception(NFS_SERVER(inode), err,
4511 &exception); 4674 &exception);
4512 } while (exception.retry); 4675 } while (exception.retry);
4513 return err; 4676 return err;
@@ -4630,11 +4793,11 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
4630 /* An impossible timestamp guarantees this value 4793 /* An impossible timestamp guarantees this value
4631 * will never match a generated boot time. */ 4794 * will never match a generated boot time. */
4632 verf[0] = 0; 4795 verf[0] = 0;
4633 verf[1] = (__be32)(NSEC_PER_SEC + 1); 4796 verf[1] = cpu_to_be32(NSEC_PER_SEC + 1);
4634 } else { 4797 } else {
4635 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); 4798 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
4636 verf[0] = (__be32)nn->boot_time.tv_sec; 4799 verf[0] = cpu_to_be32(nn->boot_time.tv_sec);
4637 verf[1] = (__be32)nn->boot_time.tv_nsec; 4800 verf[1] = cpu_to_be32(nn->boot_time.tv_nsec);
4638 } 4801 }
4639 memcpy(bootverf->data, verf, sizeof(bootverf->data)); 4802 memcpy(bootverf->data, verf, sizeof(bootverf->data));
4640} 4803}
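
The verifier fix matters on little-endian hosts: a bare (__be32) cast changes only the type, not the byte order, so the wire bytes were not the big-endian encoding the annotation promised. cpu_to_be32() performs the actual swap; the userspace equivalent for 32-bit values is htonl(), as this illustration shows:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t sec = 1380000000;      /* an example boot time */
        uint32_t cast_only = sec;       /* what '(__be32)sec' stored */
        uint32_t swapped = htonl(sec);  /* what cpu_to_be32(sec) stores */

        /* On little-endian hosts the bytes that hit the wire differ: */
        unsigned char *a = (unsigned char *)&cast_only;
        unsigned char *b = (unsigned char *)&swapped;

        printf("cast only:   %02x %02x %02x %02x\n", a[0], a[1], a[2], a[3]);
        printf("cpu_to_be32: %02x %02x %02x %02x\n", b[0], b[1], b[2], b[3]);
        return 0;
}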
@@ -4660,10 +4823,14 @@ static unsigned int
4660nfs4_init_uniform_client_string(const struct nfs_client *clp, 4823nfs4_init_uniform_client_string(const struct nfs_client *clp,
4661 char *buf, size_t len) 4824 char *buf, size_t len)
4662{ 4825{
4663 char *nodename = clp->cl_rpcclient->cl_nodename; 4826 const char *nodename = clp->cl_rpcclient->cl_nodename;
4664 4827
4665 if (nfs4_client_id_uniquifier[0] != '\0') 4828 if (nfs4_client_id_uniquifier[0] != '\0')
4666 nodename = nfs4_client_id_uniquifier; 4829 return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
4830 clp->rpc_ops->version,
4831 clp->cl_minorversion,
4832 nfs4_client_id_uniquifier,
4833 nodename);
4667 return scnprintf(buf, len, "Linux NFSv%u.%u %s", 4834 return scnprintf(buf, len, "Linux NFSv%u.%u %s",
4668 clp->rpc_ops->version, clp->cl_minorversion, 4835 clp->rpc_ops->version, clp->cl_minorversion,
4669 nodename); 4836 nodename);
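
With the rewrite, a non-empty nfs4_client_id_uniquifier no longer replaces the nodename outright; it is prepended with a '/' separator so the client identifier keeps both parts. A quick sketch of the two output formats, using snprintf() in place of the kernel's scnprintf():

#include <stdio.h>

static const char *uniquifier = "rack7";        /* module parameter, may be "" */

int main(void)
{
        char buf[128];
        unsigned version = 4, minor = 1;
        const char *nodename = "client.example.net";

        if (uniquifier[0] != '\0')
                snprintf(buf, sizeof(buf), "Linux NFSv%u.%u %s/%s",
                         version, minor, uniquifier, nodename);
        else
                snprintf(buf, sizeof(buf), "Linux NFSv%u.%u %s",
                         version, minor, nodename);

        printf("%s\n", buf);    /* Linux NFSv4.1 rack7/client.example.net */
        return 0;
}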
@@ -4724,6 +4891,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4724 clp->cl_rpcclient->cl_auth->au_ops->au_name, 4891 clp->cl_rpcclient->cl_auth->au_ops->au_name,
4725 setclientid.sc_name_len, setclientid.sc_name); 4892 setclientid.sc_name_len, setclientid.sc_name);
4726 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 4893 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
4894 trace_nfs4_setclientid(clp, status);
4727 dprintk("NFS reply setclientid: %d\n", status); 4895 dprintk("NFS reply setclientid: %d\n", status);
4728 return status; 4896 return status;
4729} 4897}
@@ -4751,6 +4919,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
4751 clp->cl_rpcclient->cl_auth->au_ops->au_name, 4919 clp->cl_rpcclient->cl_auth->au_ops->au_name,
4752 clp->cl_clientid); 4920 clp->cl_clientid);
4753 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 4921 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
4922 trace_nfs4_setclientid_confirm(clp, status);
4754 dprintk("NFS reply setclientid_confirm: %d\n", status); 4923 dprintk("NFS reply setclientid_confirm: %d\n", status);
4755 return status; 4924 return status;
4756} 4925}
@@ -4772,6 +4941,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
4772 if (!nfs4_sequence_done(task, &data->res.seq_res)) 4941 if (!nfs4_sequence_done(task, &data->res.seq_res))
4773 return; 4942 return;
4774 4943
4944 trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
4775 switch (task->tk_status) { 4945 switch (task->tk_status) {
4776 case -NFS4ERR_STALE_STATEID: 4946 case -NFS4ERR_STALE_STATEID:
4777 case -NFS4ERR_EXPIRED: 4947 case -NFS4ERR_EXPIRED:
@@ -4793,7 +4963,6 @@ static void nfs4_delegreturn_release(void *calldata)
4793 kfree(calldata); 4963 kfree(calldata);
4794} 4964}
4795 4965
4796#if defined(CONFIG_NFS_V4_1)
4797static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) 4966static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
4798{ 4967{
4799 struct nfs4_delegreturndata *d_data; 4968 struct nfs4_delegreturndata *d_data;
@@ -4805,12 +4974,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
4805 &d_data->res.seq_res, 4974 &d_data->res.seq_res,
4806 task); 4975 task);
4807} 4976}
4808#endif /* CONFIG_NFS_V4_1 */
4809 4977
4810static const struct rpc_call_ops nfs4_delegreturn_ops = { 4978static const struct rpc_call_ops nfs4_delegreturn_ops = {
4811#if defined(CONFIG_NFS_V4_1)
4812 .rpc_call_prepare = nfs4_delegreturn_prepare, 4979 .rpc_call_prepare = nfs4_delegreturn_prepare,
4813#endif /* CONFIG_NFS_V4_1 */
4814 .rpc_call_done = nfs4_delegreturn_done, 4980 .rpc_call_done = nfs4_delegreturn_done,
4815 .rpc_release = nfs4_delegreturn_release, 4981 .rpc_release = nfs4_delegreturn_release,
4816}; 4982};
@@ -4835,7 +5001,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
4835 data = kzalloc(sizeof(*data), GFP_NOFS); 5001 data = kzalloc(sizeof(*data), GFP_NOFS);
4836 if (data == NULL) 5002 if (data == NULL)
4837 return -ENOMEM; 5003 return -ENOMEM;
4838 nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 5004 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
4839 data->args.fhandle = &data->fh; 5005 data->args.fhandle = &data->fh;
4840 data->args.stateid = &data->stateid; 5006 data->args.stateid = &data->stateid;
4841 data->args.bitmask = server->cache_consistency_bitmask; 5007 data->args.bitmask = server->cache_consistency_bitmask;
@@ -4875,6 +5041,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
4875 int err; 5041 int err;
4876 do { 5042 do {
4877 err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); 5043 err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
5044 trace_nfs4_delegreturn(inode, err);
4878 switch (err) { 5045 switch (err) {
4879 case -NFS4ERR_STALE_STATEID: 5046 case -NFS4ERR_STALE_STATEID:
4880 case -NFS4ERR_EXPIRED: 5047 case -NFS4ERR_EXPIRED:
@@ -4949,8 +5116,9 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
4949 int err; 5116 int err;
4950 5117
4951 do { 5118 do {
4952 err = nfs4_handle_exception(NFS_SERVER(state->inode), 5119 err = _nfs4_proc_getlk(state, cmd, request);
4953 _nfs4_proc_getlk(state, cmd, request), 5120 trace_nfs4_get_lock(request, state, cmd, err);
5121 err = nfs4_handle_exception(NFS_SERVER(state->inode), err,
4954 &exception); 5122 &exception);
4955 } while (exception.retry); 5123 } while (exception.retry);
4956 return err; 5124 return err;
@@ -5087,6 +5255,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
5087 .flags = RPC_TASK_ASYNC, 5255 .flags = RPC_TASK_ASYNC,
5088 }; 5256 };
5089 5257
5258 nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client,
5259 NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg);
5260
5090 /* Ensure this is an unlock - when canceling a lock, the 5261 /* Ensure this is an unlock - when canceling a lock, the
5091 * canceled lock is passed in, and it won't be an unlock. 5262 * canceled lock is passed in, and it won't be an unlock.
5092 */ 5263 */
@@ -5098,7 +5269,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
5098 return ERR_PTR(-ENOMEM); 5269 return ERR_PTR(-ENOMEM);
5099 } 5270 }
5100 5271
5101 nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); 5272 nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
5102 msg.rpc_argp = &data->arg; 5273 msg.rpc_argp = &data->arg;
5103 msg.rpc_resp = &data->res; 5274 msg.rpc_resp = &data->res;
5104 task_setup_data.callback_data = data; 5275 task_setup_data.callback_data = data;
@@ -5148,6 +5319,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
5148 rpc_put_task(task); 5319 rpc_put_task(task);
5149out: 5320out:
5150 request->fl_flags = fl_flags; 5321 request->fl_flags = fl_flags;
5322 trace_nfs4_unlock(request, state, F_SETLK, status);
5151 return status; 5323 return status;
5152} 5324}
5153 5325
@@ -5333,7 +5505,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
5333 return -ENOMEM; 5505 return -ENOMEM;
5334 if (IS_SETLKW(cmd)) 5506 if (IS_SETLKW(cmd))
5335 data->arg.block = 1; 5507 data->arg.block = 1;
5336 nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); 5508 nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
5337 msg.rpc_argp = &data->arg; 5509 msg.rpc_argp = &data->arg;
5338 msg.rpc_resp = &data->res; 5510 msg.rpc_resp = &data->res;
5339 task_setup_data.callback_data = data; 5511 task_setup_data.callback_data = data;
@@ -5371,6 +5543,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
5371 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) 5543 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
5372 return 0; 5544 return 0;
5373 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); 5545 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
5546 trace_nfs4_lock_reclaim(request, state, F_SETLK, err);
5374 if (err != -NFS4ERR_DELAY) 5547 if (err != -NFS4ERR_DELAY)
5375 break; 5548 break;
5376 nfs4_handle_exception(server, err, &exception); 5549 nfs4_handle_exception(server, err, &exception);
@@ -5389,10 +5562,15 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
5389 err = nfs4_set_lock_state(state, request); 5562 err = nfs4_set_lock_state(state, request);
5390 if (err != 0) 5563 if (err != 0)
5391 return err; 5564 return err;
5565 if (!recover_lost_locks) {
5566 set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags);
5567 return 0;
5568 }
5392 do { 5569 do {
5393 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) 5570 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
5394 return 0; 5571 return 0;
5395 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); 5572 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
5573 trace_nfs4_lock_expired(request, state, F_SETLK, err);
5396 switch (err) { 5574 switch (err) {
5397 default: 5575 default:
5398 goto out; 5576 goto out;
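
The new early return makes reclaiming locks after a lease expiry opt-in: unless the recover_lost_locks module parameter is set, the lock is only flagged NFS_LOCK_LOST, because re-acquiring a lock the server may have granted to someone else risks silent corruption. A stand-in model of the gate (types and flag handling simplified):

#include <stdbool.h>
#include <stdio.h>

static bool recover_lost_locks;         /* models the module parameter */

#define NFS_LOCK_LOST 1                 /* bit number, as in ls_flags */

struct lock_state { unsigned long flags; };

static int lock_expired(struct lock_state *ls)
{
        if (!recover_lost_locks) {
                ls->flags |= 1UL << NFS_LOCK_LOST;  /* remember, don't reclaim */
                return 0;
        }
        printf("attempting SETLK reclaim\n");       /* ...as before... */
        return 0;
}

int main(void)
{
        struct lock_state ls = { 0 };

        lock_expired(&ls);
        printf("lock marked lost: %s\n",
               ls.flags & (1UL << NFS_LOCK_LOST) ? "yes" : "no");
        return 0;
}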
@@ -5428,6 +5606,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
5428 status = nfs41_test_stateid(server, 5606 status = nfs41_test_stateid(server,
5429 &lsp->ls_stateid, 5607 &lsp->ls_stateid,
5430 cred); 5608 cred);
5609 trace_nfs4_test_lock_stateid(state, lsp, status);
5431 if (status != NFS_OK) { 5610 if (status != NFS_OK) {
5432 /* Free the stateid unless the server 5611 /* Free the stateid unless the server
5433 * informs us the stateid is unrecognized. */ 5612 * informs us the stateid is unrecognized. */
@@ -5515,6 +5694,7 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
5515 5694
5516 do { 5695 do {
5517 err = _nfs4_proc_setlk(state, cmd, request); 5696 err = _nfs4_proc_setlk(state, cmd, request);
5697 trace_nfs4_set_lock(request, state, cmd, err);
5518 if (err == -NFS4ERR_DENIED) 5698 if (err == -NFS4ERR_DENIED)
5519 err = -EAGAIN; 5699 err = -EAGAIN;
5520 err = nfs4_handle_exception(NFS_SERVER(state->inode), 5700 err = nfs4_handle_exception(NFS_SERVER(state->inode),
@@ -5597,8 +5777,23 @@ struct nfs_release_lockowner_data {
5597 struct nfs4_lock_state *lsp; 5777 struct nfs4_lock_state *lsp;
5598 struct nfs_server *server; 5778 struct nfs_server *server;
5599 struct nfs_release_lockowner_args args; 5779 struct nfs_release_lockowner_args args;
5780 struct nfs4_sequence_args seq_args;
5781 struct nfs4_sequence_res seq_res;
5600}; 5782};
5601 5783
5784static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
5785{
5786 struct nfs_release_lockowner_data *data = calldata;
5787 nfs40_setup_sequence(data->server,
5788 &data->seq_args, &data->seq_res, task);
5789}
5790
5791static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
5792{
5793 struct nfs_release_lockowner_data *data = calldata;
5794 nfs40_sequence_done(task, &data->seq_res);
5795}
5796
5602static void nfs4_release_lockowner_release(void *calldata) 5797static void nfs4_release_lockowner_release(void *calldata)
5603{ 5798{
5604 struct nfs_release_lockowner_data *data = calldata; 5799 struct nfs_release_lockowner_data *data = calldata;
@@ -5607,6 +5802,8 @@ static void nfs4_release_lockowner_release(void *calldata)
5607} 5802}
5608 5803
5609static const struct rpc_call_ops nfs4_release_lockowner_ops = { 5804static const struct rpc_call_ops nfs4_release_lockowner_ops = {
5805 .rpc_call_prepare = nfs4_release_lockowner_prepare,
5806 .rpc_call_done = nfs4_release_lockowner_done,
5610 .rpc_release = nfs4_release_lockowner_release, 5807 .rpc_release = nfs4_release_lockowner_release,
5611}; 5808};
5612 5809
@@ -5619,14 +5816,17 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5619 5816
5620 if (server->nfs_client->cl_mvops->minor_version != 0) 5817 if (server->nfs_client->cl_mvops->minor_version != 0)
5621 return -EINVAL; 5818 return -EINVAL;
5819
5622 data = kmalloc(sizeof(*data), GFP_NOFS); 5820 data = kmalloc(sizeof(*data), GFP_NOFS);
5623 if (!data) 5821 if (!data)
5624 return -ENOMEM; 5822 return -ENOMEM;
5823 nfs4_init_sequence(&data->seq_args, &data->seq_res, 0);
5625 data->lsp = lsp; 5824 data->lsp = lsp;
5626 data->server = server; 5825 data->server = server;
5627 data->args.lock_owner.clientid = server->nfs_client->cl_clientid; 5826 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
5628 data->args.lock_owner.id = lsp->ls_seqid.owner_id; 5827 data->args.lock_owner.id = lsp->ls_seqid.owner_id;
5629 data->args.lock_owner.s_dev = server->s_dev; 5828 data->args.lock_owner.s_dev = server->s_dev;
5829
5630 msg.rpc_argp = &data->args; 5830 msg.rpc_argp = &data->args;
5631 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); 5831 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
5632 return 0; 5832 return 0;
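
RELEASE_LOCKOWNER is a v4.0-only call, and it now gets ->rpc_call_prepare/->rpc_call_done hooks so it runs under NFSv4.0 sequencing like every other request. The mechanism is an ops table of callbacks driven across the task lifecycle; a compact userspace model of that dispatch (call_async is a hypothetical stand-in for rpc_call_async):

#include <stdio.h>
#include <stdlib.h>

struct call_data { const char *name; };

/* Modelled on struct rpc_call_ops: optional hooks around the RPC. */
struct call_ops {
        void (*prepare)(void *calldata);
        void (*done)(void *calldata);
        void (*release)(void *calldata);
};

static void lockowner_prepare(void *cd)
{
        printf("%s: set up sequence slot\n", ((struct call_data *)cd)->name);
}

static void lockowner_done(void *cd)
{
        printf("%s: sequence done\n", ((struct call_data *)cd)->name);
}

static void lockowner_release(void *cd)
{
        free(cd);       /* always runs, even on transmit failure */
}

static const struct call_ops lockowner_ops = {
        .prepare = lockowner_prepare,
        .done    = lockowner_done,
        .release = lockowner_release,
};

/* Stand-in dispatcher: drives the hooks in order. */
static void call_async(const struct call_ops *ops, void *calldata)
{
        if (ops->prepare)
                ops->prepare(calldata);
        /* ...transmit, wait for the reply... */
        if (ops->done)
                ops->done(calldata);
        if (ops->release)
                ops->release(calldata);
}

int main(void)
{
        struct call_data *d = malloc(sizeof(*d));

        if (!d)
                return 1;
        d->name = "RELEASE_LOCKOWNER";
        call_async(&lockowner_ops, d);
        return 0;
}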
@@ -5781,14 +5981,23 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
5781 struct nfs4_exception exception = { }; 5981 struct nfs4_exception exception = { };
5782 int err; 5982 int err;
5783 do { 5983 do {
5784 err = nfs4_handle_exception(NFS_SERVER(dir), 5984 err = _nfs4_proc_fs_locations(client, dir, name,
5785 _nfs4_proc_fs_locations(client, dir, name, fs_locations, page), 5985 fs_locations, page);
5986 trace_nfs4_get_fs_locations(dir, name, err);
5987 err = nfs4_handle_exception(NFS_SERVER(dir), err,
5786 &exception); 5988 &exception);
5787 } while (exception.retry); 5989 } while (exception.retry);
5788 return err; 5990 return err;
5789} 5991}
5790 5992
5791static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) 5993/**
5994 * If 'use_integrity' is true and the state management nfs_client
5995 * cl_rpcclient is using krb5i/p, use the integrity-protected cl_rpcclient
5996 * and the machine credential as per RFC3530bis and RFC5661 Security
5997 * Considerations sections. Otherwise, just use the user cred with the
5998 * filesystem's rpc_client.
5999 */
6000static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity)
5792{ 6001{
5793 int status; 6002 int status;
5794 struct nfs4_secinfo_arg args = { 6003 struct nfs4_secinfo_arg args = {
@@ -5803,10 +6012,27 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
5803 .rpc_argp = &args, 6012 .rpc_argp = &args,
5804 .rpc_resp = &res, 6013 .rpc_resp = &res,
5805 }; 6014 };
6015 struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
6016 struct rpc_cred *cred = NULL;
6017
6018 if (use_integrity) {
6019 clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient;
6020 cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client);
6021 msg.rpc_cred = cred;
6022 }
5806 6023
5807 dprintk("NFS call secinfo %s\n", name->name); 6024 dprintk("NFS call secinfo %s\n", name->name);
5808 status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); 6025
6026 nfs4_state_protect(NFS_SERVER(dir)->nfs_client,
6027 NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
6028
6029 status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args,
6030 &res.seq_res, 0);
5809 dprintk("NFS reply secinfo: %d\n", status); 6031 dprintk("NFS reply secinfo: %d\n", status);
6032
6033 if (cred)
6034 put_rpccred(cred);
6035
5810 return status; 6036 return status;
5811} 6037}
5812 6038
@@ -5816,8 +6042,23 @@ int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
5816 struct nfs4_exception exception = { }; 6042 struct nfs4_exception exception = { };
5817 int err; 6043 int err;
5818 do { 6044 do {
5819 err = nfs4_handle_exception(NFS_SERVER(dir), 6045 err = -NFS4ERR_WRONGSEC;
5820 _nfs4_proc_secinfo(dir, name, flavors), 6046
6047 /* try to use integrity protection with machine cred */
6048 if (_nfs4_is_integrity_protected(NFS_SERVER(dir)->nfs_client))
6049 err = _nfs4_proc_secinfo(dir, name, flavors, true);
6050
6051 /*
6052 * if unable to use integrity protection, or SECINFO with
6053 * integrity protection returns NFS4ERR_WRONGSEC (which is
6054 * disallowed by spec, but exists in deployed servers) use
6055 * the current filesystem's rpc_client and the user cred.
6056 */
6057 if (err == -NFS4ERR_WRONGSEC)
6058 err = _nfs4_proc_secinfo(dir, name, flavors, false);
6059
6060 trace_nfs4_secinfo(dir, name, err);
6061 err = nfs4_handle_exception(NFS_SERVER(dir), err,
5821 &exception); 6062 &exception);
5822 } while (exception.retry); 6063 } while (exception.retry);
5823 return err; 6064 return err;
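
The rewritten wrapper encodes a two-step policy: seed err with -NFS4ERR_WRONGSEC, attempt SECINFO over the integrity-protected state-management client with the machine credential when krb5i/p is in use, and retry with the filesystem client and user credential when integrity is unavailable or the server (against the spec) still answers WRONGSEC. A condensed model of that ladder:

#include <stdbool.h>
#include <stdio.h>

#define NFS4ERR_WRONGSEC 10016

/* Stub for _nfs4_proc_secinfo(..., use_integrity): this pretend
 * server wrongly rejects the integrity-protected attempt. */
static int secinfo(bool use_integrity)
{
        return use_integrity ? -NFS4ERR_WRONGSEC : 0;
}

static int do_secinfo(bool have_integrity_transport)
{
        int err = -NFS4ERR_WRONGSEC;

        if (have_integrity_transport)   /* krb5i or krb5p in use? */
                err = secinfo(true);    /* machine cred over cl_rpcclient */

        if (err == -NFS4ERR_WRONGSEC)   /* unavailable, or server balked */
                err = secinfo(false);   /* user cred over the fs client */

        return err;
}

int main(void)
{
        printf("with integrity transport:    %d\n", do_secinfo(true));
        printf("without integrity transport: %d\n", do_secinfo(false));
        return 0;
}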
@@ -5881,6 +6122,7 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
5881 } 6122 }
5882 6123
5883 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 6124 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
6125 trace_nfs4_bind_conn_to_session(clp, status);
5884 if (status == 0) { 6126 if (status == 0) {
5885 if (memcmp(res.session->sess_id.data, 6127 if (memcmp(res.session->sess_id.data,
5886 clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { 6128 clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
@@ -5909,16 +6151,126 @@ out:
5909} 6151}
5910 6152
5911/* 6153/*
5912 * nfs4_proc_exchange_id() 6154 * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map
6155 * and operations we'd like to see to enable certain features in the allow map
6156 */
6157static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
6158 .how = SP4_MACH_CRED,
6159 .enforce.u.words = {
6160 [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
6161 1 << (OP_EXCHANGE_ID - 32) |
6162 1 << (OP_CREATE_SESSION - 32) |
6163 1 << (OP_DESTROY_SESSION - 32) |
6164 1 << (OP_DESTROY_CLIENTID - 32)
6165 },
6166 .allow.u.words = {
6167 [0] = 1 << (OP_CLOSE) |
6168 1 << (OP_LOCKU) |
6169 1 << (OP_COMMIT),
6170 [1] = 1 << (OP_SECINFO - 32) |
6171 1 << (OP_SECINFO_NO_NAME - 32) |
6172 1 << (OP_TEST_STATEID - 32) |
6173 1 << (OP_FREE_STATEID - 32) |
6174 1 << (OP_WRITE - 32)
6175 }
6176};
6177
6178/*
6179 * Select the state protection mode for client `clp' given the server results
6180 * from exchange_id in `sp'.
5913 * 6181 *
5914 * Returns zero, a negative errno, or a negative NFS4ERR status code. 6182 * Returns 0 on success, negative errno otherwise.
6183 */
6184static int nfs4_sp4_select_mode(struct nfs_client *clp,
6185 struct nfs41_state_protection *sp)
6186{
6187 static const u32 supported_enforce[NFS4_OP_MAP_NUM_WORDS] = {
6188 [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
6189 1 << (OP_EXCHANGE_ID - 32) |
6190 1 << (OP_CREATE_SESSION - 32) |
6191 1 << (OP_DESTROY_SESSION - 32) |
6192 1 << (OP_DESTROY_CLIENTID - 32)
6193 };
6194 unsigned int i;
6195
6196 if (sp->how == SP4_MACH_CRED) {
6197 /* Print state protect result */
6198 dfprintk(MOUNT, "Server SP4_MACH_CRED support:\n");
6199 for (i = 0; i <= LAST_NFS4_OP; i++) {
6200 if (test_bit(i, sp->enforce.u.longs))
6201 dfprintk(MOUNT, " enforce op %d\n", i);
6202 if (test_bit(i, sp->allow.u.longs))
6203 dfprintk(MOUNT, " allow op %d\n", i);
6204 }
6205
6206 /* make sure nothing is on enforce list that isn't supported */
6207 for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) {
6208 if (sp->enforce.u.words[i] & ~supported_enforce[i]) {
6209 dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
6210 return -EINVAL;
6211 }
6212 }
6213
6214 /*
6215 * Minimal mode - state operations are allowed to use machine
6216 * credential. Note this already happens by default, so the
6217 * client doesn't have to do anything more than the negotiation.
6218 *
6219 * NOTE: we don't care if EXCHANGE_ID is in the list -
6220 * we're already using the machine cred for exchange_id
6221 * and will never use a different cred.
6222 */
6223 if (test_bit(OP_BIND_CONN_TO_SESSION, sp->enforce.u.longs) &&
6224 test_bit(OP_CREATE_SESSION, sp->enforce.u.longs) &&
6225 test_bit(OP_DESTROY_SESSION, sp->enforce.u.longs) &&
6226 test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) {
6227 dfprintk(MOUNT, "sp4_mach_cred:\n");
6228 dfprintk(MOUNT, " minimal mode enabled\n");
6229 set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags);
6230 } else {
6231 dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
6232 return -EINVAL;
6233 }
6234
6235 if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
6236 test_bit(OP_LOCKU, sp->allow.u.longs)) {
6237 dfprintk(MOUNT, " cleanup mode enabled\n");
6238 set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
6239 }
6240
6241 if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
6242 test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
6243 dfprintk(MOUNT, " secinfo mode enabled\n");
6244 set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags);
6245 }
6246
6247 if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) &&
6248 test_bit(OP_FREE_STATEID, sp->allow.u.longs)) {
6249 dfprintk(MOUNT, " stateid mode enabled\n");
6250 set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags);
6251 }
6252
6253 if (test_bit(OP_WRITE, sp->allow.u.longs)) {
6254 dfprintk(MOUNT, " write mode enabled\n");
6255 set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags);
6256 }
6257
6258 if (test_bit(OP_COMMIT, sp->allow.u.longs)) {
6259 dfprintk(MOUNT, " commit mode enabled\n");
6260 set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags);
6261 }
6262 }
6263
6264 return 0;
6265}
6266
6267/*
6268 * _nfs4_proc_exchange_id()
5915 * 6269 *
5916 * Since the clientid has expired, all compounds using sessions 6270 * Wrapper for EXCHANGE_ID operation.
5917 * associated with the stale clientid will be returning
5918 * NFS4ERR_BADSESSION in the sequence operation, and will therefore
5919 * be in some phase of session reset.
5920 */ 6271 */
5921int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) 6272static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
6273 u32 sp4_how)
5922{ 6274{
5923 nfs4_verifier verifier; 6275 nfs4_verifier verifier;
5924 struct nfs41_exchange_id_args args = { 6276 struct nfs41_exchange_id_args args = {
@@ -5965,10 +6317,30 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
5965 goto out_server_scope; 6317 goto out_server_scope;
5966 } 6318 }
5967 6319
6320 switch (sp4_how) {
6321 case SP4_NONE:
6322 args.state_protect.how = SP4_NONE;
6323 break;
6324
6325 case SP4_MACH_CRED:
6326 args.state_protect = nfs4_sp4_mach_cred_request;
6327 break;
6328
6329 default:
6330 /* unsupported! */
6331 WARN_ON_ONCE(1);
6332 status = -EINVAL;
6333 goto out_server_scope;
6334 }
6335
5968 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 6336 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
6337 trace_nfs4_exchange_id(clp, status);
5969 if (status == 0) 6338 if (status == 0)
5970 status = nfs4_check_cl_exchange_flags(res.flags); 6339 status = nfs4_check_cl_exchange_flags(res.flags);
5971 6340
6341 if (status == 0)
6342 status = nfs4_sp4_select_mode(clp, &res.state_protect);
6343
5972 if (status == 0) { 6344 if (status == 0) {
5973 clp->cl_clientid = res.clientid; 6345 clp->cl_clientid = res.clientid;
5974 clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R); 6346 clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R);
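
The spo_must_enforce/spo_must_allow sets above are RFC 5661 bitmap4 words: operation N occupies bit N%32 of 32-bit word N/32, which is why ops numbered 32 and up appear as 1 << (OP_FOO - 32) in words[1]. A self-contained check of that arithmetic (operation numbers taken from RFC 5661):

#include <stdint.h>
#include <stdio.h>

/* Selected NFSv4.1 operation numbers, per RFC 5661. */
enum { OP_CLOSE = 4, OP_COMMIT = 5, OP_EXCHANGE_ID = 42,
       OP_CREATE_SESSION = 43 };

#define NUM_WORDS 2

static void op_set(uint32_t words[NUM_WORDS], unsigned op)
{
        words[op / 32] |= 1u << (op % 32);
}

static int op_test(const uint32_t words[NUM_WORDS], unsigned op)
{
        return (words[op / 32] >> (op % 32)) & 1;
}

int main(void)
{
        uint32_t enforce[NUM_WORDS] = { 0 };

        op_set(enforce, OP_EXCHANGE_ID);        /* word 1, bit 10 */
        op_set(enforce, OP_CREATE_SESSION);     /* word 1, bit 11 */

        /* Equivalent to the patch's static initialiser for words[1]: */
        uint32_t word1 = 1u << (OP_EXCHANGE_ID - 32) |
                         1u << (OP_CREATE_SESSION - 32);

        printf("words[1] match: %s\n", enforce[1] == word1 ? "yes" : "no");
        printf("EXCHANGE_ID enforced: %d\n", op_test(enforce, OP_EXCHANGE_ID));
        printf("CLOSE enforced:       %d\n", op_test(enforce, OP_CLOSE));
        return 0;
}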
@@ -6015,6 +6387,35 @@ out:
6015 return status; 6387 return status;
6016} 6388}
6017 6389
6390/*
6391 * nfs4_proc_exchange_id()
6392 *
6393 * Returns zero, a negative errno, or a negative NFS4ERR status code.
6394 *
6395 * Since the clientid has expired, all compounds using sessions
6396 * associated with the stale clientid will be returning
6397 * NFS4ERR_BADSESSION in the sequence operation, and will therefore
6398 * be in some phase of session reset.
6399 *
6400 * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used.
6401 */
6402int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
6403{
6404 rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor;
6405 int status;
6406
6407 /* try SP4_MACH_CRED if krb5i/p */
6408 if (authflavor == RPC_AUTH_GSS_KRB5I ||
6409 authflavor == RPC_AUTH_GSS_KRB5P) {
6410 status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
6411 if (!status)
6412 return 0;
6413 }
6414
6415 /* try SP4_NONE */
6416 return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
6417}
6418
6018static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, 6419static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
6019 struct rpc_cred *cred) 6420 struct rpc_cred *cred)
6020{ 6421{
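
The new nfs4_proc_exchange_id() wrapper only offers SP4_MACH_CRED when the client is already on an integrity- or privacy-protecting flavour, and quietly retries with SP4_NONE if that negotiation fails, so servers without SP4 support keep mounting. A minimal model of the fallback (status values are illustrative):

#include <errno.h>
#include <stdio.h>

enum flavor  { AUTH_SYS, KRB5, KRB5I, KRB5P };
enum sp4_how { SP4_NONE, SP4_MACH_CRED };

/* Stub EXCHANGE_ID: this pretend server rejects SP4_MACH_CRED. */
static int exchange_id(enum sp4_how how)
{
        return how == SP4_MACH_CRED ? -EINVAL : 0;
}

static int negotiate(enum flavor f)
{
        if (f == KRB5I || f == KRB5P) {
                if (exchange_id(SP4_MACH_CRED) == 0)
                        return 0;       /* machine-cred protection enabled */
        }
        return exchange_id(SP4_NONE);   /* fall back: no state protection */
}

int main(void)
{
        printf("krb5i:    %d\n", negotiate(KRB5I));
        printf("auth_sys: %d\n", negotiate(AUTH_SYS));
        return 0;
}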
@@ -6026,6 +6427,7 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
6026 int status; 6427 int status;
6027 6428
6028 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 6429 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
6430 trace_nfs4_destroy_clientid(clp, status);
6029 if (status) 6431 if (status)
6030 dprintk("NFS: Got error %d from the server %s on " 6432 dprintk("NFS: Got error %d from the server %s on "
6031 "DESTROY_CLIENTID.", status, clp->cl_hostname); 6433 "DESTROY_CLIENTID.", status, clp->cl_hostname);
@@ -6063,7 +6465,7 @@ int nfs4_destroy_clientid(struct nfs_client *clp)
6063 goto out; 6465 goto out;
6064 if (clp->cl_preserve_clid) 6466 if (clp->cl_preserve_clid)
6065 goto out; 6467 goto out;
6066 cred = nfs4_get_exchange_id_cred(clp); 6468 cred = nfs4_get_clid_cred(clp);
6067 ret = nfs4_proc_destroy_clientid(clp, cred); 6469 ret = nfs4_proc_destroy_clientid(clp, cred);
6068 if (cred) 6470 if (cred)
6069 put_rpccred(cred); 6471 put_rpccred(cred);
@@ -6155,7 +6557,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
6155 }; 6557 };
6156 int status; 6558 int status;
6157 6559
6158 nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); 6560 nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
6159 nfs4_set_sequence_privileged(&args.la_seq_args); 6561 nfs4_set_sequence_privileged(&args.la_seq_args);
6160 dprintk("--> %s\n", __func__); 6562 dprintk("--> %s\n", __func__);
6161 task = rpc_run_task(&task_setup); 6563 task = rpc_run_task(&task_setup);
@@ -6289,6 +6691,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
6289 args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); 6691 args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
6290 6692
6291 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 6693 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
6694 trace_nfs4_create_session(clp, status);
6292 6695
6293 if (!status) { 6696 if (!status) {
6294 /* Verify the session's negotiated channel_attrs values */ 6697 /* Verify the session's negotiated channel_attrs values */
@@ -6352,6 +6755,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
6352 return status; 6755 return status;
6353 6756
6354 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 6757 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
6758 trace_nfs4_destroy_session(session->clp, status);
6355 6759
6356 if (status) 6760 if (status)
6357 dprintk("NFS: Got error %d from the server on DESTROY_SESSION. " 6761 dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "
@@ -6401,6 +6805,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
6401 if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) 6805 if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
6402 return; 6806 return;
6403 6807
6808 trace_nfs4_sequence(clp, task->tk_status);
6404 if (task->tk_status < 0) { 6809 if (task->tk_status < 0) {
6405 dprintk("%s ERROR %d\n", __func__, task->tk_status); 6810 dprintk("%s ERROR %d\n", __func__, task->tk_status);
6406 if (atomic_read(&clp->cl_count) == 1) 6811 if (atomic_read(&clp->cl_count) == 1)
@@ -6458,7 +6863,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
6458 nfs_put_client(clp); 6863 nfs_put_client(clp);
6459 return ERR_PTR(-ENOMEM); 6864 return ERR_PTR(-ENOMEM);
6460 } 6865 }
6461 nfs41_init_sequence(&calldata->args, &calldata->res, 0); 6866 nfs4_init_sequence(&calldata->args, &calldata->res, 0);
6462 if (is_privileged) 6867 if (is_privileged)
6463 nfs4_set_sequence_privileged(&calldata->args); 6868 nfs4_set_sequence_privileged(&calldata->args);
6464 msg.rpc_argp = &calldata->args; 6869 msg.rpc_argp = &calldata->args;
@@ -6553,6 +6958,7 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
6553 if (!nfs41_sequence_done(task, res)) 6958 if (!nfs41_sequence_done(task, res))
6554 return; 6959 return;
6555 6960
6961 trace_nfs4_reclaim_complete(clp, task->tk_status);
6556 if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { 6962 if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
6557 rpc_restart_call_prepare(task); 6963 rpc_restart_call_prepare(task);
6558 return; 6964 return;
@@ -6600,7 +7006,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
6600 calldata->clp = clp; 7006 calldata->clp = clp;
6601 calldata->arg.one_fs = 0; 7007 calldata->arg.one_fs = 0;
6602 7008
6603 nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); 7009 nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
6604 nfs4_set_sequence_privileged(&calldata->arg.seq_args); 7010 nfs4_set_sequence_privileged(&calldata->arg.seq_args);
6605 msg.rpc_argp = &calldata->arg; 7011 msg.rpc_argp = &calldata->arg;
6606 msg.rpc_resp = &calldata->res; 7012 msg.rpc_resp = &calldata->res;
@@ -6791,7 +7197,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6791 7197
6792 lgp->res.layoutp = &lgp->args.layout; 7198 lgp->res.layoutp = &lgp->args.layout;
6793 lgp->res.seq_res.sr_slot = NULL; 7199 lgp->res.seq_res.sr_slot = NULL;
6794 nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); 7200 nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
6795 7201
6796 /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ 7202 /* nfs4_layoutget_release calls pnfs_put_layout_hdr */
6797 pnfs_get_layout_hdr(NFS_I(inode)->layout); 7203 pnfs_get_layout_hdr(NFS_I(inode)->layout);
@@ -6802,6 +7208,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6802 status = nfs4_wait_for_completion_rpc_task(task); 7208 status = nfs4_wait_for_completion_rpc_task(task);
6803 if (status == 0) 7209 if (status == 0)
6804 status = task->tk_status; 7210 status = task->tk_status;
7211 trace_nfs4_layoutget(lgp->args.ctx,
7212 &lgp->args.range,
7213 &lgp->res.range,
7214 status);
6805 /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ 7215 /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
6806 if (status == 0 && lgp->res.layoutp->len) 7216 if (status == 0 && lgp->res.layoutp->len)
6807 lseg = pnfs_layout_process(lgp); 7217 lseg = pnfs_layout_process(lgp);
@@ -6874,7 +7284,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
6874 .rpc_cred = lrp->cred, 7284 .rpc_cred = lrp->cred,
6875 }; 7285 };
6876 struct rpc_task_setup task_setup_data = { 7286 struct rpc_task_setup task_setup_data = {
6877 .rpc_client = lrp->clp->cl_rpcclient, 7287 .rpc_client = NFS_SERVER(lrp->args.inode)->client,
6878 .rpc_message = &msg, 7288 .rpc_message = &msg,
6879 .callback_ops = &nfs4_layoutreturn_call_ops, 7289 .callback_ops = &nfs4_layoutreturn_call_ops,
6880 .callback_data = lrp, 7290 .callback_data = lrp,
@@ -6882,11 +7292,12 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
6882 int status; 7292 int status;
6883 7293
6884 dprintk("--> %s\n", __func__); 7294 dprintk("--> %s\n", __func__);
6885 nfs41_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); 7295 nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
6886 task = rpc_run_task(&task_setup_data); 7296 task = rpc_run_task(&task_setup_data);
6887 if (IS_ERR(task)) 7297 if (IS_ERR(task))
6888 return PTR_ERR(task); 7298 return PTR_ERR(task);
6889 status = task->tk_status; 7299 status = task->tk_status;
7300 trace_nfs4_layoutreturn(lrp->args.inode, status);
6890 dprintk("<-- %s status=%d\n", __func__, status); 7301 dprintk("<-- %s status=%d\n", __func__, status);
6891 rpc_put_task(task); 7302 rpc_put_task(task);
6892 return status; 7303 return status;
@@ -7063,7 +7474,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7063 data->args.lastbytewritten, 7474 data->args.lastbytewritten,
7064 data->args.inode->i_ino); 7475 data->args.inode->i_ino);
7065 7476
7066 nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 7477 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
7067 task = rpc_run_task(&task_setup_data); 7478 task = rpc_run_task(&task_setup_data);
7068 if (IS_ERR(task)) 7479 if (IS_ERR(task))
7069 return PTR_ERR(task); 7480 return PTR_ERR(task);
@@ -7073,15 +7484,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7073 if (status != 0) 7484 if (status != 0)
7074 goto out; 7485 goto out;
7075 status = task->tk_status; 7486 status = task->tk_status;
7487 trace_nfs4_layoutcommit(data->args.inode, status);
7076out: 7488out:
7077 dprintk("%s: status %d\n", __func__, status); 7489 dprintk("%s: status %d\n", __func__, status);
7078 rpc_put_task(task); 7490 rpc_put_task(task);
7079 return status; 7491 return status;
7080} 7492}
7081 7493
7494/**
7495 * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
7496 * possible) as per RFC3530bis and RFC5661 Security Considerations sections
7497 */
7082static int 7498static int
7083_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, 7499_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
7084 struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) 7500 struct nfs_fsinfo *info,
7501 struct nfs4_secinfo_flavors *flavors, bool use_integrity)
7085{ 7502{
7086 struct nfs41_secinfo_no_name_args args = { 7503 struct nfs41_secinfo_no_name_args args = {
7087 .style = SECINFO_STYLE_CURRENT_FH, 7504 .style = SECINFO_STYLE_CURRENT_FH,
@@ -7094,7 +7511,25 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
7094 .rpc_argp = &args, 7511 .rpc_argp = &args,
7095 .rpc_resp = &res, 7512 .rpc_resp = &res,
7096 }; 7513 };
7097 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); 7514 struct rpc_clnt *clnt = server->client;
7515 struct rpc_cred *cred = NULL;
7516 int status;
7517
7518 if (use_integrity) {
7519 clnt = server->nfs_client->cl_rpcclient;
7520 cred = nfs4_get_clid_cred(server->nfs_client);
7521 msg.rpc_cred = cred;
7522 }
7523
7524 dprintk("--> %s\n", __func__);
7525 status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
7526 &res.seq_res, 0);
7527 dprintk("<-- %s status=%d\n", __func__, status);
7528
7529 if (cred)
7530 put_rpccred(cred);
7531
7532 return status;
7098} 7533}
7099 7534
7100static int 7535static int
@@ -7104,7 +7539,24 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
7104 struct nfs4_exception exception = { }; 7539 struct nfs4_exception exception = { };
7105 int err; 7540 int err;
7106 do { 7541 do {
7107 err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); 7542 /* first try using integrity protection */
7543 err = -NFS4ERR_WRONGSEC;
7544
7545 /* try to use integrity protection with machine cred */
7546 if (_nfs4_is_integrity_protected(server->nfs_client))
7547 err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
7548 flavors, true);
7549
7550 /*
7551 * if unable to use integrity protection, or SECINFO with
7552 * integrity protection returns NFS4ERR_WRONGSEC (which is
7553 * disallowed by spec, but exists in deployed servers) use
7554 * the current filesystem's rpc_client and the user cred.
7555 */
7556 if (err == -NFS4ERR_WRONGSEC)
7557 err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
7558 flavors, false);
7559
7108 switch (err) { 7560 switch (err) {
7109 case 0: 7561 case 0:
7110 case -NFS4ERR_WRONGSEC: 7562 case -NFS4ERR_WRONGSEC:
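
The hunk above implements a two-step SECINFO_NO_NAME negotiation: first attempt the call with integrity protection over the lease-management transport and machine cred, then retry with the per-mount rpc_client and user cred if the server answers NFS4ERR_WRONGSEC. A minimal stand-alone sketch of that fallback shape follows; do_secinfo() and EWRONGSEC are illustrative stand-ins, not kernel APIs.

#include <stdio.h>

#define EWRONGSEC 10016	/* stand-in for NFS4ERR_WRONGSEC */

static int do_secinfo(int use_integrity)
{
	/* pretend the server rejects the integrity-protected attempt */
	return use_integrity ? -EWRONGSEC : 0;
}

int main(void)
{
	int err = -EWRONGSEC;

	/* first try: machine cred over the lease-management transport */
	err = do_secinfo(1);

	/* WRONGSEC here is disallowed by the spec but seen in deployed
	 * servers, so fall back to the user cred */
	if (err == -EWRONGSEC)
		err = do_secinfo(0);

	printf("secinfo status: %d\n", err);
	return 0;
}
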
@@ -7124,8 +7576,10 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
7124{ 7576{
7125 int err; 7577 int err;
7126 struct page *page; 7578 struct page *page;
7127 rpc_authflavor_t flavor; 7579 rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
7128 struct nfs4_secinfo_flavors *flavors; 7580 struct nfs4_secinfo_flavors *flavors;
7581 struct nfs4_secinfo4 *secinfo;
7582 int i;
7129 7583
7130 page = alloc_page(GFP_KERNEL); 7584 page = alloc_page(GFP_KERNEL);
7131 if (!page) { 7585 if (!page) {
@@ -7147,9 +7601,31 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
7147 if (err) 7601 if (err)
7148 goto out_freepage; 7602 goto out_freepage;
7149 7603
7150 flavor = nfs_find_best_sec(flavors); 7604 for (i = 0; i < flavors->num_flavors; i++) {
7151 if (err == 0) 7605 secinfo = &flavors->flavors[i];
7152 err = nfs4_lookup_root_sec(server, fhandle, info, flavor); 7606
7607 switch (secinfo->flavor) {
7608 case RPC_AUTH_NULL:
7609 case RPC_AUTH_UNIX:
7610 case RPC_AUTH_GSS:
7611 flavor = rpcauth_get_pseudoflavor(secinfo->flavor,
7612 &secinfo->flavor_info);
7613 break;
7614 default:
7615 flavor = RPC_AUTH_MAXFLAVOR;
7616 break;
7617 }
7618
7619 if (flavor != RPC_AUTH_MAXFLAVOR) {
7620 err = nfs4_lookup_root_sec(server, fhandle,
7621 info, flavor);
7622 if (!err)
7623 break;
7624 }
7625 }
7626
7627 if (flavor == RPC_AUTH_MAXFLAVOR)
7628 err = -EPERM;
7153 7629
7154out_freepage: 7630out_freepage:
7155 put_page(page); 7631 put_page(page);
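
The loop above replaces the old single nfs_find_best_sec() guess with an ordered walk of the server's SECINFO list: each mappable flavor (AUTH_NULL, AUTH_UNIX, AUTH_GSS) is tried against the root filehandle until one succeeds, and -EPERM is returned only when nothing was usable. A hedged userspace sketch of the same walk, with try_root_lookup() as a hypothetical stand-in:

#include <stdio.h>

#define MAXFLAVOR (-1)

/* pretend only flavor 6 (AUTH_GSS) is acceptable to the server */
static int try_root_lookup(int flavor)
{
	return flavor == 6 ? 0 : -13;	/* -EACCES otherwise */
}

int main(void)
{
	int offered[] = { 0, 1, 6 };	/* AUTH_NULL, AUTH_UNIX, AUTH_GSS */
	int flavor = MAXFLAVOR, err = -1;
	unsigned int i;

	for (i = 0; i < sizeof(offered) / sizeof(offered[0]); i++) {
		flavor = offered[i];		/* a mappable flavor */
		err = try_root_lookup(flavor);
		if (!err)
			break;			/* first one that works wins */
	}
	if (flavor == MAXFLAVOR)
		err = -1;			/* -EPERM upstream */
	printf("flavor=%d err=%d\n", flavor, err);
	return 0;
}
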
@@ -7174,11 +7650,15 @@ static int _nfs41_test_stateid(struct nfs_server *server,
7174 .rpc_resp = &res, 7650 .rpc_resp = &res,
7175 .rpc_cred = cred, 7651 .rpc_cred = cred,
7176 }; 7652 };
7653 struct rpc_clnt *rpc_client = server->client;
7654
7655 nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
7656 &rpc_client, &msg);
7177 7657
7178 dprintk("NFS call test_stateid %p\n", stateid); 7658 dprintk("NFS call test_stateid %p\n", stateid);
7179 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); 7659 nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
7180 nfs4_set_sequence_privileged(&args.seq_args); 7660 nfs4_set_sequence_privileged(&args.seq_args);
7181 status = nfs4_call_sync_sequence(server->client, server, &msg, 7661 status = nfs4_call_sync_sequence(rpc_client, server, &msg,
7182 &args.seq_args, &res.seq_res); 7662 &args.seq_args, &res.seq_res);
7183 if (status != NFS_OK) { 7663 if (status != NFS_OK) {
7184 dprintk("NFS reply test_stateid: failed, %d\n", status); 7664 dprintk("NFS reply test_stateid: failed, %d\n", status);
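
The change above routes TEST_STATEID through nfs4_state_protect(), which swaps in the machine-credential rpc_clnt when the server granted SP4_MACH_CRED protection for stateid operations during EXCHANGE_ID. A small compilable sketch of that conditional-swap shape (all names illustrative):

#include <stdio.h>

struct clnt { const char *name; };

static struct clnt mount_clnt = { "per-mount client" };
static struct clnt lease_clnt = { "lease-management client" };

static void swap_if_protected(int sp4_mach_cred, struct clnt **clntp)
{
	if (sp4_mach_cred)
		*clntp = &lease_clnt;	/* redirect to the machine-cred transport */
}

int main(void)
{
	struct clnt *clnt = &mount_clnt;

	swap_if_protected(1, &clnt);
	printf("TEST_STATEID goes over: %s\n", clnt->name);
	return 0;
}
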
@@ -7247,7 +7727,7 @@ static void nfs41_free_stateid_release(void *calldata)
7247 kfree(calldata); 7727 kfree(calldata);
7248} 7728}
7249 7729
7250const struct rpc_call_ops nfs41_free_stateid_ops = { 7730static const struct rpc_call_ops nfs41_free_stateid_ops = {
7251 .rpc_call_prepare = nfs41_free_stateid_prepare, 7731 .rpc_call_prepare = nfs41_free_stateid_prepare,
7252 .rpc_call_done = nfs41_free_stateid_done, 7732 .rpc_call_done = nfs41_free_stateid_done,
7253 .rpc_release = nfs41_free_stateid_release, 7733 .rpc_release = nfs41_free_stateid_release,
@@ -7270,6 +7750,9 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
7270 }; 7750 };
7271 struct nfs_free_stateid_data *data; 7751 struct nfs_free_stateid_data *data;
7272 7752
7753 nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
7754 &task_setup.rpc_client, &msg);
7755
7273 dprintk("NFS call free_stateid %p\n", stateid); 7756 dprintk("NFS call free_stateid %p\n", stateid);
7274 data = kmalloc(sizeof(*data), GFP_NOFS); 7757 data = kmalloc(sizeof(*data), GFP_NOFS);
7275 if (!data) 7758 if (!data)
@@ -7281,7 +7764,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
7281 7764
7282 msg.rpc_argp = &data->args; 7765 msg.rpc_argp = &data->args;
7283 msg.rpc_resp = &data->res; 7766 msg.rpc_resp = &data->res;
7284 nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); 7767 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
7285 if (privileged) 7768 if (privileged)
7286 nfs4_set_sequence_privileged(&data->args.seq_args); 7769 nfs4_set_sequence_privileged(&data->args.seq_args);
7287 7770
@@ -7357,7 +7840,6 @@ static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
7357 .recover_open = nfs4_open_reclaim, 7840 .recover_open = nfs4_open_reclaim,
7358 .recover_lock = nfs4_lock_reclaim, 7841 .recover_lock = nfs4_lock_reclaim,
7359 .establish_clid = nfs4_init_clientid, 7842 .establish_clid = nfs4_init_clientid,
7360 .get_clid_cred = nfs4_get_setclientid_cred,
7361 .detect_trunking = nfs40_discover_server_trunking, 7843 .detect_trunking = nfs40_discover_server_trunking,
7362}; 7844};
7363 7845
@@ -7368,7 +7850,6 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
7368 .recover_open = nfs4_open_reclaim, 7850 .recover_open = nfs4_open_reclaim,
7369 .recover_lock = nfs4_lock_reclaim, 7851 .recover_lock = nfs4_lock_reclaim,
7370 .establish_clid = nfs41_init_clientid, 7852 .establish_clid = nfs41_init_clientid,
7371 .get_clid_cred = nfs4_get_exchange_id_cred,
7372 .reclaim_complete = nfs41_proc_reclaim_complete, 7853 .reclaim_complete = nfs41_proc_reclaim_complete,
7373 .detect_trunking = nfs41_discover_server_trunking, 7854 .detect_trunking = nfs41_discover_server_trunking,
7374}; 7855};
@@ -7380,7 +7861,6 @@ static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
7380 .recover_open = nfs4_open_expired, 7861 .recover_open = nfs4_open_expired,
7381 .recover_lock = nfs4_lock_expired, 7862 .recover_lock = nfs4_lock_expired,
7382 .establish_clid = nfs4_init_clientid, 7863 .establish_clid = nfs4_init_clientid,
7383 .get_clid_cred = nfs4_get_setclientid_cred,
7384}; 7864};
7385 7865
7386#if defined(CONFIG_NFS_V4_1) 7866#if defined(CONFIG_NFS_V4_1)
@@ -7390,7 +7870,6 @@ static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
7390 .recover_open = nfs41_open_expired, 7870 .recover_open = nfs41_open_expired,
7391 .recover_lock = nfs41_lock_expired, 7871 .recover_lock = nfs41_lock_expired,
7392 .establish_clid = nfs41_init_clientid, 7872 .establish_clid = nfs41_init_clientid,
7393 .get_clid_cred = nfs4_get_exchange_id_cred,
7394}; 7873};
7395#endif /* CONFIG_NFS_V4_1 */ 7874#endif /* CONFIG_NFS_V4_1 */
7396 7875
@@ -7414,10 +7893,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
7414 | NFS_CAP_ATOMIC_OPEN 7893 | NFS_CAP_ATOMIC_OPEN
7415 | NFS_CAP_CHANGE_ATTR 7894 | NFS_CAP_CHANGE_ATTR
7416 | NFS_CAP_POSIX_LOCK, 7895 | NFS_CAP_POSIX_LOCK,
7417 .call_sync = _nfs4_call_sync, 7896 .init_client = nfs40_init_client,
7897 .shutdown_client = nfs40_shutdown_client,
7418 .match_stateid = nfs4_match_stateid, 7898 .match_stateid = nfs4_match_stateid,
7419 .find_root_sec = nfs4_find_root_sec, 7899 .find_root_sec = nfs4_find_root_sec,
7420 .free_lock_state = nfs4_release_lockowner, 7900 .free_lock_state = nfs4_release_lockowner,
7901 .call_sync_ops = &nfs40_call_sync_ops,
7421 .reboot_recovery_ops = &nfs40_reboot_recovery_ops, 7902 .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
7422 .nograce_recovery_ops = &nfs40_nograce_recovery_ops, 7903 .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
7423 .state_renewal_ops = &nfs40_state_renewal_ops, 7904 .state_renewal_ops = &nfs40_state_renewal_ops,
@@ -7432,10 +7913,12 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
7432 | NFS_CAP_POSIX_LOCK 7913 | NFS_CAP_POSIX_LOCK
7433 | NFS_CAP_STATEID_NFSV41 7914 | NFS_CAP_STATEID_NFSV41
7434 | NFS_CAP_ATOMIC_OPEN_V1, 7915 | NFS_CAP_ATOMIC_OPEN_V1,
7435 .call_sync = nfs4_call_sync_sequence, 7916 .init_client = nfs41_init_client,
7917 .shutdown_client = nfs41_shutdown_client,
7436 .match_stateid = nfs41_match_stateid, 7918 .match_stateid = nfs41_match_stateid,
7437 .find_root_sec = nfs41_find_root_sec, 7919 .find_root_sec = nfs41_find_root_sec,
7438 .free_lock_state = nfs41_free_lock_state, 7920 .free_lock_state = nfs41_free_lock_state,
7921 .call_sync_ops = &nfs41_call_sync_ops,
7439 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 7922 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
7440 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 7923 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
7441 .state_renewal_ops = &nfs41_state_renewal_ops, 7924 .state_renewal_ops = &nfs41_state_renewal_ops,
@@ -7451,10 +7934,12 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
7451 | NFS_CAP_POSIX_LOCK 7934 | NFS_CAP_POSIX_LOCK
7452 | NFS_CAP_STATEID_NFSV41 7935 | NFS_CAP_STATEID_NFSV41
7453 | NFS_CAP_ATOMIC_OPEN_V1, 7936 | NFS_CAP_ATOMIC_OPEN_V1,
7454 .call_sync = nfs4_call_sync_sequence, 7937 .init_client = nfs41_init_client,
7938 .shutdown_client = nfs41_shutdown_client,
7455 .match_stateid = nfs41_match_stateid, 7939 .match_stateid = nfs41_match_stateid,
7456 .find_root_sec = nfs41_find_root_sec, 7940 .find_root_sec = nfs41_find_root_sec,
7457 .free_lock_state = nfs41_free_lock_state, 7941 .free_lock_state = nfs41_free_lock_state,
7942 .call_sync_ops = &nfs41_call_sync_ops,
7458 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 7943 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
7459 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 7944 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
7460 .state_renewal_ops = &nfs41_state_renewal_ops, 7945 .state_renewal_ops = &nfs41_state_renewal_ops,
@@ -7471,7 +7956,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
7471#endif 7956#endif
7472}; 7957};
7473 7958
7474const struct inode_operations nfs4_dir_inode_operations = { 7959static const struct inode_operations nfs4_dir_inode_operations = {
7475 .create = nfs_create, 7960 .create = nfs_create,
7476 .lookup = nfs_lookup, 7961 .lookup = nfs_lookup,
7477 .atomic_open = nfs_atomic_open, 7962 .atomic_open = nfs_atomic_open,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 36e21cb29d65..cf883c7ae053 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -23,6 +23,14 @@
23 23
24#define NFSDBG_FACILITY NFSDBG_STATE 24#define NFSDBG_FACILITY NFSDBG_STATE
25 25
26static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue)
27{
28 tbl->highest_used_slotid = NFS4_NO_SLOT;
29 spin_lock_init(&tbl->slot_tbl_lock);
30 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue);
31 init_completion(&tbl->complete);
32}
33
26/* 34/*
27 * nfs4_shrink_slot_table - free retired slots from the slot table 35 * nfs4_shrink_slot_table - free retired slots from the slot table
28 */ 36 */
@@ -44,6 +52,17 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
44 } 52 }
45} 53}
46 54
55/**
56 * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete
 57 * @tbl: controlling slot table
58 *
59 */
60void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
61{
62 if (nfs4_slot_tbl_draining(tbl))
63 complete(&tbl->complete);
64}
65
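
nfs4_slot_tbl_drain_complete() signals the table's struct completion once the last slot is freed while NFS4_SLOT_TBL_DRAINING is set. A rough userspace analogue of that drain handshake, using a pthread condition variable in place of the completion (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int draining = 1, slots_in_use = 1;

static void free_slot(void)
{
	pthread_mutex_lock(&lock);
	if (--slots_in_use == 0 && draining)
		pthread_cond_signal(&drained);	/* drain is complete */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	free_slot();
	pthread_mutex_lock(&lock);
	while (slots_in_use)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
	printf("slot table drained\n");
	return 0;
}
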
47/* 66/*
48 * nfs4_free_slot - free a slot and efficiently update slot table. 67 * nfs4_free_slot - free a slot and efficiently update slot table.
49 * 68 *
@@ -76,7 +95,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
76 nfs4_slot_tbl_drain_complete(tbl); 95 nfs4_slot_tbl_drain_complete(tbl);
77 } 96 }
78 } 97 }
79 dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, 98 dprintk("%s: slotid %u highest_used_slotid %u\n", __func__,
80 slotid, tbl->highest_used_slotid); 99 slotid, tbl->highest_used_slotid);
81} 100}
82 101
@@ -146,9 +165,9 @@ struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
146 ret->generation = tbl->generation; 165 ret->generation = tbl->generation;
147 166
148out: 167out:
149 dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", 168 dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n",
150 __func__, tbl->used_slots[0], tbl->highest_used_slotid, 169 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
151 !IS_ERR(ret) ? ret->slot_nr : -1); 170 !IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT);
152 return ret; 171 return ret;
153} 172}
154 173
@@ -191,7 +210,7 @@ static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
191{ 210{
192 int ret; 211 int ret;
193 212
194 dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, 213 dprintk("--> %s: max_reqs=%u, tbl->max_slots %u\n", __func__,
195 max_reqs, tbl->max_slots); 214 max_reqs, tbl->max_slots);
196 215
197 if (max_reqs > NFS4_MAX_SLOT_TABLE) 216 if (max_reqs > NFS4_MAX_SLOT_TABLE)
@@ -205,18 +224,36 @@ static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
205 nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); 224 nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue);
206 spin_unlock(&tbl->slot_tbl_lock); 225 spin_unlock(&tbl->slot_tbl_lock);
207 226
208 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, 227 dprintk("%s: tbl=%p slots=%p max_slots=%u\n", __func__,
209 tbl, tbl->slots, tbl->max_slots); 228 tbl, tbl->slots, tbl->max_slots);
210out: 229out:
211 dprintk("<-- %s: return %d\n", __func__, ret); 230 dprintk("<-- %s: return %d\n", __func__, ret);
212 return ret; 231 return ret;
213} 232}
214 233
215/* Destroy the slot table */ 234/**
216static void nfs4_destroy_slot_tables(struct nfs4_session *session) 235 * nfs4_release_slot_table - release resources attached to a slot table
236 * @tbl: slot table to shut down
237 *
238 */
239void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
240{
241 nfs4_shrink_slot_table(tbl, 0);
242}
243
244/**
245 * nfs4_setup_slot_table - prepare a stand-alone slot table for use
246 * @tbl: slot table to set up
247 * @max_reqs: maximum number of requests allowed
248 * @queue: name to give RPC wait queue
249 *
250 * Returns zero on success, or a negative errno.
251 */
252int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs,
253 const char *queue)
217{ 254{
218 nfs4_shrink_slot_table(&session->fc_slot_table, 0); 255 nfs4_init_slot_table(tbl, queue);
219 nfs4_shrink_slot_table(&session->bc_slot_table, 0); 256 return nfs4_realloc_slot_table(tbl, max_reqs, 0);
220} 257}
221 258
222static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) 259static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
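
These hunks factor the duplicated initialisation into nfs4_init_slot_table() and expose nfs4_setup_slot_table(), which is simply init followed by an initial resize, so stand-alone tables and session tables share one path. A compilable userspace sketch of that composition, with types and error codes deliberately simplified:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct slot_tbl {
	unsigned int max_slots;
	int *slots;
};

/* stands in for nfs4_init_slot_table(): the kernel version fills in
 * a spinlock, an RPC wait queue and a completion instead */
static void init_tbl(struct slot_tbl *tbl)
{
	memset(tbl, 0, sizeof(*tbl));
}

static int realloc_tbl(struct slot_tbl *tbl, unsigned int max)
{
	int *s = realloc(tbl->slots, max * sizeof(*s));

	if (!s)
		return -12;	/* -ENOMEM */
	tbl->slots = s;
	tbl->max_slots = max;
	return 0;
}

/* setup = init + size, the same composition as nfs4_setup_slot_table() */
static int setup_tbl(struct slot_tbl *tbl, unsigned int max)
{
	init_tbl(tbl);
	return realloc_tbl(tbl, max);
}

int main(void)
{
	struct slot_tbl tbl;

	printf("setup: %d, max_slots=%u\n", setup_tbl(&tbl, 64), tbl.max_slots);
	free(tbl.slots);
	return 0;
}
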
@@ -273,6 +310,8 @@ void nfs41_wake_slot_table(struct nfs4_slot_table *tbl)
273 } 310 }
274} 311}
275 312
313#if defined(CONFIG_NFS_V4_1)
314
276static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl, 315static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl,
277 u32 target_highest_slotid) 316 u32 target_highest_slotid)
278{ 317{
@@ -383,6 +422,12 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
383 spin_unlock(&tbl->slot_tbl_lock); 422 spin_unlock(&tbl->slot_tbl_lock);
384} 423}
385 424
425static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
426{
427 nfs4_release_slot_table(&session->fc_slot_table);
428 nfs4_release_slot_table(&session->bc_slot_table);
429}
430
386/* 431/*
387 * Initialize or reset the forechannel and backchannel tables 432 * Initialize or reset the forechannel and backchannel tables
388 */ 433 */
@@ -405,31 +450,20 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
405 if (status && tbl->slots == NULL) 450 if (status && tbl->slots == NULL)
406 /* Fore and back channel share a connection so get 451 /* Fore and back channel share a connection so get
407 * both slot tables or neither */ 452 * both slot tables or neither */
408 nfs4_destroy_slot_tables(ses); 453 nfs4_destroy_session_slot_tables(ses);
409 return status; 454 return status;
410} 455}
411 456
412struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) 457struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
413{ 458{
414 struct nfs4_session *session; 459 struct nfs4_session *session;
415 struct nfs4_slot_table *tbl;
416 460
417 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); 461 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
418 if (!session) 462 if (!session)
419 return NULL; 463 return NULL;
420 464
421 tbl = &session->fc_slot_table; 465 nfs4_init_slot_table(&session->fc_slot_table, "ForeChannel Slot table");
422 tbl->highest_used_slotid = NFS4_NO_SLOT; 466 nfs4_init_slot_table(&session->bc_slot_table, "BackChannel Slot table");
423 spin_lock_init(&tbl->slot_tbl_lock);
424 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
425 init_completion(&tbl->complete);
426
427 tbl = &session->bc_slot_table;
428 tbl->highest_used_slotid = NFS4_NO_SLOT;
429 spin_lock_init(&tbl->slot_tbl_lock);
430 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
431 init_completion(&tbl->complete);
432
433 session->session_state = 1<<NFS4_SESSION_INITING; 467 session->session_state = 1<<NFS4_SESSION_INITING;
434 468
435 session->clp = clp; 469 session->clp = clp;
@@ -441,7 +475,7 @@ void nfs4_destroy_session(struct nfs4_session *session)
441 struct rpc_xprt *xprt; 475 struct rpc_xprt *xprt;
442 struct rpc_cred *cred; 476 struct rpc_cred *cred;
443 477
444 cred = nfs4_get_exchange_id_cred(session->clp); 478 cred = nfs4_get_clid_cred(session->clp);
445 nfs4_proc_destroy_session(session, cred); 479 nfs4_proc_destroy_session(session, cred);
446 if (cred) 480 if (cred)
447 put_rpccred(cred); 481 put_rpccred(cred);
@@ -452,7 +486,7 @@ void nfs4_destroy_session(struct nfs4_session *session)
452 dprintk("%s Destroy backchannel for xprt %p\n", 486 dprintk("%s Destroy backchannel for xprt %p\n",
453 __func__, xprt); 487 __func__, xprt);
454 xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); 488 xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
455 nfs4_destroy_slot_tables(session); 489 nfs4_destroy_session_slot_tables(session);
456 kfree(session); 490 kfree(session);
457} 491}
458 492
@@ -513,4 +547,4 @@ int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
513} 547}
514EXPORT_SYMBOL_GPL(nfs4_init_ds_session); 548EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
515 549
516 550#endif /* defined(CONFIG_NFS_V4_1) */
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 3a153d82b90c..232306100651 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -8,7 +8,7 @@
8#define __LINUX_FS_NFS_NFS4SESSION_H 8#define __LINUX_FS_NFS_NFS4SESSION_H
9 9
10/* maximum number of slots to use */ 10/* maximum number of slots to use */
11#define NFS4_DEF_SLOT_TABLE_SIZE (16U) 11#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
12#define NFS4_MAX_SLOT_TABLE (1024U) 12#define NFS4_MAX_SLOT_TABLE (1024U)
13#define NFS4_NO_SLOT ((u32)-1) 13#define NFS4_NO_SLOT ((u32)-1)
14 14
@@ -72,10 +72,22 @@ enum nfs4_session_state {
72 NFS4_SESSION_INITING, 72 NFS4_SESSION_INITING,
73}; 73};
74 74
75#if defined(CONFIG_NFS_V4_1) 75extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
76 unsigned int max_reqs, const char *queue);
77extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl);
76extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); 78extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
77extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); 79extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
80extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
81bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
82 struct nfs4_slot *slot);
83void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
84
85static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
86{
87 return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
88}
78 89
90#if defined(CONFIG_NFS_V4_1)
79extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, 91extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
80 u32 target_highest_slotid); 92 u32 target_highest_slotid);
81extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, 93extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
@@ -89,17 +101,6 @@ extern void nfs4_destroy_session(struct nfs4_session *session);
89extern int nfs4_init_session(struct nfs_client *clp); 101extern int nfs4_init_session(struct nfs_client *clp);
90extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); 102extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
91 103
92extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
93
94static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
95{
96 return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
97}
98
99bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
100 struct nfs4_slot *slot);
101void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
102
103/* 104/*
104 * Determine if sessions are in use. 105 * Determine if sessions are in use.
105 */ 106 */
@@ -117,6 +118,16 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
117 return 0; 118 return 0;
118} 119}
119 120
121#ifdef CONFIG_CRC32
122/*
123 * nfs_session_id_hash - calculate the crc32 hash for the session id
 124 * @sess_id: pointer to the session id
125 */
126#define nfs_session_id_hash(sess_id) \
127 (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data)))
128#else
129#define nfs_session_id_hash(session) (0)
130#endif
120#else /* defined(CONFIG_NFS_V4_1) */ 131#else /* defined(CONFIG_NFS_V4_1) */
121 132
122static inline int nfs4_init_session(struct nfs_client *clp) 133static inline int nfs4_init_session(struct nfs_client *clp)
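
The nfs_session_id_hash() macro above reduces the 16-byte session id to a 32-bit value for trace output. A userspace analogue using zlib (build with -lz); note that zlib's crc32() folds in the 0xFFFFFFFF pre/post inversion itself, so seeding it with 0 should play the role of the kernel's ~crc32_le(0xFFFFFFFF, ...) expression:

#include <stdio.h>
#include <zlib.h>

int main(void)
{
	unsigned char sess_id[16] = "example-session";
	unsigned long hash;

	/* standard CRC-32 over the raw session id bytes */
	hash = crc32(0L, sess_id, sizeof(sess_id));
	printf("session=0x%08lx\n", hash);
	return 0;
}
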
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e22862f13564..cc14cbb78b73 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -154,6 +154,19 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
154 return cred; 154 return cred;
155} 155}
156 156
157static void nfs4_root_machine_cred(struct nfs_client *clp)
158{
159 struct rpc_cred *cred, *new;
160
161 new = rpc_lookup_machine_cred(NULL);
162 spin_lock(&clp->cl_lock);
163 cred = clp->cl_machine_cred;
164 clp->cl_machine_cred = new;
165 spin_unlock(&clp->cl_lock);
166 if (cred != NULL)
167 put_rpccred(cred);
168}
169
157static struct rpc_cred * 170static struct rpc_cred *
158nfs4_get_renew_cred_server_locked(struct nfs_server *server) 171nfs4_get_renew_cred_server_locked(struct nfs_server *server)
159{ 172{
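
nfs4_root_machine_cred() above follows the usual swap-then-release discipline: exchange the pointer under cl_lock, then drop the old reference only after the lock is released, since the final put must never run under a spinlock. A minimal pthread sketch of the same pattern (build with -lpthread):

#include <pthread.h>
#include <stdlib.h>

struct cred { int refcount; };

static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cred *cl_machine_cred;

static void put_cred(struct cred *c)
{
	if (c && --c->refcount == 0)
		free(c);
}

static void set_root_machine_cred(struct cred *new)
{
	struct cred *old;

	pthread_mutex_lock(&cl_lock);
	old = cl_machine_cred;
	cl_machine_cred = new;
	pthread_mutex_unlock(&cl_lock);
	put_cred(old);		/* never release while holding the lock */
}

int main(void)
{
	struct cred *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

	a->refcount = b->refcount = 1;
	set_root_machine_cred(a);
	set_root_machine_cred(b);	/* frees a outside the lock */
	return 0;
}
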
@@ -202,32 +215,6 @@ out:
202 return cred; 215 return cred;
203} 216}
204 217
205#if defined(CONFIG_NFS_V4_1)
206
207static int nfs41_setup_state_renewal(struct nfs_client *clp)
208{
209 int status;
210 struct nfs_fsinfo fsinfo;
211
212 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
213 nfs4_schedule_state_renewal(clp);
214 return 0;
215 }
216
217 status = nfs4_proc_get_lease_time(clp, &fsinfo);
218 if (status == 0) {
219 /* Update lease time and schedule renewal */
220 spin_lock(&clp->cl_lock);
221 clp->cl_lease_time = fsinfo.lease_time * HZ;
222 clp->cl_last_renewal = jiffies;
223 spin_unlock(&clp->cl_lock);
224
225 nfs4_schedule_state_renewal(clp);
226 }
227
228 return status;
229}
230
231static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl) 218static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
232{ 219{
233 if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { 220 if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
@@ -241,20 +228,18 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
241{ 228{
242 struct nfs4_session *ses = clp->cl_session; 229 struct nfs4_session *ses = clp->cl_session;
243 230
231 if (clp->cl_slot_tbl) {
232 nfs4_end_drain_slot_table(clp->cl_slot_tbl);
233 return;
234 }
235
244 if (ses != NULL) { 236 if (ses != NULL) {
245 nfs4_end_drain_slot_table(&ses->bc_slot_table); 237 nfs4_end_drain_slot_table(&ses->bc_slot_table);
246 nfs4_end_drain_slot_table(&ses->fc_slot_table); 238 nfs4_end_drain_slot_table(&ses->fc_slot_table);
247 } 239 }
248} 240}
249 241
250/* 242#if defined(CONFIG_NFS_V4_1)
251 * Signal state manager thread if session fore channel is drained
252 */
253void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
254{
255 if (nfs4_slot_tbl_draining(tbl))
256 complete(&tbl->complete);
257}
258 243
259static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) 244static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
260{ 245{
@@ -274,6 +259,9 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
274 struct nfs4_session *ses = clp->cl_session; 259 struct nfs4_session *ses = clp->cl_session;
275 int ret = 0; 260 int ret = 0;
276 261
262 if (clp->cl_slot_tbl)
263 return nfs4_drain_slot_tbl(clp->cl_slot_tbl);
264
277 /* back channel */ 265 /* back channel */
278 ret = nfs4_drain_slot_tbl(&ses->bc_slot_table); 266 ret = nfs4_drain_slot_tbl(&ses->bc_slot_table);
279 if (ret) 267 if (ret)
@@ -282,6 +270,30 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
282 return nfs4_drain_slot_tbl(&ses->fc_slot_table); 270 return nfs4_drain_slot_tbl(&ses->fc_slot_table);
283} 271}
284 272
273static int nfs41_setup_state_renewal(struct nfs_client *clp)
274{
275 int status;
276 struct nfs_fsinfo fsinfo;
277
278 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
279 nfs4_schedule_state_renewal(clp);
280 return 0;
281 }
282
283 status = nfs4_proc_get_lease_time(clp, &fsinfo);
284 if (status == 0) {
285 /* Update lease time and schedule renewal */
286 spin_lock(&clp->cl_lock);
287 clp->cl_lease_time = fsinfo.lease_time * HZ;
288 clp->cl_last_renewal = jiffies;
289 spin_unlock(&clp->cl_lock);
290
291 nfs4_schedule_state_renewal(clp);
292 }
293
294 return status;
295}
296
285static void nfs41_finish_session_reset(struct nfs_client *clp) 297static void nfs41_finish_session_reset(struct nfs_client *clp)
286{ 298{
287 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); 299 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
@@ -339,62 +351,21 @@ int nfs41_discover_server_trunking(struct nfs_client *clp,
339 return nfs41_walk_client_list(clp, result, cred); 351 return nfs41_walk_client_list(clp, result, cred);
340} 352}
341 353
342struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
343{
344 struct rpc_cred *cred;
345
346 spin_lock(&clp->cl_lock);
347 cred = nfs4_get_machine_cred_locked(clp);
348 spin_unlock(&clp->cl_lock);
349 return cred;
350}
351
352#endif /* CONFIG_NFS_V4_1 */ 354#endif /* CONFIG_NFS_V4_1 */
353 355
354static struct rpc_cred *
355nfs4_get_setclientid_cred_server(struct nfs_server *server)
356{
357 struct nfs_client *clp = server->nfs_client;
358 struct rpc_cred *cred = NULL;
359 struct nfs4_state_owner *sp;
360 struct rb_node *pos;
361
362 spin_lock(&clp->cl_lock);
363 pos = rb_first(&server->state_owners);
364 if (pos != NULL) {
365 sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
366 cred = get_rpccred(sp->so_cred);
367 }
368 spin_unlock(&clp->cl_lock);
369 return cred;
370}
371
372/** 356/**
 373 * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation 357 * nfs4_get_clid_cred - Acquire credential for a SETCLIENTID/EXCHANGE_ID operation
374 * @clp: client state handle 358 * @clp: client state handle
375 * 359 *
376 * Returns an rpc_cred with reference count bumped, or NULL. 360 * Returns an rpc_cred with reference count bumped, or NULL.
377 */ 361 */
378struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 362struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp)
379{ 363{
380 struct nfs_server *server;
381 struct rpc_cred *cred; 364 struct rpc_cred *cred;
382 365
383 spin_lock(&clp->cl_lock); 366 spin_lock(&clp->cl_lock);
384 cred = nfs4_get_machine_cred_locked(clp); 367 cred = nfs4_get_machine_cred_locked(clp);
385 spin_unlock(&clp->cl_lock); 368 spin_unlock(&clp->cl_lock);
386 if (cred != NULL)
387 goto out;
388
389 rcu_read_lock();
390 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
391 cred = nfs4_get_setclientid_cred_server(server);
392 if (cred != NULL)
393 break;
394 }
395 rcu_read_unlock();
396
397out:
398 return cred; 369 return cred;
399} 370}
400 371
@@ -998,7 +969,9 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
998 fl_pid = lockowner->l_pid; 969 fl_pid = lockowner->l_pid;
999 spin_lock(&state->state_lock); 970 spin_lock(&state->state_lock);
1000 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); 971 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
1001 if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { 972 if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
973 ret = -EIO;
974 else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
1002 nfs4_stateid_copy(dst, &lsp->ls_stateid); 975 nfs4_stateid_copy(dst, &lsp->ls_stateid);
1003 ret = 0; 976 ret = 0;
1004 smp_rmb(); 977 smp_rmb();
@@ -1038,11 +1011,17 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
1038int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, 1011int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
1039 fmode_t fmode, const struct nfs_lockowner *lockowner) 1012 fmode_t fmode, const struct nfs_lockowner *lockowner)
1040{ 1013{
1041 int ret = 0; 1014 int ret = nfs4_copy_lock_stateid(dst, state, lockowner);
1015 if (ret == -EIO)
1016 /* A lost lock - don't even consider delegations */
1017 goto out;
1042 if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) 1018 if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
1043 goto out; 1019 goto out;
1044 ret = nfs4_copy_lock_stateid(dst, state, lockowner);
1045 if (ret != -ENOENT) 1020 if (ret != -ENOENT)
1021 /* nfs4_copy_delegation_stateid() didn't over-write
1022 * dst, so it still has the lock stateid which we now
1023 * choose to use.
1024 */
1046 goto out; 1025 goto out;
1047 ret = nfs4_copy_open_stateid(dst, state); 1026 ret = nfs4_copy_open_stateid(dst, state);
1048out: 1027out:
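
The reordering above changes the stateid preference in nfs4_select_rw_stateid(): the lock stateid is checked first so that a lost lock (-EIO) vetoes I/O outright, a delegation stateid is used when one exists, an already-copied lock stateid is kept otherwise, and the open stateid is the last resort. A hedged userspace sketch of that decision order, with simplified inputs:

#include <stdio.h>

enum { SID_NONE, SID_LOCK, SID_DELEG, SID_OPEN };

static int select_stateid(int lock_lost, int have_lock, int have_deleg,
			  int *out)
{
	int ret = have_lock ? 0 : -2;	/* -ENOENT when no lock state */

	if (lock_lost)
		return -5;		/* -EIO: do not even try a delegation */
	if (have_deleg) {
		*out = SID_DELEG;
		return 0;
	}
	if (ret != -2) {
		*out = SID_LOCK;	/* lock stateid was already copied */
		return ret;
	}
	*out = SID_OPEN;
	return 0;
}

int main(void)
{
	int sid;

	printf("err=%d sid=%d\n", select_stateid(0, 1, 0, &sid), sid);
	return 0;
}
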
@@ -1443,14 +1422,16 @@ restart:
1443 if (status >= 0) { 1422 if (status >= 0) {
1444 status = nfs4_reclaim_locks(state, ops); 1423 status = nfs4_reclaim_locks(state, ops);
1445 if (status >= 0) { 1424 if (status >= 0) {
1446 spin_lock(&state->state_lock); 1425 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) {
1447 list_for_each_entry(lock, &state->lock_states, ls_locks) { 1426 spin_lock(&state->state_lock);
1448 if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) 1427 list_for_each_entry(lock, &state->lock_states, ls_locks) {
1449 pr_warn_ratelimited("NFS: " 1428 if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
1450 "%s: Lock reclaim " 1429 pr_warn_ratelimited("NFS: "
1451 "failed!\n", __func__); 1430 "%s: Lock reclaim "
1431 "failed!\n", __func__);
1432 }
1433 spin_unlock(&state->state_lock);
1452 } 1434 }
1453 spin_unlock(&state->state_lock);
1454 nfs4_put_open_state(state); 1435 nfs4_put_open_state(state);
1455 spin_lock(&sp->so_lock); 1436 spin_lock(&sp->so_lock);
1456 goto restart; 1437 goto restart;
@@ -1618,7 +1599,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1618 if (!nfs4_state_clear_reclaim_reboot(clp)) 1599 if (!nfs4_state_clear_reclaim_reboot(clp))
1619 return; 1600 return;
1620 ops = clp->cl_mvops->reboot_recovery_ops; 1601 ops = clp->cl_mvops->reboot_recovery_ops;
1621 cred = ops->get_clid_cred(clp); 1602 cred = nfs4_get_clid_cred(clp);
1622 nfs4_reclaim_complete(clp, ops, cred); 1603 nfs4_reclaim_complete(clp, ops, cred);
1623 put_rpccred(cred); 1604 put_rpccred(cred);
1624} 1605}
@@ -1732,7 +1713,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
1732 cred = ops->get_state_renewal_cred_locked(clp); 1713 cred = ops->get_state_renewal_cred_locked(clp);
1733 spin_unlock(&clp->cl_lock); 1714 spin_unlock(&clp->cl_lock);
1734 if (cred == NULL) { 1715 if (cred == NULL) {
1735 cred = nfs4_get_setclientid_cred(clp); 1716 cred = nfs4_get_clid_cred(clp);
1736 status = -ENOKEY; 1717 status = -ENOKEY;
1737 if (cred == NULL) 1718 if (cred == NULL)
1738 goto out; 1719 goto out;
@@ -1804,7 +1785,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
1804 clp->cl_mvops->reboot_recovery_ops; 1785 clp->cl_mvops->reboot_recovery_ops;
1805 int status; 1786 int status;
1806 1787
1807 cred = ops->get_clid_cred(clp); 1788 cred = nfs4_get_clid_cred(clp);
1808 if (cred == NULL) 1789 if (cred == NULL)
1809 return -ENOENT; 1790 return -ENOENT;
1810 status = ops->establish_clid(clp, cred); 1791 status = ops->establish_clid(clp, cred);
@@ -1878,7 +1859,7 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
1878 mutex_lock(&nfs_clid_init_mutex); 1859 mutex_lock(&nfs_clid_init_mutex);
1879again: 1860again:
1880 status = -ENOENT; 1861 status = -ENOENT;
1881 cred = ops->get_clid_cred(clp); 1862 cred = nfs4_get_clid_cred(clp);
1882 if (cred == NULL) 1863 if (cred == NULL)
1883 goto out_unlock; 1864 goto out_unlock;
1884 1865
@@ -1896,7 +1877,11 @@ again:
1896 __func__, status); 1877 __func__, status);
1897 goto again; 1878 goto again;
1898 case -EACCES: 1879 case -EACCES:
1899 if (i++) 1880 if (i++ == 0) {
1881 nfs4_root_machine_cred(clp);
1882 goto again;
1883 }
1884 if (i > 2)
1900 break; 1885 break;
1901 case -NFS4ERR_CLID_INUSE: 1886 case -NFS4ERR_CLID_INUSE:
1902 case -NFS4ERR_WRONGSEC: 1887 case -NFS4ERR_WRONGSEC:
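
The new -EACCES handling above retries trunking discovery once with the root machine credential (installed by nfs4_root_machine_cred()) before tolerating only a couple more failures. A toy sketch of that retry policy; establish_clid() is a hypothetical stand-in:

#include <stdio.h>

#define EACCES 13

static int attempts;

static int establish_clid(int root_cred)
{
	/* pretend only the root machine cred is accepted */
	++attempts;
	return root_cred ? 0 : -EACCES;
}

int main(void)
{
	int i = 0, root_cred = 0, status;

again:
	status = establish_clid(root_cred);
	if (status == -EACCES && i++ == 0) {
		root_cred = 1;		/* swap in the root machine cred */
		goto again;
	}
	printf("status=%d after %d attempts\n", status, attempts);
	return 0;
}
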
@@ -2052,7 +2037,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
2052 if (!nfs4_has_session(clp)) 2037 if (!nfs4_has_session(clp))
2053 return 0; 2038 return 0;
2054 nfs4_begin_drain_session(clp); 2039 nfs4_begin_drain_session(clp);
2055 cred = nfs4_get_exchange_id_cred(clp); 2040 cred = nfs4_get_clid_cred(clp);
2056 status = nfs4_proc_destroy_session(clp->cl_session, cred); 2041 status = nfs4_proc_destroy_session(clp->cl_session, cred);
2057 switch (status) { 2042 switch (status) {
2058 case 0: 2043 case 0:
@@ -2095,7 +2080,7 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2095 if (!nfs4_has_session(clp)) 2080 if (!nfs4_has_session(clp))
2096 return 0; 2081 return 0;
2097 nfs4_begin_drain_session(clp); 2082 nfs4_begin_drain_session(clp);
2098 cred = nfs4_get_exchange_id_cred(clp); 2083 cred = nfs4_get_clid_cred(clp);
2099 ret = nfs4_proc_bind_conn_to_session(clp, cred); 2084 ret = nfs4_proc_bind_conn_to_session(clp, cred);
2100 if (cred) 2085 if (cred)
2101 put_rpccred(cred); 2086 put_rpccred(cred);
@@ -2116,7 +2101,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2116} 2101}
2117#else /* CONFIG_NFS_V4_1 */ 2102#else /* CONFIG_NFS_V4_1 */
2118static int nfs4_reset_session(struct nfs_client *clp) { return 0; } 2103static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
2119static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
2120 2104
2121static int nfs4_bind_conn_to_session(struct nfs_client *clp) 2105static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2122{ 2106{
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 5dbe2d269210..e26acdd1a645 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -253,8 +253,6 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name,
253 253
254 dfprintk(MOUNT, "--> nfs4_try_mount()\n"); 254 dfprintk(MOUNT, "--> nfs4_try_mount()\n");
255 255
256 if (data->auth_flavors[0] == RPC_AUTH_MAXFLAVOR)
257 data->auth_flavors[0] = RPC_AUTH_UNIX;
258 export_path = data->nfs_server.export_path; 256 export_path = data->nfs_server.export_path;
259 data->nfs_server.export_path = "/"; 257 data->nfs_server.export_path = "/";
260 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, 258 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info,
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
new file mode 100644
index 000000000000..d774335cc8bc
--- /dev/null
+++ b/fs/nfs/nfs4trace.c
@@ -0,0 +1,17 @@
1/*
2 * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
3 */
4#include <linux/nfs_fs.h>
5#include "nfs4_fs.h"
6#include "internal.h"
7#include "nfs4session.h"
8#include "callback.h"
9
10#define CREATE_TRACE_POINTS
11#include "nfs4trace.h"
12
13#ifdef CONFIG_NFS_V4_1
14EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read);
15EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write);
16EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds);
17#endif
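
This new file follows the standard ftrace one-definition rule: exactly one translation unit defines CREATE_TRACE_POINTS before including the trace header, which turns the event declarations into definitions; every other consumer includes the header plainly. A schematic sketch of that shape, with hypothetical file and event names:

/* foo_trace.c - the single defining translation unit */
#define CREATE_TRACE_POINTS
#include "foo_trace.h"

/* foo.c - any other user sees declarations only */
#include "foo_trace.h"

void foo_work(void)
{
	trace_foo_event(42);	/* emit an event declared in foo_trace.h */
}
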
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
new file mode 100644
index 000000000000..849cf146db30
--- /dev/null
+++ b/fs/nfs/nfs4trace.h
@@ -0,0 +1,1148 @@
1/*
2 * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
3 */
4#undef TRACE_SYSTEM
5#define TRACE_SYSTEM nfs4
6
7#if !defined(_TRACE_NFS4_H) || defined(TRACE_HEADER_MULTI_READ)
8#define _TRACE_NFS4_H
9
10#include <linux/tracepoint.h>
11
12#define show_nfsv4_errors(error) \
13 __print_symbolic(error, \
14 { NFS4_OK, "OK" }, \
15 /* Mapped by nfs4_stat_to_errno() */ \
16 { -EPERM, "EPERM" }, \
17 { -ENOENT, "ENOENT" }, \
18 { -EIO, "EIO" }, \
19 { -ENXIO, "ENXIO" }, \
20 { -EACCES, "EACCES" }, \
21 { -EEXIST, "EEXIST" }, \
22 { -EXDEV, "EXDEV" }, \
23 { -ENOTDIR, "ENOTDIR" }, \
24 { -EISDIR, "EISDIR" }, \
25 { -EFBIG, "EFBIG" }, \
26 { -ENOSPC, "ENOSPC" }, \
27 { -EROFS, "EROFS" }, \
28 { -EMLINK, "EMLINK" }, \
29 { -ENAMETOOLONG, "ENAMETOOLONG" }, \
30 { -ENOTEMPTY, "ENOTEMPTY" }, \
31 { -EDQUOT, "EDQUOT" }, \
32 { -ESTALE, "ESTALE" }, \
33 { -EBADHANDLE, "EBADHANDLE" }, \
34 { -EBADCOOKIE, "EBADCOOKIE" }, \
35 { -ENOTSUPP, "ENOTSUPP" }, \
36 { -ETOOSMALL, "ETOOSMALL" }, \
37 { -EREMOTEIO, "EREMOTEIO" }, \
38 { -EBADTYPE, "EBADTYPE" }, \
39 { -EAGAIN, "EAGAIN" }, \
40 { -ELOOP, "ELOOP" }, \
41 { -EOPNOTSUPP, "EOPNOTSUPP" }, \
42 { -EDEADLK, "EDEADLK" }, \
43 /* RPC errors */ \
44 { -ENOMEM, "ENOMEM" }, \
45 { -EKEYEXPIRED, "EKEYEXPIRED" }, \
46 { -ETIMEDOUT, "ETIMEDOUT" }, \
47 { -ERESTARTSYS, "ERESTARTSYS" }, \
48 { -ECONNREFUSED, "ECONNREFUSED" }, \
49 { -ECONNRESET, "ECONNRESET" }, \
50 { -ENETUNREACH, "ENETUNREACH" }, \
51 { -EHOSTUNREACH, "EHOSTUNREACH" }, \
52 { -EHOSTDOWN, "EHOSTDOWN" }, \
53 { -EPIPE, "EPIPE" }, \
54 { -EPFNOSUPPORT, "EPFNOSUPPORT" }, \
55 { -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
56 /* NFSv4 native errors */ \
57 { -NFS4ERR_ACCESS, "ACCESS" }, \
58 { -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
59 { -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
60 { -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
61 { -NFS4ERR_BADCHAR, "BADCHAR" }, \
62 { -NFS4ERR_BADHANDLE, "BADHANDLE" }, \
63 { -NFS4ERR_BADIOMODE, "BADIOMODE" }, \
64 { -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
65 { -NFS4ERR_BADLABEL, "BADLABEL" }, \
66 { -NFS4ERR_BADNAME, "BADNAME" }, \
67 { -NFS4ERR_BADOWNER, "BADOWNER" }, \
68 { -NFS4ERR_BADSESSION, "BADSESSION" }, \
69 { -NFS4ERR_BADSLOT, "BADSLOT" }, \
70 { -NFS4ERR_BADTYPE, "BADTYPE" }, \
71 { -NFS4ERR_BADXDR, "BADXDR" }, \
72 { -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
73 { -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
74 { -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
75 { -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
76 { -NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
77 { -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
78 { -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
79 { -NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
80 { -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
81 { -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
82 { -NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
83 "CONN_NOT_BOUND_TO_SESSION" }, \
84 { -NFS4ERR_DEADLOCK, "DEADLOCK" }, \
85 { -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
86 { -NFS4ERR_DELAY, "DELAY" }, \
87 { -NFS4ERR_DELEG_ALREADY_WANTED, \
88 "DELEG_ALREADY_WANTED" }, \
89 { -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
90 { -NFS4ERR_DENIED, "DENIED" }, \
91 { -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
92 { -NFS4ERR_DQUOT, "DQUOT" }, \
93 { -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
94 { -NFS4ERR_EXIST, "EXIST" }, \
95 { -NFS4ERR_EXPIRED, "EXPIRED" }, \
96 { -NFS4ERR_FBIG, "FBIG" }, \
97 { -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
98 { -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
99 { -NFS4ERR_GRACE, "GRACE" }, \
100 { -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
101 { -NFS4ERR_INVAL, "INVAL" }, \
102 { -NFS4ERR_IO, "IO" }, \
103 { -NFS4ERR_ISDIR, "ISDIR" }, \
104 { -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
105 { -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
106 { -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
107 { -NFS4ERR_LOCKED, "LOCKED" }, \
108 { -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
109 { -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
110 { -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
111 { -NFS4ERR_MLINK, "MLINK" }, \
112 { -NFS4ERR_MOVED, "MOVED" }, \
113 { -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
114 { -NFS4ERR_NOENT, "NOENT" }, \
115 { -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
116 { -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
117 { -NFS4ERR_NOSPC, "NOSPC" }, \
118 { -NFS4ERR_NOTDIR, "NOTDIR" }, \
119 { -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
120 { -NFS4ERR_NOTSUPP, "NOTSUPP" }, \
121 { -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
122 { -NFS4ERR_NOT_SAME, "NOT_SAME" }, \
123 { -NFS4ERR_NO_GRACE, "NO_GRACE" }, \
124 { -NFS4ERR_NXIO, "NXIO" }, \
125 { -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
126 { -NFS4ERR_OPENMODE, "OPENMODE" }, \
127 { -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
128 { -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
129 { -NFS4ERR_PERM, "PERM" }, \
130 { -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
131 { -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
132 { -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
133 { -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
134 { -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
135 { -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
136 { -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
137 { -NFS4ERR_REP_TOO_BIG_TO_CACHE, \
138 "REP_TOO_BIG_TO_CACHE" }, \
139 { -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
140 { -NFS4ERR_RESOURCE, "RESOURCE" }, \
141 { -NFS4ERR_RESTOREFH, "RESTOREFH" }, \
142 { -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
143 { -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
144 { -NFS4ERR_ROFS, "ROFS" }, \
145 { -NFS4ERR_SAME, "SAME" }, \
146 { -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
147 { -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
148 { -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
149 { -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
150 { -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
151 { -NFS4ERR_STALE, "STALE" }, \
152 { -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
153 { -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
154 { -NFS4ERR_SYMLINK, "SYMLINK" }, \
155 { -NFS4ERR_TOOSMALL, "TOOSMALL" }, \
156 { -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
157 { -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
158 { -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
159 { -NFS4ERR_WRONGSEC, "WRONGSEC" }, \
160 { -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
161 { -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
162 { -NFS4ERR_XDEV, "XDEV" })
163
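
show_nfsv4_errors() above maps a (negative) status to its symbolic name at trace-print time via __print_symbolic(). A compilable userspace analogue of that lookup, trimmed to a few entries:

#include <stdio.h>

static const struct { int err; const char *name; } nfs_errtbl[] = {
	{ 0, "OK" }, { -1, "EPERM" }, { -2, "ENOENT" }, { -5, "EIO" },
};

static const char *show_error(int err)
{
	unsigned int i;

	for (i = 0; i < sizeof(nfs_errtbl) / sizeof(nfs_errtbl[0]); i++)
		if (nfs_errtbl[i].err == err)
			return nfs_errtbl[i].name;
	return "?";
}

int main(void)
{
	printf("error=%d (%s)\n", -5, show_error(-5));
	return 0;
}
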
164#define show_open_flags(flags) \
165 __print_flags(flags, "|", \
166 { O_CREAT, "O_CREAT" }, \
167 { O_EXCL, "O_EXCL" }, \
168 { O_TRUNC, "O_TRUNC" }, \
169 { O_DIRECT, "O_DIRECT" })
170
171#define show_fmode_flags(mode) \
172 __print_flags(mode, "|", \
173 { ((__force unsigned long)FMODE_READ), "READ" }, \
174 { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
175 { ((__force unsigned long)FMODE_EXEC), "EXEC" })
176
177#define show_nfs_fattr_flags(valid) \
178 __print_flags((unsigned long)valid, "|", \
179 { NFS_ATTR_FATTR_TYPE, "TYPE" }, \
180 { NFS_ATTR_FATTR_MODE, "MODE" }, \
181 { NFS_ATTR_FATTR_NLINK, "NLINK" }, \
182 { NFS_ATTR_FATTR_OWNER, "OWNER" }, \
183 { NFS_ATTR_FATTR_GROUP, "GROUP" }, \
184 { NFS_ATTR_FATTR_RDEV, "RDEV" }, \
185 { NFS_ATTR_FATTR_SIZE, "SIZE" }, \
186 { NFS_ATTR_FATTR_FSID, "FSID" }, \
187 { NFS_ATTR_FATTR_FILEID, "FILEID" }, \
188 { NFS_ATTR_FATTR_ATIME, "ATIME" }, \
189 { NFS_ATTR_FATTR_MTIME, "MTIME" }, \
190 { NFS_ATTR_FATTR_CTIME, "CTIME" }, \
191 { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \
192 { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \
193 { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" })
194
195DECLARE_EVENT_CLASS(nfs4_clientid_event,
196 TP_PROTO(
197 const struct nfs_client *clp,
198 int error
199 ),
200
201 TP_ARGS(clp, error),
202
203 TP_STRUCT__entry(
204 __string(dstaddr,
205 rpc_peeraddr2str(clp->cl_rpcclient,
206 RPC_DISPLAY_ADDR))
207 __field(int, error)
208 ),
209
210 TP_fast_assign(
211 __entry->error = error;
212 __assign_str(dstaddr,
213 rpc_peeraddr2str(clp->cl_rpcclient,
214 RPC_DISPLAY_ADDR));
215 ),
216
217 TP_printk(
218 "error=%d (%s) dstaddr=%s",
219 __entry->error,
220 show_nfsv4_errors(__entry->error),
221 __get_str(dstaddr)
222 )
223);
224#define DEFINE_NFS4_CLIENTID_EVENT(name) \
225 DEFINE_EVENT(nfs4_clientid_event, name, \
226 TP_PROTO( \
227 const struct nfs_client *clp, \
228 int error \
229 ), \
230 TP_ARGS(clp, error))
231DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid);
232DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid_confirm);
233DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew);
234DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew_async);
235#ifdef CONFIG_NFS_V4_1
236DEFINE_NFS4_CLIENTID_EVENT(nfs4_exchange_id);
237DEFINE_NFS4_CLIENTID_EVENT(nfs4_create_session);
238DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_session);
239DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_clientid);
240DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
241DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
242DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
243
244TRACE_EVENT(nfs4_setup_sequence,
245 TP_PROTO(
246 const struct nfs4_session *session,
247 const struct nfs4_sequence_args *args
248 ),
249 TP_ARGS(session, args),
250
251 TP_STRUCT__entry(
252 __field(unsigned int, session)
253 __field(unsigned int, slot_nr)
254 __field(unsigned int, seq_nr)
255 __field(unsigned int, highest_used_slotid)
256 ),
257
258 TP_fast_assign(
259 const struct nfs4_slot *sa_slot = args->sa_slot;
260 __entry->session = nfs_session_id_hash(&session->sess_id);
261 __entry->slot_nr = sa_slot->slot_nr;
262 __entry->seq_nr = sa_slot->seq_nr;
263 __entry->highest_used_slotid =
264 sa_slot->table->highest_used_slotid;
265 ),
266 TP_printk(
267 "session=0x%08x slot_nr=%u seq_nr=%u "
268 "highest_used_slotid=%u",
269 __entry->session,
270 __entry->slot_nr,
271 __entry->seq_nr,
272 __entry->highest_used_slotid
273 )
274);
275
276#define show_nfs4_sequence_status_flags(status) \
277 __print_flags((unsigned long)status, "|", \
278 { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
279 { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \
280 "CB_GSS_CONTEXTS_EXPIRING" }, \
281 { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \
282 "CB_GSS_CONTEXTS_EXPIRED" }, \
283 { SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \
284 "EXPIRED_ALL_STATE_REVOKED" }, \
285 { SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \
286 "EXPIRED_SOME_STATE_REVOKED" }, \
287 { SEQ4_STATUS_ADMIN_STATE_REVOKED, \
288 "ADMIN_STATE_REVOKED" }, \
289 { SEQ4_STATUS_RECALLABLE_STATE_REVOKED, \
290 "RECALLABLE_STATE_REVOKED" }, \
291 { SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \
292 { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \
293 "RESTART_RECLAIM_NEEDED" }, \
294 { SEQ4_STATUS_CB_PATH_DOWN_SESSION, \
295 "CB_PATH_DOWN_SESSION" }, \
296 { SEQ4_STATUS_BACKCHANNEL_FAULT, \
297 "BACKCHANNEL_FAULT" })
298
299TRACE_EVENT(nfs4_sequence_done,
300 TP_PROTO(
301 const struct nfs4_session *session,
302 const struct nfs4_sequence_res *res
303 ),
304 TP_ARGS(session, res),
305
306 TP_STRUCT__entry(
307 __field(unsigned int, session)
308 __field(unsigned int, slot_nr)
309 __field(unsigned int, seq_nr)
310 __field(unsigned int, highest_slotid)
311 __field(unsigned int, target_highest_slotid)
312 __field(unsigned int, status_flags)
313 __field(int, error)
314 ),
315
316 TP_fast_assign(
317 const struct nfs4_slot *sr_slot = res->sr_slot;
318 __entry->session = nfs_session_id_hash(&session->sess_id);
319 __entry->slot_nr = sr_slot->slot_nr;
320 __entry->seq_nr = sr_slot->seq_nr;
321 __entry->highest_slotid = res->sr_highest_slotid;
322 __entry->target_highest_slotid =
323 res->sr_target_highest_slotid;
 324 __entry->status_flags = res->sr_status_flags;
 324 __entry->error = res->sr_status;
325 ),
326 TP_printk(
327 "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u "
328 "highest_slotid=%u target_highest_slotid=%u "
329 "status_flags=%u (%s)",
330 __entry->error,
331 show_nfsv4_errors(__entry->error),
332 __entry->session,
333 __entry->slot_nr,
334 __entry->seq_nr,
335 __entry->highest_slotid,
336 __entry->target_highest_slotid,
337 __entry->status_flags,
338 show_nfs4_sequence_status_flags(__entry->status_flags)
339 )
340);
341
342struct cb_sequenceargs;
343struct cb_sequenceres;
344
345TRACE_EVENT(nfs4_cb_sequence,
346 TP_PROTO(
347 const struct cb_sequenceargs *args,
348 const struct cb_sequenceres *res,
349 __be32 status
350 ),
351 TP_ARGS(args, res, status),
352
353 TP_STRUCT__entry(
354 __field(unsigned int, session)
355 __field(unsigned int, slot_nr)
356 __field(unsigned int, seq_nr)
357 __field(unsigned int, highest_slotid)
358 __field(unsigned int, cachethis)
359 __field(int, error)
360 ),
361
362 TP_fast_assign(
363 __entry->session = nfs_session_id_hash(&args->csa_sessionid);
364 __entry->slot_nr = args->csa_slotid;
365 __entry->seq_nr = args->csa_sequenceid;
366 __entry->highest_slotid = args->csa_highestslotid;
367 __entry->cachethis = args->csa_cachethis;
368 __entry->error = -be32_to_cpu(status);
369 ),
370
371 TP_printk(
372 "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u "
373 "highest_slotid=%u",
374 __entry->error,
375 show_nfsv4_errors(__entry->error),
376 __entry->session,
377 __entry->slot_nr,
378 __entry->seq_nr,
379 __entry->highest_slotid
380 )
381);
382#endif /* CONFIG_NFS_V4_1 */
383
384DECLARE_EVENT_CLASS(nfs4_open_event,
385 TP_PROTO(
386 const struct nfs_open_context *ctx,
387 int flags,
388 int error
389 ),
390
391 TP_ARGS(ctx, flags, error),
392
393 TP_STRUCT__entry(
394 __field(int, error)
395 __field(unsigned int, flags)
396 __field(unsigned int, fmode)
397 __field(dev_t, dev)
398 __field(u32, fhandle)
399 __field(u64, fileid)
400 __field(u64, dir)
401 __string(name, ctx->dentry->d_name.name)
402 ),
403
404 TP_fast_assign(
405 const struct nfs4_state *state = ctx->state;
406 const struct inode *inode = NULL;
407
408 __entry->error = error;
409 __entry->flags = flags;
410 __entry->fmode = (__force unsigned int)ctx->mode;
411 __entry->dev = ctx->dentry->d_sb->s_dev;
412 if (!IS_ERR(state))
413 inode = state->inode;
414 if (inode != NULL) {
415 __entry->fileid = NFS_FILEID(inode);
416 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
417 } else {
418 __entry->fileid = 0;
419 __entry->fhandle = 0;
420 }
421 __entry->dir = NFS_FILEID(ctx->dentry->d_parent->d_inode);
422 __assign_str(name, ctx->dentry->d_name.name);
423 ),
424
425 TP_printk(
426 "error=%d (%s) flags=%d (%s) fmode=%s "
427 "fileid=%02x:%02x:%llu fhandle=0x%08x "
428 "name=%02x:%02x:%llu/%s",
429 __entry->error,
430 show_nfsv4_errors(__entry->error),
431 __entry->flags,
432 show_open_flags(__entry->flags),
433 show_fmode_flags(__entry->fmode),
434 MAJOR(__entry->dev), MINOR(__entry->dev),
435 (unsigned long long)__entry->fileid,
436 __entry->fhandle,
437 MAJOR(__entry->dev), MINOR(__entry->dev),
438 (unsigned long long)__entry->dir,
439 __get_str(name)
440 )
441);
442
443#define DEFINE_NFS4_OPEN_EVENT(name) \
444 DEFINE_EVENT(nfs4_open_event, name, \
445 TP_PROTO( \
446 const struct nfs_open_context *ctx, \
447 int flags, \
448 int error \
449 ), \
450 TP_ARGS(ctx, flags, error))
451DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
452DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
453DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
454
455TRACE_EVENT(nfs4_close,
456 TP_PROTO(
457 const struct nfs4_state *state,
458 const struct nfs_closeargs *args,
459 const struct nfs_closeres *res,
460 int error
461 ),
462
463 TP_ARGS(state, args, res, error),
464
465 TP_STRUCT__entry(
466 __field(dev_t, dev)
467 __field(u32, fhandle)
468 __field(u64, fileid)
469 __field(unsigned int, fmode)
470 __field(int, error)
471 ),
472
473 TP_fast_assign(
474 const struct inode *inode = state->inode;
475
476 __entry->dev = inode->i_sb->s_dev;
477 __entry->fileid = NFS_FILEID(inode);
478 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
479 __entry->fmode = (__force unsigned int)state->state;
480 __entry->error = error;
481 ),
482
483 TP_printk(
484 "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
485 "fhandle=0x%08x",
486 __entry->error,
487 show_nfsv4_errors(__entry->error),
488 __entry->fmode ? show_fmode_flags(__entry->fmode) :
489 "closed",
490 MAJOR(__entry->dev), MINOR(__entry->dev),
491 (unsigned long long)__entry->fileid,
492 __entry->fhandle
493 )
494);
495
496#define show_lock_cmd(type) \
497 __print_symbolic((int)type, \
498 { F_GETLK, "GETLK" }, \
499 { F_SETLK, "SETLK" }, \
500 { F_SETLKW, "SETLKW" })
501#define show_lock_type(type) \
502 __print_symbolic((int)type, \
503 { F_RDLCK, "RDLCK" }, \
504 { F_WRLCK, "WRLCK" }, \
505 { F_UNLCK, "UNLCK" })
506
507DECLARE_EVENT_CLASS(nfs4_lock_event,
508 TP_PROTO(
509 const struct file_lock *request,
510 const struct nfs4_state *state,
511 int cmd,
512 int error
513 ),
514
515 TP_ARGS(request, state, cmd, error),
516
517 TP_STRUCT__entry(
518 __field(int, error)
519 __field(int, cmd)
520 __field(char, type)
521 __field(loff_t, start)
522 __field(loff_t, end)
523 __field(dev_t, dev)
524 __field(u32, fhandle)
525 __field(u64, fileid)
526 ),
527
528 TP_fast_assign(
529 const struct inode *inode = state->inode;
530
531 __entry->error = error;
532 __entry->cmd = cmd;
533 __entry->type = request->fl_type;
534 __entry->start = request->fl_start;
535 __entry->end = request->fl_end;
536 __entry->dev = inode->i_sb->s_dev;
537 __entry->fileid = NFS_FILEID(inode);
538 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
539 ),
540
541 TP_printk(
542 "error=%d (%s) cmd=%s:%s range=%lld:%lld "
543 "fileid=%02x:%02x:%llu fhandle=0x%08x",
544 __entry->error,
545 show_nfsv4_errors(__entry->error),
546 show_lock_cmd(__entry->cmd),
547 show_lock_type(__entry->type),
548 (long long)__entry->start,
549 (long long)__entry->end,
550 MAJOR(__entry->dev), MINOR(__entry->dev),
551 (unsigned long long)__entry->fileid,
552 __entry->fhandle
553 )
554);
555
556#define DEFINE_NFS4_LOCK_EVENT(name) \
557 DEFINE_EVENT(nfs4_lock_event, name, \
558 TP_PROTO( \
559 const struct file_lock *request, \
560 const struct nfs4_state *state, \
561 int cmd, \
562 int error \
563 ), \
564 TP_ARGS(request, state, cmd, error))
565DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
566DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock);
567DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim);
568DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired);
569DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
570
571DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
572 TP_PROTO(
573 const struct inode *inode,
574 fmode_t fmode
575 ),
576
577 TP_ARGS(inode, fmode),
578
579 TP_STRUCT__entry(
580 __field(dev_t, dev)
581 __field(u32, fhandle)
582 __field(u64, fileid)
583 __field(unsigned int, fmode)
584 ),
585
586 TP_fast_assign(
587 __entry->dev = inode->i_sb->s_dev;
588 __entry->fileid = NFS_FILEID(inode);
589 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
590 __entry->fmode = (__force unsigned int)fmode;
591 ),
592
593 TP_printk(
594 "fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x",
595 show_fmode_flags(__entry->fmode),
596 MAJOR(__entry->dev), MINOR(__entry->dev),
597 (unsigned long long)__entry->fileid,
598 __entry->fhandle
599 )
600);
601#define DEFINE_NFS4_SET_DELEGATION_EVENT(name) \
602 DEFINE_EVENT(nfs4_set_delegation_event, name, \
603 TP_PROTO( \
604 const struct inode *inode, \
605 fmode_t fmode \
606 ), \
607 TP_ARGS(inode, fmode))
608DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation);
609DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation);
610
611TRACE_EVENT(nfs4_delegreturn_exit,
612 TP_PROTO(
613 const struct nfs4_delegreturnargs *args,
614 const struct nfs4_delegreturnres *res,
615 int error
616 ),
617
618 TP_ARGS(args, res, error),
619
620 TP_STRUCT__entry(
621 __field(dev_t, dev)
622 __field(u32, fhandle)
623 __field(int, error)
624 ),
625
626 TP_fast_assign(
627 __entry->dev = res->server->s_dev;
628 __entry->fhandle = nfs_fhandle_hash(args->fhandle);
629 __entry->error = error;
630 ),
631
632 TP_printk(
633 "error=%d (%s) dev=%02x:%02x fhandle=0x%08x",
634 __entry->error,
635 show_nfsv4_errors(__entry->error),
636 MAJOR(__entry->dev), MINOR(__entry->dev),
637 __entry->fhandle
638 )
639);
640
641#ifdef CONFIG_NFS_V4_1
642DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
643 TP_PROTO(
644 const struct nfs4_state *state,
645 const struct nfs4_lock_state *lsp,
646 int error
647 ),
648
649 TP_ARGS(state, lsp, error),
650
651 TP_STRUCT__entry(
652 __field(int, error)
653 __field(dev_t, dev)
654 __field(u32, fhandle)
655 __field(u64, fileid)
656 ),
657
658 TP_fast_assign(
659 const struct inode *inode = state->inode;
660
661 __entry->error = error;
662 __entry->dev = inode->i_sb->s_dev;
663 __entry->fileid = NFS_FILEID(inode);
664 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
665 ),
666
667 TP_printk(
668 "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
669 __entry->error,
670 show_nfsv4_errors(__entry->error),
671 MAJOR(__entry->dev), MINOR(__entry->dev),
672 (unsigned long long)__entry->fileid,
673 __entry->fhandle
674 )
675);
676
677#define DEFINE_NFS4_TEST_STATEID_EVENT(name) \
678 DEFINE_EVENT(nfs4_test_stateid_event, name, \
679 TP_PROTO( \
680 const struct nfs4_state *state, \
681 const struct nfs4_lock_state *lsp, \
682 int error \
683 ), \
684 TP_ARGS(state, lsp, error))
685DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_delegation_stateid);
686DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_open_stateid);
687DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_lock_stateid);
688#endif /* CONFIG_NFS_V4_1 */
689
690DECLARE_EVENT_CLASS(nfs4_lookup_event,
691 TP_PROTO(
692 const struct inode *dir,
693 const struct qstr *name,
694 int error
695 ),
696
697 TP_ARGS(dir, name, error),
698
699 TP_STRUCT__entry(
700 __field(dev_t, dev)
701 __field(int, error)
702 __field(u64, dir)
703 __string(name, name->name)
704 ),
705
706 TP_fast_assign(
707 __entry->dev = dir->i_sb->s_dev;
708 __entry->dir = NFS_FILEID(dir);
709 __entry->error = error;
710 __assign_str(name, name->name);
711 ),
712
713 TP_printk(
714 "error=%d (%s) name=%02x:%02x:%llu/%s",
715 __entry->error,
716 show_nfsv4_errors(__entry->error),
717 MAJOR(__entry->dev), MINOR(__entry->dev),
718 (unsigned long long)__entry->dir,
719 __get_str(name)
720 )
721);
722
723#define DEFINE_NFS4_LOOKUP_EVENT(name) \
724 DEFINE_EVENT(nfs4_lookup_event, name, \
725 TP_PROTO( \
726 const struct inode *dir, \
727 const struct qstr *name, \
728 int error \
729 ), \
730 TP_ARGS(dir, name, error))
731
732DEFINE_NFS4_LOOKUP_EVENT(nfs4_lookup);
733DEFINE_NFS4_LOOKUP_EVENT(nfs4_symlink);
734DEFINE_NFS4_LOOKUP_EVENT(nfs4_mkdir);
735DEFINE_NFS4_LOOKUP_EVENT(nfs4_mknod);
736DEFINE_NFS4_LOOKUP_EVENT(nfs4_remove);
737DEFINE_NFS4_LOOKUP_EVENT(nfs4_get_fs_locations);
738DEFINE_NFS4_LOOKUP_EVENT(nfs4_secinfo);
739
740TRACE_EVENT(nfs4_rename,
741 TP_PROTO(
742 const struct inode *olddir,
743 const struct qstr *oldname,
744 const struct inode *newdir,
745 const struct qstr *newname,
746 int error
747 ),
748
749 TP_ARGS(olddir, oldname, newdir, newname, error),
750
751 TP_STRUCT__entry(
752 __field(dev_t, dev)
753 __field(int, error)
754 __field(u64, olddir)
755 __string(oldname, oldname->name)
756 __field(u64, newdir)
757 __string(newname, newname->name)
758 ),
759
760 TP_fast_assign(
761 __entry->dev = olddir->i_sb->s_dev;
762 __entry->olddir = NFS_FILEID(olddir);
763 __entry->newdir = NFS_FILEID(newdir);
764 __entry->error = error;
765 __assign_str(oldname, oldname->name);
766 __assign_str(newname, newname->name);
767 ),
768
769 TP_printk(
770 "error=%d (%s) oldname=%02x:%02x:%llu/%s "
771 "newname=%02x:%02x:%llu/%s",
772 __entry->error,
773 show_nfsv4_errors(__entry->error),
774 MAJOR(__entry->dev), MINOR(__entry->dev),
775 (unsigned long long)__entry->olddir,
776 __get_str(oldname),
777 MAJOR(__entry->dev), MINOR(__entry->dev),
778 (unsigned long long)__entry->newdir,
779 __get_str(newname)
780 )
781);
782
783DECLARE_EVENT_CLASS(nfs4_inode_event,
784 TP_PROTO(
785 const struct inode *inode,
786 int error
787 ),
788
789 TP_ARGS(inode, error),
790
791 TP_STRUCT__entry(
792 __field(dev_t, dev)
793 __field(u32, fhandle)
794 __field(u64, fileid)
795 __field(int, error)
796 ),
797
798 TP_fast_assign(
799 __entry->dev = inode->i_sb->s_dev;
800 __entry->fileid = NFS_FILEID(inode);
801 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
802 __entry->error = error;
803 ),
804
805 TP_printk(
806 "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
807 __entry->error,
808 show_nfsv4_errors(__entry->error),
809 MAJOR(__entry->dev), MINOR(__entry->dev),
810 (unsigned long long)__entry->fileid,
811 __entry->fhandle
812 )
813);
814
815#define DEFINE_NFS4_INODE_EVENT(name) \
816 DEFINE_EVENT(nfs4_inode_event, name, \
817 TP_PROTO( \
818 const struct inode *inode, \
819 int error \
820 ), \
821 TP_ARGS(inode, error))
822
823DEFINE_NFS4_INODE_EVENT(nfs4_setattr);
824DEFINE_NFS4_INODE_EVENT(nfs4_access);
825DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
826DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
827DEFINE_NFS4_INODE_EVENT(nfs4_get_acl);
828DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
829#ifdef CONFIG_NFS_V4_SECURITY_LABEL
830DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
831DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
832#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
833DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation);
834DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn);
835
836DECLARE_EVENT_CLASS(nfs4_getattr_event,
837 TP_PROTO(
838 const struct nfs_server *server,
839 const struct nfs_fh *fhandle,
840 const struct nfs_fattr *fattr,
841 int error
842 ),
843
844 TP_ARGS(server, fhandle, fattr, error),
845
846 TP_STRUCT__entry(
847 __field(dev_t, dev)
848 __field(u32, fhandle)
849 __field(u64, fileid)
850 __field(unsigned int, valid)
851 __field(int, error)
852 ),
853
854 TP_fast_assign(
855 __entry->dev = server->s_dev;
856 __entry->valid = fattr->valid;
857 __entry->fhandle = nfs_fhandle_hash(fhandle);
858 __entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? fattr->fileid : 0;
859 __entry->error = error;
860 ),
861
862 TP_printk(
863 "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
864 "valid=%s",
865 __entry->error,
866 show_nfsv4_errors(__entry->error),
867 MAJOR(__entry->dev), MINOR(__entry->dev),
868 (unsigned long long)__entry->fileid,
869 __entry->fhandle,
870 show_nfs_fattr_flags(__entry->valid)
871 )
872);
873
874#define DEFINE_NFS4_GETATTR_EVENT(name) \
875 DEFINE_EVENT(nfs4_getattr_event, name, \
876 TP_PROTO( \
877 const struct nfs_server *server, \
878 const struct nfs_fh *fhandle, \
879 const struct nfs_fattr *fattr, \
880 int error \
881 ), \
882 TP_ARGS(server, fhandle, fattr, error))
883DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
884DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
885DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
886
887DECLARE_EVENT_CLASS(nfs4_idmap_event,
888 TP_PROTO(
889 const char *name,
890 int len,
891 u32 id,
892 int error
893 ),
894
895 TP_ARGS(name, len, id, error),
896
897 TP_STRUCT__entry(
898 __field(int, error)
899 __field(u32, id)
900 __dynamic_array(char, name, len > 0 ? len + 1 : 1)
901 ),
902
903 TP_fast_assign(
904 if (len < 0)
905 len = 0;
906 __entry->error = error < 0 ? error : 0;
907 __entry->id = id;
908 memcpy(__get_dynamic_array(name), name, len);
909 ((char *)__get_dynamic_array(name))[len] = 0;
910 ),
911
912 TP_printk(
913 "error=%d id=%u name=%s",
914 __entry->error,
915 __entry->id,
916 __get_str(name)
917 )
918);
919#define DEFINE_NFS4_IDMAP_EVENT(name) \
920 DEFINE_EVENT(nfs4_idmap_event, name, \
921 TP_PROTO( \
922 const char *name, \
923 int len, \
924 u32 id, \
925 int error \
926 ), \
927 TP_ARGS(name, len, id, error))
928DEFINE_NFS4_IDMAP_EVENT(nfs4_map_name_to_uid);
929DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid);
930DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name);
931DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
932
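Unlike the classes above, nfs4_idmap_event cannot use __string(): the incoming name is length-delimited rather than NUL-terminated, and len may be negative on error. The class therefore reserves len + 1 bytes with __dynamic_array(), clamps a negative length to zero, copies the bytes by hand and NUL-terminates them itself, which is what makes __get_str(name) safe in TP_printk(). A hedged caller sketch, where buf comes straight off the wire and carries no terminator:

        /* Hypothetical call site: only (buf, buflen) pairs are passed
         * around, never a C string.
         */
        trace_nfs4_map_name_to_uid(buf, buflen, uid, status);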
933DECLARE_EVENT_CLASS(nfs4_read_event,
934 TP_PROTO(
935 const struct nfs_read_data *data,
936 int error
937 ),
938
939 TP_ARGS(data, error),
940
941 TP_STRUCT__entry(
942 __field(dev_t, dev)
943 __field(u32, fhandle)
944 __field(u64, fileid)
945 __field(loff_t, offset)
946 __field(size_t, count)
947 __field(int, error)
948 ),
949
950 TP_fast_assign(
951 const struct inode *inode = data->header->inode;
952 __entry->dev = inode->i_sb->s_dev;
953 __entry->fileid = NFS_FILEID(inode);
954 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
955 __entry->offset = data->args.offset;
956 __entry->count = data->args.count;
957 __entry->error = error;
958 ),
959
960 TP_printk(
961 "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
962 "offset=%lld count=%zu",
963 __entry->error,
964 show_nfsv4_errors(__entry->error),
965 MAJOR(__entry->dev), MINOR(__entry->dev),
966 (unsigned long long)__entry->fileid,
967 __entry->fhandle,
968 (long long)__entry->offset,
969 __entry->count
970 )
971);
972#define DEFINE_NFS4_READ_EVENT(name) \
973 DEFINE_EVENT(nfs4_read_event, name, \
974 TP_PROTO( \
975 const struct nfs_read_data *data, \
976 int error \
977 ), \
978 TP_ARGS(data, error))
979DEFINE_NFS4_READ_EVENT(nfs4_read);
980#ifdef CONFIG_NFS_V4_1
981DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
982#endif /* CONFIG_NFS_V4_1 */
983
984DECLARE_EVENT_CLASS(nfs4_write_event,
985 TP_PROTO(
986 const struct nfs_write_data *data,
987 int error
988 ),
989
990 TP_ARGS(data, error),
991
992 TP_STRUCT__entry(
993 __field(dev_t, dev)
994 __field(u32, fhandle)
995 __field(u64, fileid)
996 __field(loff_t, offset)
997 __field(size_t, count)
998 __field(int, error)
999 ),
1000
1001 TP_fast_assign(
1002 const struct inode *inode = data->header->inode;
1003 __entry->dev = inode->i_sb->s_dev;
1004 __entry->fileid = NFS_FILEID(inode);
1005 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
1006 __entry->offset = data->args.offset;
1007 __entry->count = data->args.count;
1008 __entry->error = error;
1009 ),
1010
1011 TP_printk(
1012 "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
1013 "offset=%lld count=%zu",
1014 __entry->error,
1015 show_nfsv4_errors(__entry->error),
1016 MAJOR(__entry->dev), MINOR(__entry->dev),
1017 (unsigned long long)__entry->fileid,
1018 __entry->fhandle,
1019 (long long)__entry->offset,
1020 __entry->count
1021 )
1022);
1023
1024#define DEFINE_NFS4_WRITE_EVENT(name) \
1025 DEFINE_EVENT(nfs4_write_event, name, \
1026 TP_PROTO( \
1027 const struct nfs_write_data *data, \
1028 int error \
1029 ), \
1030 TP_ARGS(data, error))
1031DEFINE_NFS4_WRITE_EVENT(nfs4_write);
1032#ifdef CONFIG_NFS_V4_1
1033DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
1034#endif /* CONFIG_NFS_V4_1 */
1035
1036DECLARE_EVENT_CLASS(nfs4_commit_event,
1037 TP_PROTO(
1038 const struct nfs_commit_data *data,
1039 int error
1040 ),
1041
1042 TP_ARGS(data, error),
1043
1044 TP_STRUCT__entry(
1045 __field(dev_t, dev)
1046 __field(u32, fhandle)
1047 __field(u64, fileid)
1048 __field(loff_t, offset)
1049 __field(size_t, count)
1050 __field(int, error)
1051 ),
1052
1053 TP_fast_assign(
1054 const struct inode *inode = data->inode;
1055 __entry->dev = inode->i_sb->s_dev;
1056 __entry->fileid = NFS_FILEID(inode);
1057 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
1058 __entry->offset = data->args.offset;
1059 __entry->count = data->args.count;
1060 __entry->error = error;
1061 ),
1062
1063 TP_printk(
1064 "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
1065 "offset=%lld count=%zu",
1066 __entry->error,
1067 show_nfsv4_errors(__entry->error),
1068 MAJOR(__entry->dev), MINOR(__entry->dev),
1069 (unsigned long long)__entry->fileid,
1070 __entry->fhandle,
1071 (long long)__entry->offset,
1072 __entry->count
1073 )
1074);
1075#define DEFINE_NFS4_COMMIT_EVENT(name) \
1076 DEFINE_EVENT(nfs4_commit_event, name, \
1077 TP_PROTO( \
1078 const struct nfs_commit_data *data, \
1079 int error \
1080 ), \
1081 TP_ARGS(data, error))
1082DEFINE_NFS4_COMMIT_EVENT(nfs4_commit);
1083#ifdef CONFIG_NFS_V4_1
1084DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds);
1085
1086#define show_pnfs_iomode(iomode) \
1087 __print_symbolic(iomode, \
1088 { IOMODE_READ, "READ" }, \
1089 { IOMODE_RW, "RW" }, \
1090 { IOMODE_ANY, "ANY" })
1091
1092TRACE_EVENT(nfs4_layoutget,
1093 TP_PROTO(
1094 const struct nfs_open_context *ctx,
1095 const struct pnfs_layout_range *args,
1096 const struct pnfs_layout_range *res,
1097 int error
1098 ),
1099
1100 TP_ARGS(ctx, args, res, error),
1101
1102 TP_STRUCT__entry(
1103 __field(dev_t, dev)
1104 __field(u32, fhandle)
1105 __field(u64, fileid)
1106 __field(u32, iomode)
1107 __field(u64, offset)
1108 __field(u64, count)
1109 __field(int, error)
1110 ),
1111
1112 TP_fast_assign(
1113 const struct inode *inode = ctx->dentry->d_inode;
1114 __entry->dev = inode->i_sb->s_dev;
1115 __entry->fileid = NFS_FILEID(inode);
1116 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
1117 __entry->iomode = args->iomode;
1118 __entry->offset = args->offset;
1119 __entry->count = args->length;
1120 __entry->error = error;
1121 ),
1122
1123 TP_printk(
1124 "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
1125 "iomode=%s offset=%llu count=%llu",
1126 __entry->error,
1127 show_nfsv4_errors(__entry->error),
1128 MAJOR(__entry->dev), MINOR(__entry->dev),
1129 (unsigned long long)__entry->fileid,
1130 __entry->fhandle,
1131 show_pnfs_iomode(__entry->iomode),
1132 (unsigned long long)__entry->offset,
1133 (unsigned long long)__entry->count
1134 )
1135);
1136
1137DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
1138DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
1139
1140#endif /* CONFIG_NFS_V4_1 */
1141
1142#endif /* _TRACE_NFS4_H */
1143
1144#undef TRACE_INCLUDE_PATH
1145#define TRACE_INCLUDE_PATH .
1146#define TRACE_INCLUDE_FILE nfs4trace
1147/* This part must be outside protection */
1148#include <trace/define_trace.h>
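The trailer above is the standard tracepoint plumbing: TRACE_INCLUDE_PATH and TRACE_INCLUDE_FILE tell define_trace.h where to re-read this header, and the multi-read guard lets the header expand several times under different macro definitions. Exactly one translation unit must define CREATE_TRACE_POINTS before including it so the event bodies are emitted once — fs/nfs/nfstrace.c later in this patch plays that role for nfstrace.h, and a companion nfs4trace.c (outside the hunks shown here) is assumed to do the same for this header:

        /* Sketch of the single expansion site for nfs4trace.h. */
        #define CREATE_TRACE_POINTS
        #include "nfs4trace.h"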
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3850b018815f..79210d23f607 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -294,7 +294,9 @@ static int nfs4_stat_to_errno(int);
294 XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ 294 XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
295 1 /* flags */ + \ 295 1 /* flags */ + \
296 1 /* spa_how */ + \ 296 1 /* spa_how */ + \
297 0 /* SP4_NONE (for now) */ + \ 297 /* max is SP4_MACH_CRED (for now) */ + \
298 1 + NFS4_OP_MAP_NUM_WORDS + \
299 1 + NFS4_OP_MAP_NUM_WORDS + \
298 1 /* implementation id array of size 1 */ + \ 300 1 /* implementation id array of size 1 */ + \
299 1 /* nii_domain */ + \ 301 1 /* nii_domain */ + \
300 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 302 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
@@ -306,7 +308,9 @@ static int nfs4_stat_to_errno(int);
306 1 /* eir_sequenceid */ + \ 308 1 /* eir_sequenceid */ + \
307 1 /* eir_flags */ + \ 309 1 /* eir_flags */ + \
308 1 /* spr_how */ + \ 310 1 /* spr_how */ + \
309 0 /* SP4_NONE (for now) */ + \ 311 /* max is SP4_MACH_CRED (for now) */ + \
312 1 + NFS4_OP_MAP_NUM_WORDS + \
313 1 + NFS4_OP_MAP_NUM_WORDS + \
310 2 /* eir_server_owner.so_minor_id */ + \ 314 2 /* eir_server_owner.so_minor_id */ + \
311 /* eir_server_owner.so_major_id<> */ \ 315 /* eir_server_owner.so_major_id<> */ \
312 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ 316 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
@@ -410,7 +414,7 @@ static int nfs4_stat_to_errno(int);
410#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1) 414#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1)
411#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \ 415#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \
412 XDR_QUADLEN(NFS4_STATEID_SIZE)) 416 XDR_QUADLEN(NFS4_STATEID_SIZE))
413#define decode_free_stateid_maxsz (op_decode_hdr_maxsz + 1) 417#define decode_free_stateid_maxsz (op_decode_hdr_maxsz)
414#else /* CONFIG_NFS_V4_1 */ 418#else /* CONFIG_NFS_V4_1 */
415#define encode_sequence_maxsz 0 419#define encode_sequence_maxsz 0
416#define decode_sequence_maxsz 0 420#define decode_sequence_maxsz 0
@@ -997,12 +1001,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
997 int owner_namelen = 0; 1001 int owner_namelen = 0;
998 int owner_grouplen = 0; 1002 int owner_grouplen = 0;
999 __be32 *p; 1003 __be32 *p;
1000 __be32 *q; 1004 unsigned i;
1001 int len; 1005 uint32_t len = 0;
1002 uint32_t bmval_len = 2; 1006 uint32_t bmval_len;
1003 uint32_t bmval0 = 0; 1007 uint32_t bmval[3] = { 0 };
1004 uint32_t bmval1 = 0;
1005 uint32_t bmval2 = 0;
1006 1008
1007 /* 1009 /*
1008 * We reserve enough space to write the entire attribute buffer at once. 1010 * We reserve enough space to write the entire attribute buffer at once.
@@ -1011,13 +1013,14 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
1011 * = 40 bytes, plus any contribution from variable-length fields 1013 * = 40 bytes, plus any contribution from variable-length fields
1012 * such as owner/group. 1014 * such as owner/group.
1013 */ 1015 */
1014 len = 8; 1016 if (iap->ia_valid & ATTR_SIZE) {
1015 1017 bmval[0] |= FATTR4_WORD0_SIZE;
1016 /* Sigh */
1017 if (iap->ia_valid & ATTR_SIZE)
1018 len += 8; 1018 len += 8;
1019 if (iap->ia_valid & ATTR_MODE) 1019 }
1020 if (iap->ia_valid & ATTR_MODE) {
1021 bmval[1] |= FATTR4_WORD1_MODE;
1020 len += 4; 1022 len += 4;
1023 }
1021 if (iap->ia_valid & ATTR_UID) { 1024 if (iap->ia_valid & ATTR_UID) {
1022 owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); 1025 owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
1023 if (owner_namelen < 0) { 1026 if (owner_namelen < 0) {
@@ -1028,6 +1031,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
1028 owner_namelen = sizeof("nobody") - 1; 1031 owner_namelen = sizeof("nobody") - 1;
1029 /* goto out; */ 1032 /* goto out; */
1030 } 1033 }
1034 bmval[1] |= FATTR4_WORD1_OWNER;
1031 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 1035 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
1032 } 1036 }
1033 if (iap->ia_valid & ATTR_GID) { 1037 if (iap->ia_valid & ATTR_GID) {
@@ -1039,92 +1043,73 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
1039 owner_grouplen = sizeof("nobody") - 1; 1043 owner_grouplen = sizeof("nobody") - 1;
1040 /* goto out; */ 1044 /* goto out; */
1041 } 1045 }
1046 bmval[1] |= FATTR4_WORD1_OWNER_GROUP;
1042 len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); 1047 len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
1043 } 1048 }
1044 if (iap->ia_valid & ATTR_ATIME_SET) 1049 if (iap->ia_valid & ATTR_ATIME_SET) {
1050 bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
1045 len += 16; 1051 len += 16;
1046 else if (iap->ia_valid & ATTR_ATIME) 1052 } else if (iap->ia_valid & ATTR_ATIME) {
1053 bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
1047 len += 4; 1054 len += 4;
1048 if (iap->ia_valid & ATTR_MTIME_SET) 1055 }
1056 if (iap->ia_valid & ATTR_MTIME_SET) {
1057 bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
1049 len += 16; 1058 len += 16;
1050 else if (iap->ia_valid & ATTR_MTIME) 1059 } else if (iap->ia_valid & ATTR_MTIME) {
1060 bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
1051 len += 4; 1061 len += 4;
1062 }
1052 if (label) { 1063 if (label) {
1053 len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); 1064 len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
1054 bmval_len = 3; 1065 bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
1055 } 1066 }
1056 1067
1057 len += bmval_len << 2; 1068 if (bmval[2] != 0)
1058 p = reserve_space(xdr, len); 1069 bmval_len = 3;
1070 else if (bmval[1] != 0)
1071 bmval_len = 2;
1072 else
1073 bmval_len = 1;
1074
1075 p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len);
1059 1076
1060 /*
1061 * We write the bitmap length now, but leave the bitmap and the attribute
1062 * buffer length to be backfilled at the end of this routine.
1063 */
1064 *p++ = cpu_to_be32(bmval_len); 1077 *p++ = cpu_to_be32(bmval_len);
1065 q = p; 1078 for (i = 0; i < bmval_len; i++)
1066 /* Skip bitmap entries + attrlen */ 1079 *p++ = cpu_to_be32(bmval[i]);
1067 p += bmval_len + 1; 1080 *p++ = cpu_to_be32(len);
1068 1081
1069 if (iap->ia_valid & ATTR_SIZE) { 1082 if (bmval[0] & FATTR4_WORD0_SIZE)
1070 bmval0 |= FATTR4_WORD0_SIZE;
1071 p = xdr_encode_hyper(p, iap->ia_size); 1083 p = xdr_encode_hyper(p, iap->ia_size);
1072 } 1084 if (bmval[1] & FATTR4_WORD1_MODE)
1073 if (iap->ia_valid & ATTR_MODE) {
1074 bmval1 |= FATTR4_WORD1_MODE;
1075 *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); 1085 *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
1076 } 1086 if (bmval[1] & FATTR4_WORD1_OWNER)
1077 if (iap->ia_valid & ATTR_UID) {
1078 bmval1 |= FATTR4_WORD1_OWNER;
1079 p = xdr_encode_opaque(p, owner_name, owner_namelen); 1087 p = xdr_encode_opaque(p, owner_name, owner_namelen);
1080 } 1088 if (bmval[1] & FATTR4_WORD1_OWNER_GROUP)
1081 if (iap->ia_valid & ATTR_GID) {
1082 bmval1 |= FATTR4_WORD1_OWNER_GROUP;
1083 p = xdr_encode_opaque(p, owner_group, owner_grouplen); 1089 p = xdr_encode_opaque(p, owner_group, owner_grouplen);
1090 if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
1091 if (iap->ia_valid & ATTR_ATIME_SET) {
1092 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
1093 p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec);
1094 *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
1095 } else
1096 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
1084 } 1097 }
1085 if (iap->ia_valid & ATTR_ATIME_SET) { 1098 if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
1086 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 1099 if (iap->ia_valid & ATTR_MTIME_SET) {
1087 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); 1100 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
1088 p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); 1101 p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
1089 *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); 1102 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
1090 } 1103 } else
1091 else if (iap->ia_valid & ATTR_ATIME) { 1104 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
1092 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
1093 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
1094 }
1095 if (iap->ia_valid & ATTR_MTIME_SET) {
1096 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
1097 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
1098 p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
1099 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
1100 }
1101 else if (iap->ia_valid & ATTR_MTIME) {
1102 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
1103 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
1104 } 1105 }
1105 if (label) { 1106 if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
1106 bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
1107 *p++ = cpu_to_be32(label->lfs); 1107 *p++ = cpu_to_be32(label->lfs);
1108 *p++ = cpu_to_be32(label->pi); 1108 *p++ = cpu_to_be32(label->pi);
1109 *p++ = cpu_to_be32(label->len); 1109 *p++ = cpu_to_be32(label->len);
1110 p = xdr_encode_opaque_fixed(p, label->label, label->len); 1110 p = xdr_encode_opaque_fixed(p, label->label, label->len);
1111 } 1111 }
1112 1112
1113 /*
1114 * Now we backfill the bitmap and the attribute buffer length.
1115 */
1116 if (len != ((char *)p - (char *)q) + 4) {
1117 printk(KERN_ERR "NFS: Attr length error, %u != %Zu\n",
1118 len, ((char *)p - (char *)q) + 4);
1119 BUG();
1120 }
1121 *q++ = htonl(bmval0);
1122 *q++ = htonl(bmval1);
1123 if (bmval_len == 3)
1124 *q++ = htonl(bmval2);
1125 len = (char *)p - (char *)(q + 1);
1126 *q = htonl(len);
1127
1128/* out: */ 1113/* out: */
1129} 1114}
1130 1115
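The encode_attrs() rewrite replaces the reserve/skip/backfill dance — and its BUG() on a length mismatch — with one forward pass: the bmval[] words and the attribute payload length are both known before reserve_space(), and bmval_len is simply the index of the highest non-zero bitmap word plus one. A worked example, assuming a SETATTR carrying only size and mode:

        /*
         * iap->ia_valid == ATTR_SIZE | ATTR_MODE
         *
         *   bmval[0] |= FATTR4_WORD0_SIZE;  len += 8;    size is an XDR hyper
         *   bmval[1] |= FATTR4_WORD1_MODE;  len += 4;    mode is one word
         *   bmval[2] == 0 && bmval[1] != 0  =>  bmval_len = 2
         *
         * reserve_space(xdr, 4 + (2 << 2) + 4 + 12) == 28 bytes on the wire:
         *
         *   [count=2][bmval0][bmval1][attrlen=12][size:8 bytes][mode:4 bytes]
         */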
@@ -1745,6 +1730,14 @@ static void encode_bind_conn_to_session(struct xdr_stream *xdr,
1745 *p = 0; /* use_conn_in_rdma_mode = False */ 1730 *p = 0; /* use_conn_in_rdma_mode = False */
1746} 1731}
1747 1732
1733static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
1734{
1735 unsigned int i;
1736 encode_uint32(xdr, NFS4_OP_MAP_NUM_WORDS);
1737 for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++)
1738 encode_uint32(xdr, op_map->u.words[i]);
1739}
1740
1748static void encode_exchange_id(struct xdr_stream *xdr, 1741static void encode_exchange_id(struct xdr_stream *xdr,
1749 struct nfs41_exchange_id_args *args, 1742 struct nfs41_exchange_id_args *args,
1750 struct compound_hdr *hdr) 1743 struct compound_hdr *hdr)
@@ -1758,9 +1751,20 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1758 1751
1759 encode_string(xdr, args->id_len, args->id); 1752 encode_string(xdr, args->id_len, args->id);
1760 1753
1761 p = reserve_space(xdr, 12); 1754 encode_uint32(xdr, args->flags);
1762 *p++ = cpu_to_be32(args->flags); 1755 encode_uint32(xdr, args->state_protect.how);
1763 *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ 1756
1757 switch (args->state_protect.how) {
1758 case SP4_NONE:
1759 break;
1760 case SP4_MACH_CRED:
1761 encode_op_map(xdr, &args->state_protect.enforce);
1762 encode_op_map(xdr, &args->state_protect.allow);
1763 break;
1764 default:
1765 WARN_ON_ONCE(1);
1766 break;
1767 }
1764 1768
1765 if (send_implementation_id && 1769 if (send_implementation_id &&
1766 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && 1770 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
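state_protect4_a is a discriminated union on spa_how: SP4_NONE contributes nothing beyond the how word, while SP4_MACH_CRED appends two operation bitmaps, spo_must_enforce and spo_must_allow, each written by encode_op_map() as a word count followed by that many bitmap words. The encode/decode maxsz updates at the top of this patch budget for exactly those two maps. A sketch of the MACH_CRED arm on the wire, assuming NFS4_OP_MAP_NUM_WORDS works out to 2 (as it does for the NFSv4.1 operation range on a 64-bit build):

        /*
         *   spa_how          : 1                    SP4_MACH_CRED
         *   spo_must_enforce : 2, word0, word1      encode_op_map()
         *   spo_must_allow   : 2, word0, word1      encode_op_map()
         */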
@@ -1771,7 +1775,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1771 utsname()->version, utsname()->machine); 1775 utsname()->version, utsname()->machine);
1772 1776
1773 if (len > 0) { 1777 if (len > 0) {
1774 *p = cpu_to_be32(1); /* implementation id array length=1 */ 1778 encode_uint32(xdr, 1); /* implementation id array length=1 */
1775 1779
1776 encode_string(xdr, 1780 encode_string(xdr,
1777 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1, 1781 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1,
@@ -1782,7 +1786,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1782 p = xdr_encode_hyper(p, 0); 1786 p = xdr_encode_hyper(p, 0);
1783 *p = cpu_to_be32(0); 1787 *p = cpu_to_be32(0);
1784 } else 1788 } else
1785 *p = cpu_to_be32(0); /* implementation id array length=0 */ 1789 encode_uint32(xdr, 0); /* implementation id array length=0 */
1786} 1790}
1787 1791
1788static void encode_create_session(struct xdr_stream *xdr, 1792static void encode_create_session(struct xdr_stream *xdr,
@@ -1835,7 +1839,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1835 *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ 1839 *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
1836 1840
1837 /* authsys_parms rfc1831 */ 1841 /* authsys_parms rfc1831 */
1838 *p++ = (__be32)nn->boot_time.tv_nsec; /* stamp */ 1842 *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */
1839 p = xdr_encode_opaque(p, machine_name, len); 1843 p = xdr_encode_opaque(p, machine_name, len);
1840 *p++ = cpu_to_be32(0); /* UID */ 1844 *p++ = cpu_to_be32(0); /* UID */
1841 *p++ = cpu_to_be32(0); /* GID */ 1845 *p++ = cpu_to_be32(0); /* GID */
@@ -1877,11 +1881,10 @@ static void encode_sequence(struct xdr_stream *xdr,
1877 struct nfs4_slot *slot = args->sa_slot; 1881 struct nfs4_slot *slot = args->sa_slot;
1878 __be32 *p; 1882 __be32 *p;
1879 1883
1880 if (slot == NULL)
1881 return;
1882
1883 tp = slot->table; 1884 tp = slot->table;
1884 session = tp->session; 1885 session = tp->session;
1886 if (!session)
1887 return;
1885 1888
1886 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); 1889 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
1887 1890
@@ -2062,9 +2065,9 @@ static void encode_free_stateid(struct xdr_stream *xdr,
2062static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) 2065static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
2063{ 2066{
2064#if defined(CONFIG_NFS_V4_1) 2067#if defined(CONFIG_NFS_V4_1)
2065 2068 struct nfs4_session *session = args->sa_slot->table->session;
2066 if (args->sa_slot) 2069 if (session)
2067 return args->sa_slot->table->session->clp->cl_mvops->minor_version; 2070 return session->clp->cl_mvops->minor_version;
2068#endif /* CONFIG_NFS_V4_1 */ 2071#endif /* CONFIG_NFS_V4_1 */
2069 return 0; 2072 return 0;
2070} 2073}
@@ -4649,7 +4652,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4649static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, 4652static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
4650 uint32_t *layouttype) 4653 uint32_t *layouttype)
4651{ 4654{
4652 uint32_t *p; 4655 __be32 *p;
4653 int num; 4656 int num;
4654 4657
4655 p = xdr_inline_decode(xdr, 4); 4658 p = xdr_inline_decode(xdr, 4);
@@ -5394,6 +5397,23 @@ static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_re
5394 return decode_secinfo_common(xdr, res); 5397 return decode_secinfo_common(xdr, res);
5395} 5398}
5396 5399
5400static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
5401{
5402 __be32 *p;
5403 uint32_t bitmap_words;
5404 unsigned int i;
5405
5406 p = xdr_inline_decode(xdr, 4);
5407 bitmap_words = be32_to_cpup(p++);
5408 if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
5409 return -EIO;
5410 p = xdr_inline_decode(xdr, 4 * bitmap_words);
5411 for (i = 0; i < bitmap_words; i++)
5412 op_map->u.words[i] = be32_to_cpup(p++);
5413
5414 return 0;
5415}
5416
5397static int decode_exchange_id(struct xdr_stream *xdr, 5417static int decode_exchange_id(struct xdr_stream *xdr,
5398 struct nfs41_exchange_id_res *res) 5418 struct nfs41_exchange_id_res *res)
5399{ 5419{
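decode_op_map() mirrors the encoder and caps the server-supplied word count at NFS4_OP_MAP_NUM_WORDS, so an oversized bitmap cannot overrun op_map->u.words. A more defensive sketch — not what the hunk above does — would also check xdr_inline_decode() for NULL before dereferencing, as the rest of this file does, since it returns NULL when the receive buffer runs short:

        p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                return -EIO;            /* short reply */
        bitmap_words = be32_to_cpup(p++);
        if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
                return -EIO;            /* more words than we can store */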
@@ -5417,10 +5437,22 @@ static int decode_exchange_id(struct xdr_stream *xdr,
5417 res->seqid = be32_to_cpup(p++); 5437 res->seqid = be32_to_cpup(p++);
5418 res->flags = be32_to_cpup(p++); 5438 res->flags = be32_to_cpup(p++);
5419 5439
5420 /* We ask for SP4_NONE */ 5440 res->state_protect.how = be32_to_cpup(p);
5421 dummy = be32_to_cpup(p); 5441 switch (res->state_protect.how) {
5422 if (dummy != SP4_NONE) 5442 case SP4_NONE:
5443 break;
5444 case SP4_MACH_CRED:
5445 status = decode_op_map(xdr, &res->state_protect.enforce);
5446 if (status)
5447 return status;
5448 status = decode_op_map(xdr, &res->state_protect.allow);
5449 if (status)
5450 return status;
5451 break;
5452 default:
5453 WARN_ON_ONCE(1);
5423 return -EIO; 5454 return -EIO;
5455 }
5424 5456
5425 /* server_owner4.so_minor_id */ 5457 /* server_owner4.so_minor_id */
5426 p = xdr_inline_decode(xdr, 8); 5458 p = xdr_inline_decode(xdr, 8);
@@ -5614,6 +5646,8 @@ static int decode_sequence(struct xdr_stream *xdr,
5614 5646
5615 if (res->sr_slot == NULL) 5647 if (res->sr_slot == NULL)
5616 return 0; 5648 return 0;
5649 if (!res->sr_slot->table->session)
5650 return 0;
5617 5651
5618 status = decode_op_hdr(xdr, OP_SEQUENCE); 5652 status = decode_op_hdr(xdr, OP_SEQUENCE);
5619 if (!status) 5653 if (!status)
@@ -5932,21 +5966,8 @@ out:
5932static int decode_free_stateid(struct xdr_stream *xdr, 5966static int decode_free_stateid(struct xdr_stream *xdr,
5933 struct nfs41_free_stateid_res *res) 5967 struct nfs41_free_stateid_res *res)
5934{ 5968{
5935 __be32 *p; 5969 res->status = decode_op_hdr(xdr, OP_FREE_STATEID);
5936 int status;
5937
5938 status = decode_op_hdr(xdr, OP_FREE_STATEID);
5939 if (status)
5940 return status;
5941
5942 p = xdr_inline_decode(xdr, 4);
5943 if (unlikely(!p))
5944 goto out_overflow;
5945 res->status = be32_to_cpup(p++);
5946 return res->status; 5970 return res->status;
5947out_overflow:
5948 print_overflow_msg(__func__, xdr);
5949 return -EIO;
5950} 5971}
5951#endif /* CONFIG_NFS_V4_1 */ 5972#endif /* CONFIG_NFS_V4_1 */
5952 5973
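The decode_free_stateid() rewrite pairs with the decode_free_stateid_maxsz change near the top of this patch: a FREE_STATEID reply carries nothing after the operation header, so the status is exactly the op-header status, and the old extra xdr_inline_decode() was pulling a word that is not on the wire. Per RFC 5661 the result really is status-only:

        /*
         * struct FREE_STATEID4res {
         *         nfsstat4        fsr_status;
         * };
         */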
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
new file mode 100644
index 000000000000..4eb0aead69b6
--- /dev/null
+++ b/fs/nfs/nfstrace.c
@@ -0,0 +1,9 @@
1/*
2 * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
3 */
4#include <linux/nfs_fs.h>
5#include <linux/namei.h>
6#include "internal.h"
7
8#define CREATE_TRACE_POINTS
9#include "nfstrace.h"
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
new file mode 100644
index 000000000000..89fe741e58b1
--- /dev/null
+++ b/fs/nfs/nfstrace.h
@@ -0,0 +1,729 @@
1/*
2 * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
3 */
4#undef TRACE_SYSTEM
5#define TRACE_SYSTEM nfs
6
7#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ)
8#define _TRACE_NFS_H
9
10#include <linux/tracepoint.h>
11
12#define nfs_show_file_type(ftype) \
13 __print_symbolic(ftype, \
14 { DT_UNKNOWN, "UNKNOWN" }, \
15 { DT_FIFO, "FIFO" }, \
16 { DT_CHR, "CHR" }, \
17 { DT_DIR, "DIR" }, \
18 { DT_BLK, "BLK" }, \
19 { DT_REG, "REG" }, \
20 { DT_LNK, "LNK" }, \
21 { DT_SOCK, "SOCK" }, \
22 { DT_WHT, "WHT" })
23
24#define nfs_show_cache_validity(v) \
25 __print_flags(v, "|", \
26 { NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \
27 { NFS_INO_INVALID_DATA, "INVALID_DATA" }, \
28 { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \
29 { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \
30 { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
31 { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \
32 { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
33 { NFS_INO_INVALID_LABEL, "INVALID_LABEL" })
34
35#define nfs_show_nfsi_flags(v) \
36 __print_flags(v, "|", \
37 { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
38 { 1 << NFS_INO_STALE, "STALE" }, \
39 { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
40 { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
41 { 1 << NFS_INO_COMMIT, "COMMIT" }, \
42 { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
43 { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
44
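These helpers do their formatting at trace-read time, not in the fast path: __print_symbolic() maps one value to one name, and __print_flags() decomposes a mask and joins the set bits with the given delimiter. Example renderings under the tables above (input values chosen for illustration):

        /*
         *   nfs_show_file_type(DT_REG)
         *       -> "REG"
         *   nfs_show_cache_validity(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA)
         *       -> "INVALID_ATTR|INVALID_DATA"
         */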
45DECLARE_EVENT_CLASS(nfs_inode_event,
46 TP_PROTO(
47 const struct inode *inode
48 ),
49
50 TP_ARGS(inode),
51
52 TP_STRUCT__entry(
53 __field(dev_t, dev)
54 __field(u32, fhandle)
55 __field(u64, fileid)
56 __field(u64, version)
57 ),
58
59 TP_fast_assign(
60 const struct nfs_inode *nfsi = NFS_I(inode);
61 __entry->dev = inode->i_sb->s_dev;
62 __entry->fileid = nfsi->fileid;
63 __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
64 __entry->version = inode->i_version;
65 ),
66
67 TP_printk(
68 "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ",
69 MAJOR(__entry->dev), MINOR(__entry->dev),
70 (unsigned long long)__entry->fileid,
71 __entry->fhandle,
72 (unsigned long long)__entry->version
73 )
74);
75
76DECLARE_EVENT_CLASS(nfs_inode_event_done,
77 TP_PROTO(
78 const struct inode *inode,
79 int error
80 ),
81
82 TP_ARGS(inode, error),
83
84 TP_STRUCT__entry(
85 __field(int, error)
86 __field(dev_t, dev)
87 __field(u32, fhandle)
88 __field(unsigned char, type)
89 __field(u64, fileid)
90 __field(u64, version)
91 __field(loff_t, size)
92 __field(unsigned long, nfsi_flags)
93 __field(unsigned long, cache_validity)
94 ),
95
96 TP_fast_assign(
97 const struct nfs_inode *nfsi = NFS_I(inode);
98 __entry->error = error;
99 __entry->dev = inode->i_sb->s_dev;
100 __entry->fileid = nfsi->fileid;
101 __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
102 __entry->type = nfs_umode_to_dtype(inode->i_mode);
103 __entry->version = inode->i_version;
104 __entry->size = i_size_read(inode);
105 __entry->nfsi_flags = nfsi->flags;
106 __entry->cache_validity = nfsi->cache_validity;
107 ),
108
109 TP_printk(
110 "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
111 "type=%u (%s) version=%llu size=%lld "
112 "cache_validity=%lu (%s) nfs_flags=%ld (%s)",
113 __entry->error,
114 MAJOR(__entry->dev), MINOR(__entry->dev),
115 (unsigned long long)__entry->fileid,
116 __entry->fhandle,
117 __entry->type,
118 nfs_show_file_type(__entry->type),
119 (unsigned long long)__entry->version,
120 (long long)__entry->size,
121 __entry->cache_validity,
122 nfs_show_cache_validity(__entry->cache_validity),
123 __entry->nfsi_flags,
124 nfs_show_nfsi_flags(__entry->nfsi_flags)
125 )
126);
127
128#define DEFINE_NFS_INODE_EVENT(name) \
129 DEFINE_EVENT(nfs_inode_event, name, \
130 TP_PROTO( \
131 const struct inode *inode \
132 ), \
133 TP_ARGS(inode))
134#define DEFINE_NFS_INODE_EVENT_DONE(name) \
135 DEFINE_EVENT(nfs_inode_event_done, name, \
136 TP_PROTO( \
137 const struct inode *inode, \
138 int error \
139 ), \
140 TP_ARGS(inode, error))
141DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
142DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
143DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
144DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit);
145DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter);
146DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit);
147DEFINE_NFS_INODE_EVENT(nfs_getattr_enter);
148DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit);
149DEFINE_NFS_INODE_EVENT(nfs_setattr_enter);
150DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit);
151DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter);
152DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit);
153DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter);
154DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
155DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
156DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
157DEFINE_NFS_INODE_EVENT(nfs_access_enter);
158DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit);
159
160#define show_lookup_flags(flags) \
161 __print_flags((unsigned long)flags, "|", \
162 { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
163 { LOOKUP_DIRECTORY, "DIRECTORY" }, \
164 { LOOKUP_OPEN, "OPEN" }, \
165 { LOOKUP_CREATE, "CREATE" }, \
166 { LOOKUP_EXCL, "EXCL" })
167
168DECLARE_EVENT_CLASS(nfs_lookup_event,
169 TP_PROTO(
170 const struct inode *dir,
171 const struct dentry *dentry,
172 unsigned int flags
173 ),
174
175 TP_ARGS(dir, dentry, flags),
176
177 TP_STRUCT__entry(
178 __field(unsigned int, flags)
179 __field(dev_t, dev)
180 __field(u64, dir)
181 __string(name, dentry->d_name.name)
182 ),
183
184 TP_fast_assign(
185 __entry->dev = dir->i_sb->s_dev;
186 __entry->dir = NFS_FILEID(dir);
187 __entry->flags = flags;
188 __assign_str(name, dentry->d_name.name);
189 ),
190
191 TP_printk(
192 "flags=%u (%s) name=%02x:%02x:%llu/%s",
193 __entry->flags,
194 show_lookup_flags(__entry->flags),
195 MAJOR(__entry->dev), MINOR(__entry->dev),
196 (unsigned long long)__entry->dir,
197 __get_str(name)
198 )
199);
200
201#define DEFINE_NFS_LOOKUP_EVENT(name) \
202 DEFINE_EVENT(nfs_lookup_event, name, \
203 TP_PROTO( \
204 const struct inode *dir, \
205 const struct dentry *dentry, \
206 unsigned int flags \
207 ), \
208 TP_ARGS(dir, dentry, flags))
209
210DECLARE_EVENT_CLASS(nfs_lookup_event_done,
211 TP_PROTO(
212 const struct inode *dir,
213 const struct dentry *dentry,
214 unsigned int flags,
215 int error
216 ),
217
218 TP_ARGS(dir, dentry, flags, error),
219
220 TP_STRUCT__entry(
221 __field(int, error)
222 __field(unsigned int, flags)
223 __field(dev_t, dev)
224 __field(u64, dir)
225 __string(name, dentry->d_name.name)
226 ),
227
228 TP_fast_assign(
229 __entry->dev = dir->i_sb->s_dev;
230 __entry->dir = NFS_FILEID(dir);
231 __entry->error = error;
232 __entry->flags = flags;
233 __assign_str(name, dentry->d_name.name);
234 ),
235
236 TP_printk(
237 "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s",
238 __entry->error,
239 __entry->flags,
240 show_lookup_flags(__entry->flags),
241 MAJOR(__entry->dev), MINOR(__entry->dev),
242 (unsigned long long)__entry->dir,
243 __get_str(name)
244 )
245);
246
247#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \
248 DEFINE_EVENT(nfs_lookup_event_done, name, \
249 TP_PROTO( \
250 const struct inode *dir, \
251 const struct dentry *dentry, \
252 unsigned int flags, \
253 int error \
254 ), \
255 TP_ARGS(dir, dentry, flags, error))
256
257DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter);
258DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
259DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
260DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
261
262#define show_open_flags(flags) \
263 __print_flags((unsigned long)flags, "|", \
264 { O_CREAT, "O_CREAT" }, \
265 { O_EXCL, "O_EXCL" }, \
266 { O_TRUNC, "O_TRUNC" }, \
267 { O_APPEND, "O_APPEND" }, \
268 { O_DSYNC, "O_DSYNC" }, \
269 { O_DIRECT, "O_DIRECT" }, \
270 { O_DIRECTORY, "O_DIRECTORY" })
271
272#define show_fmode_flags(mode) \
273 __print_flags(mode, "|", \
274 { ((__force unsigned long)FMODE_READ), "READ" }, \
275 { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
276 { ((__force unsigned long)FMODE_EXEC), "EXEC" })
277
278TRACE_EVENT(nfs_atomic_open_enter,
279 TP_PROTO(
280 const struct inode *dir,
281 const struct nfs_open_context *ctx,
282 unsigned int flags
283 ),
284
285 TP_ARGS(dir, ctx, flags),
286
287 TP_STRUCT__entry(
288 __field(unsigned int, flags)
289 __field(unsigned int, fmode)
290 __field(dev_t, dev)
291 __field(u64, dir)
292 __string(name, ctx->dentry->d_name.name)
293 ),
294
295 TP_fast_assign(
296 __entry->dev = dir->i_sb->s_dev;
297 __entry->dir = NFS_FILEID(dir);
298 __entry->flags = flags;
299 __entry->fmode = (__force unsigned int)ctx->mode;
300 __assign_str(name, ctx->dentry->d_name.name);
301 ),
302
303 TP_printk(
304 "flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s",
305 __entry->flags,
306 show_open_flags(__entry->flags),
307 show_fmode_flags(__entry->fmode),
308 MAJOR(__entry->dev), MINOR(__entry->dev),
309 (unsigned long long)__entry->dir,
310 __get_str(name)
311 )
312);
313
314TRACE_EVENT(nfs_atomic_open_exit,
315 TP_PROTO(
316 const struct inode *dir,
317 const struct nfs_open_context *ctx,
318 unsigned int flags,
319 int error
320 ),
321
322 TP_ARGS(dir, ctx, flags, error),
323
324 TP_STRUCT__entry(
325 __field(int, error)
326 __field(unsigned int, flags)
327 __field(unsigned int, fmode)
328 __field(dev_t, dev)
329 __field(u64, dir)
330 __string(name, ctx->dentry->d_name.name)
331 ),
332
333 TP_fast_assign(
334 __entry->error = error;
335 __entry->dev = dir->i_sb->s_dev;
336 __entry->dir = NFS_FILEID(dir);
337 __entry->flags = flags;
338 __entry->fmode = (__force unsigned int)ctx->mode;
339 __assign_str(name, ctx->dentry->d_name.name);
340 ),
341
342 TP_printk(
343 "error=%d flags=%u (%s) fmode=%s "
344 "name=%02x:%02x:%llu/%s",
345 __entry->error,
346 __entry->flags,
347 show_open_flags(__entry->flags),
348 show_fmode_flags(__entry->fmode),
349 MAJOR(__entry->dev), MINOR(__entry->dev),
350 (unsigned long long)__entry->dir,
351 __get_str(name)
352 )
353);
354
355TRACE_EVENT(nfs_create_enter,
356 TP_PROTO(
357 const struct inode *dir,
358 const struct dentry *dentry,
359 unsigned int flags
360 ),
361
362 TP_ARGS(dir, dentry, flags),
363
364 TP_STRUCT__entry(
365 __field(unsigned int, flags)
366 __field(dev_t, dev)
367 __field(u64, dir)
368 __string(name, dentry->d_name.name)
369 ),
370
371 TP_fast_assign(
372 __entry->dev = dir->i_sb->s_dev;
373 __entry->dir = NFS_FILEID(dir);
374 __entry->flags = flags;
375 __assign_str(name, dentry->d_name.name);
376 ),
377
378 TP_printk(
379 "flags=%u (%s) name=%02x:%02x:%llu/%s",
380 __entry->flags,
381 show_open_flags(__entry->flags),
382 MAJOR(__entry->dev), MINOR(__entry->dev),
383 (unsigned long long)__entry->dir,
384 __get_str(name)
385 )
386);
387
388TRACE_EVENT(nfs_create_exit,
389 TP_PROTO(
390 const struct inode *dir,
391 const struct dentry *dentry,
392 unsigned int flags,
393 int error
394 ),
395
396 TP_ARGS(dir, dentry, flags, error),
397
398 TP_STRUCT__entry(
399 __field(int, error)
400 __field(unsigned int, flags)
401 __field(dev_t, dev)
402 __field(u64, dir)
403 __string(name, dentry->d_name.name)
404 ),
405
406 TP_fast_assign(
407 __entry->error = error;
408 __entry->dev = dir->i_sb->s_dev;
409 __entry->dir = NFS_FILEID(dir);
410 __entry->flags = flags;
411 __assign_str(name, dentry->d_name.name);
412 ),
413
414 TP_printk(
415 "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s",
416 __entry->error,
417 __entry->flags,
418 show_open_flags(__entry->flags),
419 MAJOR(__entry->dev), MINOR(__entry->dev),
420 (unsigned long long)__entry->dir,
421 __get_str(name)
422 )
423);
424
425DECLARE_EVENT_CLASS(nfs_directory_event,
426 TP_PROTO(
427 const struct inode *dir,
428 const struct dentry *dentry
429 ),
430
431 TP_ARGS(dir, dentry),
432
433 TP_STRUCT__entry(
434 __field(dev_t, dev)
435 __field(u64, dir)
436 __string(name, dentry->d_name.name)
437 ),
438
439 TP_fast_assign(
440 __entry->dev = dir->i_sb->s_dev;
441 __entry->dir = NFS_FILEID(dir);
442 __assign_str(name, dentry->d_name.name);
443 ),
444
445 TP_printk(
446 "name=%02x:%02x:%llu/%s",
447 MAJOR(__entry->dev), MINOR(__entry->dev),
448 (unsigned long long)__entry->dir,
449 __get_str(name)
450 )
451);
452
453#define DEFINE_NFS_DIRECTORY_EVENT(name) \
454 DEFINE_EVENT(nfs_directory_event, name, \
455 TP_PROTO( \
456 const struct inode *dir, \
457 const struct dentry *dentry \
458 ), \
459 TP_ARGS(dir, dentry))
460
461DECLARE_EVENT_CLASS(nfs_directory_event_done,
462 TP_PROTO(
463 const struct inode *dir,
464 const struct dentry *dentry,
465 int error
466 ),
467
468 TP_ARGS(dir, dentry, error),
469
470 TP_STRUCT__entry(
471 __field(int, error)
472 __field(dev_t, dev)
473 __field(u64, dir)
474 __string(name, dentry->d_name.name)
475 ),
476
477 TP_fast_assign(
478 __entry->dev = dir->i_sb->s_dev;
479 __entry->dir = NFS_FILEID(dir);
480 __entry->error = error;
481 __assign_str(name, dentry->d_name.name);
482 ),
483
484 TP_printk(
485 "error=%d name=%02x:%02x:%llu/%s",
486 __entry->error,
487 MAJOR(__entry->dev), MINOR(__entry->dev),
488 (unsigned long long)__entry->dir,
489 __get_str(name)
490 )
491);
492
493#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \
494 DEFINE_EVENT(nfs_directory_event_done, name, \
495 TP_PROTO( \
496 const struct inode *dir, \
497 const struct dentry *dentry, \
498 int error \
499 ), \
500 TP_ARGS(dir, dentry, error))
501
502DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter);
503DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit);
504DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter);
505DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit);
506DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter);
507DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit);
508DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter);
509DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit);
510DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter);
511DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit);
512DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter);
513DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit);
514
515TRACE_EVENT(nfs_link_enter,
516 TP_PROTO(
517 const struct inode *inode,
518 const struct inode *dir,
519 const struct dentry *dentry
520 ),
521
522 TP_ARGS(inode, dir, dentry),
523
524 TP_STRUCT__entry(
525 __field(dev_t, dev)
526 __field(u64, fileid)
527 __field(u64, dir)
528 __string(name, dentry->d_name.name)
529 ),
530
531 TP_fast_assign(
532 __entry->dev = inode->i_sb->s_dev;
533 __entry->fileid = NFS_FILEID(inode);
534 __entry->dir = NFS_FILEID(dir);
535 __assign_str(name, dentry->d_name.name);
536 ),
537
538 TP_printk(
539 "fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
540 MAJOR(__entry->dev), MINOR(__entry->dev),
541 __entry->fileid,
542 MAJOR(__entry->dev), MINOR(__entry->dev),
543 (unsigned long long)__entry->dir,
544 __get_str(name)
545 )
546);
547
548TRACE_EVENT(nfs_link_exit,
549 TP_PROTO(
550 const struct inode *inode,
551 const struct inode *dir,
552 const struct dentry *dentry,
553 int error
554 ),
555
556 TP_ARGS(inode, dir, dentry, error),
557
558 TP_STRUCT__entry(
559 __field(int, error)
560 __field(dev_t, dev)
561 __field(u64, fileid)
562 __field(u64, dir)
563 __string(name, dentry->d_name.name)
564 ),
565
566 TP_fast_assign(
567 __entry->dev = inode->i_sb->s_dev;
568 __entry->fileid = NFS_FILEID(inode);
569 __entry->dir = NFS_FILEID(dir);
570 __entry->error = error;
571 __assign_str(name, dentry->d_name.name);
572 ),
573
574 TP_printk(
575 "error=%d fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
576 __entry->error,
577 MAJOR(__entry->dev), MINOR(__entry->dev),
578 __entry->fileid,
579 MAJOR(__entry->dev), MINOR(__entry->dev),
580 (unsigned long long)__entry->dir,
581 __get_str(name)
582 )
583);
584
585DECLARE_EVENT_CLASS(nfs_rename_event,
586 TP_PROTO(
587 const struct inode *old_dir,
588 const struct dentry *old_dentry,
589 const struct inode *new_dir,
590 const struct dentry *new_dentry
591 ),
592
593 TP_ARGS(old_dir, old_dentry, new_dir, new_dentry),
594
595 TP_STRUCT__entry(
596 __field(dev_t, dev)
597 __field(u64, old_dir)
598 __field(u64, new_dir)
599 __string(old_name, old_dentry->d_name.name)
600 __string(new_name, new_dentry->d_name.name)
601 ),
602
603 TP_fast_assign(
604 __entry->dev = old_dir->i_sb->s_dev;
605 __entry->old_dir = NFS_FILEID(old_dir);
606 __entry->new_dir = NFS_FILEID(new_dir);
607 __assign_str(old_name, old_dentry->d_name.name);
608 __assign_str(new_name, new_dentry->d_name.name);
609 ),
610
611 TP_printk(
612 "old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s",
613 MAJOR(__entry->dev), MINOR(__entry->dev),
614 (unsigned long long)__entry->old_dir,
615 __get_str(old_name),
616 MAJOR(__entry->dev), MINOR(__entry->dev),
617 (unsigned long long)__entry->new_dir,
618 __get_str(new_name)
619 )
620);
621#define DEFINE_NFS_RENAME_EVENT(name) \
622 DEFINE_EVENT(nfs_rename_event, name, \
623 TP_PROTO( \
624 const struct inode *old_dir, \
625 const struct dentry *old_dentry, \
626 const struct inode *new_dir, \
627 const struct dentry *new_dentry \
628 ), \
629 TP_ARGS(old_dir, old_dentry, new_dir, new_dentry))
630
631DECLARE_EVENT_CLASS(nfs_rename_event_done,
632 TP_PROTO(
633 const struct inode *old_dir,
634 const struct dentry *old_dentry,
635 const struct inode *new_dir,
636 const struct dentry *new_dentry,
637 int error
638 ),
639
640 TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error),
641
642 TP_STRUCT__entry(
643 __field(dev_t, dev)
644 __field(int, error)
645 __field(u64, old_dir)
646 __string(old_name, old_dentry->d_name.name)
647 __field(u64, new_dir)
648 __string(new_name, new_dentry->d_name.name)
649 ),
650
651 TP_fast_assign(
652 __entry->dev = old_dir->i_sb->s_dev;
653 __entry->old_dir = NFS_FILEID(old_dir);
654 __entry->new_dir = NFS_FILEID(new_dir);
655 __entry->error = error;
656 __assign_str(old_name, old_dentry->d_name.name);
657 __assign_str(new_name, new_dentry->d_name.name);
658 ),
659
660 TP_printk(
661 "error=%d old_name=%02x:%02x:%llu/%s "
662 "new_name=%02x:%02x:%llu/%s",
663 __entry->error,
664 MAJOR(__entry->dev), MINOR(__entry->dev),
665 (unsigned long long)__entry->old_dir,
666 __get_str(old_name),
667 MAJOR(__entry->dev), MINOR(__entry->dev),
668 (unsigned long long)__entry->new_dir,
669 __get_str(new_name)
670 )
671);
672#define DEFINE_NFS_RENAME_EVENT_DONE(name) \
673 DEFINE_EVENT(nfs_rename_event_done, name, \
674 TP_PROTO( \
675 const struct inode *old_dir, \
676 const struct dentry *old_dentry, \
677 const struct inode *new_dir, \
678 const struct dentry *new_dentry, \
679 int error \
680 ), \
681 TP_ARGS(old_dir, old_dentry, new_dir, \
682 new_dentry, error))
683
684DEFINE_NFS_RENAME_EVENT(nfs_rename_enter);
685DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit);
686
687DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename);
688
689TRACE_EVENT(nfs_sillyrename_unlink,
690 TP_PROTO(
691 const struct nfs_unlinkdata *data,
692 int error
693 ),
694
695 TP_ARGS(data, error),
696
697 TP_STRUCT__entry(
698 __field(dev_t, dev)
699 __field(int, error)
700 __field(u64, dir)
701 __dynamic_array(char, name, data->args.name.len + 1)
702 ),
703
704 TP_fast_assign(
705 struct inode *dir = data->dir;
706 size_t len = data->args.name.len;
707 __entry->dev = dir->i_sb->s_dev;
708 __entry->dir = NFS_FILEID(dir);
709 __entry->error = error;
710 memcpy(__get_dynamic_array(name),
711 data->args.name.name, len);
712 ((char *)__get_dynamic_array(name))[len] = 0;
713 ),
714
715 TP_printk(
716 "error=%d name=%02x:%02x:%llu/%s",
717 __entry->error,
718 MAJOR(__entry->dev), MINOR(__entry->dev),
719 (unsigned long long)__entry->dir,
720 __get_str(name)
721 )
722);
723#endif /* _TRACE_NFS_H */
724
725#undef TRACE_INCLUDE_PATH
726#define TRACE_INCLUDE_PATH .
727#define TRACE_INCLUDE_FILE nfstrace
728/* This part must be outside protection */
729#include <trace/define_trace.h>
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 29cfb7ade121..2ffebf2081ce 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -328,6 +328,19 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
328} 328}
329EXPORT_SYMBOL_GPL(nfs_pageio_init); 329EXPORT_SYMBOL_GPL(nfs_pageio_init);
330 330
331static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
332 const struct nfs_open_context *ctx2)
333{
334 return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
335}
336
337static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
338 const struct nfs_lock_context *l2)
339{
340 return l1->lockowner.l_owner == l2->lockowner.l_owner
341 && l1->lockowner.l_pid == l2->lockowner.l_pid;
342}
343
331/** 344/**
332 * nfs_can_coalesce_requests - test two requests for compatibility 345 * nfs_can_coalesce_requests - test two requests for compatibility
333 * @prev: pointer to nfs_page 346 * @prev: pointer to nfs_page
@@ -343,13 +356,10 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
343 struct nfs_page *req, 356 struct nfs_page *req,
344 struct nfs_pageio_descriptor *pgio) 357 struct nfs_pageio_descriptor *pgio)
345{ 358{
346 if (req->wb_context->cred != prev->wb_context->cred) 359 if (!nfs_match_open_context(req->wb_context, prev->wb_context))
347 return false;
348 if (req->wb_lock_context->lockowner.l_owner != prev->wb_lock_context->lockowner.l_owner)
349 return false;
350 if (req->wb_lock_context->lockowner.l_pid != prev->wb_lock_context->lockowner.l_pid)
351 return false; 360 return false;
352 if (req->wb_context->state != prev->wb_context->state) 361 if (req->wb_context->dentry->d_inode->i_flock != NULL &&
362 !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context))
353 return false; 363 return false;
354 if (req->wb_pgbase != 0) 364 if (req->wb_pgbase != 0)
355 return false; 365 return false;
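The pagelist.c hunk relaxes request coalescing: the open context (cred plus NFSv4 state) must still match, but the lock context is only compared when the inode actually has locks outstanding (d_inode->i_flock != NULL). The practical effect, assuming an unlocked file dirtied from two threads:

        /*
         * Two adjacent dirty pages, same open context, different
         * lockowner.l_pid:
         *
         *   before: l_pid mismatch -> requests never coalesce -> two WRITEs
         *   after:  no locks held  -> lock context ignored    -> one WRITE
         */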
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3a3a79d6bf15..d75d938d36cb 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -33,6 +33,7 @@
33#include "internal.h" 33#include "internal.h"
34#include "pnfs.h" 34#include "pnfs.h"
35#include "iostat.h" 35#include "iostat.h"
36#include "nfs4trace.h"
36 37
37#define NFSDBG_FACILITY NFSDBG_PNFS 38#define NFSDBG_FACILITY NFSDBG_PNFS
38#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 39#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -1526,6 +1527,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
1526{ 1527{
1527 struct nfs_pgio_header *hdr = data->header; 1528 struct nfs_pgio_header *hdr = data->header;
1528 1529
1530 trace_nfs4_pnfs_write(data, hdr->pnfs_error);
1529 if (!hdr->pnfs_error) { 1531 if (!hdr->pnfs_error) {
1530 pnfs_set_layoutcommit(data); 1532 pnfs_set_layoutcommit(data);
1531 hdr->mds_ops->rpc_call_done(&data->task, data); 1533 hdr->mds_ops->rpc_call_done(&data->task, data);
@@ -1680,6 +1682,7 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
1680{ 1682{
1681 struct nfs_pgio_header *hdr = data->header; 1683 struct nfs_pgio_header *hdr = data->header;
1682 1684
1685 trace_nfs4_pnfs_read(data, hdr->pnfs_error);
1683 if (likely(!hdr->pnfs_error)) { 1686 if (likely(!hdr->pnfs_error)) {
1684 __nfs4_read_done_cb(data); 1687 __nfs4_read_done_cb(data);
1685 hdr->mds_ops->rpc_call_done(&data->task, data); 1688 hdr->mds_ops->rpc_call_done(&data->task, data);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c041c41f7a52..a8f57c728df5 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -623,9 +623,10 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
623 msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; 623 msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
624} 624}
625 625
626static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) 626static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
627{ 627{
628 rpc_call_start(task); 628 rpc_call_start(task);
629 return 0;
629} 630}
630 631
631static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) 632static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -644,9 +645,10 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message
644 msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; 645 msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
645} 646}
646 647
647static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) 648static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
648{ 649{
649 rpc_call_start(task); 650 rpc_call_start(task);
651 return 0;
650} 652}
651 653
652static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) 654static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 70a26c651f09..31db5c366b81 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -513,9 +513,10 @@ static void nfs_readpage_release_common(void *calldata)
513void nfs_read_prepare(struct rpc_task *task, void *calldata) 513void nfs_read_prepare(struct rpc_task *task, void *calldata)
514{ 514{
515 struct nfs_read_data *data = calldata; 515 struct nfs_read_data *data = calldata;
516 NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); 516 int err;
517 if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) 517 err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
518 rpc_exit(task, -EIO); 518 if (err)
519 rpc_exit(task, err);
519} 520}
520 521
521static const struct rpc_call_ops nfs_read_common_ops = { 522static const struct rpc_call_ops nfs_read_common_ops = {
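The proc.c and read.c hunks plumb an error return through ->read_rpc_prepare (and its write counterpart): the NFSv2/v3 implementations trivially return 0, and nfs_read_prepare() now exits the task with whatever the per-version hook reports instead of open-coding the NFS_CONTEXT_BAD test. A hedged sketch of what an NFSv4 hook can then do — the v4 side is outside the hunks shown, and the function name here is invented:

        static int nfs4_read_rpc_prepare_sketch(struct rpc_task *task,
                                                struct nfs_read_data *data)
        {
                if (test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))
                        return -EIO;    /* caller turns this into rpc_exit() */
                rpc_call_start(task);   /* simplified: real v4 code also drives
                                         * session sequencing at this point */
                return 0;
        }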
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f6db66d8f647..a03b9c6f9489 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -360,7 +360,8 @@ static void unregister_nfs4_fs(void)
360#endif 360#endif
361 361
362static struct shrinker acl_shrinker = { 362static struct shrinker acl_shrinker = {
363 .shrink = nfs_access_cache_shrinker, 363 .count_objects = nfs_access_cache_count,
364 .scan_objects = nfs_access_cache_scan,
364 .seeks = DEFAULT_SEEKS, 365 .seeks = DEFAULT_SEEKS,
365}; 366};
366 367
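The shrinker hunk follows the VFS shrinker rework: the single ->shrink callback splits into a ->count_objects/->scan_objects pair, where count cheaply reports how many entries could be freed and scan frees up to sc->nr_to_scan of them, returning the number actually freed. A minimal sketch of the shape of the two hooks — only the hook names come from the hunk above; the counter and the eviction helper are assumptions:

        static unsigned long nfs_access_cache_count(struct shrinker *shrink,
                                                    struct shrink_control *sc)
        {
                /* no freeing here, just an estimate */
                return atomic_long_read(&nfs_access_nr_entries); /* assumed counter */
        }

        static unsigned long nfs_access_cache_scan(struct shrinker *shrink,
                                                   struct shrink_control *sc)
        {
                /* assumed helper: evicts entries, returns how many it freed */
                return nfs_access_cache_evict(sc->nr_to_scan);
        }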
@@ -923,7 +924,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
923 data->nfs_server.port = NFS_UNSPEC_PORT; 924 data->nfs_server.port = NFS_UNSPEC_PORT;
924 data->nfs_server.protocol = XPRT_TRANSPORT_TCP; 925 data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
925 data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR; 926 data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR;
926 data->auth_flavor_len = 1; 927 data->auth_flavor_len = 0;
927 data->minorversion = 0; 928 data->minorversion = 0;
928 data->need_mount = true; 929 data->need_mount = true;
929 data->net = current->nsproxy->net_ns; 930 data->net = current->nsproxy->net_ns;
@@ -1018,6 +1019,13 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
1018 } 1019 }
1019} 1020}
1020 1021
1022static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data,
1023 rpc_authflavor_t pseudoflavor)
1024{
1025 data->auth_flavors[0] = pseudoflavor;
1026 data->auth_flavor_len = 1;
1027}
1028
1021/* 1029/*
1022 * Parse the value of the 'sec=' option. 1030 * Parse the value of the 'sec=' option.
1023 */ 1031 */
@@ -1025,49 +1033,50 @@ static int nfs_parse_security_flavors(char *value,
1025 struct nfs_parsed_mount_data *mnt) 1033 struct nfs_parsed_mount_data *mnt)
1026{ 1034{
1027 substring_t args[MAX_OPT_ARGS]; 1035 substring_t args[MAX_OPT_ARGS];
1036 rpc_authflavor_t pseudoflavor;
1028 1037
1029 dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); 1038 dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
1030 1039
1031 switch (match_token(value, nfs_secflavor_tokens, args)) { 1040 switch (match_token(value, nfs_secflavor_tokens, args)) {
1032 case Opt_sec_none: 1041 case Opt_sec_none:
1033 mnt->auth_flavors[0] = RPC_AUTH_NULL; 1042 pseudoflavor = RPC_AUTH_NULL;
1034 break; 1043 break;
1035 case Opt_sec_sys: 1044 case Opt_sec_sys:
1036 mnt->auth_flavors[0] = RPC_AUTH_UNIX; 1045 pseudoflavor = RPC_AUTH_UNIX;
1037 break; 1046 break;
1038 case Opt_sec_krb5: 1047 case Opt_sec_krb5:
1039 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; 1048 pseudoflavor = RPC_AUTH_GSS_KRB5;
1040 break; 1049 break;
1041 case Opt_sec_krb5i: 1050 case Opt_sec_krb5i:
1042 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; 1051 pseudoflavor = RPC_AUTH_GSS_KRB5I;
1043 break; 1052 break;
1044 case Opt_sec_krb5p: 1053 case Opt_sec_krb5p:
1045 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; 1054 pseudoflavor = RPC_AUTH_GSS_KRB5P;
1046 break; 1055 break;
1047 case Opt_sec_lkey: 1056 case Opt_sec_lkey:
1048 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; 1057 pseudoflavor = RPC_AUTH_GSS_LKEY;
1049 break; 1058 break;
1050 case Opt_sec_lkeyi: 1059 case Opt_sec_lkeyi:
1051 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; 1060 pseudoflavor = RPC_AUTH_GSS_LKEYI;
1052 break; 1061 break;
1053 case Opt_sec_lkeyp: 1062 case Opt_sec_lkeyp:
1054 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; 1063 pseudoflavor = RPC_AUTH_GSS_LKEYP;
1055 break; 1064 break;
1056 case Opt_sec_spkm: 1065 case Opt_sec_spkm:
1057 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; 1066 pseudoflavor = RPC_AUTH_GSS_SPKM;
1058 break; 1067 break;
1059 case Opt_sec_spkmi: 1068 case Opt_sec_spkmi:
1060 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; 1069 pseudoflavor = RPC_AUTH_GSS_SPKMI;
1061 break; 1070 break;
1062 case Opt_sec_spkmp: 1071 case Opt_sec_spkmp:
1063 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; 1072 pseudoflavor = RPC_AUTH_GSS_SPKMP;
1064 break; 1073 break;
1065 default: 1074 default:
1066 return 0; 1075 return 0;
1067 } 1076 }
1068 1077
1069 mnt->flags |= NFS_MOUNT_SECFLAVOUR; 1078 mnt->flags |= NFS_MOUNT_SECFLAVOUR;
1070 mnt->auth_flavor_len = 1; 1079 nfs_set_auth_parsed_mount_data(mnt, pseudoflavor);
1071 return 1; 1080 return 1;
1072} 1081}
1073 1082
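The new nfs_set_auth_parsed_mount_data() helper keeps auth_flavors[0] and auth_flavor_len in step, and the default auth_flavor_len changes from 1 to 0 above, so "auth_flavor_len > 0" becomes the single test for "the user picked a flavor" (see the nfs_try_mount_request() hunk below). A tiny standalone sketch of the invariant; the types and names here are simplified stand-ins:

#include <stdio.h>

typedef unsigned int rpc_authflavor_t;

struct parsed_data {
	rpc_authflavor_t auth_flavors[1];
	unsigned int auth_flavor_len;	/* 0 == nothing specified yet */
};

static void set_auth(struct parsed_data *d, rpc_authflavor_t flavor)
{
	/* the two fields are only ever written together */
	d->auth_flavors[0] = flavor;
	d->auth_flavor_len = 1;
}

int main(void)
{
	struct parsed_data d = { { 0 }, 0 };
	printf("specified? %s\n", d.auth_flavor_len > 0 ? "yes" : "no");
	set_auth(&d, 1);		/* e.g. RPC_AUTH_UNIX */
	printf("specified? %s, flavor=%u\n",
	       d.auth_flavor_len > 0 ? "yes" : "no", d.auth_flavors[0]);
	return 0;
}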
@@ -1729,7 +1738,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
1729 * Was a sec= authflavor specified in the options? First, verify 1738 * Was a sec= authflavor specified in the options? First, verify
1730 * whether the server supports it, and then just try to use it if so. 1739 * whether the server supports it, and then just try to use it if so.
1731 */ 1740 */
1732 if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) { 1741 if (args->auth_flavor_len > 0) {
1733 status = nfs_verify_authflavor(args, authlist, authlist_len); 1742 status = nfs_verify_authflavor(args, authlist, authlist_len);
1734 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); 1743 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
1735 if (status) 1744 if (status)
@@ -1760,7 +1769,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
1760 /* Fallthrough */ 1769 /* Fallthrough */
1761 } 1770 }
1762 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); 1771 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
1763 args->auth_flavors[0] = flavor; 1772 nfs_set_auth_parsed_mount_data(args, flavor);
1764 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); 1773 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1765 if (!IS_ERR(server)) 1774 if (!IS_ERR(server))
1766 return server; 1775 return server;
@@ -1776,7 +1785,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
1776 1785
1777 /* Last chance! Try AUTH_UNIX */ 1786 /* Last chance! Try AUTH_UNIX */
1778 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); 1787 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
1779 args->auth_flavors[0] = RPC_AUTH_UNIX; 1788 nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
1780 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); 1789 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1781} 1790}
1782 1791
@@ -1893,6 +1902,7 @@ static int nfs23_validate_mount_data(void *options,
1893{ 1902{
1894 struct nfs_mount_data *data = (struct nfs_mount_data *)options; 1903 struct nfs_mount_data *data = (struct nfs_mount_data *)options;
1895 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; 1904 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
1905 int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
1896 1906
1897 if (data == NULL) 1907 if (data == NULL)
1898 goto out_no_data; 1908 goto out_no_data;
@@ -1908,6 +1918,8 @@ static int nfs23_validate_mount_data(void *options,
1908 goto out_no_v3; 1918 goto out_no_v3;
1909 data->root.size = NFS2_FHSIZE; 1919 data->root.size = NFS2_FHSIZE;
1910 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); 1920 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
1921 /* Turn off security negotiation */
1922 extra_flags |= NFS_MOUNT_SECFLAVOUR;
1911 case 4: 1923 case 4:
1912 if (data->flags & NFS_MOUNT_SECFLAVOUR) 1924 if (data->flags & NFS_MOUNT_SECFLAVOUR)
1913 goto out_no_sec; 1925 goto out_no_sec;
@@ -1935,7 +1947,7 @@ static int nfs23_validate_mount_data(void *options,
1935 * can deal with. 1947 * can deal with.
1936 */ 1948 */
1937 args->flags = data->flags & NFS_MOUNT_FLAGMASK; 1949 args->flags = data->flags & NFS_MOUNT_FLAGMASK;
1938 args->flags |= NFS_MOUNT_LEGACY_INTERFACE; 1950 args->flags |= extra_flags;
1939 args->rsize = data->rsize; 1951 args->rsize = data->rsize;
1940 args->wsize = data->wsize; 1952 args->wsize = data->wsize;
1941 args->timeo = data->timeo; 1953 args->timeo = data->timeo;
@@ -1959,9 +1971,10 @@ static int nfs23_validate_mount_data(void *options,
1959 args->namlen = data->namlen; 1971 args->namlen = data->namlen;
1960 args->bsize = data->bsize; 1972 args->bsize = data->bsize;
1961 1973
1962 args->auth_flavors[0] = RPC_AUTH_UNIX;
1963 if (data->flags & NFS_MOUNT_SECFLAVOUR) 1974 if (data->flags & NFS_MOUNT_SECFLAVOUR)
1964 args->auth_flavors[0] = data->pseudoflavor; 1975 nfs_set_auth_parsed_mount_data(args, data->pseudoflavor);
1976 else
1977 nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
1965 if (!args->nfs_server.hostname) 1978 if (!args->nfs_server.hostname)
1966 goto out_nomem; 1979 goto out_nomem;
1967 1980
@@ -2084,6 +2097,8 @@ static int nfs_validate_text_mount_data(void *options,
2084 max_namelen = NFS4_MAXNAMLEN; 2097 max_namelen = NFS4_MAXNAMLEN;
2085 max_pathlen = NFS4_MAXPATHLEN; 2098 max_pathlen = NFS4_MAXPATHLEN;
2086 nfs_validate_transport_protocol(args); 2099 nfs_validate_transport_protocol(args);
2100 if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP)
2101 goto out_invalid_transport_udp;
2087 nfs4_validate_mount_flags(args); 2102 nfs4_validate_mount_flags(args);
2088#else 2103#else
2089 goto out_v4_not_compiled; 2104 goto out_v4_not_compiled;
@@ -2106,6 +2121,10 @@ static int nfs_validate_text_mount_data(void *options,
2106out_v4_not_compiled: 2121out_v4_not_compiled:
2107 dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); 2122 dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
2108 return -EPROTONOSUPPORT; 2123 return -EPROTONOSUPPORT;
2124#else
2125out_invalid_transport_udp:
2126 dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n");
2127 return -EINVAL;
2109#endif /* !CONFIG_NFS_V4 */ 2128#endif /* !CONFIG_NFS_V4 */
2110 2129
2111out_no_address: 2130out_no_address:
@@ -2170,7 +2189,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
2170 data->rsize = nfss->rsize; 2189 data->rsize = nfss->rsize;
2171 data->wsize = nfss->wsize; 2190 data->wsize = nfss->wsize;
2172 data->retrans = nfss->client->cl_timeout->to_retries; 2191 data->retrans = nfss->client->cl_timeout->to_retries;
2173 data->auth_flavors[0] = nfss->client->cl_auth->au_flavor; 2192 nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor);
2174 data->acregmin = nfss->acregmin / HZ; 2193 data->acregmin = nfss->acregmin / HZ;
2175 data->acregmax = nfss->acregmax / HZ; 2194 data->acregmax = nfss->acregmax / HZ;
2176 data->acdirmin = nfss->acdirmin / HZ; 2195 data->acdirmin = nfss->acdirmin / HZ;
@@ -2277,6 +2296,18 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
2277 nfs_initialise_sb(sb); 2296 nfs_initialise_sb(sb);
2278} 2297}
2279 2298
2299#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
2300 | NFS_MOUNT_SECURE \
2301 | NFS_MOUNT_TCP \
2302 | NFS_MOUNT_VER3 \
2303 | NFS_MOUNT_KERBEROS \
2304 | NFS_MOUNT_NONLM \
2305 | NFS_MOUNT_BROKEN_SUID \
2306 | NFS_MOUNT_STRICTLOCK \
2307 | NFS_MOUNT_UNSHARED \
2308 | NFS_MOUNT_NORESVPORT \
2309 | NFS_MOUNT_LEGACY_INTERFACE)
2310
2280static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) 2311static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
2281{ 2312{
2282 const struct nfs_server *a = s->s_fs_info; 2313 const struct nfs_server *a = s->s_fs_info;
@@ -2287,7 +2318,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
2287 goto Ebusy; 2318 goto Ebusy;
2288 if (a->nfs_client != b->nfs_client) 2319 if (a->nfs_client != b->nfs_client)
2289 goto Ebusy; 2320 goto Ebusy;
2290 if (a->flags != b->flags) 2321 if ((a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK)
2291 goto Ebusy; 2322 goto Ebusy;
2292 if (a->wsize != b->wsize) 2323 if (a->wsize != b->wsize)
2293 goto Ebusy; 2324 goto Ebusy;
@@ -2301,7 +2332,8 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
2301 goto Ebusy; 2332 goto Ebusy;
2302 if (a->acdirmax != b->acdirmax) 2333 if (a->acdirmax != b->acdirmax)
2303 goto Ebusy; 2334 goto Ebusy;
2304 if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) 2335 if (b->flags & NFS_MOUNT_SECFLAVOUR &&
2336 clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
2305 goto Ebusy; 2337 goto Ebusy;
2306 return 1; 2338 return 1;
2307Ebusy: 2339Ebusy:
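NFS_MOUNT_CMP_FLAGMASK lists the mount flags that should not block superblock sharing, and the comparison switches from strict equality to "(a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK": XOR leaves a 1 exactly where the two flag words differ, and the AND discards differences in the masked-out bits. A quick standalone illustration with invented flag names:

#include <stdio.h>

#define FLAG_TCP	0x01	/* ignored for sharing, like NFS_MOUNT_TCP */
#define FLAG_NOAC	0x02	/* significant */
#define CMP_FLAGMASK	(~FLAG_TCP)

int main(void)
{
	unsigned int a = FLAG_NOAC | FLAG_TCP;
	unsigned int b = FLAG_NOAC;	/* differs only in the ignored bit */
	unsigned int c = FLAG_TCP;	/* differs in a significant bit */

	printf("a vs b differ? %s\n", ((a ^ b) & CMP_FLAGMASK) ? "yes" : "no");
	printf("a vs c differ? %s\n", ((a ^ c) & CMP_FLAGMASK) ? "yes" : "no");
	return 0;
}

Note that the mask is built by exclusion (~), so any future flag is treated as significant unless it is explicitly added to the list.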
@@ -2673,15 +2705,17 @@ static int nfs4_validate_mount_data(void *options,
2673 goto out_no_address; 2705 goto out_no_address;
2674 args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); 2706 args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
2675 2707
2676 args->auth_flavors[0] = RPC_AUTH_UNIX;
2677 if (data->auth_flavourlen) { 2708 if (data->auth_flavourlen) {
2709 rpc_authflavor_t pseudoflavor;
2678 if (data->auth_flavourlen > 1) 2710 if (data->auth_flavourlen > 1)
2679 goto out_inval_auth; 2711 goto out_inval_auth;
2680 if (copy_from_user(&args->auth_flavors[0], 2712 if (copy_from_user(&pseudoflavor,
2681 data->auth_flavours, 2713 data->auth_flavours,
2682 sizeof(args->auth_flavors[0]))) 2714 sizeof(pseudoflavor)))
2683 return -EFAULT; 2715 return -EFAULT;
2684 } 2716 nfs_set_auth_parsed_mount_data(args, pseudoflavor);
2717 } else
2718 nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
2685 2719
2686 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); 2720 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
2687 if (IS_ERR(c)) 2721 if (IS_ERR(c))
@@ -2715,6 +2749,8 @@ static int nfs4_validate_mount_data(void *options,
2715 args->acdirmax = data->acdirmax; 2749 args->acdirmax = data->acdirmax;
2716 args->nfs_server.protocol = data->proto; 2750 args->nfs_server.protocol = data->proto;
2717 nfs_validate_transport_protocol(args); 2751 nfs_validate_transport_protocol(args);
2752 if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP)
2753 goto out_invalid_transport_udp;
2718 2754
2719 break; 2755 break;
2720 default: 2756 default:
@@ -2735,6 +2771,10 @@ out_inval_auth:
2735out_no_address: 2771out_no_address:
2736 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); 2772 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
2737 return -EINVAL; 2773 return -EINVAL;
2774
2775out_invalid_transport_udp:
2776 dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n");
2777 return -EINVAL;
2738} 2778}
2739 2779
2740/* 2780/*
@@ -2750,6 +2790,7 @@ bool nfs4_disable_idmapping = true;
2750unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; 2790unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
2751unsigned short send_implementation_id = 1; 2791unsigned short send_implementation_id = 1;
2752char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; 2792char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
2793bool recover_lost_locks = false;
2753 2794
2754EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); 2795EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
2755EXPORT_SYMBOL_GPL(nfs_callback_tcpport); 2796EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
@@ -2758,6 +2799,7 @@ EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
2758EXPORT_SYMBOL_GPL(max_session_slots); 2799EXPORT_SYMBOL_GPL(max_session_slots);
2759EXPORT_SYMBOL_GPL(send_implementation_id); 2800EXPORT_SYMBOL_GPL(send_implementation_id);
2760EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); 2801EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
2802EXPORT_SYMBOL_GPL(recover_lost_locks);
2761 2803
2762#define NFS_CALLBACK_MAXPORTNR (65535U) 2804#define NFS_CALLBACK_MAXPORTNR (65535U)
2763 2805
@@ -2795,4 +2837,10 @@ MODULE_PARM_DESC(send_implementation_id,
2795 "Send implementation ID with NFSv4.1 exchange_id"); 2837 "Send implementation ID with NFSv4.1 exchange_id");
2796MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); 2838MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string");
2797 2839
2840module_param(recover_lost_locks, bool, 0644);
2841MODULE_PARM_DESC(recover_lost_locks,
2842 "If the server reports that a lock might be lost, "
2843 "try to recover it risking data corruption.");
2844
2845
2798#endif /* CONFIG_NFS_V4 */ 2846#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 60395ad3a2e4..bb939edd4c99 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -20,6 +20,8 @@
20#include "iostat.h" 20#include "iostat.h"
21#include "delegation.h" 21#include "delegation.h"
22 22
23#include "nfstrace.h"
24
23/** 25/**
24 * nfs_free_unlinkdata - release data from a sillydelete operation. 26 * nfs_free_unlinkdata - release data from a sillydelete operation.
25 * @data: pointer to unlink structure. 27 * @data: pointer to unlink structure.
@@ -77,6 +79,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
77 struct nfs_unlinkdata *data = calldata; 79 struct nfs_unlinkdata *data = calldata;
78 struct inode *dir = data->dir; 80 struct inode *dir = data->dir;
79 81
82 trace_nfs_sillyrename_unlink(data, task->tk_status);
80 if (!NFS_PROTO(dir)->unlink_done(task, dir)) 83 if (!NFS_PROTO(dir)->unlink_done(task, dir))
81 rpc_restart_call_prepare(task); 84 rpc_restart_call_prepare(task);
82} 85}
@@ -204,6 +207,13 @@ out_free:
204 return ret; 207 return ret;
205} 208}
206 209
210void nfs_wait_on_sillyrename(struct dentry *dentry)
211{
212 struct nfs_inode *nfsi = NFS_I(dentry->d_inode);
213
214 wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1);
215}
216
207void nfs_block_sillyrename(struct dentry *dentry) 217void nfs_block_sillyrename(struct dentry *dentry)
208{ 218{
209 struct nfs_inode *nfsi = NFS_I(dentry->d_inode); 219 struct nfs_inode *nfsi = NFS_I(dentry->d_inode);
@@ -336,6 +346,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
336 struct inode *new_dir = data->new_dir; 346 struct inode *new_dir = data->new_dir;
337 struct dentry *old_dentry = data->old_dentry; 347 struct dentry *old_dentry = data->old_dentry;
338 348
349 trace_nfs_sillyrename_rename(old_dir, old_dentry,
350 new_dir, data->new_dentry, task->tk_status);
339 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { 351 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
340 rpc_restart_call_prepare(task); 352 rpc_restart_call_prepare(task);
341 return; 353 return;
@@ -444,6 +456,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
444 return rpc_run_task(&task_setup_data); 456 return rpc_run_task(&task_setup_data);
445} 457}
446 458
459#define SILLYNAME_PREFIX ".nfs"
460#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
461#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1)
462#define SILLYNAME_COUNTER_LEN ((unsigned)sizeof(unsigned int) << 1)
463#define SILLYNAME_LEN (SILLYNAME_PREFIX_LEN + \
464 SILLYNAME_FILEID_LEN + \
465 SILLYNAME_COUNTER_LEN)
466
447/** 467/**
448 * nfs_sillyrename - Perform a silly-rename of a dentry 468 * nfs_sillyrename - Perform a silly-rename of a dentry
449 * @dir: inode of directory that contains dentry 469 * @dir: inode of directory that contains dentry
@@ -469,10 +489,8 @@ int
469nfs_sillyrename(struct inode *dir, struct dentry *dentry) 489nfs_sillyrename(struct inode *dir, struct dentry *dentry)
470{ 490{
471 static unsigned int sillycounter; 491 static unsigned int sillycounter;
472 const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; 492 unsigned char silly[SILLYNAME_LEN + 1];
473 const int countersize = sizeof(sillycounter)*2; 493 unsigned long long fileid;
474 const int slen = sizeof(".nfs")+fileidsize+countersize-1;
475 char silly[slen+1];
476 struct dentry *sdentry; 494 struct dentry *sdentry;
477 struct rpc_task *task; 495 struct rpc_task *task;
478 int error = -EIO; 496 int error = -EIO;
@@ -489,20 +507,20 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
489 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 507 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
490 goto out; 508 goto out;
491 509
492 sprintf(silly, ".nfs%*.*Lx", 510 fileid = NFS_FILEID(dentry->d_inode);
493 fileidsize, fileidsize,
494 (unsigned long long)NFS_FILEID(dentry->d_inode));
495 511
496 /* Return delegation in anticipation of the rename */ 512 /* Return delegation in anticipation of the rename */
497 NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode); 513 NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode);
498 514
499 sdentry = NULL; 515 sdentry = NULL;
500 do { 516 do {
501 char *suffix = silly + slen - countersize; 517 int slen;
502
503 dput(sdentry); 518 dput(sdentry);
504 sillycounter++; 519 sillycounter++;
505 sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); 520 slen = scnprintf(silly, sizeof(silly),
521 SILLYNAME_PREFIX "%0*llx%0*x",
522 SILLYNAME_FILEID_LEN, fileid,
523 SILLYNAME_COUNTER_LEN, sillycounter);
506 524
507 dfprintk(VFS, "NFS: trying to rename %s to %s\n", 525 dfprintk(VFS, "NFS: trying to rename %s to %s\n",
508 dentry->d_name.name, silly); 526 dentry->d_name.name, silly);
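The rewritten nfs_sillyrename() sizes the name buffer with compile-time macros: 4 prefix characters, sizeof(u64) << 1 = 16 hex digits for the fileid, and sizeof(unsigned int) << 1 = 8 for the counter, then builds the whole string with one scnprintf() per retry instead of patching a suffix in place. A compilable userspace rendering of the same format, with snprintf standing in for the kernel's scnprintf and made-up sample values:

#include <stdio.h>
#include <stdint.h>

#define SILLYNAME_PREFIX	".nfs"
#define SILLYNAME_PREFIX_LEN	((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
#define SILLYNAME_FILEID_LEN	((unsigned)sizeof(uint64_t) << 1)	/* 16 */
#define SILLYNAME_COUNTER_LEN	((unsigned)sizeof(unsigned int) << 1)	/* 8 */
#define SILLYNAME_LEN		(SILLYNAME_PREFIX_LEN + \
				 SILLYNAME_FILEID_LEN + \
				 SILLYNAME_COUNTER_LEN)

int main(void)
{
	char silly[SILLYNAME_LEN + 1];
	unsigned long long fileid = 0x1234abcdULL;	/* example values */
	unsigned int counter = 7;

	snprintf(silly, sizeof(silly), SILLYNAME_PREFIX "%0*llx%0*x",
		 SILLYNAME_FILEID_LEN, fileid,
		 SILLYNAME_COUNTER_LEN, counter);
	printf("%s\n", silly);	/* .nfs000000001234abcd00000007 */
	return 0;
}

Fixed-width, zero-padded fields keep every generated name the same length, so the buffer bound is exact and the retry loop only varies the counter digits.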
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f1bdb7254776..ac1dc331ba31 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -31,6 +31,8 @@
31#include "fscache.h" 31#include "fscache.h"
32#include "pnfs.h" 32#include "pnfs.h"
33 33
34#include "nfstrace.h"
35
34#define NFSDBG_FACILITY NFSDBG_PAGECACHE 36#define NFSDBG_FACILITY NFSDBG_PAGECACHE
35 37
36#define MIN_POOL_WRITE (32) 38#define MIN_POOL_WRITE (32)
@@ -861,7 +863,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
861 return 0; 863 return 0;
862 l_ctx = req->wb_lock_context; 864 l_ctx = req->wb_lock_context;
863 do_flush = req->wb_page != page || req->wb_context != ctx; 865 do_flush = req->wb_page != page || req->wb_context != ctx;
864 if (l_ctx) { 866 if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
865 do_flush |= l_ctx->lockowner.l_owner != current->files 867 do_flush |= l_ctx->lockowner.l_owner != current->files
866 || l_ctx->lockowner.l_pid != current->tgid; 868 || l_ctx->lockowner.l_pid != current->tgid;
867 } 869 }
@@ -874,6 +876,33 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
874} 876}
875 877
876/* 878/*
879 * Avoid buffered writes when a open context credential's key would
880 * expire soon.
881 *
882 * Returns -EACCES if the key will expire within RPC_KEY_EXPIRE_FAIL.
883 *
884 * Return 0 and set a credential flag which triggers the inode to flush
885 * and performs NFS_FILE_SYNC writes if the key will expired within
886 * RPC_KEY_EXPIRE_TIMEO.
887 */
888int
889nfs_key_timeout_notify(struct file *filp, struct inode *inode)
890{
891 struct nfs_open_context *ctx = nfs_file_open_context(filp);
892 struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
893
894 return rpcauth_key_timeout_notify(auth, ctx->cred);
895}
896
897/*
898 * Test if the open context credential key is marked to expire soon.
899 */
900bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
901{
902 return rpcauth_cred_key_to_expire(ctx->cred);
903}
904
905/*
877 * If the page cache is marked as unsafe or invalid, then we can't rely on 906 * If the page cache is marked as unsafe or invalid, then we can't rely on
878 * the PageUptodate() flag. In this case, we will need to turn off 907 * the PageUptodate() flag. In this case, we will need to turn off
879 * write optimisations that depend on the page contents being correct. 908 * write optimisations that depend on the page contents being correct.
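The comment above describes a two-tier policy around credential key expiry: within RPC_KEY_EXPIRE_FAIL of expiry a buffered write is refused outright (-EACCES), while within the longer RPC_KEY_EXPIRE_TIMEO window writes are still accepted but flagged so they get flushed synchronously while the key can still authenticate them. A userspace sketch of that shape; the threshold values and helper name are invented, only the structure follows the comment:

#include <stdio.h>

#define KEY_EXPIRE_FAIL   30	/* seconds: too late, refuse buffered writes */
#define KEY_EXPIRE_TIMEO 240	/* seconds: getting close, force sync writes */

static int key_timeout_notify(long secs_to_expiry, int *force_sync)
{
	if (secs_to_expiry <= KEY_EXPIRE_FAIL)
		return -13;		/* -EACCES */
	if (secs_to_expiry <= KEY_EXPIRE_TIMEO)
		*force_sync = 1;	/* analogous to the credential flag */
	return 0;
}

int main(void)
{
	long samples[] = { 10, 120, 3600 };
	for (int i = 0; i < 3; i++) {
		int sync = 0;
		int err = key_timeout_notify(samples[i], &sync);
		printf("%lds left: err=%d force_sync=%d\n",
		       samples[i], err, sync);
	}
	return 0;
}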
@@ -993,6 +1022,9 @@ int nfs_initiate_write(struct rpc_clnt *clnt,
993 data->args.count, 1022 data->args.count,
994 (unsigned long long)data->args.offset); 1023 (unsigned long long)data->args.offset);
995 1024
1025 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
1026 &task_setup_data.rpc_client, &msg, data);
1027
996 task = rpc_run_task(&task_setup_data); 1028 task = rpc_run_task(&task_setup_data);
997 if (IS_ERR(task)) { 1029 if (IS_ERR(task)) {
998 ret = PTR_ERR(task); 1030 ret = PTR_ERR(task);
@@ -1265,9 +1297,10 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
1265void nfs_write_prepare(struct rpc_task *task, void *calldata) 1297void nfs_write_prepare(struct rpc_task *task, void *calldata)
1266{ 1298{
1267 struct nfs_write_data *data = calldata; 1299 struct nfs_write_data *data = calldata;
1268 NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); 1300 int err;
1269 if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) 1301 err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
1270 rpc_exit(task, -EIO); 1302 if (err)
1303 rpc_exit(task, err);
1271} 1304}
1272 1305
1273void nfs_commit_prepare(struct rpc_task *task, void *calldata) 1306void nfs_commit_prepare(struct rpc_task *task, void *calldata)
@@ -1458,6 +1491,9 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1458 1491
1459 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 1492 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1460 1493
1494 nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client,
1495 NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg);
1496
1461 task = rpc_run_task(&task_setup_data); 1497 task = rpc_run_task(&task_setup_data);
1462 if (IS_ERR(task)) 1498 if (IS_ERR(task))
1463 return PTR_ERR(task); 1499 return PTR_ERR(task);
@@ -1732,8 +1768,14 @@ int nfs_wb_all(struct inode *inode)
1732 .range_start = 0, 1768 .range_start = 0,
1733 .range_end = LLONG_MAX, 1769 .range_end = LLONG_MAX,
1734 }; 1770 };
1771 int ret;
1735 1772
1736 return sync_inode(inode, &wbc); 1773 trace_nfs_writeback_inode_enter(inode);
1774
1775 ret = sync_inode(inode, &wbc);
1776
1777 trace_nfs_writeback_inode_exit(inode, ret);
1778 return ret;
1737} 1779}
1738EXPORT_SYMBOL_GPL(nfs_wb_all); 1780EXPORT_SYMBOL_GPL(nfs_wb_all);
1739 1781
@@ -1781,6 +1823,8 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1781 }; 1823 };
1782 int ret; 1824 int ret;
1783 1825
1826 trace_nfs_writeback_page_enter(inode);
1827
1784 for (;;) { 1828 for (;;) {
1785 wait_on_page_writeback(page); 1829 wait_on_page_writeback(page);
1786 if (clear_page_dirty_for_io(page)) { 1830 if (clear_page_dirty_for_io(page)) {
@@ -1789,14 +1833,15 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1789 goto out_error; 1833 goto out_error;
1790 continue; 1834 continue;
1791 } 1835 }
1836 ret = 0;
1792 if (!PagePrivate(page)) 1837 if (!PagePrivate(page))
1793 break; 1838 break;
1794 ret = nfs_commit_inode(inode, FLUSH_SYNC); 1839 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1795 if (ret < 0) 1840 if (ret < 0)
1796 goto out_error; 1841 goto out_error;
1797 } 1842 }
1798 return 0;
1799out_error: 1843out_error:
1844 trace_nfs_writeback_page_exit(inode, ret);
1800 return ret; 1845 return ret;
1801} 1846}
1802 1847
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 105a3b080d12..e0a65a9e37e9 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -173,8 +173,6 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
173 int status; 173 int status;
174 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 174 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
175 175
176 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
177
178 if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) 176 if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
179 return; 177 return;
180 if (!nn->rec_file) 178 if (!nn->rec_file)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 43f42290e5df..0874998a49cd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -368,11 +368,8 @@ static struct nfs4_delegation *
368alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh) 368alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
369{ 369{
370 struct nfs4_delegation *dp; 370 struct nfs4_delegation *dp;
371 struct nfs4_file *fp = stp->st_file;
372 371
373 dprintk("NFSD alloc_init_deleg\n"); 372 dprintk("NFSD alloc_init_deleg\n");
374 if (fp->fi_had_conflict)
375 return NULL;
376 if (num_delegations > max_delegations) 373 if (num_delegations > max_delegations)
377 return NULL; 374 return NULL;
378 dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); 375 dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
@@ -389,8 +386,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
389 INIT_LIST_HEAD(&dp->dl_perfile); 386 INIT_LIST_HEAD(&dp->dl_perfile);
390 INIT_LIST_HEAD(&dp->dl_perclnt); 387 INIT_LIST_HEAD(&dp->dl_perclnt);
391 INIT_LIST_HEAD(&dp->dl_recall_lru); 388 INIT_LIST_HEAD(&dp->dl_recall_lru);
392 get_nfs4_file(fp); 389 dp->dl_file = NULL;
393 dp->dl_file = fp;
394 dp->dl_type = NFS4_OPEN_DELEGATE_READ; 390 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
395 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 391 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
396 dp->dl_time = 0; 392 dp->dl_time = 0;
@@ -3035,7 +3031,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
3035 if (status) { 3031 if (status) {
3036 list_del_init(&dp->dl_perclnt); 3032 list_del_init(&dp->dl_perclnt);
3037 locks_free_lock(fl); 3033 locks_free_lock(fl);
3038 return -ENOMEM; 3034 return status;
3039 } 3035 }
3040 fp->fi_lease = fl; 3036 fp->fi_lease = fl;
3041 fp->fi_deleg_file = get_file(fl->fl_file); 3037 fp->fi_deleg_file = get_file(fl->fl_file);
@@ -3044,22 +3040,35 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
3044 return 0; 3040 return 0;
3045} 3041}
3046 3042
3047static int nfs4_set_delegation(struct nfs4_delegation *dp) 3043static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp)
3048{ 3044{
3049 struct nfs4_file *fp = dp->dl_file; 3045 int status;
3050 3046
3051 if (!fp->fi_lease) 3047 if (fp->fi_had_conflict)
3052 return nfs4_setlease(dp); 3048 return -EAGAIN;
3049 get_nfs4_file(fp);
3050 dp->dl_file = fp;
3051 if (!fp->fi_lease) {
3052 status = nfs4_setlease(dp);
3053 if (status)
3054 goto out_free;
3055 return 0;
3056 }
3053 spin_lock(&recall_lock); 3057 spin_lock(&recall_lock);
3054 if (fp->fi_had_conflict) { 3058 if (fp->fi_had_conflict) {
3055 spin_unlock(&recall_lock); 3059 spin_unlock(&recall_lock);
3056 return -EAGAIN; 3060 status = -EAGAIN;
3061 goto out_free;
3057 } 3062 }
3058 atomic_inc(&fp->fi_delegees); 3063 atomic_inc(&fp->fi_delegees);
3059 list_add(&dp->dl_perfile, &fp->fi_delegations); 3064 list_add(&dp->dl_perfile, &fp->fi_delegations);
3060 spin_unlock(&recall_lock); 3065 spin_unlock(&recall_lock);
3061 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); 3066 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
3062 return 0; 3067 return 0;
3068out_free:
3069 put_nfs4_file(fp);
3070 dp->dl_file = fp;
3071 return status;
3063} 3072}
3064 3073
3065static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) 3074static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
@@ -3134,7 +3143,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3134 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh); 3143 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh);
3135 if (dp == NULL) 3144 if (dp == NULL)
3136 goto out_no_deleg; 3145 goto out_no_deleg;
3137 status = nfs4_set_delegation(dp); 3146 status = nfs4_set_delegation(dp, stp->st_file);
3138 if (status) 3147 if (status)
3139 goto out_free; 3148 goto out_free;
3140 3149
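The nfs4_set_delegation() rework above takes the nfs4_file as an argument, checks fi_had_conflict before committing, then takes the file reference and sets the back-pointer up front, with an out_free label undoing the reference on either failure path (lease setup, or the re-check under recall_lock). A much-simplified userspace sketch of that acquire/attach/undo shape; locking is elided and all names are invented:

#include <stdio.h>

struct file_obj { int refcount; int had_conflict; };
struct deleg { struct file_obj *file; };

static void get_file_obj(struct file_obj *f) { f->refcount++; }
static void put_file_obj(struct file_obj *f) { f->refcount--; }

static int set_delegation(struct deleg *d, struct file_obj *f)
{
	if (f->had_conflict)
		return -11;		/* -EAGAIN, before taking the ref */
	get_file_obj(f);
	d->file = f;
	if (f->had_conflict) {		/* re-check, closing the race */
		put_file_obj(f);	/* undo on failure */
		return -11;
	}
	return 0;
}

int main(void)
{
	struct file_obj f = { 0, 0 };
	struct deleg d = { NULL };
	printf("status=%d refcount=%d\n", set_delegation(&d, &f), f.refcount);
	return 0;
}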
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c2a4701d7286..d9454fe5653f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1816,10 +1816,7 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
1816static __be32 nfsd4_encode_path(const struct path *root, 1816static __be32 nfsd4_encode_path(const struct path *root,
1817 const struct path *path, __be32 **pp, int *buflen) 1817 const struct path *path, __be32 **pp, int *buflen)
1818{ 1818{
1819 struct path cur = { 1819 struct path cur = *path;
1820 .mnt = path->mnt,
1821 .dentry = path->dentry,
1822 };
1823 __be32 *p = *pp; 1820 __be32 *p = *pp;
1824 struct dentry **components = NULL; 1821 struct dentry **components = NULL;
1825 unsigned int ncomponents = 0; 1822 unsigned int ncomponents = 0;
@@ -1859,14 +1856,19 @@ static __be32 nfsd4_encode_path(const struct path *root,
1859 1856
1860 while (ncomponents) { 1857 while (ncomponents) {
1861 struct dentry *dentry = components[ncomponents - 1]; 1858 struct dentry *dentry = components[ncomponents - 1];
1862 unsigned int len = dentry->d_name.len; 1859 unsigned int len;
1863 1860
1861 spin_lock(&dentry->d_lock);
1862 len = dentry->d_name.len;
1864 *buflen -= 4 + (XDR_QUADLEN(len) << 2); 1863 *buflen -= 4 + (XDR_QUADLEN(len) << 2);
1865 if (*buflen < 0) 1864 if (*buflen < 0) {
1865 spin_unlock(&dentry->d_lock);
1866 goto out_free; 1866 goto out_free;
1867 }
1867 WRITE32(len); 1868 WRITE32(len);
1868 WRITEMEM(dentry->d_name.name, len); 1869 WRITEMEM(dentry->d_name.name, len);
1869 dprintk("/%s", dentry->d_name.name); 1870 dprintk("/%s", dentry->d_name.name);
1871 spin_unlock(&dentry->d_lock);
1870 dput(dentry); 1872 dput(dentry);
1871 ncomponents--; 1873 ncomponents--;
1872 } 1874 }
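The nfsd4_encode_path() hunk takes d_lock around reading d_name.len and d_name.name because a concurrent rename can swap a dentry's name; reading the length and the bytes under the same lock the writer takes guarantees a consistent pair. A userspace analogue with a pthread mutex standing in for d_lock; the data and names are illustrative:

#include <stdio.h>
#include <string.h>
#include <pthread.h>

static pthread_mutex_t d_lock = PTHREAD_MUTEX_INITIALIZER;
static char d_name[64] = "oldname";
static unsigned int d_len = 7;

/* A renamer would update d_name and d_len under the same mutex. */
static void read_name(char *out, unsigned int *len)
{
	pthread_mutex_lock(&d_lock);
	*len = d_len;			/* length and bytes observed together */
	memcpy(out, d_name, d_len);
	out[d_len] = '\0';
	pthread_mutex_unlock(&d_lock);
}

int main(void)
{
	char buf[64];
	unsigned int len;
	read_name(buf, &len);
	printf("%u '%s'\n", len, buf);
	return 0;
}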
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index e76244edd748..9186c7ce0b14 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -59,11 +59,14 @@ static unsigned int longest_chain_cachesize;
59 59
60static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); 60static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
61static void cache_cleaner_func(struct work_struct *unused); 61static void cache_cleaner_func(struct work_struct *unused);
62static int nfsd_reply_cache_shrink(struct shrinker *shrink, 62static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
63 struct shrink_control *sc); 63 struct shrink_control *sc);
64static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
65 struct shrink_control *sc);
64 66
65static struct shrinker nfsd_reply_cache_shrinker = { 67static struct shrinker nfsd_reply_cache_shrinker = {
66 .shrink = nfsd_reply_cache_shrink, 68 .scan_objects = nfsd_reply_cache_scan,
69 .count_objects = nfsd_reply_cache_count,
67 .seeks = 1, 70 .seeks = 1,
68}; 71};
69 72
@@ -232,16 +235,18 @@ nfsd_cache_entry_expired(struct svc_cacherep *rp)
232 * Walk the LRU list and prune off entries that are older than RC_EXPIRE. 235 * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
233 * Also prune the oldest ones when the total exceeds the max number of entries. 236 * Also prune the oldest ones when the total exceeds the max number of entries.
234 */ 237 */
235static void 238static long
236prune_cache_entries(void) 239prune_cache_entries(void)
237{ 240{
238 struct svc_cacherep *rp, *tmp; 241 struct svc_cacherep *rp, *tmp;
242 long freed = 0;
239 243
240 list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { 244 list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
241 if (!nfsd_cache_entry_expired(rp) && 245 if (!nfsd_cache_entry_expired(rp) &&
242 num_drc_entries <= max_drc_entries) 246 num_drc_entries <= max_drc_entries)
243 break; 247 break;
244 nfsd_reply_cache_free_locked(rp); 248 nfsd_reply_cache_free_locked(rp);
249 freed++;
245 } 250 }
246 251
247 /* 252 /*
@@ -254,6 +259,7 @@ prune_cache_entries(void)
254 cancel_delayed_work(&cache_cleaner); 259 cancel_delayed_work(&cache_cleaner);
255 else 260 else
256 mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); 261 mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
262 return freed;
257} 263}
258 264
259static void 265static void
@@ -264,20 +270,28 @@ cache_cleaner_func(struct work_struct *unused)
264 spin_unlock(&cache_lock); 270 spin_unlock(&cache_lock);
265} 271}
266 272
267static int 273static unsigned long
268nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc) 274nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
269{ 275{
270 unsigned int num; 276 unsigned long num;
271 277
272 spin_lock(&cache_lock); 278 spin_lock(&cache_lock);
273 if (sc->nr_to_scan)
274 prune_cache_entries();
275 num = num_drc_entries; 279 num = num_drc_entries;
276 spin_unlock(&cache_lock); 280 spin_unlock(&cache_lock);
277 281
278 return num; 282 return num;
279} 283}
280 284
285static unsigned long
286nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
287{
288 unsigned long freed;
289
290 spin_lock(&cache_lock);
291 freed = prune_cache_entries();
292 spin_unlock(&cache_lock);
293 return freed;
294}
281/* 295/*
282 * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes 296 * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
283 */ 297 */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b1a5277cfd18..7e350c562e0e 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -254,7 +254,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
254 struct inode *inode = mapping->host; 254 struct inode *inode = mapping->host;
255 255
256 if (to > inode->i_size) { 256 if (to > inode->i_size) {
257 truncate_pagecache(inode, to, inode->i_size); 257 truncate_pagecache(inode, inode->i_size);
258 nilfs_truncate(inode); 258 nilfs_truncate(inode);
259 } 259 }
260} 260}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0ba679866e50..da276640f776 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -94,6 +94,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
94 clear_buffer_nilfs_volatile(bh); 94 clear_buffer_nilfs_volatile(bh);
95 clear_buffer_nilfs_checked(bh); 95 clear_buffer_nilfs_checked(bh);
96 clear_buffer_nilfs_redirected(bh); 96 clear_buffer_nilfs_redirected(bh);
97 clear_buffer_async_write(bh);
97 clear_buffer_dirty(bh); 98 clear_buffer_dirty(bh);
98 if (nilfs_page_buffers_clean(page)) 99 if (nilfs_page_buffers_clean(page))
99 __nilfs_clear_page_dirty(page); 100 __nilfs_clear_page_dirty(page);
@@ -429,6 +430,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
429 "discard block %llu, size %zu", 430 "discard block %llu, size %zu",
430 (u64)bh->b_blocknr, bh->b_size); 431 (u64)bh->b_blocknr, bh->b_size);
431 } 432 }
433 clear_buffer_async_write(bh);
432 clear_buffer_dirty(bh); 434 clear_buffer_dirty(bh);
433 clear_buffer_nilfs_volatile(bh); 435 clear_buffer_nilfs_volatile(bh);
434 clear_buffer_nilfs_checked(bh); 436 clear_buffer_nilfs_checked(bh);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index bd88a7461063..9f6b486b6c01 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -665,7 +665,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
665 665
666 bh = head = page_buffers(page); 666 bh = head = page_buffers(page);
667 do { 667 do {
668 if (!buffer_dirty(bh)) 668 if (!buffer_dirty(bh) || buffer_async_write(bh))
669 continue; 669 continue;
670 get_bh(bh); 670 get_bh(bh);
671 list_add_tail(&bh->b_assoc_buffers, listp); 671 list_add_tail(&bh->b_assoc_buffers, listp);
@@ -699,7 +699,8 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
699 for (i = 0; i < pagevec_count(&pvec); i++) { 699 for (i = 0; i < pagevec_count(&pvec); i++) {
700 bh = head = page_buffers(pvec.pages[i]); 700 bh = head = page_buffers(pvec.pages[i]);
701 do { 701 do {
702 if (buffer_dirty(bh)) { 702 if (buffer_dirty(bh) &&
703 !buffer_async_write(bh)) {
703 get_bh(bh); 704 get_bh(bh);
704 list_add_tail(&bh->b_assoc_buffers, 705 list_add_tail(&bh->b_assoc_buffers,
705 listp); 706 listp);
@@ -1579,6 +1580,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
1579 1580
1580 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, 1581 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1581 b_assoc_buffers) { 1582 b_assoc_buffers) {
1583 set_buffer_async_write(bh);
1582 if (bh->b_page != bd_page) { 1584 if (bh->b_page != bd_page) {
1583 if (bd_page) { 1585 if (bd_page) {
1584 lock_page(bd_page); 1586 lock_page(bd_page);
@@ -1592,6 +1594,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
1592 1594
1593 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1595 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1594 b_assoc_buffers) { 1596 b_assoc_buffers) {
1597 set_buffer_async_write(bh);
1595 if (bh == segbuf->sb_super_root) { 1598 if (bh == segbuf->sb_super_root) {
1596 if (bh->b_page != bd_page) { 1599 if (bh->b_page != bd_page) {
1597 lock_page(bd_page); 1600 lock_page(bd_page);
@@ -1677,6 +1680,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
1677 list_for_each_entry(segbuf, logs, sb_list) { 1680 list_for_each_entry(segbuf, logs, sb_list) {
1678 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, 1681 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1679 b_assoc_buffers) { 1682 b_assoc_buffers) {
1683 clear_buffer_async_write(bh);
1680 if (bh->b_page != bd_page) { 1684 if (bh->b_page != bd_page) {
1681 if (bd_page) 1685 if (bd_page)
1682 end_page_writeback(bd_page); 1686 end_page_writeback(bd_page);
@@ -1686,6 +1690,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
1686 1690
1687 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1691 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1688 b_assoc_buffers) { 1692 b_assoc_buffers) {
1693 clear_buffer_async_write(bh);
1689 if (bh == segbuf->sb_super_root) { 1694 if (bh == segbuf->sb_super_root) {
1690 if (bh->b_page != bd_page) { 1695 if (bh->b_page != bd_page) {
1691 end_page_writeback(bd_page); 1696 end_page_writeback(bd_page);
@@ -1755,6 +1760,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1755 b_assoc_buffers) { 1760 b_assoc_buffers) {
1756 set_buffer_uptodate(bh); 1761 set_buffer_uptodate(bh);
1757 clear_buffer_dirty(bh); 1762 clear_buffer_dirty(bh);
1763 clear_buffer_async_write(bh);
1758 if (bh->b_page != bd_page) { 1764 if (bh->b_page != bd_page) {
1759 if (bd_page) 1765 if (bd_page)
1760 end_page_writeback(bd_page); 1766 end_page_writeback(bd_page);
@@ -1776,6 +1782,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1776 b_assoc_buffers) { 1782 b_assoc_buffers) {
1777 set_buffer_uptodate(bh); 1783 set_buffer_uptodate(bh);
1778 clear_buffer_dirty(bh); 1784 clear_buffer_dirty(bh);
1785 clear_buffer_async_write(bh);
1779 clear_buffer_delay(bh); 1786 clear_buffer_delay(bh);
1780 clear_buffer_nilfs_volatile(bh); 1787 clear_buffer_nilfs_volatile(bh);
1781 clear_buffer_nilfs_redirected(bh); 1788 clear_buffer_nilfs_redirected(bh);
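Across the nilfs2 hunks above, the buffer's async_write bit marks blocks that are already queued in a segment write: nilfs_segctor_prepare_write() sets it, the two dirty-buffer collectors now skip marked buffers, and the abort and completion paths clear it, so a block in flight cannot be picked up for a second, overlapping submission. The sketch below compresses set-on-claim and skip-if-claimed into one collector for brevity; flag names and structure are invented:

#include <stdio.h>

#define BH_DIRTY	0x1
#define BH_ASYNC_WRITE	0x2

struct buf { int flags; };

static int collect(struct buf *b)
{
	/* mirrors: if (!buffer_dirty(bh) || buffer_async_write(bh)) continue; */
	if (!(b->flags & BH_DIRTY) || (b->flags & BH_ASYNC_WRITE))
		return 0;
	b->flags |= BH_ASYNC_WRITE;	/* claimed for this segment write */
	return 1;
}

static void complete(struct buf *b)
{
	b->flags &= ~(BH_ASYNC_WRITE | BH_DIRTY);
}

int main(void)
{
	struct buf b = { BH_DIRTY };
	printf("first pass collects:  %d\n", collect(&b));	/* 1 */
	printf("second pass collects: %d\n", collect(&b));	/* 0: in flight */
	complete(&b);
	printf("after completion:     %d\n", collect(&b));	/* 0: clean now */
	return 0;
}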
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index af3ba0478cdf..7ac2a122ca1d 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -994,23 +994,16 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
994 return ret; 994 return ret;
995} 995}
996 996
997static int nilfs_tree_was_touched(struct dentry *root_dentry)
998{
999 return d_count(root_dentry) > 1;
1000}
1001
1002/** 997/**
1003 * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint 998 * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint
1004 * @root_dentry: root dentry of the tree to be shrunk 999 * @root_dentry: root dentry of the tree to be shrunk
1005 * 1000 *
1006 * This function returns true if the tree was in-use. 1001 * This function returns true if the tree was in-use.
1007 */ 1002 */
1008static int nilfs_try_to_shrink_tree(struct dentry *root_dentry) 1003static bool nilfs_tree_is_busy(struct dentry *root_dentry)
1009{ 1004{
1010 if (have_submounts(root_dentry))
1011 return true;
1012 shrink_dcache_parent(root_dentry); 1005 shrink_dcache_parent(root_dentry);
1013 return nilfs_tree_was_touched(root_dentry); 1006 return d_count(root_dentry) > 1;
1014} 1007}
1015 1008
1016int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) 1009int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
@@ -1034,8 +1027,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
1034 if (inode) { 1027 if (inode) {
1035 dentry = d_find_alias(inode); 1028 dentry = d_find_alias(inode);
1036 if (dentry) { 1029 if (dentry) {
1037 if (nilfs_tree_was_touched(dentry)) 1030 ret = nilfs_tree_is_busy(dentry);
1038 ret = nilfs_try_to_shrink_tree(dentry);
1039 dput(dentry); 1031 dput(dentry);
1040 } 1032 }
1041 iput(inode); 1033 iput(inode);
@@ -1331,11 +1323,8 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1331 1323
1332 s->s_flags |= MS_ACTIVE; 1324 s->s_flags |= MS_ACTIVE;
1333 } else if (!sd.cno) { 1325 } else if (!sd.cno) {
1334 int busy = false; 1326 if (nilfs_tree_is_busy(s->s_root)) {
1335 1327 if ((flags ^ s->s_flags) & MS_RDONLY) {
1336 if (nilfs_tree_was_touched(s->s_root)) {
1337 busy = nilfs_try_to_shrink_tree(s->s_root);
1338 if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
1339 printk(KERN_ERR "NILFS: the device already " 1328 printk(KERN_ERR "NILFS: the device already "
1340 "has a %s mount.\n", 1329 "has a %s mount.\n",
1341 (s->s_flags & MS_RDONLY) ? 1330 (s->s_flags & MS_RDONLY) ?
@@ -1343,8 +1332,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
1343 err = -EBUSY; 1332 err = -EBUSY;
1344 goto failed_super; 1333 goto failed_super;
1345 } 1334 }
1346 } 1335 } else {
1347 if (!busy) {
1348 /* 1336 /*
1349 * Try remount to setup mount states if the current 1337 * Try remount to setup mount states if the current
1350 * tree is not mounted and only snapshots use this sb. 1338 * tree is not mounted and only snapshots use this sb.
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index c5670b8d198c..ea4ba9daeb47 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1768,7 +1768,7 @@ static void ntfs_write_failed(struct address_space *mapping, loff_t to)
1768 struct inode *inode = mapping->host; 1768 struct inode *inode = mapping->host;
1769 1769
1770 if (to > inode->i_size) { 1770 if (to > inode->i_size) {
1771 truncate_pagecache(inode, to, inode->i_size); 1771 truncate_pagecache(inode, inode->i_size);
1772 ntfs_truncate_vfs(inode); 1772 ntfs_truncate_vfs(inode);
1773 } 1773 }
1774} 1774}
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 8a404576fb26..b4f788e0ca31 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -51,10 +51,6 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
51 return ERR_PTR(-EINVAL); 51 return ERR_PTR(-EINVAL);
52 52
53 count = size / sizeof(struct posix_acl_entry); 53 count = size / sizeof(struct posix_acl_entry);
54 if (count < 0)
55 return ERR_PTR(-EINVAL);
56 if (count == 0)
57 return NULL;
58 54
59 acl = posix_acl_alloc(count, GFP_NOFS); 55 acl = posix_acl_alloc(count, GFP_NOFS);
60 if (!acl) 56 if (!acl)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 2abf97b2a592..f37d3c0e2053 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -565,9 +565,7 @@ bail:
565static void ocfs2_dio_end_io(struct kiocb *iocb, 565static void ocfs2_dio_end_io(struct kiocb *iocb,
566 loff_t offset, 566 loff_t offset,
567 ssize_t bytes, 567 ssize_t bytes,
568 void *private, 568 void *private)
569 int ret,
570 bool is_async)
571{ 569{
572 struct inode *inode = file_inode(iocb->ki_filp); 570 struct inode *inode = file_inode(iocb->ki_filp);
573 int level; 571 int level;
@@ -592,10 +590,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
592 590
593 level = ocfs2_iocb_rw_locked_level(iocb); 591 level = ocfs2_iocb_rw_locked_level(iocb);
594 ocfs2_rw_unlock(inode, level); 592 ocfs2_rw_unlock(inode, level);
595
596 inode_dio_done(inode);
597 if (is_async)
598 aio_complete(iocb, ret, 0);
599} 593}
600 594
601/* 595/*
@@ -2050,7 +2044,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2050 2044
2051out_write_size: 2045out_write_size:
2052 pos += copied; 2046 pos += copied;
2053 if (pos > inode->i_size) { 2047 if (pos > i_size_read(inode)) {
2054 i_size_write(inode, pos); 2048 i_size_write(inode, pos);
2055 mark_inode_dirty(inode); 2049 mark_inode_dirty(inode);
2056 } 2050 }
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c1c864e81cc..363f0dcc924f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -628,11 +628,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
628 struct o2nm_node *node, 628 struct o2nm_node *node,
629 int idx) 629 int idx)
630{ 630{
631 struct list_head *iter;
632 struct o2hb_callback_func *f; 631 struct o2hb_callback_func *f;
633 632
634 list_for_each(iter, &hbcall->list) { 633 list_for_each_entry(f, &hbcall->list, hc_item) {
635 f = list_entry(iter, struct o2hb_callback_func, hc_item);
636 mlog(ML_HEARTBEAT, "calling funcs %p\n", f); 634 mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
637 (f->hc_func)(node, idx, f->hc_data); 635 (f->hc_func)(node, idx, f->hc_data);
638 } 636 }
@@ -641,16 +639,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
641/* Will run the list in order until we process the passed event */ 639/* Will run the list in order until we process the passed event */
642static void o2hb_run_event_list(struct o2hb_node_event *queued_event) 640static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
643{ 641{
644 int empty;
645 struct o2hb_callback *hbcall; 642 struct o2hb_callback *hbcall;
646 struct o2hb_node_event *event; 643 struct o2hb_node_event *event;
647 644
648 spin_lock(&o2hb_live_lock);
649 empty = list_empty(&queued_event->hn_item);
650 spin_unlock(&o2hb_live_lock);
651 if (empty)
652 return;
653
654 /* Holding callback sem assures we don't alter the callback 645 /* Holding callback sem assures we don't alter the callback
655 * lists when doing this, and serializes ourselves with other 646 * lists when doing this, and serializes ourselves with other
656 * processes wanting callbacks. */ 647 * processes wanting callbacks. */
@@ -709,6 +700,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
709 struct o2hb_node_event event = 700 struct o2hb_node_event event =
710 { .hn_item = LIST_HEAD_INIT(event.hn_item), }; 701 { .hn_item = LIST_HEAD_INIT(event.hn_item), };
711 struct o2nm_node *node; 702 struct o2nm_node *node;
703 int queued = 0;
712 704
713 node = o2nm_get_node_by_num(slot->ds_node_num); 705 node = o2nm_get_node_by_num(slot->ds_node_num);
714 if (!node) 706 if (!node)
@@ -726,11 +718,13 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
726 718
727 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, 719 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
728 slot->ds_node_num); 720 slot->ds_node_num);
721 queued = 1;
729 } 722 }
730 } 723 }
731 spin_unlock(&o2hb_live_lock); 724 spin_unlock(&o2hb_live_lock);
732 725
733 o2hb_run_event_list(&event); 726 if (queued)
727 o2hb_run_event_list(&event);
734 728
735 o2nm_node_put(node); 729 o2nm_node_put(node);
736} 730}
@@ -790,6 +784,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
790 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; 784 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
791 unsigned int slot_dead_ms; 785 unsigned int slot_dead_ms;
792 int tmp; 786 int tmp;
787 int queued = 0;
793 788
794 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 789 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
795 790
@@ -883,6 +878,7 @@ fire_callbacks:
883 slot->ds_node_num); 878 slot->ds_node_num);
884 879
885 changed = 1; 880 changed = 1;
881 queued = 1;
886 } 882 }
887 883
888 list_add_tail(&slot->ds_live_item, 884 list_add_tail(&slot->ds_live_item,
@@ -934,6 +930,7 @@ fire_callbacks:
934 node, slot->ds_node_num); 930 node, slot->ds_node_num);
935 931
936 changed = 1; 932 changed = 1;
933 queued = 1;
937 } 934 }
938 935
939 /* We don't clear this because the node is still 936 /* We don't clear this because the node is still
@@ -949,7 +946,8 @@ fire_callbacks:
949out: 946out:
950 spin_unlock(&o2hb_live_lock); 947 spin_unlock(&o2hb_live_lock);
951 948
952 o2hb_run_event_list(&event); 949 if (queued)
950 o2hb_run_event_list(&event);
953 951
954 if (node) 952 if (node)
955 o2nm_node_put(node); 953 o2nm_node_put(node);
@@ -2516,8 +2514,7 @@ unlock:
2516int o2hb_register_callback(const char *region_uuid, 2514int o2hb_register_callback(const char *region_uuid,
2517 struct o2hb_callback_func *hc) 2515 struct o2hb_callback_func *hc)
2518{ 2516{
2519 struct o2hb_callback_func *tmp; 2517 struct o2hb_callback_func *f;
2520 struct list_head *iter;
2521 struct o2hb_callback *hbcall; 2518 struct o2hb_callback *hbcall;
2522 int ret; 2519 int ret;
2523 2520
@@ -2540,10 +2537,9 @@ int o2hb_register_callback(const char *region_uuid,
2540 2537
2541 down_write(&o2hb_callback_sem); 2538 down_write(&o2hb_callback_sem);
2542 2539
2543 list_for_each(iter, &hbcall->list) { 2540 list_for_each_entry(f, &hbcall->list, hc_item) {
2544 tmp = list_entry(iter, struct o2hb_callback_func, hc_item); 2541 if (hc->hc_priority < f->hc_priority) {
2545 if (hc->hc_priority < tmp->hc_priority) { 2542 list_add_tail(&hc->hc_item, &f->hc_item);
2546 list_add_tail(&hc->hc_item, iter);
2547 break; 2543 break;
2548 } 2544 }
2549 } 2545 }
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d644dc611425..2cd2406b4140 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -543,8 +543,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,
543 } 543 }
544 544
545 if (was_valid && !valid) { 545 if (was_valid && !valid) {
546 printk(KERN_NOTICE "o2net: No longer connected to " 546 if (old_sc)
547 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 547 printk(KERN_NOTICE "o2net: No longer connected to "
548 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
548 o2net_complete_nodes_nsw(nn); 549 o2net_complete_nodes_nsw(nn);
549 } 550 }
550 551
@@ -765,32 +766,32 @@ static struct o2net_msg_handler *
765o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, 766o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
766 struct rb_node **ret_parent) 767 struct rb_node **ret_parent)
767{ 768{
768 struct rb_node **p = &o2net_handler_tree.rb_node; 769 struct rb_node **p = &o2net_handler_tree.rb_node;
769 struct rb_node *parent = NULL; 770 struct rb_node *parent = NULL;
770 struct o2net_msg_handler *nmh, *ret = NULL; 771 struct o2net_msg_handler *nmh, *ret = NULL;
771 int cmp; 772 int cmp;
772 773
773 while (*p) { 774 while (*p) {
774 parent = *p; 775 parent = *p;
775 nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); 776 nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
776 cmp = o2net_handler_cmp(nmh, msg_type, key); 777 cmp = o2net_handler_cmp(nmh, msg_type, key);
777 778
778 if (cmp < 0) 779 if (cmp < 0)
779 p = &(*p)->rb_left; 780 p = &(*p)->rb_left;
780 else if (cmp > 0) 781 else if (cmp > 0)
781 p = &(*p)->rb_right; 782 p = &(*p)->rb_right;
782 else { 783 else {
783 ret = nmh; 784 ret = nmh;
784 break; 785 break;
785 } 786 }
786 } 787 }
787 788
788 if (ret_p != NULL) 789 if (ret_p != NULL)
789 *ret_p = p; 790 *ret_p = p;
790 if (ret_parent != NULL) 791 if (ret_parent != NULL)
791 *ret_parent = parent; 792 *ret_parent = parent;
792 793
793 return ret; 794 return ret;
794} 795}
795 796
796static void o2net_handler_kref_release(struct kref *kref) 797static void o2net_handler_kref_release(struct kref *kref)
@@ -1695,13 +1696,12 @@ static void o2net_start_connect(struct work_struct *work)
1695 ret = 0; 1696 ret = 0;
1696 1697
1697out: 1698out:
1698 if (ret) { 1699 if (ret && sc) {
1699 printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT 1700 printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
1700 " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); 1701 " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
1701 /* 0 err so that another will be queued and attempted 1702 /* 0 err so that another will be queued and attempted
1702 * from set_nn_state */ 1703 * from set_nn_state */
1703 if (sc) 1704 o2net_ensure_shutdown(nn, sc, 0);
1704 o2net_ensure_shutdown(nn, sc, 0);
1705 } 1705 }
1706 if (sc) 1706 if (sc)
1707 sc_put(sc); 1707 sc_put(sc);
@@ -1873,12 +1873,16 @@ static int o2net_accept_one(struct socket *sock)
1873 1873
1874 if (o2nm_this_node() >= node->nd_num) { 1874 if (o2nm_this_node() >= node->nd_num) {
1875 local_node = o2nm_get_node_by_num(o2nm_this_node()); 1875 local_node = o2nm_get_node_by_num(o2nm_this_node());
1876 printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " 1876 if (local_node)
1877 "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " 1877 printk(KERN_NOTICE "o2net: Unexpected connect attempt "
1878 "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, 1878 "seen at node '%s' (%u, %pI4:%d) from "
1879 &(local_node->nd_ipv4_address), 1879 "node '%s' (%u, %pI4:%d)\n",
1880 ntohs(local_node->nd_ipv4_port), node->nd_name, 1880 local_node->nd_name, local_node->nd_num,
1881 node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); 1881 &(local_node->nd_ipv4_address),
1882 ntohs(local_node->nd_ipv4_port),
1883 node->nd_name,
1884 node->nd_num, &sin.sin_addr.s_addr,
1885 ntohs(sin.sin_port));
1882 ret = -EINVAL; 1886 ret = -EINVAL;
1883 goto out; 1887 goto out;
1884 } 1888 }
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index ef999729e274..0d3a97d2d5f6 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -70,9 +70,10 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
70 */ 70 */
71 if (inode == NULL) { 71 if (inode == NULL) {
72 unsigned long gen = (unsigned long) dentry->d_fsdata; 72 unsigned long gen = (unsigned long) dentry->d_fsdata;
73 unsigned long pgen = 73 unsigned long pgen;
74 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; 74 spin_lock(&dentry->d_lock);
75 75 pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
76 spin_unlock(&dentry->d_lock);
76 trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, 77 trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
77 dentry->d_name.name, 78 dentry->d_name.name,
78 pgen, gen); 79 pgen, gen);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index fbec0be62326..b46278f9ae44 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -292,7 +292,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
292 struct dlm_lock *lock = NULL; 292 struct dlm_lock *lock = NULL;
293 struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; 293 struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
294 char *name; 294 char *name;
295 struct list_head *iter, *head=NULL; 295 struct list_head *head = NULL;
296 __be64 cookie; 296 __be64 cookie;
297 u32 flags; 297 u32 flags;
298 u8 node; 298 u8 node;
@@ -373,8 +373,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
373 /* try convert queue for both ast/bast */ 373 /* try convert queue for both ast/bast */
374 head = &res->converting; 374 head = &res->converting;
375 lock = NULL; 375 lock = NULL;
376 list_for_each(iter, head) { 376 list_for_each_entry(lock, head, list) {
377 lock = list_entry (iter, struct dlm_lock, list);
378 if (lock->ml.cookie == cookie) 377 if (lock->ml.cookie == cookie)
379 goto do_ast; 378 goto do_ast;
380 } 379 }
@@ -385,8 +384,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
385 else 384 else
386 head = &res->granted; 385 head = &res->granted;
387 386
388 list_for_each(iter, head) { 387 list_for_each_entry(lock, head, list) {
389 lock = list_entry (iter, struct dlm_lock, list);
390 if (lock->ml.cookie == cookie) 388 if (lock->ml.cookie == cookie)
391 goto do_ast; 389 goto do_ast;
392 } 390 }
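
This dlmast.c change is the first of a series of mechanical conversions running through the ocfs2/dlm hunks below: the open-coded list_for_each() plus list_entry() pair becomes list_for_each_entry(), which folds the container_of() step into the iterator and lets the spare struct list_head *iter cursor be dropped. The two forms side by side, reduced to a toy example (hypothetical element type; the kernel list API from <linux/list.h>):

    #include <linux/list.h>

    struct elem {
            int cookie;
            struct list_head list;
    };

    /* Old style: iterate raw list nodes, convert each one by hand. */
    static struct elem *find_old(struct list_head *head, int cookie)
    {
            struct list_head *iter;
            struct elem *e;

            list_for_each(iter, head) {
                    e = list_entry(iter, struct elem, list);
                    if (e->cookie == cookie)
                            return e;
            }
            return NULL;
    }

    /* New style: the iterator yields typed entries directly. */
    static struct elem *find_new(struct list_head *head, int cookie)
    {
            struct elem *e;

            list_for_each_entry(e, head, list) {
                    if (e->cookie == cookie)
                            return e;
            }
            return NULL;
    }
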
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index de854cca12a2..e0517762fcc0 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1079,11 +1079,9 @@ static inline int dlm_lock_compatible(int existing, int request)
1079static inline int dlm_lock_on_list(struct list_head *head, 1079static inline int dlm_lock_on_list(struct list_head *head,
1080 struct dlm_lock *lock) 1080 struct dlm_lock *lock)
1081{ 1081{
1082 struct list_head *iter;
1083 struct dlm_lock *tmplock; 1082 struct dlm_lock *tmplock;
1084 1083
1085 list_for_each(iter, head) { 1084 list_for_each_entry(tmplock, head, list) {
1086 tmplock = list_entry(iter, struct dlm_lock, list);
1087 if (tmplock == lock) 1085 if (tmplock == lock)
1088 return 1; 1086 return 1;
1089 } 1087 }
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 29a886d1e82c..e36d63ff1783 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -123,7 +123,6 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
123 int *kick_thread) 123 int *kick_thread)
124{ 124{
125 enum dlm_status status = DLM_NORMAL; 125 enum dlm_status status = DLM_NORMAL;
126 struct list_head *iter;
127 struct dlm_lock *tmplock=NULL; 126 struct dlm_lock *tmplock=NULL;
128 127
129 assert_spin_locked(&res->spinlock); 128 assert_spin_locked(&res->spinlock);
@@ -185,16 +184,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
185 184
186 /* upconvert from here on */ 185 /* upconvert from here on */
187 status = DLM_NORMAL; 186 status = DLM_NORMAL;
188 list_for_each(iter, &res->granted) { 187 list_for_each_entry(tmplock, &res->granted, list) {
189 tmplock = list_entry(iter, struct dlm_lock, list);
190 if (tmplock == lock) 188 if (tmplock == lock)
191 continue; 189 continue;
192 if (!dlm_lock_compatible(tmplock->ml.type, type)) 190 if (!dlm_lock_compatible(tmplock->ml.type, type))
193 goto switch_queues; 191 goto switch_queues;
194 } 192 }
195 193
196 list_for_each(iter, &res->converting) { 194 list_for_each_entry(tmplock, &res->converting, list) {
197 tmplock = list_entry(iter, struct dlm_lock, list);
198 if (!dlm_lock_compatible(tmplock->ml.type, type)) 195 if (!dlm_lock_compatible(tmplock->ml.type, type))
199 goto switch_queues; 196 goto switch_queues;
200 /* existing conversion requests take precedence */ 197 /* existing conversion requests take precedence */
@@ -424,8 +421,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
424 struct dlm_ctxt *dlm = data; 421 struct dlm_ctxt *dlm = data;
425 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; 422 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
426 struct dlm_lock_resource *res = NULL; 423 struct dlm_lock_resource *res = NULL;
427 struct list_head *iter;
428 struct dlm_lock *lock = NULL; 424 struct dlm_lock *lock = NULL;
425 struct dlm_lock *tmp_lock;
429 struct dlm_lockstatus *lksb; 426 struct dlm_lockstatus *lksb;
430 enum dlm_status status = DLM_NORMAL; 427 enum dlm_status status = DLM_NORMAL;
431 u32 flags; 428 u32 flags;
@@ -471,14 +468,13 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
471 dlm_error(status); 468 dlm_error(status);
472 goto leave; 469 goto leave;
473 } 470 }
474 list_for_each(iter, &res->granted) { 471 list_for_each_entry(tmp_lock, &res->granted, list) {
475 lock = list_entry(iter, struct dlm_lock, list); 472 if (tmp_lock->ml.cookie == cnv->cookie &&
476 if (lock->ml.cookie == cnv->cookie && 473 tmp_lock->ml.node == cnv->node_idx) {
477 lock->ml.node == cnv->node_idx) { 474 lock = tmp_lock;
478 dlm_lock_get(lock); 475 dlm_lock_get(lock);
479 break; 476 break;
480 } 477 }
481 lock = NULL;
482 } 478 }
483 spin_unlock(&res->spinlock); 479 spin_unlock(&res->spinlock);
484 if (!lock) { 480 if (!lock) {
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 0e28e242226d..e33cd7a3c582 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -96,7 +96,6 @@ static void __dlm_print_lock(struct dlm_lock *lock)
96 96
97void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) 97void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
98{ 98{
99 struct list_head *iter2;
100 struct dlm_lock *lock; 99 struct dlm_lock *lock;
101 char buf[DLM_LOCKID_NAME_MAX]; 100 char buf[DLM_LOCKID_NAME_MAX];
102 101
@@ -118,18 +117,15 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
118 res->inflight_locks, atomic_read(&res->asts_reserved)); 117 res->inflight_locks, atomic_read(&res->asts_reserved));
119 dlm_print_lockres_refmap(res); 118 dlm_print_lockres_refmap(res);
120 printk(" granted queue:\n"); 119 printk(" granted queue:\n");
121 list_for_each(iter2, &res->granted) { 120 list_for_each_entry(lock, &res->granted, list) {
122 lock = list_entry(iter2, struct dlm_lock, list);
123 __dlm_print_lock(lock); 121 __dlm_print_lock(lock);
124 } 122 }
125 printk(" converting queue:\n"); 123 printk(" converting queue:\n");
126 list_for_each(iter2, &res->converting) { 124 list_for_each_entry(lock, &res->converting, list) {
127 lock = list_entry(iter2, struct dlm_lock, list);
128 __dlm_print_lock(lock); 125 __dlm_print_lock(lock);
129 } 126 }
130 printk(" blocked queue:\n"); 127 printk(" blocked queue:\n");
131 list_for_each(iter2, &res->blocked) { 128 list_for_each_entry(lock, &res->blocked, list) {
132 lock = list_entry(iter2, struct dlm_lock, list);
133 __dlm_print_lock(lock); 129 __dlm_print_lock(lock);
134 } 130 }
135} 131}
@@ -446,7 +442,6 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
446{ 442{
447 struct dlm_master_list_entry *mle; 443 struct dlm_master_list_entry *mle;
448 struct hlist_head *bucket; 444 struct hlist_head *bucket;
449 struct hlist_node *list;
450 int i, out = 0; 445 int i, out = 0;
451 unsigned long total = 0, longest = 0, bucket_count = 0; 446 unsigned long total = 0, longest = 0, bucket_count = 0;
452 447
@@ -456,9 +451,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
456 spin_lock(&dlm->master_lock); 451 spin_lock(&dlm->master_lock);
457 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 452 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
458 bucket = dlm_master_hash(dlm, i); 453 bucket = dlm_master_hash(dlm, i);
459 hlist_for_each(list, bucket) { 454 hlist_for_each_entry(mle, bucket, master_hash_node) {
460 mle = hlist_entry(list, struct dlm_master_list_entry,
461 master_hash_node);
462 ++total; 455 ++total;
463 ++bucket_count; 456 ++bucket_count;
464 if (len - out < 200) 457 if (len - out < 200)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index dbb17c07656a..8b3382abf840 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -193,7 +193,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
193 unsigned int hash) 193 unsigned int hash)
194{ 194{
195 struct hlist_head *bucket; 195 struct hlist_head *bucket;
196 struct hlist_node *list; 196 struct dlm_lock_resource *res;
197 197
198 mlog(0, "%.*s\n", len, name); 198 mlog(0, "%.*s\n", len, name);
199 199
@@ -201,9 +201,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
201 201
202 bucket = dlm_lockres_hash(dlm, hash); 202 bucket = dlm_lockres_hash(dlm, hash);
203 203
204 hlist_for_each(list, bucket) { 204 hlist_for_each_entry(res, bucket, hash_node) {
205 struct dlm_lock_resource *res = hlist_entry(list,
206 struct dlm_lock_resource, hash_node);
207 if (res->lockname.name[0] != name[0]) 205 if (res->lockname.name[0] != name[0])
208 continue; 206 continue;
209 if (unlikely(res->lockname.len != len)) 207 if (unlikely(res->lockname.len != len))
@@ -262,22 +260,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
262 260
263static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) 261static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
264{ 262{
265 struct dlm_ctxt *tmp = NULL; 263 struct dlm_ctxt *tmp;
266 struct list_head *iter;
267 264
268 assert_spin_locked(&dlm_domain_lock); 265 assert_spin_locked(&dlm_domain_lock);
269 266
270 /* tmp->name here is always NULL terminated, 267 /* tmp->name here is always NULL terminated,
271 * but domain may not be! */ 268 * but domain may not be! */
272 list_for_each(iter, &dlm_domains) { 269 list_for_each_entry(tmp, &dlm_domains, list) {
273 tmp = list_entry (iter, struct dlm_ctxt, list);
274 if (strlen(tmp->name) == len && 270 if (strlen(tmp->name) == len &&
275 memcmp(tmp->name, domain, len)==0) 271 memcmp(tmp->name, domain, len)==0)
276 break; 272 return tmp;
277 tmp = NULL;
278 } 273 }
279 274
280 return tmp; 275 return NULL;
281} 276}
282 277
283/* For null terminated domain strings ONLY */ 278/* For null terminated domain strings ONLY */
@@ -366,25 +361,22 @@ static void __dlm_get(struct dlm_ctxt *dlm)
366 * you shouldn't trust your pointer. */ 361 * you shouldn't trust your pointer. */
367struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) 362struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
368{ 363{
369 struct list_head *iter; 364 struct dlm_ctxt *target;
370 struct dlm_ctxt *target = NULL; 365 struct dlm_ctxt *ret = NULL;
371 366
372 spin_lock(&dlm_domain_lock); 367 spin_lock(&dlm_domain_lock);
373 368
374 list_for_each(iter, &dlm_domains) { 369 list_for_each_entry(target, &dlm_domains, list) {
375 target = list_entry (iter, struct dlm_ctxt, list);
376
377 if (target == dlm) { 370 if (target == dlm) {
378 __dlm_get(target); 371 __dlm_get(target);
372 ret = target;
379 break; 373 break;
380 } 374 }
381
382 target = NULL;
383 } 375 }
384 376
385 spin_unlock(&dlm_domain_lock); 377 spin_unlock(&dlm_domain_lock);
386 378
387 return target; 379 return ret;
388} 380}
389 381
390int dlm_domain_fully_joined(struct dlm_ctxt *dlm) 382int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
@@ -2296,13 +2288,10 @@ static DECLARE_RWSEM(dlm_callback_sem);
2296void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, 2288void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
2297 int node_num) 2289 int node_num)
2298{ 2290{
2299 struct list_head *iter;
2300 struct dlm_eviction_cb *cb; 2291 struct dlm_eviction_cb *cb;
2301 2292
2302 down_read(&dlm_callback_sem); 2293 down_read(&dlm_callback_sem);
2303 list_for_each(iter, &dlm->dlm_eviction_callbacks) { 2294 list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) {
2304 cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
2305
2306 cb->ec_func(node_num, cb->ec_data); 2295 cb->ec_func(node_num, cb->ec_data);
2307 } 2296 }
2308 up_read(&dlm_callback_sem); 2297 up_read(&dlm_callback_sem);
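
One subtlety the __dlm_lookup_domain_full() and dlm_grab() hunks above have to handle: when list_for_each_entry() runs to completion, its cursor does not end up NULL; it holds a bogus pointer computed from the list head itself. The old code reset the cursor to NULL on every non-matching iteration, a trick the typed iterator makes both unnecessary and unsafe to imitate, so the patch returns the match from inside the loop (or copies it to a dedicated ret variable) and falls through to return NULL. The idiom in sketch form, using the real struct dlm_ctxt fields:

    /* Never test the cursor after the loop; return from inside it. */
    static struct dlm_ctxt *lookup_domain(struct list_head *domains,
                                          const char *name, size_t len)
    {
            struct dlm_ctxt *tmp;

            list_for_each_entry(tmp, domains, list) {
                    if (strlen(tmp->name) == len &&
                        !memcmp(tmp->name, name, len))
                            return tmp;
            }
            return NULL;    /* full traversal: no match */
    }
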
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 47e67c2d228f..5d32f7511f74 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -91,19 +91,14 @@ void dlm_destroy_lock_cache(void)
91static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, 91static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
92 struct dlm_lock *lock) 92 struct dlm_lock *lock)
93{ 93{
94 struct list_head *iter;
95 struct dlm_lock *tmplock; 94 struct dlm_lock *tmplock;
96 95
97 list_for_each(iter, &res->granted) { 96 list_for_each_entry(tmplock, &res->granted, list) {
98 tmplock = list_entry(iter, struct dlm_lock, list);
99
100 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 97 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
101 return 0; 98 return 0;
102 } 99 }
103 100
104 list_for_each(iter, &res->converting) { 101 list_for_each_entry(tmplock, &res->converting, list) {
105 tmplock = list_entry(iter, struct dlm_lock, list);
106
107 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) 102 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
108 return 0; 103 return 0;
109 if (!dlm_lock_compatible(tmplock->ml.convert_type, 104 if (!dlm_lock_compatible(tmplock->ml.convert_type,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 33ecbe0e6734..cf0f103963b1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -342,16 +342,13 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
342{ 342{
343 struct dlm_master_list_entry *tmpmle; 343 struct dlm_master_list_entry *tmpmle;
344 struct hlist_head *bucket; 344 struct hlist_head *bucket;
345 struct hlist_node *list;
346 unsigned int hash; 345 unsigned int hash;
347 346
348 assert_spin_locked(&dlm->master_lock); 347 assert_spin_locked(&dlm->master_lock);
349 348
350 hash = dlm_lockid_hash(name, namelen); 349 hash = dlm_lockid_hash(name, namelen);
351 bucket = dlm_master_hash(dlm, hash); 350 bucket = dlm_master_hash(dlm, hash);
352 hlist_for_each(list, bucket) { 351 hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
353 tmpmle = hlist_entry(list, struct dlm_master_list_entry,
354 master_hash_node);
355 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 352 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
356 continue; 353 continue;
357 dlm_get_mle(tmpmle); 354 dlm_get_mle(tmpmle);
@@ -3183,7 +3180,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3183 struct dlm_master_list_entry *mle; 3180 struct dlm_master_list_entry *mle;
3184 struct dlm_lock_resource *res; 3181 struct dlm_lock_resource *res;
3185 struct hlist_head *bucket; 3182 struct hlist_head *bucket;
3186 struct hlist_node *list; 3183 struct hlist_node *tmp;
3187 unsigned int i; 3184 unsigned int i;
3188 3185
3189 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); 3186 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
@@ -3194,10 +3191,7 @@ top:
3194 spin_lock(&dlm->master_lock); 3191 spin_lock(&dlm->master_lock);
3195 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 3192 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3196 bucket = dlm_master_hash(dlm, i); 3193 bucket = dlm_master_hash(dlm, i);
3197 hlist_for_each(list, bucket) { 3194 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3198 mle = hlist_entry(list, struct dlm_master_list_entry,
3199 master_hash_node);
3200
3201 BUG_ON(mle->type != DLM_MLE_BLOCK && 3195 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3202 mle->type != DLM_MLE_MASTER && 3196 mle->type != DLM_MLE_MASTER &&
3203 mle->type != DLM_MLE_MIGRATION); 3197 mle->type != DLM_MLE_MIGRATION);
@@ -3378,7 +3372,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm)
3378 int i; 3372 int i;
3379 struct hlist_head *bucket; 3373 struct hlist_head *bucket;
3380 struct dlm_master_list_entry *mle; 3374 struct dlm_master_list_entry *mle;
3381 struct hlist_node *tmp, *list; 3375 struct hlist_node *tmp;
3382 3376
3383 /* 3377 /*
3384 * We notified all other nodes that we are exiting the domain and 3378 * We notified all other nodes that we are exiting the domain and
@@ -3394,9 +3388,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm)
3394 3388
3395 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 3389 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3396 bucket = dlm_master_hash(dlm, i); 3390 bucket = dlm_master_hash(dlm, i);
3397 hlist_for_each_safe(list, tmp, bucket) { 3391 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3398 mle = hlist_entry(list, struct dlm_master_list_entry,
3399 master_hash_node);
3400 if (mle->type != DLM_MLE_BLOCK) { 3392 if (mle->type != DLM_MLE_BLOCK) {
3401 mlog(ML_ERROR, "bad mle: %p\n", mle); 3393 mlog(ML_ERROR, "bad mle: %p\n", mle);
3402 dlm_print_one_mle(mle); 3394 dlm_print_one_mle(mle);
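
dlm_clean_master_list() and dlm_force_free_mles() can unlink (and, in the latter case, free) entries while walking the master hash, so both now use hlist_for_each_entry_safe(), which caches the next node in a spare struct hlist_node * before the loop body gets a chance to invalidate the current entry. Reduced to a minimal teardown sketch (hypothetical entry type; kernel context):

    #include <linux/list.h>
    #include <linux/slab.h>

    struct entry {
            struct hlist_node node;
    };

    /* "tmp" snapshots e->node.next up front, so hlist_del()/kfree()
     * on the current entry cannot derail the walk. */
    static void purge_bucket(struct hlist_head *bucket)
    {
            struct entry *e;
            struct hlist_node *tmp;

            hlist_for_each_entry_safe(e, tmp, bucket, node) {
                    hlist_del(&e->node);
                    kfree(e);
            }
    }
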
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 773bd32bfd8c..0b5adca1b178 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -787,6 +787,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
787{ 787{
788 struct dlm_lock_request lr; 788 struct dlm_lock_request lr;
789 int ret; 789 int ret;
790 int status;
790 791
791 mlog(0, "\n"); 792 mlog(0, "\n");
792 793
@@ -800,13 +801,15 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
800 801
801 // send message 802 // send message
802 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, 803 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
803 &lr, sizeof(lr), request_from, NULL); 804 &lr, sizeof(lr), request_from, &status);
804 805
805 /* negative status is handled by caller */ 806 /* negative status is handled by caller */
806 if (ret < 0) 807 if (ret < 0)
807 mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " 808 mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
808 "to recover dead node %u\n", dlm->name, ret, 809 "to recover dead node %u\n", dlm->name, ret,
809 request_from, dead_node); 810 request_from, dead_node);
811 else
812 ret = status;
810 // return from here, then 813 // return from here, then
811 // sleep until all received or error 814 // sleep until all received or error
812 return ret; 815 return ret;
@@ -2328,6 +2331,14 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2328 } else if (res->owner == dlm->node_num) { 2331 } else if (res->owner == dlm->node_num) {
2329 dlm_free_dead_locks(dlm, res, dead_node); 2332 dlm_free_dead_locks(dlm, res, dead_node);
2330 __dlm_lockres_calc_usage(dlm, res); 2333 __dlm_lockres_calc_usage(dlm, res);
2334 } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2335 if (test_bit(dead_node, res->refmap)) {
2336 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
2337 "no locks and had not purged before dying\n",
2338 dlm->name, res->lockname.len,
2339 res->lockname.name, dead_node);
2340 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2341 }
2331 } 2342 }
2332 spin_unlock(&res->spinlock); 2343 spin_unlock(&res->spinlock);
2333 } 2344 }
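
The dlm_request_all_locks() hunk stops discarding the remote side's verdict: o2net_send_message()'s last argument receives the status code returned by the message handler on the target node, which is distinct from the local return value that only reports whether the message was delivered. The resulting two-level error check, isolated from the hunk for clarity:

    int ret, status;

    ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, &lr,
                             sizeof(lr), request_from, &status);
    if (ret < 0)            /* transport failure: never delivered */
            mlog(ML_ERROR, "send failed %d\n", ret);
    else                    /* delivered: report the handler's result */
            ret = status;
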
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index e73c833fc2a1..9db869de829d 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -286,8 +286,6 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
286 struct dlm_lock_resource *res) 286 struct dlm_lock_resource *res)
287{ 287{
288 struct dlm_lock *lock, *target; 288 struct dlm_lock *lock, *target;
289 struct list_head *iter;
290 struct list_head *head;
291 int can_grant = 1; 289 int can_grant = 1;
292 290
293 /* 291 /*
@@ -314,9 +312,7 @@ converting:
314 dlm->name, res->lockname.len, res->lockname.name); 312 dlm->name, res->lockname.len, res->lockname.name);
315 BUG(); 313 BUG();
316 } 314 }
317 head = &res->granted; 315 list_for_each_entry(lock, &res->granted, list) {
318 list_for_each(iter, head) {
319 lock = list_entry(iter, struct dlm_lock, list);
320 if (lock==target) 316 if (lock==target)
321 continue; 317 continue;
322 if (!dlm_lock_compatible(lock->ml.type, 318 if (!dlm_lock_compatible(lock->ml.type,
@@ -333,9 +329,8 @@ converting:
333 target->ml.convert_type; 329 target->ml.convert_type;
334 } 330 }
335 } 331 }
336 head = &res->converting; 332
337 list_for_each(iter, head) { 333 list_for_each_entry(lock, &res->converting, list) {
338 lock = list_entry(iter, struct dlm_lock, list);
339 if (lock==target) 334 if (lock==target)
340 continue; 335 continue;
341 if (!dlm_lock_compatible(lock->ml.type, 336 if (!dlm_lock_compatible(lock->ml.type,
@@ -384,9 +379,7 @@ blocked:
384 goto leave; 379 goto leave;
385 target = list_entry(res->blocked.next, struct dlm_lock, list); 380 target = list_entry(res->blocked.next, struct dlm_lock, list);
386 381
387 head = &res->granted; 382 list_for_each_entry(lock, &res->granted, list) {
388 list_for_each(iter, head) {
389 lock = list_entry(iter, struct dlm_lock, list);
390 if (lock==target) 383 if (lock==target)
391 continue; 384 continue;
392 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { 385 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
@@ -400,9 +393,7 @@ blocked:
400 } 393 }
401 } 394 }
402 395
403 head = &res->converting; 396 list_for_each_entry(lock, &res->converting, list) {
404 list_for_each(iter, head) {
405 lock = list_entry(iter, struct dlm_lock, list);
406 if (lock==target) 397 if (lock==target)
407 continue; 398 continue;
408 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { 399 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 850aa7e87537..5698b52cf5c9 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -388,7 +388,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
388 struct dlm_ctxt *dlm = data; 388 struct dlm_ctxt *dlm = data;
389 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; 389 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
390 struct dlm_lock_resource *res = NULL; 390 struct dlm_lock_resource *res = NULL;
391 struct list_head *iter;
392 struct dlm_lock *lock = NULL; 391 struct dlm_lock *lock = NULL;
393 enum dlm_status status = DLM_NORMAL; 392 enum dlm_status status = DLM_NORMAL;
394 int found = 0, i; 393 int found = 0, i;
@@ -458,8 +457,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
458 } 457 }
459 458
460 for (i=0; i<3; i++) { 459 for (i=0; i<3; i++) {
461 list_for_each(iter, queue) { 460 list_for_each_entry(lock, queue, list) {
462 lock = list_entry(iter, struct dlm_lock, list);
463 if (lock->ml.cookie == unlock->cookie && 461 if (lock->ml.cookie == unlock->cookie &&
464 lock->ml.node == unlock->node_idx) { 462 lock->ml.node == unlock->node_idx) {
465 dlm_lock_get(lock); 463 dlm_lock_get(lock);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 12bafb7265ce..efa2b3d339e3 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -401,11 +401,8 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
401{ 401{
402 struct inode *inode = new_inode(sb); 402 struct inode *inode = new_inode(sb);
403 umode_t mode = S_IFDIR | 0755; 403 umode_t mode = S_IFDIR | 0755;
404 struct dlmfs_inode_private *ip;
405 404
406 if (inode) { 405 if (inode) {
407 ip = DLMFS_I(inode);
408
409 inode->i_ino = get_next_ino(); 406 inode->i_ino = get_next_ino();
410 inode_init_owner(inode, NULL, mode); 407 inode_init_owner(inode, NULL, mode);
411 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 408 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2487116d0d33..767370b656ca 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -781,7 +781,6 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
781 cpos = map_start >> osb->s_clustersize_bits; 781 cpos = map_start >> osb->s_clustersize_bits;
782 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, 782 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
783 map_start + map_len); 783 map_start + map_len);
784 mapping_end -= cpos;
785 is_last = 0; 784 is_last = 0;
786 while (cpos < mapping_end && !is_last) { 785 while (cpos < mapping_end && !is_last) {
787 u32 fe_flags; 786 u32 fe_flags;
@@ -852,20 +851,20 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
852 851
853 down_read(&OCFS2_I(inode)->ip_alloc_sem); 852 down_read(&OCFS2_I(inode)->ip_alloc_sem);
854 853
855 if (*offset >= inode->i_size) { 854 if (*offset >= i_size_read(inode)) {
856 ret = -ENXIO; 855 ret = -ENXIO;
857 goto out_unlock; 856 goto out_unlock;
858 } 857 }
859 858
860 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 859 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
861 if (whence == SEEK_HOLE) 860 if (whence == SEEK_HOLE)
862 *offset = inode->i_size; 861 *offset = i_size_read(inode);
863 goto out_unlock; 862 goto out_unlock;
864 } 863 }
865 864
866 clen = 0; 865 clen = 0;
867 cpos = *offset >> cs_bits; 866 cpos = *offset >> cs_bits;
868 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); 867 cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
869 868
870 while (cpos < cend && !is_last) { 869 while (cpos < cend && !is_last) {
871 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, 870 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
@@ -904,8 +903,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
904 extlen = clen; 903 extlen = clen;
905 extlen <<= cs_bits; 904 extlen <<= cs_bits;
906 905
907 if ((extoff + extlen) > inode->i_size) 906 if ((extoff + extlen) > i_size_read(inode))
908 extlen = inode->i_size - extoff; 907 extlen = i_size_read(inode) - extoff;
909 extoff += extlen; 908 extoff += extlen;
910 if (extoff > *offset) 909 if (extoff > *offset)
911 *offset = extoff; 910 *offset = extoff;
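
This extent_map.c change opens a theme that recurs through the ioctl.c, journal.c, move_extents.c and quota hunks below: bare inode->i_size reads taken without i_mutex become i_size_read(). On 32-bit SMP kernels loff_t is 64 bits wide, so an unlocked read can observe a torn, half-updated size; i_size_read() instead retries the read under the inode's i_size_seqcount until it is stable. A sketch of what the accessor does in that configuration (paraphrased from the BITS_PER_LONG==32 && CONFIG_SMP branch in include/linux/fs.h):

    static inline loff_t i_size_read_sketch(const struct inode *inode)
    {
            loff_t size;
            unsigned int seq;

            do {
                    seq = read_seqcount_begin(&inode->i_size_seqcount);
                    size = inode->i_size;
            } while (read_seqcount_retry(&inode->i_size_seqcount, seq));

            return size;
    }
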
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3261d71319ee..d71903c6068b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -671,11 +671,7 @@ restarted_transaction:
671 } else { 671 } else {
672 BUG_ON(why != RESTART_TRANS); 672 BUG_ON(why != RESTART_TRANS);
673 673
674 /* TODO: This can be more intelligent. */ 674 status = ocfs2_allocate_extend_trans(handle, 1);
675 credits = ocfs2_calc_extend_credits(osb->sb,
676 &fe->id2.i_list,
677 clusters_to_add);
678 status = ocfs2_extend_trans(handle, credits);
679 if (status < 0) { 675 if (status < 0) {
680 /* handle still has to be committed at 676 /* handle still has to be committed at
681 * this point. */ 677 * this point. */
@@ -1800,6 +1796,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1800 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1796 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1801 1797
1802out: 1798out:
1799 ocfs2_free_path(path);
1803 ocfs2_schedule_truncate_log_flush(osb, 1); 1800 ocfs2_schedule_truncate_log_flush(osb, 1);
1804 ocfs2_run_deallocs(osb, &dealloc); 1801 ocfs2_run_deallocs(osb, &dealloc);
1805 1802
@@ -2245,7 +2242,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2245 file->f_path.dentry->d_name.name, 2242 file->f_path.dentry->d_name.name,
2246 (unsigned int)nr_segs); 2243 (unsigned int)nr_segs);
2247 2244
2248 if (iocb->ki_left == 0) 2245 if (iocb->ki_nbytes == 0)
2249 return 0; 2246 return 0;
2250 2247
2251 appending = file->f_flags & O_APPEND ? 1 : 0; 2248 appending = file->f_flags & O_APPEND ? 1 : 0;
@@ -2296,7 +2293,7 @@ relock:
2296 2293
2297 can_do_direct = direct_io; 2294 can_do_direct = direct_io;
2298 ret = ocfs2_prepare_inode_for_write(file, ppos, 2295 ret = ocfs2_prepare_inode_for_write(file, ppos,
2299 iocb->ki_left, appending, 2296 iocb->ki_nbytes, appending,
2300 &can_do_direct, &has_refcount); 2297 &can_do_direct, &has_refcount);
2301 if (ret < 0) { 2298 if (ret < 0) {
2302 mlog_errno(ret); 2299 mlog_errno(ret);
@@ -2304,7 +2301,7 @@ relock:
2304 } 2301 }
2305 2302
2306 if (direct_io && !is_sync_kiocb(iocb)) 2303 if (direct_io && !is_sync_kiocb(iocb))
2307 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, 2304 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes,
2308 *ppos); 2305 *ppos);
2309 2306
2310 /* 2307 /*
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 0c60ef2d8056..fa32ce9b455d 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -303,7 +303,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
303 if (o2info_from_user(oij, req)) 303 if (o2info_from_user(oij, req))
304 goto bail; 304 goto bail;
305 305
306 oij.ij_journal_size = osb->journal->j_inode->i_size; 306 oij.ij_journal_size = i_size_read(osb->journal->j_inode);
307 307
308 o2info_set_request_filled(&oij.ij_req); 308 o2info_set_request_filled(&oij.ij_req);
309 309
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 242170d83971..44fc3e530c3d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -455,6 +455,41 @@ bail:
455 return status; 455 return status;
456} 456}
457 457
458/*
459 * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
460 * If that fails, restart the transaction & regain write access for the
461 * buffer head which is used for metadata modifications.
462 * Taken from Ext4: extend_or_restart_transaction()
463 */
464int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
465{
466 int status, old_nblks;
467
468 BUG_ON(!handle);
469
470 old_nblks = handle->h_buffer_credits;
471 trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
472
473 if (old_nblks < thresh)
474 return 0;
475
476 status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
477 if (status < 0) {
478 mlog_errno(status);
479 goto bail;
480 }
481
482 if (status > 0) {
483 status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA);
484 if (status < 0)
485 mlog_errno(status);
486 }
487
488bail:
489 return status;
490}
491
492
458struct ocfs2_triggers { 493struct ocfs2_triggers {
459 struct jbd2_buffer_trigger_type ot_triggers; 494 struct jbd2_buffer_trigger_type ot_triggers;
460 int ot_offset; 495 int ot_offset;
@@ -801,14 +836,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
801 inode_lock = 1; 836 inode_lock = 1;
802 di = (struct ocfs2_dinode *)bh->b_data; 837 di = (struct ocfs2_dinode *)bh->b_data;
803 838
804 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { 839 if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) {
805 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", 840 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
806 inode->i_size); 841 i_size_read(inode));
807 status = -EINVAL; 842 status = -EINVAL;
808 goto done; 843 goto done;
809 } 844 }
810 845
811 trace_ocfs2_journal_init(inode->i_size, 846 trace_ocfs2_journal_init(i_size_read(inode),
812 (unsigned long long)inode->i_blocks, 847 (unsigned long long)inode->i_blocks,
813 OCFS2_I(inode)->ip_clusters); 848 OCFS2_I(inode)->ip_clusters);
814 849
@@ -1096,7 +1131,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
1096 1131
1097 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 1132 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
1098 1133
1099 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); 1134 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
1100 v_blkno = 0; 1135 v_blkno = 0;
1101 while (v_blkno < num_blocks) { 1136 while (v_blkno < num_blocks) {
1102 status = ocfs2_extent_map_get_blocks(inode, v_blkno, 1137 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
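
The new ocfs2_allocate_extend_trans() builds on jbd2's extend-or-restart contract: jbd2_journal_extend() returns 0 when the running handle was grown in place, a positive value when the journal has no headroom and the handle must be restarted, and a negative errno on failure. The helper hides that distinction behind a single call, which the file.c hunk above now uses in the RESTART_TRANS path instead of recomputing credits by hand. A caller-side sketch (kernel context; "leave" is a hypothetical error label, and journal re-access of dirtied buffers is elided):

    /* Make room in the running transaction before dirtying more
     * metadata; if jbd2 restarted it, previously dirtied buffer
     * heads need their journal access re-acquired. */
    status = ocfs2_allocate_extend_trans(handle, 1);
    if (status < 0) {
            mlog_errno(status);
            /* the handle is still open and must still be committed */
            goto leave;
    }
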
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 0a992737dcaf..0b479bab3671 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -258,6 +258,17 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
258int ocfs2_commit_trans(struct ocfs2_super *osb, 258int ocfs2_commit_trans(struct ocfs2_super *osb,
259 handle_t *handle); 259 handle_t *handle);
260int ocfs2_extend_trans(handle_t *handle, int nblocks); 260int ocfs2_extend_trans(handle_t *handle, int nblocks);
261int ocfs2_allocate_extend_trans(handle_t *handle,
262 int thresh);
263
264/*
265 * Define an arbitrary limit for the amount of data we will anticipate
266 * writing to any given transaction. For unbounded transactions such as
267 * fallocate(2) we can write more than this, but we always
268 * start off at the maximum transaction size and grow the transaction
269 * optimistically as we go.
270 */
271#define OCFS2_MAX_TRANS_DATA 64U
261 272
262/* 273/*
263 * Create access is for when we get a newly created buffer and we're 274 * Create access is for when we get a newly created buffer and we're
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index aebeacd807c3..cd5496b7a0a3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -1082,7 +1082,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
1082 } 1082 }
1083 1083
1084retry_enospc: 1084retry_enospc:
1085 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits; 1085 (*ac)->ac_bits_wanted = osb->local_alloc_bits;
1086 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 1086 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1087 if (status == -ENOSPC) { 1087 if (status == -ENOSPC) {
1088 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 1088 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1154,7 +1154,7 @@ retry_enospc:
1154 OCFS2_LA_DISABLED) 1154 OCFS2_LA_DISABLED)
1155 goto bail; 1155 goto bail;
1156 1156
1157 ac->ac_bits_wanted = osb->local_alloc_default_bits; 1157 ac->ac_bits_wanted = osb->local_alloc_bits;
1158 status = ocfs2_claim_clusters(handle, ac, 1158 status = ocfs2_claim_clusters(handle, ac,
1159 osb->local_alloc_bits, 1159 osb->local_alloc_bits,
1160 &cluster_off, 1160 &cluster_off,
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 452068b45749..3d3f3c83065c 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -152,6 +152,7 @@ static int __ocfs2_move_extent(handle_t *handle,
152 } 152 }
153 153
154out: 154out:
155 ocfs2_free_path(path);
155 return ret; 156 return ret;
156} 157}
157 158
@@ -845,7 +846,7 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
845 struct ocfs2_move_extents *range = context->range; 846 struct ocfs2_move_extents *range = context->range;
846 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
847 848
848 if ((inode->i_size == 0) || (range->me_len == 0)) 849 if ((i_size_read(inode) == 0) || (range->me_len == 0))
849 return 0; 850 return 0;
850 851
851 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 852 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 3b481f490633..1b60c62aa9d6 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2579,6 +2579,8 @@ DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
2579 2579
2580DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); 2580DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
2581 2581
2582DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
2583
2582DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); 2584DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
2583 2585
2584DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); 2586DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 332a281f217e..aaa50611ec66 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -234,7 +234,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; 234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
235 } 235 }
236 236
237 if (gqinode->i_size < off + len) { 237 if (i_size_read(gqinode) < off + len) {
238 loff_t rounded_end = 238 loff_t rounded_end =
239 ocfs2_align_bytes_to_blocks(sb, off + len); 239 ocfs2_align_bytes_to_blocks(sb, off + len);
240 240
@@ -778,8 +778,8 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
778 */ 778 */
779 WARN_ON(journal_current_handle()); 779 WARN_ON(journal_current_handle());
780 status = ocfs2_extend_no_holes(gqinode, NULL, 780 status = ocfs2_extend_no_holes(gqinode, NULL,
781 gqinode->i_size + (need_alloc << sb->s_blocksize_bits), 781 i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits),
782 gqinode->i_size); 782 i_size_read(gqinode));
783 if (status < 0) 783 if (status < 0)
784 goto out_dq; 784 goto out_dq;
785 } 785 }
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 27fe7ee4874c..2e4344be3b96 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -982,14 +982,14 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
982 982
983 /* We are protected by dqio_sem so no locking needed */ 983 /* We are protected by dqio_sem so no locking needed */
984 status = ocfs2_extend_no_holes(lqinode, NULL, 984 status = ocfs2_extend_no_holes(lqinode, NULL,
985 lqinode->i_size + 2 * sb->s_blocksize, 985 i_size_read(lqinode) + 2 * sb->s_blocksize,
986 lqinode->i_size); 986 i_size_read(lqinode));
987 if (status < 0) { 987 if (status < 0) {
988 mlog_errno(status); 988 mlog_errno(status);
989 goto out; 989 goto out;
990 } 990 }
991 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, 991 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
992 lqinode->i_size + 2 * sb->s_blocksize); 992 i_size_read(lqinode) + 2 * sb->s_blocksize);
993 if (status < 0) { 993 if (status < 0) {
994 mlog_errno(status); 994 mlog_errno(status);
995 goto out; 995 goto out;
@@ -1125,14 +1125,14 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1125 1125
1126 /* We are protected by dqio_sem so no locking needed */ 1126 /* We are protected by dqio_sem so no locking needed */
1127 status = ocfs2_extend_no_holes(lqinode, NULL, 1127 status = ocfs2_extend_no_holes(lqinode, NULL,
1128 lqinode->i_size + sb->s_blocksize, 1128 i_size_read(lqinode) + sb->s_blocksize,
1129 lqinode->i_size); 1129 i_size_read(lqinode));
1130 if (status < 0) { 1130 if (status < 0) {
1131 mlog_errno(status); 1131 mlog_errno(status);
1132 goto out; 1132 goto out;
1133 } 1133 }
1134 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, 1134 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
1135 lqinode->i_size + sb->s_blocksize); 1135 i_size_read(lqinode) + sb->s_blocksize);
1136 if (status < 0) { 1136 if (status < 0) {
1137 mlog_errno(status); 1137 mlog_errno(status);
1138 goto out; 1138 goto out;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index a70d604593b6..bf4dfc14bb2c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3854,7 +3854,10 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
3854 while (cpos < clusters) { 3854 while (cpos < clusters) {
3855 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, 3855 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3856 &num_clusters, &ext_flags); 3856 &num_clusters, &ext_flags);
3857 3857 if (ret) {
3858 mlog_errno(ret);
3859 goto unlock;
3860 }
3858 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { 3861 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3859 ret = ocfs2_add_refcount_flag(inode, &di_et, 3862 ret = ocfs2_add_refcount_flag(inode, &di_et,
3860 &ref_tree->rf_ci, 3863 &ref_tree->rf_ci,
@@ -4025,7 +4028,10 @@ static int ocfs2_duplicate_extent_list(struct inode *s_inode,
4025 while (cpos < clusters) { 4028 while (cpos < clusters) {
4026 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, 4029 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
4027 &num_clusters, &ext_flags); 4030 &num_clusters, &ext_flags);
4028 4031 if (ret) {
4032 mlog_errno(ret);
4033 goto out;
4034 }
4029 if (p_cluster) { 4035 if (p_cluster) {
4030 ret = ocfs2_add_refcounted_extent(t_inode, &et, 4036 ret = ocfs2_add_refcounted_extent(t_inode, &et,
4031 ref_ci, ref_root_bh, 4037 ref_ci, ref_root_bh,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 121da2dc3be8..d4e81e4a9b04 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1924,7 +1924,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1924{ 1924{
1925 int tmp, hangup_needed = 0; 1925 int tmp, hangup_needed = 0;
1926 struct ocfs2_super *osb = NULL; 1926 struct ocfs2_super *osb = NULL;
1927 char nodestr[8]; 1927 char nodestr[12];
1928 1928
1929 trace_ocfs2_dismount_volume(sb); 1929 trace_ocfs2_dismount_volume(sb);
1930 1930
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 317ef0abccbb..6ce0686eab72 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3505,7 +3505,7 @@ int ocfs2_xattr_set(struct inode *inode,
3505 int ret, credits, ref_meta = 0, ref_credits = 0; 3505 int ret, credits, ref_meta = 0, ref_credits = 0;
3506 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3506 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3507 struct inode *tl_inode = osb->osb_tl_inode; 3507 struct inode *tl_inode = osb->osb_tl_inode;
3508 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; 3508 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
3509 struct ocfs2_refcount_tree *ref_tree = NULL; 3509 struct ocfs2_refcount_tree *ref_tree = NULL;
3510 3510
3511 struct ocfs2_xattr_info xi = { 3511 struct ocfs2_xattr_info xi = {
@@ -3609,13 +3609,14 @@ int ocfs2_xattr_set(struct inode *inode,
3609 if (IS_ERR(ctxt.handle)) { 3609 if (IS_ERR(ctxt.handle)) {
3610 ret = PTR_ERR(ctxt.handle); 3610 ret = PTR_ERR(ctxt.handle);
3611 mlog_errno(ret); 3611 mlog_errno(ret);
3612 goto cleanup; 3612 goto out_free_ac;
3613 } 3613 }
3614 3614
3615 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); 3615 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
3616 3616
3617 ocfs2_commit_trans(osb, ctxt.handle); 3617 ocfs2_commit_trans(osb, ctxt.handle);
3618 3618
3619out_free_ac:
3619 if (ctxt.data_ac) 3620 if (ctxt.data_ac)
3620 ocfs2_free_alloc_context(ctxt.data_ac); 3621 ocfs2_free_alloc_context(ctxt.data_ac);
3621 if (ctxt.meta_ac) 3622 if (ctxt.meta_ac)
@@ -5881,6 +5882,10 @@ static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
5881 while (cpos < clusters) { 5882 while (cpos < clusters) {
5882 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 5883 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
5883 &num_clusters, el, &ext_flags); 5884 &num_clusters, el, &ext_flags);
5885 if (ret) {
5886 mlog_errno(ret);
5887 break;
5888 }
5884 5889
5885 cpos += num_clusters; 5890 cpos += num_clusters;
5886 if ((ext_flags & OCFS2_EXT_REFCOUNTED)) 5891 if ((ext_flags & OCFS2_EXT_REFCOUNTED))
@@ -6797,7 +6802,7 @@ out:
6797 if (ret) { 6802 if (ret) {
6798 if (*meta_ac) { 6803 if (*meta_ac) {
6799 ocfs2_free_alloc_context(*meta_ac); 6804 ocfs2_free_alloc_context(*meta_ac);
6800 meta_ac = NULL; 6805 *meta_ac = NULL;
6801 } 6806 }
6802 } 6807 }
6803 6808
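
The final xattr.c hunk is a one-character fix for a dangling-pointer bug: meta_ac is a struct ocfs2_alloc_context **, and the old error path assigned NULL to the local double pointer rather than to the caller's pointer, so the caller could go on to use (or re-free) the context that ocfs2_free_alloc_context() had just released. The shape of the bug in isolation, as plain self-contained C:

    #include <stdlib.h>

    struct ctx { int dummy; };

    /* Correct: clear the *caller's* pointer through the double
     * pointer. Writing "ctxp = NULL" instead would only change the
     * local variable and leave the caller holding freed memory. */
    static void release_ctx(struct ctx **ctxp)
    {
            free(*ctxp);
            *ctxp = NULL;
    }
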
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index e5c7f15465b4..19f134e896a9 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -32,7 +32,7 @@ enum ocfs2_xattr_type {
32 32
33struct ocfs2_security_xattr_info { 33struct ocfs2_security_xattr_info {
34 int enable; 34 int enable;
35 char *name; 35 const char *name;
36 void *value; 36 void *value;
37 size_t value_len; 37 size_t value_len;
38}; 38};
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index e0d9b3e722bd..54d57d6ba68d 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -311,7 +311,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
311 struct inode *inode = mapping->host; 311 struct inode *inode = mapping->host;
312 312
313 if (to > inode->i_size) { 313 if (to > inode->i_size) {
314 truncate_pagecache(inode, to, inode->i_size); 314 truncate_pagecache(inode, inode->i_size);
315 omfs_truncate(inode); 315 omfs_truncate(inode);
316 } 316 }
317} 317}
diff --git a/fs/open.c b/fs/open.c
index 7931f76acc2b..d420331ca32a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -443,7 +443,7 @@ retry:
443 goto dput_and_out; 443 goto dput_and_out;
444 444
445 error = -EPERM; 445 error = -EPERM;
446 if (!nsown_capable(CAP_SYS_CHROOT)) 446 if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
447 goto dput_and_out; 447 goto dput_and_out;
448 error = security_path_chroot(&path); 448 error = security_path_chroot(&path);
449 if (error) 449 if (error)
@@ -485,14 +485,13 @@ out_unlock:
485 485
486SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) 486SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
487{ 487{
488 struct file * file; 488 struct fd f = fdget(fd);
489 int err = -EBADF; 489 int err = -EBADF;
490 490
491 file = fget(fd); 491 if (f.file) {
492 if (file) { 492 audit_inode(NULL, f.file->f_path.dentry, 0);
493 audit_inode(NULL, file->f_path.dentry, 0); 493 err = chmod_common(&f.file->f_path, mode);
494 err = chmod_common(&file->f_path, mode); 494 fdput(f);
495 fput(file);
496 } 495 }
497 return err; 496 return err;
498} 497}
@@ -745,14 +744,24 @@ cleanup_file:
745 744
746/** 745/**
747 * finish_open - finish opening a file 746 * finish_open - finish opening a file
748 * @od: opaque open data 747 * @file: file pointer
749 * @dentry: pointer to dentry 748 * @dentry: pointer to dentry
750 * @open: open callback 749 * @open: open callback
750 * @opened: state of open
751 * 751 *
752 * This can be used to finish opening a file passed to i_op->atomic_open(). 752 * This can be used to finish opening a file passed to i_op->atomic_open().
753 * 753 *
754 * If the open callback is set to NULL, then the standard f_op->open() 754 * If the open callback is set to NULL, then the standard f_op->open()
755 * filesystem callback is substituted. 755 * filesystem callback is substituted.
756 *
757 * NB: the dentry reference is _not_ consumed. If, for example, the dentry is
758 * the return value of d_splice_alias(), then the caller needs to perform dput()
759 * on it after finish_open().
760 *
761 * On successful return @file is a fully instantiated open file. After this, if
762 * an error occurs in ->atomic_open(), it needs to clean up with fput().
763 *
764 * Returns zero on success or -errno if the open failed.
756 */ 765 */
757int finish_open(struct file *file, struct dentry *dentry, 766int finish_open(struct file *file, struct dentry *dentry,
758 int (*open)(struct inode *, struct file *), 767 int (*open)(struct inode *, struct file *),
@@ -773,11 +782,16 @@ EXPORT_SYMBOL(finish_open);
773/** 782/**
774 * finish_no_open - finish ->atomic_open() without opening the file 783 * finish_no_open - finish ->atomic_open() without opening the file
775 * 784 *
776 * @od: opaque open data 785 * @file: file pointer
777 * @dentry: dentry or NULL (as returned from ->lookup()) 786 * @dentry: dentry or NULL (as returned from ->lookup())
778 * 787 *
779 * This can be used to set the result of a successful lookup in ->atomic_open(). 788 * This can be used to set the result of a successful lookup in ->atomic_open().
780 * The filesystem's atomic_open() method shall return NULL after calling this. 789 *
790 * NB: unlike finish_open() this function does consume the dentry reference and
791 * the caller need not dput() it.
792 *
793 * Returns "1" which must be the return value of ->atomic_open() after having
794 * called this function.
781 */ 795 */
782int finish_no_open(struct file *file, struct dentry *dentry) 796int finish_no_open(struct file *file, struct dentry *dentry)
783{ 797{
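
The expanded kernel-doc above pins down an asymmetry every atomic_open implementation has to respect: finish_open() does not consume the dentry reference, while finish_no_open() does (and its return value of 1 is what ->atomic_open() must propagate). A sketch of an ->atomic_open() method honouring both rules against this kernel's signatures; the myfs_* helpers are hypothetical and real implementations have more lookup subtleties:

    static int myfs_atomic_open(struct inode *dir, struct dentry *dentry,
                                struct file *file, unsigned open_flag,
                                umode_t mode, int *opened)
    {
            int err;

            if (!(open_flag & O_CREAT)) {
                    struct dentry *res = myfs_do_lookup(dir, dentry);

                    if (IS_ERR(res))
                            return PTR_ERR(res);
                    /* finish_no_open() eats the reference: no dput(),
                     * and 1 is the mandated return value. */
                    return finish_no_open(file, res);
            }

            err = myfs_do_create(dir, dentry, mode);
            if (err)
                    return err;

            /* finish_open() takes no reference of its own; "dentry"
             * is the one the VFS passed in, so nothing to dput(). */
            return finish_open(file, dentry, NULL, opened);
    }
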
diff --git a/fs/pnode.h b/fs/pnode.h
index b091445c1c4a..59e7eda1851e 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -19,11 +19,14 @@
19 19
20#define CL_EXPIRE 0x01 20#define CL_EXPIRE 0x01
21#define CL_SLAVE 0x02 21#define CL_SLAVE 0x02
22#define CL_COPY_ALL 0x04 22#define CL_COPY_UNBINDABLE 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PRIVATE 0x10 24#define CL_PRIVATE 0x10
25#define CL_SHARED_TO_SLAVE 0x20 25#define CL_SHARED_TO_SLAVE 0x20
26#define CL_UNPRIVILEGED 0x40 26#define CL_UNPRIVILEGED 0x40
27#define CL_COPY_MNT_NS_FILE 0x80
28
29#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
27 30
28static inline void set_mnt_shared(struct mount *mnt) 31static inline void set_mnt_shared(struct mount *mnt)
29{ 32{
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 0ff80f9b930f..985ea881b5bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -286,7 +286,7 @@ int proc_fd_permission(struct inode *inode, int mask)
286 int rv = generic_permission(inode, mask); 286 int rv = generic_permission(inode, mask);
287 if (rv == 0) 287 if (rv == 0)
288 return 0; 288 return 0;
289 if (task_pid(current) == proc_pid(inode)) 289 if (task_tgid(current) == proc_pid(inode))
290 rv = 0; 290 rv = 0;
291 return rv; 291 return rv;
292} 292}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 073aea60cf8f..9f8ef9b7674d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -285,6 +285,20 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
285 return rv; 285 return rv;
286} 286}
287 287
288static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
289{
290 struct proc_dir_entry *pde = PDE(file_inode(file));
291 int rv = -EIO;
292 unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
293 if (use_pde(pde)) {
294 get_unmapped_area = pde->proc_fops->get_unmapped_area;
295 if (get_unmapped_area)
296 rv = get_unmapped_area(file, orig_addr, len, pgoff, flags);
297 unuse_pde(pde);
298 }
299 return rv;
300}
301
288static int proc_reg_open(struct inode *inode, struct file *file) 302static int proc_reg_open(struct inode *inode, struct file *file)
289{ 303{
290 struct proc_dir_entry *pde = PDE(inode); 304 struct proc_dir_entry *pde = PDE(inode);
@@ -356,6 +370,7 @@ static const struct file_operations proc_reg_file_ops = {
356 .compat_ioctl = proc_reg_compat_ioctl, 370 .compat_ioctl = proc_reg_compat_ioctl,
357#endif 371#endif
358 .mmap = proc_reg_mmap, 372 .mmap = proc_reg_mmap,
373 .get_unmapped_area = proc_reg_get_unmapped_area,
359 .open = proc_reg_open, 374 .open = proc_reg_open,
360 .release = proc_reg_release, 375 .release = proc_reg_release,
361}; 376};
@@ -368,6 +383,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
368 .poll = proc_reg_poll, 383 .poll = proc_reg_poll,
369 .unlocked_ioctl = proc_reg_unlocked_ioctl, 384 .unlocked_ioctl = proc_reg_unlocked_ioctl,
370 .mmap = proc_reg_mmap, 385 .mmap = proc_reg_mmap,
386 .get_unmapped_area = proc_reg_get_unmapped_area,
371 .open = proc_reg_open, 387 .open = proc_reg_open,
372 .release = proc_reg_release, 388 .release = proc_reg_release,
373}; 389};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 5aa847a603c0..59d85d608898 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -132,13 +132,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
132 K(i.freeswap), 132 K(i.freeswap),
133 K(global_page_state(NR_FILE_DIRTY)), 133 K(global_page_state(NR_FILE_DIRTY)),
134 K(global_page_state(NR_WRITEBACK)), 134 K(global_page_state(NR_WRITEBACK)),
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE
136 K(global_page_state(NR_ANON_PAGES)
137 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
138 HPAGE_PMD_NR),
139#else
140 K(global_page_state(NR_ANON_PAGES)), 135 K(global_page_state(NR_ANON_PAGES)),
141#endif
142 K(global_page_state(NR_FILE_MAPPED)), 136 K(global_page_state(NR_FILE_MAPPED)),
143 K(global_page_state(NR_SHMEM)), 137 K(global_page_state(NR_SHMEM)),
144 K(global_page_state(NR_SLAB_RECLAIMABLE) + 138 K(global_page_state(NR_SLAB_RECLAIMABLE) +
diff --git a/fs/proc/root.c b/fs/proc/root.c
index e0a790da726d..87dbcbef7fe4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,11 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
110 ns = task_active_pid_ns(current); 110 ns = task_active_pid_ns(current);
111 options = data; 111 options = data;
112 112
113 if (!current_user_ns()->may_mount_proc) 113 if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
114 return ERR_PTR(-EPERM);
115
116 /* Does the mounter have privilege over the pid namespace? */
117 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
114 return ERR_PTR(-EPERM); 118 return ERR_PTR(-EPERM);
115 } 119 }
116 120
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 107d026f5d6e..7366e9d63cee 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
740 ptent = pte_file_clear_soft_dirty(ptent); 740 ptent = pte_file_clear_soft_dirty(ptent);
741 } 741 }
742 742
743 if (vma->vm_flags & VM_SOFTDIRTY)
744 vma->vm_flags &= ~VM_SOFTDIRTY;
745
743 set_pte_at(vma->vm_mm, addr, pte, ptent); 746 set_pte_at(vma->vm_mm, addr, pte, ptent);
744#endif 747#endif
745} 748}
@@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
949 if (is_migration_entry(entry)) 952 if (is_migration_entry(entry))
950 page = migration_entry_to_page(entry); 953 page = migration_entry_to_page(entry);
951 } else { 954 } else {
952 *pme = make_pme(PM_NOT_PRESENT(pm->v2)); 955 if (vma->vm_flags & VM_SOFTDIRTY)
956 flags2 |= __PM_SOFT_DIRTY;
957 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
953 return; 958 return;
954 } 959 }
955 960
956 if (page && !PageAnon(page)) 961 if (page && !PageAnon(page))
957 flags |= PM_FILE; 962 flags |= PM_FILE;
958 if (pte_soft_dirty(pte)) 963 if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
959 flags2 |= __PM_SOFT_DIRTY; 964 flags2 |= __PM_SOFT_DIRTY;
960 965
961 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); 966 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
@@ -974,7 +979,7 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p
974 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 979 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
975 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); 980 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
976 else 981 else
977 *pme = make_pme(PM_NOT_PRESENT(pm->v2)); 982 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
978} 983}
979#else 984#else
980static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 985static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
@@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
997 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { 1002 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
998 int pmd_flags2; 1003 int pmd_flags2;
999 1004
1000 pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); 1005 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
1006 pmd_flags2 = __PM_SOFT_DIRTY;
1007 else
1008 pmd_flags2 = 0;
1009
1001 for (; addr != end; addr += PAGE_SIZE) { 1010 for (; addr != end; addr += PAGE_SIZE) {
1002 unsigned long offset; 1011 unsigned long offset;
1003 1012
@@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1015 if (pmd_trans_unstable(pmd)) 1024 if (pmd_trans_unstable(pmd))
1016 return 0; 1025 return 0;
1017 for (; addr != end; addr += PAGE_SIZE) { 1026 for (; addr != end; addr += PAGE_SIZE) {
1027 int flags2;
1018 1028
1019 /* check to see if we've left 'vma' behind 1029 /* check to see if we've left 'vma' behind
1020 * and need a new, higher one */ 1030 * and need a new, higher one */
1021 if (vma && (addr >= vma->vm_end)) { 1031 if (vma && (addr >= vma->vm_end)) {
1022 vma = find_vma(walk->mm, addr); 1032 vma = find_vma(walk->mm, addr);
1023 pme = make_pme(PM_NOT_PRESENT(pm->v2)); 1033 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1034 flags2 = __PM_SOFT_DIRTY;
1035 else
1036 flags2 = 0;
1037 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
1024 } 1038 }
1025 1039
1026 /* check that 'vma' actually covers this address, 1040 /* check that 'vma' actually covers this address,
@@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1044 1058
1045#ifdef CONFIG_HUGETLB_PAGE 1059#ifdef CONFIG_HUGETLB_PAGE
1046static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1060static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1047 pte_t pte, int offset) 1061 pte_t pte, int offset, int flags2)
1048{ 1062{
1049 if (pte_present(pte)) 1063 if (pte_present(pte))
1050 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) 1064 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
1051 | PM_STATUS2(pm->v2, 0) | PM_PRESENT); 1065 PM_STATUS2(pm->v2, flags2) |
1066 PM_PRESENT);
1052 else 1067 else
1053 *pme = make_pme(PM_NOT_PRESENT(pm->v2)); 1068 *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
1069 PM_STATUS2(pm->v2, flags2));
1054} 1070}
1055 1071
1056/* This function walks within one hugetlb entry in the single call */ 1072/* This function walks within one hugetlb entry in the single call */
@@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1059 struct mm_walk *walk) 1075 struct mm_walk *walk)
1060{ 1076{
1061 struct pagemapread *pm = walk->private; 1077 struct pagemapread *pm = walk->private;
1078 struct vm_area_struct *vma;
1062 int err = 0; 1079 int err = 0;
1080 int flags2;
1063 pagemap_entry_t pme; 1081 pagemap_entry_t pme;
1064 1082
1083 vma = find_vma(walk->mm, addr);
1084 WARN_ON_ONCE(!vma);
1085
1086 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1087 flags2 = __PM_SOFT_DIRTY;
1088 else
1089 flags2 = 0;
1090
1065 for (; addr != end; addr += PAGE_SIZE) { 1091 for (; addr != end; addr += PAGE_SIZE) {
1066 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1092 int offset = (addr & ~hmask) >> PAGE_SHIFT;
1067 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); 1093 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
1068 err = add_to_pagemap(addr, &pme, pm); 1094 err = add_to_pagemap(addr, &pme, pm);
1069 if (err) 1095 if (err)
1070 return err; 1096 return err;
@@ -1376,8 +1402,10 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1376 walk.mm = mm; 1402 walk.mm = mm;
1377 1403
1378 pol = get_vma_policy(task, vma, vma->vm_start); 1404 pol = get_vma_policy(task, vma, vma->vm_start);
1379 mpol_to_str(buffer, sizeof(buffer), pol); 1405 n = mpol_to_str(buffer, sizeof(buffer), pol);
1380 mpol_cond_put(pol); 1406 mpol_cond_put(pol);
1407 if (n < 0)
1408 return n;
1381 1409
1382 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1410 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1383 1411
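The task_mmu.c hunks above make the soft-dirty status follow the per-VMA VM_SOFTDIRTY flag and report it even for not-present, THP and hugetlb entries via PM_STATUS2(). Userspace sees the result as bit 55 of each 64-bit /proc/PID/pagemap entry. A minimal sketch of checking one page from userspace follows; the helper name and error handling are illustrative, not part of the patch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

#define PM_SOFT_DIRTY_BIT (1ULL << 55)	/* pagemap v2 soft-dirty flag */

/* Returns 1 if the page backing vaddr is soft-dirty, 0 if not, -1 on error. */
static int page_soft_dirty(pid_t pid, unsigned long vaddr)
{
	char path[64];
	uint64_t entry;
	long psize = sysconf(_SC_PAGESIZE);
	int fd, ret = -1;

	snprintf(path, sizeof(path), "/proc/%d/pagemap", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	/* one 64-bit entry per virtual page */
	if (pread(fd, &entry, sizeof(entry),
		  (off_t)(vaddr / psize) * sizeof(entry)) == sizeof(entry))
		ret = !!(entry & PM_SOFT_DIRTY_BIT);
	close(fd);
	return ret;
}

The bits are cleared again by writing "4" to /proc/PID/clear_refs.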
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a1a16eb97c7b..9100d6959886 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -21,6 +21,7 @@
21#include <linux/crash_dump.h> 21#include <linux/crash_dump.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/pagemap.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/io.h> 26#include <asm/io.h>
26#include "internal.h" 27#include "internal.h"
@@ -123,11 +124,65 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
123 return read; 124 return read;
124} 125}
125 126
127/*
128 * Architectures may override this function to allocate ELF header in 2nd kernel
129 */
130int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
131{
132 return 0;
133}
134
135/*
136 * Architectures may override this function to free header
137 */
138void __weak elfcorehdr_free(unsigned long long addr)
139{}
140
141/*
142 * Architectures may override this function to read from ELF header
143 */
144ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
145{
146 return read_from_oldmem(buf, count, ppos, 0);
147}
148
149/*
150 * Architectures may override this function to read from notes sections
151 */
152ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
153{
154 return read_from_oldmem(buf, count, ppos, 0);
155}
156
157/*
158 * Architectures may override this function to map oldmem
159 */
160int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
161 unsigned long from, unsigned long pfn,
162 unsigned long size, pgprot_t prot)
163{
164 return remap_pfn_range(vma, from, pfn, size, prot);
165}
166
167/*
168 * Copy to either kernel or user space
169 */
170static int copy_to(void *target, void *src, size_t size, int userbuf)
171{
172 if (userbuf) {
173 if (copy_to_user((char __user *) target, src, size))
174 return -EFAULT;
175 } else {
176 memcpy(target, src, size);
177 }
178 return 0;
179}
180
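Because all of the elfcorehdr_* helpers above are declared __weak, an architecture overrides one simply by supplying a strong definition; no registration is needed. As a hedged sketch, an architecture that builds the ELF header in the 2nd kernel (which is what elfcorehdr_alloc() exists for) might read it back like this — elfcorehdr_newmem is an assumed variable, not something this patch defines:

/* hypothetical arch/<arch>/kernel/crash_dump.c */
static void *elfcorehdr_newmem;		/* filled in by elfcorehdr_alloc() */

ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
	/* header lives in 2nd-kernel memory, so no oldmem access needed */
	memcpy(buf, elfcorehdr_newmem + *ppos, count);
	*ppos += count;
	return count;
}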
126/* Read from the ELF header and then the crash dump. On error, a negative value 181
127 * is returned; otherwise the number of bytes read is returned. 182
128 */ 183 */
129static ssize_t read_vmcore(struct file *file, char __user *buffer, 184static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
130 size_t buflen, loff_t *fpos) 185 int userbuf)
131{ 186{
132 ssize_t acc = 0, tmp; 187 ssize_t acc = 0, tmp;
133 size_t tsz; 188 size_t tsz;
@@ -144,7 +199,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
144 /* Read ELF core header */ 199 /* Read ELF core header */
145 if (*fpos < elfcorebuf_sz) { 200 if (*fpos < elfcorebuf_sz) {
146 tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); 201 tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
147 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) 202 if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf))
148 return -EFAULT; 203 return -EFAULT;
149 buflen -= tsz; 204 buflen -= tsz;
150 *fpos += tsz; 205 *fpos += tsz;
@@ -162,7 +217,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
162 217
163 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); 218 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
164 kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; 219 kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
165 if (copy_to_user(buffer, kaddr, tsz)) 220 if (copy_to(buffer, kaddr, tsz, userbuf))
166 return -EFAULT; 221 return -EFAULT;
167 buflen -= tsz; 222 buflen -= tsz;
168 *fpos += tsz; 223 *fpos += tsz;
@@ -178,7 +233,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
178 if (*fpos < m->offset + m->size) { 233 if (*fpos < m->offset + m->size) {
179 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); 234 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
180 start = m->paddr + *fpos - m->offset; 235 start = m->paddr + *fpos - m->offset;
181 tmp = read_from_oldmem(buffer, tsz, &start, 1); 236 tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
182 if (tmp < 0) 237 if (tmp < 0)
183 return tmp; 238 return tmp;
184 buflen -= tsz; 239 buflen -= tsz;
@@ -195,6 +250,55 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
195 return acc; 250 return acc;
196} 251}
197 252
253static ssize_t read_vmcore(struct file *file, char __user *buffer,
254 size_t buflen, loff_t *fpos)
255{
256 return __read_vmcore((__force char *) buffer, buflen, fpos, 1);
257}
258
259/*
260 * The vmcore fault handler uses the page cache and fills data using the
261 * standard __read_vmcore() function.
262 *
263 * On s390 the fault handler is used for memory regions that can't be mapped
264 * directly with remap_pfn_range().
265 */
266static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
267{
268#ifdef CONFIG_S390
269 struct address_space *mapping = vma->vm_file->f_mapping;
270 pgoff_t index = vmf->pgoff;
271 struct page *page;
272 loff_t offset;
273 char *buf;
274 int rc;
275
276 page = find_or_create_page(mapping, index, GFP_KERNEL);
277 if (!page)
278 return VM_FAULT_OOM;
279 if (!PageUptodate(page)) {
280 offset = (loff_t) index << PAGE_CACHE_SHIFT;
281 buf = __va((page_to_pfn(page) << PAGE_SHIFT));
282 rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
283 if (rc < 0) {
284 unlock_page(page);
285 page_cache_release(page);
286 return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
287 }
288 SetPageUptodate(page);
289 }
290 unlock_page(page);
291 vmf->page = page;
292 return 0;
293#else
294 return VM_FAULT_SIGBUS;
295#endif
296}
297
298static const struct vm_operations_struct vmcore_mmap_ops = {
299 .fault = mmap_vmcore_fault,
300};
301
198/** 302/**
199 * alloc_elfnotes_buf - allocate buffer for ELF note segment in 303 * alloc_elfnotes_buf - allocate buffer for ELF note segment in
200 * vmalloc memory 304 * vmalloc memory
@@ -223,7 +327,7 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
223 * regions in the 1st kernel pointed to by PT_LOAD entries) into 327 * regions in the 1st kernel pointed to by PT_LOAD entries) into
224 * virtually contiguous user-space in ELF layout. 328 * virtually contiguous user-space in ELF layout.
225 */ 329 */
226#if defined(CONFIG_MMU) && !defined(CONFIG_S390) 330#ifdef CONFIG_MMU
227static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) 331static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
228{ 332{
229 size_t size = vma->vm_end - vma->vm_start; 333 size_t size = vma->vm_end - vma->vm_start;
@@ -241,6 +345,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
241 345
242 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); 346 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
243 vma->vm_flags |= VM_MIXEDMAP; 347 vma->vm_flags |= VM_MIXEDMAP;
348 vma->vm_ops = &vmcore_mmap_ops;
244 349
245 len = 0; 350 len = 0;
246 351
@@ -282,9 +387,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
282 387
283 tsz = min_t(size_t, m->offset + m->size - start, size); 388 tsz = min_t(size_t, m->offset + m->size - start, size);
284 paddr = m->paddr + start - m->offset; 389 paddr = m->paddr + start - m->offset;
285 if (remap_pfn_range(vma, vma->vm_start + len, 390 if (remap_oldmem_pfn_range(vma, vma->vm_start + len,
286 paddr >> PAGE_SHIFT, tsz, 391 paddr >> PAGE_SHIFT, tsz,
287 vma->vm_page_prot)) 392 vma->vm_page_prot))
288 goto fail; 393 goto fail;
289 size -= tsz; 394 size -= tsz;
290 start += tsz; 395 start += tsz;
@@ -357,7 +462,7 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
357 notes_section = kmalloc(max_sz, GFP_KERNEL); 462 notes_section = kmalloc(max_sz, GFP_KERNEL);
358 if (!notes_section) 463 if (!notes_section)
359 return -ENOMEM; 464 return -ENOMEM;
360 rc = read_from_oldmem(notes_section, max_sz, &offset, 0); 465 rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
361 if (rc < 0) { 466 if (rc < 0) {
362 kfree(notes_section); 467 kfree(notes_section);
363 return rc; 468 return rc;
@@ -444,7 +549,8 @@ static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
444 if (phdr_ptr->p_type != PT_NOTE) 549 if (phdr_ptr->p_type != PT_NOTE)
445 continue; 550 continue;
446 offset = phdr_ptr->p_offset; 551 offset = phdr_ptr->p_offset;
447 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); 552 rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
553 &offset);
448 if (rc < 0) 554 if (rc < 0)
449 return rc; 555 return rc;
450 notes_buf += phdr_ptr->p_memsz; 556 notes_buf += phdr_ptr->p_memsz;
@@ -536,7 +642,7 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
536 notes_section = kmalloc(max_sz, GFP_KERNEL); 642 notes_section = kmalloc(max_sz, GFP_KERNEL);
537 if (!notes_section) 643 if (!notes_section)
538 return -ENOMEM; 644 return -ENOMEM;
539 rc = read_from_oldmem(notes_section, max_sz, &offset, 0); 645 rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
540 if (rc < 0) { 646 if (rc < 0) {
541 kfree(notes_section); 647 kfree(notes_section);
542 return rc; 648 return rc;
@@ -623,7 +729,8 @@ static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
623 if (phdr_ptr->p_type != PT_NOTE) 729 if (phdr_ptr->p_type != PT_NOTE)
624 continue; 730 continue;
625 offset = phdr_ptr->p_offset; 731 offset = phdr_ptr->p_offset;
626 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); 732 rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
733 &offset);
627 if (rc < 0) 734 if (rc < 0)
628 return rc; 735 return rc;
629 notes_buf += phdr_ptr->p_memsz; 736 notes_buf += phdr_ptr->p_memsz;
@@ -810,7 +917,7 @@ static int __init parse_crash_elf64_headers(void)
810 addr = elfcorehdr_addr; 917 addr = elfcorehdr_addr;
811 918
812 /* Read Elf header */ 919 /* Read Elf header */
813 rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); 920 rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
814 if (rc < 0) 921 if (rc < 0)
815 return rc; 922 return rc;
816 923
@@ -837,7 +944,7 @@ static int __init parse_crash_elf64_headers(void)
837 if (!elfcorebuf) 944 if (!elfcorebuf)
838 return -ENOMEM; 945 return -ENOMEM;
839 addr = elfcorehdr_addr; 946 addr = elfcorehdr_addr;
840 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); 947 rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
841 if (rc < 0) 948 if (rc < 0)
842 goto fail; 949 goto fail;
843 950
@@ -866,7 +973,7 @@ static int __init parse_crash_elf32_headers(void)
866 addr = elfcorehdr_addr; 973 addr = elfcorehdr_addr;
867 974
868 /* Read Elf header */ 975 /* Read Elf header */
869 rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); 976 rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
870 if (rc < 0) 977 if (rc < 0)
871 return rc; 978 return rc;
872 979
@@ -892,7 +999,7 @@ static int __init parse_crash_elf32_headers(void)
892 if (!elfcorebuf) 999 if (!elfcorebuf)
893 return -ENOMEM; 1000 return -ENOMEM;
894 addr = elfcorehdr_addr; 1001 addr = elfcorehdr_addr;
895 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); 1002 rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
896 if (rc < 0) 1003 if (rc < 0)
897 goto fail; 1004 goto fail;
898 1005
@@ -919,7 +1026,7 @@ static int __init parse_crash_elf_headers(void)
919 int rc=0; 1026 int rc=0;
920 1027
921 addr = elfcorehdr_addr; 1028 addr = elfcorehdr_addr;
922 rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); 1029 rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr);
923 if (rc < 0) 1030 if (rc < 0)
924 return rc; 1031 return rc;
925 if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { 1032 if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
@@ -952,7 +1059,14 @@ static int __init vmcore_init(void)
952{ 1059{
953 int rc = 0; 1060 int rc = 0;
954 1061
955 /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ 1062 /* Allow architectures to allocate ELF header in 2nd kernel */
1063 rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size);
1064 if (rc)
1065 return rc;
1066 /*
1067 * If elfcorehdr= has been passed in cmdline or created in 2nd kernel,
1068 * then capture the dump.
1069 */
956 if (!(is_vmcore_usable())) 1070 if (!(is_vmcore_usable()))
957 return rc; 1071 return rc;
958 rc = parse_crash_elf_headers(); 1072 rc = parse_crash_elf_headers();
@@ -960,6 +1074,8 @@ static int __init vmcore_init(void)
960 pr_warn("Kdump: vmcore not initialized\n"); 1074 pr_warn("Kdump: vmcore not initialized\n");
961 return rc; 1075 return rc;
962 } 1076 }
1077 elfcorehdr_free(elfcorehdr_addr);
1078 elfcorehdr_addr = ELFCORE_ADDR_ERR;
963 1079
964 proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); 1080 proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
965 if (proc_vmcore) 1081 if (proc_vmcore)
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index ca71db69da07..983d9510becc 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,6 +1,8 @@
1config PSTORE 1config PSTORE
2 bool "Persistent store support" 2 bool "Persistent store support"
3 default n 3 default n
4 select ZLIB_DEFLATE
5 select ZLIB_INFLATE
4 help 6 help
5 This option enables generic access to platform level 7 This option enables generic access to platform level
6 persistent storage via "pstore" filesystem that can 8 persistent storage via "pstore" filesystem that can
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 71bf5f4ae84c..12823845d324 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -275,8 +275,8 @@ int pstore_is_mounted(void)
275 * Set the mtime & ctime to the date that this record was originally stored. 275 * Set the mtime & ctime to the date that this record was originally stored.
276 */ 276 */
277int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, 277int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
278 char *data, size_t size, struct timespec time, 278 char *data, bool compressed, size_t size,
279 struct pstore_info *psi) 279 struct timespec time, struct pstore_info *psi)
280{ 280{
281 struct dentry *root = pstore_sb->s_root; 281 struct dentry *root = pstore_sb->s_root;
282 struct dentry *dentry; 282 struct dentry *dentry;
@@ -315,7 +315,8 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
315 315
316 switch (type) { 316 switch (type) {
317 case PSTORE_TYPE_DMESG: 317 case PSTORE_TYPE_DMESG:
318 sprintf(name, "dmesg-%s-%lld", psname, id); 318 sprintf(name, "dmesg-%s-%lld%s", psname, id,
319 compressed ? ".enc.z" : "");
319 break; 320 break;
320 case PSTORE_TYPE_CONSOLE: 321 case PSTORE_TYPE_CONSOLE:
321 sprintf(name, "console-%s", psname); 322 sprintf(name, "console-%s", psname);
@@ -345,9 +346,8 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
345 346
346 mutex_lock(&root->d_inode->i_mutex); 347 mutex_lock(&root->d_inode->i_mutex);
347 348
348 rc = -ENOSPC;
349 dentry = d_alloc_name(root, name); 349 dentry = d_alloc_name(root, name);
350 if (IS_ERR(dentry)) 350 if (!dentry)
351 goto fail_lockedalloc; 351 goto fail_lockedalloc;
352 352
353 memcpy(private->data, data, size); 353 memcpy(private->data, data, size);
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 937d820f273c..3b3d305277c4 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -50,8 +50,9 @@ extern struct pstore_info *psinfo;
50extern void pstore_set_kmsg_bytes(int); 50extern void pstore_set_kmsg_bytes(int);
51extern void pstore_get_records(int); 51extern void pstore_get_records(int);
52extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, 52extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
53 int count, char *data, size_t size, 53 int count, char *data, bool compressed,
54 struct timespec time, struct pstore_info *psi); 54 size_t size, struct timespec time,
55 struct pstore_info *psi);
55extern int pstore_is_mounted(void); 56extern int pstore_is_mounted(void);
56 57
57#endif 58#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 422962ae9fc2..b8e93a40a5d3 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -26,6 +26,7 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/pstore.h> 28#include <linux/pstore.h>
29#include <linux/zlib.h>
29#include <linux/string.h> 30#include <linux/string.h>
30#include <linux/timer.h> 31#include <linux/timer.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
@@ -65,6 +66,15 @@ struct pstore_info *psinfo;
65 66
66static char *backend; 67static char *backend;
67 68
69/* Compression parameters */
70#define COMPR_LEVEL 6
71#define WINDOW_BITS 12
72#define MEM_LEVEL 4
73static struct z_stream_s stream;
74
75static char *big_oops_buf;
76static size_t big_oops_buf_sz;
77
68/* How much of the console log to snapshot */ 78/* How much of the console log to snapshot */
69static unsigned long kmsg_bytes = 10240; 79static unsigned long kmsg_bytes = 10240;
70 80
@@ -117,6 +127,142 @@ bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
117} 127}
118EXPORT_SYMBOL_GPL(pstore_cannot_block_path); 128EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
119 129
130/* Derived from logfs_compress() */
131static int pstore_compress(const void *in, void *out, size_t inlen,
132 size_t outlen)
133{
134 int err, ret;
135
136 ret = -EIO;
137 err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
138 MEM_LEVEL, Z_DEFAULT_STRATEGY);
139 if (err != Z_OK)
140 goto error;
141
142 stream.next_in = in;
143 stream.avail_in = inlen;
144 stream.total_in = 0;
145 stream.next_out = out;
146 stream.avail_out = outlen;
147 stream.total_out = 0;
148
149 err = zlib_deflate(&stream, Z_FINISH);
150 if (err != Z_STREAM_END)
151 goto error;
152
153 err = zlib_deflateEnd(&stream);
154 if (err != Z_OK)
155 goto error;
156
157 if (stream.total_out >= stream.total_in)
158 goto error;
159
160 ret = stream.total_out;
161error:
162 return ret;
163}
164
165/* Derived from logfs_uncompress */
166static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen)
167{
168 int err, ret;
169
170 ret = -EIO;
171 err = zlib_inflateInit2(&stream, WINDOW_BITS);
172 if (err != Z_OK)
173 goto error;
174
175 stream.next_in = in;
176 stream.avail_in = inlen;
177 stream.total_in = 0;
178 stream.next_out = out;
179 stream.avail_out = outlen;
180 stream.total_out = 0;
181
182 err = zlib_inflate(&stream, Z_FINISH);
183 if (err != Z_STREAM_END)
184 goto error;
185
186 err = zlib_inflateEnd(&stream);
187 if (err != Z_OK)
188 goto error;
189
190 ret = stream.total_out;
191error:
192 return ret;
193}
194
195static void allocate_buf_for_compression(void)
196{
197 size_t size;
198 size_t cmpr;
199
200 switch (psinfo->bufsize) {
201 /* buffer range for efivars */
202 case 1000 ... 2000:
203 cmpr = 56;
204 break;
205 case 2001 ... 3000:
206 cmpr = 54;
207 break;
208 case 3001 ... 3999:
209 cmpr = 52;
210 break;
211 /* buffer range for nvram, erst */
212 case 4000 ... 10000:
213 cmpr = 45;
214 break;
215 default:
216 cmpr = 60;
217 break;
218 }
219
220 big_oops_buf_sz = (psinfo->bufsize * 100) / cmpr;
221 big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
222 if (big_oops_buf) {
223 size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL),
224 zlib_inflate_workspacesize());
225 stream.workspace = kmalloc(size, GFP_KERNEL);
226 if (!stream.workspace) {
227 pr_err("pstore: No memory for compression workspace; "
228 "skipping compression\n");
229 kfree(big_oops_buf);
230 big_oops_buf = NULL;
231 }
232 } else {
233 pr_err("No memory for uncompressed data; "
234 "skipping compression\n");
235 stream.workspace = NULL;
236 }
237
238}
239
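To make the sizing concrete: for an ERST-style backend with psinfo->bufsize = 4096 the switch selects cmpr = 45, so big_oops_buf_sz = 4096 * 100 / 45 = 9102 bytes. In other words, the code captures roughly 2.2 times the record size worth of console text, betting that it will deflate to about 45% of its original size; if that bet fails, the fallback below trims the capture instead.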
240/*
241 * Called when compression fails. The printk buffer has already been
242 * fetched once for compression; fetching it again after a failure
243 * would advance the printk iterator and hand back stale contents.
244 * Instead, copy the most recent messages from big_oops_buf into
245 * psinfo->buf.
246 */
247static size_t copy_kmsg_to_buffer(int hsize, size_t len)
248{
249 size_t total_len;
250 size_t diff;
251
252 total_len = hsize + len;
253
254 if (total_len > psinfo->bufsize) {
255 diff = total_len - psinfo->bufsize + hsize;
256 memcpy(psinfo->buf, big_oops_buf, hsize);
257 memcpy(psinfo->buf + hsize, big_oops_buf + diff,
258 psinfo->bufsize - hsize);
259 total_len = psinfo->bufsize;
260 } else
261 memcpy(psinfo->buf, big_oops_buf, total_len);
262
263 return total_len;
264}
265
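A worked example of the trim: with psinfo->bufsize = 4096, hsize = 32 and len = 6000, total_len = 6032 exceeds the buffer, so diff = 6032 - 4096 + 32 = 1968. The 32-byte header is copied first, then the last 4064 bytes of the capture (big_oops_buf + 1968 through 6032), and total_len becomes exactly 4096 — the oldest messages are dropped and the newest kept.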
120/* 266/*
121 * callback from kmsg_dump. (s2,l2) has the most recently 267 * callback from kmsg_dump. (s2,l2) has the most recently
122 * written bytes, older bytes are in (s1,l1). Save as much 268 * written bytes, older bytes are in (s1,l1). Save as much
@@ -148,22 +294,52 @@ static void pstore_dump(struct kmsg_dumper *dumper,
148 char *dst; 294 char *dst;
149 unsigned long size; 295 unsigned long size;
150 int hsize; 296 int hsize;
297 int zipped_len = -1;
151 size_t len; 298 size_t len;
299 bool compressed;
300 size_t total_len;
152 301
153 dst = psinfo->buf; 302 if (big_oops_buf) {
154 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); 303 dst = big_oops_buf;
155 size = psinfo->bufsize - hsize; 304 hsize = sprintf(dst, "%s#%d Part%d\n", why,
156 dst += hsize; 305 oopscount, part);
306 size = big_oops_buf_sz - hsize;
157 307
158 if (!kmsg_dump_get_buffer(dumper, true, dst, size, &len)) 308 if (!kmsg_dump_get_buffer(dumper, true, dst + hsize,
159 break; 309 size, &len))
310 break;
311
312 zipped_len = pstore_compress(dst, psinfo->buf,
313 hsize + len, psinfo->bufsize);
314
315 if (zipped_len > 0) {
316 compressed = true;
317 total_len = zipped_len;
318 } else {
319 compressed = false;
320 total_len = copy_kmsg_to_buffer(hsize, len);
321 }
322 } else {
323 dst = psinfo->buf;
324 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount,
325 part);
326 size = psinfo->bufsize - hsize;
327 dst += hsize;
328
329 if (!kmsg_dump_get_buffer(dumper, true, dst,
330 size, &len))
331 break;
332
333 compressed = false;
334 total_len = hsize + len;
335 }
160 336
161 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, 337 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
162 oopscount, hsize, hsize + len, psinfo); 338 oopscount, compressed, total_len, psinfo);
163 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 339 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
164 pstore_new_entry = 1; 340 pstore_new_entry = 1;
165 341
166 total += hsize + len; 342 total += total_len;
167 part++; 343 part++;
168 } 344 }
169 if (pstore_cannot_block_path(reason)) { 345 if (pstore_cannot_block_path(reason)) {
@@ -221,10 +397,10 @@ static void pstore_register_console(void) {}
221static int pstore_write_compat(enum pstore_type_id type, 397static int pstore_write_compat(enum pstore_type_id type,
222 enum kmsg_dump_reason reason, 398 enum kmsg_dump_reason reason,
223 u64 *id, unsigned int part, int count, 399 u64 *id, unsigned int part, int count,
224 size_t hsize, size_t size, 400 bool compressed, size_t size,
225 struct pstore_info *psi) 401 struct pstore_info *psi)
226{ 402{
227 return psi->write_buf(type, reason, id, part, psinfo->buf, hsize, 403 return psi->write_buf(type, reason, id, part, psinfo->buf, compressed,
228 size, psi); 404 size, psi);
229} 405}
230 406
@@ -261,6 +437,8 @@ int pstore_register(struct pstore_info *psi)
261 return -EINVAL; 437 return -EINVAL;
262 } 438 }
263 439
440 allocate_buf_for_compression();
441
264 if (pstore_is_mounted()) 442 if (pstore_is_mounted())
265 pstore_get_records(0); 443 pstore_get_records(0);
266 444
@@ -297,6 +475,8 @@ void pstore_get_records(int quiet)
297 enum pstore_type_id type; 475 enum pstore_type_id type;
298 struct timespec time; 476 struct timespec time;
299 int failed = 0, rc; 477 int failed = 0, rc;
478 bool compressed;
479 int unzipped_len = -1;
300 480
301 if (!psi) 481 if (!psi)
302 return; 482 return;
@@ -305,11 +485,32 @@ void pstore_get_records(int quiet)
305 if (psi->open && psi->open(psi)) 485 if (psi->open && psi->open(psi))
306 goto out; 486 goto out;
307 487
308 while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) { 488 while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed,
489 psi)) > 0) {
490 if (compressed && (type == PSTORE_TYPE_DMESG)) {
491 if (big_oops_buf)
492 unzipped_len = pstore_decompress(buf,
493 big_oops_buf, size,
494 big_oops_buf_sz);
495
496 if (unzipped_len > 0) {
497 buf = big_oops_buf;
498 size = unzipped_len;
499 compressed = false;
500 } else {
501 pr_err("pstore: decompression failed;"
502 "returned %d\n", unzipped_len);
503 compressed = true;
504 }
505 }
309 rc = pstore_mkfile(type, psi->name, id, count, buf, 506 rc = pstore_mkfile(type, psi->name, id, count, buf,
310 (size_t)size, time, psi); 507 compressed, (size_t)size, time, psi);
311 kfree(buf); 508 if (unzipped_len < 0) {
312 buf = NULL; 509 /* Free buffer other than big oops */
510 kfree(buf);
511 buf = NULL;
512 } else
513 unzipped_len = -1;
313 if (rc && (rc != -EEXIST || !quiet)) 514 if (rc && (rc != -EEXIST || !quiet))
314 failed++; 515 failed++;
315 } 516 }
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index a6119f9469e2..fa8cef2cca3a 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -131,9 +131,31 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
131 return prz; 131 return prz;
132} 132}
133 133
134static void ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
135 bool *compressed)
136{
137 char data_type;
138
139 if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n",
140 &time->tv_sec, &time->tv_nsec, &data_type) == 3) {
141 if (data_type == 'C')
142 *compressed = true;
143 else
144 *compressed = false;
145 } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n",
146 &time->tv_sec, &time->tv_nsec) == 2) {
147 *compressed = false;
148 } else {
149 time->tv_sec = 0;
150 time->tv_nsec = 0;
151 *compressed = false;
152 }
153}
154
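Assuming RAMOOPS_KERNMSG_HDR is the "====" marker defined near the top of ram.c, a dmesg record written by the patched ramoops_write_kmsg_hdr() now begins with a line like

    ====1379459796.123456-C

('C' for compressed, 'D' for plain), whereas records from older kernels carry only "====1379459796.123456". ramoops_read_kmsg_hdr() accepts both forms and falls back to a zero timestamp with *compressed = false when neither sscanf() pattern matches.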
134static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, 155static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
135 int *count, struct timespec *time, 156 int *count, struct timespec *time,
136 char **buf, struct pstore_info *psi) 157 char **buf, bool *compressed,
158 struct pstore_info *psi)
137{ 159{
138 ssize_t size; 160 ssize_t size;
139 ssize_t ecc_notice_size; 161 ssize_t ecc_notice_size;
@@ -152,10 +174,6 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
152 if (!prz) 174 if (!prz)
153 return 0; 175 return 0;
154 176
155 /* TODO(kees): Bogus time for the moment. */
156 time->tv_sec = 0;
157 time->tv_nsec = 0;
158
159 size = persistent_ram_old_size(prz); 177 size = persistent_ram_old_size(prz);
160 178
161 /* ECC correction notice */ 179 /* ECC correction notice */
@@ -166,12 +184,14 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
166 return -ENOMEM; 184 return -ENOMEM;
167 185
168 memcpy(*buf, persistent_ram_old(prz), size); 186 memcpy(*buf, persistent_ram_old(prz), size);
187 ramoops_read_kmsg_hdr(*buf, time, compressed);
169 persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); 188 persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1);
170 189
171 return size + ecc_notice_size; 190 return size + ecc_notice_size;
172} 191}
173 192
174static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) 193static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
194 bool compressed)
175{ 195{
176 char *hdr; 196 char *hdr;
177 struct timespec timestamp; 197 struct timespec timestamp;
@@ -182,8 +202,9 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
182 timestamp.tv_sec = 0; 202 timestamp.tv_sec = 0;
183 timestamp.tv_nsec = 0; 203 timestamp.tv_nsec = 0;
184 } 204 }
185 hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", 205 hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n",
186 (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000)); 206 (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000),
207 compressed ? 'C' : 'D');
187 WARN_ON_ONCE(!hdr); 208 WARN_ON_ONCE(!hdr);
188 len = hdr ? strlen(hdr) : 0; 209 len = hdr ? strlen(hdr) : 0;
189 persistent_ram_write(prz, hdr, len); 210 persistent_ram_write(prz, hdr, len);
@@ -196,7 +217,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
196 enum kmsg_dump_reason reason, 217 enum kmsg_dump_reason reason,
197 u64 *id, unsigned int part, 218 u64 *id, unsigned int part,
198 const char *buf, 219 const char *buf,
199 size_t hsize, size_t size, 220 bool compressed, size_t size,
200 struct pstore_info *psi) 221 struct pstore_info *psi)
201{ 222{
202 struct ramoops_context *cxt = psi->data; 223 struct ramoops_context *cxt = psi->data;
@@ -242,7 +263,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
242 263
243 prz = cxt->przs[cxt->dump_write_cnt]; 264 prz = cxt->przs[cxt->dump_write_cnt];
244 265
245 hlen = ramoops_write_kmsg_hdr(prz); 266 hlen = ramoops_write_kmsg_hdr(prz, compressed);
246 if (size + hlen > prz->buffer_size) 267 if (size + hlen > prz->buffer_size)
247 size = prz->buffer_size - hlen; 268 size = prz->buffer_size - hlen;
248 persistent_ram_write(prz, buf, size); 269 persistent_ram_write(prz, buf, size);
@@ -400,11 +421,11 @@ static int ramoops_probe(struct platform_device *pdev)
400 goto fail_out; 421 goto fail_out;
401 } 422 }
402 423
403 if (!is_power_of_2(pdata->record_size)) 424 if (pdata->record_size && !is_power_of_2(pdata->record_size))
404 pdata->record_size = rounddown_pow_of_two(pdata->record_size); 425 pdata->record_size = rounddown_pow_of_two(pdata->record_size);
405 if (!is_power_of_2(pdata->console_size)) 426 if (pdata->console_size && !is_power_of_2(pdata->console_size))
406 pdata->console_size = rounddown_pow_of_two(pdata->console_size); 427 pdata->console_size = rounddown_pow_of_two(pdata->console_size);
407 if (!is_power_of_2(pdata->ftrace_size)) 428 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
408 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 429 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
409 430
410 cxt->dump_read_cnt = 0; 431 cxt->dump_read_cnt = 0;
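The added "pdata->record_size &&" style guards are not cosmetic: a size of 0 now means the region is disabled, and rounddown_pow_of_two(0) is undefined (it shifts by ilog2(0)), so it must not be called in that case. Non-zero sizes behave as before; for example, rounddown_pow_of_two(24000) still yields 16384.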
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index fbad622841f9..831d49a4111f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -687,45 +687,37 @@ int dquot_quota_sync(struct super_block *sb, int type)
687} 687}
688EXPORT_SYMBOL(dquot_quota_sync); 688EXPORT_SYMBOL(dquot_quota_sync);
689 689
690/* Free unused dquots from cache */ 690static unsigned long
691static void prune_dqcache(int count) 691dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
692{ 692{
693 struct list_head *head; 693 struct list_head *head;
694 struct dquot *dquot; 694 struct dquot *dquot;
695 unsigned long freed = 0;
695 696
696 head = free_dquots.prev; 697 head = free_dquots.prev;
697 while (head != &free_dquots && count) { 698 while (head != &free_dquots && sc->nr_to_scan) {
698 dquot = list_entry(head, struct dquot, dq_free); 699 dquot = list_entry(head, struct dquot, dq_free);
699 remove_dquot_hash(dquot); 700 remove_dquot_hash(dquot);
700 remove_free_dquot(dquot); 701 remove_free_dquot(dquot);
701 remove_inuse(dquot); 702 remove_inuse(dquot);
702 do_destroy_dquot(dquot); 703 do_destroy_dquot(dquot);
703 count--; 704 sc->nr_to_scan--;
705 freed++;
704 head = free_dquots.prev; 706 head = free_dquots.prev;
705 } 707 }
708 return freed;
706} 709}
707 710
708/* 711static unsigned long
709 * This is called from kswapd when we think we need some 712dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
710 * more memory
711 */
712static int shrink_dqcache_memory(struct shrinker *shrink,
713 struct shrink_control *sc)
714{ 713{
715 int nr = sc->nr_to_scan; 714 return vfs_pressure_ratio(
716 715 percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
717 if (nr) {
718 spin_lock(&dq_list_lock);
719 prune_dqcache(nr);
720 spin_unlock(&dq_list_lock);
721 }
722 return ((unsigned)
723 percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
724 /100) * sysctl_vfs_cache_pressure;
725} 716}
726 717
727static struct shrinker dqcache_shrinker = { 718static struct shrinker dqcache_shrinker = {
728 .shrink = shrink_dqcache_memory, 719 .count_objects = dqcache_shrink_count,
720 .scan_objects = dqcache_shrink_scan,
729 .seeks = DEFAULT_SEEKS, 721 .seeks = DEFAULT_SEEKS,
730}; 722};
731 723
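This is one instance of the tree-wide shrinker conversion: the old ->shrink() callback, which overloaded nr_to_scan == 0 to mean "just report the count", is split into ->count_objects() and ->scan_objects(). A minimal sketch of the new shape for a hypothetical LRU cache (all my_* names are illustrative):

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	/* cheap, possibly approximate; returning 0 tells the VM to skip us */
	return my_cache_nr_items;
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long freed = 0;

	spin_lock(&my_cache_lock);
	while (sc->nr_to_scan && !list_empty(&my_cache_lru)) {
		evict_oldest_entry();	/* hypothetical helper */
		sc->nr_to_scan--;
		freed++;
	}
	spin_unlock(&my_cache_lock);
	return freed;	/* or SHRINK_STOP if nothing is reclaimable */
}

static struct shrinker my_shrinker = {
	.count_objects	= my_cache_count,
	.scan_objects	= my_cache_scan,
	.seeks		= DEFAULT_SEEKS,
};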
@@ -1094,6 +1086,14 @@ static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
1094 dquot->dq_dqb.dqb_rsvspace -= number; 1086 dquot->dq_dqb.dqb_rsvspace -= number;
1095} 1087}
1096 1088
1089static void dquot_reclaim_reserved_space(struct dquot *dquot, qsize_t number)
1090{
1091 if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
1092 number = dquot->dq_dqb.dqb_curspace;
1093 dquot->dq_dqb.dqb_rsvspace += number;
1094 dquot->dq_dqb.dqb_curspace -= number;
1095}
1096
1097static inline 1097static inline
1098void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) 1098void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
1099{ 1099{
@@ -1528,6 +1528,15 @@ void inode_claim_rsv_space(struct inode *inode, qsize_t number)
1528} 1528}
1529EXPORT_SYMBOL(inode_claim_rsv_space); 1529EXPORT_SYMBOL(inode_claim_rsv_space);
1530 1530
1531void inode_reclaim_rsv_space(struct inode *inode, qsize_t number)
1532{
1533 spin_lock(&inode->i_lock);
1534 *inode_reserved_space(inode) += number;
1535 __inode_sub_bytes(inode, number);
1536 spin_unlock(&inode->i_lock);
1537}
1538EXPORT_SYMBOL(inode_reclaim_rsv_space);
1539
1531void inode_sub_rsv_space(struct inode *inode, qsize_t number) 1540void inode_sub_rsv_space(struct inode *inode, qsize_t number)
1532{ 1541{
1533 spin_lock(&inode->i_lock); 1542 spin_lock(&inode->i_lock);
@@ -1702,6 +1711,35 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1702EXPORT_SYMBOL(dquot_claim_space_nodirty); 1711EXPORT_SYMBOL(dquot_claim_space_nodirty);
1703 1712
1704/* 1713/*
1714 * Convert allocated space back to in-memory reserved quotas
1715 */
1716void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
1717{
1718 int cnt;
1719
1720 if (!dquot_active(inode)) {
1721 inode_reclaim_rsv_space(inode, number);
1722 return;
1723 }
1724
1725 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1726 spin_lock(&dq_data_lock);
1727 /* Claim reserved quotas to allocated quotas */
1728 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1729 if (inode->i_dquot[cnt])
1730 dquot_reclaim_reserved_space(inode->i_dquot[cnt],
1731 number);
1732 }
1733 /* Update inode bytes */
1734 inode_reclaim_rsv_space(inode, number);
1735 spin_unlock(&dq_data_lock);
1736 mark_all_dquot_dirty(inode->i_dquot);
1737 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1738 return;
1739}
1740EXPORT_SYMBOL(dquot_reclaim_space_nodirty);
1741
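dquot_reclaim_space_nodirty() is the inverse of dquot_claim_space_nodirty() just above: claiming moves bytes from dqb_rsvspace to dqb_curspace once delayed-allocated blocks are actually written, while reclaiming moves them back for a caller that converts allocated blocks back into an in-memory reservation (presumably for the delayed-allocation paths elsewhere in this series).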
1742/*
1705 * This operation can block, but only after everything is updated 1743 * This operation can block, but only after everything is updated
1706 */ 1744 */
1707void __dquot_free_space(struct inode *inode, qsize_t number, int flags) 1745void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index c7314f1771f5..dea86e8967ee 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -27,6 +27,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
27 case Q_SYNC: 27 case Q_SYNC:
28 case Q_GETINFO: 28 case Q_GETINFO:
29 case Q_XGETQSTAT: 29 case Q_XGETQSTAT:
30 case Q_XGETQSTATV:
30 case Q_XQUOTASYNC: 31 case Q_XQUOTASYNC:
31 break; 32 break;
32 /* allow to query information for dquots we "own" */ 33 /* allow to query information for dquots we "own" */
@@ -217,6 +218,31 @@ static int quota_getxstate(struct super_block *sb, void __user *addr)
217 return ret; 218 return ret;
218} 219}
219 220
221static int quota_getxstatev(struct super_block *sb, void __user *addr)
222{
223 struct fs_quota_statv fqs;
224 int ret;
225
226 if (!sb->s_qcop->get_xstatev)
227 return -ENOSYS;
228
229 memset(&fqs, 0, sizeof(fqs));
230 if (copy_from_user(&fqs, addr, 1)) /* Just read qs_version */
231 return -EFAULT;
232
233 /* If this kernel doesn't support user specified version, fail */
234 switch (fqs.qs_version) {
235 case FS_QSTATV_VERSION1:
236 break;
237 default:
238 return -EINVAL;
239 }
240 ret = sb->s_qcop->get_xstatev(sb, &fqs);
241 if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
242 return -EFAULT;
243 return ret;
244}
245
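From userspace the new command is reached through quotactl(2). A hedged sketch — the header providing struct fs_quota_statv is assumed here (linux/dqblk_xfs.h in the kernel uapi), and the device path is obviously an example:

#include <stdio.h>
#include <sys/quota.h>
#include <linux/dqblk_xfs.h>	/* fs_quota_statv, FS_QSTATV_VERSION1 */

int main(void)
{
	struct fs_quota_statv qsv = { .qs_version = FS_QSTATV_VERSION1 };

	/* the kernel reads qs_version first and rejects unknown versions */
	if (quotactl(QCMD(Q_XGETQSTATV, USRQUOTA), "/dev/sda1", 0,
		     (void *)&qsv) < 0) {
		perror("Q_XGETQSTATV");
		return 1;
	}
	printf("user quota file: %llu blocks\n",
	       (unsigned long long)qsv.qs_uquota.qfs_nblks);
	return 0;
}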
220static int quota_setxquota(struct super_block *sb, int type, qid_t id, 246static int quota_setxquota(struct super_block *sb, int type, qid_t id,
221 void __user *addr) 247 void __user *addr)
222{ 248{
@@ -293,6 +319,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
293 return quota_setxstate(sb, cmd, addr); 319 return quota_setxstate(sb, cmd, addr);
294 case Q_XGETQSTAT: 320 case Q_XGETQSTAT:
295 return quota_getxstate(sb, addr); 321 return quota_getxstate(sb, addr);
322 case Q_XGETQSTATV:
323 return quota_getxstatev(sb, addr);
296 case Q_XSETQLIM: 324 case Q_XSETQLIM:
297 return quota_setxquota(sb, type, id, addr); 325 return quota_setxquota(sb, type, id, addr);
298 case Q_XGETQUOTA: 326 case Q_XGETQUOTA:
@@ -317,6 +345,7 @@ static int quotactl_cmd_write(int cmd)
317 case Q_GETINFO: 345 case Q_GETINFO:
318 case Q_SYNC: 346 case Q_SYNC:
319 case Q_XGETQSTAT: 347 case Q_XGETQSTAT:
348 case Q_XGETQSTATV:
320 case Q_XGETQUOTA: 349 case Q_XGETQUOTA:
321 case Q_XQUOTASYNC: 350 case Q_XQUOTASYNC:
322 return 0; 351 return 0;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c24f1e10b946..39d14659a8d3 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -244,12 +244,6 @@ struct dentry *ramfs_mount(struct file_system_type *fs_type,
244 return mount_nodev(fs_type, flags, data, ramfs_fill_super); 244 return mount_nodev(fs_type, flags, data, ramfs_fill_super);
245} 245}
246 246
247static struct dentry *rootfs_mount(struct file_system_type *fs_type,
248 int flags, const char *dev_name, void *data)
249{
250 return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
251}
252
253static void ramfs_kill_sb(struct super_block *sb) 247static void ramfs_kill_sb(struct super_block *sb)
254{ 248{
255 kfree(sb->s_fs_info); 249 kfree(sb->s_fs_info);
@@ -262,29 +256,23 @@ static struct file_system_type ramfs_fs_type = {
262 .kill_sb = ramfs_kill_sb, 256 .kill_sb = ramfs_kill_sb,
263 .fs_flags = FS_USERNS_MOUNT, 257 .fs_flags = FS_USERNS_MOUNT,
264}; 258};
265static struct file_system_type rootfs_fs_type = {
266 .name = "rootfs",
267 .mount = rootfs_mount,
268 .kill_sb = kill_litter_super,
269};
270 259
271static int __init init_ramfs_fs(void) 260int __init init_ramfs_fs(void)
272{
273 return register_filesystem(&ramfs_fs_type);
274}
275module_init(init_ramfs_fs)
276
277int __init init_rootfs(void)
278{ 261{
262 static unsigned long once;
279 int err; 263 int err;
280 264
265 if (test_and_set_bit(0, &once))
266 return 0;
267
281 err = bdi_init(&ramfs_backing_dev_info); 268 err = bdi_init(&ramfs_backing_dev_info);
282 if (err) 269 if (err)
283 return err; 270 return err;
284 271
285 err = register_filesystem(&rootfs_fs_type); 272 err = register_filesystem(&ramfs_fs_type);
286 if (err) 273 if (err)
287 bdi_destroy(&ramfs_backing_dev_info); 274 bdi_destroy(&ramfs_backing_dev_info);
288 275
289 return err; 276 return err;
290} 277}
278module_init(init_ramfs_fs)
diff --git a/fs/read_write.c b/fs/read_write.c
index 122a3846d9e1..e3cd280b158c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -367,7 +367,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
367 367
368 init_sync_kiocb(&kiocb, filp); 368 init_sync_kiocb(&kiocb, filp);
369 kiocb.ki_pos = *ppos; 369 kiocb.ki_pos = *ppos;
370 kiocb.ki_left = len;
371 kiocb.ki_nbytes = len; 370 kiocb.ki_nbytes = len;
372 371
373 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 372 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -417,7 +416,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
417 416
418 init_sync_kiocb(&kiocb, filp); 417 init_sync_kiocb(&kiocb, filp);
419 kiocb.ki_pos = *ppos; 418 kiocb.ki_pos = *ppos;
420 kiocb.ki_left = len;
421 kiocb.ki_nbytes = len; 419 kiocb.ki_nbytes = len;
422 420
423 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 421 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -599,7 +597,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
599 597
600 init_sync_kiocb(&kiocb, filp); 598 init_sync_kiocb(&kiocb, filp);
601 kiocb.ki_pos = *ppos; 599 kiocb.ki_pos = *ppos;
602 kiocb.ki_left = len;
603 kiocb.ki_nbytes = len; 600 kiocb.ki_nbytes = len;
604 601
605 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); 602 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index a98b7740a0fc..dc9a6829f7c6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -423,8 +423,11 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
423 set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); 423 set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
424 424
425 journal_mark_dirty(th, s, sbh); 425 journal_mark_dirty(th, s, sbh);
426 if (for_unformatted) 426 if (for_unformatted) {
427 int depth = reiserfs_write_unlock_nested(s);
427 dquot_free_block_nodirty(inode, 1); 428 dquot_free_block_nodirty(inode, 1);
429 reiserfs_write_lock_nested(s, depth);
430 }
428} 431}
429 432
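The same idiom repeats through the reiserfs hunks that follow: the per-superblock write lock can be taken recursively, so before calling anything that may sleep or re-enter the filesystem (dquot_*, sb_bread(), __wait_on_buffer()), the owner records its lock depth, drops the lock entirely, and restores the same depth afterwards. In sketch form (the function name is illustrative):

static void wait_on_buffer_unlocked(struct super_block *sb,
				    struct buffer_head *bh)
{
	/* remember how many times we hold the lock, then release it */
	int depth = reiserfs_write_unlock_nested(sb);

	__wait_on_buffer(bh);			/* may sleep */
	reiserfs_write_lock_nested(sb, depth);	/* reacquire to same depth */
}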
430void reiserfs_free_block(struct reiserfs_transaction_handle *th, 433void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1128,6 +1131,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1128 b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; 1131 b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
1129 int passno = 0; 1132 int passno = 0;
1130 int nr_allocated = 0; 1133 int nr_allocated = 0;
1134 int depth;
1131 1135
1132 determine_prealloc_size(hint); 1136 determine_prealloc_size(hint);
1133 if (!hint->formatted_node) { 1137 if (!hint->formatted_node) {
@@ -1137,10 +1141,13 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1137 "reiserquota: allocating %d blocks id=%u", 1141 "reiserquota: allocating %d blocks id=%u",
1138 amount_needed, hint->inode->i_uid); 1142 amount_needed, hint->inode->i_uid);
1139#endif 1143#endif
1144 depth = reiserfs_write_unlock_nested(s);
1140 quota_ret = 1145 quota_ret =
1141 dquot_alloc_block_nodirty(hint->inode, amount_needed); 1146 dquot_alloc_block_nodirty(hint->inode, amount_needed);
1142 if (quota_ret) /* Quota exceeded? */ 1147 if (quota_ret) { /* Quota exceeded? */
1148 reiserfs_write_lock_nested(s, depth);
1143 return QUOTA_EXCEEDED; 1149 return QUOTA_EXCEEDED;
1150 }
1144 if (hint->preallocate && hint->prealloc_size) { 1151 if (hint->preallocate && hint->prealloc_size) {
1145#ifdef REISERQUOTA_DEBUG 1152#ifdef REISERQUOTA_DEBUG
1146 reiserfs_debug(s, REISERFS_DEBUG_CODE, 1153 reiserfs_debug(s, REISERFS_DEBUG_CODE,
@@ -1153,6 +1160,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1153 hint->preallocate = hint->prealloc_size = 0; 1160 hint->preallocate = hint->prealloc_size = 0;
1154 } 1161 }
1155 /* for unformatted nodes, force large allocations */ 1162 /* for unformatted nodes, force large allocations */
1163 reiserfs_write_lock_nested(s, depth);
1156 } 1164 }
1157 1165
1158 do { 1166 do {
@@ -1181,9 +1189,11 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1181 hint->inode->i_uid); 1189 hint->inode->i_uid);
1182#endif 1190#endif
1183 /* Free not allocated blocks */ 1191 /* Free not allocated blocks */
1192 depth = reiserfs_write_unlock_nested(s);
1184 dquot_free_block_nodirty(hint->inode, 1193 dquot_free_block_nodirty(hint->inode,
1185 amount_needed + hint->prealloc_size - 1194 amount_needed + hint->prealloc_size -
1186 nr_allocated); 1195 nr_allocated);
1196 reiserfs_write_lock_nested(s, depth);
1187 } 1197 }
1188 while (nr_allocated--) 1198 while (nr_allocated--)
1189 reiserfs_free_block(hint->th, hint->inode, 1199 reiserfs_free_block(hint->th, hint->inode,
@@ -1214,10 +1224,13 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1214 REISERFS_I(hint->inode)->i_prealloc_count, 1224 REISERFS_I(hint->inode)->i_prealloc_count,
1215 hint->inode->i_uid); 1225 hint->inode->i_uid);
1216#endif 1226#endif
1227
1228 depth = reiserfs_write_unlock_nested(s);
1217 dquot_free_block_nodirty(hint->inode, amount_needed + 1229 dquot_free_block_nodirty(hint->inode, amount_needed +
1218 hint->prealloc_size - nr_allocated - 1230 hint->prealloc_size - nr_allocated -
1219 REISERFS_I(hint->inode)-> 1231 REISERFS_I(hint->inode)->
1220 i_prealloc_count); 1232 i_prealloc_count);
1233 reiserfs_write_lock_nested(s, depth);
1221 } 1234 }
1222 1235
1223 return CARRY_ON; 1236 return CARRY_ON;
@@ -1340,10 +1353,11 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
1340 "reading failed", __func__, block); 1353 "reading failed", __func__, block);
1341 else { 1354 else {
1342 if (buffer_locked(bh)) { 1355 if (buffer_locked(bh)) {
1356 int depth;
1343 PROC_INFO_INC(sb, scan_bitmap.wait); 1357 PROC_INFO_INC(sb, scan_bitmap.wait);
1344 reiserfs_write_unlock(sb); 1358 depth = reiserfs_write_unlock_nested(sb);
1345 __wait_on_buffer(bh); 1359 __wait_on_buffer(bh);
1346 reiserfs_write_lock(sb); 1360 reiserfs_write_lock_nested(sb, depth);
1347 } 1361 }
1348 BUG_ON(!buffer_uptodate(bh)); 1362 BUG_ON(!buffer_uptodate(bh));
1349 BUG_ON(atomic_read(&bh->b_count) == 0); 1363 BUG_ON(atomic_read(&bh->b_count) == 0);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 03e4ca5624d6..1fd2051109a3 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -71,6 +71,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
71 char small_buf[32]; /* avoid kmalloc if we can */ 71 char small_buf[32]; /* avoid kmalloc if we can */
72 struct reiserfs_dir_entry de; 72 struct reiserfs_dir_entry de;
73 int ret = 0; 73 int ret = 0;
74 int depth;
74 75
75 reiserfs_write_lock(inode->i_sb); 76 reiserfs_write_lock(inode->i_sb);
76 77
@@ -181,17 +182,17 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
181 * Since filldir might sleep, we can release 182 * Since filldir might sleep, we can release
182 * the write lock here for other waiters 183 * the write lock here for other waiters
183 */ 184 */
184 reiserfs_write_unlock(inode->i_sb); 185 depth = reiserfs_write_unlock_nested(inode->i_sb);
185 if (!dir_emit 186 if (!dir_emit
186 (ctx, local_buf, d_reclen, d_ino, 187 (ctx, local_buf, d_reclen, d_ino,
187 DT_UNKNOWN)) { 188 DT_UNKNOWN)) {
188 reiserfs_write_lock(inode->i_sb); 189 reiserfs_write_lock_nested(inode->i_sb, depth);
189 if (local_buf != small_buf) { 190 if (local_buf != small_buf) {
190 kfree(local_buf); 191 kfree(local_buf);
191 } 192 }
192 goto end; 193 goto end;
193 } 194 }
194 reiserfs_write_lock(inode->i_sb); 195 reiserfs_write_lock_nested(inode->i_sb, depth);
195 if (local_buf != small_buf) { 196 if (local_buf != small_buf) {
196 kfree(local_buf); 197 kfree(local_buf);
197 } 198 }
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 430e0658704c..dc4d41530316 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -1022,9 +1022,9 @@ static int get_far_parent(struct tree_balance *tb,
1022 if (buffer_locked(*pcom_father)) { 1022 if (buffer_locked(*pcom_father)) {
1023 1023
1024 /* Release the write lock while the buffer is busy */ 1024 /* Release the write lock while the buffer is busy */
1025 reiserfs_write_unlock(tb->tb_sb); 1025 int depth = reiserfs_write_unlock_nested(tb->tb_sb);
1026 __wait_on_buffer(*pcom_father); 1026 __wait_on_buffer(*pcom_father);
1027 reiserfs_write_lock(tb->tb_sb); 1027 reiserfs_write_lock_nested(tb->tb_sb, depth);
1028 if (FILESYSTEM_CHANGED_TB(tb)) { 1028 if (FILESYSTEM_CHANGED_TB(tb)) {
1029 brelse(*pcom_father); 1029 brelse(*pcom_father);
1030 return REPEAT_SEARCH; 1030 return REPEAT_SEARCH;
@@ -1929,9 +1929,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
1929 return REPEAT_SEARCH; 1929 return REPEAT_SEARCH;
1930 1930
1931 if (buffer_locked(bh)) { 1931 if (buffer_locked(bh)) {
1932 reiserfs_write_unlock(tb->tb_sb); 1932 int depth = reiserfs_write_unlock_nested(tb->tb_sb);
1933 __wait_on_buffer(bh); 1933 __wait_on_buffer(bh);
1934 reiserfs_write_lock(tb->tb_sb); 1934 reiserfs_write_lock_nested(tb->tb_sb, depth);
1935 if (FILESYSTEM_CHANGED_TB(tb)) 1935 if (FILESYSTEM_CHANGED_TB(tb))
1936 return REPEAT_SEARCH; 1936 return REPEAT_SEARCH;
1937 } 1937 }
@@ -1952,6 +1952,7 @@ static int get_neighbors(struct tree_balance *tb, int h)
1952 unsigned long son_number; 1952 unsigned long son_number;
1953 struct super_block *sb = tb->tb_sb; 1953 struct super_block *sb = tb->tb_sb;
1954 struct buffer_head *bh; 1954 struct buffer_head *bh;
1955 int depth;
1955 1956
1956 PROC_INFO_INC(sb, get_neighbors[h]); 1957 PROC_INFO_INC(sb, get_neighbors[h]);
1957 1958
@@ -1969,9 +1970,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
1969 tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> 1970 tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
1970 FL[h]); 1971 FL[h]);
1971 son_number = B_N_CHILD_NUM(tb->FL[h], child_position); 1972 son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
1972 reiserfs_write_unlock(sb); 1973 depth = reiserfs_write_unlock_nested(tb->tb_sb);
1973 bh = sb_bread(sb, son_number); 1974 bh = sb_bread(sb, son_number);
1974 reiserfs_write_lock(sb); 1975 reiserfs_write_lock_nested(tb->tb_sb, depth);
1975 if (!bh) 1976 if (!bh)
1976 return IO_ERROR; 1977 return IO_ERROR;
1977 if (FILESYSTEM_CHANGED_TB(tb)) { 1978 if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2009,9 +2010,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
2009 child_position = 2010 child_position =
2010 (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; 2011 (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
2011 son_number = B_N_CHILD_NUM(tb->FR[h], child_position); 2012 son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2012 reiserfs_write_unlock(sb); 2013 depth = reiserfs_write_unlock_nested(tb->tb_sb);
2013 bh = sb_bread(sb, son_number); 2014 bh = sb_bread(sb, son_number);
2014 reiserfs_write_lock(sb); 2015 reiserfs_write_lock_nested(tb->tb_sb, depth);
2015 if (!bh) 2016 if (!bh)
2016 return IO_ERROR; 2017 return IO_ERROR;
2017 if (FILESYSTEM_CHANGED_TB(tb)) { 2018 if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2272,6 +2273,7 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2272 } 2273 }
2273 2274
2274 if (locked) { 2275 if (locked) {
2276 int depth;
2275#ifdef CONFIG_REISERFS_CHECK 2277#ifdef CONFIG_REISERFS_CHECK
2276 repeat_counter++; 2278 repeat_counter++;
2277 if ((repeat_counter % 10000) == 0) { 2279 if ((repeat_counter % 10000) == 0) {
@@ -2286,9 +2288,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2286 REPEAT_SEARCH : CARRY_ON; 2288 REPEAT_SEARCH : CARRY_ON;
2287 } 2289 }
2288#endif 2290#endif
2289 reiserfs_write_unlock(tb->tb_sb); 2291 depth = reiserfs_write_unlock_nested(tb->tb_sb);
2290 __wait_on_buffer(locked); 2292 __wait_on_buffer(locked);
2291 reiserfs_write_lock(tb->tb_sb); 2293 reiserfs_write_lock_nested(tb->tb_sb, depth);
2292 if (FILESYSTEM_CHANGED_TB(tb)) 2294 if (FILESYSTEM_CHANGED_TB(tb))
2293 return REPEAT_SEARCH; 2295 return REPEAT_SEARCH;
2294 } 2296 }
@@ -2359,9 +2361,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
2359 2361
2360 /* if it is possible in indirect_to_direct conversion */ 2362 /* if it is possible in indirect_to_direct conversion */
2361 if (buffer_locked(tbS0)) { 2363 if (buffer_locked(tbS0)) {
2362 reiserfs_write_unlock(tb->tb_sb); 2364 int depth = reiserfs_write_unlock_nested(tb->tb_sb);
2363 __wait_on_buffer(tbS0); 2365 __wait_on_buffer(tbS0);
2364 reiserfs_write_lock(tb->tb_sb); 2366 reiserfs_write_lock_nested(tb->tb_sb, depth);
2365 if (FILESYSTEM_CHANGED_TB(tb)) 2367 if (FILESYSTEM_CHANGED_TB(tb))
2366 return REPEAT_SEARCH; 2368 return REPEAT_SEARCH;
2367 } 2369 }
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0048cc16a6a8..ad62bdbb451e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -30,7 +30,6 @@ void reiserfs_evict_inode(struct inode *inode)
30 JOURNAL_PER_BALANCE_CNT * 2 + 30 JOURNAL_PER_BALANCE_CNT * 2 +
31 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); 31 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
32 struct reiserfs_transaction_handle th; 32 struct reiserfs_transaction_handle th;
33 int depth;
34 int err; 33 int err;
35 34
36 if (!inode->i_nlink && !is_bad_inode(inode)) 35 if (!inode->i_nlink && !is_bad_inode(inode))
@@ -40,12 +39,13 @@ void reiserfs_evict_inode(struct inode *inode)
40 if (inode->i_nlink) 39 if (inode->i_nlink)
41 goto no_delete; 40 goto no_delete;
42 41
43 depth = reiserfs_write_lock_once(inode->i_sb);
44
45 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ 42 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
46 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ 43 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
44
47 reiserfs_delete_xattrs(inode); 45 reiserfs_delete_xattrs(inode);
48 46
47 reiserfs_write_lock(inode->i_sb);
48
49 if (journal_begin(&th, inode->i_sb, jbegin_count)) 49 if (journal_begin(&th, inode->i_sb, jbegin_count))
50 goto out; 50 goto out;
51 reiserfs_update_inode_transaction(inode); 51 reiserfs_update_inode_transaction(inode);
@@ -57,8 +57,11 @@ void reiserfs_evict_inode(struct inode *inode)
57 /* Do quota update inside a transaction for journaled quotas. We must do that 57 /* Do quota update inside a transaction for journaled quotas. We must do that
58 * after delete_object so that quota updates go into the same transaction as 58 * after delete_object so that quota updates go into the same transaction as
59 * stat data deletion */ 59 * stat data deletion */
60 if (!err) 60 if (!err) {
61 int depth = reiserfs_write_unlock_nested(inode->i_sb);
61 dquot_free_inode(inode); 62 dquot_free_inode(inode);
63 reiserfs_write_lock_nested(inode->i_sb, depth);
64 }
62 65
63 if (journal_end(&th, inode->i_sb, jbegin_count)) 66 if (journal_end(&th, inode->i_sb, jbegin_count))
64 goto out; 67 goto out;
@@ -72,12 +75,12 @@ void reiserfs_evict_inode(struct inode *inode)
72 /* all items of file are deleted, so we can remove "save" link */ 75 /* all items of file are deleted, so we can remove "save" link */
73 remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything 76 remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything
74 * about an error here */ 77 * about an error here */
78out:
79 reiserfs_write_unlock(inode->i_sb);
75 } else { 80 } else {
76 /* no object items are in the tree */ 81 /* no object items are in the tree */
77 ; 82 ;
78 } 83 }
79 out:
80 reiserfs_write_unlock_once(inode->i_sb, depth);
81 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ 84 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */
82 dquot_drop(inode); 85 dquot_drop(inode);
83 inode->i_blocks = 0; 86 inode->i_blocks = 0;
@@ -610,7 +613,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
610 __le32 *item; 613 __le32 *item;
611 int done; 614 int done;
612 int fs_gen; 615 int fs_gen;
613 int lock_depth;
614 struct reiserfs_transaction_handle *th = NULL; 616 struct reiserfs_transaction_handle *th = NULL;
615 /* space reserved in transaction batch: 617 /* space reserved in transaction batch:
616 . 3 balancings in direct->indirect conversion 618 . 3 balancings in direct->indirect conversion
@@ -626,11 +628,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
626 loff_t new_offset = 628 loff_t new_offset =
627 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; 629 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
628 630
629 lock_depth = reiserfs_write_lock_once(inode->i_sb); 631 reiserfs_write_lock(inode->i_sb);
630 version = get_inode_item_key_version(inode); 632 version = get_inode_item_key_version(inode);
631 633
632 if (!file_capable(inode, block)) { 634 if (!file_capable(inode, block)) {
633 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 635 reiserfs_write_unlock(inode->i_sb);
634 return -EFBIG; 636 return -EFBIG;
635 } 637 }
636 638
@@ -642,7 +644,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
642 /* find number of block-th logical block of the file */ 644 /* find number of block-th logical block of the file */
643 ret = _get_block_create_0(inode, block, bh_result, 645 ret = _get_block_create_0(inode, block, bh_result,
644 create | GET_BLOCK_READ_DIRECT); 646 create | GET_BLOCK_READ_DIRECT);
645 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 647 reiserfs_write_unlock(inode->i_sb);
646 return ret; 648 return ret;
647 } 649 }
648 /* 650 /*
@@ -760,7 +762,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
760 if (!dangle && th) 762 if (!dangle && th)
761 retval = reiserfs_end_persistent_transaction(th); 763 retval = reiserfs_end_persistent_transaction(th);
762 764
763 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 765 reiserfs_write_unlock(inode->i_sb);
764 766
765 /* the item was found, so new blocks were not added to the file 767 /* the item was found, so new blocks were not added to the file
766 ** there is no need to make sure the inode is updated with this 768 ** there is no need to make sure the inode is updated with this
@@ -1011,11 +1013,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
1011 * long time. reschedule if needed and also release the write 1013 * long time. reschedule if needed and also release the write
1012 * lock for others. 1014 * lock for others.
1013 */ 1015 */
1014 if (need_resched()) { 1016 reiserfs_cond_resched(inode->i_sb);
1015 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1016 schedule();
1017 lock_depth = reiserfs_write_lock_once(inode->i_sb);
1018 }
1019 1017
1020 retval = search_for_position_by_key(inode->i_sb, &key, &path); 1018 retval = search_for_position_by_key(inode->i_sb, &key, &path);
1021 if (retval == IO_ERROR) { 1019 if (retval == IO_ERROR) {
@@ -1050,7 +1048,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
1050 retval = err; 1048 retval = err;
1051 } 1049 }
1052 1050
1053 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 1051 reiserfs_write_unlock(inode->i_sb);
1054 reiserfs_check_path(&path); 1052 reiserfs_check_path(&path);
1055 return retval; 1053 return retval;
1056} 1054}
@@ -1509,14 +1507,15 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1509{ 1507{
1510 struct inode *inode; 1508 struct inode *inode;
1511 struct reiserfs_iget_args args; 1509 struct reiserfs_iget_args args;
1510 int depth;
1512 1511
1513 args.objectid = key->on_disk_key.k_objectid; 1512 args.objectid = key->on_disk_key.k_objectid;
1514 args.dirid = key->on_disk_key.k_dir_id; 1513 args.dirid = key->on_disk_key.k_dir_id;
1515 reiserfs_write_unlock(s); 1514 depth = reiserfs_write_unlock_nested(s);
1516 inode = iget5_locked(s, key->on_disk_key.k_objectid, 1515 inode = iget5_locked(s, key->on_disk_key.k_objectid,
1517 reiserfs_find_actor, reiserfs_init_locked_inode, 1516 reiserfs_find_actor, reiserfs_init_locked_inode,
1518 (void *)(&args)); 1517 (void *)(&args));
1519 reiserfs_write_lock(s); 1518 reiserfs_write_lock_nested(s, depth);
1520 if (!inode) 1519 if (!inode)
1521 return ERR_PTR(-ENOMEM); 1520 return ERR_PTR(-ENOMEM);
1522 1521
@@ -1772,7 +1771,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1772 struct inode *inode, 1771 struct inode *inode,
1773 struct reiserfs_security_handle *security) 1772 struct reiserfs_security_handle *security)
1774{ 1773{
1775 struct super_block *sb; 1774 struct super_block *sb = dir->i_sb;
1776 struct reiserfs_iget_args args; 1775 struct reiserfs_iget_args args;
1777 INITIALIZE_PATH(path_to_key); 1776 INITIALIZE_PATH(path_to_key);
1778 struct cpu_key key; 1777 struct cpu_key key;
@@ -1780,12 +1779,13 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1780 struct stat_data sd; 1779 struct stat_data sd;
1781 int retval; 1780 int retval;
1782 int err; 1781 int err;
1782 int depth;
1783 1783
1784 BUG_ON(!th->t_trans_id); 1784 BUG_ON(!th->t_trans_id);
1785 1785
1786 reiserfs_write_unlock(inode->i_sb); 1786 depth = reiserfs_write_unlock_nested(sb);
1787 err = dquot_alloc_inode(inode); 1787 err = dquot_alloc_inode(inode);
1788 reiserfs_write_lock(inode->i_sb); 1788 reiserfs_write_lock_nested(sb, depth);
1789 if (err) 1789 if (err)
1790 goto out_end_trans; 1790 goto out_end_trans;
1791 if (!dir->i_nlink) { 1791 if (!dir->i_nlink) {
@@ -1793,8 +1793,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1793 goto out_bad_inode; 1793 goto out_bad_inode;
1794 } 1794 }
1795 1795
1796 sb = dir->i_sb;
1797
1798 /* item head of new item */ 1796 /* item head of new item */
1799 ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); 1797 ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1800 ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); 1798 ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
@@ -1812,10 +1810,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1812 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); 1810 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1813 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); 1811 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
1814 1812
1815 reiserfs_write_unlock(inode->i_sb); 1813 depth = reiserfs_write_unlock_nested(inode->i_sb);
1816 err = insert_inode_locked4(inode, args.objectid, 1814 err = insert_inode_locked4(inode, args.objectid,
1817 reiserfs_find_actor, &args); 1815 reiserfs_find_actor, &args);
1818 reiserfs_write_lock(inode->i_sb); 1816 reiserfs_write_lock_nested(inode->i_sb, depth);
1819 if (err) { 1817 if (err) {
1820 err = -EINVAL; 1818 err = -EINVAL;
1821 goto out_bad_inode; 1819 goto out_bad_inode;
@@ -1941,7 +1939,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1941 } 1939 }
1942 1940
1943 if (reiserfs_posixacl(inode->i_sb)) { 1941 if (reiserfs_posixacl(inode->i_sb)) {
1942 reiserfs_write_unlock(inode->i_sb);
1944 retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); 1943 retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
1944 reiserfs_write_lock(inode->i_sb);
1945 if (retval) { 1945 if (retval) {
1946 err = retval; 1946 err = retval;
1947 reiserfs_check_path(&path_to_key); 1947 reiserfs_check_path(&path_to_key);
@@ -1956,7 +1956,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1956 inode->i_flags |= S_PRIVATE; 1956 inode->i_flags |= S_PRIVATE;
1957 1957
1958 if (security->name) { 1958 if (security->name) {
1959 reiserfs_write_unlock(inode->i_sb);
1959 retval = reiserfs_security_write(th, inode, security); 1960 retval = reiserfs_security_write(th, inode, security);
1961 reiserfs_write_lock(inode->i_sb);
1960 if (retval) { 1962 if (retval) {
1961 err = retval; 1963 err = retval;
1962 reiserfs_check_path(&path_to_key); 1964 reiserfs_check_path(&path_to_key);
@@ -1982,14 +1984,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1982 INODE_PKEY(inode)->k_objectid = 0; 1984 INODE_PKEY(inode)->k_objectid = 0;
1983 1985
1984 /* Quota change must be inside a transaction for journaling */ 1986 /* Quota change must be inside a transaction for journaling */
1987 depth = reiserfs_write_unlock_nested(inode->i_sb);
1985 dquot_free_inode(inode); 1988 dquot_free_inode(inode);
1989 reiserfs_write_lock_nested(inode->i_sb, depth);
1986 1990
1987 out_end_trans: 1991 out_end_trans:
1988 journal_end(th, th->t_super, th->t_blocks_allocated); 1992 journal_end(th, th->t_super, th->t_blocks_allocated);
1989 reiserfs_write_unlock(inode->i_sb);
1990 /* Drop can be outside and it needs more credits so it's better to have it outside */ 1993 /* Drop can be outside and it needs more credits so it's better to have it outside */
1994 depth = reiserfs_write_unlock_nested(inode->i_sb);
1991 dquot_drop(inode); 1995 dquot_drop(inode);
1992 reiserfs_write_lock(inode->i_sb); 1996 reiserfs_write_lock_nested(inode->i_sb, depth);
1993 inode->i_flags |= S_NOQUOTA; 1997 inode->i_flags |= S_NOQUOTA;
1994 make_bad_inode(inode); 1998 make_bad_inode(inode);
1995 1999
@@ -2103,9 +2107,8 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2103 int error; 2107 int error;
2104 struct buffer_head *bh = NULL; 2108 struct buffer_head *bh = NULL;
2105 int err2; 2109 int err2;
2106 int lock_depth;
2107 2110
2108 lock_depth = reiserfs_write_lock_once(inode->i_sb); 2111 reiserfs_write_lock(inode->i_sb);
2109 2112
2110 if (inode->i_size > 0) { 2113 if (inode->i_size > 0) {
2111 error = grab_tail_page(inode, &page, &bh); 2114 error = grab_tail_page(inode, &page, &bh);
@@ -2174,7 +2177,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2174 page_cache_release(page); 2177 page_cache_release(page);
2175 } 2178 }
2176 2179
2177 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 2180 reiserfs_write_unlock(inode->i_sb);
2178 2181
2179 return 0; 2182 return 0;
2180 out: 2183 out:
@@ -2183,7 +2186,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2183 page_cache_release(page); 2186 page_cache_release(page);
2184 } 2187 }
2185 2188
2186 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 2189 reiserfs_write_unlock(inode->i_sb);
2187 2190
2188 return error; 2191 return error;
2189} 2192}
@@ -2648,10 +2651,11 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
2648 struct inode *inode = page->mapping->host; 2651 struct inode *inode = page->mapping->host;
2649 int ret; 2652 int ret;
2650 int old_ref = 0; 2653 int old_ref = 0;
2654 int depth;
2651 2655
2652 reiserfs_write_unlock(inode->i_sb); 2656 depth = reiserfs_write_unlock_nested(inode->i_sb);
2653 reiserfs_wait_on_write_block(inode->i_sb); 2657 reiserfs_wait_on_write_block(inode->i_sb);
2654 reiserfs_write_lock(inode->i_sb); 2658 reiserfs_write_lock_nested(inode->i_sb, depth);
2655 2659
2656 fix_tail_page_for_writing(page); 2660 fix_tail_page_for_writing(page);
2657 if (reiserfs_transaction_running(inode->i_sb)) { 2661 if (reiserfs_transaction_running(inode->i_sb)) {
@@ -2708,7 +2712,6 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2708 int update_sd = 0; 2712 int update_sd = 0;
2709 struct reiserfs_transaction_handle *th; 2713 struct reiserfs_transaction_handle *th;
2710 unsigned start; 2714 unsigned start;
2711 int lock_depth = 0;
2712 bool locked = false; 2715 bool locked = false;
2713 2716
2714 if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) 2717 if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
@@ -2737,7 +2740,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2737 */ 2740 */
2738 if (pos + copied > inode->i_size) { 2741 if (pos + copied > inode->i_size) {
2739 struct reiserfs_transaction_handle myth; 2742 struct reiserfs_transaction_handle myth;
2740 lock_depth = reiserfs_write_lock_once(inode->i_sb); 2743 reiserfs_write_lock(inode->i_sb);
2741 locked = true; 2744 locked = true;
2742                 /* If the file has grown beyond the border where it 2745                 /* If the file has grown beyond the border where it
2743 can have a tail, unmark it as needing a tail 2746 can have a tail, unmark it as needing a tail
@@ -2768,7 +2771,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2768 } 2771 }
2769 if (th) { 2772 if (th) {
2770 if (!locked) { 2773 if (!locked) {
2771 lock_depth = reiserfs_write_lock_once(inode->i_sb); 2774 reiserfs_write_lock(inode->i_sb);
2772 locked = true; 2775 locked = true;
2773 } 2776 }
2774 if (!update_sd) 2777 if (!update_sd)
@@ -2780,7 +2783,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2780 2783
2781 out: 2784 out:
2782 if (locked) 2785 if (locked)
2783 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 2786 reiserfs_write_unlock(inode->i_sb);
2784 unlock_page(page); 2787 unlock_page(page);
2785 page_cache_release(page); 2788 page_cache_release(page);
2786 2789
@@ -2790,7 +2793,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2790 return ret == 0 ? copied : ret; 2793 return ret == 0 ? copied : ret;
2791 2794
2792 journal_error: 2795 journal_error:
2793 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 2796 reiserfs_write_unlock(inode->i_sb);
2794 locked = false; 2797 locked = false;
2795 if (th) { 2798 if (th) {
2796 if (!update_sd) 2799 if (!update_sd)
@@ -2808,10 +2811,11 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2808 int ret = 0; 2811 int ret = 0;
2809 int update_sd = 0; 2812 int update_sd = 0;
2810 struct reiserfs_transaction_handle *th = NULL; 2813 struct reiserfs_transaction_handle *th = NULL;
2814 int depth;
2811 2815
2812 reiserfs_write_unlock(inode->i_sb); 2816 depth = reiserfs_write_unlock_nested(inode->i_sb);
2813 reiserfs_wait_on_write_block(inode->i_sb); 2817 reiserfs_wait_on_write_block(inode->i_sb);
2814 reiserfs_write_lock(inode->i_sb); 2818 reiserfs_write_lock_nested(inode->i_sb, depth);
2815 2819
2816 if (reiserfs_transaction_running(inode->i_sb)) { 2820 if (reiserfs_transaction_running(inode->i_sb)) {
2817 th = current->journal_info; 2821 th = current->journal_info;
@@ -3110,7 +3114,6 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3110{ 3114{
3111 struct inode *inode = dentry->d_inode; 3115 struct inode *inode = dentry->d_inode;
3112 unsigned int ia_valid; 3116 unsigned int ia_valid;
3113 int depth;
3114 int error; 3117 int error;
3115 3118
3116 error = inode_change_ok(inode, attr); 3119 error = inode_change_ok(inode, attr);
@@ -3122,13 +3125,14 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3122 3125
3123 if (is_quota_modification(inode, attr)) 3126 if (is_quota_modification(inode, attr))
3124 dquot_initialize(inode); 3127 dquot_initialize(inode);
3125 depth = reiserfs_write_lock_once(inode->i_sb); 3128 reiserfs_write_lock(inode->i_sb);
3126 if (attr->ia_valid & ATTR_SIZE) { 3129 if (attr->ia_valid & ATTR_SIZE) {
3127 /* version 2 items will be caught by the s_maxbytes check 3130 /* version 2 items will be caught by the s_maxbytes check
3128 ** done for us in vmtruncate 3131 ** done for us in vmtruncate
3129 */ 3132 */
3130 if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && 3133 if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
3131 attr->ia_size > MAX_NON_LFS) { 3134 attr->ia_size > MAX_NON_LFS) {
3135 reiserfs_write_unlock(inode->i_sb);
3132 error = -EFBIG; 3136 error = -EFBIG;
3133 goto out; 3137 goto out;
3134 } 3138 }
@@ -3150,8 +3154,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3150 if (err) 3154 if (err)
3151 error = err; 3155 error = err;
3152 } 3156 }
3153 if (error) 3157 if (error) {
3158 reiserfs_write_unlock(inode->i_sb);
3154 goto out; 3159 goto out;
3160 }
3155 /* 3161 /*
3156 * file size is changed, ctime and mtime are 3162 * file size is changed, ctime and mtime are
3157 * to be updated 3163 * to be updated
@@ -3159,6 +3165,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3159 attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME); 3165 attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
3160 } 3166 }
3161 } 3167 }
3168 reiserfs_write_unlock(inode->i_sb);
3162 3169
3163 if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) || 3170 if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) ||
3164 ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) && 3171 ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) &&
@@ -3183,14 +3190,16 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3183 return error; 3190 return error;
3184 3191
3185         /* (user+group)*(old+new) structure - we count quota info and inode write (sb, inode) */ 3192         /* (user+group)*(old+new) structure - we count quota info and inode write (sb, inode) */
3193 reiserfs_write_lock(inode->i_sb);
3186 error = journal_begin(&th, inode->i_sb, jbegin_count); 3194 error = journal_begin(&th, inode->i_sb, jbegin_count);
3195 reiserfs_write_unlock(inode->i_sb);
3187 if (error) 3196 if (error)
3188 goto out; 3197 goto out;
3189 reiserfs_write_unlock_once(inode->i_sb, depth);
3190 error = dquot_transfer(inode, attr); 3198 error = dquot_transfer(inode, attr);
3191 depth = reiserfs_write_lock_once(inode->i_sb); 3199 reiserfs_write_lock(inode->i_sb);
3192 if (error) { 3200 if (error) {
3193 journal_end(&th, inode->i_sb, jbegin_count); 3201 journal_end(&th, inode->i_sb, jbegin_count);
3202 reiserfs_write_unlock(inode->i_sb);
3194 goto out; 3203 goto out;
3195 } 3204 }
3196 3205
@@ -3202,17 +3211,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3202 inode->i_gid = attr->ia_gid; 3211 inode->i_gid = attr->ia_gid;
3203 mark_inode_dirty(inode); 3212 mark_inode_dirty(inode);
3204 error = journal_end(&th, inode->i_sb, jbegin_count); 3213 error = journal_end(&th, inode->i_sb, jbegin_count);
3214 reiserfs_write_unlock(inode->i_sb);
3205 if (error) 3215 if (error)
3206 goto out; 3216 goto out;
3207 } 3217 }
3208 3218
3209 /*
3210 * Relax the lock here, as it might truncate the
3211 * inode pages and wait for inode pages locks.
3212 * To release such page lock, the owner needs the
3213 * reiserfs lock
3214 */
3215 reiserfs_write_unlock_once(inode->i_sb, depth);
3216 if ((attr->ia_valid & ATTR_SIZE) && 3219 if ((attr->ia_valid & ATTR_SIZE) &&
3217 attr->ia_size != i_size_read(inode)) { 3220 attr->ia_size != i_size_read(inode)) {
3218 error = inode_newsize_ok(inode, attr->ia_size); 3221 error = inode_newsize_ok(inode, attr->ia_size);
@@ -3226,16 +3229,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3226 setattr_copy(inode, attr); 3229 setattr_copy(inode, attr);
3227 mark_inode_dirty(inode); 3230 mark_inode_dirty(inode);
3228 } 3231 }
3229 depth = reiserfs_write_lock_once(inode->i_sb);
3230 3232
3231 if (!error && reiserfs_posixacl(inode->i_sb)) { 3233 if (!error && reiserfs_posixacl(inode->i_sb)) {
3232 if (attr->ia_valid & ATTR_MODE) 3234 if (attr->ia_valid & ATTR_MODE)
3233 error = reiserfs_acl_chmod(inode); 3235 error = reiserfs_acl_chmod(inode);
3234 } 3236 }
3235 3237
3236 out: 3238out:
3237 reiserfs_write_unlock_once(inode->i_sb, depth);
3238
3239 return error; 3239 return error;
3240} 3240}
3241 3241
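
The inode.c hunks above all follow one conversion: reiserfs_write_lock_once()/reiserfs_write_unlock_once() callers either become plain reiserfs_write_lock()/reiserfs_write_unlock() pairs, or, where the lock must be dropped completely around a sleeping call, they save and restore the recursion depth. A minimal sketch of that save/restore idiom, using the primitives this patch adds in fs/reiserfs/lock.c (kernel-context fragment with a hypothetical helper name, not a buildable unit):

    /* Drop the write lock entirely around a call that may sleep,
     * remembering how many recursive acquisitions were held. */
    static void quota_free_unlocked(struct inode *inode)
    {
            int depth;

            depth = reiserfs_write_unlock_nested(inode->i_sb);
            dquot_free_inode(inode);        /* may sleep */
            reiserfs_write_lock_nested(inode->i_sb, depth);
    }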
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 15cb5fe6b425..946ccbf5b5a1 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -167,7 +167,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
167int reiserfs_unpack(struct inode *inode, struct file *filp) 167int reiserfs_unpack(struct inode *inode, struct file *filp)
168{ 168{
169 int retval = 0; 169 int retval = 0;
170 int depth;
171 int index; 170 int index;
172 struct page *page; 171 struct page *page;
173 struct address_space *mapping; 172 struct address_space *mapping;
@@ -183,11 +182,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
183 return 0; 182 return 0;
184 } 183 }
185 184
186 depth = reiserfs_write_lock_once(inode->i_sb);
187
188 /* we need to make sure nobody is changing the file size beneath us */ 185 /* we need to make sure nobody is changing the file size beneath us */
189 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); 186 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
190 187
188 reiserfs_write_lock(inode->i_sb);
189
191 write_from = inode->i_size & (blocksize - 1); 190 write_from = inode->i_size & (blocksize - 1);
192 /* if we are on a block boundary, we are already unpacked. */ 191 /* if we are on a block boundary, we are already unpacked. */
193 if (write_from == 0) { 192 if (write_from == 0) {
@@ -221,6 +220,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
221 220
222 out: 221 out:
223 mutex_unlock(&inode->i_mutex); 222 mutex_unlock(&inode->i_mutex);
224 reiserfs_write_unlock_once(inode->i_sb, depth); 223 reiserfs_write_unlock(inode->i_sb);
225 return retval; 224 return retval;
226} 225}
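
The reiserfs_unpack() change is also a lock-ordering fix: i_mutex is now taken before the write lock instead of after, so the plain lock/unlock pair suffices. A hypothetical condensed view of the resulting shape (names from the hunk; the actual unpacking and error paths elided):

    int unpack_lock_order(struct inode *inode)
    {
            /* i_mutex first; reiserfs_mutex_lock_safe() drops any
             * held write lock while it sleeps on the mutex */
            reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);

            /* only then the write lock, so no _once/depth tracking */
            reiserfs_write_lock(inode->i_sb);

            /* ... force the tail block to be unpacked ... */

            mutex_unlock(&inode->i_mutex);
            reiserfs_write_unlock(inode->i_sb);
            return 0;
    }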
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 742fdd4c209a..fd777032c2ba 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -947,9 +947,11 @@ static int reiserfs_async_progress_wait(struct super_block *s)
947 struct reiserfs_journal *j = SB_JOURNAL(s); 947 struct reiserfs_journal *j = SB_JOURNAL(s);
948 948
949 if (atomic_read(&j->j_async_throttle)) { 949 if (atomic_read(&j->j_async_throttle)) {
950 reiserfs_write_unlock(s); 950 int depth;
951
952 depth = reiserfs_write_unlock_nested(s);
951 congestion_wait(BLK_RW_ASYNC, HZ / 10); 953 congestion_wait(BLK_RW_ASYNC, HZ / 10);
952 reiserfs_write_lock(s); 954 reiserfs_write_lock_nested(s, depth);
953 } 955 }
954 956
955 return 0; 957 return 0;
@@ -972,6 +974,7 @@ static int flush_commit_list(struct super_block *s,
972 struct reiserfs_journal *journal = SB_JOURNAL(s); 974 struct reiserfs_journal *journal = SB_JOURNAL(s);
973 int retval = 0; 975 int retval = 0;
974 int write_len; 976 int write_len;
977 int depth;
975 978
976 reiserfs_check_lock_depth(s, "flush_commit_list"); 979 reiserfs_check_lock_depth(s, "flush_commit_list");
977 980
@@ -1018,12 +1021,12 @@ static int flush_commit_list(struct super_block *s,
1018 * We might sleep in numerous places inside 1021 * We might sleep in numerous places inside
1019 * write_ordered_buffers. Relax the write lock. 1022 * write_ordered_buffers. Relax the write lock.
1020 */ 1023 */
1021 reiserfs_write_unlock(s); 1024 depth = reiserfs_write_unlock_nested(s);
1022 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, 1025 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
1023 journal, jl, &jl->j_bh_list); 1026 journal, jl, &jl->j_bh_list);
1024 if (ret < 0 && retval == 0) 1027 if (ret < 0 && retval == 0)
1025 retval = ret; 1028 retval = ret;
1026 reiserfs_write_lock(s); 1029 reiserfs_write_lock_nested(s, depth);
1027 } 1030 }
1028 BUG_ON(!list_empty(&jl->j_bh_list)); 1031 BUG_ON(!list_empty(&jl->j_bh_list));
1029 /* 1032 /*
@@ -1043,9 +1046,9 @@ static int flush_commit_list(struct super_block *s,
1043 tbh = journal_find_get_block(s, bn); 1046 tbh = journal_find_get_block(s, bn);
1044 if (tbh) { 1047 if (tbh) {
1045 if (buffer_dirty(tbh)) { 1048 if (buffer_dirty(tbh)) {
1046 reiserfs_write_unlock(s); 1049 depth = reiserfs_write_unlock_nested(s);
1047 ll_rw_block(WRITE, 1, &tbh); 1050 ll_rw_block(WRITE, 1, &tbh);
1048 reiserfs_write_lock(s); 1051 reiserfs_write_lock_nested(s, depth);
1049 } 1052 }
1050 put_bh(tbh) ; 1053 put_bh(tbh) ;
1051 } 1054 }
@@ -1057,17 +1060,17 @@ static int flush_commit_list(struct super_block *s,
1057 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); 1060 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
1058 tbh = journal_find_get_block(s, bn); 1061 tbh = journal_find_get_block(s, bn);
1059 1062
1060 reiserfs_write_unlock(s); 1063 depth = reiserfs_write_unlock_nested(s);
1061 wait_on_buffer(tbh); 1064 __wait_on_buffer(tbh);
1062 reiserfs_write_lock(s); 1065 reiserfs_write_lock_nested(s, depth);
1063 // since we're using ll_rw_blk above, it might have skipped over 1066 // since we're using ll_rw_blk above, it might have skipped over
1064 // a locked buffer. Double check here 1067 // a locked buffer. Double check here
1065 // 1068 //
1066 /* redundant, sync_dirty_buffer() checks */ 1069 /* redundant, sync_dirty_buffer() checks */
1067 if (buffer_dirty(tbh)) { 1070 if (buffer_dirty(tbh)) {
1068 reiserfs_write_unlock(s); 1071 depth = reiserfs_write_unlock_nested(s);
1069 sync_dirty_buffer(tbh); 1072 sync_dirty_buffer(tbh);
1070 reiserfs_write_lock(s); 1073 reiserfs_write_lock_nested(s, depth);
1071 } 1074 }
1072 if (unlikely(!buffer_uptodate(tbh))) { 1075 if (unlikely(!buffer_uptodate(tbh))) {
1073#ifdef CONFIG_REISERFS_CHECK 1076#ifdef CONFIG_REISERFS_CHECK
@@ -1091,12 +1094,12 @@ static int flush_commit_list(struct super_block *s,
1091 if (buffer_dirty(jl->j_commit_bh)) 1094 if (buffer_dirty(jl->j_commit_bh))
1092 BUG(); 1095 BUG();
1093 mark_buffer_dirty(jl->j_commit_bh) ; 1096 mark_buffer_dirty(jl->j_commit_bh) ;
1094 reiserfs_write_unlock(s); 1097 depth = reiserfs_write_unlock_nested(s);
1095 if (reiserfs_barrier_flush(s)) 1098 if (reiserfs_barrier_flush(s))
1096 __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); 1099 __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
1097 else 1100 else
1098 sync_dirty_buffer(jl->j_commit_bh); 1101 sync_dirty_buffer(jl->j_commit_bh);
1099 reiserfs_write_lock(s); 1102 reiserfs_write_lock_nested(s, depth);
1100 } 1103 }
1101 1104
1102 /* If there was a write error in the journal - we can't commit this 1105 /* If there was a write error in the journal - we can't commit this
@@ -1160,21 +1163,6 @@ static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
1160 return NULL; 1163 return NULL;
1161} 1164}
1162 1165
1163static int newer_jl_done(struct reiserfs_journal_cnode *cn)
1164{
1165 struct super_block *sb = cn->sb;
1166 b_blocknr_t blocknr = cn->blocknr;
1167
1168 cn = cn->hprev;
1169 while (cn) {
1170 if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist &&
1171 atomic_read(&cn->jlist->j_commit_left) != 0)
1172 return 0;
1173 cn = cn->hprev;
1174 }
1175 return 1;
1176}
1177
1178static void remove_journal_hash(struct super_block *, 1166static void remove_journal_hash(struct super_block *,
1179 struct reiserfs_journal_cnode **, 1167 struct reiserfs_journal_cnode **,
1180 struct reiserfs_journal_list *, unsigned long, 1168 struct reiserfs_journal_list *, unsigned long,
@@ -1228,15 +1216,16 @@ static int _update_journal_header_block(struct super_block *sb,
1228{ 1216{
1229 struct reiserfs_journal_header *jh; 1217 struct reiserfs_journal_header *jh;
1230 struct reiserfs_journal *journal = SB_JOURNAL(sb); 1218 struct reiserfs_journal *journal = SB_JOURNAL(sb);
1219 int depth;
1231 1220
1232 if (reiserfs_is_journal_aborted(journal)) 1221 if (reiserfs_is_journal_aborted(journal))
1233 return -EIO; 1222 return -EIO;
1234 1223
1235 if (trans_id >= journal->j_last_flush_trans_id) { 1224 if (trans_id >= journal->j_last_flush_trans_id) {
1236 if (buffer_locked((journal->j_header_bh))) { 1225 if (buffer_locked((journal->j_header_bh))) {
1237 reiserfs_write_unlock(sb); 1226 depth = reiserfs_write_unlock_nested(sb);
1238 wait_on_buffer((journal->j_header_bh)); 1227 __wait_on_buffer(journal->j_header_bh);
1239 reiserfs_write_lock(sb); 1228 reiserfs_write_lock_nested(sb, depth);
1240 if (unlikely(!buffer_uptodate(journal->j_header_bh))) { 1229 if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
1241#ifdef CONFIG_REISERFS_CHECK 1230#ifdef CONFIG_REISERFS_CHECK
1242 reiserfs_warning(sb, "journal-699", 1231 reiserfs_warning(sb, "journal-699",
@@ -1254,14 +1243,14 @@ static int _update_journal_header_block(struct super_block *sb,
1254 jh->j_mount_id = cpu_to_le32(journal->j_mount_id); 1243 jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
1255 1244
1256 set_buffer_dirty(journal->j_header_bh); 1245 set_buffer_dirty(journal->j_header_bh);
1257 reiserfs_write_unlock(sb); 1246 depth = reiserfs_write_unlock_nested(sb);
1258 1247
1259 if (reiserfs_barrier_flush(sb)) 1248 if (reiserfs_barrier_flush(sb))
1260 __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); 1249 __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
1261 else 1250 else
1262 sync_dirty_buffer(journal->j_header_bh); 1251 sync_dirty_buffer(journal->j_header_bh);
1263 1252
1264 reiserfs_write_lock(sb); 1253 reiserfs_write_lock_nested(sb, depth);
1265 if (!buffer_uptodate(journal->j_header_bh)) { 1254 if (!buffer_uptodate(journal->j_header_bh)) {
1266 reiserfs_warning(sb, "journal-837", 1255 reiserfs_warning(sb, "journal-837",
1267 "IO error during journal replay"); 1256 "IO error during journal replay");
@@ -1341,6 +1330,7 @@ static int flush_journal_list(struct super_block *s,
1341 unsigned long j_len_saved = jl->j_len; 1330 unsigned long j_len_saved = jl->j_len;
1342 struct reiserfs_journal *journal = SB_JOURNAL(s); 1331 struct reiserfs_journal *journal = SB_JOURNAL(s);
1343 int err = 0; 1332 int err = 0;
1333 int depth;
1344 1334
1345 BUG_ON(j_len_saved <= 0); 1335 BUG_ON(j_len_saved <= 0);
1346 1336
@@ -1348,7 +1338,6 @@ static int flush_journal_list(struct super_block *s,
1348 reiserfs_warning(s, "clm-2048", "called with wcount %d", 1338 reiserfs_warning(s, "clm-2048", "called with wcount %d",
1349 atomic_read(&journal->j_wcount)); 1339 atomic_read(&journal->j_wcount));
1350 } 1340 }
1351 BUG_ON(jl->j_trans_id == 0);
1352 1341
1353 /* if flushall == 0, the lock is already held */ 1342 /* if flushall == 0, the lock is already held */
1354 if (flushall) { 1343 if (flushall) {
@@ -1495,9 +1484,9 @@ static int flush_journal_list(struct super_block *s,
1495 "cn->bh is NULL"); 1484 "cn->bh is NULL");
1496 } 1485 }
1497 1486
1498 reiserfs_write_unlock(s); 1487 depth = reiserfs_write_unlock_nested(s);
1499 wait_on_buffer(cn->bh); 1488 __wait_on_buffer(cn->bh);
1500 reiserfs_write_lock(s); 1489 reiserfs_write_lock_nested(s, depth);
1501 1490
1502 if (!cn->bh) { 1491 if (!cn->bh) {
1503 reiserfs_panic(s, "journal-1012", 1492 reiserfs_panic(s, "journal-1012",
@@ -1588,31 +1577,6 @@ static int flush_journal_list(struct super_block *s,
1588 return err; 1577 return err;
1589} 1578}
1590 1579
1591static int test_transaction(struct super_block *s,
1592 struct reiserfs_journal_list *jl)
1593{
1594 struct reiserfs_journal_cnode *cn;
1595
1596 if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0)
1597 return 1;
1598
1599 cn = jl->j_realblock;
1600 while (cn) {
1601 /* if the blocknr == 0, this has been cleared from the hash,
1602 ** skip it
1603 */
1604 if (cn->blocknr == 0) {
1605 goto next;
1606 }
1607 if (cn->bh && !newer_jl_done(cn))
1608 return 0;
1609 next:
1610 cn = cn->next;
1611 cond_resched();
1612 }
1613 return 0;
1614}
1615
1616static int write_one_transaction(struct super_block *s, 1580static int write_one_transaction(struct super_block *s,
1617 struct reiserfs_journal_list *jl, 1581 struct reiserfs_journal_list *jl,
1618 struct buffer_chunk *chunk) 1582 struct buffer_chunk *chunk)
@@ -1800,6 +1764,8 @@ static int flush_used_journal_lists(struct super_block *s,
1800 break; 1764 break;
1801 tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); 1765 tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
1802 } 1766 }
1767 get_journal_list(jl);
1768 get_journal_list(flush_jl);
1803 /* try to find a group of blocks we can flush across all the 1769 /* try to find a group of blocks we can flush across all the
1804 ** transactions, but only bother if we've actually spanned 1770 ** transactions, but only bother if we've actually spanned
1805 ** across multiple lists 1771 ** across multiple lists
@@ -1808,6 +1774,8 @@ static int flush_used_journal_lists(struct super_block *s,
1808 ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); 1774 ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
1809 } 1775 }
1810 flush_journal_list(s, flush_jl, 1); 1776 flush_journal_list(s, flush_jl, 1);
1777 put_journal_list(s, flush_jl);
1778 put_journal_list(s, jl);
1811 return 0; 1779 return 0;
1812} 1780}
1813 1781
@@ -1974,6 +1942,7 @@ static int journal_compare_desc_commit(struct super_block *sb,
1974/* returns 0 if it did not find a description block 1942/* returns 0 if it did not find a description block
1975** returns -1 if it found a corrupt commit block 1943** returns -1 if it found a corrupt commit block
1976** returns 1 if both desc and commit were valid 1944** returns 1 if both desc and commit were valid
1945** NOTE: only called during fs mount
1977*/ 1946*/
1978static int journal_transaction_is_valid(struct super_block *sb, 1947static int journal_transaction_is_valid(struct super_block *sb,
1979 struct buffer_head *d_bh, 1948 struct buffer_head *d_bh,
@@ -2073,8 +2042,9 @@ static void brelse_array(struct buffer_head **heads, int num)
2073 2042
2074/* 2043/*
2075** given the start, and values for the oldest acceptable transactions, 2044** given the start, and values for the oldest acceptable transactions,
2076** this either reads in and replays a transaction, or returns because the transaction 2045** this either reads in and replays a transaction, or returns because the
2077** is invalid, or too old. 2046** transaction is invalid, or too old.
2047** NOTE: only called during fs mount
2078*/ 2048*/
2079static int journal_read_transaction(struct super_block *sb, 2049static int journal_read_transaction(struct super_block *sb,
2080 unsigned long cur_dblock, 2050 unsigned long cur_dblock,
@@ -2208,10 +2178,7 @@ static int journal_read_transaction(struct super_block *sb,
2208 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); 2178 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
2209 for (i = 0; i < get_desc_trans_len(desc); i++) { 2179 for (i = 0; i < get_desc_trans_len(desc); i++) {
2210 2180
2211 reiserfs_write_unlock(sb);
2212 wait_on_buffer(log_blocks[i]); 2181 wait_on_buffer(log_blocks[i]);
2213 reiserfs_write_lock(sb);
2214
2215 if (!buffer_uptodate(log_blocks[i])) { 2182 if (!buffer_uptodate(log_blocks[i])) {
2216 reiserfs_warning(sb, "journal-1212", 2183 reiserfs_warning(sb, "journal-1212",
2217 "REPLAY FAILURE fsck required! " 2184 "REPLAY FAILURE fsck required! "
@@ -2318,12 +2285,13 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
2318 2285
2319/* 2286/*
2320** read and replay the log 2287** read and replay the log
2321** on a clean unmount, the journal header's next unflushed pointer will be to an invalid 2288** on a clean unmount, the journal header's next unflushed pointer will
2322** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast. 2289** be to an invalid transaction. This tests that before finding all the
2323** 2290** transactions in the log, which makes normal mount times fast.
2324** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. 2291** After a crash, this starts with the next unflushed transaction, and
2325** 2292** replays until it finds one too old, or invalid.
2326** On exit, it sets things up so the first transaction will work correctly. 2293** On exit, it sets things up so the first transaction will work correctly.
2294** NOTE: only called during fs mount
2327*/ 2295*/
2328static int journal_read(struct super_block *sb) 2296static int journal_read(struct super_block *sb)
2329{ 2297{
@@ -2501,14 +2469,18 @@ static int journal_read(struct super_block *sb)
2501 "replayed %d transactions in %lu seconds\n", 2469 "replayed %d transactions in %lu seconds\n",
2502 replay_count, get_seconds() - start); 2470 replay_count, get_seconds() - start);
2503 } 2471 }
2472 /* needed to satisfy the locking in _update_journal_header_block */
2473 reiserfs_write_lock(sb);
2504 if (!bdev_read_only(sb->s_bdev) && 2474 if (!bdev_read_only(sb->s_bdev) &&
2505 _update_journal_header_block(sb, journal->j_start, 2475 _update_journal_header_block(sb, journal->j_start,
2506 journal->j_last_flush_trans_id)) { 2476 journal->j_last_flush_trans_id)) {
2477 reiserfs_write_unlock(sb);
2507 /* replay failed, caller must call free_journal_ram and abort 2478 /* replay failed, caller must call free_journal_ram and abort
2508 ** the mount 2479 ** the mount
2509 */ 2480 */
2510 return -1; 2481 return -1;
2511 } 2482 }
2483 reiserfs_write_unlock(sb);
2512 return 0; 2484 return 0;
2513} 2485}
2514 2486
@@ -2828,13 +2800,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2828 goto free_and_return; 2800 goto free_and_return;
2829 } 2801 }
2830 2802
2831 /*
2832 * Journal_read needs to be inspected in order to push down
2833 * the lock further inside (or even remove it).
2834 */
2835 reiserfs_write_lock(sb);
2836 ret = journal_read(sb); 2803 ret = journal_read(sb);
2837 reiserfs_write_unlock(sb);
2838 if (ret < 0) { 2804 if (ret < 0) {
2839 reiserfs_warning(sb, "reiserfs-2006", 2805 reiserfs_warning(sb, "reiserfs-2006",
2840 "Replay Failure, unable to mount"); 2806 "Replay Failure, unable to mount");
@@ -2923,9 +2889,9 @@ static void queue_log_writer(struct super_block *s)
2923 add_wait_queue(&journal->j_join_wait, &wait); 2889 add_wait_queue(&journal->j_join_wait, &wait);
2924 set_current_state(TASK_UNINTERRUPTIBLE); 2890 set_current_state(TASK_UNINTERRUPTIBLE);
2925 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { 2891 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
2926 reiserfs_write_unlock(s); 2892 int depth = reiserfs_write_unlock_nested(s);
2927 schedule(); 2893 schedule();
2928 reiserfs_write_lock(s); 2894 reiserfs_write_lock_nested(s, depth);
2929 } 2895 }
2930 __set_current_state(TASK_RUNNING); 2896 __set_current_state(TASK_RUNNING);
2931 remove_wait_queue(&journal->j_join_wait, &wait); 2897 remove_wait_queue(&journal->j_join_wait, &wait);
@@ -2943,9 +2909,12 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
2943 struct reiserfs_journal *journal = SB_JOURNAL(sb); 2909 struct reiserfs_journal *journal = SB_JOURNAL(sb);
2944 unsigned long bcount = journal->j_bcount; 2910 unsigned long bcount = journal->j_bcount;
2945 while (1) { 2911 while (1) {
2946 reiserfs_write_unlock(sb); 2912 int depth;
2913
2914 depth = reiserfs_write_unlock_nested(sb);
2947 schedule_timeout_uninterruptible(1); 2915 schedule_timeout_uninterruptible(1);
2948 reiserfs_write_lock(sb); 2916 reiserfs_write_lock_nested(sb, depth);
2917
2949 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; 2918 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
2950 while ((atomic_read(&journal->j_wcount) > 0 || 2919 while ((atomic_read(&journal->j_wcount) > 0 ||
2951 atomic_read(&journal->j_jlock)) && 2920 atomic_read(&journal->j_jlock)) &&
@@ -2976,6 +2945,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
2976 struct reiserfs_transaction_handle myth; 2945 struct reiserfs_transaction_handle myth;
2977 int sched_count = 0; 2946 int sched_count = 0;
2978 int retval; 2947 int retval;
2948 int depth;
2979 2949
2980 reiserfs_check_lock_depth(sb, "journal_begin"); 2950 reiserfs_check_lock_depth(sb, "journal_begin");
2981 BUG_ON(nblocks > journal->j_trans_max); 2951 BUG_ON(nblocks > journal->j_trans_max);
@@ -2996,9 +2966,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
2996 2966
2997 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { 2967 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
2998 unlock_journal(sb); 2968 unlock_journal(sb);
2999 reiserfs_write_unlock(sb); 2969 depth = reiserfs_write_unlock_nested(sb);
3000 reiserfs_wait_on_write_block(sb); 2970 reiserfs_wait_on_write_block(sb);
3001 reiserfs_write_lock(sb); 2971 reiserfs_write_lock_nested(sb, depth);
3002 PROC_INFO_INC(sb, journal.journal_relock_writers); 2972 PROC_INFO_INC(sb, journal.journal_relock_writers);
3003 goto relock; 2973 goto relock;
3004 } 2974 }
@@ -3821,6 +3791,7 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb,
3821 if (test_clear_buffer_journal_restore_dirty(bh) && 3791 if (test_clear_buffer_journal_restore_dirty(bh) &&
3822 buffer_journal_dirty(bh)) { 3792 buffer_journal_dirty(bh)) {
3823 struct reiserfs_journal_cnode *cn; 3793 struct reiserfs_journal_cnode *cn;
3794 reiserfs_write_lock(sb);
3824 cn = get_journal_hash_dev(sb, 3795 cn = get_journal_hash_dev(sb,
3825 journal->j_list_hash_table, 3796 journal->j_list_hash_table,
3826 bh->b_blocknr); 3797 bh->b_blocknr);
@@ -3828,6 +3799,7 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb,
3828 set_buffer_journal_test(bh); 3799 set_buffer_journal_test(bh);
3829 mark_buffer_dirty(bh); 3800 mark_buffer_dirty(bh);
3830 } 3801 }
3802 reiserfs_write_unlock(sb);
3831 } 3803 }
3832 clear_buffer_journal_prepared(bh); 3804 clear_buffer_journal_prepared(bh);
3833} 3805}
@@ -3859,27 +3831,6 @@ int reiserfs_prepare_for_journal(struct super_block *sb,
3859 return 1; 3831 return 1;
3860} 3832}
3861 3833
3862static void flush_old_journal_lists(struct super_block *s)
3863{
3864 struct reiserfs_journal *journal = SB_JOURNAL(s);
3865 struct reiserfs_journal_list *jl;
3866 struct list_head *entry;
3867 time_t now = get_seconds();
3868
3869 while (!list_empty(&journal->j_journal_list)) {
3870 entry = journal->j_journal_list.next;
3871 jl = JOURNAL_LIST_ENTRY(entry);
3872 /* this check should always be run, to send old lists to disk */
3873 if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4)) &&
3874 atomic_read(&jl->j_commit_left) == 0 &&
3875 test_transaction(s, jl)) {
3876 flush_used_journal_lists(s, jl);
3877 } else {
3878 break;
3879 }
3880 }
3881}
3882
3883/* 3834/*
3884** long and ugly. If flush, will not return until all commit 3835** long and ugly. If flush, will not return until all commit
3885** blocks and all real buffers in the trans are on disk. 3836** blocks and all real buffers in the trans are on disk.
@@ -3911,6 +3862,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
3911 unsigned long jindex; 3862 unsigned long jindex;
3912 unsigned int commit_trans_id; 3863 unsigned int commit_trans_id;
3913 int trans_half; 3864 int trans_half;
3865 int depth;
3914 3866
3915 BUG_ON(th->t_refcount > 1); 3867 BUG_ON(th->t_refcount > 1);
3916 BUG_ON(!th->t_trans_id); 3868 BUG_ON(!th->t_trans_id);
@@ -4116,9 +4068,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4116 next = cn->next; 4068 next = cn->next;
4117 free_cnode(sb, cn); 4069 free_cnode(sb, cn);
4118 cn = next; 4070 cn = next;
4119 reiserfs_write_unlock(sb); 4071 reiserfs_cond_resched(sb);
4120 cond_resched();
4121 reiserfs_write_lock(sb);
4122 } 4072 }
4123 4073
4124 /* we are done with both the c_bh and d_bh, but 4074 /* we are done with both the c_bh and d_bh, but
@@ -4165,10 +4115,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4165 * is lost. 4115 * is lost.
4166 */ 4116 */
4167 if (!list_empty(&jl->j_tail_bh_list)) { 4117 if (!list_empty(&jl->j_tail_bh_list)) {
4168 reiserfs_write_unlock(sb); 4118 depth = reiserfs_write_unlock_nested(sb);
4169 write_ordered_buffers(&journal->j_dirty_buffers_lock, 4119 write_ordered_buffers(&journal->j_dirty_buffers_lock,
4170 journal, jl, &jl->j_tail_bh_list); 4120 journal, jl, &jl->j_tail_bh_list);
4171 reiserfs_write_lock(sb); 4121 reiserfs_write_lock_nested(sb, depth);
4172 } 4122 }
4173 BUG_ON(!list_empty(&jl->j_tail_bh_list)); 4123 BUG_ON(!list_empty(&jl->j_tail_bh_list));
4174 mutex_unlock(&jl->j_commit_mutex); 4124 mutex_unlock(&jl->j_commit_mutex);
@@ -4224,7 +4174,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4224 } 4174 }
4225 } 4175 }
4226 } 4176 }
4227 flush_old_journal_lists(sb);
4228 4177
4229 journal->j_current_jl->j_list_bitmap = 4178 journal->j_current_jl->j_list_bitmap =
4230 get_list_bitmap(sb, journal->j_current_jl); 4179 get_list_bitmap(sb, journal->j_current_jl);
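
Besides the lock conversion, the journal.c hunks drop the flush_old_journal_lists()/test_transaction()/newer_jl_done() machinery and pin journal lists across the flush in flush_used_journal_lists(). get_journal_list()/put_journal_list() are not defined in these hunks; the fragment below assumes they take and drop a reference that keeps each list from being freed while the write lock is dropped inside flush_journal_list():

    /* assumed refcount pinning around a flush that may sleep */
    get_journal_list(jl);                 /* pin the starting list */
    get_journal_list(flush_jl);           /* and the one being flushed */
    flush_journal_list(s, flush_jl, 1);   /* may drop the write lock */
    put_journal_list(s, flush_jl);        /* both lists stayed valid */
    put_journal_list(s, jl);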
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
index d735bc8470e3..045b83ef9fd9 100644
--- a/fs/reiserfs/lock.c
+++ b/fs/reiserfs/lock.c
@@ -48,30 +48,35 @@ void reiserfs_write_unlock(struct super_block *s)
48 } 48 }
49} 49}
50 50
51/* 51int __must_check reiserfs_write_unlock_nested(struct super_block *s)
52 * If we already own the lock, just exit and don't increase the depth.
53 * Useful when we don't want to lock more than once.
54 *
55 * We always return the lock_depth we had before calling
56 * this function.
57 */
58int reiserfs_write_lock_once(struct super_block *s)
59{ 52{
60 struct reiserfs_sb_info *sb_i = REISERFS_SB(s); 53 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
54 int depth;
61 55
62 if (sb_i->lock_owner != current) { 56 /* this can happen when the lock isn't always held */
63 mutex_lock(&sb_i->lock); 57 if (sb_i->lock_owner != current)
64 sb_i->lock_owner = current; 58 return -1;
65 return sb_i->lock_depth++; 59
66 } 60 depth = sb_i->lock_depth;
61
62 sb_i->lock_depth = -1;
63 sb_i->lock_owner = NULL;
64 mutex_unlock(&sb_i->lock);
67 65
68 return sb_i->lock_depth; 66 return depth;
69} 67}
70 68
71void reiserfs_write_unlock_once(struct super_block *s, int lock_depth) 69void reiserfs_write_lock_nested(struct super_block *s, int depth)
72{ 70{
73 if (lock_depth == -1) 71 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
74 reiserfs_write_unlock(s); 72
73 /* this can happen when the lock isn't always held */
74 if (depth == -1)
75 return;
76
77 mutex_lock(&sb_i->lock);
78 sb_i->lock_owner = current;
79 sb_i->lock_depth = depth;
75} 80}
76 81
77/* 82/*
@@ -82,9 +87,7 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
82{ 87{
83 struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); 88 struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
84 89
85 if (sb_i->lock_depth < 0) 90 WARN_ON(sb_i->lock_depth < 0);
86 reiserfs_panic(sb, "%s called without kernel lock held %d",
87 caller);
88} 91}
89 92
90#ifdef CONFIG_REISERFS_CHECK 93#ifdef CONFIG_REISERFS_CHECK
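
The new primitives in lock.c are small enough to model outside the kernel. A standalone sketch (pthreads in place of the superblock mutex; field names follow reiserfs_sb_info; the demo is single-threaded, so the unlocked owner checks, which the kernel version makes safe via its owner-only-write pattern, are safe here too). Build with cc -pthread:

    #include <pthread.h>
    #include <stdio.h>

    struct sb_lock {
            pthread_mutex_t lock;
            pthread_t owner;
            int owner_valid;        /* models "lock_owner != NULL" */
            int depth;
    };

    static void write_lock(struct sb_lock *s)
    {
            if (s->owner_valid && pthread_equal(s->owner, pthread_self())) {
                    s->depth++;     /* recursive acquisition */
                    return;
            }
            pthread_mutex_lock(&s->lock);
            s->owner = pthread_self();
            s->owner_valid = 1;
            s->depth = 0;
    }

    static int write_unlock_nested(struct sb_lock *s)
    {
            int depth;

            if (!s->owner_valid || !pthread_equal(s->owner, pthread_self()))
                    return -1;      /* lock wasn't held by us */
            depth = s->depth;
            s->depth = -1;
            s->owner_valid = 0;
            pthread_mutex_unlock(&s->lock);
            return depth;
    }

    static void write_lock_nested(struct sb_lock *s, int depth)
    {
            if (depth == -1)        /* nothing was dropped, nothing to restore */
                    return;
            pthread_mutex_lock(&s->lock);
            s->owner = pthread_self();
            s->owner_valid = 1;
            s->depth = depth;
    }

    int main(void)
    {
            struct sb_lock s;
            int depth;

            pthread_mutex_init(&s.lock, NULL);
            s.owner_valid = 0;
            s.depth = -1;

            write_lock(&s);
            write_lock(&s);                  /* recursion: depth -> 1 */
            depth = write_unlock_nested(&s); /* drop fully, as around I/O */
            printf("saved depth %d\n", depth);
            write_lock_nested(&s, depth);    /* restore the exact depth */
            printf("restored depth %d\n", s.depth);
            pthread_mutex_unlock(&s.lock);
            return 0;
    }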
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 8567fb847601..dc5236f6de1b 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -325,7 +325,6 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
325 unsigned int flags) 325 unsigned int flags)
326{ 326{
327 int retval; 327 int retval;
328 int lock_depth;
329 struct inode *inode = NULL; 328 struct inode *inode = NULL;
330 struct reiserfs_dir_entry de; 329 struct reiserfs_dir_entry de;
331 INITIALIZE_PATH(path_to_entry); 330 INITIALIZE_PATH(path_to_entry);
@@ -333,12 +332,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
333 if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) 332 if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
334 return ERR_PTR(-ENAMETOOLONG); 333 return ERR_PTR(-ENAMETOOLONG);
335 334
336 /* 335 reiserfs_write_lock(dir->i_sb);
337 * Might be called with or without the write lock, must be careful
338 * to not recursively hold it in case we want to release the lock
339 * before rescheduling.
340 */
341 lock_depth = reiserfs_write_lock_once(dir->i_sb);
342 336
343 de.de_gen_number_bit_string = NULL; 337 de.de_gen_number_bit_string = NULL;
344 retval = 338 retval =
@@ -349,7 +343,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
349 inode = reiserfs_iget(dir->i_sb, 343 inode = reiserfs_iget(dir->i_sb,
350 (struct cpu_key *)&(de.de_dir_id)); 344 (struct cpu_key *)&(de.de_dir_id));
351 if (!inode || IS_ERR(inode)) { 345 if (!inode || IS_ERR(inode)) {
352 reiserfs_write_unlock_once(dir->i_sb, lock_depth); 346 reiserfs_write_unlock(dir->i_sb);
353 return ERR_PTR(-EACCES); 347 return ERR_PTR(-EACCES);
354 } 348 }
355 349
@@ -358,7 +352,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
358 if (IS_PRIVATE(dir)) 352 if (IS_PRIVATE(dir))
359 inode->i_flags |= S_PRIVATE; 353 inode->i_flags |= S_PRIVATE;
360 } 354 }
361 reiserfs_write_unlock_once(dir->i_sb, lock_depth); 355 reiserfs_write_unlock(dir->i_sb);
362 if (retval == IO_ERROR) { 356 if (retval == IO_ERROR) {
363 return ERR_PTR(-EIO); 357 return ERR_PTR(-EIO);
364 } 358 }
@@ -727,7 +721,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
727 struct inode *inode; 721 struct inode *inode;
728 struct reiserfs_transaction_handle th; 722 struct reiserfs_transaction_handle th;
729 struct reiserfs_security_handle security; 723 struct reiserfs_security_handle security;
730 int lock_depth;
731 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ 724 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
732 int jbegin_count = 725 int jbegin_count =
733 JOURNAL_PER_BALANCE_CNT * 3 + 726 JOURNAL_PER_BALANCE_CNT * 3 +
@@ -753,7 +746,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
753 return retval; 746 return retval;
754 } 747 }
755 jbegin_count += retval; 748 jbegin_count += retval;
756 lock_depth = reiserfs_write_lock_once(dir->i_sb); 749 reiserfs_write_lock(dir->i_sb);
757 750
758 retval = journal_begin(&th, dir->i_sb, jbegin_count); 751 retval = journal_begin(&th, dir->i_sb, jbegin_count);
759 if (retval) { 752 if (retval) {
@@ -804,7 +797,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
804 d_instantiate(dentry, inode); 797 d_instantiate(dentry, inode);
805 retval = journal_end(&th, dir->i_sb, jbegin_count); 798 retval = journal_end(&th, dir->i_sb, jbegin_count);
806out_failed: 799out_failed:
807 reiserfs_write_unlock_once(dir->i_sb, lock_depth); 800 reiserfs_write_unlock(dir->i_sb);
808 return retval; 801 return retval;
809} 802}
810 803
@@ -920,7 +913,6 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
920 struct reiserfs_transaction_handle th; 913 struct reiserfs_transaction_handle th;
921 int jbegin_count; 914 int jbegin_count;
922 unsigned long savelink; 915 unsigned long savelink;
923 int depth;
924 916
925 dquot_initialize(dir); 917 dquot_initialize(dir);
926 918
@@ -934,7 +926,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
934 JOURNAL_PER_BALANCE_CNT * 2 + 2 + 926 JOURNAL_PER_BALANCE_CNT * 2 + 2 +
935 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 927 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
936 928
937 depth = reiserfs_write_lock_once(dir->i_sb); 929 reiserfs_write_lock(dir->i_sb);
938 retval = journal_begin(&th, dir->i_sb, jbegin_count); 930 retval = journal_begin(&th, dir->i_sb, jbegin_count);
939 if (retval) 931 if (retval)
940 goto out_unlink; 932 goto out_unlink;
@@ -995,7 +987,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
995 987
996 retval = journal_end(&th, dir->i_sb, jbegin_count); 988 retval = journal_end(&th, dir->i_sb, jbegin_count);
997 reiserfs_check_path(&path); 989 reiserfs_check_path(&path);
998 reiserfs_write_unlock_once(dir->i_sb, depth); 990 reiserfs_write_unlock(dir->i_sb);
999 return retval; 991 return retval;
1000 992
1001 end_unlink: 993 end_unlink:
@@ -1005,7 +997,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
1005 if (err) 997 if (err)
1006 retval = err; 998 retval = err;
1007 out_unlink: 999 out_unlink:
1008 reiserfs_write_unlock_once(dir->i_sb, depth); 1000 reiserfs_write_unlock(dir->i_sb);
1009 return retval; 1001 return retval;
1010} 1002}
1011 1003
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index c0b1112ab7e3..54944d5a4a6e 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -358,12 +358,13 @@ void __reiserfs_panic(struct super_block *sb, const char *id,
358 dump_stack(); 358 dump_stack();
359#endif 359#endif
360 if (sb) 360 if (sb)
361 panic(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", 361 printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
362 sb->s_id, id ? id : "", id ? " " : "", 362 sb->s_id, id ? id : "", id ? " " : "",
363 function, error_buf); 363 function, error_buf);
364 else 364 else
365 panic(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", 365 printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
366 id ? id : "", id ? " " : "", function, error_buf); 366 id ? id : "", id ? " " : "", function, error_buf);
367 BUG();
367} 368}
368 369
369void __reiserfs_error(struct super_block *sb, const char *id, 370void __reiserfs_error(struct super_block *sb, const char *id,
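
The __reiserfs_panic() change reads as a logging fix: panic() does not interpret printk level prefixes, so the KERN_WARNING marker in the old calls ended up as literal text inside the panic message. Logging with printk() and then calling BUG() keeps the intended level and still stops the offending task with a backtrace (escalating to a full panic only when panic_on_oops is set). Assuming the usual panic()/BUG() semantics:

    /* old: "<4>" is embedded verbatim in the panic string */
    panic(KERN_WARNING "REISERFS panic (device %s): ...", sb->s_id);

    /* new: honored log level, then an oops with a backtrace */
    printk(KERN_WARNING "REISERFS panic (device %s): ...", sb->s_id);
    BUG();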
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 3df5ce6c724d..f8adaee537c2 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -630,8 +630,8 @@ static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
630 */ 630 */
631void reiserfs_write_lock(struct super_block *s); 631void reiserfs_write_lock(struct super_block *s);
632void reiserfs_write_unlock(struct super_block *s); 632void reiserfs_write_unlock(struct super_block *s);
633int reiserfs_write_lock_once(struct super_block *s); 633int __must_check reiserfs_write_unlock_nested(struct super_block *s);
634void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); 634void reiserfs_write_lock_nested(struct super_block *s, int depth);
635 635
636#ifdef CONFIG_REISERFS_CHECK 636#ifdef CONFIG_REISERFS_CHECK
637void reiserfs_lock_check_recursive(struct super_block *s); 637void reiserfs_lock_check_recursive(struct super_block *s);
@@ -667,31 +667,33 @@ static inline void reiserfs_lock_check_recursive(struct super_block *s) { }
667 * - The inode mutex 667 * - The inode mutex
668 */ 668 */
669static inline void reiserfs_mutex_lock_safe(struct mutex *m, 669static inline void reiserfs_mutex_lock_safe(struct mutex *m,
670 struct super_block *s) 670 struct super_block *s)
671{ 671{
672 reiserfs_lock_check_recursive(s); 672 int depth;
673 reiserfs_write_unlock(s); 673
674 depth = reiserfs_write_unlock_nested(s);
674 mutex_lock(m); 675 mutex_lock(m);
675 reiserfs_write_lock(s); 676 reiserfs_write_lock_nested(s, depth);
676} 677}
677 678
678static inline void 679static inline void
679reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, 680reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass,
680 struct super_block *s) 681 struct super_block *s)
681{ 682{
682 reiserfs_lock_check_recursive(s); 683 int depth;
683 reiserfs_write_unlock(s); 684
685 depth = reiserfs_write_unlock_nested(s);
684 mutex_lock_nested(m, subclass); 686 mutex_lock_nested(m, subclass);
685 reiserfs_write_lock(s); 687 reiserfs_write_lock_nested(s, depth);
686} 688}
687 689
688static inline void 690static inline void
689reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) 691reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
690{ 692{
691 reiserfs_lock_check_recursive(s); 693 int depth;
692 reiserfs_write_unlock(s); 694 depth = reiserfs_write_unlock_nested(s);
693 down_read(sem); 695 down_read(sem);
694 reiserfs_write_lock(s); 696 reiserfs_write_lock_nested(s, depth);
695} 697}
696 698
697/* 699/*
@@ -701,9 +703,11 @@ reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
701static inline void reiserfs_cond_resched(struct super_block *s) 703static inline void reiserfs_cond_resched(struct super_block *s)
702{ 704{
703 if (need_resched()) { 705 if (need_resched()) {
704 reiserfs_write_unlock(s); 706 int depth;
707
708 depth = reiserfs_write_unlock_nested(s);
705 schedule(); 709 schedule();
706 reiserfs_write_lock(s); 710 reiserfs_write_lock_nested(s, depth);
707 } 711 }
708} 712}
709 713
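
With reiserfs_cond_resched() handling the depth bookkeeping itself, callers shrink to a single line; this is exactly the substitution made in the inode.c and journal.c hunks above:

    /* before: open-coded in each caller */
    if (need_resched()) {
            reiserfs_write_unlock_once(inode->i_sb, lock_depth);
            schedule();
            lock_depth = reiserfs_write_lock_once(inode->i_sb);
    }

    /* after: one call, no per-caller depth variable */
    reiserfs_cond_resched(inode->i_sb);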
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 3ce02cff5e90..a4ef5cd606eb 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -34,6 +34,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
34 unsigned long int block_count, free_blocks; 34 unsigned long int block_count, free_blocks;
35 int i; 35 int i;
36 int copy_size; 36 int copy_size;
37 int depth;
37 38
38 sb = SB_DISK_SUPER_BLOCK(s); 39 sb = SB_DISK_SUPER_BLOCK(s);
39 40
@@ -43,7 +44,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
43 } 44 }
44 45
45 /* check the device size */ 46 /* check the device size */
47 depth = reiserfs_write_unlock_nested(s);
46 bh = sb_bread(s, block_count_new - 1); 48 bh = sb_bread(s, block_count_new - 1);
49 reiserfs_write_lock_nested(s, depth);
47 if (!bh) { 50 if (!bh) {
48 printk("reiserfs_resize: can\'t read last block\n"); 51 printk("reiserfs_resize: can\'t read last block\n");
49 return -EINVAL; 52 return -EINVAL;
@@ -125,9 +128,12 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
125 * transaction begins, and the new bitmaps don't matter if the 128 * transaction begins, and the new bitmaps don't matter if the
126 * transaction fails. */ 129 * transaction fails. */
127 for (i = bmap_nr; i < bmap_nr_new; i++) { 130 for (i = bmap_nr; i < bmap_nr_new; i++) {
131 int depth;
128 /* don't use read_bitmap_block since it will cache 132 /* don't use read_bitmap_block since it will cache
129 * the uninitialized bitmap */ 133 * the uninitialized bitmap */
134 depth = reiserfs_write_unlock_nested(s);
130 bh = sb_bread(s, i * s->s_blocksize * 8); 135 bh = sb_bread(s, i * s->s_blocksize * 8);
136 reiserfs_write_lock_nested(s, depth);
131 if (!bh) { 137 if (!bh) {
132 vfree(bitmap); 138 vfree(bitmap);
133 return -EIO; 139 return -EIO;
@@ -138,9 +144,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
138 144
139 set_buffer_uptodate(bh); 145 set_buffer_uptodate(bh);
140 mark_buffer_dirty(bh); 146 mark_buffer_dirty(bh);
141 reiserfs_write_unlock(s); 147 depth = reiserfs_write_unlock_nested(s);
142 sync_dirty_buffer(bh); 148 sync_dirty_buffer(bh);
143 reiserfs_write_lock(s); 149 reiserfs_write_lock_nested(s, depth);
144 // update bitmap_info stuff 150 // update bitmap_info stuff
145 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; 151 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
146 brelse(bh); 152 brelse(bh);
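Both resize hunks bracket sb_bread() the same way: the read may sleep on disk I/O, and holding the write lock across that wait would stall every other lock holder. A hypothetical helper capturing the bracket (not part of the patch, just the pattern factored out):

#include <linux/buffer_head.h>

/* Drop the write lock across a sleeping buffer read, then restore it. */
static struct buffer_head *bread_unlocked(struct super_block *s,
					  sector_t block)
{
	struct buffer_head *bh;
	int depth;

	depth = reiserfs_write_unlock_nested(s);
	bh = sb_bread(s, block);	/* may sleep waiting for I/O */
	reiserfs_write_lock_nested(s, depth);

	return bh;			/* NULL if the read failed */
}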
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 2f40a4c70a4d..b14706a05d52 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -524,14 +524,14 @@ static int is_tree_node(struct buffer_head *bh, int level)
524 * the caller (search_by_key) will perform other schedule-unsafe 524 * the caller (search_by_key) will perform other schedule-unsafe
525 * operations just after calling this function. 525 * operations just after calling this function.
526 * 526 *
527 * @return true if we have unlocked 527 * @return depth of lock to be restored after read completes
528 */ 528 */
529static bool search_by_key_reada(struct super_block *s, 529static int search_by_key_reada(struct super_block *s,
530 struct buffer_head **bh, 530 struct buffer_head **bh,
531 b_blocknr_t *b, int num) 531 b_blocknr_t *b, int num)
532{ 532{
533 int i, j; 533 int i, j;
534 bool unlocked = false; 534 int depth = -1;
535 535
536 for (i = 0; i < num; i++) { 536 for (i = 0; i < num; i++) {
537 bh[i] = sb_getblk(s, b[i]); 537 bh[i] = sb_getblk(s, b[i]);
@@ -549,15 +549,13 @@ static bool search_by_key_reada(struct super_block *s,
549 * you have to make sure the prepared bit isn't set on this buffer 549 * you have to make sure the prepared bit isn't set on this buffer
550 */ 550 */
551 if (!buffer_uptodate(bh[j])) { 551 if (!buffer_uptodate(bh[j])) {
552 if (!unlocked) { 552 if (depth == -1)
553 reiserfs_write_unlock(s); 553 depth = reiserfs_write_unlock_nested(s);
554 unlocked = true;
555 }
556 ll_rw_block(READA, 1, bh + j); 554 ll_rw_block(READA, 1, bh + j);
557 } 555 }
558 brelse(bh[j]); 556 brelse(bh[j]);
559 } 557 }
560 return unlocked; 558 return depth;
561} 559}
562 560
563/************************************************************************** 561/**************************************************************************
@@ -645,26 +643,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
645 have a pointer to it. */ 643 have a pointer to it. */
646 if ((bh = last_element->pe_buffer = 644 if ((bh = last_element->pe_buffer =
647 sb_getblk(sb, block_number))) { 645 sb_getblk(sb, block_number))) {
648 bool unlocked = false;
649 646
650 if (!buffer_uptodate(bh) && reada_count > 1)
651 /* may unlock the write lock */
652 unlocked = search_by_key_reada(sb, reada_bh,
653 reada_blocks, reada_count);
654 /* 647 /*
655 * If we haven't already unlocked the write lock, 648 * We'll need to drop the lock if we encounter any
656 * then we need to do that here before reading 649 * buffers that need to be read. If all of them are
657 * the current block 650 * already up to date, we don't need to drop the lock.
658 */ 651 */
659 if (!buffer_uptodate(bh) && !unlocked) { 652 int depth = -1;
660 reiserfs_write_unlock(sb); 653
661 unlocked = true; 654 if (!buffer_uptodate(bh) && reada_count > 1)
662 } 655 depth = search_by_key_reada(sb, reada_bh,
656 reada_blocks, reada_count);
657
658 if (!buffer_uptodate(bh) && depth == -1)
659 depth = reiserfs_write_unlock_nested(sb);
660
663 ll_rw_block(READ, 1, &bh); 661 ll_rw_block(READ, 1, &bh);
664 wait_on_buffer(bh); 662 wait_on_buffer(bh);
665 663
666 if (unlocked) 664 if (depth != -1)
667 reiserfs_write_lock(sb); 665 reiserfs_write_lock_nested(sb, depth);
668 if (!buffer_uptodate(bh)) 666 if (!buffer_uptodate(bh))
669 goto io_error; 667 goto io_error;
670 } else { 668 } else {
@@ -1059,9 +1057,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
1059 reiserfs_free_block(th, inode, block, 1); 1057 reiserfs_free_block(th, inode, block, 1);
1060 } 1058 }
1061 1059
1062 reiserfs_write_unlock(sb); 1060 reiserfs_cond_resched(sb);
1063 cond_resched();
1064 reiserfs_write_lock(sb);
1065 1061
1066 if (item_moved (&s_ih, path)) { 1062 if (item_moved (&s_ih, path)) {
1067 need_re_search = 1; 1063 need_re_search = 1;
@@ -1190,6 +1186,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1190 struct item_head *q_ih; 1186 struct item_head *q_ih;
1191 int quota_cut_bytes; 1187 int quota_cut_bytes;
1192 int ret_value, del_size, removed; 1188 int ret_value, del_size, removed;
1189 int depth;
1193 1190
1194#ifdef CONFIG_REISERFS_CHECK 1191#ifdef CONFIG_REISERFS_CHECK
1195 char mode; 1192 char mode;
@@ -1299,7 +1296,9 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1299 "reiserquota delete_item(): freeing %u, id=%u type=%c", 1296 "reiserquota delete_item(): freeing %u, id=%u type=%c",
1300 quota_cut_bytes, inode->i_uid, head2type(&s_ih)); 1297 quota_cut_bytes, inode->i_uid, head2type(&s_ih));
1301#endif 1298#endif
1299 depth = reiserfs_write_unlock_nested(inode->i_sb);
1302 dquot_free_space_nodirty(inode, quota_cut_bytes); 1300 dquot_free_space_nodirty(inode, quota_cut_bytes);
1301 reiserfs_write_lock_nested(inode->i_sb, depth);
1303 1302
1304 /* Return deleted body length */ 1303 /* Return deleted body length */
1305 return ret_value; 1304 return ret_value;
@@ -1325,6 +1324,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1325void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, 1324void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1326 struct inode *inode, struct reiserfs_key *key) 1325 struct inode *inode, struct reiserfs_key *key)
1327{ 1326{
1327 struct super_block *sb = th->t_super;
1328 struct tree_balance tb; 1328 struct tree_balance tb;
1329 INITIALIZE_PATH(path); 1329 INITIALIZE_PATH(path);
1330 int item_len = 0; 1330 int item_len = 0;
@@ -1377,14 +1377,17 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1377 if (retval == CARRY_ON) { 1377 if (retval == CARRY_ON) {
1378 do_balance(&tb, NULL, NULL, M_DELETE); 1378 do_balance(&tb, NULL, NULL, M_DELETE);
1379 if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ 1379 if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */
1380 int depth;
1380#ifdef REISERQUOTA_DEBUG 1381#ifdef REISERQUOTA_DEBUG
1381 reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, 1382 reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
1382 "reiserquota delete_solid_item(): freeing %u id=%u type=%c", 1383 "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
1383 quota_cut_bytes, inode->i_uid, 1384 quota_cut_bytes, inode->i_uid,
1384 key2type(key)); 1385 key2type(key));
1385#endif 1386#endif
1387 depth = reiserfs_write_unlock_nested(sb);
1386 dquot_free_space_nodirty(inode, 1388 dquot_free_space_nodirty(inode,
1387 quota_cut_bytes); 1389 quota_cut_bytes);
1390 reiserfs_write_lock_nested(sb, depth);
1388 } 1391 }
1389 break; 1392 break;
1390 } 1393 }
@@ -1561,6 +1564,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1561 int retval2 = -1; 1564 int retval2 = -1;
1562 int quota_cut_bytes; 1565 int quota_cut_bytes;
1563 loff_t tail_pos = 0; 1566 loff_t tail_pos = 0;
1567 int depth;
1564 1568
1565 BUG_ON(!th->t_trans_id); 1569 BUG_ON(!th->t_trans_id);
1566 1570
@@ -1733,7 +1737,9 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1733 "reiserquota cut_from_item(): freeing %u id=%u type=%c", 1737 "reiserquota cut_from_item(): freeing %u id=%u type=%c",
1734 quota_cut_bytes, inode->i_uid, '?'); 1738 quota_cut_bytes, inode->i_uid, '?');
1735#endif 1739#endif
1740 depth = reiserfs_write_unlock_nested(sb);
1736 dquot_free_space_nodirty(inode, quota_cut_bytes); 1741 dquot_free_space_nodirty(inode, quota_cut_bytes);
1742 reiserfs_write_lock_nested(sb, depth);
1737 return ret_value; 1743 return ret_value;
1738} 1744}
1739 1745
@@ -1953,9 +1959,11 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1953 const char *body, /* Pointer to the bytes to paste. */ 1959 const char *body, /* Pointer to the bytes to paste. */
1954 int pasted_size) 1960 int pasted_size)
1955{ /* Size of pasted bytes. */ 1961{ /* Size of pasted bytes. */
1962 struct super_block *sb = inode->i_sb;
1956 struct tree_balance s_paste_balance; 1963 struct tree_balance s_paste_balance;
1957 int retval; 1964 int retval;
1958 int fs_gen; 1965 int fs_gen;
1966 int depth;
1959 1967
1960 BUG_ON(!th->t_trans_id); 1968 BUG_ON(!th->t_trans_id);
1961 1969
@@ -1968,9 +1976,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1968 key2type(&(key->on_disk_key))); 1976 key2type(&(key->on_disk_key)));
1969#endif 1977#endif
1970 1978
1971 reiserfs_write_unlock(inode->i_sb); 1979 depth = reiserfs_write_unlock_nested(sb);
1972 retval = dquot_alloc_space_nodirty(inode, pasted_size); 1980 retval = dquot_alloc_space_nodirty(inode, pasted_size);
1973 reiserfs_write_lock(inode->i_sb); 1981 reiserfs_write_lock_nested(sb, depth);
1974 if (retval) { 1982 if (retval) {
1975 pathrelse(search_path); 1983 pathrelse(search_path);
1976 return retval; 1984 return retval;
@@ -2027,7 +2035,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
2027 pasted_size, inode->i_uid, 2035 pasted_size, inode->i_uid,
2028 key2type(&(key->on_disk_key))); 2036 key2type(&(key->on_disk_key)));
2029#endif 2037#endif
2038 depth = reiserfs_write_unlock_nested(sb);
2030 dquot_free_space_nodirty(inode, pasted_size); 2039 dquot_free_space_nodirty(inode, pasted_size);
2040 reiserfs_write_lock_nested(sb, depth);
2031 return retval; 2041 return retval;
2032} 2042}
2033 2043
@@ -2050,6 +2060,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2050 BUG_ON(!th->t_trans_id); 2060 BUG_ON(!th->t_trans_id);
2051 2061
2052 if (inode) { /* Do we count quotas for item? */ 2062 if (inode) { /* Do we count quotas for item? */
2063 int depth;
2053 fs_gen = get_generation(inode->i_sb); 2064 fs_gen = get_generation(inode->i_sb);
2054 quota_bytes = ih_item_len(ih); 2065 quota_bytes = ih_item_len(ih);
2055 2066
@@ -2063,11 +2074,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2063 "reiserquota insert_item(): allocating %u id=%u type=%c", 2074 "reiserquota insert_item(): allocating %u id=%u type=%c",
2064 quota_bytes, inode->i_uid, head2type(ih)); 2075 quota_bytes, inode->i_uid, head2type(ih));
2065#endif 2076#endif
2066 reiserfs_write_unlock(inode->i_sb);
2067 /* We can't dirty inode here. It would be immediately written but 2077 /* We can't dirty inode here. It would be immediately written but
2068 * appropriate stat item isn't inserted yet... */ 2078 * appropriate stat item isn't inserted yet... */
2079 depth = reiserfs_write_unlock_nested(inode->i_sb);
2069 retval = dquot_alloc_space_nodirty(inode, quota_bytes); 2080 retval = dquot_alloc_space_nodirty(inode, quota_bytes);
2070 reiserfs_write_lock(inode->i_sb); 2081 reiserfs_write_lock_nested(inode->i_sb, depth);
2071 if (retval) { 2082 if (retval) {
2072 pathrelse(path); 2083 pathrelse(path);
2073 return retval; 2084 return retval;
@@ -2118,7 +2129,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2118 "reiserquota insert_item(): freeing %u id=%u type=%c", 2129 "reiserquota insert_item(): freeing %u id=%u type=%c",
2119 quota_bytes, inode->i_uid, head2type(ih)); 2130 quota_bytes, inode->i_uid, head2type(ih));
2120#endif 2131#endif
2121 if (inode) 2132 if (inode) {
2133 int depth = reiserfs_write_unlock_nested(inode->i_sb);
2122 dquot_free_space_nodirty(inode, quota_bytes); 2134 dquot_free_space_nodirty(inode, quota_bytes);
2135 reiserfs_write_lock_nested(inode->i_sb, depth);
2136 }
2123 return retval; 2137 return retval;
2124} 2138}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index e2e202a07b31..3ead145dadc4 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -243,6 +243,7 @@ static int finish_unfinished(struct super_block *s)
243 done = 0; 243 done = 0;
244 REISERFS_SB(s)->s_is_unlinked_ok = 1; 244 REISERFS_SB(s)->s_is_unlinked_ok = 1;
245 while (!retval) { 245 while (!retval) {
246 int depth;
246 retval = search_item(s, &max_cpu_key, &path); 247 retval = search_item(s, &max_cpu_key, &path);
247 if (retval != ITEM_NOT_FOUND) { 248 if (retval != ITEM_NOT_FOUND) {
248 reiserfs_error(s, "vs-2140", 249 reiserfs_error(s, "vs-2140",
@@ -298,9 +299,9 @@ static int finish_unfinished(struct super_block *s)
298 retval = remove_save_link_only(s, &save_link_key, 0); 299 retval = remove_save_link_only(s, &save_link_key, 0);
299 continue; 300 continue;
300 } 301 }
301 reiserfs_write_unlock(s); 302 depth = reiserfs_write_unlock_nested(inode->i_sb);
302 dquot_initialize(inode); 303 dquot_initialize(inode);
303 reiserfs_write_lock(s); 304 reiserfs_write_lock_nested(inode->i_sb, depth);
304 305
305 if (truncate && S_ISDIR(inode->i_mode)) { 306 if (truncate && S_ISDIR(inode->i_mode)) {
306 /* We got a truncate request for a dir which is impossible. 307 /* We got a truncate request for a dir which is impossible.
@@ -356,10 +357,12 @@ static int finish_unfinished(struct super_block *s)
356 357
357#ifdef CONFIG_QUOTA 358#ifdef CONFIG_QUOTA
358 /* Turn quotas off */ 359 /* Turn quotas off */
360 reiserfs_write_unlock(s);
359 for (i = 0; i < MAXQUOTAS; i++) { 361 for (i = 0; i < MAXQUOTAS; i++) {
360 if (sb_dqopt(s)->files[i] && quota_enabled[i]) 362 if (sb_dqopt(s)->files[i] && quota_enabled[i])
361 dquot_quota_off(s, i); 363 dquot_quota_off(s, i);
362 } 364 }
365 reiserfs_write_lock(s);
363 if (ms_active_set) 366 if (ms_active_set)
364 /* Restore the flag back */ 367 /* Restore the flag back */
365 s->s_flags &= ~MS_ACTIVE; 368 s->s_flags &= ~MS_ACTIVE;
@@ -623,7 +626,6 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags)
623 struct reiserfs_transaction_handle th; 626 struct reiserfs_transaction_handle th;
624 627
625 int err = 0; 628 int err = 0;
626 int lock_depth;
627 629
628 if (inode->i_sb->s_flags & MS_RDONLY) { 630 if (inode->i_sb->s_flags & MS_RDONLY) {
629 reiserfs_warning(inode->i_sb, "clm-6006", 631 reiserfs_warning(inode->i_sb, "clm-6006",
@@ -631,7 +633,7 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags)
631 inode->i_ino); 633 inode->i_ino);
632 return; 634 return;
633 } 635 }
634 lock_depth = reiserfs_write_lock_once(inode->i_sb); 636 reiserfs_write_lock(inode->i_sb);
635 637
636 /* this is really only used for atime updates, so they don't have 638 /* this is really only used for atime updates, so they don't have
637 ** to be included in O_SYNC or fsync 639 ** to be included in O_SYNC or fsync
@@ -644,7 +646,7 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags)
644 journal_end(&th, inode->i_sb, 1); 646 journal_end(&th, inode->i_sb, 1);
645 647
646out: 648out:
647 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 649 reiserfs_write_unlock(inode->i_sb);
648} 650}
649 651
650static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) 652static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
@@ -1334,7 +1336,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1334 kfree(qf_names[i]); 1336 kfree(qf_names[i]);
1335#endif 1337#endif
1336 err = -EINVAL; 1338 err = -EINVAL;
1337 goto out_unlock; 1339 goto out_err_unlock;
1338 } 1340 }
1339#ifdef CONFIG_QUOTA 1341#ifdef CONFIG_QUOTA
1340 handle_quota_files(s, qf_names, &qfmt); 1342 handle_quota_files(s, qf_names, &qfmt);
@@ -1378,35 +1380,32 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1378 if (blocks) { 1380 if (blocks) {
1379 err = reiserfs_resize(s, blocks); 1381 err = reiserfs_resize(s, blocks);
1380 if (err != 0) 1382 if (err != 0)
1381 goto out_unlock; 1383 goto out_err_unlock;
1382 } 1384 }
1383 1385
1384 if (*mount_flags & MS_RDONLY) { 1386 if (*mount_flags & MS_RDONLY) {
1387 reiserfs_write_unlock(s);
1385 reiserfs_xattr_init(s, *mount_flags); 1388 reiserfs_xattr_init(s, *mount_flags);
1386 /* remount read-only */ 1389 /* remount read-only */
1387 if (s->s_flags & MS_RDONLY) 1390 if (s->s_flags & MS_RDONLY)
1388 /* it is read-only already */ 1391 /* it is read-only already */
1389 goto out_ok; 1392 goto out_ok_unlocked;
1390 1393
1391 /*
1392 * Drop write lock. Quota will retake it when needed and lock
1393 * ordering requires calling dquot_suspend() without it.
1394 */
1395 reiserfs_write_unlock(s);
1396 err = dquot_suspend(s, -1); 1394 err = dquot_suspend(s, -1);
1397 if (err < 0) 1395 if (err < 0)
1398 goto out_err; 1396 goto out_err;
1399 reiserfs_write_lock(s);
1400 1397
1401 /* try to remount file system with read-only permissions */ 1398 /* try to remount file system with read-only permissions */
1402 if (sb_umount_state(rs) == REISERFS_VALID_FS 1399 if (sb_umount_state(rs) == REISERFS_VALID_FS
1403 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { 1400 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
1404 goto out_ok; 1401 goto out_ok_unlocked;
1405 } 1402 }
1406 1403
1404 reiserfs_write_lock(s);
1405
1407 err = journal_begin(&th, s, 10); 1406 err = journal_begin(&th, s, 10);
1408 if (err) 1407 if (err)
1409 goto out_unlock; 1408 goto out_err_unlock;
1410 1409
1411 /* Mounting a rw partition read-only. */ 1410 /* Mounting a rw partition read-only. */
1412 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); 1411 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1415,13 +1414,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1415 } else { 1414 } else {
1416 /* remount read-write */ 1415 /* remount read-write */
1417 if (!(s->s_flags & MS_RDONLY)) { 1416 if (!(s->s_flags & MS_RDONLY)) {
1417 reiserfs_write_unlock(s);
1418 reiserfs_xattr_init(s, *mount_flags); 1418 reiserfs_xattr_init(s, *mount_flags);
1419 goto out_ok; /* We are read-write already */ 1419 goto out_ok_unlocked; /* We are read-write already */
1420 } 1420 }
1421 1421
1422 if (reiserfs_is_journal_aborted(journal)) { 1422 if (reiserfs_is_journal_aborted(journal)) {
1423 err = journal->j_errno; 1423 err = journal->j_errno;
1424 goto out_unlock; 1424 goto out_err_unlock;
1425 } 1425 }
1426 1426
1427 handle_data_mode(s, mount_options); 1427 handle_data_mode(s, mount_options);
@@ -1430,7 +1430,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1430 s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ 1430 s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */
1431 err = journal_begin(&th, s, 10); 1431 err = journal_begin(&th, s, 10);
1432 if (err) 1432 if (err)
1433 goto out_unlock; 1433 goto out_err_unlock;
1434 1434
1435 /* Mount a partition which is read-only, read-write */ 1435 /* Mount a partition which is read-only, read-write */
1436 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); 1436 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1447,26 +1447,22 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1447 SB_JOURNAL(s)->j_must_wait = 1; 1447 SB_JOURNAL(s)->j_must_wait = 1;
1448 err = journal_end(&th, s, 10); 1448 err = journal_end(&th, s, 10);
1449 if (err) 1449 if (err)
1450 goto out_unlock; 1450 goto out_err_unlock;
1451 1451
1452 reiserfs_write_unlock(s);
1452 if (!(*mount_flags & MS_RDONLY)) { 1453 if (!(*mount_flags & MS_RDONLY)) {
1453 /*
1454 * Drop write lock. Quota will retake it when needed and lock
1455 * ordering requires calling dquot_resume() without it.
1456 */
1457 reiserfs_write_unlock(s);
1458 dquot_resume(s, -1); 1454 dquot_resume(s, -1);
1459 reiserfs_write_lock(s); 1455 reiserfs_write_lock(s);
1460 finish_unfinished(s); 1456 finish_unfinished(s);
1457 reiserfs_write_unlock(s);
1461 reiserfs_xattr_init(s, *mount_flags); 1458 reiserfs_xattr_init(s, *mount_flags);
1462 } 1459 }
1463 1460
1464out_ok: 1461out_ok_unlocked:
1465 replace_mount_options(s, new_opts); 1462 replace_mount_options(s, new_opts);
1466 reiserfs_write_unlock(s);
1467 return 0; 1463 return 0;
1468 1464
1469out_unlock: 1465out_err_unlock:
1470 reiserfs_write_unlock(s); 1466 reiserfs_write_unlock(s);
1471out_err: 1467out_err:
1472 kfree(new_opts); 1468 kfree(new_opts);
@@ -2013,12 +2009,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
2013 goto error; 2009 goto error;
2014 } 2010 }
2015 2011
2012 reiserfs_write_unlock(s);
2016 if ((errval = reiserfs_lookup_privroot(s)) || 2013 if ((errval = reiserfs_lookup_privroot(s)) ||
2017 (errval = reiserfs_xattr_init(s, s->s_flags))) { 2014 (errval = reiserfs_xattr_init(s, s->s_flags))) {
2018 dput(s->s_root); 2015 dput(s->s_root);
2019 s->s_root = NULL; 2016 s->s_root = NULL;
2020 goto error; 2017 goto error_unlocked;
2021 } 2018 }
2019 reiserfs_write_lock(s);
2022 2020
2023 /* look for files which were to be removed in previous session */ 2021 /* look for files which were to be removed in previous session */
2024 finish_unfinished(s); 2022 finish_unfinished(s);
@@ -2027,12 +2025,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
2027 reiserfs_info(s, "using 3.5.x disk format\n"); 2025 reiserfs_info(s, "using 3.5.x disk format\n");
2028 } 2026 }
2029 2027
2028 reiserfs_write_unlock(s);
2030 if ((errval = reiserfs_lookup_privroot(s)) || 2029 if ((errval = reiserfs_lookup_privroot(s)) ||
2031 (errval = reiserfs_xattr_init(s, s->s_flags))) { 2030 (errval = reiserfs_xattr_init(s, s->s_flags))) {
2032 dput(s->s_root); 2031 dput(s->s_root);
2033 s->s_root = NULL; 2032 s->s_root = NULL;
2034 goto error; 2033 goto error_unlocked;
2035 } 2034 }
2035 reiserfs_write_lock(s);
2036 } 2036 }
2037 // mark hash in super block: it could be unset. overwrite should be ok 2037 // mark hash in super block: it could be unset. overwrite should be ok
2038 set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); 2038 set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
@@ -2100,6 +2100,7 @@ static int reiserfs_write_dquot(struct dquot *dquot)
2100{ 2100{
2101 struct reiserfs_transaction_handle th; 2101 struct reiserfs_transaction_handle th;
2102 int ret, err; 2102 int ret, err;
2103 int depth;
2103 2104
2104 reiserfs_write_lock(dquot->dq_sb); 2105 reiserfs_write_lock(dquot->dq_sb);
2105 ret = 2106 ret =
@@ -2107,9 +2108,9 @@ static int reiserfs_write_dquot(struct dquot *dquot)
2107 REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); 2108 REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2108 if (ret) 2109 if (ret)
2109 goto out; 2110 goto out;
2110 reiserfs_write_unlock(dquot->dq_sb); 2111 depth = reiserfs_write_unlock_nested(dquot->dq_sb);
2111 ret = dquot_commit(dquot); 2112 ret = dquot_commit(dquot);
2112 reiserfs_write_lock(dquot->dq_sb); 2113 reiserfs_write_lock_nested(dquot->dq_sb, depth);
2113 err = 2114 err =
2114 journal_end(&th, dquot->dq_sb, 2115 journal_end(&th, dquot->dq_sb,
2115 REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); 2116 REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
@@ -2124,6 +2125,7 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
2124{ 2125{
2125 struct reiserfs_transaction_handle th; 2126 struct reiserfs_transaction_handle th;
2126 int ret, err; 2127 int ret, err;
2128 int depth;
2127 2129
2128 reiserfs_write_lock(dquot->dq_sb); 2130 reiserfs_write_lock(dquot->dq_sb);
2129 ret = 2131 ret =
@@ -2131,9 +2133,9 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
2131 REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); 2133 REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2132 if (ret) 2134 if (ret)
2133 goto out; 2135 goto out;
2134 reiserfs_write_unlock(dquot->dq_sb); 2136 depth = reiserfs_write_unlock_nested(dquot->dq_sb);
2135 ret = dquot_acquire(dquot); 2137 ret = dquot_acquire(dquot);
2136 reiserfs_write_lock(dquot->dq_sb); 2138 reiserfs_write_lock_nested(dquot->dq_sb, depth);
2137 err = 2139 err =
2138 journal_end(&th, dquot->dq_sb, 2140 journal_end(&th, dquot->dq_sb,
2139 REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); 2141 REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
@@ -2186,15 +2188,16 @@ static int reiserfs_write_info(struct super_block *sb, int type)
2186{ 2188{
2187 struct reiserfs_transaction_handle th; 2189 struct reiserfs_transaction_handle th;
2188 int ret, err; 2190 int ret, err;
2191 int depth;
2189 2192
2190 /* Data block + inode block */ 2193 /* Data block + inode block */
2191 reiserfs_write_lock(sb); 2194 reiserfs_write_lock(sb);
2192 ret = journal_begin(&th, sb, 2); 2195 ret = journal_begin(&th, sb, 2);
2193 if (ret) 2196 if (ret)
2194 goto out; 2197 goto out;
2195 reiserfs_write_unlock(sb); 2198 depth = reiserfs_write_unlock_nested(sb);
2196 ret = dquot_commit_info(sb, type); 2199 ret = dquot_commit_info(sb, type);
2197 reiserfs_write_lock(sb); 2200 reiserfs_write_lock_nested(sb, depth);
2198 err = journal_end(&th, sb, 2); 2201 err = journal_end(&th, sb, 2);
2199 if (!ret && err) 2202 if (!ret && err)
2200 ret = err; 2203 ret = err;
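The remount rework above also renames the exit labels so the name itself records the lock state on arrival (out_ok_unlocked, out_err_unlock, out_err). A small hypothetical function showing the convention; the failing step is invented for illustration and is not a real reiserfs call:

static int remount_like(struct super_block *s, char *new_opts)
{
	int err;

	reiserfs_write_lock(s);

	err = hypothetical_journal_step(s);	/* stand-in for journal_begin() etc. */
	if (err)
		goto out_err_unlock;		/* the lock is still held here */

	reiserfs_write_unlock(s);
	return 0;				/* the out_ok_unlocked path */

out_err_unlock:					/* error paths arrive locked */
	reiserfs_write_unlock(s);
	kfree(new_opts);
	return err;
}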
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index c69cdd749f09..8a9e2dcfe004 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -81,8 +81,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
81 int error; 81 int error;
82 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 82 BUG_ON(!mutex_is_locked(&dir->i_mutex));
83 83
84 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, 84 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
85 I_MUTEX_CHILD, dir->i_sb);
86 error = dir->i_op->unlink(dir, dentry); 85 error = dir->i_op->unlink(dir, dentry);
87 mutex_unlock(&dentry->d_inode->i_mutex); 86 mutex_unlock(&dentry->d_inode->i_mutex);
88 87
@@ -96,8 +95,7 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
96 int error; 95 int error;
97 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 96 BUG_ON(!mutex_is_locked(&dir->i_mutex));
98 97
99 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, 98 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
100 I_MUTEX_CHILD, dir->i_sb);
101 error = dir->i_op->rmdir(dir, dentry); 99 error = dir->i_op->rmdir(dir, dentry);
102 if (!error) 100 if (!error)
103 dentry->d_inode->i_flags |= S_DEAD; 101 dentry->d_inode->i_flags |= S_DEAD;
@@ -232,22 +230,17 @@ static int reiserfs_for_each_xattr(struct inode *inode,
232 if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) 230 if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
233 return 0; 231 return 0;
234 232
235 reiserfs_write_unlock(inode->i_sb);
236 dir = open_xa_dir(inode, XATTR_REPLACE); 233 dir = open_xa_dir(inode, XATTR_REPLACE);
237 if (IS_ERR(dir)) { 234 if (IS_ERR(dir)) {
238 err = PTR_ERR(dir); 235 err = PTR_ERR(dir);
239 reiserfs_write_lock(inode->i_sb);
240 goto out; 236 goto out;
241 } else if (!dir->d_inode) { 237 } else if (!dir->d_inode) {
242 err = 0; 238 err = 0;
243 reiserfs_write_lock(inode->i_sb);
244 goto out_dir; 239 goto out_dir;
245 } 240 }
246 241
247 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); 242 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
248 243
249 reiserfs_write_lock(inode->i_sb);
250
251 buf.xadir = dir; 244 buf.xadir = dir;
252 while (1) { 245 while (1) {
253 err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); 246 err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
@@ -281,14 +274,17 @@ static int reiserfs_for_each_xattr(struct inode *inode,
281 int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 274 int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
282 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); 275 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
283 struct reiserfs_transaction_handle th; 276 struct reiserfs_transaction_handle th;
277 reiserfs_write_lock(inode->i_sb);
284 err = journal_begin(&th, inode->i_sb, blocks); 278 err = journal_begin(&th, inode->i_sb, blocks);
279 reiserfs_write_unlock(inode->i_sb);
285 if (!err) { 280 if (!err) {
286 int jerror; 281 int jerror;
287 reiserfs_mutex_lock_nested_safe( 282 mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
288 &dir->d_parent->d_inode->i_mutex, 283 I_MUTEX_XATTR);
289 I_MUTEX_XATTR, inode->i_sb);
290 err = action(dir, data); 284 err = action(dir, data);
285 reiserfs_write_lock(inode->i_sb);
291 jerror = journal_end(&th, inode->i_sb, blocks); 286 jerror = journal_end(&th, inode->i_sb, blocks);
287 reiserfs_write_unlock(inode->i_sb);
292 mutex_unlock(&dir->d_parent->d_inode->i_mutex); 288 mutex_unlock(&dir->d_parent->d_inode->i_mutex);
293 err = jerror ?: err; 289 err = jerror ?: err;
294 } 290 }
@@ -455,9 +451,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
455 } 451 }
456 452
457 if (dentry->d_inode) { 453 if (dentry->d_inode) {
458 reiserfs_write_lock(inode->i_sb);
459 err = xattr_unlink(xadir->d_inode, dentry); 454 err = xattr_unlink(xadir->d_inode, dentry);
460 reiserfs_write_unlock(inode->i_sb);
461 update_ctime(inode); 455 update_ctime(inode);
462 } 456 }
463 457
@@ -491,24 +485,17 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
491 if (get_inode_sd_version(inode) == STAT_DATA_V1) 485 if (get_inode_sd_version(inode) == STAT_DATA_V1)
492 return -EOPNOTSUPP; 486 return -EOPNOTSUPP;
493 487
494 reiserfs_write_unlock(inode->i_sb);
495
496 if (!buffer) { 488 if (!buffer) {
497 err = lookup_and_delete_xattr(inode, name); 489 err = lookup_and_delete_xattr(inode, name);
498 reiserfs_write_lock(inode->i_sb);
499 return err; 490 return err;
500 } 491 }
501 492
502 dentry = xattr_lookup(inode, name, flags); 493 dentry = xattr_lookup(inode, name, flags);
503 if (IS_ERR(dentry)) { 494 if (IS_ERR(dentry))
504 reiserfs_write_lock(inode->i_sb);
505 return PTR_ERR(dentry); 495 return PTR_ERR(dentry);
506 }
507 496
508 down_write(&REISERFS_I(inode)->i_xattr_sem); 497 down_write(&REISERFS_I(inode)->i_xattr_sem);
509 498
510 reiserfs_write_lock(inode->i_sb);
511
512 xahash = xattr_hash(buffer, buffer_size); 499 xahash = xattr_hash(buffer, buffer_size);
513 while (buffer_pos < buffer_size || buffer_pos == 0) { 500 while (buffer_pos < buffer_size || buffer_pos == 0) {
514 size_t chunk; 501 size_t chunk;
@@ -538,6 +525,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
538 rxh->h_hash = cpu_to_le32(xahash); 525 rxh->h_hash = cpu_to_le32(xahash);
539 } 526 }
540 527
528 reiserfs_write_lock(inode->i_sb);
541 err = __reiserfs_write_begin(page, page_offset, chunk + skip); 529 err = __reiserfs_write_begin(page, page_offset, chunk + skip);
542 if (!err) { 530 if (!err) {
543 if (buffer) 531 if (buffer)
@@ -546,6 +534,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
546 page_offset + chunk + 534 page_offset + chunk +
547 skip); 535 skip);
548 } 536 }
537 reiserfs_write_unlock(inode->i_sb);
549 unlock_page(page); 538 unlock_page(page);
550 reiserfs_put_page(page); 539 reiserfs_put_page(page);
551 buffer_pos += chunk; 540 buffer_pos += chunk;
@@ -563,10 +552,8 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
563 .ia_valid = ATTR_SIZE | ATTR_CTIME, 552 .ia_valid = ATTR_SIZE | ATTR_CTIME,
564 }; 553 };
565 554
566 reiserfs_write_unlock(inode->i_sb);
567 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); 555 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
568 inode_dio_wait(dentry->d_inode); 556 inode_dio_wait(dentry->d_inode);
569 reiserfs_write_lock(inode->i_sb);
570 557
571 err = reiserfs_setattr(dentry, &newattrs); 558 err = reiserfs_setattr(dentry, &newattrs);
572 mutex_unlock(&dentry->d_inode->i_mutex); 559 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -592,18 +579,19 @@ int reiserfs_xattr_set(struct inode *inode, const char *name,
592 579
593 reiserfs_write_lock(inode->i_sb); 580 reiserfs_write_lock(inode->i_sb);
594 error = journal_begin(&th, inode->i_sb, jbegin_count); 581 error = journal_begin(&th, inode->i_sb, jbegin_count);
582 reiserfs_write_unlock(inode->i_sb);
595 if (error) { 583 if (error) {
596 reiserfs_write_unlock(inode->i_sb);
597 return error; 584 return error;
598 } 585 }
599 586
600 error = reiserfs_xattr_set_handle(&th, inode, name, 587 error = reiserfs_xattr_set_handle(&th, inode, name,
601 buffer, buffer_size, flags); 588 buffer, buffer_size, flags);
602 589
590 reiserfs_write_lock(inode->i_sb);
603 error2 = journal_end(&th, inode->i_sb, jbegin_count); 591 error2 = journal_end(&th, inode->i_sb, jbegin_count);
592 reiserfs_write_unlock(inode->i_sb);
604 if (error == 0) 593 if (error == 0)
605 error = error2; 594 error = error2;
606 reiserfs_write_unlock(inode->i_sb);
607 595
608 return error; 596 return error;
609} 597}
@@ -968,7 +956,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
968 int err = 0; 956 int err = 0;
969 957
970 /* If we don't have the privroot located yet - go find it */ 958 /* If we don't have the privroot located yet - go find it */
971 reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); 959 mutex_lock(&s->s_root->d_inode->i_mutex);
972 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, 960 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
973 strlen(PRIVROOT_NAME)); 961 strlen(PRIVROOT_NAME));
974 if (!IS_ERR(dentry)) { 962 if (!IS_ERR(dentry)) {
@@ -996,14 +984,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
996 goto error; 984 goto error;
997 985
998 if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { 986 if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
999 reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); 987 mutex_lock(&s->s_root->d_inode->i_mutex);
1000 err = create_privroot(REISERFS_SB(s)->priv_root); 988 err = create_privroot(REISERFS_SB(s)->priv_root);
1001 mutex_unlock(&s->s_root->d_inode->i_mutex); 989 mutex_unlock(&s->s_root->d_inode->i_mutex);
1002 } 990 }
1003 991
1004 if (privroot->d_inode) { 992 if (privroot->d_inode) {
1005 s->s_xattr = reiserfs_xattr_handlers; 993 s->s_xattr = reiserfs_xattr_handlers;
1006 reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s); 994 mutex_lock(&privroot->d_inode->i_mutex);
1007 if (!REISERFS_SB(s)->xattr_root) { 995 if (!REISERFS_SB(s)->xattr_root) {
1008 struct dentry *dentry; 996 struct dentry *dentry;
1009 dentry = lookup_one_len(XAROOT_NAME, privroot, 997 dentry = lookup_one_len(XAROOT_NAME, privroot,
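A recurring shape in the xattr hunks: only journal_begin() and journal_end() need the write lock, while the work between them now runs unlocked. Reconstructed from the hunk above, the body of reiserfs_xattr_set() ends up as (error handling as in the patch):

	reiserfs_write_lock(inode->i_sb);
	error = journal_begin(&th, inode->i_sb, jbegin_count);
	reiserfs_write_unlock(inode->i_sb);
	if (error)
		return error;

	/* The actual xattr work runs without the write lock. */
	error = reiserfs_xattr_set_handle(&th, inode, name,
					  buffer, buffer_size, flags);

	reiserfs_write_lock(inode->i_sb);
	error2 = journal_end(&th, inode->i_sb, jbegin_count);
	reiserfs_write_unlock(inode->i_sb);
	if (error == 0)
		error = error2;

	return error;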
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 6c8767fdfc6a..06c04f73da65 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -49,13 +49,15 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
49 49
50 reiserfs_write_lock(inode->i_sb); 50 reiserfs_write_lock(inode->i_sb);
51 error = journal_begin(&th, inode->i_sb, jcreate_blocks); 51 error = journal_begin(&th, inode->i_sb, jcreate_blocks);
52 reiserfs_write_unlock(inode->i_sb);
52 if (error == 0) { 53 if (error == 0) {
53 error = reiserfs_set_acl(&th, inode, type, acl); 54 error = reiserfs_set_acl(&th, inode, type, acl);
55 reiserfs_write_lock(inode->i_sb);
54 error2 = journal_end(&th, inode->i_sb, jcreate_blocks); 56 error2 = journal_end(&th, inode->i_sb, jcreate_blocks);
57 reiserfs_write_unlock(inode->i_sb);
55 if (error2) 58 if (error2)
56 error = error2; 59 error = error2;
57 } 60 }
58 reiserfs_write_unlock(inode->i_sb);
59 61
60 release_and_out: 62 release_and_out:
61 posix_acl_release(acl); 63 posix_acl_release(acl);
@@ -435,12 +437,14 @@ int reiserfs_cache_default_acl(struct inode *inode)
435 return nblocks; 437 return nblocks;
436} 438}
437 439
440/*
441 * Called under i_mutex
442 */
438int reiserfs_acl_chmod(struct inode *inode) 443int reiserfs_acl_chmod(struct inode *inode)
439{ 444{
440 struct reiserfs_transaction_handle th; 445 struct reiserfs_transaction_handle th;
441 struct posix_acl *acl; 446 struct posix_acl *acl;
442 size_t size; 447 size_t size;
443 int depth;
444 int error; 448 int error;
445 449
446 if (IS_PRIVATE(inode)) 450 if (IS_PRIVATE(inode))
@@ -454,9 +458,7 @@ int reiserfs_acl_chmod(struct inode *inode)
454 return 0; 458 return 0;
455 } 459 }
456 460
457 reiserfs_write_unlock(inode->i_sb);
458 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); 461 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
459 reiserfs_write_lock(inode->i_sb);
460 if (!acl) 462 if (!acl)
461 return 0; 463 return 0;
462 if (IS_ERR(acl)) 464 if (IS_ERR(acl))
@@ -466,16 +468,18 @@ int reiserfs_acl_chmod(struct inode *inode)
466 return error; 468 return error;
467 469
468 size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count)); 470 size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count));
469 depth = reiserfs_write_lock_once(inode->i_sb); 471 reiserfs_write_lock(inode->i_sb);
470 error = journal_begin(&th, inode->i_sb, size * 2); 472 error = journal_begin(&th, inode->i_sb, size * 2);
473 reiserfs_write_unlock(inode->i_sb);
471 if (!error) { 474 if (!error) {
472 int error2; 475 int error2;
473 error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl); 476 error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl);
477 reiserfs_write_lock(inode->i_sb);
474 error2 = journal_end(&th, inode->i_sb, size * 2); 478 error2 = journal_end(&th, inode->i_sb, size * 2);
479 reiserfs_write_unlock(inode->i_sb);
475 if (error2) 480 if (error2)
476 error = error2; 481 error = error2;
477 } 482 }
478 reiserfs_write_unlock_once(inode->i_sb, depth);
479 posix_acl_release(acl); 483 posix_acl_release(acl);
480 return error; 484 return error;
481} 485}
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index fb50652e4e11..41d108ecc9be 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -167,17 +167,14 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
167 /* 167 /*
168 * Block is uncompressed. 168 * Block is uncompressed.
169 */ 169 */
170 int i, in, pg_offset = 0; 170 int in, pg_offset = 0;
171
172 for (i = 0; i < b; i++) {
173 wait_on_buffer(bh[i]);
174 if (!buffer_uptodate(bh[i]))
175 goto block_release;
176 }
177 171
178 for (bytes = length; k < b; k++) { 172 for (bytes = length; k < b; k++) {
179 in = min(bytes, msblk->devblksize - offset); 173 in = min(bytes, msblk->devblksize - offset);
180 bytes -= in; 174 bytes -= in;
175 wait_on_buffer(bh[k]);
176 if (!buffer_uptodate(bh[k]))
177 goto block_release;
181 while (in) { 178 while (in) {
182 if (pg_offset == PAGE_CACHE_SIZE) { 179 if (pg_offset == PAGE_CACHE_SIZE) {
183 page++; 180 page++;
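The squashfs change above folds the up-front wait loop into the copy loop, so each buffer is waited on just before its data is consumed and a read error is caught no later than it matters. A self-contained sketch of the pattern with hypothetical names, assuming reads have already been submitted on all b buffers:

#include <linux/buffer_head.h>
#include <linux/string.h>

/* Copy b device blocks into dst, waiting per-buffer as we go. */
static int copy_blocks(struct buffer_head **bh, int b,
		       char *dst, int blksize)
{
	int k;

	for (k = 0; k < b; k++) {
		wait_on_buffer(bh[k]);		/* wait only when needed */
		if (!buffer_uptodate(bh[k])) {
			while (k < b)		/* release the rest on error */
				brelse(bh[k++]);
			return -EIO;
		}
		memcpy(dst + k * blksize, bh[k]->b_data, blksize);
		brelse(bh[k]);
	}
	return 0;
}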
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index f7f527bf8c10..d8c2d747be28 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -54,6 +54,7 @@ static int get_dir_index_using_offset(struct super_block *sb,
54{ 54{
55 struct squashfs_sb_info *msblk = sb->s_fs_info; 55 struct squashfs_sb_info *msblk = sb->s_fs_info;
56 int err, i, index, length = 0; 56 int err, i, index, length = 0;
57 unsigned int size;
57 struct squashfs_dir_index dir_index; 58 struct squashfs_dir_index dir_index;
58 59
59 TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n", 60 TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
@@ -81,8 +82,14 @@ static int get_dir_index_using_offset(struct super_block *sb,
81 */ 82 */
82 break; 83 break;
83 84
85 size = le32_to_cpu(dir_index.size) + 1;
86
87 /* size should never be larger than SQUASHFS_NAME_LEN */
88 if (size > SQUASHFS_NAME_LEN)
89 break;
90
84 err = squashfs_read_metadata(sb, NULL, &index_start, 91 err = squashfs_read_metadata(sb, NULL, &index_start,
85 &index_offset, le32_to_cpu(dir_index.size) + 1); 92 &index_offset, size);
86 if (err < 0) 93 if (err < 0)
87 break; 94 break;
88 95
@@ -105,9 +112,8 @@ static int squashfs_readdir(struct file *file, struct dir_context *ctx)
105 struct inode *inode = file_inode(file); 112 struct inode *inode = file_inode(file);
106 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; 113 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
107 u64 block = squashfs_i(inode)->start + msblk->directory_table; 114 u64 block = squashfs_i(inode)->start + msblk->directory_table;
108 int offset = squashfs_i(inode)->offset, length, dir_count, size, 115 int offset = squashfs_i(inode)->offset, length, err;
109 type, err; 116 unsigned int inode_number, dir_count, size, type;
110 unsigned int inode_number;
111 struct squashfs_dir_header dirh; 117 struct squashfs_dir_header dirh;
112 struct squashfs_dir_entry *dire; 118 struct squashfs_dir_entry *dire;
113 119
@@ -200,6 +206,9 @@ static int squashfs_readdir(struct file *file, struct dir_context *ctx)
200 ((short) le16_to_cpu(dire->inode_number)); 206 ((short) le16_to_cpu(dire->inode_number));
201 type = le16_to_cpu(dire->type); 207 type = le16_to_cpu(dire->type);
202 208
209 if (type > SQUASHFS_MAX_DIR_TYPE)
210 goto failed_read;
211
203 if (!dir_emit(ctx, dire->name, size, 212 if (!dir_emit(ctx, dire->name, size,
204 inode_number, 213 inode_number,
205 squashfs_filetype_table[type])) 214 squashfs_filetype_table[type]))
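Both readdir hunks harden against crafted images: lengths and types decoded from on-disk metadata become unsigned and are range-checked before they are used as a read size or a table index; the namei.c hunk below applies the same size check. A condensed sketch of the validation (the constants are the real squashfs ones, the helper itself is hypothetical):

/* Validate a directory entry's on-disk size and type fields. */
static int check_dir_entry(unsigned int size, unsigned int type)
{
	if (size > SQUASHFS_NAME_LEN)		/* name length out of range */
		return -EIO;
	if (type > SQUASHFS_MAX_DIR_TYPE)	/* would overrun filetype table */
		return -EIO;
	return 0;
}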
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 7834a517f7f4..67cad77fefb4 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -79,7 +79,8 @@ static int get_dir_index_using_name(struct super_block *sb,
79 int len) 79 int len)
80{ 80{
81 struct squashfs_sb_info *msblk = sb->s_fs_info; 81 struct squashfs_sb_info *msblk = sb->s_fs_info;
82 int i, size, length = 0, err; 82 int i, length = 0, err;
83 unsigned int size;
83 struct squashfs_dir_index *index; 84 struct squashfs_dir_index *index;
84 char *str; 85 char *str;
85 86
@@ -103,6 +104,8 @@ static int get_dir_index_using_name(struct super_block *sb,
103 104
104 105
105 size = le32_to_cpu(index->size) + 1; 106 size = le32_to_cpu(index->size) + 1;
107 if (size > SQUASHFS_NAME_LEN)
108 break;
106 109
107 err = squashfs_read_metadata(sb, index->name, &index_start, 110 err = squashfs_read_metadata(sb, index->name, &index_start,
108 &index_offset, size); 111 &index_offset, size);
@@ -144,7 +147,8 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
144 struct squashfs_dir_entry *dire; 147 struct squashfs_dir_entry *dire;
145 u64 block = squashfs_i(dir)->start + msblk->directory_table; 148 u64 block = squashfs_i(dir)->start + msblk->directory_table;
146 int offset = squashfs_i(dir)->offset; 149 int offset = squashfs_i(dir)->offset;
147 int err, length, dir_count, size; 150 int err, length;
151 unsigned int dir_count, size;
148 152
149 TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset); 153 TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
150 154
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 9e2349d07cb1..4b2beda49498 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -87,7 +87,7 @@
87#define SQUASHFS_COMP_OPTS(flags) SQUASHFS_BIT(flags, \ 87#define SQUASHFS_COMP_OPTS(flags) SQUASHFS_BIT(flags, \
88 SQUASHFS_COMP_OPT) 88 SQUASHFS_COMP_OPT)
89 89
90/* Max number of types and file types */ 90/* Inode types including extended types */
91#define SQUASHFS_DIR_TYPE 1 91#define SQUASHFS_DIR_TYPE 1
92#define SQUASHFS_REG_TYPE 2 92#define SQUASHFS_REG_TYPE 2
93#define SQUASHFS_SYMLINK_TYPE 3 93#define SQUASHFS_SYMLINK_TYPE 3
@@ -103,6 +103,9 @@
103#define SQUASHFS_LFIFO_TYPE 13 103#define SQUASHFS_LFIFO_TYPE 13
104#define SQUASHFS_LSOCKET_TYPE 14 104#define SQUASHFS_LSOCKET_TYPE 14
105 105
106/* Max type value stored in directory entry */
107#define SQUASHFS_MAX_DIR_TYPE 7
108
106/* Xattr types */ 109/* Xattr types */
107#define SQUASHFS_XATTR_USER 0 110#define SQUASHFS_XATTR_USER 0
108#define SQUASHFS_XATTR_TRUSTED 1 111#define SQUASHFS_XATTR_TRUSTED 1
diff --git a/fs/stat.c b/fs/stat.c
index 04ce1ac20d20..d0ea7ef75e26 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -447,9 +447,8 @@ void inode_add_bytes(struct inode *inode, loff_t bytes)
447 447
448EXPORT_SYMBOL(inode_add_bytes); 448EXPORT_SYMBOL(inode_add_bytes);
449 449
450void inode_sub_bytes(struct inode *inode, loff_t bytes) 450void __inode_sub_bytes(struct inode *inode, loff_t bytes)
451{ 451{
452 spin_lock(&inode->i_lock);
453 inode->i_blocks -= bytes >> 9; 452 inode->i_blocks -= bytes >> 9;
454 bytes &= 511; 453 bytes &= 511;
455 if (inode->i_bytes < bytes) { 454 if (inode->i_bytes < bytes) {
@@ -457,6 +456,14 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes)
457 inode->i_bytes += 512; 456 inode->i_bytes += 512;
458 } 457 }
459 inode->i_bytes -= bytes; 458 inode->i_bytes -= bytes;
459}
460
461EXPORT_SYMBOL(__inode_sub_bytes);
462
463void inode_sub_bytes(struct inode *inode, loff_t bytes)
464{
465 spin_lock(&inode->i_lock);
466 __inode_sub_bytes(inode, bytes);
460 spin_unlock(&inode->i_lock); 467 spin_unlock(&inode->i_lock);
461} 468}
462 469
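The stat.c hunk splits inode_sub_bytes() so that callers already inside i_lock can call the new __inode_sub_bytes() directly. A hypothetical caller, batching two adjustments under a single lock round-trip:

/* Subtract two byte counts with one i_lock acquisition. */
static void sub_bytes_pair(struct inode *inode, loff_t a, loff_t b)
{
	spin_lock(&inode->i_lock);
	__inode_sub_bytes(inode, a);	/* lock already held by us */
	__inode_sub_bytes(inode, b);
	spin_unlock(&inode->i_lock);
}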
diff --git a/fs/super.c b/fs/super.c
index 68307c029228..0225c20f8770 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -53,11 +53,15 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
53 * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we 53 * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
54 * take a passive reference to the superblock to avoid this from occurring. 54 * take a passive reference to the superblock to avoid this from occurring.
55 */ 55 */
56static int prune_super(struct shrinker *shrink, struct shrink_control *sc) 56static unsigned long super_cache_scan(struct shrinker *shrink,
57 struct shrink_control *sc)
57{ 58{
58 struct super_block *sb; 59 struct super_block *sb;
59 int fs_objects = 0; 60 long fs_objects = 0;
60 int total_objects; 61 long total_objects;
62 long freed = 0;
63 long dentries;
64 long inodes;
61 65
62 sb = container_of(shrink, struct super_block, s_shrink); 66 sb = container_of(shrink, struct super_block, s_shrink);
63 67
@@ -65,46 +69,62 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
65 * Deadlock avoidance. We may hold various FS locks, and we don't want 69 * Deadlock avoidance. We may hold various FS locks, and we don't want
66 * to recurse into the FS that called us in clear_inode() and friends.. 70 * to recurse into the FS that called us in clear_inode() and friends..
67 */ 71 */
68 if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS)) 72 if (!(sc->gfp_mask & __GFP_FS))
69 return -1; 73 return SHRINK_STOP;
70 74
71 if (!grab_super_passive(sb)) 75 if (!grab_super_passive(sb))
72 return -1; 76 return SHRINK_STOP;
73 77
74 if (sb->s_op && sb->s_op->nr_cached_objects) 78 if (sb->s_op->nr_cached_objects)
75 fs_objects = sb->s_op->nr_cached_objects(sb); 79 fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
76
77 total_objects = sb->s_nr_dentry_unused +
78 sb->s_nr_inodes_unused + fs_objects + 1;
79
80 if (sc->nr_to_scan) {
81 int dentries;
82 int inodes;
83
84 /* proportion the scan between the caches */
85 dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) /
86 total_objects;
87 inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) /
88 total_objects;
89 if (fs_objects)
90 fs_objects = (sc->nr_to_scan * fs_objects) /
91 total_objects;
92 /*
93 * prune the dcache first as the icache is pinned by it, then
94 * prune the icache, followed by the filesystem specific caches
95 */
96 prune_dcache_sb(sb, dentries);
97 prune_icache_sb(sb, inodes);
98 80
99 if (fs_objects && sb->s_op->free_cached_objects) { 81 inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
100 sb->s_op->free_cached_objects(sb, fs_objects); 82 dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
101 fs_objects = sb->s_op->nr_cached_objects(sb); 83 total_objects = dentries + inodes + fs_objects + 1;
102 } 84
103 total_objects = sb->s_nr_dentry_unused + 85 /* proportion the scan between the caches */
104 sb->s_nr_inodes_unused + fs_objects; 86 dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
87 inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
88
89 /*
90 * prune the dcache first as the icache is pinned by it, then
91 * prune the icache, followed by the filesystem specific caches
92 */
93 freed = prune_dcache_sb(sb, dentries, sc->nid);
94 freed += prune_icache_sb(sb, inodes, sc->nid);
95
96 if (fs_objects) {
97 fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
98 total_objects);
99 freed += sb->s_op->free_cached_objects(sb, fs_objects,
100 sc->nid);
105 } 101 }
106 102
107 total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure; 103 drop_super(sb);
104 return freed;
105}
106
107static unsigned long super_cache_count(struct shrinker *shrink,
108 struct shrink_control *sc)
109{
110 struct super_block *sb;
111 long total_objects = 0;
112
113 sb = container_of(shrink, struct super_block, s_shrink);
114
115 if (!grab_super_passive(sb))
116 return 0;
117
118 if (sb->s_op && sb->s_op->nr_cached_objects)
119 total_objects = sb->s_op->nr_cached_objects(sb,
120 sc->nid);
121
122 total_objects += list_lru_count_node(&sb->s_dentry_lru,
123 sc->nid);
124 total_objects += list_lru_count_node(&sb->s_inode_lru,
125 sc->nid);
126
127 total_objects = vfs_pressure_ratio(total_objects);
108 drop_super(sb); 128 drop_super(sb);
109 return total_objects; 129 return total_objects;
110} 130}
@@ -152,15 +172,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
152 static const struct super_operations default_op; 172 static const struct super_operations default_op;
153 173
154 if (s) { 174 if (s) {
155 if (security_sb_alloc(s)) { 175 if (security_sb_alloc(s))
156 /* 176 goto out_free_sb;
157 * We cannot call security_sb_free() without 177
158 * security_sb_alloc() succeeding. So bail out manually
159 */
160 kfree(s);
161 s = NULL;
162 goto out;
163 }
164#ifdef CONFIG_SMP 178#ifdef CONFIG_SMP
165 s->s_files = alloc_percpu(struct list_head); 179 s->s_files = alloc_percpu(struct list_head);
166 if (!s->s_files) 180 if (!s->s_files)
@@ -181,9 +195,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
181 INIT_HLIST_NODE(&s->s_instances); 195 INIT_HLIST_NODE(&s->s_instances);
182 INIT_HLIST_BL_HEAD(&s->s_anon); 196 INIT_HLIST_BL_HEAD(&s->s_anon);
183 INIT_LIST_HEAD(&s->s_inodes); 197 INIT_LIST_HEAD(&s->s_inodes);
184 INIT_LIST_HEAD(&s->s_dentry_lru); 198
185 INIT_LIST_HEAD(&s->s_inode_lru); 199 if (list_lru_init(&s->s_dentry_lru))
186 spin_lock_init(&s->s_inode_lru_lock); 200 goto err_out;
201 if (list_lru_init(&s->s_inode_lru))
202 goto err_out_dentry_lru;
203
187 INIT_LIST_HEAD(&s->s_mounts); 204 INIT_LIST_HEAD(&s->s_mounts);
188 init_rwsem(&s->s_umount); 205 init_rwsem(&s->s_umount);
189 lockdep_set_class(&s->s_umount, &type->s_umount_key); 206 lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -216,11 +233,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
216 s->cleancache_poolid = -1; 233 s->cleancache_poolid = -1;
217 234
218 s->s_shrink.seeks = DEFAULT_SEEKS; 235 s->s_shrink.seeks = DEFAULT_SEEKS;
219 s->s_shrink.shrink = prune_super; 236 s->s_shrink.scan_objects = super_cache_scan;
237 s->s_shrink.count_objects = super_cache_count;
220 s->s_shrink.batch = 1024; 238 s->s_shrink.batch = 1024;
239 s->s_shrink.flags = SHRINKER_NUMA_AWARE;
221 } 240 }
222out: 241out:
223 return s; 242 return s;
243
244err_out_dentry_lru:
245 list_lru_destroy(&s->s_dentry_lru);
224err_out: 246err_out:
225 security_sb_free(s); 247 security_sb_free(s);
226#ifdef CONFIG_SMP 248#ifdef CONFIG_SMP
@@ -228,6 +250,7 @@ err_out:
228 free_percpu(s->s_files); 250 free_percpu(s->s_files);
229#endif 251#endif
230 destroy_sb_writers(s); 252 destroy_sb_writers(s);
253out_free_sb:
231 kfree(s); 254 kfree(s);
232 s = NULL; 255 s = NULL;
233 goto out; 256 goto out;
@@ -241,6 +264,8 @@ err_out:
241 */ 264 */
242static inline void destroy_super(struct super_block *s) 265static inline void destroy_super(struct super_block *s)
243{ 266{
267 list_lru_destroy(&s->s_dentry_lru);
268 list_lru_destroy(&s->s_inode_lru);
244#ifdef CONFIG_SMP 269#ifdef CONFIG_SMP
245 free_percpu(s->s_files); 270 free_percpu(s->s_files);
246#endif 271#endif
@@ -300,6 +325,7 @@ void deactivate_locked_super(struct super_block *s)
300 325
301 /* caches are now gone, we can safely kill the shrinker now */ 326 /* caches are now gone, we can safely kill the shrinker now */
302 unregister_shrinker(&s->s_shrink); 327 unregister_shrinker(&s->s_shrink);
328
303 put_filesystem(fs); 329 put_filesystem(fs);
304 put_super(s); 330 put_super(s);
305 } else { 331 } else {
@@ -414,6 +440,11 @@ void generic_shutdown_super(struct super_block *sb)
414 440
415 evict_inodes(sb); 441 evict_inodes(sb);
416 442
443 if (sb->s_dio_done_wq) {
444 destroy_workqueue(sb->s_dio_done_wq);
445 sb->s_dio_done_wq = NULL;
446 }
447
417 if (sop->put_super) 448 if (sop->put_super)
418 sop->put_super(sb); 449 sop->put_super(sb);
419 450
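The super.c conversion above is part of the shrinker rework of this era: one prune_super() callback becomes a count_objects/scan_objects pair, unused-object counts move to per-node list_lru structures, and SHRINK_STOP replaces the old -1 return. A minimal sketch of a shrinker in the new shape over a hypothetical list_lru-backed cache; my_lru and my_prune are assumptions, while vfs_pressure_ratio() is the helper the hunk itself uses:

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	/* Cheap, lock-free estimate for the NUMA node being scanned. */
	return vfs_pressure_ratio(list_lru_count_node(&my_lru, sc->nid));
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	if (!(sc->gfp_mask & __GFP_FS))
		return SHRINK_STOP;	/* avoid recursing into the FS */

	/* Free up to nr_to_scan objects; report how many actually went. */
	return my_prune(&my_lru, sc->nr_to_scan, sc->nid);
}

static struct shrinker my_shrinker = {
	.count_objects	= my_cache_count,
	.scan_objects	= my_cache_scan,
	.seeks		= DEFAULT_SEEKS,
	.batch		= 1024,
	.flags		= SHRINKER_NUMA_AWARE,
};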
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 15c68f9489ae..c590cabd57bb 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -22,8 +22,7 @@
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/mm.h> 24#include <linux/mm.h>
25 25#include <linux/uaccess.h>
26#include <asm/uaccess.h>
27 26
28#include "sysfs.h" 27#include "sysfs.h"
29 28
@@ -391,7 +390,7 @@ out_unlock:
391 return rc; 390 return rc;
392} 391}
393 392
394static int open(struct inode * inode, struct file * file) 393static int open(struct inode *inode, struct file *file)
395{ 394{
396 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 395 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
397 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 396 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
@@ -435,7 +434,7 @@ static int open(struct inode * inode, struct file * file)
435 return error; 434 return error;
436} 435}
437 436
438static int release(struct inode * inode, struct file * file) 437static int release(struct inode *inode, struct file *file)
439{ 438{
440 struct bin_buffer *bb = file->private_data; 439 struct bin_buffer *bb = file->private_data;
441 440
@@ -481,7 +480,6 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd)
481 * @kobj: object. 480 * @kobj: object.
482 * @attr: attribute descriptor. 481 * @attr: attribute descriptor.
483 */ 482 */
484
485int sysfs_create_bin_file(struct kobject *kobj, 483int sysfs_create_bin_file(struct kobject *kobj,
486 const struct bin_attribute *attr) 484 const struct bin_attribute *attr)
487{ 485{
@@ -489,19 +487,16 @@ int sysfs_create_bin_file(struct kobject *kobj,
489 487
490 return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); 488 return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
491} 489}
492 490EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
493 491
494/** 492/**
495 * sysfs_remove_bin_file - remove binary file for object. 493 * sysfs_remove_bin_file - remove binary file for object.
496 * @kobj: object. 494 * @kobj: object.
497 * @attr: attribute descriptor. 495 * @attr: attribute descriptor.
498 */ 496 */
499
500void sysfs_remove_bin_file(struct kobject *kobj, 497void sysfs_remove_bin_file(struct kobject *kobj,
501 const struct bin_attribute *attr) 498 const struct bin_attribute *attr)
502{ 499{
503 sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name); 500 sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
504} 501}
505
506EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
507EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); 502EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e068e744dbdd..4d83cedb9fcb 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -46,7 +46,7 @@ static unsigned int sysfs_name_hash(const void *ns, const char *name)
 	unsigned int len = strlen(name);
 	while (len--)
 		hash = partial_name_hash(*name++, hash);
-	hash = ( end_name_hash(hash) ^ hash_ptr( (void *)ns, 31 ) );
+	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
 	hash &= 0x7fffffffU;
 	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
 	if (hash < 1)
@@ -258,7 +258,7 @@ static void sysfs_free_ino(unsigned int ino)
 	spin_unlock(&sysfs_ino_lock);
 }
 
-void release_sysfs_dirent(struct sysfs_dirent * sd)
+void release_sysfs_dirent(struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent *parent_sd;
 
@@ -297,7 +297,6 @@ static int sysfs_dentry_delete(const struct dentry *dentry)
 static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct sysfs_dirent *sd;
-	int is_dir;
 	int type;
 
 	if (flags & LOOKUP_RCU)
@@ -341,18 +340,15 @@ out_bad:
 	 * is performed at its new name the dentry will be readded
 	 * to the dcache hashes.
 	 */
-	is_dir = (sysfs_type(sd) == SYSFS_DIR);
 	mutex_unlock(&sysfs_mutex);
-	if (is_dir) {
-		/* If we have submounts we must allow the vfs caches
-		 * to lie about the state of the filesystem to prevent
-		 * leaks and other nasty things.
-		 */
-		if (have_submounts(dentry))
-			goto out_valid;
-		shrink_dcache_parent(dentry);
-	}
-	d_drop(dentry);
+
+	/* If we have submounts we must allow the vfs caches
+	 * to lie about the state of the filesystem to prevent
+	 * leaks and other nasty things.
+	 */
+	if (check_submounts_and_drop(dentry) != 0)
+		goto out_valid;
+
 	return 0;
 }
 
@@ -451,7 +447,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 
 	if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
 		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
-		     sysfs_ns_type(acxt->parent_sd)? "required": "invalid",
+		     sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid",
 		     acxt->parent_sd->s_name, sd->s_name);
 		return -EINVAL;
 	}
@@ -619,7 +615,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 
 	if (!!sysfs_ns_type(parent_sd) != !!ns) {
 		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
-		     sysfs_ns_type(parent_sd)? "required": "invalid",
+		     sysfs_ns_type(parent_sd) ? "required" : "invalid",
 		     parent_sd->s_name, name);
 		return NULL;
 	}
@@ -674,7 +670,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 		      enum kobj_ns_type type, const void *ns, const char *name,
 		      struct sysfs_dirent **p_sd)
 {
-	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+	umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
 	int rc;
@@ -735,9 +731,9 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
 
 /**
  * sysfs_create_dir - create a directory for an object.
  * @kobj: object we're creating directory for.
  */
-int sysfs_create_dir(struct kobject * kobj)
+int sysfs_create_dir(struct kobject *kobj)
 {
 	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
@@ -764,8 +760,8 @@ int sysfs_create_dir(struct kobject * kobj)
 	return error;
 }
 
-static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
+static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				   unsigned int flags)
 {
 	struct dentry *ret = NULL;
 	struct dentry *parent = dentry->d_parent;
@@ -857,7 +853,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
  * what used to be sysfs_rmdir() below, instead of calling separately.
  */
 
-void sysfs_remove_dir(struct kobject * kobj)
+void sysfs_remove_dir(struct kobject *kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 
@@ -896,7 +892,9 @@ int sysfs_rename(struct sysfs_dirent *sd,
 		sd->s_name = new_name;
 	}
 
-	/* Move to the appropriate place in the appropriate directories rbtree. */
+	/*
+	 * Move to the appropriate place in the appropriate directories rbtree.
+	 */
 	sysfs_unlink_sibling(sd);
 	sysfs_get(new_parent_sd);
 	sysfs_put(sd->s_parent);
@@ -988,20 +986,21 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
 	struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
 {
 	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
-	if (pos) do {
-		struct rb_node *node = rb_next(&pos->s_rb);
-		if (!node)
-			pos = NULL;
-		else
-			pos = to_sysfs_dirent(node);
-	} while (pos && pos->s_ns != ns);
+	if (pos)
+		do {
+			struct rb_node *node = rb_next(&pos->s_rb);
+			if (!node)
+				pos = NULL;
+			else
+				pos = to_sysfs_dirent(node);
+		} while (pos && pos->s_ns != ns);
 	return pos;
 }
 
 static int sysfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct dentry *dentry = file->f_path.dentry;
-	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
+	struct sysfs_dirent *parent_sd = dentry->d_fsdata;
 	struct sysfs_dirent *pos = file->private_data;
 	enum kobj_ns_type type;
 	const void *ns;
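Beyond the style fixes, the functional change in dir.c is in sysfs_dentry_revalidate(): the open-coded have_submounts()/shrink_dcache_parent()/d_drop() dance is replaced by check_submounts_and_drop(), which refuses to drop a dentry that something is mounted on. The pattern a filesystem follows with the new helper is roughly as below (object_is_still_valid() is a hypothetical per-fs check, not a real function):

/* Hedged sketch of the d_revalidate pattern the hunk above adopts. */
static int example_revalidate(struct dentry *dentry, unsigned int flags)
{
	if (flags & LOOKUP_RCU)
		return -ECHILD;		/* cannot block in RCU walk */

	if (object_is_still_valid(dentry))
		return 1;		/* dentry stays cached */

	if (check_submounts_and_drop(dentry) != 0)
		return 1;		/* busy (mounted on): keep it for now */

	return 0;			/* dropped: force a fresh lookup */
}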
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d2bb7ed8fa74..15ef5eb13663 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -20,7 +20,7 @@
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/limits.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include "sysfs.h"
 
@@ -45,8 +45,8 @@ struct sysfs_open_dirent {
 struct sysfs_buffer {
 	size_t count;
 	loff_t pos;
-	char * page;
-	const struct sysfs_ops * ops;
+	char *page;
+	const struct sysfs_ops *ops;
 	struct mutex mutex;
 	int needs_read_fill;
 	int event;
@@ -59,16 +59,16 @@ struct sysfs_buffer {
  * @buffer: data buffer for file.
  *
  * Allocate @buffer->page, if it hasn't been already, then call the
  * kobject's show() method to fill the buffer with this attribute's
  * data.
  * This is called only once, on the file's first read unless an error
  * is returned.
  */
-static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
+static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	const struct sysfs_ops * ops = buffer->ops;
+	const struct sysfs_ops *ops = buffer->ops;
 	int ret = 0;
 	ssize_t count;
 
@@ -106,7 +106,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 }
 
 /**
  * sysfs_read_file - read an attribute.
  * @file: file pointer.
  * @buf: buffer to fill.
  * @count: number of bytes to read.
@@ -127,12 +127,12 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 static ssize_t
 sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
-	struct sysfs_buffer * buffer = file->private_data;
+	struct sysfs_buffer *buffer = file->private_data;
 	ssize_t retval = 0;
 
 	mutex_lock(&buffer->mutex);
 	if (buffer->needs_read_fill || *ppos == 0) {
-		retval = fill_read_buffer(file->f_path.dentry,buffer);
+		retval = fill_read_buffer(file->f_path.dentry, buffer);
 		if (retval)
 			goto out;
 	}
@@ -154,9 +154,8 @@ out:
  * Allocate @buffer->page if it hasn't been already, then
  * copy the user-supplied buffer into it.
  */
-
-static int
-fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t count)
+static int fill_write_buffer(struct sysfs_buffer *buffer,
+			     const char __user *buf, size_t count)
 {
 	int error;
 
@@ -167,7 +166,7 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t
 
 	if (count >= PAGE_SIZE)
 		count = PAGE_SIZE - 1;
-	error = copy_from_user(buffer->page,buf,count);
+	error = copy_from_user(buffer->page, buf, count);
 	buffer->needs_read_fill = 1;
 	/* if buf is assumed to contain a string, terminate it by \0,
 	   so e.g. sscanf() can scan the string easily */
@@ -183,16 +182,15 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t
  * @count: number of bytes
 *
 * Get the correct pointers for the kobject and the attribute we're
 * dealing with, then call the store() method for the attribute,
 * passing the buffer that we acquired in fill_write_buffer().
  */
-
-static int
-flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count)
+static int flush_write_buffer(struct dentry *dentry,
+			      struct sysfs_buffer *buffer, size_t count)
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	const struct sysfs_ops * ops = buffer->ops;
+	const struct sysfs_ops *ops = buffer->ops;
 	int rc;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
@@ -219,15 +217,14 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
  * then push it to the kobject in flush_write_buffer().
  * There is no easy way for us to know if userspace is only doing a partial
  * write, so we don't support them. We expect the entire buffer to come
  * on the first write.
  * Hint: if you're writing a value, first read the file, modify only the
  * the value you're changing, then write entire buffer back.
  */
-
-static ssize_t
-sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+static ssize_t sysfs_write_file(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
 {
-	struct sysfs_buffer * buffer = file->private_data;
+	struct sysfs_buffer *buffer = file->private_data;
 	ssize_t len;
 
 	mutex_lock(&buffer->mutex);
@@ -339,13 +336,14 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	if (kobj->ktype && kobj->ktype->sysfs_ops)
 		ops = kobj->ktype->sysfs_ops;
 	else {
-		WARN(1, KERN_ERR "missing sysfs attribute operations for "
-		     "kobject: %s\n", kobject_name(kobj));
+		WARN(1, KERN_ERR
+		     "missing sysfs attribute operations for kobject: %s\n",
+		     kobject_name(kobj));
 		goto err_out;
 	}
 
 	/* File needs write support.
 	 * The inode's perms must say it's ok,
 	 * and we must have a store method.
 	 */
 	if (file->f_mode & FMODE_WRITE) {
@@ -420,7 +418,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
  */
 static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 {
-	struct sysfs_buffer * buffer = filp->private_data;
+	struct sysfs_buffer *buffer = filp->private_data;
 	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
 	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
 
@@ -518,8 +516,9 @@ static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
 		ns = ops->namespace(kobj, attr);
 out:
 	if (err) {
-		WARN(1, KERN_ERR "missing sysfs namespace attribute operation for "
-		     "kobject: %s\n", kobject_name(kobj));
+		WARN(1, KERN_ERR
+		     "missing sysfs namespace attribute operation for kobject: %s\n",
+		     kobject_name(kobj));
 	}
 	*pns = ns;
 	return err;
@@ -566,17 +565,17 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
 
 /**
  * sysfs_create_file - create an attribute file for an object.
  * @kobj: object we're creating for.
  * @attr: attribute descriptor.
  */
-
-int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
+int sysfs_create_file(struct kobject *kobj, const struct attribute *attr)
 {
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
 	return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR);
 
 }
+EXPORT_SYMBOL_GPL(sysfs_create_file);
 
 int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
 {
@@ -590,6 +589,7 @@ int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
 			sysfs_remove_file(kobj, ptr[i]);
 	return err;
 }
+EXPORT_SYMBOL_GPL(sysfs_create_files);
 
 /**
  * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -654,7 +654,6 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 }
 EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
-
 /**
  * sysfs_remove_file - remove an object attribute.
  * @kobj: object we're acting for.
@@ -662,8 +661,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
  *
  * Hash the attribute name and kill the victim.
  */
-
-void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
+void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr)
 {
 	const void *ns;
 
@@ -672,13 +670,15 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 
 	sysfs_hash_and_remove(kobj->sd, ns, attr->name);
 }
+EXPORT_SYMBOL_GPL(sysfs_remove_file);
 
-void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
+void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
 {
 	int i;
 	for (i = 0; ptr[i]; i++)
 		sysfs_remove_file(kobj, ptr[i]);
 }
+EXPORT_SYMBOL_GPL(sysfs_remove_files);
 
 /**
  * sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -793,9 +793,3 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
-
-
-EXPORT_SYMBOL_GPL(sysfs_create_file);
-EXPORT_SYMBOL_GPL(sysfs_remove_file);
-EXPORT_SYMBOL_GPL(sysfs_remove_files);
-EXPORT_SYMBOL_GPL(sysfs_create_files);
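Behind fill_read_buffer() and flush_write_buffer() sit the show()/store() methods of the kobject's sysfs_ops: a read fills one PAGE_SIZE buffer via show(), and a write must arrive as a single, \0-terminated chunk that store() parses. A provider-side sketch (the 'threshold' attribute is invented for illustration, not part of this patch):

/* Hedged sketch of the provider side that sysfs_read_file() and
 * sysfs_write_file() above end up invoking; 'threshold' is made up. */
static int threshold;

static ssize_t threshold_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", threshold);	/* one page, one value */
}

static ssize_t threshold_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	int ret = kstrtoint(buf, 10, &threshold);

	return ret ? ret : count;	/* consume the whole write */
}

static struct kobj_attribute threshold_attr =
	__ATTR(threshold, 0644, threshold_show, threshold_store);

/* registered with the freshly relocated export:
 *   sysfs_create_file(kobj, &threshold_attr.attr);
 */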
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 09a1a25cd145..5f92cd2f61c1 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -3,8 +3,10 @@
  *
  * Copyright (c) 2003 Patrick Mochel
  * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2013 Greg Kroah-Hartman
+ * Copyright (c) 2013 The Linux Foundation
  *
  * This file is released undert the GPL v2.
  *
  */
 
@@ -19,8 +21,8 @@
 static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 const struct attribute_group *grp)
 {
-	struct attribute *const* attr;
-	struct bin_attribute *const* bin_attr;
+	struct attribute *const *attr;
+	struct bin_attribute *const *bin_attr;
 
 	if (grp->attrs)
 		for (attr = grp->attrs; *attr; attr++)
@@ -33,8 +35,8 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			const struct attribute_group *grp, int update)
 {
-	struct attribute *const* attr;
-	struct bin_attribute *const* bin_attr;
+	struct attribute *const *attr;
+	struct bin_attribute *const *bin_attr;
 	int error = 0, i;
 
 	if (grp->attrs) {
@@ -129,6 +131,41 @@ int sysfs_create_group(struct kobject *kobj,
 {
 	return internal_create_group(kobj, 0, grp);
 }
+EXPORT_SYMBOL_GPL(sysfs_create_group);
+
+/**
+ * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups
+ * @kobj: The kobject to create the group on
+ * @groups: The attribute groups to create, NULL terminated
+ *
+ * This function creates a bunch of attribute groups. If an error occurs when
+ * creating a group, all previously created groups will be removed, unwinding
+ * everything back to the original state when this function was called.
+ * It will explicitly warn and error if any of the attribute files being
+ * created already exist.
+ *
+ * Returns 0 on success or error code from sysfs_create_group on error.
+ */
+int sysfs_create_groups(struct kobject *kobj,
+			const struct attribute_group **groups)
+{
+	int error = 0;
+	int i;
+
+	if (!groups)
+		return 0;
+
+	for (i = 0; groups[i]; i++) {
+		error = sysfs_create_group(kobj, groups[i]);
+		if (error) {
+			while (--i >= 0)
+				sysfs_remove_group(kobj, groups[i]);
+			break;
+		}
+	}
+	return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_create_groups);
 
 /**
  * sysfs_update_group - given a directory kobject, update an attribute group
@@ -152,11 +189,18 @@ int sysfs_update_group(struct kobject *kobj,
 {
 	return internal_create_group(kobj, 1, grp);
 }
+EXPORT_SYMBOL_GPL(sysfs_update_group);
 
-
-
-void sysfs_remove_group(struct kobject * kobj,
-			const struct attribute_group * grp)
+/**
+ * sysfs_remove_group: remove a group from a kobject
+ * @kobj: kobject to remove the group from
+ * @grp: group to remove
+ *
+ * This function removes a group of attributes from a kobject. The attributes
+ * previously have to have been created for this group, otherwise it will fail.
+ */
+void sysfs_remove_group(struct kobject *kobj,
+			const struct attribute_group *grp)
 {
 	struct sysfs_dirent *dir_sd = kobj->sd;
 	struct sysfs_dirent *sd;
@@ -164,8 +208,9 @@ void sysfs_remove_group(struct kobject * kobj,
 	if (grp->name) {
 		sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
 		if (!sd) {
-			WARN(!sd, KERN_WARNING "sysfs group %p not found for "
-			     "kobject '%s'\n", grp, kobject_name(kobj));
+			WARN(!sd, KERN_WARNING
+			     "sysfs group %p not found for kobject '%s'\n",
+			     grp, kobject_name(kobj));
 			return;
 		}
 	} else
@@ -177,6 +222,27 @@ void sysfs_remove_group(struct kobject * kobj,
 
 	sysfs_put(sd);
 }
+EXPORT_SYMBOL_GPL(sysfs_remove_group);
+
+/**
+ * sysfs_remove_groups - remove a list of groups
+ *
+ * @kobj: The kobject for the groups to be removed from
+ * @groups: NULL terminated list of groups to be removed
+ *
+ * If groups is not NULL, remove the specified groups from the kobject.
+ */
+void sysfs_remove_groups(struct kobject *kobj,
+			 const struct attribute_group **groups)
+{
+	int i;
+
+	if (!groups)
+		return;
+	for (i = 0; groups[i]; i++)
+		sysfs_remove_group(kobj, groups[i]);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_groups);
 
 /**
  * sysfs_merge_group - merge files into a pre-existing attribute group.
@@ -273,7 +339,3 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
-
-EXPORT_SYMBOL_GPL(sysfs_create_group);
-EXPORT_SYMBOL_GPL(sysfs_update_group);
-EXPORT_SYMBOL_GPL(sysfs_remove_group);
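sysfs_create_groups()/sysfs_remove_groups(), added above, wrap the single-group calls around a NULL-terminated array and unwind on failure, so callers no longer write that loop themselves. Typical use looks like the sketch below (the group and attribute names are hypothetical; 'threshold_attr' refers to the file.c sketch earlier):

/* Hedged sketch of consuming the new helpers; names are made up. */
static struct attribute *ctrl_attrs[] = {
	&threshold_attr.attr,
	NULL,				/* attribute list is NULL terminated */
};

static const struct attribute_group ctrl_group = {
	.name = "control",		/* optional subdirectory name */
	.attrs = ctrl_attrs,
};

static const struct attribute_group *ctrl_groups[] = {
	&ctrl_group,
	NULL,				/* group array is NULL terminated */
};

static int ctrl_register(struct kobject *kobj)
{
	/* on failure the helper has already removed the partial groups */
	return sysfs_create_groups(kobj, ctrl_groups);
}

static void ctrl_unregister(struct kobject *kobj)
{
	sysfs_remove_groups(kobj, ctrl_groups);
}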
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 3e2837a633ed..963f910c8034 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -10,7 +10,7 @@
  * Please see Documentation/filesystems/sysfs.txt for more information.
  */
 
 #undef DEBUG
 
 #include <linux/pagemap.h>
 #include <linux/namei.h>
@@ -36,7 +36,7 @@ static struct backing_dev_info sysfs_backing_dev_info = {
 	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
-static const struct inode_operations sysfs_inode_operations ={
+static const struct inode_operations sysfs_inode_operations = {
 	.permission = sysfs_permission,
 	.setattr = sysfs_setattr,
 	.getattr = sysfs_getattr,
@@ -67,7 +67,7 @@ static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
 	return attrs;
 }
 
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
+int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr)
 {
 	struct sysfs_inode_attrs *sd_attrs;
 	struct iattr *iattrs;
@@ -128,7 +128,8 @@ out:
 	return error;
 }
 
-static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len)
+static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata,
+			       u32 *secdata_len)
 {
 	struct sysfs_inode_attrs *iattrs;
 	void *old_secdata;
@@ -186,13 +187,13 @@ out:
 	return error;
 }
 
-static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
+static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
 {
 	inode->i_mode = mode;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 }
 
-static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
+static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
 {
 	inode->i_uid = iattr->ia_uid;
 	inode->i_gid = iattr->ia_gid;
@@ -220,7 +221,8 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
 	set_nlink(inode, sd->s_dir.subdirs + 2);
 }
 
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat)
 {
 	struct sysfs_dirent *sd = dentry->d_fsdata;
 	struct inode *inode = dentry->d_inode;
@@ -285,7 +287,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
  * RETURNS:
  * Pointer to allocated inode on success, NULL on failure.
  */
-struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
+struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
 {
 	struct inode *inode;
 
@@ -312,7 +314,8 @@ void sysfs_evict_inode(struct inode *inode)
 	sysfs_put(sd);
 }
 
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
+			  const char *name)
 {
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index afd83273e6ce..834ec2cdb7a3 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -64,7 +64,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 	/* instantiate and link root dentry */
 	root = d_make_root(inode);
 	if (!root) {
-		pr_debug("%s: could not get root dentry!\n",__func__);
+		pr_debug("%s: could not get root dentry!\n", __func__);
 		return -ENOMEM;
 	}
 	root->d_fsdata = &sysfs_root;
@@ -112,8 +112,15 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	struct super_block *sb;
 	int error;
 
-	if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs)
-		return ERR_PTR(-EPERM);
+	if (!(flags & MS_KERNMOUNT)) {
+		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
+			return ERR_PTR(-EPERM);
+
+		for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
+			if (!kobj_ns_current_may_mount(type))
+				return ERR_PTR(-EPERM);
+		}
+	}
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 8c940df97a52..2dd4507d9edd 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -125,6 +125,7 @@ int sysfs_create_link(struct kobject *kobj, struct kobject *target,
 {
 	return sysfs_do_create_link(kobj, target, name, 1);
 }
+EXPORT_SYMBOL_GPL(sysfs_create_link);
 
 /**
  * sysfs_create_link_nowarn - create symlink between two objects.
@@ -166,8 +167,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
  * @kobj: object we're acting for.
  * @name: name of the symlink to remove.
  */
-
-void sysfs_remove_link(struct kobject * kobj, const char * name)
+void sysfs_remove_link(struct kobject *kobj, const char *name)
 {
 	struct sysfs_dirent *parent_sd = NULL;
 
@@ -178,6 +178,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
 
 	sysfs_hash_and_remove(parent_sd, NULL, name);
 }
+EXPORT_SYMBOL_GPL(sysfs_remove_link);
 
 /**
  * sysfs_rename_link - rename symlink in object's directory.
@@ -223,6 +224,7 @@ out:
 	sysfs_put(sd);
 	return result;
 }
+EXPORT_SYMBOL_GPL(sysfs_rename_link);
 
 static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
 				 struct sysfs_dirent *target_sd, char *path)
@@ -276,7 +278,7 @@ static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
 	return 0;
 }
 
-static int sysfs_getlink(struct dentry *dentry, char * path)
+static int sysfs_getlink(struct dentry *dentry, char *path)
 {
 	struct sysfs_dirent *sd = dentry->d_fsdata;
 	struct sysfs_dirent *parent_sd = sd->s_parent;
@@ -295,7 +297,7 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
 	if (page) {
 		error = sysfs_getlink(dentry, (char *) page);
 		if (error < 0)
 			free_page((unsigned long)page);
 	}
@@ -303,7 +305,8 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	return NULL;
 }
 
-static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			   void *cookie)
 {
 	char *page = nd_get_link(nd);
 	if (!IS_ERR(page))
@@ -319,8 +322,3 @@ const struct inode_operations sysfs_symlink_inode_operations = {
 	.getattr = sysfs_getattr,
 	.permission = sysfs_permission,
 };
-
-
-EXPORT_SYMBOL_GPL(sysfs_create_link);
-EXPORT_SYMBOL_GPL(sysfs_remove_link);
-EXPORT_SYMBOL_GPL(sysfs_rename_link);
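symlink.c gets the same treatment: the exports move next to their functions and the remaining K&R-era pointer spacing goes away. Usage of the exported trio is unchanged; for reference, a hedged sketch (the "device" link name is hypothetical):

/* Minimal sketch of the symlink API whose exports moved above. */
static int publish_child(struct kobject *parent, struct kobject *child)
{
	return sysfs_create_link(parent, child, "device");
}

static void unpublish_child(struct kobject *parent)
{
	sysfs_remove_link(parent, "device");
}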
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d1e4043eb0c3..b6deca3e301d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -78,7 +78,7 @@ struct sysfs_dirent {
 	};
 
 	unsigned short s_flags;
 	umode_t s_mode;
 	unsigned int s_ino;
 	struct sysfs_inode_attrs *s_iattr;
 };
@@ -123,9 +123,9 @@ do { \
 	key = &attr->skey;				\
 							\
 	lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
-} while(0)
+} while (0)
 #else
-#define sysfs_dirent_init_lockdep(sd) do {} while(0)
+#define sysfs_dirent_init_lockdep(sd) do {} while (0)
 #endif
 
 /*
@@ -186,8 +186,8 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd);
 void sysfs_remove_subdir(struct sysfs_dirent *sd);
 
-int sysfs_rename(struct sysfs_dirent *sd,
-	struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
+int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
+		 const void *ns, const char *new_name);
 
 static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
 {
@@ -214,10 +214,12 @@ void sysfs_evict_inode(struct inode *inode);
 int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
 int sysfs_permission(struct inode *inode, int mask);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		   size_t size, int flags);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
+			  const char *name);
 int sysfs_inode_init(void);
 
 /*
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index c1a591a4725b..66bc316927e8 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -469,7 +469,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		sysv_truncate(inode);
 	}
 }
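truncate_pagecache() dropped its old-size argument in this series; only the new (final) size is passed, and the same one-line adjustment repeats in udf_write_failed() further down. The ->write_begin failure pattern now reads as below (foo_truncate() is a hypothetical stand-in for the filesystem's own truncate helper):

/* Hedged sketch of the common write-failure cleanup with the
 * two-argument truncate_pagecache(); foo_truncate() is made up. */
static void foo_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	if (to > inode->i_size) {
		/* drop pages instantiated beyond the unchanged i_size */
		truncate_pagecache(inode, inode->i_size);
		foo_truncate(inode);
	}
}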
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index d0c6a007ce83..eda10959714f 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -487,6 +487,7 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_sb = sb;
 	sbi->s_block_base = 0;
 	sbi->s_type = FSTYPE_V7;
+	mutex_init(&sbi->s_lock);
 	sb->s_fs_info = sbi;
 
 	sb_set_blocksize(sb, 512);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 7f60e900edff..6e025e02ffde 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2587,10 +2587,11 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
 		return -EROFS;
 
 	failing = power_cut_emulated(c, lnum, 1);
-	if (failing)
+	if (failing) {
 		len = corrupt_data(c, buf, len);
-	ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)",
-		   len, lnum, offs);
+		ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)",
+			   len, lnum, offs);
+	}
 	err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
 	if (err)
 		return err;
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 9e1d05666fed..f35135e28e96 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,18 +277,25 @@ static int kick_a_thread(void)
 	return 0;
 }
 
-int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
 {
-	int nr = sc->nr_to_scan;
-	int freed, contention = 0;
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
-	if (nr == 0)
-		/*
-		 * Due to the way UBIFS updates the clean znode counter it may
-		 * temporarily be negative.
-		 */
-		return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+	/*
+	 * Due to the way UBIFS updates the clean znode counter it may
+	 * temporarily be negative.
+	 */
+	return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+}
+
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	unsigned long nr = sc->nr_to_scan;
+	int contention = 0;
+	unsigned long freed;
+	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
 	if (!clean_zn_cnt) {
 		/*
@@ -316,10 +323,10 @@ int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
 
 	if (!freed && contention) {
 		dbg_tnc("freed nothing, but contention");
-		return -1;
+		return SHRINK_STOP;
 	}
 
 out:
-	dbg_tnc("%d znodes were freed, requested %d", freed, nr);
+	dbg_tnc("%lu znodes were freed, requested %lu", freed, nr);
 	return freed;
 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 879b9976c12b..3e4aa7281e04 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -49,7 +49,8 @@ struct kmem_cache *ubifs_inode_slab;
 
 /* UBIFS TNC shrinker description */
 static struct shrinker ubifs_shrinker_info = {
-	.shrink = ubifs_shrinker,
+	.scan_objects = ubifs_shrink_scan,
+	.count_objects = ubifs_shrink_count,
 	.seeks = DEFAULT_SEEKS,
 };
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2babce4d70f..e8c8cfe1435c 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1624,7 +1624,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 
 /* shrinker.c */
-int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc);
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc);
 
 /* commit.c */
 int ubifs_bg_thread(void *info);
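The three UBIFS hunks convert the TNC shrinker to the split API: count_objects() returns a cheap estimate of freeable objects, while scan_objects() does the actual freeing of up to sc->nr_to_scan objects and may return SHRINK_STOP when it cannot make progress (as the contention path above now does). Registration keeps the same shape; a hedged sketch (the demo_* cache, counter, and reclaim helper are hypothetical):

/* Hedged sketch of the count/scan shrinker API adopted above. */
static atomic_long_t demo_nr_cached = ATOMIC_LONG_INIT(0);

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
	return atomic_long_read(&demo_nr_cached);	/* estimate only */
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
	unsigned long freed = demo_reclaim(sc->nr_to_scan); /* hypothetical */

	return freed ? freed : SHRINK_STOP;	/* no progress: stop scanning */
}

static struct shrinker demo_shrinker = {
	.count_objects = demo_count,
	.scan_objects = demo_scan,
	.seeks = DEFAULT_SEEKS,
};
/* registered once at init time with register_shrinker(&demo_shrinker); */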
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 29569dd08168..c02a27a19c6d 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -141,7 +141,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	int err, pos;
-	size_t count = iocb->ki_left;
+	size_t count = iocb->ki_nbytes;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	down_write(&iinfo->i_data_sem);
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 7e5aae4bf46f..6eaf5edf1ea1 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -30,18 +30,17 @@ void udf_free_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
 
-	mutex_lock(&sbi->s_alloc_mutex);
-	if (sbi->s_lvid_bh) {
-		struct logicalVolIntegrityDescImpUse *lvidiu =
-			udf_sb_lvidiu(sbi);
+	if (lvidiu) {
+		mutex_lock(&sbi->s_alloc_mutex);
 		if (S_ISDIR(inode->i_mode))
 			le32_add_cpu(&lvidiu->numDirs, -1);
 		else
 			le32_add_cpu(&lvidiu->numFiles, -1);
 		udf_updated_lvid(sb);
+		mutex_unlock(&sbi->s_alloc_mutex);
 	}
-	mutex_unlock(&sbi->s_alloc_mutex);
 
 	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
@@ -55,6 +54,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
 	struct udf_inode_info *iinfo;
 	struct udf_inode_info *dinfo = UDF_I(dir);
+	struct logicalVolIntegrityDescImpUse *lvidiu;
 
 	inode = new_inode(sb);
 
@@ -92,12 +92,10 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 		return NULL;
 	}
 
-	if (sbi->s_lvid_bh) {
-		struct logicalVolIntegrityDescImpUse *lvidiu;
-
+	lvidiu = udf_sb_lvidiu(sb);
+	if (lvidiu) {
 		iinfo->i_unique = lvid_get_unique_id(sb);
 		mutex_lock(&sbi->s_alloc_mutex);
-		lvidiu = udf_sb_lvidiu(sbi);
 		if (S_ISDIR(mode))
 			le32_add_cpu(&lvidiu->numDirs, 1);
 		else
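Both ialloc.c hunks depend on the new udf_sb_lvidiu() contract introduced in udf/super.c below: the helper now takes the super_block, validates the descriptor, and returns NULL when no LVID is loaded or numOfPartitions is corrupt, so every caller guards with a NULL check instead of peeking at sbi->s_lvid_bh. In outline (a hedged sketch mirroring the hunks above; the function name is made up):

/* Hedged sketch of a caller under the new udf_sb_lvidiu() contract. */
static void example_bump_file_count(struct super_block *sb)
{
	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);

	if (!lvidiu)
		return;		/* no usable LVID: nothing to account */

	mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
	le32_add_cpu(&lvidiu->numFiles, 1);
	udf_updated_lvid(sb);
	mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
}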
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index b6d15d349810..062b7925bca0 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -172,7 +172,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
 	loff_t isize = inode->i_size;
 
 	if (to > isize) {
-		truncate_pagecache(inode, to, isize);
+		truncate_pagecache(inode, isize);
 		if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 			down_write(&iinfo->i_data_sem);
 			udf_clear_extent_cache(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 9ac4057a86c9..91219385691d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -94,13 +94,25 @@ static unsigned int udf_count_free(struct super_block *);
94static int udf_statfs(struct dentry *, struct kstatfs *); 94static int udf_statfs(struct dentry *, struct kstatfs *);
95static int udf_show_options(struct seq_file *, struct dentry *); 95static int udf_show_options(struct seq_file *, struct dentry *);
96 96
97struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) 97struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
98{ 98{
99 struct logicalVolIntegrityDesc *lvid = 99 struct logicalVolIntegrityDesc *lvid;
100 (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; 100 unsigned int partnum;
101 __u32 number_of_partitions = le32_to_cpu(lvid->numOfPartitions); 101 unsigned int offset;
102 __u32 offset = number_of_partitions * 2 * 102
103 sizeof(uint32_t)/sizeof(uint8_t); 103 if (!UDF_SB(sb)->s_lvid_bh)
104 return NULL;
105 lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data;
106 partnum = le32_to_cpu(lvid->numOfPartitions);
107 if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) -
108 offsetof(struct logicalVolIntegrityDesc, impUse)) /
109 (2 * sizeof(uint32_t)) < partnum) {
110 udf_err(sb, "Logical volume integrity descriptor corrupted "
111 "(numOfPartitions = %u)!\n", partnum);
112 return NULL;
113 }
114 /* The offset is to skip freeSpaceTable and sizeTable arrays */
115 offset = partnum * 2 * sizeof(uint32_t);
104 return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]); 116 return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]);
105} 117}
106 118
@@ -629,6 +641,13 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
629 struct udf_options uopt; 641 struct udf_options uopt;
630 struct udf_sb_info *sbi = UDF_SB(sb); 642 struct udf_sb_info *sbi = UDF_SB(sb);
631 int error = 0; 643 int error = 0;
644 struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
645
646 if (lvidiu) {
647 int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
648 if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
649 return -EACCES;
650 }
632 651
633 uopt.flags = sbi->s_flags; 652 uopt.flags = sbi->s_flags;
634 uopt.uid = sbi->s_uid; 653 uopt.uid = sbi->s_uid;
@@ -649,12 +668,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
649 sbi->s_dmode = uopt.dmode; 668 sbi->s_dmode = uopt.dmode;
650 write_unlock(&sbi->s_cred_lock); 669 write_unlock(&sbi->s_cred_lock);
651 670
652 if (sbi->s_lvid_bh) {
653 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
654 if (write_rev > UDF_MAX_WRITE_VERSION)
655 *flags |= MS_RDONLY;
656 }
657
658 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 671 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
659 goto out_unlock; 672 goto out_unlock;
660 673
@@ -843,27 +856,38 @@ static int udf_find_fileset(struct super_block *sb,
843 return 1; 856 return 1;
844} 857}
845 858
859/*
860 * Load primary Volume Descriptor Sequence
861 *
862 * Return <0 on error, 0 on success. -EAGAIN is special meaning next sequence
863 * should be tried.
864 */
846static int udf_load_pvoldesc(struct super_block *sb, sector_t block) 865static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
847{ 866{
848 struct primaryVolDesc *pvoldesc; 867 struct primaryVolDesc *pvoldesc;
849 struct ustr *instr, *outstr; 868 struct ustr *instr, *outstr;
850 struct buffer_head *bh; 869 struct buffer_head *bh;
851 uint16_t ident; 870 uint16_t ident;
852 int ret = 1; 871 int ret = -ENOMEM;
853 872
854 instr = kmalloc(sizeof(struct ustr), GFP_NOFS); 873 instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
855 if (!instr) 874 if (!instr)
856 return 1; 875 return -ENOMEM;
857 876
858 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS); 877 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
859 if (!outstr) 878 if (!outstr)
860 goto out1; 879 goto out1;
861 880
862 bh = udf_read_tagged(sb, block, block, &ident); 881 bh = udf_read_tagged(sb, block, block, &ident);
863 if (!bh) 882 if (!bh) {
883 ret = -EAGAIN;
864 goto out2; 884 goto out2;
885 }
865 886
866 BUG_ON(ident != TAG_IDENT_PVD); 887 if (ident != TAG_IDENT_PVD) {
888 ret = -EIO;
889 goto out_bh;
890 }
867 891
868 pvoldesc = (struct primaryVolDesc *)bh->b_data; 892 pvoldesc = (struct primaryVolDesc *)bh->b_data;
869 893
@@ -889,8 +913,9 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
889 if (udf_CS0toUTF8(outstr, instr)) 913 if (udf_CS0toUTF8(outstr, instr))
890 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); 914 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
891 915
892 brelse(bh);
893 ret = 0; 916 ret = 0;
917out_bh:
918 brelse(bh);
894out2: 919out2:
895 kfree(outstr); 920 kfree(outstr);
896out1: 921out1:
@@ -947,7 +972,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
947 972
948 if (mdata->s_mirror_fe == NULL) { 973 if (mdata->s_mirror_fe == NULL) {
949 udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); 974 udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
950 goto error_exit; 975 return -EIO;
951 } 976 }
952 } 977 }
953 978
@@ -964,23 +989,18 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
964 addr.logicalBlockNum, addr.partitionReferenceNum); 989 addr.logicalBlockNum, addr.partitionReferenceNum);
965 990
966 mdata->s_bitmap_fe = udf_iget(sb, &addr); 991 mdata->s_bitmap_fe = udf_iget(sb, &addr);
967
968 if (mdata->s_bitmap_fe == NULL) { 992 if (mdata->s_bitmap_fe == NULL) {
969 if (sb->s_flags & MS_RDONLY) 993 if (sb->s_flags & MS_RDONLY)
970 udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); 994 udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
971 else { 995 else {
972 udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); 996 udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
973 goto error_exit; 997 return -EIO;
974 } 998 }
975 } 999 }
976 } 1000 }
977 1001
978 udf_debug("udf_load_metadata_files Ok\n"); 1002 udf_debug("udf_load_metadata_files Ok\n");
979
980 return 0; 1003 return 0;
981
982error_exit:
983 return 1;
984} 1004}
985 1005
986static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, 1006static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
@@ -1069,7 +1089,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1069 if (!map->s_uspace.s_table) { 1089 if (!map->s_uspace.s_table) {
1070 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1090 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1071 p_index); 1091 p_index);
1072 return 1; 1092 return -EIO;
1073 } 1093 }
1074 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; 1094 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
1075 udf_debug("unallocSpaceTable (part %d) @ %ld\n", 1095 udf_debug("unallocSpaceTable (part %d) @ %ld\n",
@@ -1079,7 +1099,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1079 if (phd->unallocSpaceBitmap.extLength) { 1099 if (phd->unallocSpaceBitmap.extLength) {
1080 struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); 1100 struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
1081 if (!bitmap) 1101 if (!bitmap)
1082 return 1; 1102 return -ENOMEM;
1083 map->s_uspace.s_bitmap = bitmap; 1103 map->s_uspace.s_bitmap = bitmap;
1084 bitmap->s_extPosition = le32_to_cpu( 1104 bitmap->s_extPosition = le32_to_cpu(
1085 phd->unallocSpaceBitmap.extPosition); 1105 phd->unallocSpaceBitmap.extPosition);
@@ -1102,7 +1122,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1102 if (!map->s_fspace.s_table) { 1122 if (!map->s_fspace.s_table) {
1103 udf_debug("cannot load freedSpaceTable (part %d)\n", 1123 udf_debug("cannot load freedSpaceTable (part %d)\n",
1104 p_index); 1124 p_index);
1105 return 1; 1125 return -EIO;
1106 } 1126 }
1107 1127
1108 map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; 1128 map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
@@ -1113,7 +1133,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1113 if (phd->freedSpaceBitmap.extLength) { 1133 if (phd->freedSpaceBitmap.extLength) {
1114 struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); 1134 struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
1115 if (!bitmap) 1135 if (!bitmap)
1116 return 1; 1136 return -ENOMEM;
1117 map->s_fspace.s_bitmap = bitmap; 1137 map->s_fspace.s_bitmap = bitmap;
1118 bitmap->s_extPosition = le32_to_cpu( 1138 bitmap->s_extPosition = le32_to_cpu(
1119 phd->freedSpaceBitmap.extPosition); 1139 phd->freedSpaceBitmap.extPosition);
@@ -1165,7 +1185,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1165 udf_find_vat_block(sb, p_index, type1_index, blocks - 1); 1185 udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
1166 } 1186 }
1167 if (!sbi->s_vat_inode) 1187 if (!sbi->s_vat_inode)
1168 return 1; 1188 return -EIO;
1169 1189
1170 if (map->s_partition_type == UDF_VIRTUAL_MAP15) { 1190 if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
1171 map->s_type_specific.s_virtual.s_start_offset = 0; 1191 map->s_type_specific.s_virtual.s_start_offset = 0;
@@ -1177,7 +1197,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1177 pos = udf_block_map(sbi->s_vat_inode, 0); 1197 pos = udf_block_map(sbi->s_vat_inode, 0);
1178 bh = sb_bread(sb, pos); 1198 bh = sb_bread(sb, pos);
1179 if (!bh) 1199 if (!bh)
1180 return 1; 1200 return -EIO;
1181 vat20 = (struct virtualAllocationTable20 *)bh->b_data; 1201 vat20 = (struct virtualAllocationTable20 *)bh->b_data;
1182 } else { 1202 } else {
1183 vat20 = (struct virtualAllocationTable20 *) 1203 vat20 = (struct virtualAllocationTable20 *)
@@ -1195,6 +1215,12 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1195 return 0; 1215 return 0;
1196} 1216}
1197 1217
1218/*
1219 * Load partition descriptor block
1220 *
1221 * Returns <0 on error, 0 on success, -EAGAIN is special - try next descriptor
1222 * sequence.
1223 */
1198static int udf_load_partdesc(struct super_block *sb, sector_t block) 1224static int udf_load_partdesc(struct super_block *sb, sector_t block)
1199{ 1225{
1200 struct buffer_head *bh; 1226 struct buffer_head *bh;
@@ -1204,13 +1230,15 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
1204 int i, type1_idx; 1230 int i, type1_idx;
1205 uint16_t partitionNumber; 1231 uint16_t partitionNumber;
1206 uint16_t ident; 1232 uint16_t ident;
1207 int ret = 0; 1233 int ret;
1208 1234
1209 bh = udf_read_tagged(sb, block, block, &ident); 1235 bh = udf_read_tagged(sb, block, block, &ident);
1210 if (!bh) 1236 if (!bh)
1211 return 1; 1237 return -EAGAIN;
1212 if (ident != TAG_IDENT_PD) 1238 if (ident != TAG_IDENT_PD) {
1239 ret = 0;
1213 goto out_bh; 1240 goto out_bh;
1241 }
1214 1242
1215 p = (struct partitionDesc *)bh->b_data; 1243 p = (struct partitionDesc *)bh->b_data;
1216 partitionNumber = le16_to_cpu(p->partitionNumber); 1244 partitionNumber = le16_to_cpu(p->partitionNumber);
@@ -1229,10 +1257,13 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
1229 if (i >= sbi->s_partitions) { 1257 if (i >= sbi->s_partitions) {
1230 udf_debug("Partition (%d) not found in partition map\n", 1258 udf_debug("Partition (%d) not found in partition map\n",
1231 partitionNumber); 1259 partitionNumber);
1260 ret = 0;
1232 goto out_bh; 1261 goto out_bh;
1233 } 1262 }
1234 1263
1235 ret = udf_fill_partdesc_info(sb, p, i); 1264 ret = udf_fill_partdesc_info(sb, p, i);
1265 if (ret < 0)
1266 goto out_bh;
1236 1267
1237 /* 1268 /*
1238 * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and 1269 * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and
@@ -1249,32 +1280,37 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
1249 break; 1280 break;
1250 } 1281 }
1251 1282
1252 if (i >= sbi->s_partitions) 1283 if (i >= sbi->s_partitions) {
1284 ret = 0;
1253 goto out_bh; 1285 goto out_bh;
1286 }
1254 1287
1255 ret = udf_fill_partdesc_info(sb, p, i); 1288 ret = udf_fill_partdesc_info(sb, p, i);
1256 if (ret) 1289 if (ret < 0)
1257 goto out_bh; 1290 goto out_bh;
1258 1291
1259 if (map->s_partition_type == UDF_METADATA_MAP25) { 1292 if (map->s_partition_type == UDF_METADATA_MAP25) {
1260 ret = udf_load_metadata_files(sb, i); 1293 ret = udf_load_metadata_files(sb, i);
1261 if (ret) { 1294 if (ret < 0) {
1262 udf_err(sb, "error loading MetaData partition map %d\n", 1295 udf_err(sb, "error loading MetaData partition map %d\n",
1263 i); 1296 i);
1264 goto out_bh; 1297 goto out_bh;
1265 } 1298 }
1266 } else { 1299 } else {
1267 ret = udf_load_vat(sb, i, type1_idx);
1268 if (ret)
1269 goto out_bh;
1270 /* 1300 /*
1271 * Mark filesystem read-only if we have a partition with 1301 * If we have a partition with virtual map, we don't handle
1272 * virtual map since we don't handle writing to it (we 1302 * writing to it (we overwrite blocks instead of relocating
1273 * overwrite blocks instead of relocating them). 1303 * them).
1274 */ 1304 */
1275 sb->s_flags |= MS_RDONLY; 1305 if (!(sb->s_flags & MS_RDONLY)) {
1276 pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n"); 1306 ret = -EACCES;
1307 goto out_bh;
1308 }
1309 ret = udf_load_vat(sb, i, type1_idx);
1310 if (ret < 0)
1311 goto out_bh;
1277 } 1312 }
1313 ret = 0;
1278out_bh: 1314out_bh:
1279 /* In case loading failed, we handle cleanup in udf_fill_super */ 1315 /* In case loading failed, we handle cleanup in udf_fill_super */
1280 brelse(bh); 1316 brelse(bh);
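
The UDF hunks above replace the old "return 1 on failure" convention with negative errnos, reserving -EAGAIN as a soft miss: the probed block or descriptor was unusable, so the caller should move on to the next candidate. The same contract threads through udf_load_logicalvol(), udf_process_sequence(), udf_check_anchor_block(), udf_scan_anchors() and udf_find_anchor() below. A minimal caller sketch under that convention; probe_block() is a hypothetical stand-in for helpers such as udf_check_anchor_block():

/*
 * Sketch only: walk candidate blocks, treating -EAGAIN as "keep
 * looking" and anything else (success or hard error) as final.
 */
static int probe_candidates(struct super_block *sb,
			    const sector_t *cand, int ncand)
{
	int i, ret;

	for (i = 0; i < ncand; i++) {
		ret = probe_block(sb, cand[i]);	/* hypothetical helper */
		if (ret != -EAGAIN)
			return ret;	/* 0 = found, <0 = hard error */
	}
	return -EIO;	/* every candidate was a soft miss */
}
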
@@ -1340,11 +1376,11 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1340 uint16_t ident; 1376 uint16_t ident;
1341 struct buffer_head *bh; 1377 struct buffer_head *bh;
1342 unsigned int table_len; 1378 unsigned int table_len;
1343 int ret = 0; 1379 int ret;
1344 1380
1345 bh = udf_read_tagged(sb, block, block, &ident); 1381 bh = udf_read_tagged(sb, block, block, &ident);
1346 if (!bh) 1382 if (!bh)
1347 return 1; 1383 return -EAGAIN;
1348 BUG_ON(ident != TAG_IDENT_LVD); 1384 BUG_ON(ident != TAG_IDENT_LVD);
1349 lvd = (struct logicalVolDesc *)bh->b_data; 1385 lvd = (struct logicalVolDesc *)bh->b_data;
1350 table_len = le32_to_cpu(lvd->mapTableLength); 1386 table_len = le32_to_cpu(lvd->mapTableLength);
@@ -1352,7 +1388,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1352 udf_err(sb, "error loading logical volume descriptor: " 1388 udf_err(sb, "error loading logical volume descriptor: "
1353 "Partition table too long (%u > %lu)\n", table_len, 1389 "Partition table too long (%u > %lu)\n", table_len,
1354 sb->s_blocksize - sizeof(*lvd)); 1390 sb->s_blocksize - sizeof(*lvd));
1355 ret = 1; 1391 ret = -EIO;
1356 goto out_bh; 1392 goto out_bh;
1357 } 1393 }
1358 1394
@@ -1396,11 +1432,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1396 } else if (!strncmp(upm2->partIdent.ident, 1432 } else if (!strncmp(upm2->partIdent.ident,
1397 UDF_ID_SPARABLE, 1433 UDF_ID_SPARABLE,
1398 strlen(UDF_ID_SPARABLE))) { 1434 strlen(UDF_ID_SPARABLE))) {
1399 if (udf_load_sparable_map(sb, map, 1435 ret = udf_load_sparable_map(sb, map,
1400 (struct sparablePartitionMap *)gpm) < 0) { 1436 (struct sparablePartitionMap *)gpm);
1401 ret = 1; 1437 if (ret < 0)
1402 goto out_bh; 1438 goto out_bh;
1403 }
1404 } else if (!strncmp(upm2->partIdent.ident, 1439 } else if (!strncmp(upm2->partIdent.ident,
1405 UDF_ID_METADATA, 1440 UDF_ID_METADATA,
1406 strlen(UDF_ID_METADATA))) { 1441 strlen(UDF_ID_METADATA))) {
@@ -1465,7 +1500,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1465 } 1500 }
1466 if (lvd->integritySeqExt.extLength) 1501 if (lvd->integritySeqExt.extLength)
1467 udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); 1502 udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
1468 1503 ret = 0;
1469out_bh: 1504out_bh:
1470 brelse(bh); 1505 brelse(bh);
1471 return ret; 1506 return ret;
@@ -1503,22 +1538,18 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
1503} 1538}
1504 1539
1505/* 1540/*
1506 * udf_process_sequence 1541 * Process a main/reserve volume descriptor sequence.
1507 * 1542 * @block First block of first extent of the sequence.
1508 * PURPOSE 1543 * @lastblock Lastblock of first extent of the sequence.
1509 * Process a main/reserve volume descriptor sequence. 1544 * @fileset There we store extent containing root fileset
1510 * 1545 *
1511 * PRE-CONDITIONS 1546 * Returns <0 on error, 0 on success. -EAGAIN is special - try next descriptor
1512 * sb Pointer to _locked_ superblock. 1547 * sequence
1513 * block First block of first extent of the sequence.
1514 * lastblock Lastblock of first extent of the sequence.
1515 *
1516 * HISTORY
1517 * July 1, 1997 - Andrew E. Mileski
1518 * Written, tested, and released.
1519 */ 1548 */
1520static noinline int udf_process_sequence(struct super_block *sb, long block, 1549static noinline int udf_process_sequence(
1521 long lastblock, struct kernel_lb_addr *fileset) 1550 struct super_block *sb,
1551 sector_t block, sector_t lastblock,
1552 struct kernel_lb_addr *fileset)
1522{ 1553{
1523 struct buffer_head *bh = NULL; 1554 struct buffer_head *bh = NULL;
1524 struct udf_vds_record vds[VDS_POS_LENGTH]; 1555 struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1529,6 +1560,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1529 uint32_t vdsn; 1560 uint32_t vdsn;
1530 uint16_t ident; 1561 uint16_t ident;
1531 long next_s = 0, next_e = 0; 1562 long next_s = 0, next_e = 0;
1563 int ret;
1532 1564
1533 memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); 1565 memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
1534 1566
@@ -1543,7 +1575,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1543 udf_err(sb, 1575 udf_err(sb,
1544 "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", 1576 "Block %llu of volume descriptor sequence is corrupted or we could not read it\n",
1545 (unsigned long long)block); 1577 (unsigned long long)block);
1546 return 1; 1578 return -EAGAIN;
1547 } 1579 }
1548 1580
1549 /* Process each descriptor (ISO 13346 3/8.3-8.4) */ 1581 /* Process each descriptor (ISO 13346 3/8.3-8.4) */
@@ -1616,14 +1648,19 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1616 */ 1648 */
1617 if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { 1649 if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
1618 udf_err(sb, "Primary Volume Descriptor not found!\n"); 1650 udf_err(sb, "Primary Volume Descriptor not found!\n");
1619 return 1; 1651 return -EAGAIN;
1652 }
1653 ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block);
1654 if (ret < 0)
1655 return ret;
1656
1657 if (vds[VDS_POS_LOGICAL_VOL_DESC].block) {
1658 ret = udf_load_logicalvol(sb,
1659 vds[VDS_POS_LOGICAL_VOL_DESC].block,
1660 fileset);
1661 if (ret < 0)
1662 return ret;
1620 } 1663 }
1621 if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
1622 return 1;
1623
1624 if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb,
1625 vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset))
1626 return 1;
1627 1664
1628 if (vds[VDS_POS_PARTITION_DESC].block) { 1665 if (vds[VDS_POS_PARTITION_DESC].block) {
1629 /* 1666 /*
@@ -1632,19 +1669,27 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1632 */ 1669 */
1633 for (block = vds[VDS_POS_PARTITION_DESC].block; 1670 for (block = vds[VDS_POS_PARTITION_DESC].block;
1634 block < vds[VDS_POS_TERMINATING_DESC].block; 1671 block < vds[VDS_POS_TERMINATING_DESC].block;
1635 block++) 1672 block++) {
1636 if (udf_load_partdesc(sb, block)) 1673 ret = udf_load_partdesc(sb, block);
1637 return 1; 1674 if (ret < 0)
1675 return ret;
1676 }
1638 } 1677 }
1639 1678
1640 return 0; 1679 return 0;
1641} 1680}
1642 1681
1682/*
1683 * Load Volume Descriptor Sequence described by anchor in bh
1684 *
1685 * Returns <0 on error, 0 on success
1686 */
1643static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, 1687static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1644 struct kernel_lb_addr *fileset) 1688 struct kernel_lb_addr *fileset)
1645{ 1689{
1646 struct anchorVolDescPtr *anchor; 1690 struct anchorVolDescPtr *anchor;
1647 long main_s, main_e, reserve_s, reserve_e; 1691 sector_t main_s, main_e, reserve_s, reserve_e;
1692 int ret;
1648 1693
1649 anchor = (struct anchorVolDescPtr *)bh->b_data; 1694 anchor = (struct anchorVolDescPtr *)bh->b_data;
1650 1695
@@ -1662,18 +1707,26 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1662 1707
1663 /* Process the main & reserve sequences */ 1708 /* Process the main & reserve sequences */
1664 /* responsible for finding the PartitionDesc(s) */ 1709 /* responsible for finding the PartitionDesc(s) */
1665 if (!udf_process_sequence(sb, main_s, main_e, fileset)) 1710 ret = udf_process_sequence(sb, main_s, main_e, fileset);
1666 return 1; 1711 if (ret != -EAGAIN)
1667 udf_sb_free_partitions(sb); 1712 return ret;
1668 if (!udf_process_sequence(sb, reserve_s, reserve_e, fileset))
1669 return 1;
1670 udf_sb_free_partitions(sb); 1713 udf_sb_free_partitions(sb);
1671 return 0; 1714 ret = udf_process_sequence(sb, reserve_s, reserve_e, fileset);
1715 if (ret < 0) {
1716 udf_sb_free_partitions(sb);
1717 /* No sequence was OK, return -EIO */
1718 if (ret == -EAGAIN)
1719 ret = -EIO;
1720 }
1721 return ret;
1672} 1722}
1673 1723
1674/* 1724/*
1675 * Check whether there is an anchor block in the given block and 1725 * Check whether there is an anchor block in the given block and
1676 * load Volume Descriptor Sequence if so. 1726 * load Volume Descriptor Sequence if so.
1727 *
1728 * Returns <0 on error, 0 on success, -EAGAIN is special - try next anchor
1729 * block
1677 */ 1730 */
1678static int udf_check_anchor_block(struct super_block *sb, sector_t block, 1731static int udf_check_anchor_block(struct super_block *sb, sector_t block,
1679 struct kernel_lb_addr *fileset) 1732 struct kernel_lb_addr *fileset)
@@ -1685,33 +1738,40 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
1685 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && 1738 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
1686 udf_fixed_to_variable(block) >= 1739 udf_fixed_to_variable(block) >=
1687 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) 1740 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
1688 return 0; 1741 return -EAGAIN;
1689 1742
1690 bh = udf_read_tagged(sb, block, block, &ident); 1743 bh = udf_read_tagged(sb, block, block, &ident);
1691 if (!bh) 1744 if (!bh)
1692 return 0; 1745 return -EAGAIN;
1693 if (ident != TAG_IDENT_AVDP) { 1746 if (ident != TAG_IDENT_AVDP) {
1694 brelse(bh); 1747 brelse(bh);
1695 return 0; 1748 return -EAGAIN;
1696 } 1749 }
1697 ret = udf_load_sequence(sb, bh, fileset); 1750 ret = udf_load_sequence(sb, bh, fileset);
1698 brelse(bh); 1751 brelse(bh);
1699 return ret; 1752 return ret;
1700} 1753}
1701 1754
1702/* Search for an anchor volume descriptor pointer */ 1755/*
1703static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, 1756 * Search for an anchor volume descriptor pointer.
1704 struct kernel_lb_addr *fileset) 1757 *
1758 * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set
1759 * of anchors.
1760 */
1761static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
1762 struct kernel_lb_addr *fileset)
1705{ 1763{
1706 sector_t last[6]; 1764 sector_t last[6];
1707 int i; 1765 int i;
1708 struct udf_sb_info *sbi = UDF_SB(sb); 1766 struct udf_sb_info *sbi = UDF_SB(sb);
1709 int last_count = 0; 1767 int last_count = 0;
1768 int ret;
1710 1769
1711 /* First try user provided anchor */ 1770 /* First try user provided anchor */
1712 if (sbi->s_anchor) { 1771 if (sbi->s_anchor) {
1713 if (udf_check_anchor_block(sb, sbi->s_anchor, fileset)) 1772 ret = udf_check_anchor_block(sb, sbi->s_anchor, fileset);
1714 return lastblock; 1773 if (ret != -EAGAIN)
1774 return ret;
1715 } 1775 }
1716 /* 1776 /*
1717 * according to spec, anchor is in either: 1777 * according to spec, anchor is in either:
@@ -1720,39 +1780,46 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
1720 * lastblock 1780 * lastblock
1721 * however, if the disc isn't closed, it could be 512. 1781 * however, if the disc isn't closed, it could be 512.
1722 */ 1782 */
1723 if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset)) 1783 ret = udf_check_anchor_block(sb, sbi->s_session + 256, fileset);
1724 return lastblock; 1784 if (ret != -EAGAIN)
1785 return ret;
1725 /* 1786 /*
1726 * The trouble is which block is the last one. Drives often misreport 1787 * The trouble is which block is the last one. Drives often misreport
1727 * this so we try various possibilities. 1788 * this so we try various possibilities.
1728 */ 1789 */
1729 last[last_count++] = lastblock; 1790 last[last_count++] = *lastblock;
1730 if (lastblock >= 1) 1791 if (*lastblock >= 1)
1731 last[last_count++] = lastblock - 1; 1792 last[last_count++] = *lastblock - 1;
1732 last[last_count++] = lastblock + 1; 1793 last[last_count++] = *lastblock + 1;
1733 if (lastblock >= 2) 1794 if (*lastblock >= 2)
1734 last[last_count++] = lastblock - 2; 1795 last[last_count++] = *lastblock - 2;
1735 if (lastblock >= 150) 1796 if (*lastblock >= 150)
1736 last[last_count++] = lastblock - 150; 1797 last[last_count++] = *lastblock - 150;
1737 if (lastblock >= 152) 1798 if (*lastblock >= 152)
1738 last[last_count++] = lastblock - 152; 1799 last[last_count++] = *lastblock - 152;
1739 1800
1740 for (i = 0; i < last_count; i++) { 1801 for (i = 0; i < last_count; i++) {
1741 if (last[i] >= sb->s_bdev->bd_inode->i_size >> 1802 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
1742 sb->s_blocksize_bits) 1803 sb->s_blocksize_bits)
1743 continue; 1804 continue;
1744 if (udf_check_anchor_block(sb, last[i], fileset)) 1805 ret = udf_check_anchor_block(sb, last[i], fileset);
1745 return last[i]; 1806 if (ret != -EAGAIN) {
1807 if (!ret)
1808 *lastblock = last[i];
1809 return ret;
1810 }
1746 if (last[i] < 256) 1811 if (last[i] < 256)
1747 continue; 1812 continue;
1748 if (udf_check_anchor_block(sb, last[i] - 256, fileset)) 1813 ret = udf_check_anchor_block(sb, last[i] - 256, fileset);
1749 return last[i]; 1814 if (ret != -EAGAIN) {
1815 if (!ret)
1816 *lastblock = last[i];
1817 return ret;
1818 }
1750 } 1819 }
1751 1820
1752 /* Finally try block 512 in case media is open */ 1821 /* Finally try block 512 in case media is open */
1753 if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset)) 1822 return udf_check_anchor_block(sb, sbi->s_session + 512, fileset);
1754 return last[0];
1755 return 0;
1756} 1823}
1757 1824
1758/* 1825/*
@@ -1760,54 +1827,59 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
1760 * area specified by it. The function expects sbi->s_lastblock to be the last 1827 * area specified by it. The function expects sbi->s_lastblock to be the last
1761 * block on the media. 1828 * block on the media.
1762 * 1829 *
1763 * Return 1 if ok, 0 if not found. 1830 * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor
1764 * 1831 * was not found.
1765 */ 1832 */
1766static int udf_find_anchor(struct super_block *sb, 1833static int udf_find_anchor(struct super_block *sb,
1767 struct kernel_lb_addr *fileset) 1834 struct kernel_lb_addr *fileset)
1768{ 1835{
1769 sector_t lastblock;
1770 struct udf_sb_info *sbi = UDF_SB(sb); 1836 struct udf_sb_info *sbi = UDF_SB(sb);
1837 sector_t lastblock = sbi->s_last_block;
1838 int ret;
1771 1839
1772 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); 1840 ret = udf_scan_anchors(sb, &lastblock, fileset);
1773 if (lastblock) 1841 if (ret != -EAGAIN)
1774 goto out; 1842 goto out;
1775 1843
1776 /* No anchor found? Try VARCONV conversion of block numbers */ 1844 /* No anchor found? Try VARCONV conversion of block numbers */
1777 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); 1845 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
1846 lastblock = udf_variable_to_fixed(sbi->s_last_block);
1778 /* Firstly, we try to not convert number of the last block */ 1847 /* Firstly, we try to not convert number of the last block */
1779 lastblock = udf_scan_anchors(sb, 1848 ret = udf_scan_anchors(sb, &lastblock, fileset);
1780 udf_variable_to_fixed(sbi->s_last_block), 1849 if (ret != -EAGAIN)
1781 fileset);
1782 if (lastblock)
1783 goto out; 1850 goto out;
1784 1851
1852 lastblock = sbi->s_last_block;
1785 /* Secondly, we try with converted number of the last block */ 1853 /* Secondly, we try with converted number of the last block */
1786 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); 1854 ret = udf_scan_anchors(sb, &lastblock, fileset);
1787 if (!lastblock) { 1855 if (ret < 0) {
1788 /* VARCONV didn't help. Clear it. */ 1856 /* VARCONV didn't help. Clear it. */
1789 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); 1857 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
1790 return 0;
1791 } 1858 }
1792out: 1859out:
1793 sbi->s_last_block = lastblock; 1860 if (ret == 0)
1794 return 1; 1861 sbi->s_last_block = lastblock;
1862 return ret;
1795} 1863}
1796 1864
1797/* 1865/*
1798 * Check Volume Structure Descriptor, find Anchor block and load Volume 1866 * Check Volume Structure Descriptor, find Anchor block and load Volume
1799 * Descriptor Sequence 1867 * Descriptor Sequence.
1868 *
1869 * Returns < 0 on error, 0 on success. -EAGAIN is special meaning anchor
1870 * block was not found.
1800 */ 1871 */
1801static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, 1872static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1802 int silent, struct kernel_lb_addr *fileset) 1873 int silent, struct kernel_lb_addr *fileset)
1803{ 1874{
1804 struct udf_sb_info *sbi = UDF_SB(sb); 1875 struct udf_sb_info *sbi = UDF_SB(sb);
1805 loff_t nsr_off; 1876 loff_t nsr_off;
1877 int ret;
1806 1878
1807 if (!sb_set_blocksize(sb, uopt->blocksize)) { 1879 if (!sb_set_blocksize(sb, uopt->blocksize)) {
1808 if (!silent) 1880 if (!silent)
1809 udf_warn(sb, "Bad block size\n"); 1881 udf_warn(sb, "Bad block size\n");
1810 return 0; 1882 return -EINVAL;
1811 } 1883 }
1812 sbi->s_last_block = uopt->lastblock; 1884 sbi->s_last_block = uopt->lastblock;
1813 if (!uopt->novrs) { 1885 if (!uopt->novrs) {
@@ -1828,12 +1900,13 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1828 1900
1829 /* Look for anchor block and load Volume Descriptor Sequence */ 1901 /* Look for anchor block and load Volume Descriptor Sequence */
1830 sbi->s_anchor = uopt->anchor; 1902 sbi->s_anchor = uopt->anchor;
1831 if (!udf_find_anchor(sb, fileset)) { 1903 ret = udf_find_anchor(sb, fileset);
1832 if (!silent) 1904 if (ret < 0) {
1905 if (!silent && ret == -EAGAIN)
1833 udf_warn(sb, "No anchor found\n"); 1906 udf_warn(sb, "No anchor found\n");
1834 return 0; 1907 return ret;
1835 } 1908 }
1836 return 1; 1909 return 0;
1837} 1910}
1838 1911
1839static void udf_open_lvid(struct super_block *sb) 1912static void udf_open_lvid(struct super_block *sb)
@@ -1845,11 +1918,12 @@ static void udf_open_lvid(struct super_block *sb)
1845 1918
1846 if (!bh) 1919 if (!bh)
1847 return; 1920 return;
1848
1849 mutex_lock(&sbi->s_alloc_mutex);
1850 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1921 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1851 lvidiu = udf_sb_lvidiu(sbi); 1922 lvidiu = udf_sb_lvidiu(sb);
1923 if (!lvidiu)
1924 return;
1852 1925
1926 mutex_lock(&sbi->s_alloc_mutex);
1853 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1927 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1854 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1928 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1855 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, 1929 udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
@@ -1877,10 +1951,12 @@ static void udf_close_lvid(struct super_block *sb)
1877 1951
1878 if (!bh) 1952 if (!bh)
1879 return; 1953 return;
1954 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1955 lvidiu = udf_sb_lvidiu(sb);
1956 if (!lvidiu)
1957 return;
1880 1958
1881 mutex_lock(&sbi->s_alloc_mutex); 1959 mutex_lock(&sbi->s_alloc_mutex);
1882 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1883 lvidiu = udf_sb_lvidiu(sbi);
1884 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1960 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1885 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1961 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1886 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME); 1962 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME);
@@ -1939,7 +2015,7 @@ u64 lvid_get_unique_id(struct super_block *sb)
1939 2015
1940static int udf_fill_super(struct super_block *sb, void *options, int silent) 2016static int udf_fill_super(struct super_block *sb, void *options, int silent)
1941{ 2017{
1942 int ret; 2018 int ret = -EINVAL;
1943 struct inode *inode = NULL; 2019 struct inode *inode = NULL;
1944 struct udf_options uopt; 2020 struct udf_options uopt;
1945 struct kernel_lb_addr rootdir, fileset; 2021 struct kernel_lb_addr rootdir, fileset;
@@ -2011,7 +2087,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2011 } else { 2087 } else {
2012 uopt.blocksize = bdev_logical_block_size(sb->s_bdev); 2088 uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
2013 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 2089 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
2014 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { 2090 if (ret == -EAGAIN && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
2015 if (!silent) 2091 if (!silent)
2016 pr_notice("Rescanning with blocksize %d\n", 2092 pr_notice("Rescanning with blocksize %d\n",
2017 UDF_DEFAULT_BLOCKSIZE); 2093 UDF_DEFAULT_BLOCKSIZE);
@@ -2021,8 +2097,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2021 ret = udf_load_vrs(sb, &uopt, silent, &fileset); 2097 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
2022 } 2098 }
2023 } 2099 }
2024 if (!ret) { 2100 if (ret < 0) {
2025 udf_warn(sb, "No partition found (1)\n"); 2101 if (ret == -EAGAIN) {
2102 udf_warn(sb, "No partition found (1)\n");
2103 ret = -EINVAL;
2104 }
2026 goto error_out; 2105 goto error_out;
2027 } 2106 }
2028 2107
@@ -2030,19 +2109,27 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2030 2109
2031 if (sbi->s_lvid_bh) { 2110 if (sbi->s_lvid_bh) {
2032 struct logicalVolIntegrityDescImpUse *lvidiu = 2111 struct logicalVolIntegrityDescImpUse *lvidiu =
2033 udf_sb_lvidiu(sbi); 2112 udf_sb_lvidiu(sb);
2034 uint16_t minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev); 2113 uint16_t minUDFReadRev;
2035 uint16_t minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev); 2114 uint16_t minUDFWriteRev;
2036 /* uint16_t maxUDFWriteRev =
2037 le16_to_cpu(lvidiu->maxUDFWriteRev); */
2038 2115
2116 if (!lvidiu) {
2117 ret = -EINVAL;
2118 goto error_out;
2119 }
2120 minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev);
2121 minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev);
2039 if (minUDFReadRev > UDF_MAX_READ_VERSION) { 2122 if (minUDFReadRev > UDF_MAX_READ_VERSION) {
2040 udf_err(sb, "minUDFReadRev=%x (max is %x)\n", 2123 udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
2041 le16_to_cpu(lvidiu->minUDFReadRev), 2124 minUDFReadRev,
2042 UDF_MAX_READ_VERSION); 2125 UDF_MAX_READ_VERSION);
2126 ret = -EINVAL;
2127 goto error_out;
2128 } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION &&
2129 !(sb->s_flags & MS_RDONLY)) {
2130 ret = -EACCES;
2043 goto error_out; 2131 goto error_out;
2044 } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) 2132 }
2045 sb->s_flags |= MS_RDONLY;
2046 2133
2047 sbi->s_udfrev = minUDFWriteRev; 2134 sbi->s_udfrev = minUDFWriteRev;
2048 2135
@@ -2054,17 +2141,20 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2054 2141
2055 if (!sbi->s_partitions) { 2142 if (!sbi->s_partitions) {
2056 udf_warn(sb, "No partition found (2)\n"); 2143 udf_warn(sb, "No partition found (2)\n");
2144 ret = -EINVAL;
2057 goto error_out; 2145 goto error_out;
2058 } 2146 }
2059 2147
2060 if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & 2148 if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
2061 UDF_PART_FLAG_READ_ONLY) { 2149 UDF_PART_FLAG_READ_ONLY &&
2062 pr_notice("Partition marked readonly; forcing readonly mount\n"); 2150 !(sb->s_flags & MS_RDONLY)) {
2063 sb->s_flags |= MS_RDONLY; 2151 ret = -EACCES;
2152 goto error_out;
2064 } 2153 }
2065 2154
2066 if (udf_find_fileset(sb, &fileset, &rootdir)) { 2155 if (udf_find_fileset(sb, &fileset, &rootdir)) {
2067 udf_warn(sb, "No fileset found\n"); 2156 udf_warn(sb, "No fileset found\n");
2157 ret = -EINVAL;
2068 goto error_out; 2158 goto error_out;
2069 } 2159 }
2070 2160
@@ -2086,6 +2176,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2086 if (!inode) { 2176 if (!inode) {
2087 udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", 2177 udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
2088 rootdir.logicalBlockNum, rootdir.partitionReferenceNum); 2178 rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
2179 ret = -EIO;
2089 goto error_out; 2180 goto error_out;
2090 } 2181 }
2091 2182
@@ -2093,6 +2184,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2093 sb->s_root = d_make_root(inode); 2184 sb->s_root = d_make_root(inode);
2094 if (!sb->s_root) { 2185 if (!sb->s_root) {
2095 udf_err(sb, "Couldn't allocate root dentry\n"); 2186 udf_err(sb, "Couldn't allocate root dentry\n");
2187 ret = -ENOMEM;
2096 goto error_out; 2188 goto error_out;
2097 } 2189 }
2098 sb->s_maxbytes = MAX_LFS_FILESIZE; 2190 sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -2113,7 +2205,7 @@ error_out:
2113 kfree(sbi); 2205 kfree(sbi);
2114 sb->s_fs_info = NULL; 2206 sb->s_fs_info = NULL;
2115 2207
2116 return -EINVAL; 2208 return ret;
2117} 2209}
2118 2210
2119void _udf_err(struct super_block *sb, const char *function, 2211void _udf_err(struct super_block *sb, const char *function,
@@ -2193,11 +2285,7 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2193 struct logicalVolIntegrityDescImpUse *lvidiu; 2285 struct logicalVolIntegrityDescImpUse *lvidiu;
2194 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 2286 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
2195 2287
2196 if (sbi->s_lvid_bh != NULL) 2288 lvidiu = udf_sb_lvidiu(sb);
2197 lvidiu = udf_sb_lvidiu(sbi);
2198 else
2199 lvidiu = NULL;
2200
2201 buf->f_type = UDF_SUPER_MAGIC; 2289 buf->f_type = UDF_SUPER_MAGIC;
2202 buf->f_bsize = sb->s_blocksize; 2290 buf->f_bsize = sb->s_blocksize;
2203 buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len; 2291 buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index ed401e94aa8c..1f32c7bd9f57 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -162,7 +162,7 @@ static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
162 return sb->s_fs_info; 162 return sb->s_fs_info;
163} 163}
164 164
165struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi); 165struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb);
166 166
167int udf_compute_nr_groups(struct super_block *sb, u32 partition); 167int udf_compute_nr_groups(struct super_block *sb, u32 partition);
168 168
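
With the udf_sb_lvidiu() prototype change above, the helper takes the super_block itself and can return NULL (no LVID buffer loaded, or an unusable descriptor), so callers check the result instead of testing sbi->s_lvid_bh first, as udf_fill_super() and udf_statfs() now do. A sketch of the new call pattern; udf_check_revision() is a hypothetical wrapper, not part of the patch:

/* Sketch: every caller must now tolerate a NULL return */
static int udf_check_revision(struct super_block *sb)
{
	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);

	if (!lvidiu)
		return -EINVAL;	/* no usable LVID on this volume */
	if (le16_to_cpu(lvidiu->minUDFReadRev) > UDF_MAX_READ_VERSION)
		return -EINVAL;
	return 0;
}
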
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index ff24e4449ece..c8ca96086784 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -531,7 +531,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
531 struct inode *inode = mapping->host; 531 struct inode *inode = mapping->host;
532 532
533 if (to > inode->i_size) 533 if (to > inode->i_size)
534 truncate_pagecache(inode, to, inode->i_size); 534 truncate_pagecache(inode, inode->i_size);
535} 535}
536 536
537static int ufs_write_begin(struct file *file, struct address_space *mapping, 537static int ufs_write_begin(struct file *file, struct address_space *mapping,
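
The one-line ufs change tracks a VFS API cleanup: truncate_pagecache() dropped its old-size argument, which the implementation never used, so callers now pass only the size to truncate the page cache down to (the same substitution appears in the xfs_aops.c hunks below). Assuming that reading of the API change:

/* Before: truncate_pagecache(inode, to, inode->i_size);
 * After:  truncate_pagecache(inode, inode->i_size);
 * Only the target size matters; pages beyond it are dropped.
 */
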
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 4a4508023a3c..0719e4db93f2 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -27,9 +27,12 @@ xfs-y += xfs_trace.o
27 27
28# highlevel code 28# highlevel code
29xfs-y += xfs_aops.o \ 29xfs-y += xfs_aops.o \
30 xfs_attr_inactive.o \
31 xfs_attr_list.o \
30 xfs_bit.o \ 32 xfs_bit.o \
33 xfs_bmap_util.o \
31 xfs_buf.o \ 34 xfs_buf.o \
32 xfs_dfrag.o \ 35 xfs_dir2_readdir.o \
33 xfs_discard.o \ 36 xfs_discard.o \
34 xfs_error.o \ 37 xfs_error.o \
35 xfs_export.o \ 38 xfs_export.o \
@@ -44,11 +47,11 @@ xfs-y += xfs_aops.o \
44 xfs_iops.o \ 47 xfs_iops.o \
45 xfs_itable.o \ 48 xfs_itable.o \
46 xfs_message.o \ 49 xfs_message.o \
50 xfs_mount.o \
47 xfs_mru_cache.o \ 51 xfs_mru_cache.o \
48 xfs_rename.o \
49 xfs_super.o \ 52 xfs_super.o \
50 xfs_utils.o \ 53 xfs_symlink.o \
51 xfs_vnodeops.o \ 54 xfs_trans.o \
52 xfs_xattr.o \ 55 xfs_xattr.o \
53 kmem.o \ 56 kmem.o \
54 uuid.o 57 uuid.o
@@ -73,10 +76,13 @@ xfs-y += xfs_alloc.o \
73 xfs_ialloc_btree.o \ 76 xfs_ialloc_btree.o \
74 xfs_icreate_item.o \ 77 xfs_icreate_item.o \
75 xfs_inode.o \ 78 xfs_inode.o \
79 xfs_inode_fork.o \
80 xfs_inode_buf.o \
76 xfs_log_recover.o \ 81 xfs_log_recover.o \
77 xfs_mount.o \ 82 xfs_log_rlimit.o \
78 xfs_symlink.o \ 83 xfs_sb.o \
79 xfs_trans.o 84 xfs_symlink_remote.o \
85 xfs_trans_resv.o
80 86
81# low-level transaction/log code 87# low-level transaction/log code
82xfs-y += xfs_log.o \ 88xfs-y += xfs_log.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 4a7286c1dc80..a02cfb9e3bce 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -27,8 +27,6 @@
27 27
28/* 28/*
29 * Greedy allocation. May fail and may return vmalloced memory. 29 * Greedy allocation. May fail and may return vmalloced memory.
30 *
31 * Must be freed using kmem_free_large.
32 */ 30 */
33void * 31void *
34kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) 32kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
@@ -36,7 +34,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
36 void *ptr; 34 void *ptr;
37 size_t kmsize = maxsize; 35 size_t kmsize = maxsize;
38 36
39 while (!(ptr = kmem_zalloc_large(kmsize))) { 37 while (!(ptr = vzalloc(kmsize))) {
40 if ((kmsize >>= 1) <= minsize) 38 if ((kmsize >>= 1) <= minsize)
41 kmsize = minsize; 39 kmsize = minsize;
42 } 40 }
@@ -75,6 +73,17 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
75 return ptr; 73 return ptr;
76} 74}
77 75
76void *
77kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
78{
79 void *ptr;
80
81 ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
82 if (ptr)
83 return ptr;
84 return vzalloc(size);
85}
86
78void 87void
79kmem_free(const void *ptr) 88kmem_free(const void *ptr)
80{ 89{
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index b2f2620f9a87..3a7371cab508 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -57,17 +57,10 @@ kmem_flags_convert(xfs_km_flags_t flags)
57 57
58extern void *kmem_alloc(size_t, xfs_km_flags_t); 58extern void *kmem_alloc(size_t, xfs_km_flags_t);
59extern void *kmem_zalloc(size_t, xfs_km_flags_t); 59extern void *kmem_zalloc(size_t, xfs_km_flags_t);
60extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
60extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); 61extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
61extern void kmem_free(const void *); 62extern void kmem_free(const void *);
62 63
63static inline void *kmem_zalloc_large(size_t size)
64{
65 return vzalloc(size);
66}
67static inline void kmem_free_large(void *ptr)
68{
69 vfree(ptr);
70}
71 64
72extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); 65extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
73 66
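
kmem_zalloc_large() stops being a bare vzalloc() wrapper: the new version first tries kmem_zalloc() with KM_MAYFAIL and only then falls back to vzalloc(), and kmem_free_large() disappears because xfs's kmem_free() already distinguishes vmalloc from kmalloc memory (via is_vmalloc_addr(), assuming the unmodified part of kmem.c matches mainline). Usage then looks like the xfs_acl.c hunks below; a sketch:

/* Sketch: one allocation call, one free call, either backing store */
static int demo_use_big_buffer(size_t len)
{
	void *buf = kmem_zalloc_large(len, KM_SLEEP); /* kmalloc, then vzalloc */

	if (!buf)
		return -ENOMEM;
	/* ... fill and consume buf ... */
	kmem_free(buf);		/* frees kmalloc'd and vmalloc'd memory alike */
	return 0;
}
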
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 306d883d89bc..0e2f37efedd0 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -16,11 +16,13 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_log_format.h"
20#include "xfs_trans_resv.h"
19#include "xfs_acl.h" 21#include "xfs_acl.h"
20#include "xfs_attr.h" 22#include "xfs_attr.h"
21#include "xfs_bmap_btree.h" 23#include "xfs_bmap_btree.h"
22#include "xfs_inode.h" 24#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 25#include "xfs_ag.h"
24#include "xfs_sb.h" 26#include "xfs_sb.h"
25#include "xfs_mount.h" 27#include "xfs_mount.h"
26#include "xfs_trace.h" 28#include "xfs_trace.h"
@@ -68,14 +70,15 @@ xfs_acl_from_disk(
68 70
69 switch (acl_e->e_tag) { 71 switch (acl_e->e_tag) {
70 case ACL_USER: 72 case ACL_USER:
73 acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
74 break;
71 case ACL_GROUP: 75 case ACL_GROUP:
72 acl_e->e_id = be32_to_cpu(ace->ae_id); 76 acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
73 break; 77 break;
74 case ACL_USER_OBJ: 78 case ACL_USER_OBJ:
75 case ACL_GROUP_OBJ: 79 case ACL_GROUP_OBJ:
76 case ACL_MASK: 80 case ACL_MASK:
77 case ACL_OTHER: 81 case ACL_OTHER:
78 acl_e->e_id = ACL_UNDEFINED_ID;
79 break; 82 break;
80 default: 83 default:
81 goto fail; 84 goto fail;
@@ -101,7 +104,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
101 acl_e = &acl->a_entries[i]; 104 acl_e = &acl->a_entries[i];
102 105
103 ace->ae_tag = cpu_to_be32(acl_e->e_tag); 106 ace->ae_tag = cpu_to_be32(acl_e->e_tag);
104 ace->ae_id = cpu_to_be32(acl_e->e_id); 107 switch (acl_e->e_tag) {
108 case ACL_USER:
109 ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
110 break;
111 case ACL_GROUP:
112 ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
113 break;
114 default:
115 ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
116 break;
117 }
118
105 ace->ae_perm = cpu_to_be16(acl_e->e_perm); 119 ace->ae_perm = cpu_to_be16(acl_e->e_perm);
106 } 120 }
107} 121}
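
The ACL changes follow the kuid_t/kgid_t conversion: struct posix_acl_entry now carries e_uid/e_gid instead of a raw e_id, so the on-disk 32-bit IDs are translated exactly once, at the disk boundary. The xfs_uid_to_kuid()/xfs_kuid_to_uid() helpers used above are, as this series appears to define them elsewhere (placement assumed), thin wrappers over the generic conversions against &init_user_ns:

/* Assumed shape of the conversion helpers relied on above */
static inline __uint32_t xfs_kuid_to_uid(kuid_t uid)
{
	return from_kuid(&init_user_ns, uid);
}

static inline kuid_t xfs_uid_to_kuid(__uint32_t uid)
{
	return make_kuid(&init_user_ns, uid);
}
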
@@ -138,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type)
138 * go out to the disk. 152 * go out to the disk.
139 */ 153 */
140 len = XFS_ACL_MAX_SIZE(ip->i_mount); 154 len = XFS_ACL_MAX_SIZE(ip->i_mount);
141 xfs_acl = kzalloc(len, GFP_KERNEL); 155 xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
142 if (!xfs_acl) 156 if (!xfs_acl)
143 return ERR_PTR(-ENOMEM); 157 return ERR_PTR(-ENOMEM);
144 158
@@ -161,10 +175,10 @@ xfs_get_acl(struct inode *inode, int type)
161 if (IS_ERR(acl)) 175 if (IS_ERR(acl))
162 goto out; 176 goto out;
163 177
164 out_update_cache: 178out_update_cache:
165 set_cached_acl(inode, type, acl); 179 set_cached_acl(inode, type, acl);
166 out: 180out:
167 kfree(xfs_acl); 181 kmem_free(xfs_acl);
168 return acl; 182 return acl;
169} 183}
170 184
@@ -195,7 +209,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
195 struct xfs_acl *xfs_acl; 209 struct xfs_acl *xfs_acl;
196 int len = XFS_ACL_MAX_SIZE(ip->i_mount); 210 int len = XFS_ACL_MAX_SIZE(ip->i_mount);
197 211
198 xfs_acl = kzalloc(len, GFP_KERNEL); 212 xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
199 if (!xfs_acl) 213 if (!xfs_acl)
200 return -ENOMEM; 214 return -ENOMEM;
201 215
@@ -208,7 +222,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
208 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, 222 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
209 len, ATTR_ROOT); 223 len, ATTR_ROOT);
210 224
211 kfree(xfs_acl); 225 kmem_free(xfs_acl);
212 } else { 226 } else {
213 /* 227 /*
214 * A NULL ACL argument means we want to remove the ACL. 228 * A NULL ACL argument means we want to remove the ACL.
@@ -360,7 +374,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
360 return -EINVAL; 374 return -EINVAL;
361 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) 375 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
362 return value ? -EACCES : 0; 376 return value ? -EACCES : 0;
363 if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) 377 if (!inode_owner_or_capable(inode))
364 return -EPERM; 378 return -EPERM;
365 379
366 if (!value) 380 if (!value)
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 317aa86d96ea..1cb740afd674 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,59 +227,6 @@ typedef struct xfs_agfl {
227} xfs_agfl_t; 227} xfs_agfl_t;
228 228
229/* 229/*
230 * Per-ag incore structure, copies of information in agf and agi,
231 * to improve the performance of allocation group selection.
232 */
233#define XFS_PAGB_NUM_SLOTS 128
234
235typedef struct xfs_perag {
236 struct xfs_mount *pag_mount; /* owner filesystem */
237 xfs_agnumber_t pag_agno; /* AG this structure belongs to */
238 atomic_t pag_ref; /* perag reference count */
239 char pagf_init; /* this agf's entry is initialized */
240 char pagi_init; /* this agi's entry is initialized */
241 char pagf_metadata; /* the agf is preferred to be metadata */
242 char pagi_inodeok; /* The agi is ok for inodes */
243 __uint8_t pagf_levels[XFS_BTNUM_AGF];
244 /* # of levels in bno & cnt btree */
245 __uint32_t pagf_flcount; /* count of blocks in freelist */
246 xfs_extlen_t pagf_freeblks; /* total free blocks */
247 xfs_extlen_t pagf_longest; /* longest free space */
248 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
249 xfs_agino_t pagi_freecount; /* number of free inodes */
250 xfs_agino_t pagi_count; /* number of allocated inodes */
251
252 /*
253 * Inode allocation search lookup optimisation.
254 * If the pagino matches, the search for new inodes
255 * doesn't need to search the near ones again straight away
256 */
257 xfs_agino_t pagl_pagino;
258 xfs_agino_t pagl_leftrec;
259 xfs_agino_t pagl_rightrec;
260#ifdef __KERNEL__
261 spinlock_t pagb_lock; /* lock for pagb_tree */
262 struct rb_root pagb_tree; /* ordered tree of busy extents */
263
264 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
265
266 spinlock_t pag_ici_lock; /* incore inode cache lock */
267 struct radix_tree_root pag_ici_root; /* incore inode cache root */
268 int pag_ici_reclaimable; /* reclaimable inodes */
269 struct mutex pag_ici_reclaim_lock; /* serialisation point */
270 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
271
272 /* buffer cache index */
273 spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
274 struct rb_root pag_buf_tree; /* ordered tree of active buffers */
275
276 /* for rcu-safe freeing */
277 struct rcu_head rcu_head;
278#endif
279 int pagb_count; /* pagb slots in use */
280} xfs_perag_t;
281
282/*
283 * tags for inode radix tree 230 * tags for inode radix tree
284 */ 231 */
285#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup 232#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 71596e57283a..5a1393f5e020 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -878,7 +878,7 @@ xfs_alloc_ag_vextent_near(
878 xfs_agblock_t ltnew; /* useful start bno of left side */ 878 xfs_agblock_t ltnew; /* useful start bno of left side */
879 xfs_extlen_t rlen; /* length of returned extent */ 879 xfs_extlen_t rlen; /* length of returned extent */
880 int forced = 0; 880 int forced = 0;
881#if defined(DEBUG) && defined(__KERNEL__) 881#ifdef DEBUG
882 /* 882 /*
883 * Randomly don't execute the first algorithm. 883 * Randomly don't execute the first algorithm.
884 */ 884 */
@@ -938,8 +938,8 @@ restart:
938 xfs_extlen_t blen=0; 938 xfs_extlen_t blen=0;
939 xfs_agblock_t bnew=0; 939 xfs_agblock_t bnew=0;
940 940
941#if defined(DEBUG) && defined(__KERNEL__) 941#ifdef DEBUG
942 if (!dofirst) 942 if (dofirst)
943 break; 943 break;
944#endif 944#endif
945 /* 945 /*
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 596ec71da00e..e51e581454e9 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -28,9 +28,9 @@
28#include "xfs_alloc.h" 28#include "xfs_alloc.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30#include "xfs_iomap.h" 30#include "xfs_iomap.h"
31#include "xfs_vnodeops.h"
32#include "xfs_trace.h" 31#include "xfs_trace.h"
33#include "xfs_bmap.h" 32#include "xfs_bmap.h"
33#include "xfs_bmap_util.h"
34#include <linux/aio.h> 34#include <linux/aio.h>
35#include <linux/gfp.h> 35#include <linux/gfp.h>
36#include <linux/mpage.h> 36#include <linux/mpage.h>
@@ -86,14 +86,6 @@ xfs_destroy_ioend(
86 bh->b_end_io(bh, !ioend->io_error); 86 bh->b_end_io(bh, !ioend->io_error);
87 } 87 }
88 88
89 if (ioend->io_iocb) {
90 inode_dio_done(ioend->io_inode);
91 if (ioend->io_isasync) {
92 aio_complete(ioend->io_iocb, ioend->io_error ?
93 ioend->io_error : ioend->io_result, 0);
94 }
95 }
96
97 mempool_free(ioend, xfs_ioend_pool); 89 mempool_free(ioend, xfs_ioend_pool);
98} 90}
99 91
@@ -116,7 +108,7 @@ xfs_setfilesize_trans_alloc(
116 108
117 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 109 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
118 110
119 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); 111 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
120 if (error) { 112 if (error) {
121 xfs_trans_cancel(tp, 0); 113 xfs_trans_cancel(tp, 0);
122 return error; 114 return error;
@@ -281,7 +273,6 @@ xfs_alloc_ioend(
281 * all the I/O from calling the completion routine too early. 273 * all the I/O from calling the completion routine too early.
282 */ 274 */
283 atomic_set(&ioend->io_remaining, 1); 275 atomic_set(&ioend->io_remaining, 1);
284 ioend->io_isasync = 0;
285 ioend->io_isdirect = 0; 276 ioend->io_isdirect = 0;
286 ioend->io_error = 0; 277 ioend->io_error = 0;
287 ioend->io_list = NULL; 278 ioend->io_list = NULL;
@@ -291,8 +282,6 @@ xfs_alloc_ioend(
291 ioend->io_buffer_tail = NULL; 282 ioend->io_buffer_tail = NULL;
292 ioend->io_offset = 0; 283 ioend->io_offset = 0;
293 ioend->io_size = 0; 284 ioend->io_size = 0;
294 ioend->io_iocb = NULL;
295 ioend->io_result = 0;
296 ioend->io_append_trans = NULL; 285 ioend->io_append_trans = NULL;
297 286
298 INIT_WORK(&ioend->io_work, xfs_end_io); 287 INIT_WORK(&ioend->io_work, xfs_end_io);
@@ -451,7 +440,7 @@ xfs_start_page_writeback(
451 end_page_writeback(page); 440 end_page_writeback(page);
452} 441}
453 442
454static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) 443static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
455{ 444{
456 return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); 445 return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
457} 446}
@@ -525,7 +514,7 @@ xfs_submit_ioend(
525 goto retry; 514 goto retry;
526 } 515 }
527 516
528 if (bio_add_buffer(bio, bh) != bh->b_size) { 517 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
529 xfs_submit_ioend_bio(wbc, ioend, bio); 518 xfs_submit_ioend_bio(wbc, ioend, bio);
530 goto retry; 519 goto retry;
531 } 520 }
@@ -1292,8 +1281,10 @@ __xfs_get_blocks(
1292 if (create || !ISUNWRITTEN(&imap)) 1281 if (create || !ISUNWRITTEN(&imap))
1293 xfs_map_buffer(inode, bh_result, &imap, offset); 1282 xfs_map_buffer(inode, bh_result, &imap, offset);
1294 if (create && ISUNWRITTEN(&imap)) { 1283 if (create && ISUNWRITTEN(&imap)) {
1295 if (direct) 1284 if (direct) {
1296 bh_result->b_private = inode; 1285 bh_result->b_private = inode;
1286 set_buffer_defer_completion(bh_result);
1287 }
1297 set_buffer_unwritten(bh_result); 1288 set_buffer_unwritten(bh_result);
1298 } 1289 }
1299 } 1290 }
@@ -1390,9 +1381,7 @@ xfs_end_io_direct_write(
1390 struct kiocb *iocb, 1381 struct kiocb *iocb,
1391 loff_t offset, 1382 loff_t offset,
1392 ssize_t size, 1383 ssize_t size,
1393 void *private, 1384 void *private)
1394 int ret,
1395 bool is_async)
1396{ 1385{
1397 struct xfs_ioend *ioend = iocb->private; 1386 struct xfs_ioend *ioend = iocb->private;
1398 1387
@@ -1414,17 +1403,10 @@ xfs_end_io_direct_write(
1414 1403
1415 ioend->io_offset = offset; 1404 ioend->io_offset = offset;
1416 ioend->io_size = size; 1405 ioend->io_size = size;
1417 ioend->io_iocb = iocb;
1418 ioend->io_result = ret;
1419 if (private && size > 0) 1406 if (private && size > 0)
1420 ioend->io_type = XFS_IO_UNWRITTEN; 1407 ioend->io_type = XFS_IO_UNWRITTEN;
1421 1408
1422 if (is_async) { 1409 xfs_finish_ioend_sync(ioend);
1423 ioend->io_isasync = 1;
1424 xfs_finish_ioend(ioend);
1425 } else {
1426 xfs_finish_ioend_sync(ioend);
1427 }
1428} 1410}
1429 1411
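
The removal of io_isasync/io_iocb above goes with a generic direct-I/O change: the dio core now owns aio_complete() and, when set_buffer_defer_completion() was used on the mapping buffer (as in __xfs_get_blocks() above), invokes the filesystem's ->end_io from a workqueue. The callback therefore loses its ret/is_async arguments and can finish unwritten-extent conversion synchronously. Assuming that reading of the new contract, the callback shape is:

/* Sketch of the reduced ->end_io signature; no aio_complete() here,
 * the generic dio code completes the iocb after this returns.
 */
static void demo_end_io_direct_write(struct kiocb *iocb, loff_t offset,
				     ssize_t size, void *private)
{
	/* convert unwritten extents, update on-disk size, etc. */
}
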
1430STATIC ssize_t 1412STATIC ssize_t
@@ -1516,13 +1498,26 @@ xfs_vm_write_failed(
1516 loff_t pos, 1498 loff_t pos,
1517 unsigned len) 1499 unsigned len)
1518{ 1500{
1519 loff_t block_offset = pos & PAGE_MASK; 1501 loff_t block_offset;
1520 loff_t block_start; 1502 loff_t block_start;
1521 loff_t block_end; 1503 loff_t block_end;
1522 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1504 loff_t from = pos & (PAGE_CACHE_SIZE - 1);
1523 loff_t to = from + len; 1505 loff_t to = from + len;
1524 struct buffer_head *bh, *head; 1506 struct buffer_head *bh, *head;
1525 1507
1508 /*
1509 * The request pos offset might be 32 or 64 bit, this is all fine
1510 * on 64-bit platform. However, for 64-bit pos request on 32-bit
1511 * platform, the high 32-bit will be masked off if we evaluate the
1512 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
1513 * 0xfffff000 as an unsigned long, hence the result is incorrect
1514 * which could cause the following ASSERT failed in most cases.
1515 * In order to avoid this, we can evaluate the block_offset of the
1516 * start of the page by using shifts rather than masks the mismatch
1517 * problem.
1518 */
1519 block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
1520
1526 ASSERT(block_offset + from == pos); 1521 ASSERT(block_offset + from == pos);
1527 1522
1528 head = page_buffers(page); 1523 head = page_buffers(page);
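
A worked instance of the overflow the new comment describes, for 4 KiB pages on a 32-bit build where PAGE_MASK is the unsigned long 0xfffff000: converting that mask to 64-bit loff_t zero-extends rather than sign-extends, so the AND clears the high half of pos.

	loff_t pos = 0x100003000LL;	/* 4 GiB + 12 KiB */

	loff_t bad  = pos & PAGE_MASK;	/* 0x3000: top 32 bits lost */
	loff_t good = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
					/* 0x100003000, as intended */

With bad, block_offset + from evaluates to 0x3000 + 0x0 != pos and the ASSERT fires; the shift pair keeps all 64 bits.
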
@@ -1587,7 +1582,7 @@ xfs_vm_write_begin(
1587 unlock_page(page); 1582 unlock_page(page);
1588 1583
1589 if (pos + len > i_size_read(inode)) 1584 if (pos + len > i_size_read(inode))
1590 truncate_pagecache(inode, pos + len, i_size_read(inode)); 1585 truncate_pagecache(inode, i_size_read(inode));
1591 1586
1592 page_cache_release(page); 1587 page_cache_release(page);
1593 page = NULL; 1588 page = NULL;
@@ -1623,7 +1618,7 @@ xfs_vm_write_end(
1623 loff_t to = pos + len; 1618 loff_t to = pos + len;
1624 1619
1625 if (to > isize) { 1620 if (to > isize) {
1626 truncate_pagecache(inode, to, isize); 1621 truncate_pagecache(inode, isize);
1627 xfs_vm_kill_delalloc_range(inode, isize, to); 1622 xfs_vm_kill_delalloc_range(inode, isize, to);
1628 } 1623 }
1629 } 1624 }
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index c325abb8d61a..f94dd459dff9 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -45,7 +45,6 @@ typedef struct xfs_ioend {
45 unsigned int io_type; /* delalloc / unwritten */ 45 unsigned int io_type; /* delalloc / unwritten */
46 int io_error; /* I/O error code */ 46 int io_error; /* I/O error code */
47 atomic_t io_remaining; /* hold count */ 47 atomic_t io_remaining; /* hold count */
48 unsigned int io_isasync : 1; /* needs aio_complete */
49 unsigned int io_isdirect : 1;/* direct I/O */ 48 unsigned int io_isdirect : 1;/* direct I/O */
50 struct inode *io_inode; /* file being written to */ 49 struct inode *io_inode; /* file being written to */
51 struct buffer_head *io_buffer_head;/* buffer linked list head */ 50 struct buffer_head *io_buffer_head;/* buffer linked list head */
@@ -54,8 +53,6 @@ typedef struct xfs_ioend {
54 xfs_off_t io_offset; /* offset in the file */ 53 xfs_off_t io_offset; /* offset in the file */
55 struct work_struct io_work; /* xfsdatad work queue */ 54 struct work_struct io_work; /* xfsdatad work queue */
56 struct xfs_trans *io_append_trans;/* xact. for size update */ 55 struct xfs_trans *io_append_trans;/* xact. for size update */
57 struct kiocb *io_iocb;
58 int io_result;
59} xfs_ioend_t; 56} xfs_ioend_t;
60 57
61extern const struct address_space_operations xfs_address_space_operations; 58extern const struct address_space_operations xfs_address_space_operations;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 20fe3fe9d341..ddcf2267ffa6 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -17,10 +17,11 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_trans_priv.h"
24#include "xfs_sb.h" 25#include "xfs_sb.h"
25#include "xfs_ag.h" 26#include "xfs_ag.h"
26#include "xfs_mount.h" 27#include "xfs_mount.h"
@@ -32,13 +33,13 @@
32#include "xfs_alloc.h" 33#include "xfs_alloc.h"
33#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
34#include "xfs_bmap.h" 35#include "xfs_bmap.h"
36#include "xfs_bmap_util.h"
35#include "xfs_attr.h" 37#include "xfs_attr.h"
36#include "xfs_attr_leaf.h" 38#include "xfs_attr_leaf.h"
37#include "xfs_attr_remote.h" 39#include "xfs_attr_remote.h"
38#include "xfs_error.h" 40#include "xfs_error.h"
39#include "xfs_quota.h" 41#include "xfs_quota.h"
40#include "xfs_trans_space.h" 42#include "xfs_trans_space.h"
41#include "xfs_vnodeops.h"
42#include "xfs_trace.h" 43#include "xfs_trace.h"
43 44
44/* 45/*
@@ -62,7 +63,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
62STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); 63STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
63STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); 64STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
64STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); 65STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
65STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
66 66
67/* 67/*
68 * Internal routines when attribute list is more than one block. 68 * Internal routines when attribute list is more than one block.
@@ -70,7 +70,6 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
70STATIC int xfs_attr_node_get(xfs_da_args_t *args); 70STATIC int xfs_attr_node_get(xfs_da_args_t *args);
71STATIC int xfs_attr_node_addname(xfs_da_args_t *args); 71STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
72STATIC int xfs_attr_node_removename(xfs_da_args_t *args); 72STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
73STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
74STATIC int xfs_attr_fillstate(xfs_da_state_t *state); 73STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
75STATIC int xfs_attr_refillstate(xfs_da_state_t *state); 74STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
76 75
@@ -90,7 +89,7 @@ xfs_attr_name_to_xname(
90 return 0; 89 return 0;
91} 90}
92 91
93STATIC int 92int
94xfs_inode_hasattr( 93xfs_inode_hasattr(
95 struct xfs_inode *ip) 94 struct xfs_inode *ip)
96{ 95{
@@ -227,13 +226,14 @@ xfs_attr_set_int(
227 int valuelen, 226 int valuelen,
228 int flags) 227 int flags)
229{ 228{
230 xfs_da_args_t args; 229 xfs_da_args_t args;
231 xfs_fsblock_t firstblock; 230 xfs_fsblock_t firstblock;
232 xfs_bmap_free_t flist; 231 xfs_bmap_free_t flist;
233 int error, err2, committed; 232 int error, err2, committed;
234 xfs_mount_t *mp = dp->i_mount; 233 struct xfs_mount *mp = dp->i_mount;
235 int rsvd = (flags & ATTR_ROOT) != 0; 234 struct xfs_trans_res tres;
236 int local; 235 int rsvd = (flags & ATTR_ROOT) != 0;
236 int local;
237 237
238 /* 238 /*
239 * Attach the dquots to the inode. 239 * Attach the dquots to the inode.
@@ -293,11 +293,11 @@ xfs_attr_set_int(
293 if (rsvd) 293 if (rsvd)
294 args.trans->t_flags |= XFS_TRANS_RESERVE; 294 args.trans->t_flags |= XFS_TRANS_RESERVE;
295 295
296 error = xfs_trans_reserve(args.trans, args.total, 296 tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
297 XFS_ATTRSETM_LOG_RES(mp) + 297 M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
298 XFS_ATTRSETRT_LOG_RES(mp) * args.total, 298 tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
299 0, XFS_TRANS_PERM_LOG_RES, 299 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
300 XFS_ATTRSET_LOG_COUNT); 300 error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
301 if (error) { 301 if (error) {
302 xfs_trans_cancel(args.trans, 0); 302 xfs_trans_cancel(args.trans, 0);
303 return(error); 303 return(error);
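
Log reservations move into struct xfs_trans_res here: instead of the old positional xfs_trans_reserve(tp, blocks, logres, rtextents, flags, logcount) call, a reservation is either taken from the per-mount table M_RES(mp) or assembled locally, then passed by pointer along with block and rt-extent counts. A simplified sketch matching the hunk above (the real code also scales in tr_attrsetrt per block reserved):

/* Sketch: build an ad hoc reservation and reserve against it */
static int demo_reserve(struct xfs_mount *mp, struct xfs_trans *tp,
			uint blocks)
{
	struct xfs_trans_res tres = {
		.tr_logres   = M_RES(mp)->tr_attrsetm.tr_logres,
		.tr_logcount = XFS_ATTRSET_LOG_COUNT,
		.tr_logflags = XFS_TRANS_PERM_LOG_RES,
	};

	return xfs_trans_reserve(tp, &tres, blocks, 0);
}
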
@@ -517,11 +517,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
517 if (flags & ATTR_ROOT) 517 if (flags & ATTR_ROOT)
518 args.trans->t_flags |= XFS_TRANS_RESERVE; 518 args.trans->t_flags |= XFS_TRANS_RESERVE;
519 519
520 if ((error = xfs_trans_reserve(args.trans, 520 error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
521 XFS_ATTRRM_SPACE_RES(mp), 521 XFS_ATTRRM_SPACE_RES(mp), 0);
522 XFS_ATTRRM_LOG_RES(mp), 522 if (error) {
523 0, XFS_TRANS_PERM_LOG_RES,
524 XFS_ATTRRM_LOG_COUNT))) {
525 xfs_trans_cancel(args.trans, 0); 523 xfs_trans_cancel(args.trans, 0);
526 return(error); 524 return(error);
527 } 525 }
@@ -611,228 +609,6 @@ xfs_attr_remove(
611 return xfs_attr_remove_int(dp, &xname, flags); 609 return xfs_attr_remove_int(dp, &xname, flags);
612} 610}
613 611
614int
615xfs_attr_list_int(xfs_attr_list_context_t *context)
616{
617 int error;
618 xfs_inode_t *dp = context->dp;
619
620 XFS_STATS_INC(xs_attr_list);
621
622 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
623 return EIO;
624
625 xfs_ilock(dp, XFS_ILOCK_SHARED);
626
627 /*
628 * Decide on what work routines to call based on the inode size.
629 */
630 if (!xfs_inode_hasattr(dp)) {
631 error = 0;
632 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
633 error = xfs_attr_shortform_list(context);
634 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
635 error = xfs_attr_leaf_list(context);
636 } else {
637 error = xfs_attr_node_list(context);
638 }
639
640 xfs_iunlock(dp, XFS_ILOCK_SHARED);
641
642 return error;
643}
644
645#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
646 (((struct attrlist_ent *) 0)->a_name - (char *) 0)
647#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
648 ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
649 & ~(sizeof(u_int32_t)-1))
650
651/*
652 * Format an attribute and copy it out to the user's buffer.
653 * Take care to check values and protect against them changing later,
654 * we may be reading them directly out of a user buffer.
655 */
656/*ARGSUSED*/
657STATIC int
658xfs_attr_put_listent(
659 xfs_attr_list_context_t *context,
660 int flags,
661 unsigned char *name,
662 int namelen,
663 int valuelen,
664 unsigned char *value)
665{
666 struct attrlist *alist = (struct attrlist *)context->alist;
667 attrlist_ent_t *aep;
668 int arraytop;
669
670 ASSERT(!(context->flags & ATTR_KERNOVAL));
671 ASSERT(context->count >= 0);
672 ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
673 ASSERT(context->firstu >= sizeof(*alist));
674 ASSERT(context->firstu <= context->bufsize);
675
676 /*
677 * Only list entries in the right namespace.
678 */
679 if (((context->flags & ATTR_SECURE) == 0) !=
680 ((flags & XFS_ATTR_SECURE) == 0))
681 return 0;
682 if (((context->flags & ATTR_ROOT) == 0) !=
683 ((flags & XFS_ATTR_ROOT) == 0))
684 return 0;
685
686 arraytop = sizeof(*alist) +
687 context->count * sizeof(alist->al_offset[0]);
688 context->firstu -= ATTR_ENTSIZE(namelen);
689 if (context->firstu < arraytop) {
690 trace_xfs_attr_list_full(context);
691 alist->al_more = 1;
692 context->seen_enough = 1;
693 return 1;
694 }
695
696 aep = (attrlist_ent_t *)&context->alist[context->firstu];
697 aep->a_valuelen = valuelen;
698 memcpy(aep->a_name, name, namelen);
699 aep->a_name[namelen] = 0;
700 alist->al_offset[context->count++] = context->firstu;
701 alist->al_count = context->count;
702 trace_xfs_attr_list_add(context);
703 return 0;
704}
705
706/*
707 * Generate a list of extended attribute names and optionally
708 * also value lengths. Positive return value follows the XFS
709 * convention of being an error, zero or negative return code
710 * is the length of the buffer returned (negated), indicating
711 * success.
712 */
713int
714xfs_attr_list(
715 xfs_inode_t *dp,
716 char *buffer,
717 int bufsize,
718 int flags,
719 attrlist_cursor_kern_t *cursor)
720{
721 xfs_attr_list_context_t context;
722 struct attrlist *alist;
723 int error;
724
725 /*
726 * Validate the cursor.
727 */
728 if (cursor->pad1 || cursor->pad2)
729 return(XFS_ERROR(EINVAL));
730 if ((cursor->initted == 0) &&
731 (cursor->hashval || cursor->blkno || cursor->offset))
732 return XFS_ERROR(EINVAL);
733
734 /*
735 * Check for a properly aligned buffer.
736 */
737 if (((long)buffer) & (sizeof(int)-1))
738 return XFS_ERROR(EFAULT);
739 if (flags & ATTR_KERNOVAL)
740 bufsize = 0;
741
742 /*
743 * Initialize the output buffer.
744 */
745 memset(&context, 0, sizeof(context));
746 context.dp = dp;
747 context.cursor = cursor;
748 context.resynch = 1;
749 context.flags = flags;
750 context.alist = buffer;
751 context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
752 context.firstu = context.bufsize;
753 context.put_listent = xfs_attr_put_listent;
754
755 alist = (struct attrlist *)context.alist;
756 alist->al_count = 0;
757 alist->al_more = 0;
758 alist->al_offset[0] = context.bufsize;
759
760 error = xfs_attr_list_int(&context);
761 ASSERT(error >= 0);
762 return error;
763}
764
765int /* error */
766xfs_attr_inactive(xfs_inode_t *dp)
767{
768 xfs_trans_t *trans;
769 xfs_mount_t *mp;
770 int error;
771
772 mp = dp->i_mount;
773 ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
774
775 xfs_ilock(dp, XFS_ILOCK_SHARED);
776 if (!xfs_inode_hasattr(dp) ||
777 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
778 xfs_iunlock(dp, XFS_ILOCK_SHARED);
779 return 0;
780 }
781 xfs_iunlock(dp, XFS_ILOCK_SHARED);
782
783 /*
784 * Start our first transaction of the day.
785 *
786 * All future transactions during this code must be "chained" off
787 * this one via the trans_dup() call. All transactions will contain
788 * the inode, and the inode will always be marked with trans_ihold().
789 * Since the inode will be locked in all transactions, we must log
790 * the inode in every transaction to let it float upward through
791 * the log.
792 */
793 trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
794 if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
795 XFS_TRANS_PERM_LOG_RES,
796 XFS_ATTRINVAL_LOG_COUNT))) {
797 xfs_trans_cancel(trans, 0);
798 return(error);
799 }
800 xfs_ilock(dp, XFS_ILOCK_EXCL);
801
802 /*
803 * No need to make quota reservations here. We expect to release some
804 * blocks, not allocate, in the common case.
805 */
806 xfs_trans_ijoin(trans, dp, 0);
807
808 /*
809 * Decide on what work routines to call based on the inode size.
810 */
811 if (!xfs_inode_hasattr(dp) ||
812 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
813 error = 0;
814 goto out;
815 }
816 error = xfs_attr3_root_inactive(&trans, dp);
817 if (error)
818 goto out;
819
820 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
821 if (error)
822 goto out;
823
824 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
825 xfs_iunlock(dp, XFS_ILOCK_EXCL);
826
827 return(error);
828
829out:
830 xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
831 xfs_iunlock(dp, XFS_ILOCK_EXCL);
832 return(error);
833}
834
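The "chained" transactions described in the comment above are what xfs_trans_roll() provides: it duplicates the transaction, commits the old one and takes a fresh log reservation while the inode stays locked and joined throughout. A sketch of the shape only, not compilable kernel code; invalidate_one_batch() is a hypothetical stand-in for one unit of the invalidation work:

/* Sketch only; invalidate_one_batch() is a hypothetical stand-in for
 * one step of xfs_attr3_root_inactive() and friends. */
static int roll_until_done(struct xfs_trans **trans, struct xfs_inode *dp)
{
	int error, done;

	do {
		done = invalidate_one_batch(*trans, dp);
		/* commit what we have, keep dp locked, re-reserve log space */
		error = xfs_trans_roll(trans, dp);
		if (error)
			return error;
	} while (!done);
	return 0;
}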
835
836 612
837/*======================================================================== 613/*========================================================================
838 * External routines when attribute list is inside the inode 614 * External routines when attribute list is inside the inode
@@ -1166,28 +942,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1166 return error; 942 return error;
1167} 943}
1168 944
1169/*
1170 * Copy out attribute entries for attr_list(), for leaf attribute lists.
1171 */
1172STATIC int
1173xfs_attr_leaf_list(xfs_attr_list_context_t *context)
1174{
1175 int error;
1176 struct xfs_buf *bp;
1177
1178 trace_xfs_attr_leaf_list(context);
1179
1180 context->cursor->blkno = 0;
1181 error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
1182 if (error)
1183 return XFS_ERROR(error);
1184
1185 error = xfs_attr3_leaf_list_int(bp, context);
1186 xfs_trans_brelse(NULL, bp);
1187 return XFS_ERROR(error);
1188}
1189
1190
1191/*======================================================================== 945/*========================================================================
1192 * External routines when attribute list size > XFS_LBSIZE(mp). 946 * External routines when attribute list size > XFS_LBSIZE(mp).
1193 *========================================================================*/ 947 *========================================================================*/
@@ -1260,6 +1014,7 @@ restart:
1260 * have been a b-tree. 1014 * have been a b-tree.
1261 */ 1015 */
1262 xfs_da_state_free(state); 1016 xfs_da_state_free(state);
1017 state = NULL;
1263 xfs_bmap_init(args->flist, args->firstblock); 1018 xfs_bmap_init(args->flist, args->firstblock);
1264 error = xfs_attr3_leaf_to_node(args); 1019 error = xfs_attr3_leaf_to_node(args);
1265 if (!error) { 1020 if (!error) {
@@ -1780,143 +1535,3 @@ xfs_attr_node_get(xfs_da_args_t *args)
1780 xfs_da_state_free(state); 1535 xfs_da_state_free(state);
1781 return(retval); 1536 return(retval);
1782} 1537}
1783
1784STATIC int /* error */
1785xfs_attr_node_list(xfs_attr_list_context_t *context)
1786{
1787 attrlist_cursor_kern_t *cursor;
1788 xfs_attr_leafblock_t *leaf;
1789 xfs_da_intnode_t *node;
1790 struct xfs_attr3_icleaf_hdr leafhdr;
1791 struct xfs_da3_icnode_hdr nodehdr;
1792 struct xfs_da_node_entry *btree;
1793 int error, i;
1794 struct xfs_buf *bp;
1795
1796 trace_xfs_attr_node_list(context);
1797
1798 cursor = context->cursor;
1799 cursor->initted = 1;
1800
1801 /*
1802 * Do all sorts of validation on the passed-in cursor structure.
1803 * If anything is amiss, ignore the cursor and look up the hashval
1804 * starting from the btree root.
1805 */
1806 bp = NULL;
1807 if (cursor->blkno > 0) {
1808 error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
1809 &bp, XFS_ATTR_FORK);
1810 if ((error != 0) && (error != EFSCORRUPTED))
1811 return(error);
1812 if (bp) {
1813 struct xfs_attr_leaf_entry *entries;
1814
1815 node = bp->b_addr;
1816 switch (be16_to_cpu(node->hdr.info.magic)) {
1817 case XFS_DA_NODE_MAGIC:
1818 case XFS_DA3_NODE_MAGIC:
1819 trace_xfs_attr_list_wrong_blk(context);
1820 xfs_trans_brelse(NULL, bp);
1821 bp = NULL;
1822 break;
1823 case XFS_ATTR_LEAF_MAGIC:
1824 case XFS_ATTR3_LEAF_MAGIC:
1825 leaf = bp->b_addr;
1826 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
1827 entries = xfs_attr3_leaf_entryp(leaf);
1828 if (cursor->hashval > be32_to_cpu(
1829 entries[leafhdr.count - 1].hashval)) {
1830 trace_xfs_attr_list_wrong_blk(context);
1831 xfs_trans_brelse(NULL, bp);
1832 bp = NULL;
1833 } else if (cursor->hashval <= be32_to_cpu(
1834 entries[0].hashval)) {
1835 trace_xfs_attr_list_wrong_blk(context);
1836 xfs_trans_brelse(NULL, bp);
1837 bp = NULL;
1838 }
1839 break;
1840 default:
1841 trace_xfs_attr_list_wrong_blk(context);
1842 xfs_trans_brelse(NULL, bp);
1843 bp = NULL;
1844 }
1845 }
1846 }
1847
1848 /*
1849 * We did not find what we expected given the cursor's contents,
1850 * so we start from the top and work down based on the hash value.
1851 * Note that start of node block is same as start of leaf block.
1852 */
1853 if (bp == NULL) {
1854 cursor->blkno = 0;
1855 for (;;) {
1856 __uint16_t magic;
1857
1858 error = xfs_da3_node_read(NULL, context->dp,
1859 cursor->blkno, -1, &bp,
1860 XFS_ATTR_FORK);
1861 if (error)
1862 return(error);
1863 node = bp->b_addr;
1864 magic = be16_to_cpu(node->hdr.info.magic);
1865 if (magic == XFS_ATTR_LEAF_MAGIC ||
1866 magic == XFS_ATTR3_LEAF_MAGIC)
1867 break;
1868 if (magic != XFS_DA_NODE_MAGIC &&
1869 magic != XFS_DA3_NODE_MAGIC) {
1870 XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
1871 XFS_ERRLEVEL_LOW,
1872 context->dp->i_mount,
1873 node);
1874 xfs_trans_brelse(NULL, bp);
1875 return XFS_ERROR(EFSCORRUPTED);
1876 }
1877
1878 xfs_da3_node_hdr_from_disk(&nodehdr, node);
1879 btree = xfs_da3_node_tree_p(node);
1880 for (i = 0; i < nodehdr.count; btree++, i++) {
1881 if (cursor->hashval
1882 <= be32_to_cpu(btree->hashval)) {
1883 cursor->blkno = be32_to_cpu(btree->before);
1884 trace_xfs_attr_list_node_descend(context,
1885 btree);
1886 break;
1887 }
1888 }
1889 if (i == nodehdr.count) {
1890 xfs_trans_brelse(NULL, bp);
1891 return 0;
1892 }
1893 xfs_trans_brelse(NULL, bp);
1894 }
1895 }
1896 ASSERT(bp != NULL);
1897
1898 /*
1899 * Roll upward through the blocks, processing each leaf block in
1900 * order. As long as there is space in the result buffer, keep
1901 * adding the information.
1902 */
1903 for (;;) {
1904 leaf = bp->b_addr;
1905 error = xfs_attr3_leaf_list_int(bp, context);
1906 if (error) {
1907 xfs_trans_brelse(NULL, bp);
1908 return error;
1909 }
1910 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
1911 if (context->seen_enough || leafhdr.forw == 0)
1912 break;
1913 cursor->blkno = leafhdr.forw;
1914 xfs_trans_brelse(NULL, bp);
1915 error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
1916 &bp);
1917 if (error)
1918 return error;
1919 }
1920 xfs_trans_brelse(NULL, bp);
1921 return 0;
1922}
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index de8dd58da46c..dd4824589470 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -141,5 +141,14 @@ typedef struct xfs_attr_list_context {
141 */ 141 */
142int xfs_attr_inactive(struct xfs_inode *dp); 142int xfs_attr_inactive(struct xfs_inode *dp);
143int xfs_attr_list_int(struct xfs_attr_list_context *); 143int xfs_attr_list_int(struct xfs_attr_list_context *);
144int xfs_inode_hasattr(struct xfs_inode *ip);
145int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
146 unsigned char *value, int *valuelenp, int flags);
147int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
148 unsigned char *value, int valuelen, int flags);
149int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
150int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
151 int flags, struct attrlist_cursor_kern *cursor);
152
144 153
145#endif /* __XFS_ATTR_H__ */ 154#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
new file mode 100644
index 000000000000..bb24b07cbedb
--- /dev/null
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -0,0 +1,453 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_alloc.h"
33#include "xfs_btree.h"
34#include "xfs_attr_remote.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_bmap.h"
39#include "xfs_attr.h"
40#include "xfs_attr_leaf.h"
41#include "xfs_error.h"
42#include "xfs_quota.h"
43#include "xfs_trace.h"
44#include "xfs_trans_priv.h"
45
46/*
47 * Look at all the extents for this logical region,
48 * invalidate any buffers that are incore/in transactions.
49 */
50STATIC int
51xfs_attr3_leaf_freextent(
52 struct xfs_trans **trans,
53 struct xfs_inode *dp,
54 xfs_dablk_t blkno,
55 int blkcnt)
56{
57 struct xfs_bmbt_irec map;
58 struct xfs_buf *bp;
59 xfs_dablk_t tblkno;
60 xfs_daddr_t dblkno;
61 int tblkcnt;
62 int dblkcnt;
63 int nmap;
64 int error;
65
66 /*
67 * Roll through the "value", invalidating the attribute value's
68 * blocks.
69 */
70 tblkno = blkno;
71 tblkcnt = blkcnt;
72 while (tblkcnt > 0) {
73 /*
74 * Try to remember where we decided to put the value.
75 */
76 nmap = 1;
77 error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
78 &map, &nmap, XFS_BMAPI_ATTRFORK);
79 if (error) {
80 return(error);
81 }
82 ASSERT(nmap == 1);
83 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
84
85 /*
86 * If it's a hole, these are already unmapped
87 * so there's nothing to invalidate.
88 */
89 if (map.br_startblock != HOLESTARTBLOCK) {
90
91 dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
92 map.br_startblock);
93 dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
94 map.br_blockcount);
95 bp = xfs_trans_get_buf(*trans,
96 dp->i_mount->m_ddev_targp,
97 dblkno, dblkcnt, 0);
98 if (!bp)
99 return ENOMEM;
100 xfs_trans_binval(*trans, bp);
101 /*
102 * Roll to next transaction.
103 */
104 error = xfs_trans_roll(trans, dp);
105 if (error)
106 return (error);
107 }
108
109 tblkno += map.br_blockcount;
110 tblkcnt -= map.br_blockcount;
111 }
112
113 return(0);
114}
115
116/*
117 * Invalidate all of the "remote" value regions pointed to by a particular
118 * leaf block.
119 * Note that we must release the lock on the buffer so that we are not
120 * caught holding something that the logging code wants to flush to disk.
121 */
122STATIC int
123xfs_attr3_leaf_inactive(
124 struct xfs_trans **trans,
125 struct xfs_inode *dp,
126 struct xfs_buf *bp)
127{
128 struct xfs_attr_leafblock *leaf;
129 struct xfs_attr3_icleaf_hdr ichdr;
130 struct xfs_attr_leaf_entry *entry;
131 struct xfs_attr_leaf_name_remote *name_rmt;
132 struct xfs_attr_inactive_list *list;
133 struct xfs_attr_inactive_list *lp;
134 int error;
135 int count;
136 int size;
137 int tmp;
138 int i;
139
140 leaf = bp->b_addr;
141 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
142
143 /*
144 * Count the number of "remote" value extents.
145 */
146 count = 0;
147 entry = xfs_attr3_leaf_entryp(leaf);
148 for (i = 0; i < ichdr.count; entry++, i++) {
149 if (be16_to_cpu(entry->nameidx) &&
150 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
151 name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
152 if (name_rmt->valueblk)
153 count++;
154 }
155 }
156
157 /*
158 * If there are no "remote" values, we're done.
159 */
160 if (count == 0) {
161 xfs_trans_brelse(*trans, bp);
162 return 0;
163 }
164
165 /*
166 * Allocate storage for a list of all the "remote" value extents.
167 */
168 size = count * sizeof(xfs_attr_inactive_list_t);
169 list = kmem_alloc(size, KM_SLEEP);
170
171 /*
172 * Identify each of the "remote" value extents.
173 */
174 lp = list;
175 entry = xfs_attr3_leaf_entryp(leaf);
176 for (i = 0; i < ichdr.count; entry++, i++) {
177 if (be16_to_cpu(entry->nameidx) &&
178 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
179 name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
180 if (name_rmt->valueblk) {
181 lp->valueblk = be32_to_cpu(name_rmt->valueblk);
182 lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
183 be32_to_cpu(name_rmt->valuelen));
184 lp++;
185 }
186 }
187 }
188 xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
189
190 /*
191 * Invalidate each of the "remote" value extents.
192 */
193 error = 0;
194 for (lp = list, i = 0; i < count; i++, lp++) {
195 tmp = xfs_attr3_leaf_freextent(trans, dp,
196 lp->valueblk, lp->valuelen);
197
198 if (error == 0)
199 error = tmp; /* save only the 1st errno */
200 }
201
202 kmem_free(list);
203 return error;
204}
205
206/*
207 * Recurse (gasp!) through the attribute nodes until we find leaves.
208 * We're doing a depth-first traversal in order to invalidate everything.
209 */
210STATIC int
211xfs_attr3_node_inactive(
212 struct xfs_trans **trans,
213 struct xfs_inode *dp,
214 struct xfs_buf *bp,
215 int level)
216{
217 xfs_da_blkinfo_t *info;
218 xfs_da_intnode_t *node;
219 xfs_dablk_t child_fsb;
220 xfs_daddr_t parent_blkno, child_blkno;
221 int error, i;
222 struct xfs_buf *child_bp;
223 struct xfs_da_node_entry *btree;
224 struct xfs_da3_icnode_hdr ichdr;
225
226 /*
227 * Since this code is recursive (gasp!) we must protect ourselves.
228 */
229 if (level > XFS_DA_NODE_MAXDEPTH) {
230 xfs_trans_brelse(*trans, bp); /* no locks for later trans */
231 return XFS_ERROR(EIO);
232 }
233
234 node = bp->b_addr;
235 xfs_da3_node_hdr_from_disk(&ichdr, node);
236 parent_blkno = bp->b_bn;
237 if (!ichdr.count) {
238 xfs_trans_brelse(*trans, bp);
239 return 0;
240 }
241 btree = xfs_da3_node_tree_p(node);
242 child_fsb = be32_to_cpu(btree[0].before);
243 xfs_trans_brelse(*trans, bp); /* no locks for later trans */
244
245 /*
246 * If this is the node level just above the leaves, simply loop
247 * over the leaves removing all of them. If this is higher up
248 * in the tree, recurse downward.
249 */
250 for (i = 0; i < ichdr.count; i++) {
251 /*
252 * Read the subsidiary block to see what we have to work with.
253 * Don't do this in a transaction. This is a depth-first
254 * traversal of the tree so we may deal with many blocks
255 * before we come back to this one.
256 */
257 error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
258 XFS_ATTR_FORK);
259 if (error)
260 return(error);
261 if (child_bp) {
262 /* save for re-read later */
263 child_blkno = XFS_BUF_ADDR(child_bp);
264
265 /*
266 * Invalidate the subtree, by whatever means its type requires.
267 */
268 info = child_bp->b_addr;
269 switch (info->magic) {
270 case cpu_to_be16(XFS_DA_NODE_MAGIC):
271 case cpu_to_be16(XFS_DA3_NODE_MAGIC):
272 error = xfs_attr3_node_inactive(trans, dp,
273 child_bp, level + 1);
274 break;
275 case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
276 case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
277 error = xfs_attr3_leaf_inactive(trans, dp,
278 child_bp);
279 break;
280 default:
281 error = XFS_ERROR(EIO);
282 xfs_trans_brelse(*trans, child_bp);
283 break;
284 }
285 if (error)
286 return error;
287
288 /*
289 * Remove the subsidiary block from the cache
290 * and from the log.
291 */
292 error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
293 &child_bp, XFS_ATTR_FORK);
294 if (error)
295 return error;
296 xfs_trans_binval(*trans, child_bp);
297 }
298
299 /*
300 * If we're not done, re-read the parent to get the next
301 * child block number.
302 */
303 if (i + 1 < ichdr.count) {
304 error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
305 &bp, XFS_ATTR_FORK);
306 if (error)
307 return error;
308 child_fsb = be32_to_cpu(btree[i + 1].before);
309 xfs_trans_brelse(*trans, bp);
310 }
311 /*
312 * Atomically commit the whole invalidation.
313 */
314 error = xfs_trans_roll(trans, dp);
315 if (error)
316 return error;
317 }
318
319 return 0;
320}
321
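The level argument exists purely as a stack-depth guard: a corrupted on-disk tree (over-deep or cyclic) must not be able to recurse the kernel stack past XFS_DA_NODE_MAXDEPTH. A toy userspace model of the bounded walk, with made-up types:

#include <stdio.h>

#define MAXDEPTH 5			/* stands in for XFS_DA_NODE_MAXDEPTH */

struct tnode {
	int		nchildren;
	struct tnode	*children;
};

/* Refuse to recurse past MAXDEPTH, as xfs_attr3_node_inactive() does. */
static int walk(struct tnode *n, int level)
{
	int i, error;

	if (level > MAXDEPTH)
		return -1;		/* EIO in the kernel code above */
	for (i = 0; i < n->nchildren; i++) {
		error = walk(&n->children[i], level + 1);
		if (error)
			return error;
	}
	return 0;
}

int main(void)
{
	struct tnode leaves[2] = { { 0, NULL }, { 0, NULL } };
	struct tnode root = { 2, leaves };

	printf("walk: %d\n", walk(&root, 1));
	return 0;
}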
322/*
323 * Indiscriminately delete the entire attribute fork
324 *
325 * Recurse (gasp!) through the attribute nodes until we find leaves.
326 * We're doing a depth-first traversal in order to invalidate everything.
327 */
328int
329xfs_attr3_root_inactive(
330 struct xfs_trans **trans,
331 struct xfs_inode *dp)
332{
333 struct xfs_da_blkinfo *info;
334 struct xfs_buf *bp;
335 xfs_daddr_t blkno;
336 int error;
337
338 /*
339 * Read block 0 to see what we have to work with.
340 * We only get here if we have extents, since we remove
341 * the extents in reverse order the extent containing
342 * block 0 must still be there.
343 */
344 error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
345 if (error)
346 return error;
347 blkno = bp->b_bn;
348
349 /*
350 * Invalidate the tree, even if the "tree" is only a single leaf block.
351 * This is a depth-first traversal!
352 */
353 info = bp->b_addr;
354 switch (info->magic) {
355 case cpu_to_be16(XFS_DA_NODE_MAGIC):
356 case cpu_to_be16(XFS_DA3_NODE_MAGIC):
357 error = xfs_attr3_node_inactive(trans, dp, bp, 1);
358 break;
359 case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
360 case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
361 error = xfs_attr3_leaf_inactive(trans, dp, bp);
362 break;
363 default:
364 error = XFS_ERROR(EIO);
365 xfs_trans_brelse(*trans, bp);
366 break;
367 }
368 if (error)
369 return error;
370
371 /*
372 * Invalidate the incore copy of the root block.
373 */
374 error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
375 if (error)
376 return error;
377 xfs_trans_binval(*trans, bp); /* remove from cache */
378 /*
379 * Commit the invalidate and start the next transaction.
380 */
381 error = xfs_trans_roll(trans, dp);
382
383 return error;
384}
385
386int
387xfs_attr_inactive(xfs_inode_t *dp)
388{
389 xfs_trans_t *trans;
390 xfs_mount_t *mp;
391 int error;
392
393 mp = dp->i_mount;
394 ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
395
396 xfs_ilock(dp, XFS_ILOCK_SHARED);
397 if (!xfs_inode_hasattr(dp) ||
398 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
399 xfs_iunlock(dp, XFS_ILOCK_SHARED);
400 return 0;
401 }
402 xfs_iunlock(dp, XFS_ILOCK_SHARED);
403
404 /*
405 * Start our first transaction of the day.
406 *
407 * All future transactions during this code must be "chained" off
408 * this one via the trans_dup() call. All transactions will contain
409 * the inode, and the inode will always be marked with trans_ihold().
410 * Since the inode will be locked in all transactions, we must log
411 * the inode in every transaction to let it float upward through
412 * the log.
413 */
414 trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
415 error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
416 if (error) {
417 xfs_trans_cancel(trans, 0);
418 return(error);
419 }
420 xfs_ilock(dp, XFS_ILOCK_EXCL);
421
422 /*
423 * No need to make quota reservations here. We expect to release some
424 * blocks, not allocate, in the common case.
425 */
426 xfs_trans_ijoin(trans, dp, 0);
427
428 /*
429 * Decide on what work routines to call based on the inode size.
430 */
431 if (!xfs_inode_hasattr(dp) ||
432 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
433 error = 0;
434 goto out;
435 }
436 error = xfs_attr3_root_inactive(&trans, dp);
437 if (error)
438 goto out;
439
440 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
441 if (error)
442 goto out;
443
444 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
445 xfs_iunlock(dp, XFS_ILOCK_EXCL);
446
447 return(error);
448
449out:
450 xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
451 xfs_iunlock(dp, XFS_ILOCK_EXCL);
452 return(error);
453}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index b800fbcafc7f..86db20a9cc02 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -22,6 +22,7 @@
22#include "xfs_bit.h" 22#include "xfs_bit.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -78,16 +79,6 @@ STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
78 int *number_usedbytes_in_blk1); 79 int *number_usedbytes_in_blk1);
79 80
80/* 81/*
81 * Routines used for shrinking the Btree.
82 */
83STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
84 struct xfs_buf *bp, int level);
85STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
86 struct xfs_buf *bp);
87STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
88 xfs_dablk_t blkno, int blkcnt);
89
90/*
91 * Utility routines. 82 * Utility routines.
92 */ 83 */
93STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf, 84STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf,
@@ -635,7 +626,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
635 xfs_attr_sf_entry_t *sfe; 626 xfs_attr_sf_entry_t *sfe;
636 int i; 627 int i;
637 628
638 ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE); 629 ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
639 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; 630 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
640 sfe = &sf->list[0]; 631 sfe = &sf->list[0];
641 for (i = 0; i < sf->hdr.count; 632 for (i = 0; i < sf->hdr.count;
@@ -751,182 +742,6 @@ out:
751 return(error); 742 return(error);
752} 743}
753 744
754STATIC int
755xfs_attr_shortform_compare(const void *a, const void *b)
756{
757 xfs_attr_sf_sort_t *sa, *sb;
758
759 sa = (xfs_attr_sf_sort_t *)a;
760 sb = (xfs_attr_sf_sort_t *)b;
761 if (sa->hash < sb->hash) {
762 return(-1);
763 } else if (sa->hash > sb->hash) {
764 return(1);
765 } else {
766 return(sa->entno - sb->entno);
767 }
768}
769
770
771#define XFS_ISRESET_CURSOR(cursor) \
772 (!((cursor)->initted) && !((cursor)->hashval) && \
773 !((cursor)->blkno) && !((cursor)->offset))
774/*
775 * Copy out entries of shortform attribute lists for attr_list().
776 * Shortform attribute lists are not stored in hashval sorted order.
777 * If the output buffer is not large enough to hold them all, then
778 * we have to calculate each entry's hashvalue and sort them before
779 * we can begin returning them to the user.
780 */
781/*ARGSUSED*/
782int
783xfs_attr_shortform_list(xfs_attr_list_context_t *context)
784{
785 attrlist_cursor_kern_t *cursor;
786 xfs_attr_sf_sort_t *sbuf, *sbp;
787 xfs_attr_shortform_t *sf;
788 xfs_attr_sf_entry_t *sfe;
789 xfs_inode_t *dp;
790 int sbsize, nsbuf, count, i;
791 int error;
792
793 ASSERT(context != NULL);
794 dp = context->dp;
795 ASSERT(dp != NULL);
796 ASSERT(dp->i_afp != NULL);
797 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
798 ASSERT(sf != NULL);
799 if (!sf->hdr.count)
800 return(0);
801 cursor = context->cursor;
802 ASSERT(cursor != NULL);
803
804 trace_xfs_attr_list_sf(context);
805
806 /*
807 * If the buffer is large enough and the cursor is at the start,
808 * do not bother with sorting since we will return everything in
809 * one buffer and another call using the cursor won't need to be
810 * made.
811 * Note the generous fudge factor of 16 overhead bytes per entry.
812 * If bufsize is zero then put_listent must be a search function
813 * and can just scan through what we have.
814 */
815 if (context->bufsize == 0 ||
816 (XFS_ISRESET_CURSOR(cursor) &&
817 (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
818 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
819 error = context->put_listent(context,
820 sfe->flags,
821 sfe->nameval,
822 (int)sfe->namelen,
823 (int)sfe->valuelen,
824 &sfe->nameval[sfe->namelen]);
825
826 /*
827 * Either search callback finished early or
828 * didn't fit it all in the buffer after all.
829 */
830 if (context->seen_enough)
831 break;
832
833 if (error)
834 return error;
835 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
836 }
837 trace_xfs_attr_list_sf_all(context);
838 return(0);
839 }
840
841 /* do no more for a search callback */
842 if (context->bufsize == 0)
843 return 0;
844
845 /*
846 * It didn't all fit, so we have to sort everything on hashval.
847 */
848 sbsize = sf->hdr.count * sizeof(*sbuf);
849 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
850
851 /*
852 * Scan the attribute list for the rest of the entries, storing
853 * the relevant info from only those that match into a buffer.
854 */
855 nsbuf = 0;
856 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
857 if (unlikely(
858 ((char *)sfe < (char *)sf) ||
859 ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
860 XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
861 XFS_ERRLEVEL_LOW,
862 context->dp->i_mount, sfe);
863 kmem_free(sbuf);
864 return XFS_ERROR(EFSCORRUPTED);
865 }
866
867 sbp->entno = i;
868 sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
869 sbp->name = sfe->nameval;
870 sbp->namelen = sfe->namelen;
871 /* These are bytes, and both on-disk, don't endian-flip */
872 sbp->valuelen = sfe->valuelen;
873 sbp->flags = sfe->flags;
874 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
875 sbp++;
876 nsbuf++;
877 }
878
879 /*
880 * Sort the entries on hash then entno.
881 */
882 xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
883
884 /*
885 * Re-find our place IN THE SORTED LIST.
886 */
887 count = 0;
888 cursor->initted = 1;
889 cursor->blkno = 0;
890 for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
891 if (sbp->hash == cursor->hashval) {
892 if (cursor->offset == count) {
893 break;
894 }
895 count++;
896 } else if (sbp->hash > cursor->hashval) {
897 break;
898 }
899 }
900 if (i == nsbuf) {
901 kmem_free(sbuf);
902 return(0);
903 }
904
905 /*
906 * Loop putting entries into the user buffer.
907 */
908 for ( ; i < nsbuf; i++, sbp++) {
909 if (cursor->hashval != sbp->hash) {
910 cursor->hashval = sbp->hash;
911 cursor->offset = 0;
912 }
913 error = context->put_listent(context,
914 sbp->flags,
915 sbp->name,
916 sbp->namelen,
917 sbp->valuelen,
918 &sbp->name[sbp->namelen]);
919 if (error)
920 return error;
921 if (context->seen_enough)
922 break;
923 cursor->offset++;
924 }
925
926 kmem_free(sbuf);
927 return(0);
928}
929
930/* 745/*
931 * Check a leaf attribute block to see if all the entries would fit into 746 * Check a leaf attribute block to see if all the entries would fit into
932 * a shortform attribute list. 747 * a shortform attribute list.
@@ -1121,7 +936,6 @@ out:
1121 return error; 936 return error;
1122} 937}
1123 938
1124
1125/*======================================================================== 939/*========================================================================
1126 * Routines used for growing the Btree. 940 * Routines used for growing the Btree.
1127 *========================================================================*/ 941 *========================================================================*/
@@ -1482,7 +1296,6 @@ xfs_attr3_leaf_compact(
1482 ichdr_dst->freemap[0].size = ichdr_dst->firstused - 1296 ichdr_dst->freemap[0].size = ichdr_dst->firstused -
1483 ichdr_dst->freemap[0].base; 1297 ichdr_dst->freemap[0].base;
1484 1298
1485
1486 /* write the header back to initialise the underlying buffer */ 1299 /* write the header back to initialise the underlying buffer */
1487 xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); 1300 xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
1488 1301
@@ -2643,130 +2456,6 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
2643 return size; 2456 return size;
2644} 2457}
2645 2458
2646/*
2647 * Copy out attribute list entries for attr_list(), for leaf attribute lists.
2648 */
2649int
2650xfs_attr3_leaf_list_int(
2651 struct xfs_buf *bp,
2652 struct xfs_attr_list_context *context)
2653{
2654 struct attrlist_cursor_kern *cursor;
2655 struct xfs_attr_leafblock *leaf;
2656 struct xfs_attr3_icleaf_hdr ichdr;
2657 struct xfs_attr_leaf_entry *entries;
2658 struct xfs_attr_leaf_entry *entry;
2659 int retval;
2660 int i;
2661
2662 trace_xfs_attr_list_leaf(context);
2663
2664 leaf = bp->b_addr;
2665 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
2666 entries = xfs_attr3_leaf_entryp(leaf);
2667
2668 cursor = context->cursor;
2669 cursor->initted = 1;
2670
2671 /*
2672 * Re-find our place in the leaf block if this is a new syscall.
2673 */
2674 if (context->resynch) {
2675 entry = &entries[0];
2676 for (i = 0; i < ichdr.count; entry++, i++) {
2677 if (be32_to_cpu(entry->hashval) == cursor->hashval) {
2678 if (cursor->offset == context->dupcnt) {
2679 context->dupcnt = 0;
2680 break;
2681 }
2682 context->dupcnt++;
2683 } else if (be32_to_cpu(entry->hashval) >
2684 cursor->hashval) {
2685 context->dupcnt = 0;
2686 break;
2687 }
2688 }
2689 if (i == ichdr.count) {
2690 trace_xfs_attr_list_notfound(context);
2691 return 0;
2692 }
2693 } else {
2694 entry = &entries[0];
2695 i = 0;
2696 }
2697 context->resynch = 0;
2698
2699 /*
2700 * We have found our place, start copying out the new attributes.
2701 */
2702 retval = 0;
2703 for (; i < ichdr.count; entry++, i++) {
2704 if (be32_to_cpu(entry->hashval) != cursor->hashval) {
2705 cursor->hashval = be32_to_cpu(entry->hashval);
2706 cursor->offset = 0;
2707 }
2708
2709 if (entry->flags & XFS_ATTR_INCOMPLETE)
2710 continue; /* skip incomplete entries */
2711
2712 if (entry->flags & XFS_ATTR_LOCAL) {
2713 xfs_attr_leaf_name_local_t *name_loc =
2714 xfs_attr3_leaf_name_local(leaf, i);
2715
2716 retval = context->put_listent(context,
2717 entry->flags,
2718 name_loc->nameval,
2719 (int)name_loc->namelen,
2720 be16_to_cpu(name_loc->valuelen),
2721 &name_loc->nameval[name_loc->namelen]);
2722 if (retval)
2723 return retval;
2724 } else {
2725 xfs_attr_leaf_name_remote_t *name_rmt =
2726 xfs_attr3_leaf_name_remote(leaf, i);
2727
2728 int valuelen = be32_to_cpu(name_rmt->valuelen);
2729
2730 if (context->put_value) {
2731 xfs_da_args_t args;
2732
2733 memset((char *)&args, 0, sizeof(args));
2734 args.dp = context->dp;
2735 args.whichfork = XFS_ATTR_FORK;
2736 args.valuelen = valuelen;
2737 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2738 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2739 args.rmtblkcnt = xfs_attr3_rmt_blocks(
2740 args.dp->i_mount, valuelen);
2741 retval = xfs_attr_rmtval_get(&args);
2742 if (retval)
2743 return retval;
2744 retval = context->put_listent(context,
2745 entry->flags,
2746 name_rmt->name,
2747 (int)name_rmt->namelen,
2748 valuelen,
2749 args.value);
2750 kmem_free(args.value);
2751 } else {
2752 retval = context->put_listent(context,
2753 entry->flags,
2754 name_rmt->name,
2755 (int)name_rmt->namelen,
2756 valuelen,
2757 NULL);
2758 }
2759 if (retval)
2760 return retval;
2761 }
2762 if (context->seen_enough)
2763 break;
2764 cursor->offset++;
2765 }
2766 trace_xfs_attr_list_leaf_end(context);
2767 return retval;
2768}
2769
2770 2459
2771/*======================================================================== 2460/*========================================================================
2772 * Manage the INCOMPLETE flag in a leaf entry 2461 * Manage the INCOMPLETE flag in a leaf entry
@@ -3011,345 +2700,3 @@ xfs_attr3_leaf_flipflags(
3011 2700
3012 return error; 2701 return error;
3013} 2702}
3014
3015/*========================================================================
3016 * Indiscriminately delete the entire attribute fork
3017 *========================================================================*/
3018
3019/*
3020 * Recurse (gasp!) through the attribute nodes until we find leaves.
3021 * We're doing a depth-first traversal in order to invalidate everything.
3022 */
3023int
3024xfs_attr3_root_inactive(
3025 struct xfs_trans **trans,
3026 struct xfs_inode *dp)
3027{
3028 struct xfs_da_blkinfo *info;
3029 struct xfs_buf *bp;
3030 xfs_daddr_t blkno;
3031 int error;
3032
3033 /*
3034 * Read block 0 to see what we have to work with.
3035 * We only get here if we have extents, since we remove
3036 * the extents in reverse order the extent containing
3037 * block 0 must still be there.
3038 */
3039 error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
3040 if (error)
3041 return error;
3042 blkno = bp->b_bn;
3043
3044 /*
3045 * Invalidate the tree, even if the "tree" is only a single leaf block.
3046 * This is a depth-first traversal!
3047 */
3048 info = bp->b_addr;
3049 switch (info->magic) {
3050 case cpu_to_be16(XFS_DA_NODE_MAGIC):
3051 case cpu_to_be16(XFS_DA3_NODE_MAGIC):
3052 error = xfs_attr3_node_inactive(trans, dp, bp, 1);
3053 break;
3054 case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
3055 case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
3056 error = xfs_attr3_leaf_inactive(trans, dp, bp);
3057 break;
3058 default:
3059 error = XFS_ERROR(EIO);
3060 xfs_trans_brelse(*trans, bp);
3061 break;
3062 }
3063 if (error)
3064 return error;
3065
3066 /*
3067 * Invalidate the incore copy of the root block.
3068 */
3069 error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
3070 if (error)
3071 return error;
3072 xfs_trans_binval(*trans, bp); /* remove from cache */
3073 /*
3074 * Commit the invalidate and start the next transaction.
3075 */
3076 error = xfs_trans_roll(trans, dp);
3077
3078 return error;
3079}
3080
3081/*
3082 * Recurse (gasp!) through the attribute nodes until we find leaves.
3083 * We're doing a depth-first traversal in order to invalidate everything.
3084 */
3085STATIC int
3086xfs_attr3_node_inactive(
3087 struct xfs_trans **trans,
3088 struct xfs_inode *dp,
3089 struct xfs_buf *bp,
3090 int level)
3091{
3092 xfs_da_blkinfo_t *info;
3093 xfs_da_intnode_t *node;
3094 xfs_dablk_t child_fsb;
3095 xfs_daddr_t parent_blkno, child_blkno;
3096 int error, i;
3097 struct xfs_buf *child_bp;
3098 struct xfs_da_node_entry *btree;
3099 struct xfs_da3_icnode_hdr ichdr;
3100
3101 /*
3102 * Since this code is recursive (gasp!) we must protect ourselves.
3103 */
3104 if (level > XFS_DA_NODE_MAXDEPTH) {
3105 xfs_trans_brelse(*trans, bp); /* no locks for later trans */
3106 return XFS_ERROR(EIO);
3107 }
3108
3109 node = bp->b_addr;
3110 xfs_da3_node_hdr_from_disk(&ichdr, node);
3111 parent_blkno = bp->b_bn;
3112 if (!ichdr.count) {
3113 xfs_trans_brelse(*trans, bp);
3114 return 0;
3115 }
3116 btree = xfs_da3_node_tree_p(node);
3117 child_fsb = be32_to_cpu(btree[0].before);
3118 xfs_trans_brelse(*trans, bp); /* no locks for later trans */
3119
3120 /*
3121 * If this is the node level just above the leaves, simply loop
3122 * over the leaves removing all of them. If this is higher up
3123 * in the tree, recurse downward.
3124 */
3125 for (i = 0; i < ichdr.count; i++) {
3126 /*
3127 * Read the subsidiary block to see what we have to work with.
3128 * Don't do this in a transaction. This is a depth-first
3129 * traversal of the tree so we may deal with many blocks
3130 * before we come back to this one.
3131 */
3132 error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
3133 XFS_ATTR_FORK);
3134 if (error)
3135 return(error);
3136 if (child_bp) {
3137 /* save for re-read later */
3138 child_blkno = XFS_BUF_ADDR(child_bp);
3139
3140 /*
3141 * Invalidate the subtree, by whatever means its type requires.
3142 */
3143 info = child_bp->b_addr;
3144 switch (info->magic) {
3145 case cpu_to_be16(XFS_DA_NODE_MAGIC):
3146 case cpu_to_be16(XFS_DA3_NODE_MAGIC):
3147 error = xfs_attr3_node_inactive(trans, dp,
3148 child_bp, level + 1);
3149 break;
3150 case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
3151 case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
3152 error = xfs_attr3_leaf_inactive(trans, dp,
3153 child_bp);
3154 break;
3155 default:
3156 error = XFS_ERROR(EIO);
3157 xfs_trans_brelse(*trans, child_bp);
3158 break;
3159 }
3160 if (error)
3161 return error;
3162
3163 /*
3164 * Remove the subsidiary block from the cache
3165 * and from the log.
3166 */
3167 error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
3168 &child_bp, XFS_ATTR_FORK);
3169 if (error)
3170 return error;
3171 xfs_trans_binval(*trans, child_bp);
3172 }
3173
3174 /*
3175 * If we're not done, re-read the parent to get the next
3176 * child block number.
3177 */
3178 if (i + 1 < ichdr.count) {
3179 error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
3180 &bp, XFS_ATTR_FORK);
3181 if (error)
3182 return error;
3183 child_fsb = be32_to_cpu(btree[i + 1].before);
3184 xfs_trans_brelse(*trans, bp);
3185 }
3186 /*
3187 * Atomically commit the whole invalidation.
3188 */
3189 error = xfs_trans_roll(trans, dp);
3190 if (error)
3191 return error;
3192 }
3193
3194 return 0;
3195}
3196
3197/*
3198 * Invalidate all of the "remote" value regions pointed to by a particular
3199 * leaf block.
3200 * Note that we must release the lock on the buffer so that we are not
3201 * caught holding something that the logging code wants to flush to disk.
3202 */
3203STATIC int
3204xfs_attr3_leaf_inactive(
3205 struct xfs_trans **trans,
3206 struct xfs_inode *dp,
3207 struct xfs_buf *bp)
3208{
3209 struct xfs_attr_leafblock *leaf;
3210 struct xfs_attr3_icleaf_hdr ichdr;
3211 struct xfs_attr_leaf_entry *entry;
3212 struct xfs_attr_leaf_name_remote *name_rmt;
3213 struct xfs_attr_inactive_list *list;
3214 struct xfs_attr_inactive_list *lp;
3215 int error;
3216 int count;
3217 int size;
3218 int tmp;
3219 int i;
3220
3221 leaf = bp->b_addr;
3222 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
3223
3224 /*
3225 * Count the number of "remote" value extents.
3226 */
3227 count = 0;
3228 entry = xfs_attr3_leaf_entryp(leaf);
3229 for (i = 0; i < ichdr.count; entry++, i++) {
3230 if (be16_to_cpu(entry->nameidx) &&
3231 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
3232 name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
3233 if (name_rmt->valueblk)
3234 count++;
3235 }
3236 }
3237
3238 /*
3239 * If there are no "remote" values, we're done.
3240 */
3241 if (count == 0) {
3242 xfs_trans_brelse(*trans, bp);
3243 return 0;
3244 }
3245
3246 /*
3247 * Allocate storage for a list of all the "remote" value extents.
3248 */
3249 size = count * sizeof(xfs_attr_inactive_list_t);
3250 list = kmem_alloc(size, KM_SLEEP);
3251
3252 /*
3253 * Identify each of the "remote" value extents.
3254 */
3255 lp = list;
3256 entry = xfs_attr3_leaf_entryp(leaf);
3257 for (i = 0; i < ichdr.count; entry++, i++) {
3258 if (be16_to_cpu(entry->nameidx) &&
3259 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
3260 name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
3261 if (name_rmt->valueblk) {
3262 lp->valueblk = be32_to_cpu(name_rmt->valueblk);
3263 lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
3264 be32_to_cpu(name_rmt->valuelen));
3265 lp++;
3266 }
3267 }
3268 }
3269 xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
3270
3271 /*
3272 * Invalidate each of the "remote" value extents.
3273 */
3274 error = 0;
3275 for (lp = list, i = 0; i < count; i++, lp++) {
3276 tmp = xfs_attr3_leaf_freextent(trans, dp,
3277 lp->valueblk, lp->valuelen);
3278
3279 if (error == 0)
3280 error = tmp; /* save only the 1st errno */
3281 }
3282
3283 kmem_free(list);
3284 return error;
3285}
3286
3287/*
3288 * Look at all the extents for this logical region,
3289 * invalidate any buffers that are incore/in transactions.
3290 */
3291STATIC int
3292xfs_attr3_leaf_freextent(
3293 struct xfs_trans **trans,
3294 struct xfs_inode *dp,
3295 xfs_dablk_t blkno,
3296 int blkcnt)
3297{
3298 struct xfs_bmbt_irec map;
3299 struct xfs_buf *bp;
3300 xfs_dablk_t tblkno;
3301 xfs_daddr_t dblkno;
3302 int tblkcnt;
3303 int dblkcnt;
3304 int nmap;
3305 int error;
3306
3307 /*
3308 * Roll through the "value", invalidating the attribute value's
3309 * blocks.
3310 */
3311 tblkno = blkno;
3312 tblkcnt = blkcnt;
3313 while (tblkcnt > 0) {
3314 /*
3315 * Try to remember where we decided to put the value.
3316 */
3317 nmap = 1;
3318 error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
3319 &map, &nmap, XFS_BMAPI_ATTRFORK);
3320 if (error) {
3321 return(error);
3322 }
3323 ASSERT(nmap == 1);
3324 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
3325
3326 /*
3327 * If it's a hole, these are already unmapped
3328 * so there's nothing to invalidate.
3329 */
3330 if (map.br_startblock != HOLESTARTBLOCK) {
3331
3332 dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
3333 map.br_startblock);
3334 dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
3335 map.br_blockcount);
3336 bp = xfs_trans_get_buf(*trans,
3337 dp->i_mount->m_ddev_targp,
3338 dblkno, dblkcnt, 0);
3339 if (!bp)
3340 return ENOMEM;
3341 xfs_trans_binval(*trans, bp);
3342 /*
3343 * Roll to next transaction.
3344 */
3345 error = xfs_trans_roll(trans, dp);
3346 if (error)
3347 return (error);
3348 }
3349
3350 tblkno += map.br_blockcount;
3351 tblkcnt -= map.br_blockcount;
3352 }
3353
3354 return(0);
3355}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 444a7704596c..c1022138c7e6 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -333,6 +333,8 @@ int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
333 struct xfs_buf **bpp); 333 struct xfs_buf **bpp);
334void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, 334void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
335 struct xfs_attr_leafblock *from); 335 struct xfs_attr_leafblock *from);
336void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
337 struct xfs_attr3_icleaf_hdr *from);
336 338
337extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; 339extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
338 340
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
new file mode 100644
index 000000000000..cbc80d485177
--- /dev/null
+++ b/fs/xfs/xfs_attr_list.c
@@ -0,0 +1,655 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_types.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_alloc.h"
33#include "xfs_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_attr_remote.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
40#include "xfs_attr.h"
41#include "xfs_attr_leaf.h"
42#include "xfs_error.h"
43#include "xfs_trace.h"
44#include "xfs_buf_item.h"
45#include "xfs_cksum.h"
46
47STATIC int
48xfs_attr_shortform_compare(const void *a, const void *b)
49{
50 xfs_attr_sf_sort_t *sa, *sb;
51
52 sa = (xfs_attr_sf_sort_t *)a;
53 sb = (xfs_attr_sf_sort_t *)b;
54 if (sa->hash < sb->hash) {
55 return(-1);
56 } else if (sa->hash > sb->hash) {
57 return(1);
58 } else {
59 return(sa->entno - sb->entno);
60 }
61}
62
63#define XFS_ISRESET_CURSOR(cursor) \
64 (!((cursor)->initted) && !((cursor)->hashval) && \
65 !((cursor)->blkno) && !((cursor)->offset))
66/*
67 * Copy out entries of shortform attribute lists for attr_list().
68 * Shortform attribute lists are not stored in hashval sorted order.
69 * If the output buffer is not large enough to hold them all, then
70 * we have to calculate each entry's hashvalue and sort them before
71 * we can begin returning them to the user.
72 */
73int
74xfs_attr_shortform_list(xfs_attr_list_context_t *context)
75{
76 attrlist_cursor_kern_t *cursor;
77 xfs_attr_sf_sort_t *sbuf, *sbp;
78 xfs_attr_shortform_t *sf;
79 xfs_attr_sf_entry_t *sfe;
80 xfs_inode_t *dp;
81 int sbsize, nsbuf, count, i;
82 int error;
83
84 ASSERT(context != NULL);
85 dp = context->dp;
86 ASSERT(dp != NULL);
87 ASSERT(dp->i_afp != NULL);
88 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
89 ASSERT(sf != NULL);
90 if (!sf->hdr.count)
91 return(0);
92 cursor = context->cursor;
93 ASSERT(cursor != NULL);
94
95 trace_xfs_attr_list_sf(context);
96
97 /*
98 * If the buffer is large enough and the cursor is at the start,
99 * do not bother with sorting since we will return everything in
100 * one buffer and another call using the cursor won't need to be
101 * made.
102 * Note the generous fudge factor of 16 overhead bytes per entry.
103 * If bufsize is zero then put_listent must be a search function
104 * and can just scan through what we have.
105 */
106 if (context->bufsize == 0 ||
107 (XFS_ISRESET_CURSOR(cursor) &&
108 (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
109 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
110 error = context->put_listent(context,
111 sfe->flags,
112 sfe->nameval,
113 (int)sfe->namelen,
114 (int)sfe->valuelen,
115 &sfe->nameval[sfe->namelen]);
116
117 /*
118 * Either search callback finished early or
119 * didn't fit it all in the buffer after all.
120 */
121 if (context->seen_enough)
122 break;
123
124 if (error)
125 return error;
126 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
127 }
128 trace_xfs_attr_list_sf_all(context);
129 return(0);
130 }
131
132 /* do no more for a search callback */
133 if (context->bufsize == 0)
134 return 0;
135
136 /*
137 * It didn't all fit, so we have to sort everything on hashval.
138 */
139 sbsize = sf->hdr.count * sizeof(*sbuf);
140 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
141
142 /*
143 * Scan the attribute list for the rest of the entries, storing
144 * the relevant info from only those that match into a buffer.
145 */
146 nsbuf = 0;
147 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
148 if (unlikely(
149 ((char *)sfe < (char *)sf) ||
150 ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
151 XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
152 XFS_ERRLEVEL_LOW,
153 context->dp->i_mount, sfe);
154 kmem_free(sbuf);
155 return XFS_ERROR(EFSCORRUPTED);
156 }
157
158 sbp->entno = i;
159 sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
160 sbp->name = sfe->nameval;
161 sbp->namelen = sfe->namelen;
162 /* These are bytes, and both on-disk, don't endian-flip */
163 sbp->valuelen = sfe->valuelen;
164 sbp->flags = sfe->flags;
165 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
166 sbp++;
167 nsbuf++;
168 }
169
170 /*
171 * Sort the entries on hash then entno.
172 */
173 xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
174
175 /*
176 * Re-find our place IN THE SORTED LIST.
177 */
178 count = 0;
179 cursor->initted = 1;
180 cursor->blkno = 0;
181 for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
182 if (sbp->hash == cursor->hashval) {
183 if (cursor->offset == count) {
184 break;
185 }
186 count++;
187 } else if (sbp->hash > cursor->hashval) {
188 break;
189 }
190 }
191 if (i == nsbuf) {
192 kmem_free(sbuf);
193 return(0);
194 }
195
196 /*
197 * Loop putting entries into the user buffer.
198 */
199 for ( ; i < nsbuf; i++, sbp++) {
200 if (cursor->hashval != sbp->hash) {
201 cursor->hashval = sbp->hash;
202 cursor->offset = 0;
203 }
204 error = context->put_listent(context,
205 sbp->flags,
206 sbp->name,
207 sbp->namelen,
208 sbp->valuelen,
209 &sbp->name[sbp->namelen]);
210 if (error)
211 return error;
212 if (context->seen_enough)
213 break;
214 cursor->offset++;
215 }
216
217 kmem_free(sbuf);
218 return(0);
219}
220
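The (hash, entno) ordering implemented by xfs_attr_shortform_compare() above is what makes the shortform cursor resumable: entries sharing a hashval keep a stable relative order, so cursor->offset can count duplicates deterministically across calls. A standalone toy of the same comparator contract using libc qsort(); the struct is a made-up stand-in for xfs_attr_sf_sort_t:

#include <stdio.h>
#include <stdlib.h>

struct sf_sort {			/* toy stand-in for xfs_attr_sf_sort_t */
	unsigned int	hash;
	int		entno;
};

/* Order by hash, then by original entry number, as above. */
static int sf_compare(const void *a, const void *b)
{
	const struct sf_sort *sa = a, *sb = b;

	if (sa->hash < sb->hash)
		return -1;
	if (sa->hash > sb->hash)
		return 1;
	return sa->entno - sb->entno;
}

int main(void)
{
	struct sf_sort s[] = { { 7, 2 }, { 3, 1 }, { 7, 0 }, { 3, 4 } };
	int i;

	qsort(s, 4, sizeof(s[0]), sf_compare);
	for (i = 0; i < 4; i++)		/* prints (3,1) (3,4) (7,0) (7,2) */
		printf("hash=%u entno=%d\n", s[i].hash, s[i].entno);
	return 0;
}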
221STATIC int
222xfs_attr_node_list(xfs_attr_list_context_t *context)
223{
224 attrlist_cursor_kern_t *cursor;
225 xfs_attr_leafblock_t *leaf;
226 xfs_da_intnode_t *node;
227 struct xfs_attr3_icleaf_hdr leafhdr;
228 struct xfs_da3_icnode_hdr nodehdr;
229 struct xfs_da_node_entry *btree;
230 int error, i;
231 struct xfs_buf *bp;
232
233 trace_xfs_attr_node_list(context);
234
235 cursor = context->cursor;
236 cursor->initted = 1;
237
238 /*
239 * Do all sorts of validation on the passed-in cursor structure.
240 * If anything is amiss, ignore the cursor and look up the hashval
241 * starting from the btree root.
242 */
243 bp = NULL;
244 if (cursor->blkno > 0) {
245 error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
246 &bp, XFS_ATTR_FORK);
247 if ((error != 0) && (error != EFSCORRUPTED))
248 return(error);
249 if (bp) {
250 struct xfs_attr_leaf_entry *entries;
251
252 node = bp->b_addr;
253 switch (be16_to_cpu(node->hdr.info.magic)) {
254 case XFS_DA_NODE_MAGIC:
255 case XFS_DA3_NODE_MAGIC:
256 trace_xfs_attr_list_wrong_blk(context);
257 xfs_trans_brelse(NULL, bp);
258 bp = NULL;
259 break;
260 case XFS_ATTR_LEAF_MAGIC:
261 case XFS_ATTR3_LEAF_MAGIC:
262 leaf = bp->b_addr;
263 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
264 entries = xfs_attr3_leaf_entryp(leaf);
265 if (cursor->hashval > be32_to_cpu(
266 entries[leafhdr.count - 1].hashval)) {
267 trace_xfs_attr_list_wrong_blk(context);
268 xfs_trans_brelse(NULL, bp);
269 bp = NULL;
270 } else if (cursor->hashval <= be32_to_cpu(
271 entries[0].hashval)) {
272 trace_xfs_attr_list_wrong_blk(context);
273 xfs_trans_brelse(NULL, bp);
274 bp = NULL;
275 }
276 break;
277 default:
278 trace_xfs_attr_list_wrong_blk(context);
279 xfs_trans_brelse(NULL, bp);
280 bp = NULL;
281 }
282 }
283 }
284
285 /*
286 * We did not find what we expected given the cursor's contents,
287 * so we start from the top and work down based on the hash value.
288 * Note that start of node block is same as start of leaf block.
289 */
290 if (bp == NULL) {
291 cursor->blkno = 0;
292 for (;;) {
293 __uint16_t magic;
294
295 error = xfs_da3_node_read(NULL, context->dp,
296 cursor->blkno, -1, &bp,
297 XFS_ATTR_FORK);
298 if (error)
299 return(error);
300 node = bp->b_addr;
301 magic = be16_to_cpu(node->hdr.info.magic);
302 if (magic == XFS_ATTR_LEAF_MAGIC ||
303 magic == XFS_ATTR3_LEAF_MAGIC)
304 break;
305 if (magic != XFS_DA_NODE_MAGIC &&
306 magic != XFS_DA3_NODE_MAGIC) {
307 XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
308 XFS_ERRLEVEL_LOW,
309 context->dp->i_mount,
310 node);
311 xfs_trans_brelse(NULL, bp);
312 return XFS_ERROR(EFSCORRUPTED);
313 }
314
315 xfs_da3_node_hdr_from_disk(&nodehdr, node);
316 btree = xfs_da3_node_tree_p(node);
317 for (i = 0; i < nodehdr.count; btree++, i++) {
318 if (cursor->hashval
319 <= be32_to_cpu(btree->hashval)) {
320 cursor->blkno = be32_to_cpu(btree->before);
321 trace_xfs_attr_list_node_descend(context,
322 btree);
323 break;
324 }
325 }
326 if (i == nodehdr.count) {
327 xfs_trans_brelse(NULL, bp);
328 return 0;
329 }
330 xfs_trans_brelse(NULL, bp);
331 }
332 }
333 ASSERT(bp != NULL);
334
335 /*
336 * Roll upward through the blocks, processing each leaf block in
337 * order. As long as there is space in the result buffer, keep
338 * adding the information.
339 */
340 for (;;) {
341 leaf = bp->b_addr;
342 error = xfs_attr3_leaf_list_int(bp, context);
343 if (error) {
344 xfs_trans_brelse(NULL, bp);
345 return error;
346 }
347 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
348 if (context->seen_enough || leafhdr.forw == 0)
349 break;
350 cursor->blkno = leafhdr.forw;
351 xfs_trans_brelse(NULL, bp);
352 error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
353 &bp);
354 if (error)
355 return error;
356 }
357 xfs_trans_brelse(NULL, bp);
358 return 0;
359}
360
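Once the node-level descent picks a starting leaf, the final loop above is a plain forward walk of the sibling chain: each leaf names its successor in leafhdr.forw, and zero terminates. A toy model of that walk, with made-up structures:

#include <stdio.h>

/* Toy leaf chain: forw names the next sibling's "block number", 0 ends. */
struct toy_leaf {
	int		forw;
	const char	*names[4];
};

static struct toy_leaf leaves[] = {
	{ 0, { NULL } },	/* slot 0 unused so forw == 0 can terminate */
	{ 2, { "user.a", "user.b", NULL } },
	{ 3, { "user.c", NULL } },
	{ 0, { "user.d", NULL } },
};

int main(void)
{
	int blkno = 1;		/* leaf chosen by the node-level descent */

	while (blkno != 0) {
		const char **n;

		for (n = leaves[blkno].names; *n; n++)
			printf("%s\n", *n);
		blkno = leaves[blkno].forw;
	}
	return 0;
}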
361/*
362 * Copy out attribute list entries for attr_list(), for leaf attribute lists.
363 */
364int
365xfs_attr3_leaf_list_int(
366 struct xfs_buf *bp,
367 struct xfs_attr_list_context *context)
368{
369 struct attrlist_cursor_kern *cursor;
370 struct xfs_attr_leafblock *leaf;
371 struct xfs_attr3_icleaf_hdr ichdr;
372 struct xfs_attr_leaf_entry *entries;
373 struct xfs_attr_leaf_entry *entry;
374 int retval;
375 int i;
376
377 trace_xfs_attr_list_leaf(context);
378
379 leaf = bp->b_addr;
380 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
381 entries = xfs_attr3_leaf_entryp(leaf);
382
383 cursor = context->cursor;
384 cursor->initted = 1;
385
386 /*
387 * Re-find our place in the leaf block if this is a new syscall.
388 */
389 if (context->resynch) {
390 entry = &entries[0];
391 for (i = 0; i < ichdr.count; entry++, i++) {
392 if (be32_to_cpu(entry->hashval) == cursor->hashval) {
393 if (cursor->offset == context->dupcnt) {
394 context->dupcnt = 0;
395 break;
396 }
397 context->dupcnt++;
398 } else if (be32_to_cpu(entry->hashval) >
399 cursor->hashval) {
400 context->dupcnt = 0;
401 break;
402 }
403 }
404 if (i == ichdr.count) {
405 trace_xfs_attr_list_notfound(context);
406 return 0;
407 }
408 } else {
409 entry = &entries[0];
410 i = 0;
411 }
412 context->resynch = 0;
413
414 /*
415 * We have found our place, start copying out the new attributes.
416 */
417 retval = 0;
418 for (; i < ichdr.count; entry++, i++) {
419 if (be32_to_cpu(entry->hashval) != cursor->hashval) {
420 cursor->hashval = be32_to_cpu(entry->hashval);
421 cursor->offset = 0;
422 }
423
424 if (entry->flags & XFS_ATTR_INCOMPLETE)
425 continue; /* skip incomplete entries */
426
427 if (entry->flags & XFS_ATTR_LOCAL) {
428 xfs_attr_leaf_name_local_t *name_loc =
429 xfs_attr3_leaf_name_local(leaf, i);
430
431 retval = context->put_listent(context,
432 entry->flags,
433 name_loc->nameval,
434 (int)name_loc->namelen,
435 be16_to_cpu(name_loc->valuelen),
436 &name_loc->nameval[name_loc->namelen]);
437 if (retval)
438 return retval;
439 } else {
440 xfs_attr_leaf_name_remote_t *name_rmt =
441 xfs_attr3_leaf_name_remote(leaf, i);
442
443 int valuelen = be32_to_cpu(name_rmt->valuelen);
444
445 if (context->put_value) {
446 xfs_da_args_t args;
447
448 memset((char *)&args, 0, sizeof(args));
449 args.dp = context->dp;
450 args.whichfork = XFS_ATTR_FORK;
451 args.valuelen = valuelen;
452 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
453 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
454 args.rmtblkcnt = xfs_attr3_rmt_blocks(
455 args.dp->i_mount, valuelen);
456 retval = xfs_attr_rmtval_get(&args);
457 if (retval)
458 return retval;
459 retval = context->put_listent(context,
460 entry->flags,
461 name_rmt->name,
462 (int)name_rmt->namelen,
463 valuelen,
464 args.value);
465 kmem_free(args.value);
466 } else {
467 retval = context->put_listent(context,
468 entry->flags,
469 name_rmt->name,
470 (int)name_rmt->namelen,
471 valuelen,
472 NULL);
473 }
474 if (retval)
475 return retval;
476 }
477 if (context->seen_enough)
478 break;
479 cursor->offset++;
480 }
481 trace_xfs_attr_list_leaf_end(context);
482 return retval;
483}
484
485/*
486 * Copy out attribute entries for attr_list(), for leaf attribute lists.
487 */
488STATIC int
489xfs_attr_leaf_list(xfs_attr_list_context_t *context)
490{
491 int error;
492 struct xfs_buf *bp;
493
494 trace_xfs_attr_leaf_list(context);
495
496 context->cursor->blkno = 0;
497 error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
498 if (error)
499 return XFS_ERROR(error);
500
501 error = xfs_attr3_leaf_list_int(bp, context);
502 xfs_trans_brelse(NULL, bp);
503 return XFS_ERROR(error);
504}
505
506int
507xfs_attr_list_int(
508 xfs_attr_list_context_t *context)
509{
510 int error;
511 xfs_inode_t *dp = context->dp;
512
513 XFS_STATS_INC(xs_attr_list);
514
515 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
516 return EIO;
517
518 xfs_ilock(dp, XFS_ILOCK_SHARED);
519
520 /*
521 * Decide on what work routines to call based on the inode size.
522 */
523 if (!xfs_inode_hasattr(dp)) {
524 error = 0;
525 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
526 error = xfs_attr_shortform_list(context);
527 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
528 error = xfs_attr_leaf_list(context);
529 } else {
530 error = xfs_attr_node_list(context);
531 }
532
533 xfs_iunlock(dp, XFS_ILOCK_SHARED);
534
535 return error;
536}
537
538#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
539 (((struct attrlist_ent *) 0)->a_name - (char *) 0)
540#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
541 ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
542 & ~(sizeof(u_int32_t)-1))
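As a worked example of the rounding above: assuming the attrlist_ent layout in which a_name directly follows a single 32-bit a_valuelen field, ATTR_ENTBASESIZE is 4, so a 5-byte name costs

	ATTR_ENTSIZE(5) = (4 + 5 + 1 + sizeof(u_int32_t) - 1)
				& ~(sizeof(u_int32_t) - 1)
			= (4 + 5 + 1 + 3) & ~3
			= 13 & ~3
			= 12 bytes

i.e. the value-length word, the name and its NUL terminator, rounded up to the next u_int32_t boundary.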
543
544/*
545 * Format an attribute and copy it out to the user's buffer.
546 * Take care to check values and protect against them changing later;
547 * we may be reading them directly out of a user buffer.
548 */
549STATIC int
550xfs_attr_put_listent(
551 xfs_attr_list_context_t *context,
552 int flags,
553 unsigned char *name,
554 int namelen,
555 int valuelen,
556 unsigned char *value)
557{
558 struct attrlist *alist = (struct attrlist *)context->alist;
559 attrlist_ent_t *aep;
560 int arraytop;
561
562 ASSERT(!(context->flags & ATTR_KERNOVAL));
563 ASSERT(context->count >= 0);
564 ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
565 ASSERT(context->firstu >= sizeof(*alist));
566 ASSERT(context->firstu <= context->bufsize);
567
568 /*
569 * Only list entries in the right namespace.
570 */
571 if (((context->flags & ATTR_SECURE) == 0) !=
572 ((flags & XFS_ATTR_SECURE) == 0))
573 return 0;
574 if (((context->flags & ATTR_ROOT) == 0) !=
575 ((flags & XFS_ATTR_ROOT) == 0))
576 return 0;
577
578 arraytop = sizeof(*alist) +
579 context->count * sizeof(alist->al_offset[0]);
580 context->firstu -= ATTR_ENTSIZE(namelen);
581 if (context->firstu < arraytop) {
582 trace_xfs_attr_list_full(context);
583 alist->al_more = 1;
584 context->seen_enough = 1;
585 return 1;
586 }
587
588 aep = (attrlist_ent_t *)&context->alist[context->firstu];
589 aep->a_valuelen = valuelen;
590 memcpy(aep->a_name, name, namelen);
591 aep->a_name[namelen] = 0;
592 alist->al_offset[context->count++] = context->firstu;
593 alist->al_count = context->count;
594 trace_xfs_attr_list_add(context);
595 return 0;
596}
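xfs_attr_put_listent() fills the output buffer from both ends: the al_offset[] array grows up from the header while entries grow down from context->firstu, and the list is full when the two regions would meet. A self-contained sketch of that layout, with hypothetical names rather than the kernel structures:

#include <stdint.h>
#include <string.h>

struct hdr {
	uint32_t count;
	uint32_t offset[];	/* grows up from the front of the buffer */
};

/*
 * Append one payload to a two-ended buffer. *firstu starts at the
 * buffer size and walks backwards; returns 1 when the offset array
 * and the entries would collide, i.e. the buffer is full.
 */
static int put_entry(char *buf, uint32_t *firstu,
		     const void *payload, uint32_t len)
{
	struct hdr *h = (struct hdr *)buf;
	uint32_t arraytop = sizeof(*h) +
			    (h->count + 1) * sizeof(h->offset[0]);

	if (*firstu < arraytop + len)
		return 1;			/* buffer full */
	*firstu -= len;
	memcpy(buf + *firstu, payload, len);	/* entry from the back */
	h->offset[h->count++] = *firstu;	/* index from the front */
	return 0;
}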
597
598/*
599 * Generate a list of extended attribute names and optionally
600 * also value lengths. A positive return value follows the XFS
601 * convention of being an error; a zero or negative return code
602 * is the (negated) length of the buffer returned, indicating
603 * success.
604 */
605int
606xfs_attr_list(
607 xfs_inode_t *dp,
608 char *buffer,
609 int bufsize,
610 int flags,
611 attrlist_cursor_kern_t *cursor)
612{
613 xfs_attr_list_context_t context;
614 struct attrlist *alist;
615 int error;
616
617 /*
618 * Validate the cursor.
619 */
620 if (cursor->pad1 || cursor->pad2)
621 return XFS_ERROR(EINVAL);
622 if ((cursor->initted == 0) &&
623 (cursor->hashval || cursor->blkno || cursor->offset))
624 return XFS_ERROR(EINVAL);
625
626 /*
627 * Check for a properly aligned buffer.
628 */
629 if (((long)buffer) & (sizeof(int)-1))
630 return XFS_ERROR(EFAULT);
631 if (flags & ATTR_KERNOVAL)
632 bufsize = 0;
633
634 /*
635 * Initialize the output buffer.
636 */
637 memset(&context, 0, sizeof(context));
638 context.dp = dp;
639 context.cursor = cursor;
640 context.resynch = 1;
641 context.flags = flags;
642 context.alist = buffer;
643 context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
644 context.firstu = context.bufsize;
645 context.put_listent = xfs_attr_put_listent;
646
647 alist = (struct attrlist *)context.alist;
648 alist->al_count = 0;
649 alist->al_more = 0;
650 alist->al_offset[0] = context.bufsize;
651
652 error = xfs_attr_list_int(&context);
653 ASSERT(error >= 0);
654 return error;
655}
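For reference, the caller-side loop this cursor protocol serves looks roughly like the following, assuming libattr's attr_list(3) interface from <attr/attributes.h> (link with -lattr); the zeroed cursor takes the resynch path above, and al_more means call again:

#include <stdio.h>
#include <string.h>
#include <attr/attributes.h>

static void list_all(const char *path)
{
	char buf[65536];
	attrlist_cursor_t cursor;
	attrlist_t *list = (attrlist_t *)buf;
	int i;

	memset(&cursor, 0, sizeof(cursor));	/* fresh cursor */
	do {
		if (attr_list(path, buf, sizeof(buf), 0, &cursor) < 0)
			return;
		for (i = 0; i < list->al_count; i++) {
			attrlist_ent_t *ent = ATTR_ENTRY(buf, i);
			printf("%s (%u bytes)\n",
			       ent->a_name, ent->a_valuelen);
		}
	} while (list->al_more);	/* kernel says there is more */
}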
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index ef6b0c124528..712a502de619 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -22,6 +22,7 @@
22#include "xfs_bit.h" 22#include "xfs_bit.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -33,6 +34,7 @@
33#include "xfs_alloc.h" 34#include "xfs_alloc.h"
34#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
35#include "xfs_bmap.h" 36#include "xfs_bmap.h"
37#include "xfs_bmap_util.h"
36#include "xfs_attr.h" 38#include "xfs_attr.h"
37#include "xfs_attr_leaf.h" 39#include "xfs_attr_leaf.h"
38#include "xfs_attr_remote.h" 40#include "xfs_attr_remote.h"
@@ -237,7 +239,7 @@ xfs_attr_rmtval_copyout(
237 xfs_ino_t ino, 239 xfs_ino_t ino,
238 int *offset, 240 int *offset,
239 int *valuelen, 241 int *valuelen,
240 char **dst) 242 __uint8_t **dst)
241{ 243{
242 char *src = bp->b_addr; 244 char *src = bp->b_addr;
243 xfs_daddr_t bno = bp->b_bn; 245 xfs_daddr_t bno = bp->b_bn;
@@ -249,7 +251,7 @@ xfs_attr_rmtval_copyout(
249 int hdr_size = 0; 251 int hdr_size = 0;
250 int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); 252 int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
251 253
252 byte_cnt = min_t(int, *valuelen, byte_cnt); 254 byte_cnt = min(*valuelen, byte_cnt);
253 255
254 if (xfs_sb_version_hascrc(&mp->m_sb)) { 256 if (xfs_sb_version_hascrc(&mp->m_sb)) {
255 if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, 257 if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
@@ -284,7 +286,7 @@ xfs_attr_rmtval_copyin(
284 xfs_ino_t ino, 286 xfs_ino_t ino,
285 int *offset, 287 int *offset,
286 int *valuelen, 288 int *valuelen,
287 char **src) 289 __uint8_t **src)
288{ 290{
289 char *dst = bp->b_addr; 291 char *dst = bp->b_addr;
290 xfs_daddr_t bno = bp->b_bn; 292 xfs_daddr_t bno = bp->b_bn;
@@ -337,7 +339,7 @@ xfs_attr_rmtval_get(
337 struct xfs_mount *mp = args->dp->i_mount; 339 struct xfs_mount *mp = args->dp->i_mount;
338 struct xfs_buf *bp; 340 struct xfs_buf *bp;
339 xfs_dablk_t lblkno = args->rmtblkno; 341 xfs_dablk_t lblkno = args->rmtblkno;
340 char *dst = args->value; 342 __uint8_t *dst = args->value;
341 int valuelen = args->valuelen; 343 int valuelen = args->valuelen;
342 int nmap; 344 int nmap;
343 int error; 345 int error;
@@ -401,7 +403,7 @@ xfs_attr_rmtval_set(
401 struct xfs_bmbt_irec map; 403 struct xfs_bmbt_irec map;
402 xfs_dablk_t lblkno; 404 xfs_dablk_t lblkno;
403 xfs_fileoff_t lfileoff = 0; 405 xfs_fileoff_t lfileoff = 0;
404 char *src = args->value; 406 __uint8_t *src = args->value;
405 int blkcnt; 407 int blkcnt;
406 int valuelen; 408 int valuelen;
407 int nmap; 409 int nmap;
@@ -543,11 +545,6 @@ xfs_attr_rmtval_remove(
543 545
544 /* 546 /*
545 * Roll through the "value", invalidating the attribute value's blocks. 547 * Roll through the "value", invalidating the attribute value's blocks.
546 * Note that args->rmtblkcnt is the minimum number of data blocks we'll
547 * see for a CRC enabled remote attribute. Each extent will have a
548 * header, and so we may have more blocks than we realise here. If we
549 * fail to map the blocks correctly, we'll have problems with the buffer
550 * lookups.
551 */ 548 */
552 lblkno = args->rmtblkno; 549 lblkno = args->rmtblkno;
553 blkcnt = args->rmtblkcnt; 550 blkcnt = args->rmtblkcnt;
@@ -628,4 +625,3 @@ xfs_attr_rmtval_remove(
628 } 625 }
629 return(0); 626 return(0);
630} 627}
631
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 05c698ccb238..f47e65c30be6 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -17,16 +17,17 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
29#include "xfs_dir2_format.h"
30#include "xfs_dir2.h"
30#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 32#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 33#include "xfs_ialloc_btree.h"
@@ -39,6 +40,7 @@
39#include "xfs_extfree_item.h" 40#include "xfs_extfree_item.h"
40#include "xfs_alloc.h" 41#include "xfs_alloc.h"
41#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include "xfs_bmap_util.h"
42#include "xfs_rtalloc.h" 44#include "xfs_rtalloc.h"
43#include "xfs_error.h" 45#include "xfs_error.h"
44#include "xfs_attr_leaf.h" 46#include "xfs_attr_leaf.h"
@@ -46,7 +48,6 @@
46#include "xfs_trans_space.h" 48#include "xfs_trans_space.h"
47#include "xfs_buf_item.h" 49#include "xfs_buf_item.h"
48#include "xfs_filestream.h" 50#include "xfs_filestream.h"
49#include "xfs_vnodeops.h"
50#include "xfs_trace.h" 51#include "xfs_trace.h"
51#include "xfs_symlink.h" 52#include "xfs_symlink.h"
52 53
@@ -108,19 +109,6 @@ xfs_bmap_compute_maxlevels(
108 mp->m_bm_maxlevels[whichfork] = level; 109 mp->m_bm_maxlevels[whichfork] = level;
109} 110}
110 111
111/*
112 * Convert the given file system block to a disk block. We have to treat it
113 * differently based on whether the file is a real time file or not, because the
114 * bmap code does.
115 */
116xfs_daddr_t
117xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
118{
119 return (XFS_IS_REALTIME_INODE(ip) ? \
120 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
121 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
122}
123
124STATIC int /* error */ 112STATIC int /* error */
125xfs_bmbt_lookup_eq( 113xfs_bmbt_lookup_eq(
126 struct xfs_btree_cur *cur, 114 struct xfs_btree_cur *cur,
@@ -263,173 +251,6 @@ xfs_bmap_forkoff_reset(
263} 251}
264 252
265/* 253/*
266 * Extent tree block counting routines.
267 */
268
269/*
270 * Count leaf blocks given a range of extent records.
271 */
272STATIC void
273xfs_bmap_count_leaves(
274 xfs_ifork_t *ifp,
275 xfs_extnum_t idx,
276 int numrecs,
277 int *count)
278{
279 int b;
280
281 for (b = 0; b < numrecs; b++) {
282 xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
283 *count += xfs_bmbt_get_blockcount(frp);
284 }
285}
286
287/*
288 * Count leaf blocks given a range of extent records originally
289 * in btree format.
290 */
291STATIC void
292xfs_bmap_disk_count_leaves(
293 struct xfs_mount *mp,
294 struct xfs_btree_block *block,
295 int numrecs,
296 int *count)
297{
298 int b;
299 xfs_bmbt_rec_t *frp;
300
301 for (b = 1; b <= numrecs; b++) {
302 frp = XFS_BMBT_REC_ADDR(mp, block, b);
303 *count += xfs_bmbt_disk_get_blockcount(frp);
304 }
305}
306
307/*
308 * Recursively walks each level of a btree
309 * to count total fsblocks in use.
310 */
311STATIC int /* error */
312xfs_bmap_count_tree(
313 xfs_mount_t *mp, /* file system mount point */
314 xfs_trans_t *tp, /* transaction pointer */
315 xfs_ifork_t *ifp, /* inode fork pointer */
316 xfs_fsblock_t blockno, /* file system block number */
317 int levelin, /* level in btree */
318 int *count) /* Count of blocks */
319{
320 int error;
321 xfs_buf_t *bp, *nbp;
322 int level = levelin;
323 __be64 *pp;
324 xfs_fsblock_t bno = blockno;
325 xfs_fsblock_t nextbno;
326 struct xfs_btree_block *block, *nextblock;
327 int numrecs;
328
329 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
330 &xfs_bmbt_buf_ops);
331 if (error)
332 return error;
333 *count += 1;
334 block = XFS_BUF_TO_BLOCK(bp);
335
336 if (--level) {
337 /* Not at node above leaves, count this level of nodes */
338 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
339 while (nextbno != NULLFSBLOCK) {
340 error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
341 XFS_BMAP_BTREE_REF,
342 &xfs_bmbt_buf_ops);
343 if (error)
344 return error;
345 *count += 1;
346 nextblock = XFS_BUF_TO_BLOCK(nbp);
347 nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
348 xfs_trans_brelse(tp, nbp);
349 }
350
351 /* Dive to the next level */
352 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
353 bno = be64_to_cpu(*pp);
354 if (unlikely((error =
355 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
356 xfs_trans_brelse(tp, bp);
357 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
358 XFS_ERRLEVEL_LOW, mp);
359 return XFS_ERROR(EFSCORRUPTED);
360 }
361 xfs_trans_brelse(tp, bp);
362 } else {
363 /* count all level 1 nodes and their leaves */
364 for (;;) {
365 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
366 numrecs = be16_to_cpu(block->bb_numrecs);
367 xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
368 xfs_trans_brelse(tp, bp);
369 if (nextbno == NULLFSBLOCK)
370 break;
371 bno = nextbno;
372 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
373 XFS_BMAP_BTREE_REF,
374 &xfs_bmbt_buf_ops);
375 if (error)
376 return error;
377 *count += 1;
378 block = XFS_BUF_TO_BLOCK(bp);
379 }
380 }
381 return 0;
382}
383
384/*
385 * Count fsblocks of the given fork.
386 */
387int /* error */
388xfs_bmap_count_blocks(
389 xfs_trans_t *tp, /* transaction pointer */
390 xfs_inode_t *ip, /* incore inode */
391 int whichfork, /* data or attr fork */
392 int *count) /* out: count of blocks */
393{
394 struct xfs_btree_block *block; /* current btree block */
395 xfs_fsblock_t bno; /* block # of "block" */
396 xfs_ifork_t *ifp; /* fork structure */
397 int level; /* btree level, for checking */
398 xfs_mount_t *mp; /* file system mount structure */
399 __be64 *pp; /* pointer to block address */
400
401 bno = NULLFSBLOCK;
402 mp = ip->i_mount;
403 ifp = XFS_IFORK_PTR(ip, whichfork);
404 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
405 xfs_bmap_count_leaves(ifp, 0,
406 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
407 count);
408 return 0;
409 }
410
411 /*
412 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
413 */
414 block = ifp->if_broot;
415 level = be16_to_cpu(block->bb_level);
416 ASSERT(level > 0);
417 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
418 bno = be64_to_cpu(*pp);
419 ASSERT(bno != NULLDFSBNO);
420 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
421 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
422
423 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
424 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
425 mp);
426 return XFS_ERROR(EFSCORRUPTED);
427 }
428
429 return 0;
430}
431
432/*
433 * Debug/sanity checking code 254 * Debug/sanity checking code
434 */ 255 */
435 256
@@ -724,8 +545,8 @@ xfs_bmap_trace_exlist(
724 545
725/* 546/*
726 * Validate that the bmbt_irecs being returned from bmapi are valid 547 * Validate that the bmbt_irecs being returned from bmapi are valid
727 * given the callers original parameters. Specifically check the 548 * given the caller's original parameters. Specifically check the
728 * ranges of the returned irecs to ensure that they only extent beyond 549 * ranges of the returned irecs to ensure that they only extend beyond
729 * the given parameters if the XFS_BMAPI_ENTIRE flag was set. 550 * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
730 */ 551 */
731STATIC void 552STATIC void
@@ -823,7 +644,7 @@ xfs_bmap_add_free(
823 * Remove the entry "free" from the free item list. Prev points to the 644 * Remove the entry "free" from the free item list. Prev points to the
824 * previous entry, unless "free" is the head of the list. 645 * previous entry, unless "free" is the head of the list.
825 */ 646 */
826STATIC void 647void
827xfs_bmap_del_free( 648xfs_bmap_del_free(
828 xfs_bmap_free_t *flist, /* free item list header */ 649 xfs_bmap_free_t *flist, /* free item list header */
829 xfs_bmap_free_item_t *prev, /* previous item on list, if any */ 650 xfs_bmap_free_item_t *prev, /* previous item on list, if any */
@@ -837,92 +658,6 @@ xfs_bmap_del_free(
837 kmem_zone_free(xfs_bmap_free_item_zone, free); 658 kmem_zone_free(xfs_bmap_free_item_zone, free);
838} 659}
839 660
840
841/*
842 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
843 * caller. Frees all the extents that need freeing, which must be done
844 * last due to locking considerations. We never free any extents in
845 * the first transaction.
846 *
847 * Return 1 if the given transaction was committed and a new one
848 * started, and 0 otherwise in the committed parameter.
849 */
850int /* error */
851xfs_bmap_finish(
852 xfs_trans_t **tp, /* transaction pointer addr */
853 xfs_bmap_free_t *flist, /* i/o: list extents to free */
854 int *committed) /* xact committed or not */
855{
856 xfs_efd_log_item_t *efd; /* extent free data */
857 xfs_efi_log_item_t *efi; /* extent free intention */
858 int error; /* error return value */
859 xfs_bmap_free_item_t *free; /* free extent item */
860 unsigned int logres; /* new log reservation */
861 unsigned int logcount; /* new log count */
862 xfs_mount_t *mp; /* filesystem mount structure */
863 xfs_bmap_free_item_t *next; /* next item on free list */
864 xfs_trans_t *ntp; /* new transaction pointer */
865
866 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
867 if (flist->xbf_count == 0) {
868 *committed = 0;
869 return 0;
870 }
871 ntp = *tp;
872 efi = xfs_trans_get_efi(ntp, flist->xbf_count);
873 for (free = flist->xbf_first; free; free = free->xbfi_next)
874 xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
875 free->xbfi_blockcount);
876 logres = ntp->t_log_res;
877 logcount = ntp->t_log_count;
878 ntp = xfs_trans_dup(*tp);
879 error = xfs_trans_commit(*tp, 0);
880 *tp = ntp;
881 *committed = 1;
882 /*
883 * We have a new transaction, so we should return committed=1,
884 * even though we're returning an error.
885 */
886 if (error)
887 return error;
888
889 /*
890 * transaction commit worked ok so we can drop the extra ticket
891 * reference that we gained in xfs_trans_dup()
892 */
893 xfs_log_ticket_put(ntp->t_ticket);
894
895 if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
896 logcount)))
897 return error;
898 efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
899 for (free = flist->xbf_first; free != NULL; free = next) {
900 next = free->xbfi_next;
901 if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
902 free->xbfi_blockcount))) {
903 /*
904 * The bmap free list will be cleaned up at a
905 * higher level. The EFI will be canceled when
906 * this transaction is aborted.
907 * Need to force shutdown here to make sure it
908 * happens, since this transaction may not be
909 * dirty yet.
910 */
911 mp = ntp->t_mountp;
912 if (!XFS_FORCED_SHUTDOWN(mp))
913 xfs_force_shutdown(mp,
914 (error == EFSCORRUPTED) ?
915 SHUTDOWN_CORRUPT_INCORE :
916 SHUTDOWN_META_IO_ERROR);
917 return error;
918 }
919 xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
920 free->xbfi_blockcount);
921 xfs_bmap_del_free(flist, NULL, free);
922 }
923 return 0;
924}
925
926/* 661/*
927 * Free up any items left in the list. 662 * Free up any items left in the list.
928 */ 663 */
@@ -1413,8 +1148,8 @@ xfs_bmap_add_attrfork(
1413 blks = XFS_ADDAFORK_SPACE_RES(mp); 1148 blks = XFS_ADDAFORK_SPACE_RES(mp);
1414 if (rsvd) 1149 if (rsvd)
1415 tp->t_flags |= XFS_TRANS_RESERVE; 1150 tp->t_flags |= XFS_TRANS_RESERVE;
1416 if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, 1151 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
1417 XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) 1152 if (error)
1418 goto error0; 1153 goto error0;
1419 xfs_ilock(ip, XFS_ILOCK_EXCL); 1154 xfs_ilock(ip, XFS_ILOCK_EXCL);
1420 error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? 1155 error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
@@ -1815,7 +1550,7 @@ xfs_bmap_first_unused(
1815} 1550}
1816 1551
1817/* 1552/*
1818 * Returns the file-relative block number of the last block + 1 before 1553 * Returns the file-relative block number of the last block - 1 before
1819 * last_block (input value) in the file. 1554 * last_block (input value) in the file.
1820 * This is not based on i_size, it is based on the extent records. 1555 * This is not based on i_size, it is based on the extent records.
1821 * Returns 0 for local files, as they do not have extent records. 1556 * Returns 0 for local files, as they do not have extent records.
@@ -1863,7 +1598,7 @@ xfs_bmap_last_before(
1863 return 0; 1598 return 0;
1864} 1599}
1865 1600
1866STATIC int 1601int
1867xfs_bmap_last_extent( 1602xfs_bmap_last_extent(
1868 struct xfs_trans *tp, 1603 struct xfs_trans *tp,
1869 struct xfs_inode *ip, 1604 struct xfs_inode *ip,
@@ -1927,29 +1662,6 @@ xfs_bmap_isaeof(
1927} 1662}
1928 1663
1929/* 1664/*
1930 * Check if the endoff is outside the last extent. If so the caller will grow
1931 * the allocation to a stripe unit boundary. All offsets are considered outside
1932 * the end of file for an empty fork, so 1 is returned in *eof in that case.
1933 */
1934int
1935xfs_bmap_eof(
1936 struct xfs_inode *ip,
1937 xfs_fileoff_t endoff,
1938 int whichfork,
1939 int *eof)
1940{
1941 struct xfs_bmbt_irec rec;
1942 int error;
1943
1944 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
1945 if (error || *eof)
1946 return error;
1947
1948 *eof = endoff >= rec.br_startoff + rec.br_blockcount;
1949 return 0;
1950}
1951
1952/*
1953 * Returns the file-relative block number of the first block past eof in 1665 * Returns the file-relative block number of the first block past eof in
1954 * the file. This is not based on i_size, it is based on the extent records. 1666 * the file. This is not based on i_size, it is based on the extent records.
1955 * Returns 0 for local files, as they do not have extent records. 1667 * Returns 0 for local files, as they do not have extent records.
@@ -3488,7 +3200,7 @@ done:
3488/* 3200/*
3489 * Adjust the size of the new extent based on di_extsize and rt extsize. 3201 * Adjust the size of the new extent based on di_extsize and rt extsize.
3490 */ 3202 */
3491STATIC int 3203int
3492xfs_bmap_extsize_align( 3204xfs_bmap_extsize_align(
3493 xfs_mount_t *mp, 3205 xfs_mount_t *mp,
3494 xfs_bmbt_irec_t *gotp, /* next extent pointer */ 3206 xfs_bmbt_irec_t *gotp, /* next extent pointer */
@@ -3650,9 +3362,9 @@ xfs_bmap_extsize_align(
3650 3362
3651#define XFS_ALLOC_GAP_UNITS 4 3363#define XFS_ALLOC_GAP_UNITS 4
3652 3364
3653STATIC void 3365void
3654xfs_bmap_adjacent( 3366xfs_bmap_adjacent(
3655 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 3367 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
3656{ 3368{
3657 xfs_fsblock_t adjust; /* adjustment to block numbers */ 3369 xfs_fsblock_t adjust; /* adjustment to block numbers */
3658 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ 3370 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
@@ -3799,109 +3511,6 @@ xfs_bmap_adjacent(
3799} 3511}
3800 3512
3801STATIC int 3513STATIC int
3802xfs_bmap_rtalloc(
3803 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
3804{
3805 xfs_alloctype_t atype = 0; /* type for allocation routines */
3806 int error; /* error return value */
3807 xfs_mount_t *mp; /* mount point structure */
3808 xfs_extlen_t prod = 0; /* product factor for allocators */
3809 xfs_extlen_t ralen = 0; /* realtime allocation length */
3810 xfs_extlen_t align; /* minimum allocation alignment */
3811 xfs_rtblock_t rtb;
3812
3813 mp = ap->ip->i_mount;
3814 align = xfs_get_extsz_hint(ap->ip);
3815 prod = align / mp->m_sb.sb_rextsize;
3816 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
3817 align, 1, ap->eof, 0,
3818 ap->conv, &ap->offset, &ap->length);
3819 if (error)
3820 return error;
3821 ASSERT(ap->length);
3822 ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
3823
3824 /*
3825 * If the offset & length are not perfectly aligned
3826 * then kill prod, it will just get us in trouble.
3827 */
3828 if (do_mod(ap->offset, align) || ap->length % align)
3829 prod = 1;
3830 /*
3831 * Set ralen to be the actual requested length in rtextents.
3832 */
3833 ralen = ap->length / mp->m_sb.sb_rextsize;
3834 /*
3835 * If the old value was close enough to MAXEXTLEN that
3836 * we rounded up to it, cut it back so it's valid again.
3837 * Note that if it's a really large request (bigger than
3838 * MAXEXTLEN), we don't hear about that number, and can't
3839 * adjust the starting point to match it.
3840 */
3841 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
3842 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
3843
3844 /*
3845 * Lock out other modifications to the RT bitmap inode.
3846 */
3847 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
3848 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
3849
3850 /*
3851 * If it's an allocation to an empty file at offset 0,
3852 * pick an extent that will space things out in the rt area.
3853 */
3854 if (ap->eof && ap->offset == 0) {
3855 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
3856
3857 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
3858 if (error)
3859 return error;
3860 ap->blkno = rtx * mp->m_sb.sb_rextsize;
3861 } else {
3862 ap->blkno = 0;
3863 }
3864
3865 xfs_bmap_adjacent(ap);
3866
3867 /*
3868 * Realtime allocation, done through xfs_rtallocate_extent.
3869 */
3870 atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
3871 do_div(ap->blkno, mp->m_sb.sb_rextsize);
3872 rtb = ap->blkno;
3873 ap->length = ralen;
3874 if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
3875 &ralen, atype, ap->wasdel, prod, &rtb)))
3876 return error;
3877 if (rtb == NULLFSBLOCK && prod > 1 &&
3878 (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
3879 ap->length, &ralen, atype,
3880 ap->wasdel, 1, &rtb)))
3881 return error;
3882 ap->blkno = rtb;
3883 if (ap->blkno != NULLFSBLOCK) {
3884 ap->blkno *= mp->m_sb.sb_rextsize;
3885 ralen *= mp->m_sb.sb_rextsize;
3886 ap->length = ralen;
3887 ap->ip->i_d.di_nblocks += ralen;
3888 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
3889 if (ap->wasdel)
3890 ap->ip->i_delayed_blks -= ralen;
3891 /*
3892 * Adjust the disk quota also. This was reserved
3893 * earlier.
3894 */
3895 xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
3896 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
3897 XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
3898 } else {
3899 ap->length = 0;
3900 }
3901 return 0;
3902}
3903
3904STATIC int
3905xfs_bmap_btalloc_nullfb( 3514xfs_bmap_btalloc_nullfb(
3906 struct xfs_bmalloca *ap, 3515 struct xfs_bmalloca *ap,
3907 struct xfs_alloc_arg *args, 3516 struct xfs_alloc_arg *args,
@@ -4018,7 +3627,7 @@ xfs_bmap_btalloc_nullfb(
4018 3627
4019STATIC int 3628STATIC int
4020xfs_bmap_btalloc( 3629xfs_bmap_btalloc(
4021 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 3630 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
4022{ 3631{
4023 xfs_mount_t *mp; /* mount point structure */ 3632 xfs_mount_t *mp; /* mount point structure */
4024 xfs_alloctype_t atype = 0; /* type for allocation routines */ 3633 xfs_alloctype_t atype = 0; /* type for allocation routines */
@@ -4250,7 +3859,7 @@ xfs_bmap_btalloc(
4250 */ 3859 */
4251STATIC int 3860STATIC int
4252xfs_bmap_alloc( 3861xfs_bmap_alloc(
4253 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 3862 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
4254{ 3863{
4255 if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) 3864 if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
4256 return xfs_bmap_rtalloc(ap); 3865 return xfs_bmap_rtalloc(ap);
@@ -4638,7 +4247,7 @@ xfs_bmapi_delay(
4638} 4247}
4639 4248
4640 4249
4641STATIC int 4250int
4642__xfs_bmapi_allocate( 4251__xfs_bmapi_allocate(
4643 struct xfs_bmalloca *bma) 4252 struct xfs_bmalloca *bma)
4644{ 4253{
@@ -4648,12 +4257,9 @@ __xfs_bmapi_allocate(
4648 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); 4257 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4649 int tmp_logflags = 0; 4258 int tmp_logflags = 0;
4650 int error; 4259 int error;
4651 int rt;
4652 4260
4653 ASSERT(bma->length > 0); 4261 ASSERT(bma->length > 0);
4654 4262
4655 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
4656
4657 /* 4263 /*
4658 * For the wasdelay case, we could also just allocate the stuff asked 4264 * For the wasdelay case, we could also just allocate the stuff asked
4659 * for in this bmap call but that wouldn't be as good. 4265 * for in this bmap call but that wouldn't be as good.
@@ -4756,45 +4362,6 @@ __xfs_bmapi_allocate(
4756 return 0; 4362 return 0;
4757} 4363}
4758 4364
4759static void
4760xfs_bmapi_allocate_worker(
4761 struct work_struct *work)
4762{
4763 struct xfs_bmalloca *args = container_of(work,
4764 struct xfs_bmalloca, work);
4765 unsigned long pflags;
4766
4767 /* we are in a transaction context here */
4768 current_set_flags_nested(&pflags, PF_FSTRANS);
4769
4770 args->result = __xfs_bmapi_allocate(args);
4771 complete(args->done);
4772
4773 current_restore_flags_nested(&pflags, PF_FSTRANS);
4774}
4775
4776/*
4777 * Some allocation requests often come in with little stack to work on. Push
4778 * them off to a worker thread so there is lots of stack to use. Otherwise just
4779 * call directly to avoid the context switch overhead here.
4780 */
4781int
4782xfs_bmapi_allocate(
4783 struct xfs_bmalloca *args)
4784{
4785 DECLARE_COMPLETION_ONSTACK(done);
4786
4787 if (!args->stack_switch)
4788 return __xfs_bmapi_allocate(args);
4789
4790
4791 args->done = &done;
4792 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
4793 queue_work(xfs_alloc_wq, &args->work);
4794 wait_for_completion(&done);
4795 return args->result;
4796}
4797
4798STATIC int 4365STATIC int
4799xfs_bmapi_convert_unwritten( 4366xfs_bmapi_convert_unwritten(
4800 struct xfs_bmalloca *bma, 4367 struct xfs_bmalloca *bma,
@@ -4883,7 +4450,7 @@ xfs_bmapi_write(
4883{ 4450{
4884 struct xfs_mount *mp = ip->i_mount; 4451 struct xfs_mount *mp = ip->i_mount;
4885 struct xfs_ifork *ifp; 4452 struct xfs_ifork *ifp;
4886 struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ 4453 struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
4887 xfs_fileoff_t end; /* end of mapped file region */ 4454 xfs_fileoff_t end; /* end of mapped file region */
4888 int eof; /* after the end of extents */ 4455 int eof; /* after the end of extents */
4889 int error; /* error return */ 4456 int error; /* error return */
@@ -5789,359 +5356,3 @@ error0:
5789 } 5356 }
5790 return error; 5357 return error;
5791} 5358}
5792
5793/*
5794 * returns 1 for success, 0 if we failed to map the extent.
5795 */
5796STATIC int
5797xfs_getbmapx_fix_eof_hole(
5798 xfs_inode_t *ip, /* xfs incore inode pointer */
5799 struct getbmapx *out, /* output structure */
5800 int prealloced, /* this is a file with
5801 * preallocated data space */
5802 __int64_t end, /* last block requested */
5803 xfs_fsblock_t startblock)
5804{
5805 __int64_t fixlen;
5806 xfs_mount_t *mp; /* file system mount point */
5807 xfs_ifork_t *ifp; /* inode fork pointer */
5808 xfs_extnum_t lastx; /* last extent pointer */
5809 xfs_fileoff_t fileblock;
5810
5811 if (startblock == HOLESTARTBLOCK) {
5812 mp = ip->i_mount;
5813 out->bmv_block = -1;
5814 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
5815 fixlen -= out->bmv_offset;
5816 if (prealloced && out->bmv_offset + out->bmv_length == end) {
5817 /* Came to hole at EOF. Trim it. */
5818 if (fixlen <= 0)
5819 return 0;
5820 out->bmv_length = fixlen;
5821 }
5822 } else {
5823 if (startblock == DELAYSTARTBLOCK)
5824 out->bmv_block = -2;
5825 else
5826 out->bmv_block = xfs_fsb_to_db(ip, startblock);
5827 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
5828 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
5829 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
5830 (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
5831 out->bmv_oflags |= BMV_OF_LAST;
5832 }
5833
5834 return 1;
5835}
5836
5837/*
5838 * Get inode's extents as described in bmv, and format for output.
5839 * Calls formatter to fill the user's buffer until all extents
5840 * are mapped, until the passed-in bmv->bmv_count slots have
5841 * been filled, or until the formatter short-circuits the loop,
5842 * if it is tracking filled-in extents on its own.
5843 */
5844int /* error code */
5845xfs_getbmap(
5846 xfs_inode_t *ip,
5847 struct getbmapx *bmv, /* user bmap structure */
5848 xfs_bmap_format_t formatter, /* format to user */
5849 void *arg) /* formatter arg */
5850{
5851 __int64_t bmvend; /* last block requested */
5852 int error = 0; /* return value */
5853 __int64_t fixlen; /* length for -1 case */
5854 int i; /* extent number */
5855 int lock; /* lock state */
5856 xfs_bmbt_irec_t *map; /* buffer for user's data */
5857 xfs_mount_t *mp; /* file system mount point */
5858 int nex; /* # of user extents can do */
5859 int nexleft; /* # of user extents left */
5860 int subnex; /* # of bmapi's can do */
5861 int nmap; /* number of map entries */
5862 struct getbmapx *out; /* output structure */
5863 int whichfork; /* data or attr fork */
5864 int prealloced; /* this is a file with
5865 * preallocated data space */
5866 int iflags; /* interface flags */
5867 int bmapi_flags; /* flags for xfs_bmapi */
5868 int cur_ext = 0;
5869
5870 mp = ip->i_mount;
5871 iflags = bmv->bmv_iflags;
5872 whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
5873
5874 if (whichfork == XFS_ATTR_FORK) {
5875 if (XFS_IFORK_Q(ip)) {
5876 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
5877 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
5878 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
5879 return XFS_ERROR(EINVAL);
5880 } else if (unlikely(
5881 ip->i_d.di_aformat != 0 &&
5882 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
5883 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
5884 ip->i_mount);
5885 return XFS_ERROR(EFSCORRUPTED);
5886 }
5887
5888 prealloced = 0;
5889 fixlen = 1LL << 32;
5890 } else {
5891 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
5892 ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
5893 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
5894 return XFS_ERROR(EINVAL);
5895
5896 if (xfs_get_extsz_hint(ip) ||
5897 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
5898 prealloced = 1;
5899 fixlen = mp->m_super->s_maxbytes;
5900 } else {
5901 prealloced = 0;
5902 fixlen = XFS_ISIZE(ip);
5903 }
5904 }
5905
5906 if (bmv->bmv_length == -1) {
5907 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
5908 bmv->bmv_length =
5909 max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
5910 } else if (bmv->bmv_length == 0) {
5911 bmv->bmv_entries = 0;
5912 return 0;
5913 } else if (bmv->bmv_length < 0) {
5914 return XFS_ERROR(EINVAL);
5915 }
5916
5917 nex = bmv->bmv_count - 1;
5918 if (nex <= 0)
5919 return XFS_ERROR(EINVAL);
5920 bmvend = bmv->bmv_offset + bmv->bmv_length;
5921
5922
5923 if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
5924 return XFS_ERROR(ENOMEM);
5925 out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
5926 if (!out) {
5927 out = kmem_zalloc_large(bmv->bmv_count *
5928 sizeof(struct getbmapx));
5929 if (!out)
5930 return XFS_ERROR(ENOMEM);
5931 }
5932
5933 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5934 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5935 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5936 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
5937 if (error)
5938 goto out_unlock_iolock;
5939 }
5940 /*
5941 * even after flushing the inode, there can still be delalloc
5942 * blocks on the inode beyond EOF due to speculative
5943 * preallocation. These are not removed until the release
5944 * function is called or the inode is inactivated. Hence we
5945 * cannot assert here that ip->i_delayed_blks == 0.
5946 */
5947 }
5948
5949 lock = xfs_ilock_map_shared(ip);
5950
5951 /*
5952 * Don't let nex be bigger than the number of extents
5953 * we can have assuming alternating holes and real extents.
5954 */
5955 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
5956 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5957
5958 bmapi_flags = xfs_bmapi_aflag(whichfork);
5959 if (!(iflags & BMV_IF_PREALLOC))
5960 bmapi_flags |= XFS_BMAPI_IGSTATE;
5961
5962 /*
5963 * Allocate enough space to handle "subnex" maps at a time.
5964 */
5965 error = ENOMEM;
5966 subnex = 16;
5967 map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
5968 if (!map)
5969 goto out_unlock_ilock;
5970
5971 bmv->bmv_entries = 0;
5972
5973 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
5974 (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
5975 error = 0;
5976 goto out_free_map;
5977 }
5978
5979 nexleft = nex;
5980
5981 do {
5982 nmap = (nexleft > subnex) ? subnex : nexleft;
5983 error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
5984 XFS_BB_TO_FSB(mp, bmv->bmv_length),
5985 map, &nmap, bmapi_flags);
5986 if (error)
5987 goto out_free_map;
5988 ASSERT(nmap <= subnex);
5989
5990 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
5991 out[cur_ext].bmv_oflags = 0;
5992 if (map[i].br_state == XFS_EXT_UNWRITTEN)
5993 out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
5994 else if (map[i].br_startblock == DELAYSTARTBLOCK)
5995 out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
5996 out[cur_ext].bmv_offset =
5997 XFS_FSB_TO_BB(mp, map[i].br_startoff);
5998 out[cur_ext].bmv_length =
5999 XFS_FSB_TO_BB(mp, map[i].br_blockcount);
6000 out[cur_ext].bmv_unused1 = 0;
6001 out[cur_ext].bmv_unused2 = 0;
6002
6003 /*
6004 * delayed allocation extents that start beyond EOF can
6005 * occur due to speculative EOF allocation when the
6006 * delalloc extent is larger than the largest freespace
6007 * extent at conversion time. These extents cannot be
6008 * converted by data writeback, so can exist here even
6009 * if we are not supposed to be finding delalloc
6010 * extents.
6011 */
6012 if (map[i].br_startblock == DELAYSTARTBLOCK &&
6013 map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
6014 ASSERT((iflags & BMV_IF_DELALLOC) != 0);
6015
6016 if (map[i].br_startblock == HOLESTARTBLOCK &&
6017 whichfork == XFS_ATTR_FORK) {
6018 /* came to the end of attribute fork */
6019 out[cur_ext].bmv_oflags |= BMV_OF_LAST;
6020 goto out_free_map;
6021 }
6022
6023 if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
6024 prealloced, bmvend,
6025 map[i].br_startblock))
6026 goto out_free_map;
6027
6028 bmv->bmv_offset =
6029 out[cur_ext].bmv_offset +
6030 out[cur_ext].bmv_length;
6031 bmv->bmv_length =
6032 max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
6033
6034 /*
6035 * In case we don't want to return the hole,
6036 * don't increase cur_ext so that we can reuse
6037 * it in the next loop.
6038 */
6039 if ((iflags & BMV_IF_NO_HOLES) &&
6040 map[i].br_startblock == HOLESTARTBLOCK) {
6041 memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
6042 continue;
6043 }
6044
6045 nexleft--;
6046 bmv->bmv_entries++;
6047 cur_ext++;
6048 }
6049 } while (nmap && nexleft && bmv->bmv_length);
6050
6051 out_free_map:
6052 kmem_free(map);
6053 out_unlock_ilock:
6054 xfs_iunlock_map_shared(ip, lock);
6055 out_unlock_iolock:
6056 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
6057
6058 for (i = 0; i < cur_ext; i++) {
6059 int full = 0; /* user array is full */
6060
6061 /* format results & advance arg */
6062 error = formatter(&arg, &out[i], &full);
6063 if (error || full)
6064 break;
6065 }
6066
6067 if (is_vmalloc_addr(out))
6068 kmem_free_large(out);
6069 else
6070 kmem_free(out);
6071 return error;
6072}
6073
6074/*
6075 * Dead simple method of punching delayed allocation blocks from a range in
6076 * the inode. Walks a block at a time so it will be slow, but it is only executed in
6077 * rare error cases, so the overhead is not critical. This will always punch out
6078 * both the start and end blocks, even if the ranges only partially overlap
6079 * them, so it is up to the caller to ensure that partial blocks are not
6080 * passed in.
6081 */
6082int
6083xfs_bmap_punch_delalloc_range(
6084 struct xfs_inode *ip,
6085 xfs_fileoff_t start_fsb,
6086 xfs_fileoff_t length)
6087{
6088 xfs_fileoff_t remaining = length;
6089 int error = 0;
6090
6091 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
6092
6093 do {
6094 int done;
6095 xfs_bmbt_irec_t imap;
6096 int nimaps = 1;
6097 xfs_fsblock_t firstblock;
6098 xfs_bmap_free_t flist;
6099
6100 /*
6101 * Map the range first and check that it is a delalloc extent
6102 * before trying to unmap the range. Otherwise we will be
6103 * trying to remove a real extent (which requires a
6104 * transaction) or a hole, which is probably a bad idea...
6105 */
6106 error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
6107 XFS_BMAPI_ENTIRE);
6108
6109 if (error) {
6110 /* something screwed, just bail */
6111 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6112 xfs_alert(ip->i_mount,
6113 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6114 ip->i_ino, start_fsb);
6115 }
6116 break;
6117 }
6118 if (!nimaps) {
6119 /* nothing there */
6120 goto next_block;
6121 }
6122 if (imap.br_startblock != DELAYSTARTBLOCK) {
6123 /* been converted, ignore */
6124 goto next_block;
6125 }
6126 WARN_ON(imap.br_blockcount == 0);
6127
6128 /*
6129 * Note: while we initialise the firstblock/flist pair, they
6130 * should never be used because blocks should never be
6131 * allocated or freed for a delalloc extent, and hence we don't
6132 * need to cancel or finish them after the xfs_bunmapi() call.
6133 */
6134 xfs_bmap_init(&flist, &firstblock);
6135 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
6136 &flist, &done);
6137 if (error)
6138 break;
6139
6140 ASSERT(!flist.xbf_count && !flist.xbf_first);
6141next_block:
6142 start_fsb++;
6143 remaining--;
6144 } while(remaining > 0);
6145
6146 return error;
6147}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1cf1292d29b7..33b41f351225 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -108,41 +108,6 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
108} 108}
109 109
110/* 110/*
111 * Argument structure for xfs_bmap_alloc.
112 */
113typedef struct xfs_bmalloca {
114 xfs_fsblock_t *firstblock; /* i/o first block allocated */
115 struct xfs_bmap_free *flist; /* bmap freelist */
116 struct xfs_trans *tp; /* transaction pointer */
117 struct xfs_inode *ip; /* incore inode pointer */
118 struct xfs_bmbt_irec prev; /* extent before the new one */
119 struct xfs_bmbt_irec got; /* extent after, or delayed */
120
121 xfs_fileoff_t offset; /* offset in file filling in */
122 xfs_extlen_t length; /* i/o length asked/allocated */
123 xfs_fsblock_t blkno; /* starting block of new extent */
124
125 struct xfs_btree_cur *cur; /* btree cursor */
126 xfs_extnum_t idx; /* current extent index */
127 int nallocs;/* number of extents alloc'd */
128 int logflags;/* flags for transaction logging */
129
130 xfs_extlen_t total; /* total blocks needed for xaction */
131 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
132 xfs_extlen_t minleft; /* amount must be left after alloc */
133 char eof; /* set if allocating past last extent */
134 char wasdel; /* replacing a delayed allocation */
135 char userdata;/* set if is user data */
136 char aeof; /* allocated space at eof */
137 char conv; /* overwriting unwritten extents */
138 char stack_switch;
139 int flags;
140 struct completion *done;
141 struct work_struct work;
142 int result;
143} xfs_bmalloca_t;
144
145/*
146 * Flags for xfs_bmap_add_extent*. 111 * Flags for xfs_bmap_add_extent*.
147 */ 112 */
148#define BMAP_LEFT_CONTIG (1 << 0) 113#define BMAP_LEFT_CONTIG (1 << 0)
@@ -162,7 +127,7 @@ typedef struct xfs_bmalloca {
162 { BMAP_RIGHT_FILLING, "RF" }, \ 127 { BMAP_RIGHT_FILLING, "RF" }, \
163 { BMAP_ATTRFORK, "ATTR" } 128 { BMAP_ATTRFORK, "ATTR" }
164 129
165#if defined(__KERNEL) && defined(DEBUG) 130#ifdef DEBUG
166void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, 131void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
167 int whichfork, unsigned long caller_ip); 132 int whichfork, unsigned long caller_ip);
168#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 133#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
@@ -205,23 +170,4 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
205 xfs_extnum_t num); 170 xfs_extnum_t num);
206uint xfs_default_attroffset(struct xfs_inode *ip); 171uint xfs_default_attroffset(struct xfs_inode *ip);
207 172
208#ifdef __KERNEL__
209/* bmap to userspace formatter - copy to user & advance pointer */
210typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
211
212int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
213 int *committed);
214int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
215 xfs_bmap_format_t formatter, void *arg);
216int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
217 int whichfork, int *eof);
218int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
219 int whichfork, int *count);
220int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
221 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
222
223xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
224
225#endif /* __KERNEL__ */
226
227#endif /* __XFS_BMAP_H__ */ 173#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 0c61a22be6fd..bb8de8e399c4 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -17,7 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -722,7 +722,7 @@ xfs_bmbt_key_diff(
722 cur->bc_rec.b.br_startoff; 722 cur->bc_rec.b.br_startoff;
723} 723}
724 724
725static int 725static bool
726xfs_bmbt_verify( 726xfs_bmbt_verify(
727 struct xfs_buf *bp) 727 struct xfs_buf *bp)
728{ 728{
@@ -775,7 +775,6 @@ xfs_bmbt_verify(
775 return false; 775 return false;
776 776
777 return true; 777 return true;
778
779} 778}
780 779
781static void 780static void
@@ -789,7 +788,6 @@ xfs_bmbt_read_verify(
789 bp->b_target->bt_mount, bp->b_addr); 788 bp->b_target->bt_mount, bp->b_addr);
790 xfs_buf_ioerror(bp, EFSCORRUPTED); 789 xfs_buf_ioerror(bp, EFSCORRUPTED);
791 } 790 }
792
793} 791}
794 792
795static void 793static void
@@ -927,3 +925,47 @@ xfs_bmdr_maxrecs(
927 return blocklen / sizeof(xfs_bmdr_rec_t); 925 return blocklen / sizeof(xfs_bmdr_rec_t);
928 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); 926 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
929} 927}
928
929/*
930 * Change the owner of a btree format fork of the inode passed in. Change it to
931 * the owner that is passed in so that we can change owners before or after
932 * we switch forks between inodes. The operation that the caller is doing will
933 * determine whether it needs to change owner before or after the switch.
934 *
935 * For demand paged transactional modification, the fork switch should be done
936 * after reading in all the blocks, modifying them and pinning them in the
937 * transaction. For modification when the buffers are already pinned in memory,
938 * the fork switch can be done before changing the owner as we won't need to
939 * validate the owner until the btree buffers are unpinned and writes can occur
940 * again.
941 *
942 * For recovery based ownership change, there is no transactional context and
943 * so a buffer list must be supplied so that we can record the buffers that we
944 * modified for the caller to issue IO on.
945 */
946int
947xfs_bmbt_change_owner(
948 struct xfs_trans *tp,
949 struct xfs_inode *ip,
950 int whichfork,
951 xfs_ino_t new_owner,
952 struct list_head *buffer_list)
953{
954 struct xfs_btree_cur *cur;
955 int error;
956
957 ASSERT(tp || buffer_list);
958 ASSERT(!(tp && buffer_list));
959 if (whichfork == XFS_DATA_FORK)
960 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
961 else
962 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
963
964 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
965 if (!cur)
966 return ENOMEM;
967
968 error = xfs_btree_change_owner(cur, new_owner, buffer_list);
969 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
970 return error;
971}
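The tp-XOR-buffer_list assertions above encode two calling modes; sketched call shapes (only the signature comes from this patch, the surrounding context is hypothetical):

	/* transactional change: modified buffers are logged via tp */
	error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
				      new_owner, NULL);

	/* log recovery: no transaction, caller issues IO on the list */
	LIST_HEAD(buffer_list);
	error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
				      new_owner, &buffer_list);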
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 1b726d626941..e367461a638e 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,6 +236,10 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
236extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); 236extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
237extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); 237extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
238 238
239extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
240 int whichfork, xfs_ino_t new_owner,
241 struct list_head *buffer_list);
242
239extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, 243extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
240 struct xfs_trans *, struct xfs_inode *, int); 244 struct xfs_trans *, struct xfs_inode *, int);
241 245
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
new file mode 100644
index 000000000000..97f952caea74
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.c
@@ -0,0 +1,2045 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * Copyright (c) 2012 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_inum.h"
25#include "xfs_trans.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dinode.h"
34#include "xfs_inode.h"
35#include "xfs_btree.h"
36#include "xfs_extfree_item.h"
37#include "xfs_alloc.h"
38#include "xfs_bmap.h"
39#include "xfs_bmap_util.h"
40#include "xfs_rtalloc.h"
41#include "xfs_error.h"
42#include "xfs_quota.h"
43#include "xfs_trans_space.h"
44#include "xfs_trace.h"
45#include "xfs_icache.h"
46
47/* Kernel only BMAP related definitions and functions */
48
49/*
50 * Convert the given file system block to a disk block. We have to treat it
51 * differently based on whether the file is a real time file or not, because the
52 * bmap code does.
53 */
54xfs_daddr_t
55xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
56{
57 return (XFS_IS_REALTIME_INODE(ip) ? \
58 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
59 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
60}
61
62/*
63 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
64 * caller. Frees all the extents that need freeing, which must be done
65 * last due to locking considerations. We never free any extents in
66 * the first transaction.
67 *
68 * Return 1 if the given transaction was committed and a new one
69 * started, and 0 otherwise in the committed parameter.
70 */
71int /* error */
72xfs_bmap_finish(
73 xfs_trans_t **tp, /* transaction pointer addr */
74 xfs_bmap_free_t *flist, /* i/o: list extents to free */
75 int *committed) /* xact committed or not */
76{
77 xfs_efd_log_item_t *efd; /* extent free data */
78 xfs_efi_log_item_t *efi; /* extent free intention */
79 int error; /* error return value */
80 xfs_bmap_free_item_t *free; /* free extent item */
81 struct xfs_trans_res tres; /* new log reservation */
82 xfs_mount_t *mp; /* filesystem mount structure */
83 xfs_bmap_free_item_t *next; /* next item on free list */
84 xfs_trans_t *ntp; /* new transaction pointer */
85
86 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
87 if (flist->xbf_count == 0) {
88 *committed = 0;
89 return 0;
90 }
91 ntp = *tp;
92 efi = xfs_trans_get_efi(ntp, flist->xbf_count);
93 for (free = flist->xbf_first; free; free = free->xbfi_next)
94 xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
95 free->xbfi_blockcount);
96
97 tres.tr_logres = ntp->t_log_res;
98 tres.tr_logcount = ntp->t_log_count;
99 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
100 ntp = xfs_trans_dup(*tp);
101 error = xfs_trans_commit(*tp, 0);
102 *tp = ntp;
103 *committed = 1;
104 /*
105 * We have a new transaction, so we should return committed=1,
106 * even though we're returning an error.
107 */
108 if (error)
109 return error;
110
111 /*
112 * transaction commit worked ok so we can drop the extra ticket
113 * reference that we gained in xfs_trans_dup()
114 */
115 xfs_log_ticket_put(ntp->t_ticket);
116
117 error = xfs_trans_reserve(ntp, &tres, 0, 0);
118 if (error)
119 return error;
120 efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
121 for (free = flist->xbf_first; free != NULL; free = next) {
122 next = free->xbfi_next;
123 if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
124 free->xbfi_blockcount))) {
125 /*
126 * The bmap free list will be cleaned up at a
127 * higher level. The EFI will be canceled when
128 * this transaction is aborted.
129 * Need to force shutdown here to make sure it
130 * happens, since this transaction may not be
131 * dirty yet.
132 */
133 mp = ntp->t_mountp;
134 if (!XFS_FORCED_SHUTDOWN(mp))
135 xfs_force_shutdown(mp,
136 (error == EFSCORRUPTED) ?
137 SHUTDOWN_CORRUPT_INCORE :
138 SHUTDOWN_META_IO_ERROR);
139 return error;
140 }
141 xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
142 free->xbfi_blockcount);
143 xfs_bmap_del_free(flist, NULL, free);
144 }
145 return 0;
146}
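In outline, the EFI/EFD pairing above is a log-intent pattern; the ordering (a sketch of what the code already does, not additional code) is:

	tx1: log an EFI recording "intend to free extents E1..En",
	     xfs_trans_dup() and commit, carrying the permanent log
	     reservation forward into the duplicated transaction
	tx2: for each extent, xfs_free_extent() then log an EFD
	     referencing the EFI; committed later by the caller

If the system crashes between the two commits, log recovery finds an EFI with no matching EFD and replays the frees, which is why the first transaction must never free extents directly.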
147
148int
149xfs_bmap_rtalloc(
150 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
151{
152 xfs_alloctype_t atype = 0; /* type for allocation routines */
153 int error; /* error return value */
154 xfs_mount_t *mp; /* mount point structure */
155 xfs_extlen_t prod = 0; /* product factor for allocators */
156 xfs_extlen_t ralen = 0; /* realtime allocation length */
157 xfs_extlen_t align; /* minimum allocation alignment */
158 xfs_rtblock_t rtb;
159
160 mp = ap->ip->i_mount;
161 align = xfs_get_extsz_hint(ap->ip);
162 prod = align / mp->m_sb.sb_rextsize;
163 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
164 align, 1, ap->eof, 0,
165 ap->conv, &ap->offset, &ap->length);
166 if (error)
167 return error;
168 ASSERT(ap->length);
169 ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
170
171 /*
172 * If the offset & length are not perfectly aligned
173 * then kill prod, it will just get us in trouble.
174 */
175 if (do_mod(ap->offset, align) || ap->length % align)
176 prod = 1;
177 /*
178 * Set ralen to be the actual requested length in rtextents.
179 */
180 ralen = ap->length / mp->m_sb.sb_rextsize;
181 /*
182 * If the old value was close enough to MAXEXTLEN that
183 * we rounded up to it, cut it back so it's valid again.
184 * Note that if it's a really large request (bigger than
185 * MAXEXTLEN), we don't hear about that number, and can't
186 * adjust the starting point to match it.
187 */
188 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
189 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
190
191 /*
192 * Lock out other modifications to the RT bitmap inode.
193 */
194 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
195 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
196
197 /*
198 * If it's an allocation to an empty file at offset 0,
199 * pick an extent that will space things out in the rt area.
200 */
201 if (ap->eof && ap->offset == 0) {
202 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
203
204 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
205 if (error)
206 return error;
207 ap->blkno = rtx * mp->m_sb.sb_rextsize;
208 } else {
209 ap->blkno = 0;
210 }
211
212 xfs_bmap_adjacent(ap);
213
214 /*
215 * Realtime allocation, done through xfs_rtallocate_extent.
216 */
217 atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
218 do_div(ap->blkno, mp->m_sb.sb_rextsize);
219 rtb = ap->blkno;
220 ap->length = ralen;
221 if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
222 &ralen, atype, ap->wasdel, prod, &rtb)))
223 return error;
224 if (rtb == NULLFSBLOCK && prod > 1 &&
225 (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
226 ap->length, &ralen, atype,
227 ap->wasdel, 1, &rtb)))
228 return error;
229 ap->blkno = rtb;
230 if (ap->blkno != NULLFSBLOCK) {
231 ap->blkno *= mp->m_sb.sb_rextsize;
232 ralen *= mp->m_sb.sb_rextsize;
233 ap->length = ralen;
234 ap->ip->i_d.di_nblocks += ralen;
235 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
236 if (ap->wasdel)
237 ap->ip->i_delayed_blks -= ralen;
238 /*
239 * Adjust the disk quota also. This was reserved
240 * earlier.
241 */
242 xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
243 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
244 XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
245 } else {
246 ap->length = 0;
247 }
248 return 0;
249}
250
251/*
252 * Stack switching interfaces for allocation
253 */
254static void
255xfs_bmapi_allocate_worker(
256 struct work_struct *work)
257{
258 struct xfs_bmalloca *args = container_of(work,
259 struct xfs_bmalloca, work);
260 unsigned long pflags;
261
262 /* we are in a transaction context here */
263 current_set_flags_nested(&pflags, PF_FSTRANS);
264
265 args->result = __xfs_bmapi_allocate(args);
266 complete(args->done);
267
268 current_restore_flags_nested(&pflags, PF_FSTRANS);
269}
270
271/*
272 * Some allocation requests come in with little stack to work on. Push
273 * them off to a worker thread so there is lots of stack to use. Otherwise just
274 * call directly to avoid the context switch overhead here.
275 */
276int
277xfs_bmapi_allocate(
278 struct xfs_bmalloca *args)
279{
280 DECLARE_COMPLETION_ONSTACK(done);
281
282 if (!args->stack_switch)
283 return __xfs_bmapi_allocate(args);
284
285
286 args->done = &done;
287 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
288 queue_work(xfs_alloc_wq, &args->work);
289 wait_for_completion(&done);
290 return args->result;
291}
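
/*
 * Illustrative sketch (an assumption, not taken from this file): a caller
 * that knows it is short on stack opts in to the worker-thread path by
 * setting stack_switch before calling xfs_bmapi_allocate(), which then
 * runs __xfs_bmapi_allocate() from xfs_alloc_wq:
 *
 *	struct xfs_bmalloca bma = { .ip = ip, .tp = tp };
 *
 *	bma.stack_switch = 1;
 *	error = xfs_bmapi_allocate(&bma);
 */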
292
293/*
294 * Check if the endoff is outside the last extent. If so the caller will grow
295 * the allocation to a stripe unit boundary. All offsets are considered outside
296 * the end of file for an empty fork, so 1 is returned in *eof in that case.
297 */
298int
299xfs_bmap_eof(
300 struct xfs_inode *ip,
301 xfs_fileoff_t endoff,
302 int whichfork,
303 int *eof)
304{
305 struct xfs_bmbt_irec rec;
306 int error;
307
308 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
309 if (error || *eof)
310 return error;
311
312 *eof = endoff >= rec.br_startoff + rec.br_blockcount;
313 return 0;
314}
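
/*
 * Illustrative sketch (an assumption, not taken from this file): how an
 * allocation path might use xfs_bmap_eof() to decide whether to round a
 * request up to a stripe unit boundary, per the comment above.
 *
 *	int eof = 0;
 *
 *	error = xfs_bmap_eof(ip, offset_fsb + count_fsb, XFS_DATA_FORK, &eof);
 *	if (!error && eof)
 *		count_fsb = roundup(count_fsb, mp->m_dalign);
 */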
315
316/*
317 * Extent tree block counting routines.
318 */
319
320/*
321 * Count leaf blocks given a range of extent records.
322 */
323STATIC void
324xfs_bmap_count_leaves(
325 xfs_ifork_t *ifp,
326 xfs_extnum_t idx,
327 int numrecs,
328 int *count)
329{
330 int b;
331
332 for (b = 0; b < numrecs; b++) {
333 xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
334 *count += xfs_bmbt_get_blockcount(frp);
335 }
336}
337
338/*
339 * Count leaf blocks given a range of extent records originally
340 * in btree format.
341 */
342STATIC void
343xfs_bmap_disk_count_leaves(
344 struct xfs_mount *mp,
345 struct xfs_btree_block *block,
346 int numrecs,
347 int *count)
348{
349 int b;
350 xfs_bmbt_rec_t *frp;
351
352 for (b = 1; b <= numrecs; b++) {
353 frp = XFS_BMBT_REC_ADDR(mp, block, b);
354 *count += xfs_bmbt_disk_get_blockcount(frp);
355 }
356}
357
358/*
359 * Recursively walks each level of a btree
360 * to count total fsblocks in use.
361 */
362STATIC int /* error */
363xfs_bmap_count_tree(
364 xfs_mount_t *mp, /* file system mount point */
365 xfs_trans_t *tp, /* transaction pointer */
366 xfs_ifork_t *ifp, /* inode fork pointer */
367 xfs_fsblock_t blockno, /* file system block number */
368 int levelin, /* level in btree */
369 int *count) /* Count of blocks */
370{
371 int error;
372 xfs_buf_t *bp, *nbp;
373 int level = levelin;
374 __be64 *pp;
375 xfs_fsblock_t bno = blockno;
376 xfs_fsblock_t nextbno;
377 struct xfs_btree_block *block, *nextblock;
378 int numrecs;
379
380 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
381 &xfs_bmbt_buf_ops);
382 if (error)
383 return error;
384 *count += 1;
385 block = XFS_BUF_TO_BLOCK(bp);
386
387 if (--level) {
388 /* Not at node above leaves, count this level of nodes */
389 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
390 while (nextbno != NULLFSBLOCK) {
391 error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
392 XFS_BMAP_BTREE_REF,
393 &xfs_bmbt_buf_ops);
394 if (error)
395 return error;
396 *count += 1;
397 nextblock = XFS_BUF_TO_BLOCK(nbp);
398 nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
399 xfs_trans_brelse(tp, nbp);
400 }
401
402 /* Dive to the next level */
403 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
404 bno = be64_to_cpu(*pp);
405 if (unlikely((error =
406 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
407 xfs_trans_brelse(tp, bp);
408 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
409 XFS_ERRLEVEL_LOW, mp);
410 return XFS_ERROR(EFSCORRUPTED);
411 }
412 xfs_trans_brelse(tp, bp);
413 } else {
414 /* count all level 1 nodes and their leaves */
415 for (;;) {
416 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
417 numrecs = be16_to_cpu(block->bb_numrecs);
418 xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
419 xfs_trans_brelse(tp, bp);
420 if (nextbno == NULLFSBLOCK)
421 break;
422 bno = nextbno;
423 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
424 XFS_BMAP_BTREE_REF,
425 &xfs_bmbt_buf_ops);
426 if (error)
427 return error;
428 *count += 1;
429 block = XFS_BUF_TO_BLOCK(bp);
430 }
431 }
432 return 0;
433}
434
435/*
436 * Count fsblocks of the given fork.
437 */
438int /* error */
439xfs_bmap_count_blocks(
440 xfs_trans_t *tp, /* transaction pointer */
441 xfs_inode_t *ip, /* incore inode */
442 int whichfork, /* data or attr fork */
443 int *count) /* out: count of blocks */
444{
445 struct xfs_btree_block *block; /* current btree block */
446 xfs_fsblock_t bno; /* block # of "block" */
447 xfs_ifork_t *ifp; /* fork structure */
448 int level; /* btree level, for checking */
449 xfs_mount_t *mp; /* file system mount structure */
450 __be64 *pp; /* pointer to block address */
451
452 bno = NULLFSBLOCK;
453 mp = ip->i_mount;
454 ifp = XFS_IFORK_PTR(ip, whichfork);
455 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
456 xfs_bmap_count_leaves(ifp, 0,
457 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
458 count);
459 return 0;
460 }
461
462 /*
463 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
464 */
465 block = ifp->if_broot;
466 level = be16_to_cpu(block->bb_level);
467 ASSERT(level > 0);
468 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
469 bno = be64_to_cpu(*pp);
470 ASSERT(bno != NULLDFSBNO);
471 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
472 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
473
474 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
475 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
476 mp);
477 return XFS_ERROR(EFSCORRUPTED);
478 }
479
480 return 0;
481}
482
483/*
484 * returns 1 for success, 0 if we failed to map the extent.
485 */
486STATIC int
487xfs_getbmapx_fix_eof_hole(
488 xfs_inode_t *ip, /* xfs incore inode pointer */
489 struct getbmapx *out, /* output structure */
490 int prealloced, /* this is a file with
491 * preallocated data space */
492 __int64_t end, /* last block requested */
493 xfs_fsblock_t startblock)
494{
495 __int64_t fixlen;
496 xfs_mount_t *mp; /* file system mount point */
497 xfs_ifork_t *ifp; /* inode fork pointer */
498 xfs_extnum_t lastx; /* last extent pointer */
499 xfs_fileoff_t fileblock;
500
501 if (startblock == HOLESTARTBLOCK) {
502 mp = ip->i_mount;
503 out->bmv_block = -1;
504 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
505 fixlen -= out->bmv_offset;
506 if (prealloced && out->bmv_offset + out->bmv_length == end) {
507 /* Came to hole at EOF. Trim it. */
508 if (fixlen <= 0)
509 return 0;
510 out->bmv_length = fixlen;
511 }
512 } else {
513 if (startblock == DELAYSTARTBLOCK)
514 out->bmv_block = -2;
515 else
516 out->bmv_block = xfs_fsb_to_db(ip, startblock);
517 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
518 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
519 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
520 (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
521 out->bmv_oflags |= BMV_OF_LAST;
522 }
523
524 return 1;
525}
526
527/*
528 * Get inode's extents as described in bmv, and format for output.
529 * Calls formatter to fill the user's buffer until all extents
530 * are mapped, until the passed-in bmv->bmv_count slots have
531 * been filled, or until the formatter short-circuits the loop,
532 * if it is tracking filled-in extents on its own.
533 */
534int /* error code */
535xfs_getbmap(
536 xfs_inode_t *ip,
537 struct getbmapx *bmv, /* user bmap structure */
538 xfs_bmap_format_t formatter, /* format to user */
539 void *arg) /* formatter arg */
540{
541 __int64_t bmvend; /* last block requested */
542 int error = 0; /* return value */
543 __int64_t fixlen; /* length for -1 case */
544 int i; /* extent number */
545 int lock; /* lock state */
546 xfs_bmbt_irec_t *map; /* buffer for user's data */
547 xfs_mount_t *mp; /* file system mount point */
548 int nex; /* # of user extents can do */
549 int nexleft; /* # of user extents left */
550 int subnex; /* # of bmapi's can do */
551 int nmap; /* number of map entries */
552 struct getbmapx *out; /* output structure */
553 int whichfork; /* data or attr fork */
554 int prealloced; /* this is a file with
555 * preallocated data space */
556 int iflags; /* interface flags */
557 int bmapi_flags; /* flags for xfs_bmapi */
558 int cur_ext = 0;
559
560 mp = ip->i_mount;
561 iflags = bmv->bmv_iflags;
562 whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
563
564 if (whichfork == XFS_ATTR_FORK) {
565 if (XFS_IFORK_Q(ip)) {
566 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
567 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
568 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
569 return XFS_ERROR(EINVAL);
570 } else if (unlikely(
571 ip->i_d.di_aformat != 0 &&
572 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
573 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
574 ip->i_mount);
575 return XFS_ERROR(EFSCORRUPTED);
576 }
577
578 prealloced = 0;
579 fixlen = 1LL << 32;
580 } else {
581 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
582 ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
583 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
584 return XFS_ERROR(EINVAL);
585
586 if (xfs_get_extsz_hint(ip) ||
587 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
588 prealloced = 1;
589 fixlen = mp->m_super->s_maxbytes;
590 } else {
591 prealloced = 0;
592 fixlen = XFS_ISIZE(ip);
593 }
594 }
595
596 if (bmv->bmv_length == -1) {
597 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
598 bmv->bmv_length =
599 max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
600 } else if (bmv->bmv_length == 0) {
601 bmv->bmv_entries = 0;
602 return 0;
603 } else if (bmv->bmv_length < 0) {
604 return XFS_ERROR(EINVAL);
605 }
606
607 nex = bmv->bmv_count - 1;
608 if (nex <= 0)
609 return XFS_ERROR(EINVAL);
610 bmvend = bmv->bmv_offset + bmv->bmv_length;
611
612
613 if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
614 return XFS_ERROR(ENOMEM);
615 out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
616 if (!out)
617 return XFS_ERROR(ENOMEM);
618
619 xfs_ilock(ip, XFS_IOLOCK_SHARED);
620 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
621 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
622 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
623 if (error)
624 goto out_unlock_iolock;
625 }
626 /*
627 * even after flushing the inode, there can still be delalloc
628 * blocks on the inode beyond EOF due to speculative
629 * preallocation. These are not removed until the release
630 * function is called or the inode is inactivated. Hence we
631 * cannot assert here that ip->i_delayed_blks == 0.
632 */
633 }
634
635 lock = xfs_ilock_map_shared(ip);
636
637 /*
638 * Don't let nex be bigger than the number of extents
639 * we can have assuming alternating holes and real extents.
640 */
641 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
642 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
643
644 bmapi_flags = xfs_bmapi_aflag(whichfork);
645 if (!(iflags & BMV_IF_PREALLOC))
646 bmapi_flags |= XFS_BMAPI_IGSTATE;
647
648 /*
649 * Allocate enough space to handle "subnex" maps at a time.
650 */
651 error = ENOMEM;
652 subnex = 16;
653 map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
654 if (!map)
655 goto out_unlock_ilock;
656
657 bmv->bmv_entries = 0;
658
659 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
660 (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
661 error = 0;
662 goto out_free_map;
663 }
664
665 nexleft = nex;
666
667 do {
668 nmap = (nexleft > subnex) ? subnex : nexleft;
669 error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
670 XFS_BB_TO_FSB(mp, bmv->bmv_length),
671 map, &nmap, bmapi_flags);
672 if (error)
673 goto out_free_map;
674 ASSERT(nmap <= subnex);
675
676 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
677 out[cur_ext].bmv_oflags = 0;
678 if (map[i].br_state == XFS_EXT_UNWRITTEN)
679 out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
680 else if (map[i].br_startblock == DELAYSTARTBLOCK)
681 out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
682 out[cur_ext].bmv_offset =
683 XFS_FSB_TO_BB(mp, map[i].br_startoff);
684 out[cur_ext].bmv_length =
685 XFS_FSB_TO_BB(mp, map[i].br_blockcount);
686 out[cur_ext].bmv_unused1 = 0;
687 out[cur_ext].bmv_unused2 = 0;
688
689 /*
690 * delayed allocation extents that start beyond EOF can
691 * occur due to speculative EOF allocation when the
692 * delalloc extent is larger than the largest freespace
693 * extent at conversion time. These extents cannot be
694 * converted by data writeback, so can exist here even
695 * if we are not supposed to be finding delalloc
696 * extents.
697 */
698 if (map[i].br_startblock == DELAYSTARTBLOCK &&
699 map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
700 ASSERT((iflags & BMV_IF_DELALLOC) != 0);
701
702 if (map[i].br_startblock == HOLESTARTBLOCK &&
703 whichfork == XFS_ATTR_FORK) {
704 /* came to the end of attribute fork */
705 out[cur_ext].bmv_oflags |= BMV_OF_LAST;
706 goto out_free_map;
707 }
708
709 if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
710 prealloced, bmvend,
711 map[i].br_startblock))
712 goto out_free_map;
713
714 bmv->bmv_offset =
715 out[cur_ext].bmv_offset +
716 out[cur_ext].bmv_length;
717 bmv->bmv_length =
718 max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
719
720 /*
721 * In case we don't want to return the hole,
722 * don't increase cur_ext so that we can reuse
723 * it in the next loop.
724 */
725 if ((iflags & BMV_IF_NO_HOLES) &&
726 map[i].br_startblock == HOLESTARTBLOCK) {
727 memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
728 continue;
729 }
730
731 nexleft--;
732 bmv->bmv_entries++;
733 cur_ext++;
734 }
735 } while (nmap && nexleft && bmv->bmv_length);
736
737 out_free_map:
738 kmem_free(map);
739 out_unlock_ilock:
740 xfs_iunlock_map_shared(ip, lock);
741 out_unlock_iolock:
742 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
743
744 for (i = 0; i < cur_ext; i++) {
745 int full = 0; /* user array is full */
746
747 /* format results & advance arg */
748 error = formatter(&arg, &out[i], &full);
749 if (error || full)
750 break;
751 }
752
753 kmem_free(out);
754 return error;
755}
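
/*
 * Illustrative sketch (an assumption, not taken from this file): a minimal
 * formatter matching the xfs_bmap_format_t signature used above. It copies
 * one getbmapx record into a caller-supplied array and advances the cursor;
 * the real ioctl formatter instead copies to user space and sets *full when
 * the destination buffer is exhausted.
 *
 *	static int example_bmap_format(void **ap, struct getbmapx *bmv, int *full)
 *	{
 *		struct getbmapx **dest = (struct getbmapx **)ap;
 *
 *		memcpy(*dest, bmv, sizeof(*bmv));
 *		(*dest)++;
 *		*full = 0;
 *		return 0;
 *	}
 */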
756
757/*
758 * dead simple method of punching delayed allocation blocks from a range in
759 * the inode. It walks a block at a time, so it will be slow, but it is only
760 * executed in rare error cases so the overhead is not critical. This will
761 * always punch out both the start and end blocks, even if the ranges only
762 * partially overlap them, so it is up to the caller to ensure that partial
763 * blocks are not passed in.
764 */
765int
766xfs_bmap_punch_delalloc_range(
767 struct xfs_inode *ip,
768 xfs_fileoff_t start_fsb,
769 xfs_fileoff_t length)
770{
771 xfs_fileoff_t remaining = length;
772 int error = 0;
773
774 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
775
776 do {
777 int done;
778 xfs_bmbt_irec_t imap;
779 int nimaps = 1;
780 xfs_fsblock_t firstblock;
781 xfs_bmap_free_t flist;
782
783 /*
784 * Map the range first and check that it is a delalloc extent
785 * before trying to unmap the range. Otherwise we will be
786 * trying to remove a real extent (which requires a
787 * transaction) or a hole, which is probably a bad idea...
788 */
789 error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
790 XFS_BMAPI_ENTIRE);
791
792 if (error) {
793 /* something screwed, just bail */
794 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
795 xfs_alert(ip->i_mount,
796 "Failed delalloc mapping lookup ino %lld fsb %lld.",
797 ip->i_ino, start_fsb);
798 }
799 break;
800 }
801 if (!nimaps) {
802 /* nothing there */
803 goto next_block;
804 }
805 if (imap.br_startblock != DELAYSTARTBLOCK) {
806 /* been converted, ignore */
807 goto next_block;
808 }
809 WARN_ON(imap.br_blockcount == 0);
810
811 /*
812 * Note: while we initialise the firstblock/flist pair, they
813 * allocated or freed for a delalloc extent, and hence we don't
814 * need to cancel or finish them after the xfs_bunmapi() call.
815 * don't cancel or finish them after the xfs_bunmapi() call.
816 */
817 xfs_bmap_init(&flist, &firstblock);
818 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
819 &flist, &done);
820 if (error)
821 break;
822
823 ASSERT(!flist.xbf_count && !flist.xbf_first);
824next_block:
825 start_fsb++;
826 remaining--;
827 } while(remaining > 0);
828
829 return error;
830}
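
/*
 * Illustrative caller-side sketch (an assumption, not taken from this
 * file): converting a byte range to whole-block values before calling,
 * since this function always punches out entire start and end blocks.
 *
 *	xfs_fileoff_t start_fsb = XFS_B_TO_FSB(mp, offset);
 *	xfs_fileoff_t end_fsb = XFS_B_TO_FSBT(mp, offset + count);
 *
 *	if (end_fsb > start_fsb)
 *		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
 *						      end_fsb - start_fsb);
 */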
831
832/*
833 * Test whether it is appropriate to check an inode for post-EOF blocks
834 * and free them. The 'force' parameter determines whether we should also
835 * consider regular files that are marked preallocated or append-only.
836 */
837bool
838xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
839{
840 /* prealloc/delalloc exists only on regular files */
841 if (!S_ISREG(ip->i_d.di_mode))
842 return false;
843
844 /*
845 * Zero sized files with no cached pages and delalloc blocks will not
846 * have speculative prealloc/delalloc blocks to remove.
847 */
848 if (VFS_I(ip)->i_size == 0 &&
849 VN_CACHED(VFS_I(ip)) == 0 &&
850 ip->i_delayed_blks == 0)
851 return false;
852
853 /* If we haven't read in the extent list, then don't do it now. */
854 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
855 return false;
856
857 /*
858 * Do not free real preallocated or append-only files unless the file
859 * has delalloc blocks and we are forced to remove them.
860 */
861 if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
862 if (!force || ip->i_delayed_blks == 0)
863 return false;
864
865 return true;
866}
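
/*
 * Illustrative sketch (an assumption, not taken from this file): the
 * typical pairing with xfs_free_eofblocks() in a release/inactivation
 * path, with the iolock taken inside xfs_free_eofblocks() itself.
 *
 *	if (xfs_can_free_eofblocks(ip, false))
 *		error = xfs_free_eofblocks(mp, ip, true);
 */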
867
868/*
869 * This is called by xfs_inactive to free any blocks beyond eof
870 * when the link count isn't zero and by xfs_dm_punch_hole() when
871 * punching a hole to EOF.
872 */
873int
874xfs_free_eofblocks(
875 xfs_mount_t *mp,
876 xfs_inode_t *ip,
877 bool need_iolock)
878{
879 xfs_trans_t *tp;
880 int error;
881 xfs_fileoff_t end_fsb;
882 xfs_fileoff_t last_fsb;
883 xfs_filblks_t map_len;
884 int nimaps;
885 xfs_bmbt_irec_t imap;
886
887 /*
888 * Figure out if there are any blocks beyond the end
889 * of the file. If not, then there is nothing to do.
890 */
891 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
892 last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
893 if (last_fsb <= end_fsb)
894 return 0;
895 map_len = last_fsb - end_fsb;
896
897 nimaps = 1;
898 xfs_ilock(ip, XFS_ILOCK_SHARED);
899 error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
900 xfs_iunlock(ip, XFS_ILOCK_SHARED);
901
902 if (!error && (nimaps != 0) &&
903 (imap.br_startblock != HOLESTARTBLOCK ||
904 ip->i_delayed_blks)) {
905 /*
906 * Attach the dquots to the inode up front.
907 */
908 error = xfs_qm_dqattach(ip, 0);
909 if (error)
910 return error;
911
912 /*
913 * There are blocks after the end of file.
914 * Free them up now by truncating the file to
915 * its current size.
916 */
917 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
918
919 if (need_iolock) {
920 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
921 xfs_trans_cancel(tp, 0);
922 return EAGAIN;
923 }
924 }
925
926 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
927 if (error) {
928 ASSERT(XFS_FORCED_SHUTDOWN(mp));
929 xfs_trans_cancel(tp, 0);
930 if (need_iolock)
931 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
932 return error;
933 }
934
935 xfs_ilock(ip, XFS_ILOCK_EXCL);
936 xfs_trans_ijoin(tp, ip, 0);
937
938 /*
939 * Do not update the on-disk file size. If we update the
940 * on-disk file size and then the system crashes before the
941 * contents of the file are flushed to disk then the file
942 * may be full of holes (i.e. the NULL files bug).
943 */
944 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
945 XFS_ISIZE(ip));
946 if (error) {
947 /*
948 * If we get an error at this point we simply don't
949 * bother truncating the file.
950 */
951 xfs_trans_cancel(tp,
952 (XFS_TRANS_RELEASE_LOG_RES |
953 XFS_TRANS_ABORT));
954 } else {
955 error = xfs_trans_commit(tp,
956 XFS_TRANS_RELEASE_LOG_RES);
957 if (!error)
958 xfs_inode_clear_eofblocks_tag(ip);
959 }
960
961 xfs_iunlock(ip, XFS_ILOCK_EXCL);
962 if (need_iolock)
963 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
964 }
965 return error;
966}
967
968/*
969 * xfs_alloc_file_space()
970 * This routine allocates disk space for the given file.
971 *
972 * If alloc_type == 0, this request is for an ALLOCSP type
973 * request which will change the file size. In this case, no
974 * DMAPI event will be generated by the call. A TRUNCATE event
975 * will be generated later by xfs_setattr.
976 *
977 * If alloc_type != 0, this request is for a RESVSP type
978 * request, and a DMAPI DM_EVENT_WRITE will be generated if the
979 * lower block boundary byte address is less than the file's
980 * length.
981 *
982 * RETURNS:
983 * 0 on success
984 * errno on error
985 *
986 */
987STATIC int
988xfs_alloc_file_space(
989 xfs_inode_t *ip,
990 xfs_off_t offset,
991 xfs_off_t len,
992 int alloc_type,
993 int attr_flags)
994{
995 xfs_mount_t *mp = ip->i_mount;
996 xfs_off_t count;
997 xfs_filblks_t allocated_fsb;
998 xfs_filblks_t allocatesize_fsb;
999 xfs_extlen_t extsz, temp;
1000 xfs_fileoff_t startoffset_fsb;
1001 xfs_fsblock_t firstfsb;
1002 int nimaps;
1003 int quota_flag;
1004 int rt;
1005 xfs_trans_t *tp;
1006 xfs_bmbt_irec_t imaps[1], *imapp;
1007 xfs_bmap_free_t free_list;
1008 uint qblocks, resblks, resrtextents;
1009 int committed;
1010 int error;
1011
1012 trace_xfs_alloc_file_space(ip);
1013
1014 if (XFS_FORCED_SHUTDOWN(mp))
1015 return XFS_ERROR(EIO);
1016
1017 error = xfs_qm_dqattach(ip, 0);
1018 if (error)
1019 return error;
1020
1021 if (len <= 0)
1022 return XFS_ERROR(EINVAL);
1023
1024 rt = XFS_IS_REALTIME_INODE(ip);
1025 extsz = xfs_get_extsz_hint(ip);
1026
1027 count = len;
1028 imapp = &imaps[0];
1029 nimaps = 1;
1030 startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
1031 allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1032
1033 /*
1034 * Allocate file space until done or until there is an error
1035 */
1036 while (allocatesize_fsb && !error) {
1037 xfs_fileoff_t s, e;
1038
1039 /*
1040 * Determine space reservations for data/realtime.
1041 */
1042 if (unlikely(extsz)) {
1043 s = startoffset_fsb;
1044 do_div(s, extsz);
1045 s *= extsz;
1046 e = startoffset_fsb + allocatesize_fsb;
1047 if ((temp = do_mod(startoffset_fsb, extsz)))
1048 e += temp;
1049 if ((temp = do_mod(e, extsz)))
1050 e += extsz - temp;
1051 } else {
1052 s = 0;
1053 e = allocatesize_fsb;
1054 }
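		/*
		 * Worked example (illustrative): with extsz = 4,
		 * startoffset_fsb = 6 and allocatesize_fsb = 3, s is rounded
		 * down to 4 and e is pushed out to 12, so the reservation
		 * below covers the extent-size aligned hull [4, 12) rather
		 * than just [6, 9).
		 */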
1055
1056 /*
1057 * The transaction reservation is limited to a 32-bit block
1058 * count, hence we need to limit the number of blocks we are
1059 * trying to reserve to avoid an overflow. We can't allocate
1060 * more than @nimaps extents, and an extent is limited on disk
1061 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1062 */
1063 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1064 if (unlikely(rt)) {
1065 resrtextents = qblocks = resblks;
1066 resrtextents /= mp->m_sb.sb_rextsize;
1067 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1068 quota_flag = XFS_QMOPT_RES_RTBLKS;
1069 } else {
1070 resrtextents = 0;
1071 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1072 quota_flag = XFS_QMOPT_RES_REGBLKS;
1073 }
1074
1075 /*
1076 * Allocate and setup the transaction.
1077 */
1078 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1079 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1080 resblks, resrtextents);
1081 /*
1082 * Check for running out of space
1083 */
1084 if (error) {
1085 /*
1086 * Free the transaction structure.
1087 */
1088 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1089 xfs_trans_cancel(tp, 0);
1090 break;
1091 }
1092 xfs_ilock(ip, XFS_ILOCK_EXCL);
1093 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1094 0, quota_flag);
1095 if (error)
1096 goto error1;
1097
1098 xfs_trans_ijoin(tp, ip, 0);
1099
1100 xfs_bmap_init(&free_list, &firstfsb);
1101 error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1102 allocatesize_fsb, alloc_type, &firstfsb,
1103 0, imapp, &nimaps, &free_list);
1104 if (error) {
1105 goto error0;
1106 }
1107
1108 /*
1109 * Complete the transaction
1110 */
1111 error = xfs_bmap_finish(&tp, &free_list, &committed);
1112 if (error) {
1113 goto error0;
1114 }
1115
1116 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1117 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1118 if (error) {
1119 break;
1120 }
1121
1122 allocated_fsb = imapp->br_blockcount;
1123
1124 if (nimaps == 0) {
1125 error = XFS_ERROR(ENOSPC);
1126 break;
1127 }
1128
1129 startoffset_fsb += allocated_fsb;
1130 allocatesize_fsb -= allocated_fsb;
1131 }
1132
1133 return error;
1134
1135error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1136 xfs_bmap_cancel(&free_list);
1137 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1138
1139error1: /* Just cancel transaction */
1140 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1141 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1142 return error;
1143}
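
/*
 * Illustrative user-space sketch (an assumption, not taken from this file):
 * preallocating 1MiB at the start of a file via the RESVSP64 ioctl, which
 * reaches xfs_alloc_file_space() above through xfs_change_file_space().
 *
 *	#include <xfs/xfs.h>
 *
 *	struct xfs_flock64 fl = {
 *		.l_whence = SEEK_SET,
 *		.l_start = 0,
 *		.l_len = 1024 * 1024,
 *	};
 *
 *	if (ioctl(fd, XFS_IOC_RESVSP64, &fl) < 0)
 *		perror("XFS_IOC_RESVSP64");
 */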
1144
1145/*
1146 * Zero file bytes between startoff and endoff inclusive.
1147 * The iolock is held exclusive and no blocks are buffered.
1148 *
1149 * This function is used by xfs_free_file_space() to zero
1150 * partial blocks when the range to free is not block aligned.
1151 * When unreserving space with boundaries that are not block
1152 * aligned we round up the start and round down the end
1153 * boundaries and then use this function to zero the parts of
1154 * the blocks that got dropped during the rounding.
1155 */
1156STATIC int
1157xfs_zero_remaining_bytes(
1158 xfs_inode_t *ip,
1159 xfs_off_t startoff,
1160 xfs_off_t endoff)
1161{
1162 xfs_bmbt_irec_t imap;
1163 xfs_fileoff_t offset_fsb;
1164 xfs_off_t lastoffset;
1165 xfs_off_t offset;
1166 xfs_buf_t *bp;
1167 xfs_mount_t *mp = ip->i_mount;
1168 int nimap;
1169 int error = 0;
1170
1171 /*
1172 * Avoid doing I/O beyond eof - it's not necessary
1173 * since nothing can read beyond eof. The space will
1174 * be zeroed when the file is extended anyway.
1175 */
1176 if (startoff >= XFS_ISIZE(ip))
1177 return 0;
1178
1179 if (endoff > XFS_ISIZE(ip))
1180 endoff = XFS_ISIZE(ip);
1181
1182 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1183 mp->m_rtdev_targp : mp->m_ddev_targp,
1184 BTOBB(mp->m_sb.sb_blocksize), 0);
1185 if (!bp)
1186 return XFS_ERROR(ENOMEM);
1187
1188 xfs_buf_unlock(bp);
1189
1190 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1191 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1192 nimap = 1;
1193 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1194 if (error || nimap < 1)
1195 break;
1196 ASSERT(imap.br_blockcount >= 1);
1197 ASSERT(imap.br_startoff == offset_fsb);
1198 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
1199 if (lastoffset > endoff)
1200 lastoffset = endoff;
1201 if (imap.br_startblock == HOLESTARTBLOCK)
1202 continue;
1203 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1204 if (imap.br_state == XFS_EXT_UNWRITTEN)
1205 continue;
1206 XFS_BUF_UNDONE(bp);
1207 XFS_BUF_UNWRITE(bp);
1208 XFS_BUF_READ(bp);
1209 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
1210 xfsbdstrat(mp, bp);
1211 error = xfs_buf_iowait(bp);
1212 if (error) {
1213 xfs_buf_ioerror_alert(bp,
1214 "xfs_zero_remaining_bytes(read)");
1215 break;
1216 }
1217 memset(bp->b_addr +
1218 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
1219 0, lastoffset - offset + 1);
1220 XFS_BUF_UNDONE(bp);
1221 XFS_BUF_UNREAD(bp);
1222 XFS_BUF_WRITE(bp);
1223 xfsbdstrat(mp, bp);
1224 error = xfs_buf_iowait(bp);
1225 if (error) {
1226 xfs_buf_ioerror_alert(bp,
1227 "xfs_zero_remaining_bytes(write)");
1228 break;
1229 }
1230 }
1231 xfs_buf_free(bp);
1232 return error;
1233}
1234
1235/*
1236 * xfs_free_file_space()
1237 * This routine frees disk space for the given file.
1238 *
1239 * This routine is only called by xfs_change_file_space
1240 * for an UNRESVSP type call.
1241 *
1242 * RETURNS:
1243 * 0 on success
1244 * errno on error
1245 *
1246 */
1247STATIC int
1248xfs_free_file_space(
1249 xfs_inode_t *ip,
1250 xfs_off_t offset,
1251 xfs_off_t len,
1252 int attr_flags)
1253{
1254 int committed;
1255 int done;
1256 xfs_fileoff_t endoffset_fsb;
1257 int error;
1258 xfs_fsblock_t firstfsb;
1259 xfs_bmap_free_t free_list;
1260 xfs_bmbt_irec_t imap;
1261 xfs_off_t ioffset;
1262 xfs_extlen_t mod=0;
1263 xfs_mount_t *mp;
1264 int nimap;
1265 uint resblks;
1266 xfs_off_t rounding;
1267 int rt;
1268 xfs_fileoff_t startoffset_fsb;
1269 xfs_trans_t *tp;
1270 int need_iolock = 1;
1271
1272 mp = ip->i_mount;
1273
1274 trace_xfs_free_file_space(ip);
1275
1276 error = xfs_qm_dqattach(ip, 0);
1277 if (error)
1278 return error;
1279
1280 error = 0;
1281 if (len <= 0) /* if nothing being freed */
1282 return error;
1283 rt = XFS_IS_REALTIME_INODE(ip);
1284 startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1285 endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
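
	/*
	 * Worked example (illustrative): with 4k blocks, offset = 6000 and
	 * len = 10000, startoffset_fsb rounds up to block 2 and
	 * endoffset_fsb truncates to block 3, so only block 2 is unmapped
	 * below while the partial byte ranges [6000, 8191] and
	 * [12288, 15999] are zeroed by xfs_zero_remaining_bytes().
	 */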
1286
1287 if (attr_flags & XFS_ATTR_NOLOCK)
1288 need_iolock = 0;
1289 if (need_iolock) {
1290 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1291 /* wait for the completion of any pending DIOs */
1292 inode_dio_wait(VFS_I(ip));
1293 }
1294
1295 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1296 ioffset = offset & ~(rounding - 1);
1297 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1298 ioffset, -1);
1299 if (error)
1300 goto out_unlock_iolock;
1301 truncate_pagecache_range(VFS_I(ip), ioffset, -1);
1302
1303 /*
1304 * Need to zero the stuff we're not freeing, on disk.
1305 * If it's a realtime file & can't use unwritten extents then we
1306 * actually need to zero the extent edges. Otherwise xfs_bunmapi
1307 * will take care of it for us.
1308 */
1309 if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1310 nimap = 1;
1311 error = xfs_bmapi_read(ip, startoffset_fsb, 1,
1312 &imap, &nimap, 0);
1313 if (error)
1314 goto out_unlock_iolock;
1315 ASSERT(nimap == 0 || nimap == 1);
1316 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1317 xfs_daddr_t block;
1318
1319 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1320 block = imap.br_startblock;
1321 mod = do_div(block, mp->m_sb.sb_rextsize);
1322 if (mod)
1323 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1324 }
1325 nimap = 1;
1326 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
1327 &imap, &nimap, 0);
1328 if (error)
1329 goto out_unlock_iolock;
1330 ASSERT(nimap == 0 || nimap == 1);
1331 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1332 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1333 mod++;
1334 if (mod && (mod != mp->m_sb.sb_rextsize))
1335 endoffset_fsb -= mod;
1336 }
1337 }
1338 if ((done = (endoffset_fsb <= startoffset_fsb)))
1339 /*
1340 * One contiguous piece to clear
1341 */
1342 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
1343 else {
1344 /*
1345 * Some full blocks, possibly two pieces to clear
1346 */
1347 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
1348 error = xfs_zero_remaining_bytes(ip, offset,
1349 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
1350 if (!error &&
1351 XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
1352 error = xfs_zero_remaining_bytes(ip,
1353 XFS_FSB_TO_B(mp, endoffset_fsb),
1354 offset + len - 1);
1355 }
1356
1357 /*
1358 * free file space until done or until there is an error
1359 */
1360 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1361 while (!error && !done) {
1362
1363 /*
1364 * allocate and setup the transaction. Allow this
1365 * transaction to dip into the reserve blocks to ensure
1366 * the freeing of the space succeeds at ENOSPC.
1367 */
1368 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1369 tp->t_flags |= XFS_TRANS_RESERVE;
1370 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
1371
1372 /*
1373 * check for running out of space
1374 */
1375 if (error) {
1376 /*
1377 * Free the transaction structure.
1378 */
1379 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1380 xfs_trans_cancel(tp, 0);
1381 break;
1382 }
1383 xfs_ilock(ip, XFS_ILOCK_EXCL);
1384 error = xfs_trans_reserve_quota(tp, mp,
1385 ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
1386 resblks, 0, XFS_QMOPT_RES_REGBLKS);
1387 if (error)
1388 goto error1;
1389
1390 xfs_trans_ijoin(tp, ip, 0);
1391
1392 /*
1393 * issue the bunmapi() call to free the blocks
1394 */
1395 xfs_bmap_init(&free_list, &firstfsb);
1396 error = xfs_bunmapi(tp, ip, startoffset_fsb,
1397 endoffset_fsb - startoffset_fsb,
1398 0, 2, &firstfsb, &free_list, &done);
1399 if (error) {
1400 goto error0;
1401 }
1402
1403 /*
1404 * complete the transaction
1405 */
1406 error = xfs_bmap_finish(&tp, &free_list, &committed);
1407 if (error) {
1408 goto error0;
1409 }
1410
1411 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1412 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1413 }
1414
1415 out_unlock_iolock:
1416 if (need_iolock)
1417 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1418 return error;
1419
1420 error0:
1421 xfs_bmap_cancel(&free_list);
1422 error1:
1423 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1424 xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
1425 XFS_ILOCK_EXCL);
1426 return error;
1427}
1428
1429
1430STATIC int
1431xfs_zero_file_space(
1432 struct xfs_inode *ip,
1433 xfs_off_t offset,
1434 xfs_off_t len,
1435 int attr_flags)
1436{
1437 struct xfs_mount *mp = ip->i_mount;
1438 uint granularity;
1439 xfs_off_t start_boundary;
1440 xfs_off_t end_boundary;
1441 int error;
1442
1443 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1444
1445 /*
1446 * Round the range of extents we are going to convert inwards. If the
1447 * offset is aligned, then it doesn't get changed, so we zero from the
1448 * start of the block the offset points to.
1449 */
1450 start_boundary = round_up(offset, granularity);
1451 end_boundary = round_down(offset + len, granularity);
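
	/*
	 * Worked example (illustrative): with a 4k granularity, offset = 1000
	 * and len = 10000, start_boundary = 4096 and end_boundary = 8192, so
	 * blocks [4096, 8191] are converted to unwritten below while the
	 * edge ranges [1000, 4095] and [8192, 10999] are zeroed directly
	 * with xfs_iozero().
	 */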
1452
1453 ASSERT(start_boundary >= offset);
1454 ASSERT(end_boundary <= offset + len);
1455
1456 if (!(attr_flags & XFS_ATTR_NOLOCK))
1457 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1458
1459 if (start_boundary < end_boundary - 1) {
1460 /* punch out the page cache over the conversion range */
1461 truncate_pagecache_range(VFS_I(ip), start_boundary,
1462 end_boundary - 1);
1463 /* convert the blocks */
1464 error = xfs_alloc_file_space(ip, start_boundary,
1465 end_boundary - start_boundary - 1,
1466 XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
1467 attr_flags);
1468 if (error)
1469 goto out_unlock;
1470
1471 /* We've handled the interior of the range, now for the edges */
1472 if (start_boundary != offset)
1473 error = xfs_iozero(ip, offset, start_boundary - offset);
1474 if (error)
1475 goto out_unlock;
1476
1477 if (end_boundary != offset + len)
1478 error = xfs_iozero(ip, end_boundary,
1479 offset + len - end_boundary);
1480
1481 } else {
1482 /*
1483 * It's either a sub-granularity range or the range spans parts
1484 * of two adjacent blocks.
1485 */
1486 error = xfs_iozero(ip, offset, len);
1487 }
1488
1489out_unlock:
1490 if (!(attr_flags & XFS_ATTR_NOLOCK))
1491 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1492 return error;
1493
1494}
1495
1496/*
1497 * xfs_change_file_space()
1498 * This routine allocates or frees disk space for the given file.
1499 * The user specified parameters are checked for alignment and size
1500 * limitations.
1501 *
1502 * RETURNS:
1503 * 0 on success
1504 * errno on error
1505 *
1506 */
1507int
1508xfs_change_file_space(
1509 xfs_inode_t *ip,
1510 int cmd,
1511 xfs_flock64_t *bf,
1512 xfs_off_t offset,
1513 int attr_flags)
1514{
1515 xfs_mount_t *mp = ip->i_mount;
1516 int clrprealloc;
1517 int error;
1518 xfs_fsize_t fsize;
1519 int setprealloc;
1520 xfs_off_t startoffset;
1521 xfs_trans_t *tp;
1522 struct iattr iattr;
1523
1524 if (!S_ISREG(ip->i_d.di_mode))
1525 return XFS_ERROR(EINVAL);
1526
1527 switch (bf->l_whence) {
1528 case 0: /*SEEK_SET*/
1529 break;
1530 case 1: /*SEEK_CUR*/
1531 bf->l_start += offset;
1532 break;
1533 case 2: /*SEEK_END*/
1534 bf->l_start += XFS_ISIZE(ip);
1535 break;
1536 default:
1537 return XFS_ERROR(EINVAL);
1538 }
1539
1540 /*
1541 * length of <= 0 for resv/unresv/zero is invalid. length for
1542 * alloc/free is ignored completely and we have no idea what userspace
1543 * might have set it to, so set it to zero to allow range
1544 * checks to pass.
1545 */
1546 switch (cmd) {
1547 case XFS_IOC_ZERO_RANGE:
1548 case XFS_IOC_RESVSP:
1549 case XFS_IOC_RESVSP64:
1550 case XFS_IOC_UNRESVSP:
1551 case XFS_IOC_UNRESVSP64:
1552 if (bf->l_len <= 0)
1553 return XFS_ERROR(EINVAL);
1554 break;
1555 default:
1556 bf->l_len = 0;
1557 break;
1558 }
1559
1560 if (bf->l_start < 0 ||
1561 bf->l_start > mp->m_super->s_maxbytes ||
1562 bf->l_start + bf->l_len < 0 ||
1563 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
1564 return XFS_ERROR(EINVAL);
1565
1566 bf->l_whence = 0;
1567
1568 startoffset = bf->l_start;
1569 fsize = XFS_ISIZE(ip);
1570
1571 setprealloc = clrprealloc = 0;
1572 switch (cmd) {
1573 case XFS_IOC_ZERO_RANGE:
1574 error = xfs_zero_file_space(ip, startoffset, bf->l_len,
1575 attr_flags);
1576 if (error)
1577 return error;
1578 setprealloc = 1;
1579 break;
1580
1581 case XFS_IOC_RESVSP:
1582 case XFS_IOC_RESVSP64:
1583 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
1584 XFS_BMAPI_PREALLOC, attr_flags);
1585 if (error)
1586 return error;
1587 setprealloc = 1;
1588 break;
1589
1590 case XFS_IOC_UNRESVSP:
1591 case XFS_IOC_UNRESVSP64:
1592 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
1593 attr_flags)))
1594 return error;
1595 break;
1596
1597 case XFS_IOC_ALLOCSP:
1598 case XFS_IOC_ALLOCSP64:
1599 case XFS_IOC_FREESP:
1600 case XFS_IOC_FREESP64:
1601 /*
1602 * These operations actually do IO when extending the file, but
1603 * the allocation is done separately from the zeroing that is
1604 * done. This set of operations needs to be serialised against
1605 * other IO operations, such as truncate and buffered IO. We
1606 * need to take the IOLOCK here to serialise the allocation and
1607 * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
1608 * truncate, direct IO) from racing against the transient
1609 * allocated but not written state we can have here.
1610 */
1611 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1612 if (startoffset > fsize) {
1613 error = xfs_alloc_file_space(ip, fsize,
1614 startoffset - fsize, 0,
1615 attr_flags | XFS_ATTR_NOLOCK);
1616 if (error) {
1617 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1618 break;
1619 }
1620 }
1621
1622 iattr.ia_valid = ATTR_SIZE;
1623 iattr.ia_size = startoffset;
1624
1625 error = xfs_setattr_size(ip, &iattr,
1626 attr_flags | XFS_ATTR_NOLOCK);
1627 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1628
1629 if (error)
1630 return error;
1631
1632 clrprealloc = 1;
1633 break;
1634
1635 default:
1636 ASSERT(0);
1637 return XFS_ERROR(EINVAL);
1638 }
1639
1640 /*
1641 * update the inode timestamp, mode, and prealloc flag bits
1642 */
1643 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
1644 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
1645 if (error) {
1646 xfs_trans_cancel(tp, 0);
1647 return error;
1648 }
1649
1650 xfs_ilock(ip, XFS_ILOCK_EXCL);
1651 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1652
1653 if ((attr_flags & XFS_ATTR_DMI) == 0) {
1654 ip->i_d.di_mode &= ~S_ISUID;
1655
1656 /*
1657 * Note that we don't have to worry about mandatory
1658 * file locking being disabled here because we only
1659 * clear the S_ISGID bit if the Group execute bit is
1660 * on, but if it was on then mandatory locking wouldn't
1661 * have been enabled.
1662 */
1663 if (ip->i_d.di_mode & S_IXGRP)
1664 ip->i_d.di_mode &= ~S_ISGID;
1665
1666 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1667 }
1668 if (setprealloc)
1669 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
1670 else if (clrprealloc)
1671 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
1672
1673 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1674 if (attr_flags & XFS_ATTR_SYNC)
1675 xfs_trans_set_sync(tp);
1676 return xfs_trans_commit(tp, 0);
1677}
1678
1679/*
1680 * We need to check that the format of the data fork in the temporary inode is
1681 * valid for the target inode before doing the swap. This is not a problem with
1682 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
1683 * data fork depending on the space the attribute fork is taking so we can get
1684 * invalid formats on the target inode.
1685 *
1686 * E.g. target has space for 7 extents in extent format, temp inode only has
1687 * space for 6. If we defragment down to 7 extents, then the tmp format is a
1688 * btree, but when swapped it needs to be in extent format. Hence we can't just
1689 * blindly swap data forks on attr2 filesystems.
1690 *
1691 * Note that we check the swap in both directions so that we don't end up with
1692 * a corrupt temporary inode, either.
1693 *
1694 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
1695 * inode will prevent this situation from occurring, so all we do here is
1696 * reject and log the attempt. Basically we are putting the responsibility on
1697 * userspace to get this right.
1698 */
1699static int
1700xfs_swap_extents_check_format(
1701 xfs_inode_t *ip, /* target inode */
1702 xfs_inode_t *tip) /* tmp inode */
1703{
1704
1705 /* Should never get a local format */
1706 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
1707 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
1708 return EINVAL;
1709
1710 /*
1711 * if the target inode has fewer extents than the temporary inode,
1712 * why did userspace call us?
1713 */
1714 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
1715 return EINVAL;
1716
1717 /*
1718 * if the target inode is in extent form and the temp inode is in btree
1719 * form then we will end up with the target inode in the wrong format
1720 * as we already know there are fewer extents in the temp inode.
1721 */
1722 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1723 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1724 return EINVAL;
1725
1726 /* Check temp in extent form to max in target */
1727 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1728 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
1729 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1730 return EINVAL;
1731
1732 /* Check target in extent form to max in temp */
1733 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1734 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
1735 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1736 return EINVAL;
1737
1738 /*
1739 * If we are in a btree format, check that the temp root block will fit
1740 * in the target and that it has enough extents to be in btree format
1741 * in the target.
1742 *
1743 * Note that we have to be careful to allow btree->extent conversions
1744 * (a common defrag case) which will occur when the temp inode is in
1745 * extent format...
1746 */
1747 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1748 if (XFS_IFORK_BOFF(ip) &&
1749 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
1750 return EINVAL;
1751 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
1752 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1753 return EINVAL;
1754 }
1755
1756 /* Reciprocal target->temp btree format checks */
1757 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1758 if (XFS_IFORK_BOFF(tip) &&
1759 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
1760 return EINVAL;
1761 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
1762 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1763 return EINVAL;
1764 }
1765
1766 return 0;
1767}
1768
1769int
1770xfs_swap_extents(
1771 xfs_inode_t *ip, /* target inode */
1772 xfs_inode_t *tip, /* tmp inode */
1773 xfs_swapext_t *sxp)
1774{
1775 xfs_mount_t *mp = ip->i_mount;
1776 xfs_trans_t *tp;
1777 xfs_bstat_t *sbp = &sxp->sx_stat;
1778 xfs_ifork_t *tempifp, *ifp, *tifp;
1779 int src_log_flags, target_log_flags;
1780 int error = 0;
1781 int aforkblks = 0;
1782 int taforkblks = 0;
1783 __uint64_t tmp;
1784
1785 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
1786 if (!tempifp) {
1787 error = XFS_ERROR(ENOMEM);
1788 goto out;
1789 }
1790
1791 /*
1792 * we have to do two separate lock calls here to keep lockdep
1793 * happy. If we try to get all the locks in one call, lockdep will
1794 * report false positives when we drop the ILOCK and regain them
1795 * below.
1796 */
1797 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1798 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1799
1800 /* Verify that both files have the same format */
1801 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
1802 error = XFS_ERROR(EINVAL);
1803 goto out_unlock;
1804 }
1805
1806 /* Verify both files are either real-time or non-realtime */
1807 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1808 error = XFS_ERROR(EINVAL);
1809 goto out_unlock;
1810 }
1811
1812 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
1813 if (error)
1814 goto out_unlock;
1815 truncate_pagecache_range(VFS_I(tip), 0, -1);
1816
1817 /* Verify O_DIRECT for ftmp */
1818 if (VN_CACHED(VFS_I(tip)) != 0) {
1819 error = XFS_ERROR(EINVAL);
1820 goto out_unlock;
1821 }
1822
1823 /* Verify all data are being swapped */
1824 if (sxp->sx_offset != 0 ||
1825 sxp->sx_length != ip->i_d.di_size ||
1826 sxp->sx_length != tip->i_d.di_size) {
1827 error = XFS_ERROR(EFAULT);
1828 goto out_unlock;
1829 }
1830
1831 trace_xfs_swap_extent_before(ip, 0);
1832 trace_xfs_swap_extent_before(tip, 1);
1833
1834 /* check inode formats now that data is flushed */
1835 error = xfs_swap_extents_check_format(ip, tip);
1836 if (error) {
1837 xfs_notice(mp,
1838 "%s: inode 0x%llx format is incompatible for exchanging.",
1839 __func__, ip->i_ino);
1840 goto out_unlock;
1841 }
1842
1843 /*
1844 * Compare the current change & modify times with that
1845 * passed in. If they differ, we abort this swap.
1846 * This is the mechanism used to ensure the calling
1847 * process that the file was not changed out from
1848 * under it.
1849 */
1850 if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
1851 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
1852 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
1853 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
1854 error = XFS_ERROR(EBUSY);
1855 goto out_unlock;
1856 }
1857
1858 /* We need to fail if the file is memory mapped. Once we have tossed
1859 * all existing pages, the page fault will have no option
1860 * but to go to the filesystem for pages. Because the page fault calls
1861 * vop_read (or write in the case of autogrow), it blocks on the iolock
1862 * until we have switched the extents.
1863 */
1864 if (VN_MAPPED(VFS_I(ip))) {
1865 error = XFS_ERROR(EBUSY);
1866 goto out_unlock;
1867 }
1868
1869 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1870 xfs_iunlock(tip, XFS_ILOCK_EXCL);
1871
1872 /*
1873 * There is a race condition here since we gave up the
1874 * ilock. However, the data fork will not change since
1875 * we have the iolock (locked for truncation too) so we
1876 * are safe. We don't really care if non-io related
1877 * fields change.
1878 */
1879 truncate_pagecache_range(VFS_I(ip), 0, -1);
1880
1881 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
1882 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1883 if (error) {
1884 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1885 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
1886 xfs_trans_cancel(tp, 0);
1887 goto out;
1888 }
1889 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1890
1891 /*
1892 * Count the number of extended attribute blocks
1893 */
1894 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
1895 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1896 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
1897 if (error)
1898 goto out_trans_cancel;
1899 }
1900 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
1901 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1902 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
1903 &taforkblks);
1904 if (error)
1905 goto out_trans_cancel;
1906 }
1907
1908 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1909 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1910
1911 /*
1912 * Before we've swapped the forks, let's set the owners of the forks
1913 * appropriately. We have to do this as we are demand paging the btree
1914 * buffers, and so the validation done on read will expect the owner
1915 * field to be correctly set. Once we change the owners, we can swap the
1916 * inode forks.
1917 *
1918 * Note the trickiness in setting the log flags - we set the owner log
1919 * flag on the opposite inode (i.e. the inode we are setting the new
1920 * owner to be) because once we swap the forks and log that, log
1921 * recovery is going to see the fork as owned by the swapped inode,
1922 * not the pre-swapped inodes.
1923 */
1924 src_log_flags = XFS_ILOG_CORE;
1925 target_log_flags = XFS_ILOG_CORE;
1926 if (ip->i_d.di_version == 3 &&
1927 ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1928 target_log_flags |= XFS_ILOG_DOWNER;
1929 error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
1930 tip->i_ino, NULL);
1931 if (error)
1932 goto out_trans_cancel;
1933 }
1934
1935 if (tip->i_d.di_version == 3 &&
1936 tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1937 src_log_flags |= XFS_ILOG_DOWNER;
1938 error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
1939 ip->i_ino, NULL);
1940 if (error)
1941 goto out_trans_cancel;
1942 }
1943
1944 /*
1945 * Swap the data forks of the inodes
1946 */
1947 ifp = &ip->i_df;
1948 tifp = &tip->i_df;
1949 *tempifp = *ifp; /* struct copy */
1950 *ifp = *tifp; /* struct copy */
1951 *tifp = *tempifp; /* struct copy */
1952
1953 /*
1954 * Fix the on-disk inode values
1955 */
1956 tmp = (__uint64_t)ip->i_d.di_nblocks;
1957 ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
1958 tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
1959
1960 tmp = (__uint64_t) ip->i_d.di_nextents;
1961 ip->i_d.di_nextents = tip->i_d.di_nextents;
1962 tip->i_d.di_nextents = tmp;
1963
1964 tmp = (__uint64_t) ip->i_d.di_format;
1965 ip->i_d.di_format = tip->i_d.di_format;
1966 tip->i_d.di_format = tmp;
1967
1968 /*
1969 * The extents in the source inode could still contain speculative
1970 * preallocation beyond EOF (e.g. the file is open but not modified
1971 * while defrag is in progress). In that case, we need to copy over the
1972 * number of delalloc blocks the data fork in the source inode is
1973 * tracking beyond EOF so that when the fork is truncated away when the
1974 * temporary inode is unlinked we don't underrun the i_delayed_blks
1975 * counter on that inode.
1976 */
1977 ASSERT(tip->i_delayed_blks == 0);
1978 tip->i_delayed_blks = ip->i_delayed_blks;
1979 ip->i_delayed_blks = 0;
1980
1981 switch (ip->i_d.di_format) {
1982 case XFS_DINODE_FMT_EXTENTS:
1983 /* If the extents fit in the inode, fix the
1984 * pointer. Otherwise it's already NULL or
1985 * pointing to the extent.
1986 */
1987 if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1988 ifp->if_u1.if_extents =
1989 ifp->if_u2.if_inline_ext;
1990 }
1991 src_log_flags |= XFS_ILOG_DEXT;
1992 break;
1993 case XFS_DINODE_FMT_BTREE:
1994 ASSERT(ip->i_d.di_version < 3 ||
1995 (src_log_flags & XFS_ILOG_DOWNER));
1996 src_log_flags |= XFS_ILOG_DBROOT;
1997 break;
1998 }
1999
2000 switch (tip->i_d.di_format) {
2001 case XFS_DINODE_FMT_EXTENTS:
2002 /* If the extents fit in the inode, fix the
2003 * pointer. Otherwise it's already NULL or
2004 * pointing to the extent.
2005 */
2006 if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
2007 tifp->if_u1.if_extents =
2008 tifp->if_u2.if_inline_ext;
2009 }
2010 target_log_flags |= XFS_ILOG_DEXT;
2011 break;
2012 case XFS_DINODE_FMT_BTREE:
2013 target_log_flags |= XFS_ILOG_DBROOT;
2014 ASSERT(tip->i_d.di_version < 3 ||
2015 (target_log_flags & XFS_ILOG_DOWNER));
2016 break;
2017 }
2018
2019 xfs_trans_log_inode(tp, ip, src_log_flags);
2020 xfs_trans_log_inode(tp, tip, target_log_flags);
2021
2022 /*
2023 * If this is a synchronous mount, make sure that the
2024 * transaction goes to disk before returning to the user.
2025 */
2026 if (mp->m_flags & XFS_MOUNT_WSYNC)
2027 xfs_trans_set_sync(tp);
2028
2029 error = xfs_trans_commit(tp, 0);
2030
2031 trace_xfs_swap_extent_after(ip, 0);
2032 trace_xfs_swap_extent_after(tip, 1);
2033out:
2034 kmem_free(tempifp);
2035 return error;
2036
2037out_unlock:
2038 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
2039 xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
2040 goto out;
2041
2042out_trans_cancel:
2043 xfs_trans_cancel(tp, 0);
2044 goto out_unlock;
2045}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
new file mode 100644
index 000000000000..061260946f7a
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.h
@@ -0,0 +1,110 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_BMAP_UTIL_H__
19#define __XFS_BMAP_UTIL_H__
20
21/* Kernel only BMAP related definitions and functions */
22
23struct xfs_bmbt_irec;
24struct xfs_bmap_free_item;
25struct xfs_ifork;
26struct xfs_inode;
27struct xfs_mount;
28struct xfs_trans;
29
30/*
31 * Argument structure for xfs_bmap_alloc.
32 */
33struct xfs_bmalloca {
34 xfs_fsblock_t *firstblock; /* i/o first block allocated */
35 struct xfs_bmap_free *flist; /* bmap freelist */
36 struct xfs_trans *tp; /* transaction pointer */
37 struct xfs_inode *ip; /* incore inode pointer */
38 struct xfs_bmbt_irec prev; /* extent before the new one */
39 struct xfs_bmbt_irec got; /* extent after, or delayed */
40
41 xfs_fileoff_t offset; /* offset in file filling in */
42 xfs_extlen_t length; /* i/o length asked/allocated */
43 xfs_fsblock_t blkno; /* starting block of new extent */
44
45 struct xfs_btree_cur *cur; /* btree cursor */
46 xfs_extnum_t idx; /* current extent index */
47 int nallocs;/* number of extents alloc'd */
48 int logflags;/* flags for transaction logging */
49
50 xfs_extlen_t total; /* total blocks needed for xaction */
51 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
52 xfs_extlen_t minleft; /* amount must be left after alloc */
53 char eof; /* set if allocating past last extent */
54 char wasdel; /* replacing a delayed allocation */
55 char userdata;/* set if is user data */
56 char aeof; /* allocated space at eof */
57 char conv; /* overwriting unwritten extents */
58 char stack_switch;
59 int flags;
60 struct completion *done;
61 struct work_struct work;
62 int result;
63};
64
65int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
66 int *committed);
67int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
68int xfs_bmapi_allocate(struct xfs_bmalloca *args);
69int __xfs_bmapi_allocate(struct xfs_bmalloca *args);
70int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
71 int whichfork, int *eof);
72int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
73 int whichfork, int *count);
74int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
75 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
76
77/* bmap to userspace formatter - copy to user & advance pointer */
78typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
79int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
80 xfs_bmap_format_t formatter, void *arg);
81
82/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
83void xfs_bmap_del_free(struct xfs_bmap_free *flist,
84 struct xfs_bmap_free_item *prev,
85 struct xfs_bmap_free_item *free);
86int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
87 struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
88 int rt, int eof, int delay, int convert,
89 xfs_fileoff_t *offp, xfs_extlen_t *lenp);
90void xfs_bmap_adjacent(struct xfs_bmalloca *ap);
91int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
92 int whichfork, struct xfs_bmbt_irec *rec,
93 int *is_empty);
94
95/* preallocation and hole punch interface */
96int xfs_change_file_space(struct xfs_inode *ip, int cmd,
97 xfs_flock64_t *bf, xfs_off_t offset,
98 int attr_flags);
99
100/* EOF block manipulation functions */
101bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
102int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
103 bool need_iolock);
104
105int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
106 struct xfs_swapext *sx);
107
108xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
109
110#endif /* __XFS_BMAP_UTIL_H__ */
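
As a rough sketch of how the argument structure above is consumed (illustrative only: tp, ip and the *_fsb values are assumed to be set up by the caller, and real callers also wire up firstblock and flist for the allocation), a request might be filled in and handed to xfs_bmapi_allocate():

	struct xfs_bmalloca args = {
		.tp	  = tp,		/* active transaction */
		.ip	  = ip,		/* incore inode being mapped */
		.offset	  = offset_fsb,	/* file offset to fill in */
		.length	  = len_fsb,	/* blocks asked for */
		.total	  = total_blks,	/* total blocks needed for xaction */
		.minlen	  = 1,		/* accept a single-block allocation */
		.userdata = 1,		/* this is file data, not metadata */
	};
	int error = xfs_bmapi_allocate(&args);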
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 0903960410a2..5690e102243d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -510,7 +510,7 @@ xfs_btree_ptr_addr(
510} 510}
511 511
512/* 512/*
513 * Get a the root block which is stored in the inode. 513 * Get the root block which is stored in the inode.
514 * 514 *
515 * For now this btree implementation assumes the btree root is always 515 * For now this btree implementation assumes the btree root is always
516 * stored in the if_broot field of an inode fork. 516 * stored in the if_broot field of an inode fork.
@@ -855,6 +855,41 @@ xfs_btree_readahead(
855 return xfs_btree_readahead_sblock(cur, lr, block); 855 return xfs_btree_readahead_sblock(cur, lr, block);
856} 856}
857 857
858STATIC xfs_daddr_t
859xfs_btree_ptr_to_daddr(
860 struct xfs_btree_cur *cur,
861 union xfs_btree_ptr *ptr)
862{
863 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
864 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
865
866 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
867 } else {
868 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
869 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
870
871 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
872 be32_to_cpu(ptr->s));
873 }
874}
875
876/*
877 * Readahead @count btree blocks at the given @ptr location.
878 *
879 * We don't need to care about long or short form btrees here as we have a
880 * method of converting the ptr directly to a daddr available to us.
881 */
882STATIC void
883xfs_btree_readahead_ptr(
884 struct xfs_btree_cur *cur,
885 union xfs_btree_ptr *ptr,
886 xfs_extlen_t count)
887{
888 xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
889 xfs_btree_ptr_to_daddr(cur, ptr),
890 cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
891}
892
858/* 893/*
859 * Set the buffer for level "lev" in the cursor to bp, releasing 894 * Set the buffer for level "lev" in the cursor to bp, releasing
860 * any previous buffer. 895 * any previous buffer.
@@ -978,6 +1013,7 @@ xfs_btree_init_block_int(
978 buf->bb_u.l.bb_owner = cpu_to_be64(owner); 1013 buf->bb_u.l.bb_owner = cpu_to_be64(owner);
979 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); 1014 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
980 buf->bb_u.l.bb_pad = 0; 1015 buf->bb_u.l.bb_pad = 0;
1016 buf->bb_u.l.bb_lsn = 0;
981 } 1017 }
982 } else { 1018 } else {
983 /* owner is a 32 bit value on short blocks */ 1019 /* owner is a 32 bit value on short blocks */
@@ -989,6 +1025,7 @@ xfs_btree_init_block_int(
989 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); 1025 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
990 buf->bb_u.s.bb_owner = cpu_to_be32(__owner); 1026 buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
991 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); 1027 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
1028 buf->bb_u.s.bb_lsn = 0;
992 } 1029 }
993 } 1030 }
994} 1031}
@@ -1071,24 +1108,6 @@ xfs_btree_buf_to_ptr(
1071 } 1108 }
1072} 1109}
1073 1110
1074STATIC xfs_daddr_t
1075xfs_btree_ptr_to_daddr(
1076 struct xfs_btree_cur *cur,
1077 union xfs_btree_ptr *ptr)
1078{
1079 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1080 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
1081
1082 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
1083 } else {
1084 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
1085 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
1086
1087 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
1088 be32_to_cpu(ptr->s));
1089 }
1090}
1091
1092STATIC void 1111STATIC void
1093xfs_btree_set_refs( 1112xfs_btree_set_refs(
1094 struct xfs_btree_cur *cur, 1113 struct xfs_btree_cur *cur,
@@ -1684,7 +1703,7 @@ xfs_lookup_get_search_key(
1684 1703
1685/* 1704/*
1686 * Lookup the record. The cursor is made to point to it, based on dir. 1705 * Lookup the record. The cursor is made to point to it, based on dir.
1687 * Return 0 if can't find any such record, 1 for success. 1706 * stat is set to 0 if can't find any such record, 1 for success.
1688 */ 1707 */
1689int /* error */ 1708int /* error */
1690xfs_btree_lookup( 1709xfs_btree_lookup(
@@ -2756,7 +2775,6 @@ xfs_btree_make_block_unfull(
2756 2775
2757 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 2776 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2758 /* A root block that can be made bigger. */ 2777 /* A root block that can be made bigger. */
2759
2760 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); 2778 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2761 } else { 2779 } else {
2762 /* A root block that needs replacing */ 2780 /* A root block that needs replacing */
@@ -3868,3 +3886,120 @@ xfs_btree_get_rec(
3868 *stat = 1; 3886 *stat = 1;
3869 return 0; 3887 return 0;
3870} 3888}
3889
3890/*
3891 * Change the owner of a btree.
3892 *
3893 * The mechanism we use here is ordered buffer logging. Because we don't know
3894 * how many buffers we are going to need to modify, we don't really want to
3895 * have to make transaction reservations for the worst case of every buffer in a
3896 * full size btree, as that may be more space than we can fit in the log....
3897 *
3898 * We do the btree walk in the most efficient manner possible - we have sibling
3899 * pointers so we can just walk all the blocks on each level from left to right
3900 * in a single pass, and then move to the next level and do the same. We can
3901 * also do readahead on the sibling pointers to get IO moving more quickly,
3902 * though for slow disks this is unlikely to make much difference to performance
3903 * as the amount of CPU work we have to do before moving to the next block is
3904 * relatively small.
3905 *
3906 * For each btree block that we load, modify the owner appropriately, set the
3907 * buffer as an ordered buffer and log it appropriately. We need to ensure that
3908 * we mark the region we change dirty so that if the buffer is relogged in
3909 * a subsequent transaction the changes we make here as an ordered buffer are
3910 * correctly relogged in that transaction. If we are in recovery context, then
3911 * just queue the modified buffer as delayed write buffer so the transaction
3912 * recovery completion writes the changes to disk.
3913 */
3914static int
3915xfs_btree_block_change_owner(
3916 struct xfs_btree_cur *cur,
3917 int level,
3918 __uint64_t new_owner,
3919 struct list_head *buffer_list)
3920{
3921 struct xfs_btree_block *block;
3922 struct xfs_buf *bp;
3923 union xfs_btree_ptr rptr;
3924
3925 /* do right sibling readahead */
3926 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
3927
3928 /* modify the owner */
3929 block = xfs_btree_get_block(cur, level, &bp);
3930 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3931 block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
3932 else
3933 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3934
3935 /*
3936 * If the block is a root block hosted in an inode, we might not have a
3937 * buffer pointer here and we shouldn't attempt to log the change as the
3938 * information is already held in the inode and discarded when the root
3939 * block is formatted into the on-disk inode fork. We still change it,
3940 * though, so everything is consistent in memory.
3941 */
3942 if (bp) {
3943 if (cur->bc_tp) {
3944 xfs_trans_ordered_buf(cur->bc_tp, bp);
3945 xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
3946 } else {
3947 xfs_buf_delwri_queue(bp, buffer_list);
3948 }
3949 } else {
3950 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3951 ASSERT(level == cur->bc_nlevels - 1);
3952 }
3953
3954 /* now read the right sibling block for the next iteration */
3955 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3956 if (xfs_btree_ptr_is_null(cur, &rptr))
3957 return ENOENT;
3958
3959 return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
3960}
3961
3962int
3963xfs_btree_change_owner(
3964 struct xfs_btree_cur *cur,
3965 __uint64_t new_owner,
3966 struct list_head *buffer_list)
3967{
3968 union xfs_btree_ptr lptr;
3969 int level;
3970 struct xfs_btree_block *block = NULL;
3971 int error = 0;
3972
3973 cur->bc_ops->init_ptr_from_cur(cur, &lptr);
3974
3975 /* for each level */
3976 for (level = cur->bc_nlevels - 1; level >= 0; level--) {
3977 /* grab the left hand block */
3978 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
3979 if (error)
3980 return error;
3981
3982 /* readahead the left most block for the next level down */
3983 if (level > 0) {
3984 union xfs_btree_ptr *ptr;
3985
3986 ptr = xfs_btree_ptr_addr(cur, 1, block);
3987 xfs_btree_readahead_ptr(cur, ptr, 1);
3988
3989 /* save for the next iteration of the loop */
3990 lptr = *ptr;
3991 }
3992
3993 /* for each buffer in the level */
3994 do {
3995 error = xfs_btree_block_change_owner(cur, level,
3996 new_owner,
3997 buffer_list);
3998 } while (!error);
3999
4000 if (error != ENOENT)
4001 return error;
4002 }
4003
4004 return 0;
4005}
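
A sketch of how this interface is driven (cursor construction and error handling elided; cur and new_owner are assumed to be in scope):

	LIST_HEAD(buffer_list);
	int error;

	/* rewrite the owner field of every block in the btree */
	error = xfs_btree_change_owner(cur, new_owner, &buffer_list);

	/*
	 * Outside a transaction (i.e. log recovery), the modified buffers
	 * were queued for delayed write above; submit them to disk now.
	 */
	if (!error)
		error = xfs_buf_delwri_submit(&buffer_list);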
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 55e3c7cc3c3d..06729b67ad58 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -88,13 +88,11 @@ struct xfs_btree_block {
88#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) 88#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
89#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) 89#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
90 90
91
92#define XFS_BTREE_SBLOCK_CRC_OFF \ 91#define XFS_BTREE_SBLOCK_CRC_OFF \
93 offsetof(struct xfs_btree_block, bb_u.s.bb_crc) 92 offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
94#define XFS_BTREE_LBLOCK_CRC_OFF \ 93#define XFS_BTREE_LBLOCK_CRC_OFF \
95 offsetof(struct xfs_btree_block, bb_u.l.bb_crc) 94 offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
96 95
97
98/* 96/*
99 * Generic key, ptr and record wrapper structures. 97 * Generic key, ptr and record wrapper structures.
100 * 98 *
@@ -123,15 +121,18 @@ union xfs_btree_rec {
123/* 121/*
124 * For logging record fields. 122 * For logging record fields.
125 */ 123 */
126#define XFS_BB_MAGIC 0x01 124#define XFS_BB_MAGIC (1 << 0)
127#define XFS_BB_LEVEL 0x02 125#define XFS_BB_LEVEL (1 << 1)
128#define XFS_BB_NUMRECS 0x04 126#define XFS_BB_NUMRECS (1 << 2)
129#define XFS_BB_LEFTSIB 0x08 127#define XFS_BB_LEFTSIB (1 << 3)
130#define XFS_BB_RIGHTSIB 0x10 128#define XFS_BB_RIGHTSIB (1 << 4)
131#define XFS_BB_BLKNO 0x20 129#define XFS_BB_BLKNO (1 << 5)
130#define XFS_BB_LSN (1 << 6)
131#define XFS_BB_UUID (1 << 7)
132#define XFS_BB_OWNER (1 << 8)
132#define XFS_BB_NUM_BITS 5 133#define XFS_BB_NUM_BITS 5
133#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) 134#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
134#define XFS_BB_NUM_BITS_CRC 8 135#define XFS_BB_NUM_BITS_CRC 9
135#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) 136#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
136 137
137/* 138/*
@@ -444,6 +445,8 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
444int xfs_btree_insert(struct xfs_btree_cur *, int *); 445int xfs_btree_insert(struct xfs_btree_cur *, int *);
445int xfs_btree_delete(struct xfs_btree_cur *, int *); 446int xfs_btree_delete(struct xfs_btree_cur *, int *);
446int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); 447int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
448int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
449 struct list_head *buffer_list);
447 450
448/* 451/*
449 * btree block CRC helpers 452 * btree block CRC helpers
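
With the field flags redefined as single bit shifts, callers OR them together to log just the regions of the block header they touched, as the owner-change code above does. Illustrative calls within xfs_btree.c:

	/* log only the new owner field of the block */
	xfs_btree_log_block(cur, bp, XFS_BB_OWNER);

	/* or several header fields in one call */
	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);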
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1b2472a46e46..263470075ea2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -35,6 +35,7 @@
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36 36
37#include "xfs_sb.h" 37#include "xfs_sb.h"
38#include "xfs_trans_resv.h"
38#include "xfs_log.h" 39#include "xfs_log.h"
39#include "xfs_ag.h" 40#include "xfs_ag.h"
40#include "xfs_mount.h" 41#include "xfs_mount.h"
@@ -80,54 +81,6 @@ xfs_buf_vmap_len(
80} 81}
81 82
82/* 83/*
83 * xfs_buf_lru_add - add a buffer to the LRU.
84 *
85 * The LRU takes a new reference to the buffer so that it will only be freed
86 * once the shrinker takes the buffer off the LRU.
87 */
88STATIC void
89xfs_buf_lru_add(
90 struct xfs_buf *bp)
91{
92 struct xfs_buftarg *btp = bp->b_target;
93
94 spin_lock(&btp->bt_lru_lock);
95 if (list_empty(&bp->b_lru)) {
96 atomic_inc(&bp->b_hold);
97 list_add_tail(&bp->b_lru, &btp->bt_lru);
98 btp->bt_lru_nr++;
99 bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
100 }
101 spin_unlock(&btp->bt_lru_lock);
102}
103
104/*
105 * xfs_buf_lru_del - remove a buffer from the LRU
106 *
107 * The unlocked check is safe here because it only occurs when there are no
108 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. It is there
109 * to optimise the shrinker removing the buffer from the LRU and calling
110 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
111 * bt_lru_lock.
112 */
113STATIC void
114xfs_buf_lru_del(
115 struct xfs_buf *bp)
116{
117 struct xfs_buftarg *btp = bp->b_target;
118
119 if (list_empty(&bp->b_lru))
120 return;
121
122 spin_lock(&btp->bt_lru_lock);
123 if (!list_empty(&bp->b_lru)) {
124 list_del_init(&bp->b_lru);
125 btp->bt_lru_nr--;
126 }
127 spin_unlock(&btp->bt_lru_lock);
128}
129
130/*
131 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 84 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
132 * b_lru_ref count so that the buffer is freed immediately when the buffer 85 * b_lru_ref count so that the buffer is freed immediately when the buffer
133 * reference count falls to zero. If the buffer is already on the LRU, we need 86 * reference count falls to zero. If the buffer is already on the LRU, we need
@@ -150,20 +103,14 @@ xfs_buf_stale(
150 */ 103 */
151 bp->b_flags &= ~_XBF_DELWRI_Q; 104 bp->b_flags &= ~_XBF_DELWRI_Q;
152 105
153 atomic_set(&(bp)->b_lru_ref, 0); 106 spin_lock(&bp->b_lock);
154 if (!list_empty(&bp->b_lru)) { 107 atomic_set(&bp->b_lru_ref, 0);
155 struct xfs_buftarg *btp = bp->b_target; 108 if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
109 (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
110 atomic_dec(&bp->b_hold);
156 111
157 spin_lock(&btp->bt_lru_lock);
158 if (!list_empty(&bp->b_lru) &&
159 !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
160 list_del_init(&bp->b_lru);
161 btp->bt_lru_nr--;
162 atomic_dec(&bp->b_hold);
163 }
164 spin_unlock(&btp->bt_lru_lock);
165 }
166 ASSERT(atomic_read(&bp->b_hold) >= 1); 112 ASSERT(atomic_read(&bp->b_hold) >= 1);
113 spin_unlock(&bp->b_lock);
167} 114}
168 115
169static int 116static int
@@ -227,6 +174,7 @@ _xfs_buf_alloc(
227 INIT_LIST_HEAD(&bp->b_list); 174 INIT_LIST_HEAD(&bp->b_list);
228 RB_CLEAR_NODE(&bp->b_rbnode); 175 RB_CLEAR_NODE(&bp->b_rbnode);
229 sema_init(&bp->b_sema, 0); /* held, no waiters */ 176 sema_init(&bp->b_sema, 0); /* held, no waiters */
177 spin_lock_init(&bp->b_lock);
230 XB_SET_OWNER(bp); 178 XB_SET_OWNER(bp);
231 bp->b_target = target; 179 bp->b_target = target;
232 bp->b_flags = flags; 180 bp->b_flags = flags;
@@ -303,7 +251,7 @@ _xfs_buf_free_pages(
303 * Releases the specified buffer. 251 * Releases the specified buffer.
304 * 252 *
305 * The modification state of any associated pages is left unchanged. 253 * The modification state of any associated pages is left unchanged.
306 * The buffer most not be on any hash - use xfs_buf_rele instead for 254 * The buffer must not be on any hash - use xfs_buf_rele instead for
307 * hashed and refcounted buffers 255 * hashed and refcounted buffers
308 */ 256 */
309void 257void
@@ -916,12 +864,33 @@ xfs_buf_rele(
916 864
917 ASSERT(atomic_read(&bp->b_hold) > 0); 865 ASSERT(atomic_read(&bp->b_hold) > 0);
918 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { 866 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
919 if (!(bp->b_flags & XBF_STALE) && 867 spin_lock(&bp->b_lock);
920 atomic_read(&bp->b_lru_ref)) { 868 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
921 xfs_buf_lru_add(bp); 869 /*
870 * If the buffer is added to the LRU take a new
871 * reference to the buffer for the LRU and clear the
872 * (now stale) dispose list state flag
873 */
874 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
875 bp->b_state &= ~XFS_BSTATE_DISPOSE;
876 atomic_inc(&bp->b_hold);
877 }
878 spin_unlock(&bp->b_lock);
922 spin_unlock(&pag->pag_buf_lock); 879 spin_unlock(&pag->pag_buf_lock);
923 } else { 880 } else {
924 xfs_buf_lru_del(bp); 881 /*
882 * most of the time buffers will already be removed from
883 * the LRU, so optimise that case by checking for the
884 * XFS_BSTATE_DISPOSE flag, which indicates that the last list the
885 * buffer was on was the disposal list
886 */
887 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
888 list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
889 } else {
890 ASSERT(list_empty(&bp->b_lru));
891 }
892 spin_unlock(&bp->b_lock);
893
925 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 894 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
926 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 895 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
927 spin_unlock(&pag->pag_buf_lock); 896 spin_unlock(&pag->pag_buf_lock);
@@ -1501,83 +1470,121 @@ xfs_buf_iomove(
1501 * returned. These buffers will have an elevated hold count, so wait on those 1470 * returned. These buffers will have an elevated hold count, so wait on those
1502 * while freeing all the buffers only held by the LRU. 1471 * while freeing all the buffers only held by the LRU.
1503 */ 1472 */
1473static enum lru_status
1474xfs_buftarg_wait_rele(
1475 struct list_head *item,
1476 spinlock_t *lru_lock,
1477 void *arg)
1478
1479{
1480 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
1481 struct list_head *dispose = arg;
1482
1483 if (atomic_read(&bp->b_hold) > 1) {
1484 /* need to wait, so skip it this pass */
1485 trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
1486 return LRU_SKIP;
1487 }
1488 if (!spin_trylock(&bp->b_lock))
1489 return LRU_SKIP;
1490
1491 /*
1492 * clear the LRU reference count so the buffer doesn't get
1493 * ignored in xfs_buf_rele().
1494 */
1495 atomic_set(&bp->b_lru_ref, 0);
1496 bp->b_state |= XFS_BSTATE_DISPOSE;
1497 list_move(item, dispose);
1498 spin_unlock(&bp->b_lock);
1499 return LRU_REMOVED;
1500}
1501
1504void 1502void
1505xfs_wait_buftarg( 1503xfs_wait_buftarg(
1506 struct xfs_buftarg *btp) 1504 struct xfs_buftarg *btp)
1507{ 1505{
1508 struct xfs_buf *bp; 1506 LIST_HEAD(dispose);
1507 int loop = 0;
1509 1508
1510restart: 1509 /* loop until there is nothing left on the lru list. */
1511 spin_lock(&btp->bt_lru_lock); 1510 while (list_lru_count(&btp->bt_lru)) {
1512 while (!list_empty(&btp->bt_lru)) { 1511 list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
1513 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); 1512 &dispose, LONG_MAX);
1514 if (atomic_read(&bp->b_hold) > 1) { 1513
1515 trace_xfs_buf_wait_buftarg(bp, _RET_IP_); 1514 while (!list_empty(&dispose)) {
1516 list_move_tail(&bp->b_lru, &btp->bt_lru); 1515 struct xfs_buf *bp;
1517 spin_unlock(&btp->bt_lru_lock); 1516 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1518 delay(100); 1517 list_del_init(&bp->b_lru);
1519 goto restart; 1518 xfs_buf_rele(bp);
1520 } 1519 }
1521 /* 1520 if (loop++ != 0)
1522 * clear the LRU reference count so the buffer doesn't get 1521 delay(100);
1523 * ignored in xfs_buf_rele().
1524 */
1525 atomic_set(&bp->b_lru_ref, 0);
1526 spin_unlock(&btp->bt_lru_lock);
1527 xfs_buf_rele(bp);
1528 spin_lock(&btp->bt_lru_lock);
1529 } 1522 }
1530 spin_unlock(&btp->bt_lru_lock);
1531} 1523}
1532 1524
1533int 1525static enum lru_status
1534xfs_buftarg_shrink( 1526xfs_buftarg_isolate(
1527 struct list_head *item,
1528 spinlock_t *lru_lock,
1529 void *arg)
1530{
1531 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
1532 struct list_head *dispose = arg;
1533
1534 /*
1535 * we are inverting the lru_lock/bp->b_lock order here, so use a trylock.
1536 * If we fail to get the lock, just skip it.
1537 */
1538 if (!spin_trylock(&bp->b_lock))
1539 return LRU_SKIP;
1540 /*
1541 * Decrement the b_lru_ref count unless the value is already
1542 * zero. If the value is already zero, we need to reclaim the
1543 * buffer, otherwise it gets another trip through the LRU.
1544 */
1545 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1546 spin_unlock(&bp->b_lock);
1547 return LRU_ROTATE;
1548 }
1549
1550 bp->b_state |= XFS_BSTATE_DISPOSE;
1551 list_move(item, dispose);
1552 spin_unlock(&bp->b_lock);
1553 return LRU_REMOVED;
1554}
1555
1556static unsigned long
1557xfs_buftarg_shrink_scan(
1535 struct shrinker *shrink, 1558 struct shrinker *shrink,
1536 struct shrink_control *sc) 1559 struct shrink_control *sc)
1537{ 1560{
1538 struct xfs_buftarg *btp = container_of(shrink, 1561 struct xfs_buftarg *btp = container_of(shrink,
1539 struct xfs_buftarg, bt_shrinker); 1562 struct xfs_buftarg, bt_shrinker);
1540 struct xfs_buf *bp;
1541 int nr_to_scan = sc->nr_to_scan;
1542 LIST_HEAD(dispose); 1563 LIST_HEAD(dispose);
1564 unsigned long freed;
1565 unsigned long nr_to_scan = sc->nr_to_scan;
1543 1566
1544 if (!nr_to_scan) 1567 freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
1545 return btp->bt_lru_nr; 1568 &dispose, &nr_to_scan);
1546
1547 spin_lock(&btp->bt_lru_lock);
1548 while (!list_empty(&btp->bt_lru)) {
1549 if (nr_to_scan-- <= 0)
1550 break;
1551
1552 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1553
1554 /*
1555 * Decrement the b_lru_ref count unless the value is already
1556 * zero. If the value is already zero, we need to reclaim the
1557 * buffer, otherwise it gets another trip through the LRU.
1558 */
1559 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1560 list_move_tail(&bp->b_lru, &btp->bt_lru);
1561 continue;
1562 }
1563
1564 /*
1565 * remove the buffer from the LRU now to avoid needing another
1566 * lock round trip inside xfs_buf_rele().
1567 */
1568 list_move(&bp->b_lru, &dispose);
1569 btp->bt_lru_nr--;
1570 bp->b_lru_flags |= _XBF_LRU_DISPOSE;
1571 }
1572 spin_unlock(&btp->bt_lru_lock);
1573 1569
1574 while (!list_empty(&dispose)) { 1570 while (!list_empty(&dispose)) {
1571 struct xfs_buf *bp;
1575 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1572 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1576 list_del_init(&bp->b_lru); 1573 list_del_init(&bp->b_lru);
1577 xfs_buf_rele(bp); 1574 xfs_buf_rele(bp);
1578 } 1575 }
1579 1576
1580 return btp->bt_lru_nr; 1577 return freed;
1578}
1579
1580static unsigned long
1581xfs_buftarg_shrink_count(
1582 struct shrinker *shrink,
1583 struct shrink_control *sc)
1584{
1585 struct xfs_buftarg *btp = container_of(shrink,
1586 struct xfs_buftarg, bt_shrinker);
1587 return list_lru_count_node(&btp->bt_lru, sc->nid);
1581} 1588}
1582 1589
1583void 1590void
@@ -1586,6 +1593,7 @@ xfs_free_buftarg(
1586 struct xfs_buftarg *btp) 1593 struct xfs_buftarg *btp)
1587{ 1594{
1588 unregister_shrinker(&btp->bt_shrinker); 1595 unregister_shrinker(&btp->bt_shrinker);
1596 list_lru_destroy(&btp->bt_lru);
1589 1597
1590 if (mp->m_flags & XFS_MOUNT_BARRIER) 1598 if (mp->m_flags & XFS_MOUNT_BARRIER)
1591 xfs_blkdev_issue_flush(btp); 1599 xfs_blkdev_issue_flush(btp);
@@ -1621,7 +1629,7 @@ xfs_setsize_buftarg_flags(
1621/* 1629/*
1622 * When allocating the initial buffer target we have not yet 1630 * When allocating the initial buffer target we have not yet
1623 * read in the superblock, so don't know what sized sectors 1631 * read in the superblock, so don't know what sized sectors
1624 * are being used is at this early stage. Play safe. 1632 * are being used at this early stage. Play safe.
1625 */ 1633 */
1626STATIC int 1634STATIC int
1627xfs_setsize_buftarg_early( 1635xfs_setsize_buftarg_early(
@@ -1659,12 +1667,16 @@ xfs_alloc_buftarg(
1659 if (!btp->bt_bdi) 1667 if (!btp->bt_bdi)
1660 goto error; 1668 goto error;
1661 1669
1662 INIT_LIST_HEAD(&btp->bt_lru);
1663 spin_lock_init(&btp->bt_lru_lock);
1664 if (xfs_setsize_buftarg_early(btp, bdev)) 1670 if (xfs_setsize_buftarg_early(btp, bdev))
1665 goto error; 1671 goto error;
1666 btp->bt_shrinker.shrink = xfs_buftarg_shrink; 1672
1673 if (list_lru_init(&btp->bt_lru))
1674 goto error;
1675
1676 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
1677 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
1667 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1678 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1679 btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
1668 register_shrinker(&btp->bt_shrinker); 1680 register_shrinker(&btp->bt_shrinker);
1669 return btp; 1681 return btp;
1670 1682
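
The conversion follows the generic list_lru walk/isolate contract: the walker invokes an isolate callback on each item under the internal lru_lock, and the callback's return value decides the item's fate. A minimal sketch of that contract (example_isolate is a hypothetical name; the real callbacks above additionally take bp->b_lock with a trylock and manage b_lru_ref):

	static enum lru_status
	example_isolate(
		struct list_head	*item,
		spinlock_t		*lru_lock,
		void			*arg)
	{
		struct list_head	*dispose = arg;

		/* return LRU_SKIP if the item can't be handled this pass */
		/* return LRU_ROTATE to give it another trip around the LRU */

		list_move(item, dispose);	/* claim the item for disposal */
		return LRU_REMOVED;		/* tell the walker it left the LRU */
	}

	LIST_HEAD(dispose);
	list_lru_walk(&btp->bt_lru, example_isolate, &dispose, LONG_MAX);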
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 433a12ed7b17..e65683361017 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/uio.h> 27#include <linux/uio.h>
28#include <linux/list_lru.h>
28 29
29/* 30/*
30 * Base types 31 * Base types
@@ -59,7 +60,6 @@ typedef enum {
59#define _XBF_KMEM (1 << 21)/* backed by heap memory */ 60#define _XBF_KMEM (1 << 21)/* backed by heap memory */
60#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ 61#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
61#define _XBF_COMPOUND (1 << 23)/* compound buffer */ 62#define _XBF_COMPOUND (1 << 23)/* compound buffer */
62#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
63 63
64typedef unsigned int xfs_buf_flags_t; 64typedef unsigned int xfs_buf_flags_t;
65 65
@@ -78,8 +78,12 @@ typedef unsigned int xfs_buf_flags_t;
78 { _XBF_PAGES, "PAGES" }, \ 78 { _XBF_PAGES, "PAGES" }, \
79 { _XBF_KMEM, "KMEM" }, \ 79 { _XBF_KMEM, "KMEM" }, \
80 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 80 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
81 { _XBF_COMPOUND, "COMPOUND" }, \ 81 { _XBF_COMPOUND, "COMPOUND" }
82 { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } 82
83/*
84 * Internal state flags.
85 */
86#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
83 87
84typedef struct xfs_buftarg { 88typedef struct xfs_buftarg {
85 dev_t bt_dev; 89 dev_t bt_dev;
@@ -92,9 +96,7 @@ typedef struct xfs_buftarg {
92 96
93 /* LRU control structures */ 97 /* LRU control structures */
94 struct shrinker bt_shrinker; 98 struct shrinker bt_shrinker;
95 struct list_head bt_lru; 99 struct list_lru bt_lru;
96 spinlock_t bt_lru_lock;
97 unsigned int bt_lru_nr;
98} xfs_buftarg_t; 100} xfs_buftarg_t;
99 101
100struct xfs_buf; 102struct xfs_buf;
@@ -137,7 +139,8 @@ typedef struct xfs_buf {
137 * bt_lru_lock and not by b_sema 139 * bt_lru_lock and not by b_sema
138 */ 140 */
139 struct list_head b_lru; /* lru list */ 141 struct list_head b_lru; /* lru list */
140 xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ 142 spinlock_t b_lock; /* internal state lock */
143 unsigned int b_state; /* internal state flags */
141 wait_queue_head_t b_waiters; /* unpin waiters */ 144 wait_queue_head_t b_waiters; /* unpin waiters */
142 struct list_head b_list; 145 struct list_head b_list;
143 struct xfs_perag *b_pag; /* contains rbtree root */ 146 struct xfs_perag *b_pag; /* contains rbtree root */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index bfc4e0c26fd3..f1d85cfc0a54 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -39,6 +39,14 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
39 39
40STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); 40STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
41 41
42static inline int
43xfs_buf_log_format_size(
44 struct xfs_buf_log_format *blfp)
45{
46 return offsetof(struct xfs_buf_log_format, blf_data_map) +
47 (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
48}
49
42/* 50/*
43 * This returns the number of log iovecs needed to log the 51 * This returns the number of log iovecs needed to log the
44 * given buf log item. 52 * given buf log item.
@@ -49,25 +57,27 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
49 * 57 *
50 * If the XFS_BLI_STALE flag has been set, then log nothing. 58 * If the XFS_BLI_STALE flag has been set, then log nothing.
51 */ 59 */
52STATIC uint 60STATIC void
53xfs_buf_item_size_segment( 61xfs_buf_item_size_segment(
54 struct xfs_buf_log_item *bip, 62 struct xfs_buf_log_item *bip,
55 struct xfs_buf_log_format *blfp) 63 struct xfs_buf_log_format *blfp,
64 int *nvecs,
65 int *nbytes)
56{ 66{
57 struct xfs_buf *bp = bip->bli_buf; 67 struct xfs_buf *bp = bip->bli_buf;
58 uint nvecs;
59 int next_bit; 68 int next_bit;
60 int last_bit; 69 int last_bit;
61 70
62 last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); 71 last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
63 if (last_bit == -1) 72 if (last_bit == -1)
64 return 0; 73 return;
65 74
66 /* 75 /*
67 * initial count for a dirty buffer is 2 vectors - the format structure 76 * initial count for a dirty buffer is 2 vectors - the format structure
68 * and the first dirty region. 77 * and the first dirty region.
69 */ 78 */
70 nvecs = 2; 79 *nvecs += 2;
80 *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;
71 81
72 while (last_bit != -1) { 82 while (last_bit != -1) {
73 /* 83 /*
@@ -87,18 +97,17 @@ xfs_buf_item_size_segment(
87 break; 97 break;
88 } else if (next_bit != last_bit + 1) { 98 } else if (next_bit != last_bit + 1) {
89 last_bit = next_bit; 99 last_bit = next_bit;
90 nvecs++; 100 (*nvecs)++;
91 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != 101 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
92 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + 102 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
93 XFS_BLF_CHUNK)) { 103 XFS_BLF_CHUNK)) {
94 last_bit = next_bit; 104 last_bit = next_bit;
95 nvecs++; 105 (*nvecs)++;
96 } else { 106 } else {
97 last_bit++; 107 last_bit++;
98 } 108 }
109 *nbytes += XFS_BLF_CHUNK;
99 } 110 }
100
101 return nvecs;
102} 111}
103 112
104/* 113/*
@@ -118,12 +127,13 @@ xfs_buf_item_size_segment(
118 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log 127 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
119 * format structures. 128 * format structures.
120 */ 129 */
121STATIC uint 130STATIC void
122xfs_buf_item_size( 131xfs_buf_item_size(
123 struct xfs_log_item *lip) 132 struct xfs_log_item *lip,
133 int *nvecs,
134 int *nbytes)
124{ 135{
125 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 136 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
126 uint nvecs;
127 int i; 137 int i;
128 138
129 ASSERT(atomic_read(&bip->bli_refcount) > 0); 139 ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -135,7 +145,11 @@ xfs_buf_item_size(
135 */ 145 */
136 trace_xfs_buf_item_size_stale(bip); 146 trace_xfs_buf_item_size_stale(bip);
137 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); 147 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
138 return bip->bli_format_count; 148 *nvecs += bip->bli_format_count;
149 for (i = 0; i < bip->bli_format_count; i++) {
150 *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
151 }
152 return;
139 } 153 }
140 154
141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 155 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
@@ -147,7 +161,8 @@ xfs_buf_item_size(
147 * commit, so no vectors are used at all. 161 * commit, so no vectors are used at all.
148 */ 162 */
149 trace_xfs_buf_item_size_ordered(bip); 163 trace_xfs_buf_item_size_ordered(bip);
150 return XFS_LOG_VEC_ORDERED; 164 *nvecs = XFS_LOG_VEC_ORDERED;
165 return;
151 } 166 }
152 167
153 /* 168 /*
@@ -159,13 +174,11 @@ xfs_buf_item_size(
159 * count for the extra buf log format structure that will need to be 174 * count for the extra buf log format structure that will need to be
160 * written. 175 * written.
161 */ 176 */
162 nvecs = 0;
163 for (i = 0; i < bip->bli_format_count; i++) { 177 for (i = 0; i < bip->bli_format_count; i++) {
164 nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]); 178 xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
179 nvecs, nbytes);
165 } 180 }
166
167 trace_xfs_buf_item_size(bip); 181 trace_xfs_buf_item_size(bip);
168 return nvecs;
169} 182}
170 183
171static struct xfs_log_iovec * 184static struct xfs_log_iovec *
@@ -192,8 +205,7 @@ xfs_buf_item_format_segment(
192 * the actual size of the dirty bitmap rather than the size of the in 205 * the actual size of the dirty bitmap rather than the size of the in
193 * memory structure. 206 * memory structure.
194 */ 207 */
195 base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + 208 base_size = xfs_buf_log_format_size(blfp);
196 (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
197 209
198 nvecs = 0; 210 nvecs = 0;
199 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); 211 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
@@ -601,15 +613,28 @@ xfs_buf_item_unlock(
601 } 613 }
602 } 614 }
603 } 615 }
604 if (clean) 616
605 xfs_buf_item_relse(bp); 617 /*
606 else if (aborted) { 618 * Clean buffers, by definition, cannot be in the AIL. However, aborted
607 if (atomic_dec_and_test(&bip->bli_refcount)) { 619 * buffers may be dirty and hence in the AIL. Therefore if we are
620 * aborting a buffer and we've just taken the last reference away, we
621 * have to check if it is in the AIL before freeing it. We need to free
622 * it in this case, because an aborted transaction has already shut the
623 * filesystem down and this is the last chance we will have to do so.
624 */
625 if (atomic_dec_and_test(&bip->bli_refcount)) {
626 if (clean)
627 xfs_buf_item_relse(bp);
628 else if (aborted) {
608 ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); 629 ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
630 if (lip->li_flags & XFS_LI_IN_AIL) {
631 spin_lock(&lip->li_ailp->xa_lock);
632 xfs_trans_ail_delete(lip->li_ailp, lip,
633 SHUTDOWN_LOG_IO_ERROR);
634 }
609 xfs_buf_item_relse(bp); 635 xfs_buf_item_relse(bp);
610 } 636 }
611 } else 637 }
612 atomic_dec(&bip->bli_refcount);
613 638
614 if (!(flags & XFS_BLI_HOLD)) 639 if (!(flags & XFS_BLI_HOLD))
615 xfs_buf_relse(bp); 640 xfs_buf_relse(bp);
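
The xfs_buf_item_size() change is part of a wider switch in the log item ->iop_size method: rather than returning a vector count, it now reports both the vector count and the byte count through out-parameters so that callers can compute log space reservations up front. An illustrative caller (lip assumed to be a struct xfs_log_item pointer):

	int nvecs = 0;
	int nbytes = 0;

	lip->li_ops->iop_size(lip, &nvecs, &nbytes);
	/* nvecs == XFS_LOG_VEC_ORDERED means the item logs no data */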
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0f1c247dc680..db6371087fe8 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -18,101 +18,9 @@
18#ifndef __XFS_BUF_ITEM_H__ 18#ifndef __XFS_BUF_ITEM_H__
19#define __XFS_BUF_ITEM_H__ 19#define __XFS_BUF_ITEM_H__
20 20
21extern kmem_zone_t *xfs_buf_item_zone; 21/* kernel only definitions */
22
23/*
24 * This flag indicates that the buffer contains on disk inodes
25 * and requires special recovery handling.
26 */
27#define XFS_BLF_INODE_BUF (1<<0)
28/*
29 * This flag indicates that the buffer should not be replayed
30 * during recovery because its blocks are being freed.
31 */
32#define XFS_BLF_CANCEL (1<<1)
33
34/*
35 * This flag indicates that the buffer contains on disk
36 * user or group dquots and may require special recovery handling.
37 */
38#define XFS_BLF_UDQUOT_BUF (1<<2)
39#define XFS_BLF_PDQUOT_BUF (1<<3)
40#define XFS_BLF_GDQUOT_BUF (1<<4)
41
42#define XFS_BLF_CHUNK 128
43#define XFS_BLF_SHIFT 7
44#define BIT_TO_WORD_SHIFT 5
45#define NBWORD (NBBY * sizeof(unsigned int))
46
47/*
48 * This is the structure used to lay out a buf log item in the
49 * log. The data map describes which 128 byte chunks of the buffer
50 * have been logged.
51 */
52#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
53 22
54typedef struct xfs_buf_log_format { 23/* buf log item flags */
55 unsigned short blf_type; /* buf log item type indicator */
56 unsigned short blf_size; /* size of this item */
57 ushort blf_flags; /* misc state */
58 ushort blf_len; /* number of blocks in this buf */
59 __int64_t blf_blkno; /* starting blkno of this buf */
60 unsigned int blf_map_size; /* used size of data bitmap in words */
61 unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
62} xfs_buf_log_format_t;
63
64/*
65 * All buffers now need to tell recovery where the magic number
66 * is so that it can verify and calculate the CRCs on the buffer correctly
67 * once the changes have been replayed into the buffer.
68 *
69 * The type value is held in the upper 5 bits of the blf_flags field, which is
70 * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
71 */
72#define XFS_BLFT_BITS 5
73#define XFS_BLFT_SHIFT 11
74#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
75
76enum xfs_blft {
77 XFS_BLFT_UNKNOWN_BUF = 0,
78 XFS_BLFT_UDQUOT_BUF,
79 XFS_BLFT_PDQUOT_BUF,
80 XFS_BLFT_GDQUOT_BUF,
81 XFS_BLFT_BTREE_BUF,
82 XFS_BLFT_AGF_BUF,
83 XFS_BLFT_AGFL_BUF,
84 XFS_BLFT_AGI_BUF,
85 XFS_BLFT_DINO_BUF,
86 XFS_BLFT_SYMLINK_BUF,
87 XFS_BLFT_DIR_BLOCK_BUF,
88 XFS_BLFT_DIR_DATA_BUF,
89 XFS_BLFT_DIR_FREE_BUF,
90 XFS_BLFT_DIR_LEAF1_BUF,
91 XFS_BLFT_DIR_LEAFN_BUF,
92 XFS_BLFT_DA_NODE_BUF,
93 XFS_BLFT_ATTR_LEAF_BUF,
94 XFS_BLFT_ATTR_RMT_BUF,
95 XFS_BLFT_SB_BUF,
96 XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
97};
98
99static inline void
100xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
101{
102 ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
103 blf->blf_flags &= ~XFS_BLFT_MASK;
104 blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
105}
106
107static inline __uint16_t
108xfs_blft_from_flags(struct xfs_buf_log_format *blf)
109{
110 return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
111}
112
113/*
114 * buf log item flags
115 */
116#define XFS_BLI_HOLD 0x01 24#define XFS_BLI_HOLD 0x01
117#define XFS_BLI_DIRTY 0x02 25#define XFS_BLI_DIRTY 0x02
118#define XFS_BLI_STALE 0x04 26#define XFS_BLI_STALE 0x04
@@ -133,8 +41,6 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
133 { XFS_BLI_ORDERED, "ORDERED" } 41 { XFS_BLI_ORDERED, "ORDERED" }
134 42
135 43
136#ifdef __KERNEL__
137
138struct xfs_buf; 44struct xfs_buf;
139struct xfs_mount; 45struct xfs_mount;
140struct xfs_buf_log_item; 46struct xfs_buf_log_item;
@@ -169,6 +75,6 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
169 enum xfs_blft); 75 enum xfs_blft);
170void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); 76void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp);
171 77
172#endif /* __KERNEL__ */ 78extern kmem_zone_t *xfs_buf_item_zone;
173 79
174#endif /* __XFS_BUF_ITEM_H__ */ 80#endif /* __XFS_BUF_ITEM_H__ */
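
The buffer type tagging interface left in this header is used when a transaction (re)initialises a buffer's contents, so that log recovery can identify and verify the buffer. An illustrative call (tp and bp assumed in scope):

	/* tag a freshly initialised btree buffer for recovery verification */
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);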
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 0b8b2a13cd24..20bf8e8002d6 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -27,8 +27,8 @@
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
30#include "xfs_dir2.h"
31#include "xfs_dir2_format.h" 30#include "xfs_dir2_format.h"
31#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h" 32#include "xfs_dir2_priv.h"
33#include "xfs_dinode.h" 33#include "xfs_dinode.h"
34#include "xfs_inode.h" 34#include "xfs_inode.h"
@@ -399,7 +399,7 @@ xfs_da3_split(
399 struct xfs_da_intnode *node; 399 struct xfs_da_intnode *node;
400 struct xfs_buf *bp; 400 struct xfs_buf *bp;
401 int max; 401 int max;
402 int action; 402 int action = 0;
403 int error; 403 int error;
404 int i; 404 int i;
405 405
@@ -635,6 +635,7 @@ xfs_da3_root_split(
635 xfs_trans_log_buf(tp, bp, 0, size - 1); 635 xfs_trans_log_buf(tp, bp, 0, size - 1);
636 636
637 bp->b_ops = blk1->bp->b_ops; 637 bp->b_ops = blk1->bp->b_ops;
638 xfs_trans_buf_copy_type(bp, blk1->bp);
638 blk1->bp = bp; 639 blk1->bp = bp;
639 blk1->blkno = blkno; 640 blk1->blkno = blkno;
640 641
@@ -1223,6 +1224,7 @@ xfs_da3_node_toosmall(
1223 /* start with smaller blk num */ 1224 /* start with smaller blk num */
1224 forward = nodehdr.forw < nodehdr.back; 1225 forward = nodehdr.forw < nodehdr.back;
1225 for (i = 0; i < 2; forward = !forward, i++) { 1226 for (i = 0; i < 2; forward = !forward, i++) {
1227 struct xfs_da3_icnode_hdr thdr;
1226 if (forward) 1228 if (forward)
1227 blkno = nodehdr.forw; 1229 blkno = nodehdr.forw;
1228 else 1230 else
@@ -1235,10 +1237,10 @@ xfs_da3_node_toosmall(
1235 return(error); 1237 return(error);
1236 1238
1237 node = bp->b_addr; 1239 node = bp->b_addr;
1238 xfs_da3_node_hdr_from_disk(&nodehdr, node); 1240 xfs_da3_node_hdr_from_disk(&thdr, node);
1239 xfs_trans_brelse(state->args->trans, bp); 1241 xfs_trans_brelse(state->args->trans, bp);
1240 1242
1241 if (count - nodehdr.count >= 0) 1243 if (count - thdr.count >= 0)
1242 break; /* fits with at least 25% to spare */ 1244 break; /* fits with at least 25% to spare */
1243 } 1245 }
1244 if (i >= 2) { 1246 if (i >= 2) {
@@ -2454,9 +2456,9 @@ static int
2454xfs_buf_map_from_irec( 2456xfs_buf_map_from_irec(
2455 struct xfs_mount *mp, 2457 struct xfs_mount *mp,
2456 struct xfs_buf_map **mapp, 2458 struct xfs_buf_map **mapp,
2457 unsigned int *nmaps, 2459 int *nmaps,
2458 struct xfs_bmbt_irec *irecs, 2460 struct xfs_bmbt_irec *irecs,
2459 unsigned int nirecs) 2461 int nirecs)
2460{ 2462{
2461 struct xfs_buf_map *map; 2463 struct xfs_buf_map *map;
2462 int i; 2464 int i;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 6fb3371c63cf..b1f267995dea 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -133,12 +133,19 @@ extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
133 struct xfs_da3_icnode_hdr *from); 133 struct xfs_da3_icnode_hdr *from);
134 134
135static inline int 135static inline int
136xfs_da3_node_hdr_size(struct xfs_da_intnode *dap) 136__xfs_da3_node_hdr_size(bool v3)
137{ 137{
138 if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) 138 if (v3)
139 return sizeof(struct xfs_da3_node_hdr); 139 return sizeof(struct xfs_da3_node_hdr);
140 return sizeof(struct xfs_da_node_hdr); 140 return sizeof(struct xfs_da_node_hdr);
141} 141}
142static inline int
143xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
144{
145 bool v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC);
146
147 return __xfs_da3_node_hdr_size(v3);
148}
142 149
143static inline struct xfs_da_node_entry * 150static inline struct xfs_da_node_entry *
144xfs_da3_node_tree_p(struct xfs_da_intnode *dap) 151xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
@@ -176,6 +183,7 @@ enum xfs_dacmp {
176typedef struct xfs_da_args { 183typedef struct xfs_da_args {
177 const __uint8_t *name; /* string (maybe not NULL terminated) */ 184 const __uint8_t *name; /* string (maybe not NULL terminated) */
178 int namelen; /* length of string (maybe no NULL) */ 185 int namelen; /* length of string (maybe no NULL) */
186 __uint8_t filetype; /* filetype of inode for directories */
179 __uint8_t *value; /* set of bytes (maybe contain NULLs) */ 187 __uint8_t *value; /* set of bytes (maybe contain NULLs) */
180 int valuelen; /* length of value */ 188 int valuelen; /* length of value */
181 int flags; /* argument flags (eg: ATTR_NOCREATE) */ 189 int flags; /* argument flags (eg: ATTR_NOCREATE) */
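
Splitting the helper lets code that knows the filesystem version size the node header without holding a node buffer, e.g. (illustrative, mp assumed in scope):

	/* header size follows directly from the superblock feature bits */
	int hdrsz = __xfs_da3_node_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));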
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
deleted file mode 100644
index e36445ceaf80..000000000000
--- a/fs/xfs/xfs_dfrag.c
+++ /dev/null
@@ -1,459 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_trans.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_bmap_btree.h"
27#include "xfs_alloc_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_dinode.h"
31#include "xfs_inode.h"
32#include "xfs_inode_item.h"
33#include "xfs_bmap.h"
34#include "xfs_itable.h"
35#include "xfs_dfrag.h"
36#include "xfs_error.h"
37#include "xfs_vnodeops.h"
38#include "xfs_trace.h"
39
40
41static int xfs_swap_extents(
42 xfs_inode_t *ip, /* target inode */
43 xfs_inode_t *tip, /* tmp inode */
44 xfs_swapext_t *sxp);
45
46/*
47 * ioctl interface for swapext
48 */
49int
50xfs_swapext(
51 xfs_swapext_t *sxp)
52{
53 xfs_inode_t *ip, *tip;
54 struct fd f, tmp;
55 int error = 0;
56
57 /* Pull information for the target fd */
58 f = fdget((int)sxp->sx_fdtarget);
59 if (!f.file) {
60 error = XFS_ERROR(EINVAL);
61 goto out;
62 }
63
64 if (!(f.file->f_mode & FMODE_WRITE) ||
65 !(f.file->f_mode & FMODE_READ) ||
66 (f.file->f_flags & O_APPEND)) {
67 error = XFS_ERROR(EBADF);
68 goto out_put_file;
69 }
70
71 tmp = fdget((int)sxp->sx_fdtmp);
72 if (!tmp.file) {
73 error = XFS_ERROR(EINVAL);
74 goto out_put_file;
75 }
76
77 if (!(tmp.file->f_mode & FMODE_WRITE) ||
78 !(tmp.file->f_mode & FMODE_READ) ||
79 (tmp.file->f_flags & O_APPEND)) {
80 error = XFS_ERROR(EBADF);
81 goto out_put_tmp_file;
82 }
83
84 if (IS_SWAPFILE(file_inode(f.file)) ||
85 IS_SWAPFILE(file_inode(tmp.file))) {
86 error = XFS_ERROR(EINVAL);
87 goto out_put_tmp_file;
88 }
89
90 ip = XFS_I(file_inode(f.file));
91 tip = XFS_I(file_inode(tmp.file));
92
93 if (ip->i_mount != tip->i_mount) {
94 error = XFS_ERROR(EINVAL);
95 goto out_put_tmp_file;
96 }
97
98 if (ip->i_ino == tip->i_ino) {
99 error = XFS_ERROR(EINVAL);
100 goto out_put_tmp_file;
101 }
102
103 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
104 error = XFS_ERROR(EIO);
105 goto out_put_tmp_file;
106 }
107
108 error = xfs_swap_extents(ip, tip, sxp);
109
110 out_put_tmp_file:
111 fdput(tmp);
112 out_put_file:
113 fdput(f);
114 out:
115 return error;
116}
117
118/*
119 * We need to check that the format of the data fork in the temporary inode is
120 * valid for the target inode before doing the swap. This is not a problem with
121 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
122 * data fork depending on the space the attribute fork is taking so we can get
123 * invalid formats on the target inode.
124 *
125 * E.g. target has space for 7 extents in extent format, temp inode only has
126 * space for 6. If we defragment down to 7 extents, then the tmp format is a
127 * btree, but when swapped it needs to be in extent format. Hence we can't just
128 * blindly swap data forks on attr2 filesystems.
129 *
130 * Note that we check the swap in both directions so that we don't end up with
131 * a corrupt temporary inode, either.
132 *
133 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
134 * inode will prevent this situation from occurring, so all we do here is
135 * reject and log the attempt. Basically we are putting the responsibility on
136 * userspace to get this right.
137 */
138static int
139xfs_swap_extents_check_format(
140 xfs_inode_t *ip, /* target inode */
141 xfs_inode_t *tip) /* tmp inode */
142{
143
144 /* Should never get a local format */
145 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
146 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
147 return EINVAL;
148
149 /*
150 * if the target inode has fewer extents than the temporary inode, then
151 * why did userspace call us?
152 */
153 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
154 return EINVAL;
155
156 /*
157 * if the target inode is in extent form and the temp inode is in btree
158 * form, then we will end up with the target inode in the wrong format,
159 * as we already know there are fewer extents in the temp inode.
160 */
161 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
162 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
163 return EINVAL;
164
165 /* Check temp in extent form to max in target */
166 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
167 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
168 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
169 return EINVAL;
170
171 /* Check target in extent form to max in temp */
172 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
173 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
174 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
175 return EINVAL;
176
177 /*
178 * If we are in a btree format, check that the temp root block will fit
179 * in the target and that it has enough extents to be in btree format
180 * in the target.
181 *
182 * Note that we have to be careful to allow btree->extent conversions
183 * (a common defrag case) which will occur when the temp inode is in
184 * extent format...
185 */
186 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
187 if (XFS_IFORK_BOFF(ip) &&
188 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
189 return EINVAL;
190 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
191 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
192 return EINVAL;
193 }
194
195 /* Reciprocal target->temp btree format checks */
196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
197 if (XFS_IFORK_BOFF(tip) &&
198 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
199 return EINVAL;
200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
202 return EINVAL;
203 }
204
205 return 0;
206}
207
208static int
209xfs_swap_extents(
210 xfs_inode_t *ip, /* target inode */
211 xfs_inode_t *tip, /* tmp inode */
212 xfs_swapext_t *sxp)
213{
214 xfs_mount_t *mp = ip->i_mount;
215 xfs_trans_t *tp;
216 xfs_bstat_t *sbp = &sxp->sx_stat;
217 xfs_ifork_t *tempifp, *ifp, *tifp;
218 int src_log_flags, target_log_flags;
219 int error = 0;
220 int aforkblks = 0;
221 int taforkblks = 0;
222 __uint64_t tmp;
223
224 /*
225 * We have no way of updating owner information in the BMBT blocks for
226 * each inode on CRC enabled filesystems, so to avoid corrupting the
227 * this metadata we simply don't allow extent swaps to occur.
228 */
229 if (xfs_sb_version_hascrc(&mp->m_sb))
230 return XFS_ERROR(EINVAL);
231
232 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
233 if (!tempifp) {
234 error = XFS_ERROR(ENOMEM);
235 goto out;
236 }
237
238 /*
239 * we have to do two separate lock calls here to keep lockdep
240 * happy. If we try to get all the locks in one call, lock will
241 * report false positives when we drop the ILOCK and regain them
242 * below.
243 */
244 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
245 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
246
247 /* Verify that both files have the same format */
248 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
249 error = XFS_ERROR(EINVAL);
250 goto out_unlock;
251 }
252
253 /* Verify both files are either real-time or non-realtime */
254 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
255 error = XFS_ERROR(EINVAL);
256 goto out_unlock;
257 }
258
259 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
260 if (error)
261 goto out_unlock;
262 truncate_pagecache_range(VFS_I(tip), 0, -1);
263
264 /* Verify O_DIRECT for ftmp */
265 if (VN_CACHED(VFS_I(tip)) != 0) {
266 error = XFS_ERROR(EINVAL);
267 goto out_unlock;
268 }
269
270 /* Verify all data are being swapped */
271 if (sxp->sx_offset != 0 ||
272 sxp->sx_length != ip->i_d.di_size ||
273 sxp->sx_length != tip->i_d.di_size) {
274 error = XFS_ERROR(EFAULT);
275 goto out_unlock;
276 }
277
278 trace_xfs_swap_extent_before(ip, 0);
279 trace_xfs_swap_extent_before(tip, 1);
280
281 /* check inode formats now that data is flushed */
282 error = xfs_swap_extents_check_format(ip, tip);
283 if (error) {
284 xfs_notice(mp,
285 "%s: inode 0x%llx format is incompatible for exchanging.",
286 __func__, ip->i_ino);
287 goto out_unlock;
288 }
289
290 /*
291 * Compare the current change & modify times with those
292 * passed in. If they differ, we abort this swap.
293 * This is the mechanism used to assure the calling
294 * process that the file was not changed out from
295 * under it.
296 */
297 if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
298 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
299 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
300 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
301 error = XFS_ERROR(EBUSY);
302 goto out_unlock;
303 }
304
305 /* We need to fail if the file is memory mapped. Once we have tossed
306 * all existing pages, the page fault will have no option
307 * but to go to the filesystem for pages. By making the page fault call
308 * vop_read (or write in the case of autogrow), it blocks on the iolock
309 * until we have switched the extents.
310 */
311 if (VN_MAPPED(VFS_I(ip))) {
312 error = XFS_ERROR(EBUSY);
313 goto out_unlock;
314 }
315
316 xfs_iunlock(ip, XFS_ILOCK_EXCL);
317 xfs_iunlock(tip, XFS_ILOCK_EXCL);
318
319 /*
320 * There is a race condition here since we gave up the
321 * ilock. However, the data fork will not change since
322 * we have the iolock (locked for truncation too) so we
323 * are safe. We don't really care if non-io related
324 * fields change.
325 */
326 truncate_pagecache_range(VFS_I(ip), 0, -1);
327
328 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
329 if ((error = xfs_trans_reserve(tp, 0,
330 XFS_ICHANGE_LOG_RES(mp), 0,
331 0, 0))) {
332 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
333 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
334 xfs_trans_cancel(tp, 0);
335 goto out;
336 }
337 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
338
339 /*
340 * Count the number of extended attribute blocks
341 */
342 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
343 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
344 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
345 if (error)
346 goto out_trans_cancel;
347 }
348 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
349 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
350 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
351 &taforkblks);
352 if (error)
353 goto out_trans_cancel;
354 }
355
356 /*
357 * Swap the data forks of the inodes
358 */
359 ifp = &ip->i_df;
360 tifp = &tip->i_df;
361 *tempifp = *ifp; /* struct copy */
362 *ifp = *tifp; /* struct copy */
363 *tifp = *tempifp; /* struct copy */
364
365 /*
366 * Fix the on-disk inode values
367 */
368 tmp = (__uint64_t)ip->i_d.di_nblocks;
369 ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
370 tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
371
372 tmp = (__uint64_t) ip->i_d.di_nextents;
373 ip->i_d.di_nextents = tip->i_d.di_nextents;
374 tip->i_d.di_nextents = tmp;
375
376 tmp = (__uint64_t) ip->i_d.di_format;
377 ip->i_d.di_format = tip->i_d.di_format;
378 tip->i_d.di_format = tmp;
379
380 /*
381 * The extents in the source inode could still contain speculative
382 * preallocation beyond EOF (e.g. the file is open but not modified
383 * while defrag is in progress). In that case, we need to copy over the
384 * number of delalloc blocks the data fork in the source inode is
385 * tracking beyond EOF so that, when the fork is truncated away as the
386 * temporary inode is unlinked, we don't underrun the i_delayed_blks
387 * counter on that inode.
388 */
389 ASSERT(tip->i_delayed_blks == 0);
390 tip->i_delayed_blks = ip->i_delayed_blks;
391 ip->i_delayed_blks = 0;
392
393 src_log_flags = XFS_ILOG_CORE;
394 switch (ip->i_d.di_format) {
395 case XFS_DINODE_FMT_EXTENTS:
396 /* If the extents fit in the inode, fix the
397 * pointer. Otherwise it's already NULL or
398 * pointing to the extent.
399 */
400 if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
401 ifp->if_u1.if_extents =
402 ifp->if_u2.if_inline_ext;
403 }
404 src_log_flags |= XFS_ILOG_DEXT;
405 break;
406 case XFS_DINODE_FMT_BTREE:
407 src_log_flags |= XFS_ILOG_DBROOT;
408 break;
409 }
410
411 target_log_flags = XFS_ILOG_CORE;
412 switch (tip->i_d.di_format) {
413 case XFS_DINODE_FMT_EXTENTS:
414 /* If the extents fit in the inode, fix the
415 * pointer. Otherwise it's already NULL or
416 * pointing to the extent.
417 */
418 if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
419 tifp->if_u1.if_extents =
420 tifp->if_u2.if_inline_ext;
421 }
422 target_log_flags |= XFS_ILOG_DEXT;
423 break;
424 case XFS_DINODE_FMT_BTREE:
425 target_log_flags |= XFS_ILOG_DBROOT;
426 break;
427 }
428
429
430 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
431 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
432
433 xfs_trans_log_inode(tp, ip, src_log_flags);
434 xfs_trans_log_inode(tp, tip, target_log_flags);
435
436 /*
437 * If this is a synchronous mount, make sure that the
438 * transaction goes to disk before returning to the user.
439 */
440 if (mp->m_flags & XFS_MOUNT_WSYNC)
441 xfs_trans_set_sync(tp);
442
443 error = xfs_trans_commit(tp, 0);
444
445 trace_xfs_swap_extent_after(ip, 0);
446 trace_xfs_swap_extent_after(tip, 1);
447out:
448 kmem_free(tempifp);
449 return error;
450
451out_unlock:
452 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
453 xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
454 goto out;
455
456out_trans_cancel:
457 xfs_trans_cancel(tp, 0);
458 goto out_unlock;
459}
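
The core of the swap above is the three struct copies plus the di_nblocks rebalancing, which moves the data-fork blocks with the fork while each inode keeps its own attr-fork block count. A minimal standalone sketch of the same pattern, using stand-in types rather than the real xfs_ifork_t/xfs_icdinode:

#include <stdint.h>

struct fork { void *extents; int nextents; };	/* stand-in for xfs_ifork_t */
struct inode_core { struct fork df; uint64_t nblocks; };

/*
 * Swap the data forks of two inodes with a three-way struct copy,
 * then fix the block counts: the data-fork blocks move with the
 * fork, but the attr-fork blocks (aforkblks/taforkblks) stay put.
 */
static void swap_data_forks(struct inode_core *ip, struct inode_core *tip,
			    uint64_t aforkblks, uint64_t taforkblks)
{
	struct fork tmpf = ip->df;	/* struct copy */
	uint64_t tmp;

	ip->df = tip->df;		/* struct copy */
	tip->df = tmpf;			/* struct copy */

	tmp = ip->nblocks;
	ip->nblocks = tip->nblocks - taforkblks + aforkblks;
	tip->nblocks = tmp + taforkblks - aforkblks;
}
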
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
deleted file mode 100644
index 20bdd935c121..000000000000
--- a/fs/xfs/xfs_dfrag.h
+++ /dev/null
@@ -1,53 +0,0 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DFRAG_H__
19#define __XFS_DFRAG_H__
20
21/*
22 * Structure passed to xfs_swapext
23 */
24
25typedef struct xfs_swapext
26{
27 __int64_t sx_version; /* version */
28 __int64_t sx_fdtarget; /* fd of target file */
29 __int64_t sx_fdtmp; /* fd of tmp file */
30 xfs_off_t sx_offset; /* offset into file */
31 xfs_off_t sx_length; /* length from offset */
32 char sx_pad[16]; /* pad space, unused */
33 xfs_bstat_t sx_stat; /* stat of target before copy */
34} xfs_swapext_t;
35
36/*
37 * Version flag
38 */
39#define XFS_SX_VERSION 0
40
41#ifdef __KERNEL__
42/*
43 * Prototypes for visible xfs_dfrag.c routines.
44 */
45
46/*
47 * Syscall interface for xfs_swapext
48 */
49int xfs_swapext(struct xfs_swapext *sx);
50
51#endif /* __KERNEL__ */
52
53#endif /* __XFS_DFRAG_H__ */
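
Although this header is removed, the xfs_swapext structure remains the userspace ABI for online defragmentation. A rough sketch of how an xfs_fsr-style tool drives it, assuming the uapi copies of struct xfs_swapext and XFS_IOC_SWAPEXT from <xfs/xfs.h>, and that pre_copy_stat was obtained from a bulkstat of the target before the data copy (the kernel compares its ctime/mtime, as seen earlier in xfs_swap_extents); error handling elided:

#include <sys/ioctl.h>
#include <sys/types.h>
#include <xfs/xfs.h>

static int swap_with_tmp(int target_fd, int tmp_fd, off_t size,
			 const struct xfs_bstat *pre_copy_stat)
{
	struct xfs_swapext sx = {
		.sx_version  = XFS_SX_VERSION,
		.sx_fdtarget = target_fd,
		.sx_fdtmp    = tmp_fd,
		.sx_offset   = 0,
		.sx_length   = size,
		.sx_stat     = *pre_copy_stat,	/* checked against the inode */
	};

	return ioctl(target_fd, XFS_IOC_SWAPEXT, &sx);
}
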
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 8f023dee404d..edf203ab50af 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -31,14 +31,14 @@
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
33#include "xfs_bmap.h" 33#include "xfs_bmap.h"
34#include "xfs_dir2.h"
35#include "xfs_dir2_format.h" 34#include "xfs_dir2_format.h"
35#include "xfs_dir2.h"
36#include "xfs_dir2_priv.h" 36#include "xfs_dir2_priv.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_vnodeops.h"
39#include "xfs_trace.h" 38#include "xfs_trace.h"
40 39
41struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2}; 40struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
41
42 42
43/* 43/*
44 * ASCII case-insensitive (ie. A-Z) support for directories that was 44 * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -90,6 +90,9 @@ void
90xfs_dir_mount( 90xfs_dir_mount(
91 xfs_mount_t *mp) 91 xfs_mount_t *mp)
92{ 92{
93 int nodehdr_size;
94
95
93 ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb)); 96 ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
94 ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= 97 ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
95 XFS_MAX_BLOCKSIZE); 98 XFS_MAX_BLOCKSIZE);
@@ -98,12 +101,13 @@ xfs_dir_mount(
98 mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); 101 mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
99 mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); 102 mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
100 mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); 103 mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
101 mp->m_attr_node_ents = 104
102 (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / 105 nodehdr_size = __xfs_da3_node_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
103 (uint)sizeof(xfs_da_node_entry_t); 106 mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) /
104 mp->m_dir_node_ents = 107 (uint)sizeof(xfs_da_node_entry_t);
105 (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / 108 mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) /
106 (uint)sizeof(xfs_da_node_entry_t); 109 (uint)sizeof(xfs_da_node_entry_t);
110
107 mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; 111 mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
108 if (xfs_sb_version_hasasciici(&mp->m_sb)) 112 if (xfs_sb_version_hasasciici(&mp->m_sb))
109 mp->m_dirnameops = &xfs_ascii_ci_nameops; 113 mp->m_dirnameops = &xfs_ascii_ci_nameops;
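
The net effect of the nodehdr_size change is that entries-per-node-block now depends on whether the CRC-enabled (v5) da-node header is in use. As a rough standalone illustration, with a 4096-byte block, an 8-byte xfs_da_node_entry, and assumed header sizes of 16 (v4) and 64 (v5) bytes — the constants here are assumptions; the kernel reads the real size via __xfs_da3_node_hdr_size():

#include <stdio.h>

int main(void)
{
	const int blocksize = 4096;	/* assumed fs/dir block size */
	const int entry_size = 8;	/* be32 hashval + be32 before */

	printf("v4 node: %d entries\n", (blocksize - 16) / entry_size);
	printf("v5 node: %d entries\n", (blocksize - 64) / entry_size);
	return 0;
}
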
@@ -209,6 +213,7 @@ xfs_dir_createname(
209 memset(&args, 0, sizeof(xfs_da_args_t)); 213 memset(&args, 0, sizeof(xfs_da_args_t));
210 args.name = name->name; 214 args.name = name->name;
211 args.namelen = name->len; 215 args.namelen = name->len;
216 args.filetype = name->type;
212 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 217 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
213 args.inumber = inum; 218 args.inumber = inum;
214 args.dp = dp; 219 args.dp = dp;
@@ -283,6 +288,7 @@ xfs_dir_lookup(
283 memset(&args, 0, sizeof(xfs_da_args_t)); 288 memset(&args, 0, sizeof(xfs_da_args_t));
284 args.name = name->name; 289 args.name = name->name;
285 args.namelen = name->len; 290 args.namelen = name->len;
291 args.filetype = name->type;
286 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 292 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
287 args.dp = dp; 293 args.dp = dp;
288 args.whichfork = XFS_DATA_FORK; 294 args.whichfork = XFS_DATA_FORK;
@@ -338,6 +344,7 @@ xfs_dir_removename(
338 memset(&args, 0, sizeof(xfs_da_args_t)); 344 memset(&args, 0, sizeof(xfs_da_args_t));
339 args.name = name->name; 345 args.name = name->name;
340 args.namelen = name->len; 346 args.namelen = name->len;
347 args.filetype = name->type;
341 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 348 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
342 args.inumber = ino; 349 args.inumber = ino;
343 args.dp = dp; 350 args.dp = dp;
@@ -363,37 +370,6 @@ xfs_dir_removename(
363} 370}
364 371
365/* 372/*
366 * Read a directory.
367 */
368int
369xfs_readdir(
370 xfs_inode_t *dp,
371 struct dir_context *ctx,
372 size_t bufsize)
373{
374 int rval; /* return value */
375 int v; /* type-checking value */
376
377 trace_xfs_readdir(dp);
378
379 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
380 return XFS_ERROR(EIO);
381
382 ASSERT(S_ISDIR(dp->i_d.di_mode));
383 XFS_STATS_INC(xs_dir_getdents);
384
385 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
386 rval = xfs_dir2_sf_getdents(dp, ctx);
387 else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
388 ;
389 else if (v)
390 rval = xfs_dir2_block_getdents(dp, ctx);
391 else
392 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
393 return rval;
394}
395
396/*
397 * Replace the inode number of a directory entry. 373 * Replace the inode number of a directory entry.
398 */ 374 */
399int 375int
@@ -418,6 +394,7 @@ xfs_dir_replace(
418 memset(&args, 0, sizeof(xfs_da_args_t)); 394 memset(&args, 0, sizeof(xfs_da_args_t));
419 args.name = name->name; 395 args.name = name->name;
420 args.namelen = name->len; 396 args.namelen = name->len;
397 args.filetype = name->type;
421 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 398 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
422 args.inumber = inum; 399 args.inumber = inum;
423 args.dp = dp; 400 args.dp = dp;
@@ -465,6 +442,7 @@ xfs_dir_canenter(
465 memset(&args, 0, sizeof(xfs_da_args_t)); 442 memset(&args, 0, sizeof(xfs_da_args_t));
466 args.name = name->name; 443 args.name = name->name;
467 args.namelen = name->len; 444 args.namelen = name->len;
445 args.filetype = name->type;
468 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 446 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
469 args.dp = dp; 447 args.dp = dp;
470 args.whichfork = XFS_DATA_FORK; 448 args.whichfork = XFS_DATA_FORK;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index e937d9991c18..9910401327d4 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -23,6 +23,11 @@ struct xfs_da_args;
23struct xfs_inode; 23struct xfs_inode;
24struct xfs_mount; 24struct xfs_mount;
25struct xfs_trans; 25struct xfs_trans;
26struct xfs_dir2_sf_hdr;
27struct xfs_dir2_sf_entry;
28struct xfs_dir2_data_hdr;
29struct xfs_dir2_data_entry;
30struct xfs_dir2_data_unused;
26 31
27extern struct xfs_name xfs_name_dotdot; 32extern struct xfs_name xfs_name_dotdot;
28 33
@@ -57,4 +62,45 @@ extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
57 */ 62 */
58extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); 63extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
59 64
65/*
66 * Interface routines used by userspace utilities
67 */
68extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
69extern void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *sfp,
70 xfs_ino_t ino);
71extern xfs_ino_t xfs_dir3_sfe_get_ino(struct xfs_mount *mp,
72 struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep);
73extern void xfs_dir3_sfe_put_ino(struct xfs_mount *mp,
74 struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep,
75 xfs_ino_t ino);
76
77extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
78extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
79extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
80 struct xfs_buf *bp);
81
82extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
83 struct xfs_dir2_data_hdr *hdr, int *loghead);
84extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
85 struct xfs_dir2_data_entry *dep);
86extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
87 struct xfs_buf *bp);
88extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
89 struct xfs_dir2_data_unused *dup);
90extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
91 xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
92 int *needlogp, int *needscanp);
93extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
94 struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
95 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
96
97extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
98 struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup);
99
100extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
101extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
102extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
103extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
104extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
105
60#endif /* __XFS_DIR2_H__ */ 106#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 5e7fbd72cf52..12dad188939d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -31,8 +31,8 @@
31#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_bmap.h" 32#include "xfs_bmap.h"
33#include "xfs_buf_item.h" 33#include "xfs_buf_item.h"
34#include "xfs_dir2.h"
35#include "xfs_dir2_format.h" 34#include "xfs_dir2_format.h"
35#include "xfs_dir2.h"
36#include "xfs_dir2_priv.h" 36#include "xfs_dir2_priv.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_trace.h" 38#include "xfs_trace.h"
@@ -126,7 +126,7 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
126 .verify_write = xfs_dir3_block_write_verify, 126 .verify_write = xfs_dir3_block_write_verify,
127}; 127};
128 128
129static int 129int
130xfs_dir3_block_read( 130xfs_dir3_block_read(
131 struct xfs_trans *tp, 131 struct xfs_trans *tp,
132 struct xfs_inode *dp, 132 struct xfs_inode *dp,
@@ -369,7 +369,7 @@ xfs_dir2_block_addname(
369 if (error) 369 if (error)
370 return error; 370 return error;
371 371
372 len = xfs_dir2_data_entsize(args->namelen); 372 len = xfs_dir3_data_entsize(mp, args->namelen);
373 373
374 /* 374 /*
375 * Set up pointers to parts of the block. 375 * Set up pointers to parts of the block.
@@ -549,7 +549,8 @@ xfs_dir2_block_addname(
549 dep->inumber = cpu_to_be64(args->inumber); 549 dep->inumber = cpu_to_be64(args->inumber);
550 dep->namelen = args->namelen; 550 dep->namelen = args->namelen;
551 memcpy(dep->name, args->name, args->namelen); 551 memcpy(dep->name, args->name, args->namelen);
552 tagp = xfs_dir2_data_entry_tag_p(dep); 552 xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
553 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
553 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 554 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
554 /* 555 /*
555 * Clean up the bestfree array and log the header, tail, and entry. 556 * Clean up the bestfree array and log the header, tail, and entry.
@@ -565,101 +566,6 @@ xfs_dir2_block_addname(
565} 566}
566 567
567/* 568/*
568 * Readdir for block directories.
569 */
570int /* error */
571xfs_dir2_block_getdents(
572 xfs_inode_t *dp, /* incore inode */
573 struct dir_context *ctx)
574{
575 xfs_dir2_data_hdr_t *hdr; /* block header */
576 struct xfs_buf *bp; /* buffer for block */
577 xfs_dir2_block_tail_t *btp; /* block tail */
578 xfs_dir2_data_entry_t *dep; /* block data entry */
579 xfs_dir2_data_unused_t *dup; /* block unused entry */
580 char *endptr; /* end of the data entries */
581 int error; /* error return value */
582 xfs_mount_t *mp; /* filesystem mount point */
583 char *ptr; /* current data entry */
584 int wantoff; /* starting block offset */
585 xfs_off_t cook;
586
587 mp = dp->i_mount;
588 /*
589 * If the block number in the offset is out of range, we're done.
590 */
591 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
592 return 0;
593
594 error = xfs_dir3_block_read(NULL, dp, &bp);
595 if (error)
596 return error;
597
598 /*
599 * Extract the byte offset we start at from the seek pointer.
600 * We'll skip entries before this.
601 */
602 wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
603 hdr = bp->b_addr;
604 xfs_dir3_data_check(dp, bp);
605 /*
606 * Set up values for the loop.
607 */
608 btp = xfs_dir2_block_tail_p(mp, hdr);
609 ptr = (char *)xfs_dir3_data_entry_p(hdr);
610 endptr = (char *)xfs_dir2_block_leaf_p(btp);
611
612 /*
613 * Loop over the data portion of the block.
614 * Each object is a real entry (dep) or an unused one (dup).
615 */
616 while (ptr < endptr) {
617 dup = (xfs_dir2_data_unused_t *)ptr;
618 /*
619 * Unused, skip it.
620 */
621 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
622 ptr += be16_to_cpu(dup->length);
623 continue;
624 }
625
626 dep = (xfs_dir2_data_entry_t *)ptr;
627
628 /*
629 * Bump pointer for the next iteration.
630 */
631 ptr += xfs_dir2_data_entsize(dep->namelen);
632 /*
633 * The entry is before the desired starting point, skip it.
634 */
635 if ((char *)dep - (char *)hdr < wantoff)
636 continue;
637
638 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
639 (char *)dep - (char *)hdr);
640
641 ctx->pos = cook & 0x7fffffff;
642 /*
643 * If it didn't fit, set the final offset to here & return.
644 */
645 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
646 be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
647 xfs_trans_brelse(NULL, bp);
648 return 0;
649 }
650 }
651
652 /*
653 * Reached the end of the block.
654 * Set the offset to a non-existent block 1 and return.
655 */
656 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
657 0x7fffffff;
658 xfs_trans_brelse(NULL, bp);
659 return 0;
660}
661
662/*
663 * Log leaf entries from the block. 569 * Log leaf entries from the block.
664 */ 570 */
665static void 571static void
@@ -736,6 +642,7 @@ xfs_dir2_block_lookup(
736 * Fill in inode number, CI name if appropriate, release the block. 642 * Fill in inode number, CI name if appropriate, release the block.
737 */ 643 */
738 args->inumber = be64_to_cpu(dep->inumber); 644 args->inumber = be64_to_cpu(dep->inumber);
645 args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
739 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); 646 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
740 xfs_trans_brelse(args->trans, bp); 647 xfs_trans_brelse(args->trans, bp);
741 return XFS_ERROR(error); 648 return XFS_ERROR(error);
@@ -894,7 +801,7 @@ xfs_dir2_block_removename(
894 needlog = needscan = 0; 801 needlog = needscan = 0;
895 xfs_dir2_data_make_free(tp, bp, 802 xfs_dir2_data_make_free(tp, bp,
896 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), 803 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
897 xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); 804 xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
898 /* 805 /*
899 * Fix up the block tail. 806 * Fix up the block tail.
900 */ 807 */
@@ -968,6 +875,7 @@ xfs_dir2_block_replace(
968 * Change the inode number to the new value. 875 * Change the inode number to the new value.
969 */ 876 */
970 dep->inumber = cpu_to_be64(args->inumber); 877 dep->inumber = cpu_to_be64(args->inumber);
878 xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
971 xfs_dir2_data_log_entry(args->trans, bp, dep); 879 xfs_dir2_data_log_entry(args->trans, bp, dep);
972 xfs_dir3_data_check(dp, bp); 880 xfs_dir3_data_check(dp, bp);
973 return 0; 881 return 0;
@@ -1250,11 +1158,12 @@ xfs_dir2_sf_to_block(
1250 /* 1158 /*
1251 * Create entry for . 1159 * Create entry for .
1252 */ 1160 */
1253 dep = xfs_dir3_data_dot_entry_p(hdr); 1161 dep = xfs_dir3_data_dot_entry_p(mp, hdr);
1254 dep->inumber = cpu_to_be64(dp->i_ino); 1162 dep->inumber = cpu_to_be64(dp->i_ino);
1255 dep->namelen = 1; 1163 dep->namelen = 1;
1256 dep->name[0] = '.'; 1164 dep->name[0] = '.';
1257 tagp = xfs_dir2_data_entry_tag_p(dep); 1165 xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
1166 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
1258 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1167 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1259 xfs_dir2_data_log_entry(tp, bp, dep); 1168 xfs_dir2_data_log_entry(tp, bp, dep);
1260 blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); 1169 blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
@@ -1263,17 +1172,18 @@ xfs_dir2_sf_to_block(
1263 /* 1172 /*
1264 * Create entry for .. 1173 * Create entry for ..
1265 */ 1174 */
1266 dep = xfs_dir3_data_dotdot_entry_p(hdr); 1175 dep = xfs_dir3_data_dotdot_entry_p(mp, hdr);
1267 dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); 1176 dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
1268 dep->namelen = 2; 1177 dep->namelen = 2;
1269 dep->name[0] = dep->name[1] = '.'; 1178 dep->name[0] = dep->name[1] = '.';
1270 tagp = xfs_dir2_data_entry_tag_p(dep); 1179 xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
1180 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
1271 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1181 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1272 xfs_dir2_data_log_entry(tp, bp, dep); 1182 xfs_dir2_data_log_entry(tp, bp, dep);
1273 blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); 1183 blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
1274 blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, 1184 blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
1275 (char *)dep - (char *)hdr)); 1185 (char *)dep - (char *)hdr));
1276 offset = xfs_dir3_data_first_offset(hdr); 1186 offset = xfs_dir3_data_first_offset(mp);
1277 /* 1187 /*
1278 * Loop over existing entries, stuff them in. 1188 * Loop over existing entries, stuff them in.
1279 */ 1189 */
@@ -1312,10 +1222,12 @@ xfs_dir2_sf_to_block(
1312 * Copy a real entry. 1222 * Copy a real entry.
1313 */ 1223 */
1314 dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); 1224 dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
1315 dep->inumber = cpu_to_be64(xfs_dir2_sfe_get_ino(sfp, sfep)); 1225 dep->inumber = cpu_to_be64(xfs_dir3_sfe_get_ino(mp, sfp, sfep));
1316 dep->namelen = sfep->namelen; 1226 dep->namelen = sfep->namelen;
1227 xfs_dir3_dirent_put_ftype(mp, dep,
1228 xfs_dir3_sfe_get_ftype(mp, sfp, sfep));
1317 memcpy(dep->name, sfep->name, dep->namelen); 1229 memcpy(dep->name, sfep->name, dep->namelen);
1318 tagp = xfs_dir2_data_entry_tag_p(dep); 1230 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
1319 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1231 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1320 xfs_dir2_data_log_entry(tp, bp, dep); 1232 xfs_dir2_data_log_entry(tp, bp, dep);
1321 name.name = sfep->name; 1233 name.name = sfep->name;
@@ -1328,7 +1240,7 @@ xfs_dir2_sf_to_block(
1328 if (++i == sfp->count) 1240 if (++i == sfp->count)
1329 sfep = NULL; 1241 sfep = NULL;
1330 else 1242 else
1331 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 1243 sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
1332 } 1244 }
1333 /* Done with the temporary buffer */ 1245 /* Done with the temporary buffer */
1334 kmem_free(sfp); 1246 kmem_free(sfp);
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index c2930238005c..47e1326c169a 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -29,14 +29,12 @@
29#include "xfs_dinode.h" 29#include "xfs_dinode.h"
30#include "xfs_inode.h" 30#include "xfs_inode.h"
31#include "xfs_dir2_format.h" 31#include "xfs_dir2_format.h"
32#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h" 33#include "xfs_dir2_priv.h"
33#include "xfs_error.h" 34#include "xfs_error.h"
34#include "xfs_buf_item.h" 35#include "xfs_buf_item.h"
35#include "xfs_cksum.h" 36#include "xfs_cksum.h"
36 37
37STATIC xfs_dir2_data_free_t *
38xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
39
40/* 38/*
41 * Check the consistency of the data block. 39 * Check the consistency of the data block.
42 * The input can also be a block-format directory. 40 * The input can also be a block-format directory.
@@ -149,8 +147,10 @@ __xfs_dir3_data_check(
149 XFS_WANT_CORRUPTED_RETURN( 147 XFS_WANT_CORRUPTED_RETURN(
150 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); 148 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
151 XFS_WANT_CORRUPTED_RETURN( 149 XFS_WANT_CORRUPTED_RETURN(
152 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == 150 be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)) ==
153 (char *)dep - (char *)hdr); 151 (char *)dep - (char *)hdr);
152 XFS_WANT_CORRUPTED_RETURN(
153 xfs_dir3_dirent_get_ftype(mp, dep) < XFS_DIR3_FT_MAX);
154 count++; 154 count++;
155 lastfree = 0; 155 lastfree = 0;
156 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || 156 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
@@ -168,7 +168,7 @@ __xfs_dir3_data_check(
168 } 168 }
169 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); 169 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
170 } 170 }
171 p += xfs_dir2_data_entsize(dep->namelen); 171 p += xfs_dir3_data_entsize(mp, dep->namelen);
172 } 172 }
173 /* 173 /*
174 * Need to have seen all the entries and all the bestfree slots. 174 * Need to have seen all the entries and all the bestfree slots.
@@ -325,7 +325,7 @@ xfs_dir3_data_readahead(
325 * Given a data block and an unused entry from that block, 325 * Given a data block and an unused entry from that block,
326 * return the bestfree entry if any that corresponds to it. 326 * return the bestfree entry if any that corresponds to it.
327 */ 327 */
328STATIC xfs_dir2_data_free_t * 328xfs_dir2_data_free_t *
329xfs_dir2_data_freefind( 329xfs_dir2_data_freefind(
330 xfs_dir2_data_hdr_t *hdr, /* data block */ 330 xfs_dir2_data_hdr_t *hdr, /* data block */
331 xfs_dir2_data_unused_t *dup) /* data unused entry */ 331 xfs_dir2_data_unused_t *dup) /* data unused entry */
@@ -333,7 +333,7 @@ xfs_dir2_data_freefind(
333 xfs_dir2_data_free_t *dfp; /* bestfree entry */ 333 xfs_dir2_data_free_t *dfp; /* bestfree entry */
334 xfs_dir2_data_aoff_t off; /* offset value needed */ 334 xfs_dir2_data_aoff_t off; /* offset value needed */
335 struct xfs_dir2_data_free *bf; 335 struct xfs_dir2_data_free *bf;
336#if defined(DEBUG) && defined(__KERNEL__) 336#ifdef DEBUG
337 int matched; /* matched the value */ 337 int matched; /* matched the value */
338 int seenzero; /* saw a 0 bestfree entry */ 338 int seenzero; /* saw a 0 bestfree entry */
339#endif 339#endif
@@ -341,7 +341,7 @@ xfs_dir2_data_freefind(
341 off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 341 off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
342 bf = xfs_dir3_data_bestfree_p(hdr); 342 bf = xfs_dir3_data_bestfree_p(hdr);
343 343
344#if defined(DEBUG) && defined(__KERNEL__) 344#ifdef DEBUG
345 /* 345 /*
346 * Validate some consistency in the bestfree table. 346 * Validate some consistency in the bestfree table.
347 * Check order, non-overlapping entries, and if we find the 347 * Check order, non-overlapping entries, and if we find the
@@ -538,8 +538,8 @@ xfs_dir2_data_freescan(
538 else { 538 else {
539 dep = (xfs_dir2_data_entry_t *)p; 539 dep = (xfs_dir2_data_entry_t *)p;
540 ASSERT((char *)dep - (char *)hdr == 540 ASSERT((char *)dep - (char *)hdr ==
541 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep))); 541 be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)));
542 p += xfs_dir2_data_entsize(dep->namelen); 542 p += xfs_dir3_data_entsize(mp, dep->namelen);
543 } 543 }
544 } 544 }
545} 545}
@@ -629,7 +629,8 @@ xfs_dir2_data_log_entry(
629 struct xfs_buf *bp, 629 struct xfs_buf *bp,
630 xfs_dir2_data_entry_t *dep) /* data entry pointer */ 630 xfs_dir2_data_entry_t *dep) /* data entry pointer */
631{ 631{
632 xfs_dir2_data_hdr_t *hdr = bp->b_addr; 632 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
633 struct xfs_mount *mp = tp->t_mountp;
633 634
634 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || 635 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
635 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || 636 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -637,7 +638,7 @@ xfs_dir2_data_log_entry(
637 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); 638 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
638 639
639 xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), 640 xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
640 (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - 641 (uint)((char *)(xfs_dir3_data_entry_tag_p(mp, dep) + 1) -
641 (char *)hdr - 1)); 642 (char *)hdr - 1));
642} 643}
643 644
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index 7826782b8d78..9cf67381adf6 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -69,6 +69,23 @@
69#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ 69#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */
70 70
71/* 71/*
72 * Dirents in version 3 directories have a file type field. Additions to this
73 * list are an on-disk format change, requiring feature bits. Valid values
74 * are as follows:
75 */
76#define XFS_DIR3_FT_UNKNOWN 0
77#define XFS_DIR3_FT_REG_FILE 1
78#define XFS_DIR3_FT_DIR 2
79#define XFS_DIR3_FT_CHRDEV 3
80#define XFS_DIR3_FT_BLKDEV 4
81#define XFS_DIR3_FT_FIFO 5
82#define XFS_DIR3_FT_SOCK 6
83#define XFS_DIR3_FT_SYMLINK 7
84#define XFS_DIR3_FT_WHT 8
85
86#define XFS_DIR3_FT_MAX 9
87
88/*
72 * Byte offset in data block and shortform entry. 89 * Byte offset in data block and shortform entry.
73 */ 90 */
74typedef __uint16_t xfs_dir2_data_off_t; 91typedef __uint16_t xfs_dir2_data_off_t;
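
One consumer of these values is readdir, which has to translate the on-disk file type into the VFS DT_* constants for dir_emit(). A hedged sketch of that translation — the table and helper name here are illustrative, not necessarily the exact helper this series adds:

#include <linux/fs.h>	/* DT_* values */

static const unsigned char xfs_ft_to_dtype[XFS_DIR3_FT_MAX] = {
	[XFS_DIR3_FT_UNKNOWN]	= DT_UNKNOWN,
	[XFS_DIR3_FT_REG_FILE]	= DT_REG,
	[XFS_DIR3_FT_DIR]	= DT_DIR,
	[XFS_DIR3_FT_CHRDEV]	= DT_CHR,
	[XFS_DIR3_FT_BLKDEV]	= DT_BLK,
	[XFS_DIR3_FT_FIFO]	= DT_FIFO,
	[XFS_DIR3_FT_SOCK]	= DT_SOCK,
	[XFS_DIR3_FT_SYMLINK]	= DT_LNK,
	[XFS_DIR3_FT_WHT]	= DT_WHT,
};

/* Clamp out-of-range values instead of trusting the disk. */
static unsigned char xfs_dirent_dtype(unsigned char filetype)
{
	if (filetype >= XFS_DIR3_FT_MAX)
		return DT_UNKNOWN;
	return xfs_ft_to_dtype[filetype];
}
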
@@ -138,6 +155,9 @@ typedef struct xfs_dir2_sf_entry {
138 xfs_dir2_sf_off_t offset; /* saved offset */ 155 xfs_dir2_sf_off_t offset; /* saved offset */
139 __u8 name[]; /* name, variable size */ 156 __u8 name[]; /* name, variable size */
140 /* 157 /*
158 * A single byte containing the file type field follows the inode
159 * number for version 3 directory entries.
160 *
141 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a 161 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
142 * variable offset after the name. 162 * variable offset after the name.
143 */ 163 */
@@ -162,16 +182,6 @@ xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
162 put_unaligned_be16(off, &sfep->offset.i); 182 put_unaligned_be16(off, &sfep->offset.i);
163} 183}
164 184
165static inline int
166xfs_dir2_sf_entsize(struct xfs_dir2_sf_hdr *hdr, int len)
167{
168 return sizeof(struct xfs_dir2_sf_entry) + /* namelen + offset */
169 len + /* name */
170 (hdr->i8count ? /* ino */
171 sizeof(xfs_dir2_ino8_t) :
172 sizeof(xfs_dir2_ino4_t));
173}
174
175static inline struct xfs_dir2_sf_entry * 185static inline struct xfs_dir2_sf_entry *
176xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) 186xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
177{ 187{
@@ -179,14 +189,78 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
179 ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); 189 ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
180} 190}
181 191
192static inline int
193xfs_dir3_sf_entsize(
194 struct xfs_mount *mp,
195 struct xfs_dir2_sf_hdr *hdr,
196 int len)
197{
198 int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
199
200 count += len; /* name */
201 count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
202 sizeof(xfs_dir2_ino4_t); /* ino # */
203 if (xfs_sb_version_hasftype(&mp->m_sb))
204 count += sizeof(__uint8_t); /* file type */
205 return count;
206}
207
182static inline struct xfs_dir2_sf_entry * 208static inline struct xfs_dir2_sf_entry *
183xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr, 209xfs_dir3_sf_nextentry(
184 struct xfs_dir2_sf_entry *sfep) 210 struct xfs_mount *mp,
211 struct xfs_dir2_sf_hdr *hdr,
212 struct xfs_dir2_sf_entry *sfep)
185{ 213{
186 return (struct xfs_dir2_sf_entry *) 214 return (struct xfs_dir2_sf_entry *)
187 ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); 215 ((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen));
216}
217
218/*
219 * In dir3 shortform directories, the file type field is stored at a variable
220 * offset after the inode number. Because it's only a single byte, endian
221 * conversion is not necessary.
222 */
223static inline __uint8_t *
224xfs_dir3_sfe_ftypep(
225 struct xfs_dir2_sf_hdr *hdr,
226 struct xfs_dir2_sf_entry *sfep)
227{
228 return (__uint8_t *)&sfep->name[sfep->namelen];
188} 229}
189 230
231static inline __uint8_t
232xfs_dir3_sfe_get_ftype(
233 struct xfs_mount *mp,
234 struct xfs_dir2_sf_hdr *hdr,
235 struct xfs_dir2_sf_entry *sfep)
236{
237 __uint8_t *ftp;
238
239 if (!xfs_sb_version_hasftype(&mp->m_sb))
240 return XFS_DIR3_FT_UNKNOWN;
241
242 ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
243 if (*ftp >= XFS_DIR3_FT_MAX)
244 return XFS_DIR3_FT_UNKNOWN;
245 return *ftp;
246}
247
248static inline void
249xfs_dir3_sfe_put_ftype(
250 struct xfs_mount *mp,
251 struct xfs_dir2_sf_hdr *hdr,
252 struct xfs_dir2_sf_entry *sfep,
253 __uint8_t ftype)
254{
255 __uint8_t *ftp;
256
257 ASSERT(ftype < XFS_DIR3_FT_MAX);
258
259 if (!xfs_sb_version_hasftype(&mp->m_sb))
260 return;
261 ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
262 *ftp = ftype;
263}
190 264
191/* 265/*
192 * Data block structures. 266 * Data block structures.
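
Taken together, these helpers let a caller walk a shortform directory without knowing whether the ftype byte is present: xfs_dir3_sf_entsize() accounts for it and xfs_dir3_sf_nextentry() steps over it. A minimal sketch of such a walk, assuming the xfs_dir2_format.h definitions above are in scope; the visit() callback is a stand-in, and inode-number decoding is elided since it depends on i8count:

static void xfs_dir3_sf_walk(struct xfs_mount *mp,
			     struct xfs_dir2_sf_hdr *sfp,
			     void (*visit)(const unsigned char *name, int len,
					   unsigned char ftype))
{
	struct xfs_dir2_sf_entry *sfep = xfs_dir2_sf_firstentry(sfp);
	int i;

	for (i = 0; i < sfp->count; i++) {
		visit(sfep->name, sfep->namelen,
		      xfs_dir3_sfe_get_ftype(mp, sfp, sfep));
		sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
	}
}
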
@@ -286,12 +360,18 @@ xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
286 * Active entry in a data block. 360 * Active entry in a data block.
287 * 361 *
288 * Aligned to 8 bytes. After the variable length name field there is a 362 * Aligned to 8 bytes. After the variable length name field there is a
289 * 2 byte tag field, which can be accessed using xfs_dir2_data_entry_tag_p. 363 * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
364 *
365 * For dir3 structures, there is a file type field between the name and the tag.
366 * This can only be manipulated by helper functions. It is packed hard against
367 * the end of the name so any padding for rounding is between the file type and
368 * the tag.
290 */ 369 */
291typedef struct xfs_dir2_data_entry { 370typedef struct xfs_dir2_data_entry {
292 __be64 inumber; /* inode number */ 371 __be64 inumber; /* inode number */
293 __u8 namelen; /* name length */ 372 __u8 namelen; /* name length */
294 __u8 name[]; /* name bytes, no null */ 373 __u8 name[]; /* name bytes, no null */
374 /* __u8 filetype; */ /* type of inode we point to */
295 /* __be16 tag; */ /* starting offset of us */ 375 /* __be16 tag; */ /* starting offset of us */
296} xfs_dir2_data_entry_t; 376} xfs_dir2_data_entry_t;
297 377
@@ -311,20 +391,67 @@ typedef struct xfs_dir2_data_unused {
311/* 391/*
312 * Size of a data entry. 392 * Size of a data entry.
313 */ 393 */
314static inline int xfs_dir2_data_entsize(int n) 394static inline int
395__xfs_dir3_data_entsize(
396 bool ftype,
397 int n)
398{
399 int size = offsetof(struct xfs_dir2_data_entry, name[0]);
400
401 size += n;
402 size += sizeof(xfs_dir2_data_off_t);
403 if (ftype)
404 size += sizeof(__uint8_t);
405 return roundup(size, XFS_DIR2_DATA_ALIGN);
406}
407static inline int
408xfs_dir3_data_entsize(
409 struct xfs_mount *mp,
410 int n)
411{
412 bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false;
413 return __xfs_dir3_data_entsize(ftype, n);
414}
415
416static inline __uint8_t
417xfs_dir3_dirent_get_ftype(
418 struct xfs_mount *mp,
419 struct xfs_dir2_data_entry *dep)
420{
421 if (xfs_sb_version_hasftype(&mp->m_sb)) {
422 __uint8_t type = dep->name[dep->namelen];
423
424 ASSERT(type < XFS_DIR3_FT_MAX);
425 if (type < XFS_DIR3_FT_MAX)
426 return type;
427
428 }
429 return XFS_DIR3_FT_UNKNOWN;
430}
431
432static inline void
433xfs_dir3_dirent_put_ftype(
434 struct xfs_mount *mp,
435 struct xfs_dir2_data_entry *dep,
436 __uint8_t type)
315{ 437{
316 return (int)roundup(offsetof(struct xfs_dir2_data_entry, name[0]) + n + 438 ASSERT(type < XFS_DIR3_FT_MAX);
317 (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN); 439 ASSERT(dep->namelen != 0);
440
441 if (xfs_sb_version_hasftype(&mp->m_sb))
442 dep->name[dep->namelen] = type;
318} 443}
319 444
320/* 445/*
321 * Pointer to an entry's tag word. 446 * Pointer to an entry's tag word.
322 */ 447 */
323static inline __be16 * 448static inline __be16 *
324xfs_dir2_data_entry_tag_p(struct xfs_dir2_data_entry *dep) 449xfs_dir3_data_entry_tag_p(
450 struct xfs_mount *mp,
451 struct xfs_dir2_data_entry *dep)
325{ 452{
326 return (__be16 *)((char *)dep + 453 return (__be16 *)((char *)dep +
327 xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); 454 xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16));
328} 455}
329 456
330/* 457/*
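
A useful property of the new entry size calculation: because entries are rounded up to XFS_DIR2_DATA_ALIGN (8 bytes), the extra ftype byte is often absorbed by the existing padding and the entry size does not change at all. A standalone userspace illustration of the arithmetic, mirroring __xfs_dir3_data_entsize() on the assumption that the fixed header is 9 bytes (8-byte inumber plus 1-byte namelen):

#include <stdio.h>

#define ALIGN8(x)	(((x) + 7) & ~7)

/* 8-byte inumber + 1-byte namelen + name + optional ftype + 2-byte tag */
static int entsize(int ftype, int namelen)
{
	return ALIGN8(8 + 1 + namelen + (ftype ? 1 : 0) + 2);
}

int main(void)
{
	int n;

	/* For many name lengths the ftype byte hides in the padding. */
	for (n = 1; n <= 8; n++)
		printf("namelen %d: %2d -> %2d bytes\n",
		       n, entsize(0, n), entsize(1, n));
	return 0;
}
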
@@ -370,59 +497,58 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
370/* 497/*
371 * Offsets of . and .. in data space (always block 0) 498 * Offsets of . and .. in data space (always block 0)
372 * 499 *
373 * The macros are used for shortform directories as they have no headers to read 500 * XXX: there is scope for significant optimisation of the logic here. Right
374 * the magic number out of. Shortform directories need to know the size of the 501 * now we are checking for "dir3 format" over and over again. Ideally we should
375 * data block header because the sfe embeds the block offset of the entry into 502 * only do it once for each operation.
376 * it so that it doesn't change when format conversion occurs. Bad Things Happen 503 */
377 * if we don't follow this rule.
378 */
379#define XFS_DIR3_DATA_DOT_OFFSET(mp) \
380 xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb))
381#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \
382 (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1))
383#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \
384 (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2))
385
386static inline xfs_dir2_data_aoff_t 504static inline xfs_dir2_data_aoff_t
387xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr) 505xfs_dir3_data_dot_offset(struct xfs_mount *mp)
388{ 506{
389 return xfs_dir3_data_entry_offset(hdr); 507 return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
390} 508}
391 509
392static inline xfs_dir2_data_aoff_t 510static inline xfs_dir2_data_aoff_t
393xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr) 511xfs_dir3_data_dotdot_offset(struct xfs_mount *mp)
394{ 512{
395 return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1); 513 return xfs_dir3_data_dot_offset(mp) +
514 xfs_dir3_data_entsize(mp, 1);
396} 515}
397 516
398static inline xfs_dir2_data_aoff_t 517static inline xfs_dir2_data_aoff_t
399xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr) 518xfs_dir3_data_first_offset(struct xfs_mount *mp)
400{ 519{
401 return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2); 520 return xfs_dir3_data_dotdot_offset(mp) +
521 xfs_dir3_data_entsize(mp, 2);
402} 522}
403 523
404/* 524/*
405 * location of . and .. in data space (always block 0) 525 * location of . and .. in data space (always block 0)
406 */ 526 */
407static inline struct xfs_dir2_data_entry * 527static inline struct xfs_dir2_data_entry *
408xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr) 528xfs_dir3_data_dot_entry_p(
529 struct xfs_mount *mp,
530 struct xfs_dir2_data_hdr *hdr)
409{ 531{
410 return (struct xfs_dir2_data_entry *) 532 return (struct xfs_dir2_data_entry *)
411 ((char *)hdr + xfs_dir3_data_dot_offset(hdr)); 533 ((char *)hdr + xfs_dir3_data_dot_offset(mp));
412} 534}
413 535
414static inline struct xfs_dir2_data_entry * 536static inline struct xfs_dir2_data_entry *
415xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr) 537xfs_dir3_data_dotdot_entry_p(
538 struct xfs_mount *mp,
539 struct xfs_dir2_data_hdr *hdr)
416{ 540{
417 return (struct xfs_dir2_data_entry *) 541 return (struct xfs_dir2_data_entry *)
418 ((char *)hdr + xfs_dir3_data_dotdot_offset(hdr)); 542 ((char *)hdr + xfs_dir3_data_dotdot_offset(mp));
419} 543}
420 544
421static inline struct xfs_dir2_data_entry * 545static inline struct xfs_dir2_data_entry *
422xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr) 546xfs_dir3_data_first_entry_p(
547 struct xfs_mount *mp,
548 struct xfs_dir2_data_hdr *hdr)
423{ 549{
424 return (struct xfs_dir2_data_entry *) 550 return (struct xfs_dir2_data_entry *)
425 ((char *)hdr + xfs_dir3_data_first_offset(hdr)); 551 ((char *)hdr + xfs_dir3_data_first_offset(mp));
426} 552}
427 553
428/* 554/*
@@ -519,6 +645,9 @@ struct xfs_dir3_leaf {
519 645
520#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) 646#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
521 647
648extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
649 struct xfs_dir2_leaf *from);
650
522static inline int 651static inline int
523xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp) 652xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
524{ 653{
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 2aed25cae04d..1021c8356d08 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -31,6 +31,7 @@
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_bmap.h" 32#include "xfs_bmap.h"
33#include "xfs_dir2_format.h" 33#include "xfs_dir2_format.h"
34#include "xfs_dir2.h"
34#include "xfs_dir2_priv.h" 35#include "xfs_dir2_priv.h"
35#include "xfs_error.h" 36#include "xfs_error.h"
36#include "xfs_trace.h" 37#include "xfs_trace.h"
@@ -179,6 +180,11 @@ xfs_dir3_leaf_check_int(
179 return true; 180 return true;
180} 181}
181 182
183/*
184 * We verify the magic numbers before decoding the leaf header so that on debug
185 * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
186 * to incorrect magic numbers.
187 */
182static bool 188static bool
183xfs_dir3_leaf_verify( 189xfs_dir3_leaf_verify(
184 struct xfs_buf *bp, 190 struct xfs_buf *bp,
@@ -190,24 +196,25 @@ xfs_dir3_leaf_verify(
190 196
191 ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); 197 ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
192 198
193 xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
194 if (xfs_sb_version_hascrc(&mp->m_sb)) { 199 if (xfs_sb_version_hascrc(&mp->m_sb)) {
195 struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; 200 struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
201 __uint16_t magic3;
196 202
197 if ((magic == XFS_DIR2_LEAF1_MAGIC && 203 magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
198 leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) || 204 : XFS_DIR3_LEAFN_MAGIC;
199 (magic == XFS_DIR2_LEAFN_MAGIC &&
200 leafhdr.magic != XFS_DIR3_LEAFN_MAGIC))
201 return false;
202 205
206 if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
207 return false;
203 if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) 208 if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
204 return false; 209 return false;
205 if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) 210 if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
206 return false; 211 return false;
207 } else { 212 } else {
208 if (leafhdr.magic != magic) 213 if (leaf->hdr.info.magic != cpu_to_be16(magic))
209 return false; 214 return false;
210 } 215 }
216
217 xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
211 return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); 218 return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
212} 219}
213 220
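
The reordering in xfs_dir3_leaf_verify() is an instance of a general rule for buffer verifiers: validate the raw on-disk discriminator before decoding into incore form. A self-contained sketch of the pattern with stand-in structures (not the XFS ones):

#include <stdbool.h>
#include <stdint.h>

struct disk_hdr { uint8_t magic[2]; /* big-endian on disk */ };
struct incore_hdr { uint16_t magic; };

#define GOOD_MAGIC 0x3df1u

/*
 * Check the raw magic first and only then decode, so a corrupt buffer
 * fails verification instead of tripping assertions in the decoder.
 */
static bool verify_then_decode(const struct disk_hdr *d, struct incore_hdr *ic)
{
	uint16_t magic = (uint16_t)(d->magic[0] << 8 | d->magic[1]);

	if (magic != GOOD_MAGIC)
		return false;
	ic->magic = magic;	/* decode happens only after the check */
	return true;
}
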
@@ -695,7 +702,7 @@ xfs_dir2_leaf_addname(
695 ents = xfs_dir3_leaf_ents_p(leaf); 702 ents = xfs_dir3_leaf_ents_p(leaf);
696 xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); 703 xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
697 bestsp = xfs_dir2_leaf_bests_p(ltp); 704 bestsp = xfs_dir2_leaf_bests_p(ltp);
698 length = xfs_dir2_data_entsize(args->namelen); 705 length = xfs_dir3_data_entsize(mp, args->namelen);
699 706
700 /* 707 /*
701 * See if there are any entries with the same hash value 708 * See if there are any entries with the same hash value
@@ -896,7 +903,8 @@ xfs_dir2_leaf_addname(
896 dep->inumber = cpu_to_be64(args->inumber); 903 dep->inumber = cpu_to_be64(args->inumber);
897 dep->namelen = args->namelen; 904 dep->namelen = args->namelen;
898 memcpy(dep->name, args->name, dep->namelen); 905 memcpy(dep->name, args->name, dep->namelen);
899 tagp = xfs_dir2_data_entry_tag_p(dep); 906 xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
907 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
900 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 908 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
901 /* 909 /*
902 * Need to scan and fix up the bestfree table. 910
@@ -1083,396 +1091,6 @@ xfs_dir3_leaf_compact_x1(
1083 *highstalep = highstale; 1091 *highstalep = highstale;
1084} 1092}
1085 1093
1086struct xfs_dir2_leaf_map_info {
1087 xfs_extlen_t map_blocks; /* number of fsbs in map */
1088 xfs_dablk_t map_off; /* last mapped file offset */
1089 int map_size; /* total entries in *map */
1090 int map_valid; /* valid entries in *map */
1091 int nmap; /* mappings to ask xfs_bmapi */
1092 xfs_dir2_db_t curdb; /* db for current block */
1093 int ra_current; /* number of read-ahead blks */
1094 int ra_index; /* *map index for read-ahead */
1095 int ra_offset; /* map entry offset for ra */
1096 int ra_want; /* readahead count wanted */
1097 struct xfs_bmbt_irec map[]; /* map vector for blocks */
1098};
1099
1100STATIC int
1101xfs_dir2_leaf_readbuf(
1102 struct xfs_inode *dp,
1103 size_t bufsize,
1104 struct xfs_dir2_leaf_map_info *mip,
1105 xfs_dir2_off_t *curoff,
1106 struct xfs_buf **bpp)
1107{
1108 struct xfs_mount *mp = dp->i_mount;
1109 struct xfs_buf *bp = *bpp;
1110 struct xfs_bmbt_irec *map = mip->map;
1111 struct blk_plug plug;
1112 int error = 0;
1113 int length;
1114 int i;
1115 int j;
1116
1117 /*
1118 * If we have a buffer, we need to release it and
1119 * take it out of the mapping.
1120 */
1121
1122 if (bp) {
1123 xfs_trans_brelse(NULL, bp);
1124 bp = NULL;
1125 mip->map_blocks -= mp->m_dirblkfsbs;
1126 /*
1127 * Loop to get rid of the extents for the
1128 * directory block.
1129 */
1130 for (i = mp->m_dirblkfsbs; i > 0; ) {
1131 j = min_t(int, map->br_blockcount, i);
1132 map->br_blockcount -= j;
1133 map->br_startblock += j;
1134 map->br_startoff += j;
1135 /*
1136 * If mapping is done, pitch it from
1137 * the table.
1138 */
1139 if (!map->br_blockcount && --mip->map_valid)
1140 memmove(&map[0], &map[1],
1141 sizeof(map[0]) * mip->map_valid);
1142 i -= j;
1143 }
1144 }
1145
1146 /*
1147 * Recalculate the readahead blocks wanted.
1148 */
1149 mip->ra_want = howmany(bufsize + mp->m_dirblksize,
1150 mp->m_sb.sb_blocksize) - 1;
1151 ASSERT(mip->ra_want >= 0);
1152
1153 /*
1154 * If we don't have as many as we want, and we haven't
1155 * run out of data blocks, get some more mappings.
1156 */
1157 if (1 + mip->ra_want > mip->map_blocks &&
1158 mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
1159 /*
1160 * Get more bmaps, fill in after the ones
1161 * we already have in the table.
1162 */
1163 mip->nmap = mip->map_size - mip->map_valid;
1164 error = xfs_bmapi_read(dp, mip->map_off,
1165 xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
1166 mip->map_off,
1167 &map[mip->map_valid], &mip->nmap, 0);
1168
1169 /*
1170 * Don't know if we should ignore this or try to return an
1171 * error. The trouble with returning errors is that readdir
1172 * will just stop without actually passing the error through.
1173 */
1174 if (error)
1175 goto out; /* XXX */
1176
1177 /*
1178 * If we got all the mappings we asked for, set the final map
1179 * offset based on the last bmap value received. Otherwise,
1180 * we've reached the end.
1181 */
1182 if (mip->nmap == mip->map_size - mip->map_valid) {
1183 i = mip->map_valid + mip->nmap - 1;
1184 mip->map_off = map[i].br_startoff + map[i].br_blockcount;
1185 } else
1186 mip->map_off = xfs_dir2_byte_to_da(mp,
1187 XFS_DIR2_LEAF_OFFSET);
1188
1189 /*
1190 * Look for holes in the mapping, and eliminate them. Count up
1191 * the valid blocks.
1192 */
1193 for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
1194 if (map[i].br_startblock == HOLESTARTBLOCK) {
1195 mip->nmap--;
1196 length = mip->map_valid + mip->nmap - i;
1197 if (length)
1198 memmove(&map[i], &map[i + 1],
1199 sizeof(map[i]) * length);
1200 } else {
1201 mip->map_blocks += map[i].br_blockcount;
1202 i++;
1203 }
1204 }
1205 mip->map_valid += mip->nmap;
1206 }
1207
1208 /*
1209 * No valid mappings, so no more data blocks.
1210 */
1211 if (!mip->map_valid) {
1212 *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
1213 goto out;
1214 }
1215
1216 /*
1217 * Read the directory block starting at the first mapping.
1218 */
1219 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
1220 error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
1221 map->br_blockcount >= mp->m_dirblkfsbs ?
1222 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
1223
1224 /*
1225 * Should just skip over the data block instead of giving up.
1226 */
1227 if (error)
1228 goto out; /* XXX */
1229
1230 /*
1231 * Adjust the current amount of read-ahead: we just read a block that
1232 * was previously ra.
1233 */
1234 if (mip->ra_current)
1235 mip->ra_current -= mp->m_dirblkfsbs;
1236
1237 /*
1238 * Do we need more readahead?
1239 */
1240 blk_start_plug(&plug);
1241 for (mip->ra_index = mip->ra_offset = i = 0;
1242 mip->ra_want > mip->ra_current && i < mip->map_blocks;
1243 i += mp->m_dirblkfsbs) {
1244 ASSERT(mip->ra_index < mip->map_valid);
1245 /*
1246 * Read-ahead a contiguous directory block.
1247 */
1248 if (i > mip->ra_current &&
1249 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
1250 xfs_dir3_data_readahead(NULL, dp,
1251 map[mip->ra_index].br_startoff + mip->ra_offset,
1252 XFS_FSB_TO_DADDR(mp,
1253 map[mip->ra_index].br_startblock +
1254 mip->ra_offset));
1255 mip->ra_current = i;
1256 }
1257
1258 /*
1259 * Read-ahead a non-contiguous directory block. This doesn't
1260 * use our mapping, but this is a very rare case.
1261 */
1262 else if (i > mip->ra_current) {
1263 xfs_dir3_data_readahead(NULL, dp,
1264 map[mip->ra_index].br_startoff +
1265 mip->ra_offset, -1);
1266 mip->ra_current = i;
1267 }
1268
1269 /*
1270 * Advance offset through the mapping table.
1271 */
1272 for (j = 0; j < mp->m_dirblkfsbs; j++) {
1273 /*
1274 * The rest of this extent but not more than a dir
1275 * block.
1276 */
1277 length = min_t(int, mp->m_dirblkfsbs,
1278 map[mip->ra_index].br_blockcount -
1279 mip->ra_offset);
1280 j += length;
1281 mip->ra_offset += length;
1282
1283 /*
1284 * Advance to the next mapping if this one is used up.
1285 */
1286 if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
1287 mip->ra_offset = 0;
1288 mip->ra_index++;
1289 }
1290 }
1291 }
1292 blk_finish_plug(&plug);
1293
1294out:
1295 *bpp = bp;
1296 return error;
1297}
1298
1299/*
1300 * Getdents (readdir) for leaf and node directories.
1301 * This reads the data blocks only, so is the same for both forms.
1302 */
1303int /* error */
1304xfs_dir2_leaf_getdents(
1305 xfs_inode_t *dp, /* incore directory inode */
1306 struct dir_context *ctx,
1307 size_t bufsize)
1308{
1309 struct xfs_buf *bp = NULL; /* data block buffer */
1310 xfs_dir2_data_hdr_t *hdr; /* data block header */
1311 xfs_dir2_data_entry_t *dep; /* data entry */
1312 xfs_dir2_data_unused_t *dup; /* unused entry */
1313 int error = 0; /* error return value */
1314 int length; /* temporary length value */
1315 xfs_mount_t *mp; /* filesystem mount point */
1316 int byteoff; /* offset in current block */
1317 xfs_dir2_off_t curoff; /* current overall offset */
1318 xfs_dir2_off_t newoff; /* new curoff after new blk */
1319 char *ptr = NULL; /* pointer to current data */
1320 struct xfs_dir2_leaf_map_info *map_info;
1321
1322 /*
1323 * If the offset is at or past the largest allowed value,
1324 * give up right away.
1325 */
1326 if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
1327 return 0;
1328
1329 mp = dp->i_mount;
1330
1331 /*
1332 * Set up to bmap a number of blocks based on the caller's
1333 * buffer size, the directory block size, and the filesystem
1334 * block size.
1335 */
1336 length = howmany(bufsize + mp->m_dirblksize,
1337 mp->m_sb.sb_blocksize);
1338 map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
1339 (length * sizeof(struct xfs_bmbt_irec)),
1340 KM_SLEEP | KM_NOFS);
1341 map_info->map_size = length;
1342
1343 /*
1344 * Inside the loop we keep the main offset value as a byte offset
1345 * in the directory file.
1346 */
1347 curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
1348
1349 /*
1350 * Force this conversion through db so we truncate the offset
1351 * down to get the start of the data block.
1352 */
1353 map_info->map_off = xfs_dir2_db_to_da(mp,
1354 xfs_dir2_byte_to_db(mp, curoff));
1355
1356 /*
1357 * Loop over directory entries until we reach the end offset.
1358 * Get more blocks and readahead as necessary.
1359 */
1360 while (curoff < XFS_DIR2_LEAF_OFFSET) {
1361 /*
1362 * If we have no buffer, or we're off the end of the
1363 * current buffer, need to get another one.
1364 */
1365 if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
1366
1367 error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
1368 &curoff, &bp);
1369 if (error || !map_info->map_valid)
1370 break;
1371
1372 /*
1373 * Having done a read, we need to set a new offset.
1374 */
1375 newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
1376 /*
1377 * Start of the current block.
1378 */
1379 if (curoff < newoff)
1380 curoff = newoff;
1381 /*
1382 * Make sure we're in the right block.
1383 */
1384 else if (curoff > newoff)
1385 ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
1386 map_info->curdb);
1387 hdr = bp->b_addr;
1388 xfs_dir3_data_check(dp, bp);
1389 /*
1390 * Find our position in the block.
1391 */
1392 ptr = (char *)xfs_dir3_data_entry_p(hdr);
1393 byteoff = xfs_dir2_byte_to_off(mp, curoff);
1394 /*
1395 * Skip past the header.
1396 */
1397 if (byteoff == 0)
1398 curoff += xfs_dir3_data_entry_offset(hdr);
1399 /*
1400 * Skip past entries until we reach our offset.
1401 */
1402 else {
1403 while ((char *)ptr - (char *)hdr < byteoff) {
1404 dup = (xfs_dir2_data_unused_t *)ptr;
1405
1406 if (be16_to_cpu(dup->freetag)
1407 == XFS_DIR2_DATA_FREE_TAG) {
1408
1409 length = be16_to_cpu(dup->length);
1410 ptr += length;
1411 continue;
1412 }
1413 dep = (xfs_dir2_data_entry_t *)ptr;
1414 length =
1415 xfs_dir2_data_entsize(dep->namelen);
1416 ptr += length;
1417 }
1418 /*
-				 * Now set our real offset.
-				 */
-				curoff =
-					xfs_dir2_db_off_to_byte(mp,
-					    xfs_dir2_byte_to_db(mp, curoff),
-					    (char *)ptr - (char *)hdr);
-				if (ptr >= (char *)hdr + mp->m_dirblksize) {
-					continue;
-				}
-			}
-		}
-		/*
-		 * We have a pointer to an entry.
-		 * Is it a live one?
-		 */
-		dup = (xfs_dir2_data_unused_t *)ptr;
-		/*
-		 * No, it's unused, skip over it.
-		 */
-		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-			length = be16_to_cpu(dup->length);
-			ptr += length;
-			curoff += length;
-			continue;
-		}
-
-		dep = (xfs_dir2_data_entry_t *)ptr;
-		length = xfs_dir2_data_entsize(dep->namelen);
-
-		ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
-		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
-			    be64_to_cpu(dep->inumber), DT_UNKNOWN))
-			break;
-
-		/*
-		 * Advance to next entry in the block.
-		 */
-		ptr += length;
-		curoff += length;
-		/* bufsize may have just been a guess; don't go negative */
-		bufsize = bufsize > length ? bufsize - length : 0;
-	}
-
-	/*
-	 * All done. Set output offset value to current offset.
-	 */
-	if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
-		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
-	else
-		ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
-	kmem_free(map_info);
-	if (bp)
-		xfs_trans_brelse(NULL, bp);
-	return error;
-}
-
-
 /*
  * Log the bests entries indicated from a leaf1 block.
  */
@@ -1614,6 +1232,7 @@ xfs_dir2_leaf_lookup(
 	 * Return the found inode number & CI name if appropriate
 	 */
 	args->inumber = be64_to_cpu(dep->inumber);
+	args->filetype = xfs_dir3_dirent_get_ftype(dp->i_mount, dep);
 	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
 	xfs_trans_brelse(tp, dbp);
 	xfs_trans_brelse(tp, lbp);
@@ -1816,7 +1435,7 @@ xfs_dir2_leaf_removename(
 	 */
 	xfs_dir2_data_make_free(tp, dbp,
 		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-		xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+		xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
 	/*
 	 * We just mark the leaf entry stale by putting a null in it.
 	 */
@@ -1944,6 +1563,7 @@ xfs_dir2_leaf_replace(
 	 * Put the new inode number in, log it.
 	 */
 	dep->inumber = cpu_to_be64(args->inumber);
+	xfs_dir3_dirent_put_ftype(dp->i_mount, dep, args->filetype);
 	tp = args->trans;
 	xfs_dir2_data_log_entry(tp, dbp, dep);
 	xfs_dir3_leaf_check(dp->i_mount, lbp);
@@ -1975,10 +1595,6 @@ xfs_dir2_leaf_search_hash(
 	ents = xfs_dir3_leaf_ents_p(leaf);
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 
-#ifndef __KERNEL__
-	if (!leafhdr.count)
-		return 0;
-#endif
 	/*
 	 * Note, the table cannot be empty, so we have to go through the loop.
 	 * Binary search the leaf entries looking for our hash value.
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 2226a00acd15..4c3dba7ffb74 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -30,6 +30,7 @@
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -312,11 +313,13 @@ xfs_dir2_free_log_header(
 	struct xfs_trans	*tp,
 	struct xfs_buf		*bp)
 {
+#ifdef DEBUG
 	xfs_dir2_free_t		*free;		/* freespace structure */
 
 	free = bp->b_addr;
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
 	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
 	xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1);
 }
 
@@ -602,7 +605,7 @@ xfs_dir2_leafn_lookup_for_addname(
 		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
 		       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
 	}
-	length = xfs_dir2_data_entsize(args->namelen);
+	length = xfs_dir3_data_entsize(mp, args->namelen);
 	/*
 	 * Loop over leaf entries with the right hash value.
 	 */
@@ -813,6 +816,7 @@ xfs_dir2_leafn_lookup_for_entry(
 				xfs_trans_brelse(tp, state->extrablk.bp);
 			args->cmpresult = cmp;
 			args->inumber = be64_to_cpu(dep->inumber);
+			args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
 			*indexp = index;
 			state->extravalid = 1;
 			state->extrablk.bp = curbp;
@@ -1256,7 +1260,7 @@ xfs_dir2_leafn_remove(
 	longest = be16_to_cpu(bf[0].length);
 	needlog = needscan = 0;
 	xfs_dir2_data_make_free(tp, dbp, off,
-		xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+		xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
 	/*
 	 * Rescan the data block freespaces for bestfree.
 	 * Log the data block header if needed.
@@ -1708,7 +1712,7 @@ xfs_dir2_node_addname_int(
 	dp = args->dp;
 	mp = dp->i_mount;
 	tp = args->trans;
-	length = xfs_dir2_data_entsize(args->namelen);
+	length = xfs_dir3_data_entsize(mp, args->namelen);
 	/*
 	 * If we came in with a freespace block that means that lookup
 	 * found an entry with our hash value.  This is the freespace
@@ -2004,7 +2008,8 @@ xfs_dir2_node_addname_int(
 	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, dep->namelen);
-	tagp = xfs_dir2_data_entry_tag_p(dep);
+	xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
+	tagp = xfs_dir3_data_entry_tag_p(mp, dep);
 	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
 	xfs_dir2_data_log_entry(tp, dbp, dep);
 	/*
@@ -2224,6 +2229,7 @@ xfs_dir2_node_replace(
 	 * Fill in the new inode number and log the entry.
 	 */
 	dep->inumber = cpu_to_be64(inum);
+	xfs_dir3_dirent_put_ftype(state->mp, dep, args->filetype);
 	xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
 	rval = 0;
 	}
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 0511cda4a712..1bad84c40829 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -18,23 +18,26 @@
 #ifndef __XFS_DIR2_PRIV_H__
 #define __XFS_DIR2_PRIV_H__
 
+struct dir_context;
+
 /* xfs_dir2.c */
 extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
-extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
 extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
 				xfs_dir2_db_t *dbp);
-extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
-				struct xfs_buf *bp);
 extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
 				const unsigned char *name, int len);
 
-/* xfs_dir2_block.c */
-extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+
+extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
+					__uint8_t filetype);
 
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+			       struct xfs_buf **bpp);
 extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
-		struct dir_context *ctx);
 extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_block_removename(struct xfs_da_args *args);
 extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -48,9 +51,6 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 #define xfs_dir3_data_check(dp,bp)
 #endif
 
-extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
-
 extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
 extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
@@ -60,27 +60,10 @@ extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
 extern struct xfs_dir2_data_free *
 xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
 		struct xfs_dir2_data_unused *dup, int *loghead);
-extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
-		struct xfs_dir2_data_hdr *hdr, int *loghead);
 extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
 		struct xfs_buf **bpp);
-extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
-		struct xfs_dir2_data_entry *dep);
-extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
-		struct xfs_buf *bp);
-extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
-		struct xfs_dir2_data_unused *dup);
-extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
-		xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
-		int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
-		struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
-		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
-extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
-
 extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
 	xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
@@ -91,8 +74,6 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
 extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
 	struct xfs_dir2_leaf_entry *ents, int *indexp,
 	int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
-	size_t bufsize);
 extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
 	struct xfs_buf **bpp, __uint16_t magic);
 extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
@@ -144,18 +125,18 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t fbno, struct xfs_buf **bpp);
 
 /* xfs_dir2_sf.c */
-extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
-extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp,
-		struct xfs_dir2_sf_entry *sfep);
 extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
 		struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
 extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
 		int size, xfs_dir2_sf_hdr_t *sfhp);
 extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
 extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
 extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
 extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
 
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
+		size_t bufsize);
+
 #endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
new file mode 100644
index 000000000000..8f84153e98a8
--- /dev/null
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -0,0 +1,695 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_types.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_dinode.h"
31#include "xfs_inode.h"
32#include "xfs_dir2_format.h"
33#include "xfs_dir2.h"
34#include "xfs_dir2_priv.h"
35#include "xfs_error.h"
36#include "xfs_trace.h"
37#include "xfs_bmap.h"
38
39/*
40 * Directory file type support functions
41 */
42static unsigned char xfs_dir3_filetype_table[] = {
43 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
44 DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
45};
46
47unsigned char
48xfs_dir3_get_dtype(
49 struct xfs_mount *mp,
50 __uint8_t filetype)
51{
52 if (!xfs_sb_version_hasftype(&mp->m_sb))
53 return DT_UNKNOWN;
54
55 if (filetype >= XFS_DIR3_FT_MAX)
56 return DT_UNKNOWN;
57
58 return xfs_dir3_filetype_table[filetype];
59}
60/*
61 * Map the S_IFMT bits of a file mode (shifted down by S_SHIFT) to the
62 * on-disk XFS_DIR3_FT_* filetype, using the file mode to DT_* transformation
63 * defined in linux/fs.h. The result is propagated into the directory
64 * structure when appropriate for the given operation and filesystem config.
65 */
66const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
67 [0] = XFS_DIR3_FT_UNKNOWN,
68 [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
69 [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
70 [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
71 [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
72 [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
73 [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
74 [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
75};
76
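
Both tables above are plain array lookups. A minimal standalone sketch of the same idea in userspace C — the FT_* values are inlined by hand and the superblock feature bit is reduced to a flag, so these names are illustrative assumptions, not the kernel's API:

	#include <stdio.h>
	#include <dirent.h>		/* DT_* values */
	#include <sys/stat.h>		/* S_IF* values */

	#define FT_SHIFT 12		/* stand-in for S_SHIFT */

	enum { FT_UNKNOWN, FT_REG_FILE, FT_DIR, FT_CHRDEV, FT_BLKDEV,
	       FT_FIFO, FT_SOCK, FT_SYMLINK, FT_MAX };

	static const unsigned char ft_to_dtype[FT_MAX] = {
		[FT_UNKNOWN]  = DT_UNKNOWN, [FT_REG_FILE] = DT_REG,
		[FT_DIR]      = DT_DIR,     [FT_CHRDEV]   = DT_CHR,
		[FT_BLKDEV]   = DT_BLK,     [FT_FIFO]     = DT_FIFO,
		[FT_SOCK]     = DT_SOCK,    [FT_SYMLINK]  = DT_LNK,
	};

	static const unsigned char mode_to_ft[S_IFMT >> FT_SHIFT] = {
		[S_IFREG >> FT_SHIFT] = FT_REG_FILE,
		[S_IFDIR >> FT_SHIFT] = FT_DIR,
		[S_IFLNK >> FT_SHIFT] = FT_SYMLINK,
	};

	/* mirrors xfs_dir3_get_dtype(): no feature bit -> DT_UNKNOWN */
	static unsigned char get_dtype(int has_ftype, unsigned char ft)
	{
		if (!has_ftype || ft >= FT_MAX)
			return DT_UNKNOWN;
		return ft_to_dtype[ft];
	}

	int main(void)
	{
		unsigned int mode = S_IFLNK | 0777;
		unsigned char ft = mode_to_ft[(mode & S_IFMT) >> FT_SHIFT];

		/* expect the symlink filetype to map to DT_LNK */
		printf("ftype %u -> dtype %u\n", ft, get_dtype(1, ft));
		return 0;
	}

Note that DT_WHT has no S_IF* counterpart, which is why the mode table above only covers the seven standard file types.
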
77STATIC int
78xfs_dir2_sf_getdents(
79 xfs_inode_t *dp, /* incore directory inode */
80 struct dir_context *ctx)
81{
82 int i; /* shortform entry number */
83 xfs_mount_t *mp; /* filesystem mount point */
84 xfs_dir2_dataptr_t off; /* current entry's offset */
85 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
86 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
87 xfs_dir2_dataptr_t dot_offset;
88 xfs_dir2_dataptr_t dotdot_offset;
89 xfs_ino_t ino;
90
91 mp = dp->i_mount;
92
93 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
94 /*
95 * Give up if the directory is way too short.
96 */
97 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
98 ASSERT(XFS_FORCED_SHUTDOWN(mp));
99 return XFS_ERROR(EIO);
100 }
101
102 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
103 ASSERT(dp->i_df.if_u1.if_data != NULL);
104
105 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
106
107 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
108
109 /*
110 * If the block number in the offset is out of range, we're done.
111 */
112 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
113 return 0;
114
115 /*
116 * Precalculate offsets for . and .. as we will always need them.
117 *
118 * XXX(hch): the second argument is sometimes 0 and sometimes
119 * mp->m_dirdatablk.
120 */
121 dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
122 xfs_dir3_data_dot_offset(mp));
123 dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
124 xfs_dir3_data_dotdot_offset(mp));
125
126 /*
127 * Put . entry unless we're starting past it.
128 */
129 if (ctx->pos <= dot_offset) {
130 ctx->pos = dot_offset & 0x7fffffff;
131 if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
132 return 0;
133 }
134
135 /*
136 * Put .. entry unless we're starting past it.
137 */
138 if (ctx->pos <= dotdot_offset) {
139 ino = xfs_dir2_sf_get_parent_ino(sfp);
140 ctx->pos = dotdot_offset & 0x7fffffff;
141 if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
142 return 0;
143 }
144
145 /*
146 * Loop while there are more entries and dir_emit() keeps succeeding.
147 */
148 sfep = xfs_dir2_sf_firstentry(sfp);
149 for (i = 0; i < sfp->count; i++) {
150 __uint8_t filetype;
151
152 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
153 xfs_dir2_sf_get_offset(sfep));
154
155 if (ctx->pos > off) {
156 sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
157 continue;
158 }
159
160 ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
161 filetype = xfs_dir3_sfe_get_ftype(mp, sfp, sfep);
162 ctx->pos = off & 0x7fffffff;
163 if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
164 xfs_dir3_get_dtype(mp, filetype)))
165 return 0;
166 sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
167 }
168
169 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
170 0x7fffffff;
171 return 0;
172}
173
174/*
175 * Readdir for block directories.
176 */
177STATIC int
178xfs_dir2_block_getdents(
179 xfs_inode_t *dp, /* incore inode */
180 struct dir_context *ctx)
181{
182 xfs_dir2_data_hdr_t *hdr; /* block header */
183 struct xfs_buf *bp; /* buffer for block */
184 xfs_dir2_block_tail_t *btp; /* block tail */
185 xfs_dir2_data_entry_t *dep; /* block data entry */
186 xfs_dir2_data_unused_t *dup; /* block unused entry */
187 char *endptr; /* end of the data entries */
188 int error; /* error return value */
189 xfs_mount_t *mp; /* filesystem mount point */
190 char *ptr; /* current data entry */
191 int wantoff; /* starting block offset */
192 xfs_off_t cook;
193
194 mp = dp->i_mount;
195 /*
196 * If the block number in the offset is out of range, we're done.
197 */
198 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
199 return 0;
200
201 error = xfs_dir3_block_read(NULL, dp, &bp);
202 if (error)
203 return error;
204
205 /*
206 * Extract the byte offset we start at from the seek pointer.
207 * We'll skip entries before this.
208 */
209 wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
210 hdr = bp->b_addr;
211 xfs_dir3_data_check(dp, bp);
212 /*
213 * Set up values for the loop.
214 */
215 btp = xfs_dir2_block_tail_p(mp, hdr);
216 ptr = (char *)xfs_dir3_data_entry_p(hdr);
217 endptr = (char *)xfs_dir2_block_leaf_p(btp);
218
219 /*
220 * Loop over the data portion of the block.
221 * Each object is a real entry (dep) or an unused one (dup).
222 */
223 while (ptr < endptr) {
224 __uint8_t filetype;
225
226 dup = (xfs_dir2_data_unused_t *)ptr;
227 /*
228 * Unused, skip it.
229 */
230 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
231 ptr += be16_to_cpu(dup->length);
232 continue;
233 }
234
235 dep = (xfs_dir2_data_entry_t *)ptr;
236
237 /*
238 * Bump pointer for the next iteration.
239 */
240 ptr += xfs_dir3_data_entsize(mp, dep->namelen);
241 /*
242 * The entry is before the desired starting point, skip it.
243 */
244 if ((char *)dep - (char *)hdr < wantoff)
245 continue;
246
247 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
248 (char *)dep - (char *)hdr);
249
250 ctx->pos = cook & 0x7fffffff;
251 filetype = xfs_dir3_dirent_get_ftype(mp, dep);
252 /*
253 * If it didn't fit, set the final offset to here & return.
254 */
255 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
256 be64_to_cpu(dep->inumber),
257 xfs_dir3_get_dtype(mp, filetype))) {
258 xfs_trans_brelse(NULL, bp);
259 return 0;
260 }
261 }
262
263 /*
264 * Reached the end of the block.
265 * Set the offset to a non-existent block 1 and return.
266 */
267 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
268 0x7fffffff;
269 xfs_trans_brelse(NULL, bp);
270 return 0;
271}
272
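
The cookie handed back through ctx->pos packs a directory block number and an intra-block byte offset into one dataptr value. A toy model, assuming purely for illustration that a dataptr is (block << log2(dirblksize)) | offset — the real encoding lives in the xfs_dir2 format headers and depends on the configured directory block size:

	#include <stdio.h>
	#include <stdint.h>

	#define DIRBLK_LOG 12			/* assume 4k dir blocks */

	static uint64_t db_off_to_dataptr(uint32_t db, uint32_t off)
	{
		return ((uint64_t)db << DIRBLK_LOG) | off;
	}

	int main(void)
	{
		/* entry at byte 0x60 of directory block 1 */
		uint64_t cook = db_off_to_dataptr(1, 0x60);

		/* drop bit 31 so f_pos stays a positive 31-bit cookie */
		printf("pos = 0x%llx\n",
		       (unsigned long long)(cook & 0x7fffffff));
		return 0;
	}

The 0x7fffffff mask is the same reason the loops above compare against XFS_DIR2_MAX_DATAPTR before publishing an offset.
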
273struct xfs_dir2_leaf_map_info {
274 xfs_extlen_t map_blocks; /* number of fsbs in map */
275 xfs_dablk_t map_off; /* last mapped file offset */
276 int map_size; /* total entries in *map */
277 int map_valid; /* valid entries in *map */
278 int nmap; /* mappings to ask xfs_bmapi */
279 xfs_dir2_db_t curdb; /* db for current block */
280 int ra_current; /* number of read-ahead blks */
281 int ra_index; /* *map index for read-ahead */
282 int ra_offset; /* map entry offset for ra */
283 int ra_want; /* readahead count wanted */
284 struct xfs_bmbt_irec map[]; /* map vector for blocks */
285};
286
287STATIC int
288xfs_dir2_leaf_readbuf(
289 struct xfs_inode *dp,
290 size_t bufsize,
291 struct xfs_dir2_leaf_map_info *mip,
292 xfs_dir2_off_t *curoff,
293 struct xfs_buf **bpp)
294{
295 struct xfs_mount *mp = dp->i_mount;
296 struct xfs_buf *bp = *bpp;
297 struct xfs_bmbt_irec *map = mip->map;
298 struct blk_plug plug;
299 int error = 0;
300 int length;
301 int i;
302 int j;
303
304 /*
305 * If we have a buffer, we need to release it and
306 * take it out of the mapping.
307 */
308
309 if (bp) {
310 xfs_trans_brelse(NULL, bp);
311 bp = NULL;
312 mip->map_blocks -= mp->m_dirblkfsbs;
313 /*
314 * Loop to get rid of the extents for the
315 * directory block.
316 */
317 for (i = mp->m_dirblkfsbs; i > 0; ) {
318 j = min_t(int, map->br_blockcount, i);
319 map->br_blockcount -= j;
320 map->br_startblock += j;
321 map->br_startoff += j;
322 /*
323 * If mapping is done, pitch it from
324 * the table.
325 */
326 if (!map->br_blockcount && --mip->map_valid)
327 memmove(&map[0], &map[1],
328 sizeof(map[0]) * mip->map_valid);
329 i -= j;
330 }
331 }
332
333 /*
334 * Recalculate the readahead blocks wanted.
335 */
336 mip->ra_want = howmany(bufsize + mp->m_dirblksize,
337 mp->m_sb.sb_blocksize) - 1;
338 ASSERT(mip->ra_want >= 0);
339
340 /*
341 * If we don't have as many as we want, and we haven't
342 * run out of data blocks, get some more mappings.
343 */
344 if (1 + mip->ra_want > mip->map_blocks &&
345 mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
346 /*
347 * Get more bmaps, fill in after the ones
348 * we already have in the table.
349 */
350 mip->nmap = mip->map_size - mip->map_valid;
351 error = xfs_bmapi_read(dp, mip->map_off,
352 xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
353 mip->map_off,
354 &map[mip->map_valid], &mip->nmap, 0);
355
356 /*
357 * Don't know if we should ignore this or try to return an
358 * error. The trouble with returning errors is that readdir
359 * will just stop without actually passing the error through.
360 */
361 if (error)
362 goto out; /* XXX */
363
364 /*
365 * If we got all the mappings we asked for, set the final map
366 * offset based on the last bmap value received. Otherwise,
367 * we've reached the end.
368 */
369 if (mip->nmap == mip->map_size - mip->map_valid) {
370 i = mip->map_valid + mip->nmap - 1;
371 mip->map_off = map[i].br_startoff + map[i].br_blockcount;
372 } else
373 mip->map_off = xfs_dir2_byte_to_da(mp,
374 XFS_DIR2_LEAF_OFFSET);
375
376 /*
377 * Look for holes in the mapping, and eliminate them. Count up
378 * the valid blocks.
379 */
380 for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
381 if (map[i].br_startblock == HOLESTARTBLOCK) {
382 mip->nmap--;
383 length = mip->map_valid + mip->nmap - i;
384 if (length)
385 memmove(&map[i], &map[i + 1],
386 sizeof(map[i]) * length);
387 } else {
388 mip->map_blocks += map[i].br_blockcount;
389 i++;
390 }
391 }
392 mip->map_valid += mip->nmap;
393 }
394
395 /*
396 * No valid mappings, so no more data blocks.
397 */
398 if (!mip->map_valid) {
399 *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
400 goto out;
401 }
402
403 /*
404 * Read the directory block starting at the first mapping.
405 */
406 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
407 error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
408 map->br_blockcount >= mp->m_dirblkfsbs ?
409 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
410
411 /*
412 * Should just skip over the data block instead of giving up.
413 */
414 if (error)
415 goto out; /* XXX */
416
417 /*
418 * Adjust the current amount of read-ahead: we just read a block that
419 * was previously ra.
420 */
421 if (mip->ra_current)
422 mip->ra_current -= mp->m_dirblkfsbs;
423
424 /*
425 * Do we need more readahead?
426 */
427 blk_start_plug(&plug);
428 for (mip->ra_index = mip->ra_offset = i = 0;
429 mip->ra_want > mip->ra_current && i < mip->map_blocks;
430 i += mp->m_dirblkfsbs) {
431 ASSERT(mip->ra_index < mip->map_valid);
432 /*
433 * Read-ahead a contiguous directory block.
434 */
435 if (i > mip->ra_current &&
436 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
437 xfs_dir3_data_readahead(NULL, dp,
438 map[mip->ra_index].br_startoff + mip->ra_offset,
439 XFS_FSB_TO_DADDR(mp,
440 map[mip->ra_index].br_startblock +
441 mip->ra_offset));
442 mip->ra_current = i;
443 }
444
445 /*
446 * Read-ahead a non-contiguous directory block. This doesn't
447 * use our mapping, but this is a very rare case.
448 */
449 else if (i > mip->ra_current) {
450 xfs_dir3_data_readahead(NULL, dp,
451 map[mip->ra_index].br_startoff +
452 mip->ra_offset, -1);
453 mip->ra_current = i;
454 }
455
456 /*
457 * Advance offset through the mapping table.
458 */
459 for (j = 0; j < mp->m_dirblkfsbs; j++) {
460 /*
461 * The rest of this extent but not more than a dir
462 * block.
463 */
464 length = min_t(int, mp->m_dirblkfsbs,
465 map[mip->ra_index].br_blockcount -
466 mip->ra_offset);
467 j += length;
468 mip->ra_offset += length;
469
470 /*
471 * Advance to the next mapping if this one is used up.
472 */
473 if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
474 mip->ra_offset = 0;
475 mip->ra_index++;
476 }
477 }
478 }
479 blk_finish_plug(&plug);
480
481out:
482 *bpp = bp;
483 return error;
484}
485
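
The readahead loop above issues all of its reads under one block plug; the pattern, reduced to its skeleton (kernel context assumed, so this fragment is a sketch rather than a standalone program):

	#include <linux/blkdev.h>

	/*
	 * Requests queued between blk_start_plug() and blk_finish_plug()
	 * are held back and submitted to the device as one batch, which is
	 * what lets xfs_dir2_leaf_readbuf() merge its readahead I/O.
	 */
	static void readahead_batch(void)
	{
		struct blk_plug plug;

		blk_start_plug(&plug);
		/* ... issue any number of readahead requests ... */
		blk_finish_plug(&plug);
	}
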
486/*
487 * Getdents (readdir) for leaf and node directories.
488 * This reads the data blocks only, so is the same for both forms.
489 */
490STATIC int
491xfs_dir2_leaf_getdents(
492 xfs_inode_t *dp, /* incore directory inode */
493 struct dir_context *ctx,
494 size_t bufsize)
495{
496 struct xfs_buf *bp = NULL; /* data block buffer */
497 xfs_dir2_data_hdr_t *hdr; /* data block header */
498 xfs_dir2_data_entry_t *dep; /* data entry */
499 xfs_dir2_data_unused_t *dup; /* unused entry */
500 int error = 0; /* error return value */
501 int length; /* temporary length value */
502 xfs_mount_t *mp; /* filesystem mount point */
503 int byteoff; /* offset in current block */
504 xfs_dir2_off_t curoff; /* current overall offset */
505 xfs_dir2_off_t newoff; /* new curoff after new blk */
506 char *ptr = NULL; /* pointer to current data */
507 struct xfs_dir2_leaf_map_info *map_info;
508
509 /*
510 * If the offset is at or past the largest allowed value,
511 * give up right away.
512 */
513 if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
514 return 0;
515
516 mp = dp->i_mount;
517
518 /*
519 * Set up to bmap a number of blocks based on the caller's
520 * buffer size, the directory block size, and the filesystem
521 * block size.
522 */
523 length = howmany(bufsize + mp->m_dirblksize,
524 mp->m_sb.sb_blocksize);
525 map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
526 (length * sizeof(struct xfs_bmbt_irec)),
527 KM_SLEEP | KM_NOFS);
528 map_info->map_size = length;
529
530 /*
531 * Inside the loop we keep the main offset value as a byte offset
532 * in the directory file.
533 */
534 curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
535
536 /*
537 * Force this conversion through db so we truncate the offset
538 * down to get the start of the data block.
539 */
540 map_info->map_off = xfs_dir2_db_to_da(mp,
541 xfs_dir2_byte_to_db(mp, curoff));
542
543 /*
544 * Loop over directory entries until we reach the end offset.
545 * Get more blocks and readahead as necessary.
546 */
547 while (curoff < XFS_DIR2_LEAF_OFFSET) {
548 __uint8_t filetype;
549
550 /*
551 * If we have no buffer, or we're off the end of the
552 * current buffer, need to get another one.
553 */
554 if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
555
556 error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
557 &curoff, &bp);
558 if (error || !map_info->map_valid)
559 break;
560
561 /*
562 * Having done a read, we need to set a new offset.
563 */
564 newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
565 /*
566 * Start of the current block.
567 */
568 if (curoff < newoff)
569 curoff = newoff;
570 /*
571 * Make sure we're in the right block.
572 */
573 else if (curoff > newoff)
574 ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
575 map_info->curdb);
576 hdr = bp->b_addr;
577 xfs_dir3_data_check(dp, bp);
578 /*
579 * Find our position in the block.
580 */
581 ptr = (char *)xfs_dir3_data_entry_p(hdr);
582 byteoff = xfs_dir2_byte_to_off(mp, curoff);
583 /*
584 * Skip past the header.
585 */
586 if (byteoff == 0)
587 curoff += xfs_dir3_data_entry_offset(hdr);
588 /*
589 * Skip past entries until we reach our offset.
590 */
591 else {
592 while ((char *)ptr - (char *)hdr < byteoff) {
593 dup = (xfs_dir2_data_unused_t *)ptr;
594
595 if (be16_to_cpu(dup->freetag)
596 == XFS_DIR2_DATA_FREE_TAG) {
597
598 length = be16_to_cpu(dup->length);
599 ptr += length;
600 continue;
601 }
602 dep = (xfs_dir2_data_entry_t *)ptr;
603 length =
604 xfs_dir3_data_entsize(mp, dep->namelen);
605 ptr += length;
606 }
607 /*
608 * Now set our real offset.
609 */
610 curoff =
611 xfs_dir2_db_off_to_byte(mp,
612 xfs_dir2_byte_to_db(mp, curoff),
613 (char *)ptr - (char *)hdr);
614 if (ptr >= (char *)hdr + mp->m_dirblksize) {
615 continue;
616 }
617 }
618 }
619 /*
620 * We have a pointer to an entry.
621 * Is it a live one?
622 */
623 dup = (xfs_dir2_data_unused_t *)ptr;
624 /*
625 * No, it's unused, skip over it.
626 */
627 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
628 length = be16_to_cpu(dup->length);
629 ptr += length;
630 curoff += length;
631 continue;
632 }
633
634 dep = (xfs_dir2_data_entry_t *)ptr;
635 length = xfs_dir3_data_entsize(mp, dep->namelen);
636 filetype = xfs_dir3_dirent_get_ftype(mp, dep);
637
638 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
639 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
640 be64_to_cpu(dep->inumber),
641 xfs_dir3_get_dtype(mp, filetype)))
642 break;
643
644 /*
645 * Advance to next entry in the block.
646 */
647 ptr += length;
648 curoff += length;
649 /* bufsize may have just been a guess; don't go negative */
650 bufsize = bufsize > length ? bufsize - length : 0;
651 }
652
653 /*
654 * All done. Set output offset value to current offset.
655 */
656 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
657 ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
658 else
659 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
660 kmem_free(map_info);
661 if (bp)
662 xfs_trans_brelse(NULL, bp);
663 return error;
664}
665
666/*
667 * Read a directory.
668 */
669int
670xfs_readdir(
671 xfs_inode_t *dp,
672 struct dir_context *ctx,
673 size_t bufsize)
674{
675 int rval; /* return value */
676 int v; /* type-checking value */
677
678 trace_xfs_readdir(dp);
679
680 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
681 return XFS_ERROR(EIO);
682
683 ASSERT(S_ISDIR(dp->i_d.di_mode));
684 XFS_STATS_INC(xs_dir_getdents);
685
686 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
687 rval = xfs_dir2_sf_getdents(dp, ctx);
688 else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
689 ;
690 else if (v)
691 rval = xfs_dir2_block_getdents(dp, ctx);
692 else
693 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
694 return rval;
695}
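
For orientation, the caller of xfs_readdir() is the VFS ->iterate method in xfs_file.c, which derives bufsize from the directory size. Roughly — this is reconstructed from memory, so treat details such as the 32768 cap as assumptions rather than the exact upstream code:

	STATIC int
	xfs_file_readdir(
		struct file		*file,
		struct dir_context	*ctx)
	{
		struct inode		*inode = file_inode(file);
		xfs_inode_t		*ip = XFS_I(inode);
		size_t			bufsize;

		/*
		 * Size the bmap request off the directory size, capped so
		 * a huge directory does not get mapped all at once.
		 */
		bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);

		return xfs_readdir(ip, ctx, bufsize);
	}
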
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 97676a347da1..3ef6d402084c 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -29,8 +29,8 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_error.h"
-#include "xfs_dir2.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_trace.h"
 
@@ -95,7 +95,7 @@ xfs_dir2_sf_get_parent_ino(
 	return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
 }
 
-static void
+void
 xfs_dir2_sf_put_parent_ino(
 	struct xfs_dir2_sf_hdr	*hdr,
 	xfs_ino_t		ino)
@@ -105,31 +105,38 @@ xfs_dir2_sf_put_parent_ino(
 
 /*
  * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name. The inode numbers may only be accessed
- * through the helpers below.
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
  */
 static xfs_dir2_inou_t *
-xfs_dir2_sfe_inop(
+xfs_dir3_sfe_inop(
+	struct xfs_mount	*mp,
 	struct xfs_dir2_sf_entry *sfep)
 {
-	return (xfs_dir2_inou_t *)&sfep->name[sfep->namelen];
+	__uint8_t	*ptr = &sfep->name[sfep->namelen];
+	if (xfs_sb_version_hasftype(&mp->m_sb))
+		ptr++;
+	return (xfs_dir2_inou_t *)ptr;
 }
 
 xfs_ino_t
-xfs_dir2_sfe_get_ino(
+xfs_dir3_sfe_get_ino(
+	struct xfs_mount	*mp,
 	struct xfs_dir2_sf_hdr	*hdr,
 	struct xfs_dir2_sf_entry *sfep)
 {
-	return xfs_dir2_sf_get_ino(hdr, xfs_dir2_sfe_inop(sfep));
+	return xfs_dir2_sf_get_ino(hdr, xfs_dir3_sfe_inop(mp, sfep));
 }
 
-static void
-xfs_dir2_sfe_put_ino(
+void
+xfs_dir3_sfe_put_ino(
+	struct xfs_mount	*mp,
 	struct xfs_dir2_sf_hdr	*hdr,
 	struct xfs_dir2_sf_entry *sfep,
 	xfs_ino_t		ino)
 {
-	xfs_dir2_sf_put_ino(hdr, xfs_dir2_sfe_inop(sfep), ino);
+	xfs_dir2_sf_put_ino(hdr, xfs_dir3_sfe_inop(mp, sfep), ino);
 }
 
 /*
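
The layout walked by the new xfs_dir3_sfe_inop() can be sketched numerically. Assuming the usual shortform entry header of one namelen byte plus a two-byte offset, the packed inode number begins right after the name, shifted by one extra byte when the filetype field is present:

	#include <stdio.h>
	#include <stdint.h>

	/* illustrative: byte offset of the packed inumber in an sf entry */
	static size_t sfe_ino_offset(uint8_t namelen, int has_ftype)
	{
		/* namelen byte + 2 offset bytes + name, then optional ftype */
		return 1 + 2 + namelen + (has_ftype ? 1 : 0);
	}

	int main(void)
	{
		printf("inumber starts at byte %zu\n", sfe_ino_offset(5, 1));
		return 0;
	}
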
@@ -157,9 +164,16 @@ xfs_dir2_block_sfsize(
 	int			namelen;	/* total name bytes */
 	xfs_ino_t		parent = 0;	/* parent inode number */
 	int			size=0;		/* total computed size */
+	int			has_ftype;
 
 	mp = dp->i_mount;
 
+	/*
+	 * if there is a filetype field, add the extra byte to the namelen
+	 * for each entry that we see.
+	 */
+	has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
+
 	count = i8count = namelen = 0;
 	btp = xfs_dir2_block_tail_p(mp, hdr);
 	blp = xfs_dir2_block_leaf_p(btp);
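
A worked example of the accounting this sets up: on a filesystem with the ftype bit, three entries with name lengths 3, 5 and 8 contribute

	namelen = (3 + 1) + (5 + 1) + (8 + 1) = 19	/* has_ftype == 1 */
	namelen =  3      +  5      +  8      = 16	/* has_ftype == 0 */

so the shortform size estimate grows by exactly one byte per entry, matching the extra filetype byte each on-disk entry will carry.
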
@@ -188,9 +202,10 @@ xfs_dir2_block_sfsize(
 		if (!isdot)
 			i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
 #endif
+		/* take into account the file type field */
 		if (!isdot && !isdotdot) {
 			count++;
-			namelen += dep->namelen;
+			namelen += dep->namelen + has_ftype;
 		} else if (isdotdot)
 			parent = be64_to_cpu(dep->inumber);
 		/*
@@ -316,12 +331,14 @@ xfs_dir2_block_to_sf(
 				(xfs_dir2_data_aoff_t)
 				((char *)dep - (char *)hdr));
 			memcpy(sfep->name, dep->name, dep->namelen);
-			xfs_dir2_sfe_put_ino(sfp, sfep,
+			xfs_dir3_sfe_put_ino(mp, sfp, sfep,
 				be64_to_cpu(dep->inumber));
+			xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+				xfs_dir3_dirent_get_ftype(mp, dep));
 
-			sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+			sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
 		}
-		ptr += xfs_dir2_data_entsize(dep->namelen);
+		ptr += xfs_dir3_data_entsize(mp, dep->namelen);
 	}
 	ASSERT((char *)sfep - (char *)sfp == size);
 	xfs_dir2_sf_check(args);
@@ -372,7 +389,7 @@ xfs_dir2_sf_addname(
 	/*
 	 * Compute entry (and change in) size.
 	 */
-	add_entsize = xfs_dir2_sf_entsize(sfp, args->namelen);
+	add_entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
 	incr_isize = add_entsize;
 	objchange = 0;
 #if XFS_BIG_INUMS
@@ -466,8 +483,9 @@ xfs_dir2_sf_addname_easy(
 	/*
 	 * Grow the in-inode space.
 	 */
-	xfs_idata_realloc(dp, xfs_dir2_sf_entsize(sfp, args->namelen),
-			  XFS_DATA_FORK);
+	xfs_idata_realloc(dp,
+		xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen),
+		XFS_DATA_FORK);
 	/*
 	 * Need to set up again due to realloc of the inode data.
 	 */
@@ -479,7 +497,9 @@ xfs_dir2_sf_addname_easy(
 	sfep->namelen = args->namelen;
 	xfs_dir2_sf_put_offset(sfep, offset);
 	memcpy(sfep->name, args->name, sfep->namelen);
-	xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+	xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, args->inumber);
+	xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, args->filetype);
+
 	/*
 	 * Update the header and inode.
 	 */
@@ -519,11 +539,13 @@ xfs_dir2_sf_addname_hard(
 	xfs_dir2_sf_hdr_t	*oldsfp;	/* original shortform dir */
 	xfs_dir2_sf_entry_t	*sfep;		/* entry in new dir */
 	xfs_dir2_sf_hdr_t	*sfp;		/* new shortform dir */
+	struct xfs_mount	*mp;
 
 	/*
 	 * Copy the old directory to the stack buffer.
 	 */
 	dp = args->dp;
+	mp = dp->i_mount;
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	old_isize = (int)dp->i_d.di_size;
@@ -535,13 +557,13 @@ xfs_dir2_sf_addname_hard(
 	 * to insert the new entry.
 	 * If it's going to end up at the end then oldsfep will point there.
 	 */
-	for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount),
+	for (offset = xfs_dir3_data_first_offset(mp),
 	     oldsfep = xfs_dir2_sf_firstentry(oldsfp),
-	     add_datasize = xfs_dir2_data_entsize(args->namelen),
+	     add_datasize = xfs_dir3_data_entsize(mp, args->namelen),
 	     eof = (char *)oldsfep == &buf[old_isize];
 	     !eof;
-	     offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen),
-	     oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep),
+	     offset = new_offset + xfs_dir3_data_entsize(mp, oldsfep->namelen),
+	     oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep),
 	     eof = (char *)oldsfep == &buf[old_isize]) {
 		new_offset = xfs_dir2_sf_get_offset(oldsfep);
 		if (offset + add_datasize <= new_offset)
@@ -570,7 +592,8 @@ xfs_dir2_sf_addname_hard(
 	sfep->namelen = args->namelen;
 	xfs_dir2_sf_put_offset(sfep, offset);
 	memcpy(sfep->name, args->name, sfep->namelen);
-	xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+	xfs_dir3_sfe_put_ino(mp, sfp, sfep, args->inumber);
+	xfs_dir3_sfe_put_ftype(mp, sfp, sfep, args->filetype);
 	sfp->count++;
 #if XFS_BIG_INUMS
 	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
@@ -580,7 +603,7 @@ xfs_dir2_sf_addname_hard(
 	 * If there's more left to copy, do that.
 	 */
 	if (!eof) {
-		sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+		sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
 		memcpy(sfep, oldsfep, old_isize - nbytes);
 	}
 	kmem_free(buf);
@@ -616,8 +639,8 @@ xfs_dir2_sf_addname_pick(
 	mp = dp->i_mount;
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	size = xfs_dir2_data_entsize(args->namelen);
-	offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
+	size = xfs_dir3_data_entsize(mp, args->namelen);
+	offset = xfs_dir3_data_first_offset(mp);
 	sfep = xfs_dir2_sf_firstentry(sfp);
 	holefit = 0;
 	/*
@@ -629,8 +652,8 @@ xfs_dir2_sf_addname_pick(
 		if (!holefit)
 			holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
 		offset = xfs_dir2_sf_get_offset(sfep) +
-			xfs_dir2_data_entsize(sfep->namelen);
-		sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+			xfs_dir3_data_entsize(mp, sfep->namelen);
+		sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
 	}
 	/*
 	 * Calculate data bytes used excluding the new entry, if this
@@ -684,31 +707,34 @@ xfs_dir2_sf_check(
 	int			offset;		/* data offset */
 	xfs_dir2_sf_entry_t	*sfep;		/* shortform dir entry */
 	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
+	struct xfs_mount	*mp;
 
 	dp = args->dp;
+	mp = dp->i_mount;
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount);
+	offset = xfs_dir3_data_first_offset(mp);
 	ino = xfs_dir2_sf_get_parent_ino(sfp);
 	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
 
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
 	     i < sfp->count;
-	     i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+	     i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep)) {
 		ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
-		ino = xfs_dir2_sfe_get_ino(sfp, sfep);
+		ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
 		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
 		offset =
 			xfs_dir2_sf_get_offset(sfep) +
-			xfs_dir2_data_entsize(sfep->namelen);
+			xfs_dir3_data_entsize(mp, sfep->namelen);
+		ASSERT(xfs_dir3_sfe_get_ftype(mp, sfp, sfep) <
+			XFS_DIR3_FT_MAX);
 	}
 	ASSERT(i8count == sfp->i8count);
 	ASSERT(XFS_BIG_INUMS || i8count == 0);
 	ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
 	ASSERT(offset +
 	       (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
-	       (uint)sizeof(xfs_dir2_block_tail_t) <=
-	       dp->i_mount->m_dirblksize);
+	       (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dirblksize);
 }
 #endif /* DEBUG */
 
@@ -765,100 +791,6 @@ xfs_dir2_sf_create(
 	return 0;
 }
 
-int						/* error */
-xfs_dir2_sf_getdents(
-	xfs_inode_t		*dp,		/* incore directory inode */
-	struct dir_context	*ctx)
-{
-	int			i;		/* shortform entry number */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_dir2_dataptr_t	off;		/* current entry's offset */
-	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
-	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
-	xfs_dir2_dataptr_t	dot_offset;
-	xfs_dir2_dataptr_t	dotdot_offset;
-	xfs_ino_t		ino;
-
-	mp = dp->i_mount;
-
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Give up if the directory is way too short.
-	 */
-	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		return XFS_ERROR(EIO);
-	}
-
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-
-	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-
-	/*
-	 * If the block number in the offset is out of range, we're done.
-	 */
-	if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
-		return 0;
-
-	/*
-	 * Precalculate offsets for . and .. as we will always need them.
-	 *
-	 * XXX(hch): the second argument is sometimes 0 and sometimes
-	 * mp->m_dirdatablk.
-	 */
-	dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-					XFS_DIR3_DATA_DOT_OFFSET(mp));
-	dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-					XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
-
-	/*
-	 * Put . entry unless we're starting past it.
-	 */
-	if (ctx->pos <= dot_offset) {
-		ctx->pos = dot_offset & 0x7fffffff;
-		if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
-			return 0;
-	}
-
-	/*
-	 * Put .. entry unless we're starting past it.
-	 */
-	if (ctx->pos <= dotdot_offset) {
-		ino = xfs_dir2_sf_get_parent_ino(sfp);
-		ctx->pos = dotdot_offset & 0x7fffffff;
-		if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
-			return 0;
-	}
-
-	/*
-	 * Loop while there are more entries and put'ing works.
-	 */
-	sfep = xfs_dir2_sf_firstentry(sfp);
-	for (i = 0; i < sfp->count; i++) {
-		off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-				xfs_dir2_sf_get_offset(sfep));
-
-		if (ctx->pos > off) {
-			sfep = xfs_dir2_sf_nextentry(sfp, sfep);
-			continue;
-		}
-
-		ino = xfs_dir2_sfe_get_ino(sfp, sfep);
-		ctx->pos = off & 0x7fffffff;
-		if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
-			    ino, DT_UNKNOWN))
-			return 0;
-		sfep = xfs_dir2_sf_nextentry(sfp, sfep);
-	}
-
-	ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
-			0x7fffffff;
-	return 0;
-}
-
 /*
  * Lookup an entry in a shortform directory.
  * Returns EEXIST if found, ENOENT if not found.
@@ -898,6 +830,7 @@ xfs_dir2_sf_lookup(
 	if (args->namelen == 1 && args->name[0] == '.') {
 		args->inumber = dp->i_ino;
 		args->cmpresult = XFS_CMP_EXACT;
+		args->filetype = XFS_DIR3_FT_DIR;
 		return XFS_ERROR(EEXIST);
 	}
 	/*
@@ -907,6 +840,7 @@ xfs_dir2_sf_lookup(
 	    args->name[0] == '.' && args->name[1] == '.') {
 		args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
 		args->cmpresult = XFS_CMP_EXACT;
+		args->filetype = XFS_DIR3_FT_DIR;
 		return XFS_ERROR(EEXIST);
 	}
 	/*
@@ -914,7 +848,7 @@ xfs_dir2_sf_lookup(
 	 */
 	ci_sfep = NULL;
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-	     i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+	     i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
 		/*
 		 * Compare name and if it's an exact match, return the inode
 		 * number. If it's the first case-insensitive match, store the
@@ -924,7 +858,10 @@ xfs_dir2_sf_lookup(
 				       sfep->namelen);
 		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
 			args->cmpresult = cmp;
-			args->inumber = xfs_dir2_sfe_get_ino(sfp, sfep);
+			args->inumber = xfs_dir3_sfe_get_ino(dp->i_mount,
+							     sfp, sfep);
+			args->filetype = xfs_dir3_sfe_get_ftype(dp->i_mount,
+								sfp, sfep);
 			if (cmp == XFS_CMP_EXACT)
 				return XFS_ERROR(EEXIST);
 			ci_sfep = sfep;
@@ -980,10 +917,10 @@ xfs_dir2_sf_removename(
 	 * Find the one we're deleting.
 	 */
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-	     i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+	     i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
 		if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
 						XFS_CMP_EXACT) {
-			ASSERT(xfs_dir2_sfe_get_ino(sfp, sfep) ==
+			ASSERT(xfs_dir3_sfe_get_ino(dp->i_mount, sfp, sfep) ==
 			       args->inumber);
 			break;
 		}
@@ -997,7 +934,7 @@ xfs_dir2_sf_removename(
 	 * Calculate sizes.
 	 */
 	byteoff = (int)((char *)sfep - (char *)sfp);
-	entsize = xfs_dir2_sf_entsize(sfp, args->namelen);
+	entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
 	newsize = oldsize - entsize;
 	/*
 	 * Copy the part if any after the removed entry, sliding it down.
@@ -1113,16 +1050,19 @@ xfs_dir2_sf_replace(
 	 * Normal entry, look for the name.
 	 */
 	else {
-		for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
-		     i < sfp->count;
-		     i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+		for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+		     i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
 			if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
 								XFS_CMP_EXACT) {
 #if XFS_BIG_INUMS || defined(DEBUG)
-				ino = xfs_dir2_sfe_get_ino(sfp, sfep);
+				ino = xfs_dir3_sfe_get_ino(dp->i_mount,
+							   sfp, sfep);
 				ASSERT(args->inumber != ino);
 #endif
-				xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+				xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep,
+						     args->inumber);
+				xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep,
+						       args->filetype);
 				break;
 			}
 		}
@@ -1189,10 +1129,12 @@ xfs_dir2_sf_toino4(
 	int			oldsize;	/* old inode size */
 	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
 	xfs_dir2_sf_hdr_t	*sfp;		/* new sf directory */
+	struct xfs_mount	*mp;
 
 	trace_xfs_dir2_sf_toino4(args);
 
 	dp = args->dp;
+	mp = dp->i_mount;
 
 	/*
 	 * Copy the old directory to the buffer.
@@ -1230,13 +1172,15 @@ xfs_dir2_sf_toino4(
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
 		  oldsfep = xfs_dir2_sf_firstentry(oldsfp);
 	     i < sfp->count;
-	     i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
-		  oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+	     i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
+		  oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
 		sfep->namelen = oldsfep->namelen;
 		sfep->offset = oldsfep->offset;
 		memcpy(sfep->name, oldsfep->name, sfep->namelen);
-		xfs_dir2_sfe_put_ino(sfp, sfep,
-			xfs_dir2_sfe_get_ino(oldsfp, oldsfep));
+		xfs_dir3_sfe_put_ino(mp, sfp, sfep,
+			xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
+		xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+			xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
 	}
 	/*
 	 * Clean up the inode.
@@ -1264,10 +1208,12 @@ xfs_dir2_sf_toino8(
 	int			oldsize;	/* old inode size */
 	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
 	xfs_dir2_sf_hdr_t	*sfp;		/* new sf directory */
+	struct xfs_mount	*mp;
 
 	trace_xfs_dir2_sf_toino8(args);
 
 	dp = args->dp;
+	mp = dp->i_mount;
 
 	/*
 	 * Copy the old directory to the buffer.
@@ -1305,13 +1251,15 @@ xfs_dir2_sf_toino8(
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
 		  oldsfep = xfs_dir2_sf_firstentry(oldsfp);
 	     i < sfp->count;
-	     i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
-		  oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+	     i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
+		  oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
 		sfep->namelen = oldsfep->namelen;
 		sfep->offset = oldsfep->offset;
 		memcpy(sfep->name, oldsfep->name, sfep->namelen);
-		xfs_dir2_sfe_put_ino(sfp, sfep,
-			xfs_dir2_sfe_get_ino(oldsfp, oldsfep));
+		xfs_dir3_sfe_put_ino(mp, sfp, sfep,
+			xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
+		xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+			xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
 	}
 	/*
 	 * Clean up the inode.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 69cf4fcde03e..45560ee1a4ba 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -16,12 +16,13 @@
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "xfs.h"
-#include "xfs_sb.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
-#include "xfs_trans.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 0adf27ecf3f1..1ee776d477c3 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_bit.h" 21#include "xfs_bit.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -28,6 +29,7 @@
28#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
29#include "xfs_inode.h" 30#include "xfs_inode.h"
30#include "xfs_bmap.h" 31#include "xfs_bmap.h"
32#include "xfs_bmap_util.h"
31#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
32#include "xfs_error.h" 34#include "xfs_error.h"
33#include "xfs_itable.h" 35#include "xfs_itable.h"
@@ -62,7 +64,8 @@ int xfs_dqerror_mod = 33;
62struct kmem_zone *xfs_qm_dqtrxzone; 64struct kmem_zone *xfs_qm_dqtrxzone;
63static struct kmem_zone *xfs_qm_dqzone; 65static struct kmem_zone *xfs_qm_dqzone;
64 66
65static struct lock_class_key xfs_dquot_other_class; 67static struct lock_class_key xfs_dquot_group_class;
68static struct lock_class_key xfs_dquot_project_class;
66 69
67/* 70/*
68 * This is called to free all the memory associated with a dquot 71 * This is called to free all the memory associated with a dquot
@@ -701,8 +704,20 @@ xfs_qm_dqread(
701 * Make sure group quotas have a different lock class than user 704 * Make sure group quotas have a different lock class than user
702 * quotas. 705 * quotas.
703 */ 706 */
704 if (!(type & XFS_DQ_USER)) 707 switch (type) {
705 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); 708 case XFS_DQ_USER:
709 /* uses the default lock class */
710 break;
711 case XFS_DQ_GROUP:
712 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
713 break;
714 case XFS_DQ_PROJ:
715 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
716 break;
717 default:
718 ASSERT(0);
719 break;
720 }
706 721
707 XFS_STATS_INC(xs_qm_dquot); 722 XFS_STATS_INC(xs_qm_dquot);
708 723
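
The switch above gives each quota type its own lockdep class where the old code lumped group and project quotas together as "other". The pattern, pulled out of the diff for reference (sketch; q_qlock keeps the default class for user quotas):

static struct lock_class_key xfs_dquot_group_class;
static struct lock_class_key xfs_dquot_project_class;

switch (type) {
case XFS_DQ_GROUP:
	lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
	break;
case XFS_DQ_PROJ:
	lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
	break;
}
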
@@ -710,10 +725,8 @@ xfs_qm_dqread(
710 725
711 if (flags & XFS_QMOPT_DQALLOC) { 726 if (flags & XFS_QMOPT_DQALLOC) {
712 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 727 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
713 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), 728 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm,
714 XFS_QM_DQALLOC_LOG_RES(mp), 0, 729 XFS_QM_DQALLOC_SPACE_RES(mp), 0);
715 XFS_TRANS_PERM_LOG_RES,
716 XFS_WRITE_LOG_COUNT);
717 if (error) 730 if (error)
718 goto error1; 731 goto error1;
719 cancelflags = XFS_TRANS_RELEASE_LOG_RES; 732 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
@@ -940,13 +953,8 @@ xfs_qm_dqput_final(
940 953
941 trace_xfs_dqput_free(dqp); 954 trace_xfs_dqput_free(dqp);
942 955
943 mutex_lock(&qi->qi_lru_lock); 956 if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
944 if (list_empty(&dqp->q_lru)) {
945 list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
946 qi->qi_lru_count++;
947 XFS_STATS_INC(xs_qm_dquot_unused); 957 XFS_STATS_INC(xs_qm_dquot_unused);
948 }
949 mutex_unlock(&qi->qi_lru_lock);
950 958
951 /* 959 /*
952 * If we just added a udquot to the freelist, then we want to release 960 * If we just added a udquot to the freelist, then we want to release
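
For context on the hunk above: the hand-rolled free list (qi_lru_lock, qi_lru_list, qi_lru_count) is replaced by the generic list_lru API, which does its own locking and accounting. list_lru_add() returns true only when the item was not already on a list, which is why the stats bump stays conditional:

	if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
		XFS_STATS_INC(xs_qm_dquot_unused);
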
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 57aa4b03720c..e838d84b4e85 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -43,14 +44,15 @@ static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
43/* 44/*
44 * returns the number of iovecs needed to log the given dquot item. 45 * returns the number of iovecs needed to log the given dquot item.
45 */ 46 */
46STATIC uint 47STATIC void
47xfs_qm_dquot_logitem_size( 48xfs_qm_dquot_logitem_size(
48 struct xfs_log_item *lip) 49 struct xfs_log_item *lip,
50 int *nvecs,
51 int *nbytes)
49{ 52{
50 /* 53 *nvecs += 2;
51 * we need only two iovecs, one for the format, one for the real thing 54 *nbytes += sizeof(struct xfs_dq_logformat) +
52 */ 55 sizeof(struct xfs_disk_dquot);
53 return 2;
54} 56}
55 57
56/* 58/*
@@ -140,7 +142,8 @@ xfs_qm_dqunpin_wait(
140STATIC uint 142STATIC uint
141xfs_qm_dquot_logitem_push( 143xfs_qm_dquot_logitem_push(
142 struct xfs_log_item *lip, 144 struct xfs_log_item *lip,
143 struct list_head *buffer_list) 145 struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock)
146 __acquires(&lip->li_ailp->xa_lock)
144{ 147{
145 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; 148 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
146 struct xfs_buf *bp = NULL; 149 struct xfs_buf *bp = NULL;
@@ -285,11 +288,14 @@ static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
 285 * We only need 1 iovec for a quotaoff item. It just logs the 288 * We only need 1 iovec for a quotaoff item. It just logs the
286 * quotaoff_log_format structure. 289 * quotaoff_log_format structure.
287 */ 290 */
288STATIC uint 291STATIC void
289xfs_qm_qoff_logitem_size( 292xfs_qm_qoff_logitem_size(
290 struct xfs_log_item *lip) 293 struct xfs_log_item *lip,
294 int *nvecs,
295 int *nbytes)
291{ 296{
292 return 1; 297 *nvecs += 1;
298 *nbytes += sizeof(struct xfs_qoff_logitem);
293} 299}
294 300
295/* 301/*
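
Both size callbacks in this file now follow the new ->iop_size contract: instead of returning an iovec count, the callback accumulates a vector count and a byte total so the log/CIL code can size its buffers up front. The general shape, shown with a hypothetical log item (xfs_foo_* is illustrative, not a real XFS type):

STATIC void
xfs_foo_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	*nvecs += 1;	/* one region: the log format structure */
	*nbytes += sizeof(struct xfs_foo_log_format);
}
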
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 35d3f5b041dd..1123d93ff795 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -26,7 +26,6 @@
26#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
27#include "xfs_dinode.h" 27#include "xfs_dinode.h"
28#include "xfs_inode.h" 28#include "xfs_inode.h"
29#include "xfs_utils.h"
30#include "xfs_error.h" 29#include "xfs_error.h"
31 30
32#ifdef DEBUG 31#ifdef DEBUG
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index c585bc646395..066df425c14f 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -21,10 +21,11 @@
21#include "xfs_trans.h" 21#include "xfs_trans.h"
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_ag.h" 23#include "xfs_ag.h"
24#include "xfs_dir2.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
25#include "xfs_da_btree.h"
26#include "xfs_dir2_format.h"
27#include "xfs_dir2.h"
26#include "xfs_export.h" 28#include "xfs_export.h"
27#include "xfs_vnodeops.h"
28#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
29#include "xfs_inode.h" 30#include "xfs_inode.h"
30#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 85e9f87a1a7c..e43708e2f080 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -147,7 +147,7 @@ xfs_extent_busy_search(
147 * extent. If the overlap covers the beginning, the end, or all of the busy 147 * extent. If the overlap covers the beginning, the end, or all of the busy
148 * extent, the overlapping portion can be made unbusy and used for the 148 * extent, the overlapping portion can be made unbusy and used for the
149 * allocation. We can't split a busy extent because we can't modify a 149 * allocation. We can't split a busy extent because we can't modify a
150 * transaction/CIL context busy list, but we can update an entries block 150 * transaction/CIL context busy list, but we can update an entry's block
151 * number or length. 151 * number or length.
152 * 152 *
153 * Returns true if the extent can safely be reused, or false if the search 153 * Returns true if the extent can safely be reused, or false if the search
@@ -160,7 +160,8 @@ xfs_extent_busy_update_extent(
160 struct xfs_extent_busy *busyp, 160 struct xfs_extent_busy *busyp,
161 xfs_agblock_t fbno, 161 xfs_agblock_t fbno,
162 xfs_extlen_t flen, 162 xfs_extlen_t flen,
163 bool userdata) 163 bool userdata) __releases(&pag->pagb_lock)
164 __acquires(&pag->pagb_lock)
164{ 165{
165 xfs_agblock_t fend = fbno + flen; 166 xfs_agblock_t fend = fbno + flen;
166 xfs_agblock_t bbno = busyp->bno; 167 xfs_agblock_t bbno = busyp->bno;
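
The __releases/__acquires annotations added here (and on xfs_qm_dquot_logitem_push earlier) are sparse lock-context annotations: they document that the function is entered with the named lock held, drops it internally, and re-takes it before returning. A minimal illustration of the pattern (example function, not from this diff):

static void
example_update(
	struct xfs_perag	*pag)
		__releases(&pag->pagb_lock)
		__acquires(&pag->pagb_lock)
{
	spin_unlock(&pag->pagb_lock);
	/* ... work that must not be done under the lock ... */
	spin_lock(&pag->pagb_lock);
}
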
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 452920a3f03f..dc53e8febbbe 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -73,11 +73,22 @@ __xfs_efi_release(
73 * We only need 1 iovec for an efi item. It just logs the efi_log_format 73 * We only need 1 iovec for an efi item. It just logs the efi_log_format
74 * structure. 74 * structure.
75 */ 75 */
76STATIC uint 76static inline int
77xfs_efi_item_sizeof(
78 struct xfs_efi_log_item *efip)
79{
80 return sizeof(struct xfs_efi_log_format) +
81 (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
82}
83
84STATIC void
77xfs_efi_item_size( 85xfs_efi_item_size(
78 struct xfs_log_item *lip) 86 struct xfs_log_item *lip,
87 int *nvecs,
88 int *nbytes)
79{ 89{
80 return 1; 90 *nvecs += 1;
91 *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
81} 92}
82 93
83/* 94/*
@@ -93,21 +104,17 @@ xfs_efi_item_format(
93 struct xfs_log_iovec *log_vector) 104 struct xfs_log_iovec *log_vector)
94{ 105{
95 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 106 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
96 uint size;
97 107
98 ASSERT(atomic_read(&efip->efi_next_extent) == 108 ASSERT(atomic_read(&efip->efi_next_extent) ==
99 efip->efi_format.efi_nextents); 109 efip->efi_format.efi_nextents);
100 110
101 efip->efi_format.efi_type = XFS_LI_EFI; 111 efip->efi_format.efi_type = XFS_LI_EFI;
102
103 size = sizeof(xfs_efi_log_format_t);
104 size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
105 efip->efi_format.efi_size = 1; 112 efip->efi_format.efi_size = 1;
106 113
107 log_vector->i_addr = &efip->efi_format; 114 log_vector->i_addr = &efip->efi_format;
108 log_vector->i_len = size; 115 log_vector->i_len = xfs_efi_item_sizeof(efip);
109 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; 116 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
110 ASSERT(size >= sizeof(xfs_efi_log_format_t)); 117 ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t));
111} 118}
112 119
113 120
@@ -333,11 +340,22 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
333 * We only need 1 iovec for an efd item. It just logs the efd_log_format 340 * We only need 1 iovec for an efd item. It just logs the efd_log_format
334 * structure. 341 * structure.
335 */ 342 */
336STATIC uint 343static inline int
344xfs_efd_item_sizeof(
345 struct xfs_efd_log_item *efdp)
346{
347 return sizeof(xfs_efd_log_format_t) +
348 (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
349}
350
351STATIC void
337xfs_efd_item_size( 352xfs_efd_item_size(
338 struct xfs_log_item *lip) 353 struct xfs_log_item *lip,
354 int *nvecs,
355 int *nbytes)
339{ 356{
340 return 1; 357 *nvecs += 1;
358 *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
341} 359}
342 360
343/* 361/*
@@ -353,20 +371,16 @@ xfs_efd_item_format(
353 struct xfs_log_iovec *log_vector) 371 struct xfs_log_iovec *log_vector)
354{ 372{
355 struct xfs_efd_log_item *efdp = EFD_ITEM(lip); 373 struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
356 uint size;
357 374
358 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); 375 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
359 376
360 efdp->efd_format.efd_type = XFS_LI_EFD; 377 efdp->efd_format.efd_type = XFS_LI_EFD;
361
362 size = sizeof(xfs_efd_log_format_t);
363 size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
364 efdp->efd_format.efd_size = 1; 378 efdp->efd_format.efd_size = 1;
365 379
366 log_vector->i_addr = &efdp->efd_format; 380 log_vector->i_addr = &efdp->efd_format;
367 log_vector->i_len = size; 381 log_vector->i_len = xfs_efd_item_sizeof(efdp);
368 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; 382 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
369 ASSERT(size >= sizeof(xfs_efd_log_format_t)); 383 ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t));
370} 384}
371 385
372/* 386/*
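
Factoring out xfs_efi_item_sizeof()/xfs_efd_item_sizeof() gives ->iop_size and ->iop_format a single source of truth for the variable-length extent array. The sizing rule: the format structure already embeds efi_extents[1], so only nextents - 1 further extents add bytes. For example:

	/* illustrative: efi_nextents == 4 */
	size = sizeof(struct xfs_efi_log_format) +
	       (4 - 1) * sizeof(xfs_extent_t);
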
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 432222418c56..0ffbce32d569 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -18,93 +18,11 @@
18#ifndef __XFS_EXTFREE_ITEM_H__ 18#ifndef __XFS_EXTFREE_ITEM_H__
19#define __XFS_EXTFREE_ITEM_H__ 19#define __XFS_EXTFREE_ITEM_H__
20 20
21/* kernel only EFI/EFD definitions */
22
21struct xfs_mount; 23struct xfs_mount;
22struct kmem_zone; 24struct kmem_zone;
23 25
24typedef struct xfs_extent {
25 xfs_dfsbno_t ext_start;
26 xfs_extlen_t ext_len;
27} xfs_extent_t;
28
29/*
30 * Since an xfs_extent_t has types (start:64, len: 32)
31 * there are different alignments on 32 bit and 64 bit kernels.
32 * So we provide the different variants for use by a
33 * conversion routine.
34 */
35
36typedef struct xfs_extent_32 {
37 __uint64_t ext_start;
38 __uint32_t ext_len;
39} __attribute__((packed)) xfs_extent_32_t;
40
41typedef struct xfs_extent_64 {
42 __uint64_t ext_start;
43 __uint32_t ext_len;
44 __uint32_t ext_pad;
45} xfs_extent_64_t;
46
47/*
48 * This is the structure used to lay out an efi log item in the
49 * log. The efi_extents field is a variable size array whose
50 * size is given by efi_nextents.
51 */
52typedef struct xfs_efi_log_format {
53 __uint16_t efi_type; /* efi log item type */
54 __uint16_t efi_size; /* size of this item */
55 __uint32_t efi_nextents; /* # extents to free */
56 __uint64_t efi_id; /* efi identifier */
57 xfs_extent_t efi_extents[1]; /* array of extents to free */
58} xfs_efi_log_format_t;
59
60typedef struct xfs_efi_log_format_32 {
61 __uint16_t efi_type; /* efi log item type */
62 __uint16_t efi_size; /* size of this item */
63 __uint32_t efi_nextents; /* # extents to free */
64 __uint64_t efi_id; /* efi identifier */
65 xfs_extent_32_t efi_extents[1]; /* array of extents to free */
66} __attribute__((packed)) xfs_efi_log_format_32_t;
67
68typedef struct xfs_efi_log_format_64 {
69 __uint16_t efi_type; /* efi log item type */
70 __uint16_t efi_size; /* size of this item */
71 __uint32_t efi_nextents; /* # extents to free */
72 __uint64_t efi_id; /* efi identifier */
73 xfs_extent_64_t efi_extents[1]; /* array of extents to free */
74} xfs_efi_log_format_64_t;
75
76/*
77 * This is the structure used to lay out an efd log item in the
78 * log. The efd_extents array is a variable size array whose
79 * size is given by efd_nextents;
80 */
81typedef struct xfs_efd_log_format {
82 __uint16_t efd_type; /* efd log item type */
83 __uint16_t efd_size; /* size of this item */
84 __uint32_t efd_nextents; /* # of extents freed */
85 __uint64_t efd_efi_id; /* id of corresponding efi */
86 xfs_extent_t efd_extents[1]; /* array of extents freed */
87} xfs_efd_log_format_t;
88
89typedef struct xfs_efd_log_format_32 {
90 __uint16_t efd_type; /* efd log item type */
91 __uint16_t efd_size; /* size of this item */
92 __uint32_t efd_nextents; /* # of extents freed */
93 __uint64_t efd_efi_id; /* id of corresponding efi */
94 xfs_extent_32_t efd_extents[1]; /* array of extents freed */
95} __attribute__((packed)) xfs_efd_log_format_32_t;
96
97typedef struct xfs_efd_log_format_64 {
98 __uint16_t efd_type; /* efd log item type */
99 __uint16_t efd_size; /* size of this item */
100 __uint32_t efd_nextents; /* # of extents freed */
101 __uint64_t efd_efi_id; /* id of corresponding efi */
102 xfs_extent_64_t efd_extents[1]; /* array of extents freed */
103} xfs_efd_log_format_64_t;
104
105
106#ifdef __KERNEL__
107
108/* 26/*
109 * Max number of extents in fast allocation path. 27 * Max number of extents in fast allocation path.
110 */ 28 */
@@ -160,6 +78,4 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf,
160 xfs_efi_log_format_t *dst_efi_fmt); 78 xfs_efi_log_format_t *dst_efi_fmt);
161void xfs_efi_item_free(xfs_efi_log_item_t *); 79void xfs_efi_item_free(xfs_efi_log_item_t *);
162 80
163#endif /* __KERNEL__ */
164
165#endif /* __XFS_EXTFREE_ITEM_H__ */ 81#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index de3dc98f4e8f..4c749ab543d0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -28,10 +28,11 @@
28#include "xfs_inode.h" 28#include "xfs_inode.h"
29#include "xfs_inode_item.h" 29#include "xfs_inode_item.h"
30#include "xfs_bmap.h" 30#include "xfs_bmap.h"
31#include "xfs_bmap_util.h"
31#include "xfs_error.h" 32#include "xfs_error.h"
32#include "xfs_vnodeops.h"
33#include "xfs_da_btree.h" 33#include "xfs_da_btree.h"
34#include "xfs_dir2_format.h" 34#include "xfs_dir2_format.h"
35#include "xfs_dir2.h"
35#include "xfs_dir2_priv.h" 36#include "xfs_dir2_priv.h"
36#include "xfs_ioctl.h" 37#include "xfs_ioctl.h"
37#include "xfs_trace.h" 38#include "xfs_trace.h"
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 5170306a1009..ce78e654d37b 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -16,18 +16,18 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_log.h"
19#include "xfs_bmap_btree.h" 20#include "xfs_bmap_btree.h"
20#include "xfs_inum.h" 21#include "xfs_inum.h"
21#include "xfs_dinode.h" 22#include "xfs_dinode.h"
22#include "xfs_inode.h" 23#include "xfs_inode.h"
23#include "xfs_ag.h" 24#include "xfs_ag.h"
24#include "xfs_log.h"
25#include "xfs_trans.h" 25#include "xfs_trans.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_bmap.h" 28#include "xfs_bmap.h"
29#include "xfs_bmap_util.h"
29#include "xfs_alloc.h" 30#include "xfs_alloc.h"
30#include "xfs_utils.h"
31#include "xfs_mru_cache.h" 31#include "xfs_mru_cache.h"
32#include "xfs_filestream.h" 32#include "xfs_filestream.h"
33#include "xfs_trace.h" 33#include "xfs_trace.h"
@@ -668,8 +668,8 @@ exit:
668 */ 668 */
669int 669int
670xfs_filestream_new_ag( 670xfs_filestream_new_ag(
671 xfs_bmalloca_t *ap, 671 struct xfs_bmalloca *ap,
672 xfs_agnumber_t *agp) 672 xfs_agnumber_t *agp)
673{ 673{
674 int flags, err; 674 int flags, err;
675 xfs_inode_t *ip, *pip = NULL; 675 xfs_inode_t *ip, *pip = NULL;
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 09dd9af45434..6d61dbee8564 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -18,8 +18,6 @@
18#ifndef __XFS_FILESTREAM_H__ 18#ifndef __XFS_FILESTREAM_H__
19#define __XFS_FILESTREAM_H__ 19#define __XFS_FILESTREAM_H__
20 20
21#ifdef __KERNEL__
22
23struct xfs_mount; 21struct xfs_mount;
24struct xfs_inode; 22struct xfs_inode;
25struct xfs_perag; 23struct xfs_perag;
@@ -69,6 +67,4 @@ xfs_inode_is_filestream(
69 (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); 67 (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
70} 68}
71 69
72#endif /* __KERNEL__ */
73
74#endif /* __XFS_FILESTREAM_H__ */ 70#endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
new file mode 100644
index 000000000000..35c08ff54ca0
--- /dev/null
+++ b/fs/xfs/xfs_format.h
@@ -0,0 +1,169 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_FORMAT_H__
19#define __XFS_FORMAT_H__
20
21/*
22 * XFS On Disk Format Definitions
23 *
24 * This header file defines all the on-disk format definitions for
25 * general XFS objects. Directory and attribute related objects are defined in
26 * xfs_da_format.h, while log and log item formats are defined in
27 * xfs_log_format.h. Everything else goes here.
28 */
29
30struct xfs_mount;
31struct xfs_trans;
32struct xfs_inode;
33struct xfs_buf;
34struct xfs_ifork;
35
36/*
37 * RealTime Device format definitions
38 */
39
40/* Min and max rt extent sizes, specified in bytes */
41#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
42#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
43#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
44
45#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
46#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
47#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
48#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
49
50/*
51 * RT Summary and bit manipulation macros.
52 */
53#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
54#define XFS_SUMOFFSTOBLOCK(mp,s) \
55 (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
56#define XFS_SUMPTR(mp,bp,so) \
57 ((xfs_suminfo_t *)((bp)->b_addr + \
58 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
59
60#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
61#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
62#define XFS_BITTOWORD(mp,bi) \
63 ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
64
65#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
66#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
67
68#define XFS_RTLOBIT(w) xfs_lowbit32(w)
69#define XFS_RTHIBIT(w) xfs_highbit32(w)
70
71#if XFS_BIG_BLKNOS
72#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
73#else
74#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
75#endif
76
77/*
78 * Dquot and dquot block format definitions
79 */
80#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
81#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
82
83/*
84 * This is the main portion of the on-disk representation of quota
85 * information for a user. This is the q_core of the xfs_dquot_t that
86 * is kept in kernel memory. We pad this with some more expansion room
87 * to construct the on disk structure.
88 */
89typedef struct xfs_disk_dquot {
90 __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
91 __u8 d_version; /* dquot version */
92 __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
93 __be32 d_id; /* user,project,group id */
94 __be64 d_blk_hardlimit;/* absolute limit on disk blks */
95 __be64 d_blk_softlimit;/* preferred limit on disk blks */
96 __be64 d_ino_hardlimit;/* maximum # allocated inodes */
97 __be64 d_ino_softlimit;/* preferred inode limit */
98 __be64 d_bcount; /* disk blocks owned by the user */
99 __be64 d_icount; /* inodes owned by the user */
100 __be32 d_itimer; /* zero if within inode limits; if not,
101 this is when we refuse service */
102 __be32 d_btimer; /* similar to above; for disk blocks */
103 __be16 d_iwarns; /* warnings issued wrt num inodes */
104 __be16 d_bwarns; /* warnings issued wrt disk blocks */
105 __be32 d_pad0; /* 64 bit align */
106 __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
107 __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
108 __be64 d_rtbcount; /* realtime blocks owned */
109 __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
110 __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
111 __be16 d_pad;
112} xfs_disk_dquot_t;
113
114/*
115 * This is what goes on disk. This is separated from the xfs_disk_dquot because
116 * carrying the unnecessary padding would be a waste of memory.
117 */
118typedef struct xfs_dqblk {
119 xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
120 char dd_fill[4]; /* filling for posterity */
121
122 /*
123 * These two are only present on filesystems with the CRC bits set.
124 */
125 __be32 dd_crc; /* checksum */
126 __be64 dd_lsn; /* last modification in log */
127 uuid_t dd_uuid; /* location information */
128} xfs_dqblk_t;
129
130#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
131
132/*
133 * Remote symlink format and access functions.
134 */
135#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
136
137struct xfs_dsymlink_hdr {
138 __be32 sl_magic;
139 __be32 sl_offset;
140 __be32 sl_bytes;
141 __be32 sl_crc;
142 uuid_t sl_uuid;
143 __be64 sl_owner;
144 __be64 sl_blkno;
145 __be64 sl_lsn;
146};
147
148/*
149 * The maximum pathlen is 1024 bytes. Since the minimum file system
150 * blocksize is 512 bytes, we can get a max of 3 extents back from
151 * bmapi when crc headers are taken into account.
152 */
153#define XFS_SYMLINK_MAPS 3
154
155#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
156 ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
157 sizeof(struct xfs_dsymlink_hdr) : 0))
158
159int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
160int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
161 uint32_t size, struct xfs_buf *bp);
162bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
163 uint32_t size, struct xfs_buf *bp);
164void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
165 struct xfs_inode *ip, struct xfs_ifork *ifp);
166
167extern const struct xfs_buf_ops xfs_symlink_buf_ops;
168
169#endif /* __XFS_FORMAT_H__ */
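
The XFS_SYMLINK_MAPS value above can be sanity-checked from the definitions in this file: struct xfs_dsymlink_hdr is 4*4 + 16 + 3*8 = 56 bytes, so a minimal 512-byte block holds 512 - 56 = 456 bytes of path data, and a maximal 1024-byte path needs ceil(1024 / 456) = 3 blocks. The same arithmetic in code (sketch; mp assumed to be a CRC-enabled mount):

	int space = XFS_SYMLINK_BUF_SPACE(mp, 512);	/* 512 - 56 = 456 */
	int maps  = DIV_ROUND_UP(1024, space);		/* == 3 */
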
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index d04695545397..18272c766a50 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -240,7 +240,9 @@ typedef struct xfs_fsop_resblks {
240 240
241 241
242/* 242/*
243 * Minimum and maximum sizes needed for growth checks 243 * Minimum and maximum sizes needed for growth checks.
244 *
245 * Block counts are in units of filesystem blocks, not basic blocks.
244 */ 246 */
245#define XFS_MIN_AG_BLOCKS 64 247#define XFS_MIN_AG_BLOCKS 64
246#define XFS_MIN_LOG_BLOCKS 512ULL 248#define XFS_MIN_LOG_BLOCKS 512ULL
@@ -311,6 +313,17 @@ typedef struct xfs_bstat {
311} xfs_bstat_t; 313} xfs_bstat_t;
312 314
313/* 315/*
316 * Project quota id helpers (previously projid was 16bit only
317 * and using two 16bit values to hold new 32bit projid was choosen
318 * to retain compatibility with "old" filesystems).
319 */
320static inline __uint32_t
321bstat_get_projid(struct xfs_bstat *bs)
322{
323 return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
324}
325
326/*
314 * The user-level BulkStat Request interface structure. 327 * The user-level BulkStat Request interface structure.
315 */ 328 */
316typedef struct xfs_fsop_bulkreq { 329typedef struct xfs_fsop_bulkreq {
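
bstat_get_projid() hides the split encoding: reassembling the 32-bit id is a shift and an OR of the two 16-bit halves. Usage sketch:

	struct xfs_bstat	bs;	/* filled in by bulkstat */
	__uint32_t		projid = bstat_get_projid(&bs);
	/* == ((__uint32_t)bs.bs_projid_hi << 16) | bs.bs_projid_lo */
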
@@ -344,7 +357,7 @@ typedef struct xfs_error_injection {
344 * Speculative preallocation trimming. 357 * Speculative preallocation trimming.
345 */ 358 */
346#define XFS_EOFBLOCKS_VERSION 1 359#define XFS_EOFBLOCKS_VERSION 1
347struct xfs_eofblocks { 360struct xfs_fs_eofblocks {
348 __u32 eof_version; 361 __u32 eof_version;
349 __u32 eof_flags; 362 __u32 eof_flags;
350 uid_t eof_uid; 363 uid_t eof_uid;
@@ -450,6 +463,21 @@ typedef struct xfs_handle {
450 + (handle).ha_fid.fid_len) 463 + (handle).ha_fid.fid_len)
451 464
452/* 465/*
466 * Structure passed to XFS_IOC_SWAPEXT
467 */
468typedef struct xfs_swapext
469{
470 __int64_t sx_version; /* version */
471#define XFS_SX_VERSION 0
472 __int64_t sx_fdtarget; /* fd of target file */
473 __int64_t sx_fdtmp; /* fd of tmp file */
474 xfs_off_t sx_offset; /* offset into file */
474 xfs_off_t sx_length; /* length from offset */
476 char sx_pad[16]; /* pad space, unused */
477 xfs_bstat_t sx_stat; /* stat of target b4 copy */
478} xfs_swapext_t;
479
480/*
453 * Flags for going down operation 481 * Flags for going down operation
454 */ 482 */
455#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ 483#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
@@ -487,7 +515,7 @@ typedef struct xfs_handle {
487/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ 515/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
488#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) 516#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
489#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) 517#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
490#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) 518#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
491 519
492/* 520/*
493 * ioctl commands that replace IRIX syssgi()'s 521 * ioctl commands that replace IRIX syssgi()'s
@@ -511,8 +539,14 @@ typedef struct xfs_handle {
511#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) 539#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
512#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) 540#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
513/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ 541/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
542
514/* XFS_IOC_FREEZE -- FIFREEZE 119 */ 543/* XFS_IOC_FREEZE -- FIFREEZE 119 */
515/* XFS_IOC_THAW -- FITHAW 120 */ 544/* XFS_IOC_THAW -- FITHAW 120 */
545#ifndef FIFREEZE
546#define XFS_IOC_FREEZE _IOWR('X', 119, int)
547#define XFS_IOC_THAW _IOWR('X', 120, int)
548#endif
549
516#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) 550#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
517#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) 551#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
518#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) 552#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 614eb0cc3608..e64ee5288b86 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -203,8 +203,9 @@ xfs_growfs_data_private(
203 203
204 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); 204 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
205 tp->t_flags |= XFS_TRANS_RESERVE; 205 tp->t_flags |= XFS_TRANS_RESERVE;
206 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), 206 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
207 XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { 207 XFS_GROWFS_SPACE_RES(mp), 0);
208 if (error) {
208 xfs_trans_cancel(tp, 0); 209 xfs_trans_cancel(tp, 0);
209 return error; 210 return error;
210 } 211 }
@@ -739,8 +740,7 @@ xfs_fs_log_dummy(
739 int error; 740 int error;
740 741
741 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); 742 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
742 error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, 743 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
743 XFS_DEFAULT_LOG_COUNT);
744 if (error) { 744 if (error) {
745 xfs_trans_cancel(tp, 0); 745 xfs_trans_cancel(tp, 0);
746 return error; 746 return error;
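
Both conversions above show the new xfs_trans_reserve() calling convention: the log reservation details (size, count, permanence flags) now live in a struct xfs_trans_res fetched from the per-mount M_RES(mp) table, and only two raw counts remain as arguments (here the space reservation and, presumably, a zero realtime extent count). Shape reference, taken from the growfs hunk (error path elided):

	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
				  XFS_GROWFS_SPACE_RES(mp), 0);
	if (error)
		xfs_trans_cancel(tp, 0);
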
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 7a0c17d7ec09..ccf2fb143962 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -39,6 +39,7 @@
39#include "xfs_cksum.h" 39#include "xfs_cksum.h"
40#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
41#include "xfs_icreate_item.h" 41#include "xfs_icreate_item.h"
42#include "xfs_icache.h"
42 43
43 44
44/* 45/*
@@ -506,7 +507,7 @@ xfs_ialloc_next_ag(
506 507
507/* 508/*
508 * Select an allocation group to look for a free inode in, based on the parent 509 * Select an allocation group to look for a free inode in, based on the parent
509 * inode and then mode. Return the allocation group buffer. 510 * inode and the mode. Return the allocation group buffer.
510 */ 511 */
511STATIC xfs_agnumber_t 512STATIC xfs_agnumber_t
512xfs_ialloc_ag_select( 513xfs_ialloc_ag_select(
@@ -728,7 +729,7 @@ xfs_dialloc_ag(
728 error = xfs_inobt_get_rec(cur, &rec, &j); 729 error = xfs_inobt_get_rec(cur, &rec, &j);
729 if (error) 730 if (error)
730 goto error0; 731 goto error0;
731 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 732 XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
732 733
733 if (rec.ir_freecount > 0) { 734 if (rec.ir_freecount > 0) {
734 /* 735 /*
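
The one-character fix above is a real bugfix: this call site passes &j as the "record found" out-parameter of xfs_inobt_get_rec(), so the corruption check must test j, not the stale i left over from an earlier lookup. Corrected sequence, as in the hunk:

	error = xfs_inobt_get_rec(cur, &rec, &j);
	if (error)
		goto error0;
	XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
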
@@ -1341,7 +1342,7 @@ xfs_imap(
1341 xfs_agblock_t cluster_agbno; /* first block in inode cluster */ 1342 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1342 int error; /* error code */ 1343 int error; /* error code */
1343 int offset; /* index of inode in its buffer */ 1344 int offset; /* index of inode in its buffer */
1344 int offset_agbno; /* blks from chunk start to inode */ 1345 xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
1345 1346
1346 ASSERT(ino != NULLFSINO); 1347 ASSERT(ino != NULLFSINO);
1347 1348
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3f90e1ceb8d6..474807a401c8 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_types.h" 21#include "xfs_types.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_log_priv.h" 23#include "xfs_log_priv.h"
@@ -31,12 +32,12 @@
31#include "xfs_dinode.h" 32#include "xfs_dinode.h"
32#include "xfs_error.h" 33#include "xfs_error.h"
33#include "xfs_filestream.h" 34#include "xfs_filestream.h"
34#include "xfs_vnodeops.h"
35#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
36#include "xfs_quota.h" 36#include "xfs_quota.h"
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38#include "xfs_fsops.h" 38#include "xfs_fsops.h"
39#include "xfs_icache.h" 39#include "xfs_icache.h"
40#include "xfs_bmap_util.h"
40 41
41#include <linux/kthread.h> 42#include <linux/kthread.h>
42#include <linux/freezer.h> 43#include <linux/freezer.h>
@@ -47,7 +48,7 @@ STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
47/* 48/*
48 * Allocate and initialise an xfs_inode. 49 * Allocate and initialise an xfs_inode.
49 */ 50 */
50STATIC struct xfs_inode * 51struct xfs_inode *
51xfs_inode_alloc( 52xfs_inode_alloc(
52 struct xfs_mount *mp, 53 struct xfs_mount *mp,
53 xfs_ino_t ino) 54 xfs_ino_t ino)
@@ -97,7 +98,7 @@ xfs_inode_free_callback(
97 kmem_zone_free(xfs_inode_zone, ip); 98 kmem_zone_free(xfs_inode_zone, ip);
98} 99}
99 100
100STATIC void 101void
101xfs_inode_free( 102xfs_inode_free(
102 struct xfs_inode *ip) 103 struct xfs_inode *ip)
103{ 104{
@@ -118,11 +119,6 @@ xfs_inode_free(
118 ip->i_itemp = NULL; 119 ip->i_itemp = NULL;
119 } 120 }
120 121
121 /* asserts to verify all state is correct here */
122 ASSERT(atomic_read(&ip->i_pincount) == 0);
123 ASSERT(!spin_is_locked(&ip->i_flags_lock));
124 ASSERT(!xfs_isiflocked(ip));
125
126 /* 122 /*
127 * Because we use RCU freeing we need to ensure the inode always 123 * Because we use RCU freeing we need to ensure the inode always
128 * appears to be reclaimed with an invalid inode number when in the 124 * appears to be reclaimed with an invalid inode number when in the
@@ -134,6 +130,10 @@ xfs_inode_free(
134 ip->i_ino = 0; 130 ip->i_ino = 0;
135 spin_unlock(&ip->i_flags_lock); 131 spin_unlock(&ip->i_flags_lock);
136 132
133 /* asserts to verify all state is correct here */
134 ASSERT(atomic_read(&ip->i_pincount) == 0);
135 ASSERT(!xfs_isiflocked(ip));
136
137 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 137 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138} 138}
139 139
@@ -619,7 +619,7 @@ restart:
619 619
620/* 620/*
621 * Background scanning to trim post-EOF preallocated space. This is queued 621 * Background scanning to trim post-EOF preallocated space. This is queued
622 * based on the 'background_prealloc_discard_period' tunable (5m by default). 622 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
623 */ 623 */
624STATIC void 624STATIC void
625xfs_queue_eofblocks( 625xfs_queue_eofblocks(
@@ -1166,7 +1166,7 @@ xfs_reclaim_inodes(
1166 * them to be cleaned, which we hope will not be very long due to the 1166 * them to be cleaned, which we hope will not be very long due to the
1167 * background walker having already kicked the IO off on those dirty inodes. 1167 * background walker having already kicked the IO off on those dirty inodes.
1168 */ 1168 */
1169void 1169long
1170xfs_reclaim_inodes_nr( 1170xfs_reclaim_inodes_nr(
1171 struct xfs_mount *mp, 1171 struct xfs_mount *mp,
1172 int nr_to_scan) 1172 int nr_to_scan)
@@ -1175,7 +1175,7 @@ xfs_reclaim_inodes_nr(
1175 xfs_reclaim_work_queue(mp); 1175 xfs_reclaim_work_queue(mp);
1176 xfs_ail_push_all(mp->m_ail); 1176 xfs_ail_push_all(mp->m_ail);
1177 1177
1178 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 1178 return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
1179} 1179}
1180 1180
1181/* 1181/*
@@ -1203,15 +1203,15 @@ xfs_inode_match_id(
1203 struct xfs_inode *ip, 1203 struct xfs_inode *ip,
1204 struct xfs_eofblocks *eofb) 1204 struct xfs_eofblocks *eofb)
1205{ 1205{
1206 if (eofb->eof_flags & XFS_EOF_FLAGS_UID && 1206 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1207 ip->i_d.di_uid != eofb->eof_uid) 1207 !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1208 return 0; 1208 return 0;
1209 1209
1210 if (eofb->eof_flags & XFS_EOF_FLAGS_GID && 1210 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1211 ip->i_d.di_gid != eofb->eof_gid) 1211 !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1212 return 0; 1212 return 0;
1213 1213
1214 if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && 1214 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1215 xfs_get_projid(ip) != eofb->eof_prid) 1215 xfs_get_projid(ip) != eofb->eof_prid)
1216 return 0; 1216 return 0;
1217 1217
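
The comparison rewrite above follows the kernel's user-namespace rules: eof_uid/eof_gid become opaque kuid_t/kgid_t values (see the struct xfs_eofblocks added to xfs_icache.h below), so they must be compared with uid_eq()/gid_eq() against the VFS inode's i_uid/i_gid rather than against the raw on-disk di_uid/di_gid integers:

	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return 0;
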
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index a01afbb3909a..9ed68bb750f5 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -21,17 +21,36 @@
21struct xfs_mount; 21struct xfs_mount;
22struct xfs_perag; 22struct xfs_perag;
23 23
24struct xfs_eofblocks {
25 __u32 eof_flags;
26 kuid_t eof_uid;
27 kgid_t eof_gid;
28 prid_t eof_prid;
29 __u64 eof_min_file_size;
30};
31
24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
26 34
35/*
36 * Flags for xfs_iget()
37 */
38#define XFS_IGET_CREATE 0x1
39#define XFS_IGET_UNTRUSTED 0x2
40#define XFS_IGET_DONTCACHE 0x4
41
27int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, 42int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
28 uint flags, uint lock_flags, xfs_inode_t **ipp); 43 uint flags, uint lock_flags, xfs_inode_t **ipp);
29 44
45/* recovery needs direct inode allocation capability */
46struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
47void xfs_inode_free(struct xfs_inode *ip);
48
30void xfs_reclaim_worker(struct work_struct *work); 49void xfs_reclaim_worker(struct work_struct *work);
31 50
32int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 51int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
33int xfs_reclaim_inodes_count(struct xfs_mount *mp); 52int xfs_reclaim_inodes_count(struct xfs_mount *mp);
34void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 53long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
35 54
36void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 55void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
37 56
@@ -49,4 +68,39 @@ int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
49 int flags, void *args), 68 int flags, void *args),
50 int flags, void *args, int tag); 69 int flags, void *args, int tag);
51 70
71static inline int
72xfs_fs_eofblocks_from_user(
73 struct xfs_fs_eofblocks *src,
74 struct xfs_eofblocks *dst)
75{
76 if (src->eof_version != XFS_EOFBLOCKS_VERSION)
77 return EINVAL;
78
79 if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
80 return EINVAL;
81
82 if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
83 memchr_inv(src->pad64, 0, sizeof(src->pad64)))
84 return EINVAL;
85
86 dst->eof_flags = src->eof_flags;
87 dst->eof_prid = src->eof_prid;
88 dst->eof_min_file_size = src->eof_min_file_size;
89
90 dst->eof_uid = INVALID_UID;
91 if (src->eof_flags & XFS_EOF_FLAGS_UID) {
92 dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
93 if (!uid_valid(dst->eof_uid))
94 return EINVAL;
95 }
96
97 dst->eof_gid = INVALID_GID;
98 if (src->eof_flags & XFS_EOF_FLAGS_GID) {
99 dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
100 if (!gid_valid(dst->eof_gid))
101 return EINVAL;
102 }
103 return 0;
104}
105
52#endif 106#endif
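
xfs_fs_eofblocks_from_user() above bridges the renamed user ABI struct (xfs_fs_eofblocks, raw uid_t/gid_t) and the kernel-internal struct xfs_eofblocks (kuid_t/kgid_t): it rejects unknown versions, unknown flags and nonzero padding, then maps the ids through the caller's user namespace. Roughly how an ioctl handler would use it (sketch; 'arg' is the assumed user pointer, plumbing elided):

	struct xfs_fs_eofblocks	eofb;
	struct xfs_eofblocks	keofb;
	int			error;

	if (copy_from_user(&eofb, arg, sizeof(eofb)))
		return -EFAULT;
	error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
	if (error)
		return -error;	/* helper returns a positive errno */
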
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 7716a4e7375e..5a5a593994d4 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -20,23 +20,11 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_buf_item.h"
26#include "xfs_sb.h" 24#include "xfs_sb.h"
27#include "xfs_ag.h" 25#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h" 26#include "xfs_mount.h"
30#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_error.h" 28#include "xfs_error.h"
41#include "xfs_icreate_item.h" 29#include "xfs_icreate_item.h"
42 30
@@ -52,11 +40,14 @@ static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
52 * 40 *
53 * We only need one iovec for the icreate log structure. 41 * We only need one iovec for the icreate log structure.
54 */ 42 */
55STATIC uint 43STATIC void
56xfs_icreate_item_size( 44xfs_icreate_item_size(
57 struct xfs_log_item *lip) 45 struct xfs_log_item *lip,
46 int *nvecs,
47 int *nbytes)
58{ 48{
59 return 1; 49 *nvecs += 1;
50 *nbytes += sizeof(struct xfs_icreate_log);
60} 51}
61 52
62/* 53/*
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
index 88ba8aa0bc41..59e89f87c09b 100644
--- a/fs/xfs/xfs_icreate_item.h
+++ b/fs/xfs/xfs_icreate_item.h
@@ -18,24 +18,6 @@
18#ifndef XFS_ICREATE_ITEM_H 18#ifndef XFS_ICREATE_ITEM_H
19#define XFS_ICREATE_ITEM_H 1 19#define XFS_ICREATE_ITEM_H 1
20 20
21/*
22 * on disk log item structure
23 *
24 * Log recovery assumes the first two entries are the type and size and they fit
25 * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
26 * decoding can be done correctly.
27 */
28struct xfs_icreate_log {
29 __uint16_t icl_type; /* type of log format structure */
30 __uint16_t icl_size; /* size of log format structure */
31 __be32 icl_ag; /* ag being allocated in */
32 __be32 icl_agbno; /* start block of inode range */
33 __be32 icl_count; /* number of inodes to initialise */
34 __be32 icl_isize; /* size of inodes */
35 __be32 icl_length; /* length of extent to initialise */
36 __be32 icl_gen; /* inode generation number to use */
37};
38
39/* in memory log item structure */ 21/* in memory log item structure */
40struct xfs_icreate_item { 22struct xfs_icreate_item {
41 struct xfs_log_item ic_item; 23 struct xfs_log_item ic_item;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bb262c25c8de..e3d75385aa76 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -19,18 +19,23 @@
19 19
20#include "xfs.h" 20#include "xfs.h"
21#include "xfs_fs.h" 21#include "xfs_fs.h"
22#include "xfs_types.h" 22#include "xfs_format.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_inum.h" 24#include "xfs_inum.h"
25#include "xfs_trans.h" 25#include "xfs_trans.h"
26#include "xfs_trans_space.h"
26#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
27#include "xfs_sb.h" 28#include "xfs_sb.h"
28#include "xfs_ag.h" 29#include "xfs_ag.h"
29#include "xfs_mount.h" 30#include "xfs_mount.h"
31#include "xfs_da_btree.h"
32#include "xfs_dir2_format.h"
33#include "xfs_dir2.h"
30#include "xfs_bmap_btree.h" 34#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 35#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 36#include "xfs_ialloc_btree.h"
33#include "xfs_attr_sf.h" 37#include "xfs_attr_sf.h"
38#include "xfs_attr.h"
34#include "xfs_dinode.h" 39#include "xfs_dinode.h"
35#include "xfs_inode.h" 40#include "xfs_inode.h"
36#include "xfs_buf_item.h" 41#include "xfs_buf_item.h"
@@ -39,16 +44,15 @@
39#include "xfs_alloc.h" 44#include "xfs_alloc.h"
40#include "xfs_ialloc.h" 45#include "xfs_ialloc.h"
41#include "xfs_bmap.h" 46#include "xfs_bmap.h"
47#include "xfs_bmap_util.h"
42#include "xfs_error.h" 48#include "xfs_error.h"
43#include "xfs_utils.h"
44#include "xfs_quota.h" 49#include "xfs_quota.h"
45#include "xfs_filestream.h" 50#include "xfs_filestream.h"
46#include "xfs_vnodeops.h"
47#include "xfs_cksum.h" 51#include "xfs_cksum.h"
48#include "xfs_trace.h" 52#include "xfs_trace.h"
49#include "xfs_icache.h" 53#include "xfs_icache.h"
54#include "xfs_symlink.h"
50 55
51kmem_zone_t *xfs_ifork_zone;
52kmem_zone_t *xfs_inode_zone; 56kmem_zone_t *xfs_inode_zone;
53 57
54/* 58/*
@@ -58,9 +62,6 @@ kmem_zone_t *xfs_inode_zone;
58#define XFS_ITRUNC_MAX_EXTENTS 2 62#define XFS_ITRUNC_MAX_EXTENTS 2
59 63
60STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 64STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
61STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
62STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
63STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
64 65
65/* 66/*
66 * helper function to extract extent size hint from inode 67 * helper function to extract extent size hint from inode
@@ -310,623 +311,202 @@ xfs_isilocked(
310} 311}
311#endif 312#endif
312 313
313void
314__xfs_iflock(
315 struct xfs_inode *ip)
316{
317 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
318 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
319
320 do {
321 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
322 if (xfs_isiflocked(ip))
323 io_schedule();
324 } while (!xfs_iflock_nowait(ip));
325
326 finish_wait(wq, &wait.wait);
327}
328
329#ifdef DEBUG 314#ifdef DEBUG
315int xfs_locked_n;
316int xfs_small_retries;
317int xfs_middle_retries;
318int xfs_lots_retries;
319int xfs_lock_delays;
320#endif
321
330/* 322/*
331 * Make sure that the extents in the given memory buffer 323 * Bump the subclass so xfs_lock_inodes() acquires each lock with
332 * are valid. 324 * a different value
333 */ 325 */
334STATIC void 326static inline int
335xfs_validate_extents( 327xfs_lock_inumorder(int lock_mode, int subclass)
336 xfs_ifork_t *ifp,
337 int nrecs,
338 xfs_exntfmt_t fmt)
339{ 328{
340 xfs_bmbt_irec_t irec; 329 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
341 xfs_bmbt_rec_host_t rec; 330 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
342 int i; 331 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
332 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
343 333
344 for (i = 0; i < nrecs; i++) { 334 return lock_mode;
345 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
346 rec.l0 = get_unaligned(&ep->l0);
347 rec.l1 = get_unaligned(&ep->l1);
348 xfs_bmbt_get_all(&rec, &irec);
349 if (fmt == XFS_EXTFMT_NOSTATE)
350 ASSERT(irec.br_state == XFS_EXT_NORM);
351 }
352} 335}
353#else /* DEBUG */
354#define xfs_validate_extents(ifp, nrecs, fmt)
355#endif /* DEBUG */
356 336
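
xfs_lock_inumorder() above shifts the array position into the lockdep subclass bits of the iolock and/or ilock, so that code locking several inodes in ascending i_ino order (as xfs_lock_inodes() below does) presents each lock to lockdep in a distinct subclass. Illustrative loop:

	for (i = 0; i < inodes; i++)
		xfs_ilock(ips[i], xfs_lock_inumorder(XFS_ILOCK_EXCL, i));
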
357/* 337/*
358 * Check that none of the inode's in the buffer have a next 338 * The following routine will lock n inodes in exclusive mode.
359 * unlinked field of 0. 339 * We assume the caller calls us with the inodes in i_ino order.
340 *
341 * We need to detect deadlock where an inode that we lock
342 * is in the AIL and we start waiting for another inode that is locked
343 * by a thread in a long running transaction (such as truncate). This can
344 * result in deadlock since the long running trans might need to wait
345 * for the inode we just locked in order to push the tail and free space
346 * in the log.
360 */ 347 */
361#if defined(DEBUG)
362void 348void
363xfs_inobp_check( 349xfs_lock_inodes(
364 xfs_mount_t *mp, 350 xfs_inode_t **ips,
365 xfs_buf_t *bp) 351 int inodes,
352 uint lock_mode)
366{ 353{
367 int i; 354 int attempts = 0, i, j, try_lock;
368 int j; 355 xfs_log_item_t *lp;
369 xfs_dinode_t *dip;
370 356
371 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; 357 ASSERT(ips && (inodes >= 2)); /* we need at least two */
372 358
373 for (i = 0; i < j; i++) { 359 try_lock = 0;
374 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 360 i = 0;
375 i * mp->m_sb.sb_inodesize);
376 if (!dip->di_next_unlinked) {
377 xfs_alert(mp,
378 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
379 bp);
380 ASSERT(dip->di_next_unlinked);
381 }
382 }
383}
384#endif
385 361
386static void 362again:
387xfs_inode_buf_verify( 363 for (; i < inodes; i++) {
388 struct xfs_buf *bp) 364 ASSERT(ips[i]);
389{
390 struct xfs_mount *mp = bp->b_target->bt_mount;
391 int i;
392 int ni;
393
394 /*
395 * Validate the magic number and version of every inode in the buffer
396 */
397 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
398 for (i = 0; i < ni; i++) {
399 int di_ok;
400 xfs_dinode_t *dip;
401
402 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
403 (i << mp->m_sb.sb_inodelog));
404 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
405 XFS_DINODE_GOOD_VERSION(dip->di_version);
406 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
407 XFS_ERRTAG_ITOBP_INOTOBP,
408 XFS_RANDOM_ITOBP_INOTOBP))) {
409 xfs_buf_ioerror(bp, EFSCORRUPTED);
410 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
411 mp, dip);
412#ifdef DEBUG
413 xfs_emerg(mp,
414 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
415 (unsigned long long)bp->b_bn, i,
416 be16_to_cpu(dip->di_magic));
417 ASSERT(0);
418#endif
419 }
420 }
421 xfs_inobp_check(mp, bp);
422}
423
424
425static void
426xfs_inode_buf_read_verify(
427 struct xfs_buf *bp)
428{
429 xfs_inode_buf_verify(bp);
430}
431
432static void
433xfs_inode_buf_write_verify(
434 struct xfs_buf *bp)
435{
436 xfs_inode_buf_verify(bp);
437}
438
439const struct xfs_buf_ops xfs_inode_buf_ops = {
440 .verify_read = xfs_inode_buf_read_verify,
441 .verify_write = xfs_inode_buf_write_verify,
442};
443 365
366 if (i && (ips[i] == ips[i-1])) /* Already locked */
367 continue;
444 368
445/* 369 /*
446 * This routine is called to map an inode to the buffer containing the on-disk 370 * If try_lock is not set yet, make sure all locked inodes
447 * version of the inode. It returns a pointer to the buffer containing the 371 * are not in the AIL.
448 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a 372 * If any are, set try_lock to be used later.
449 * pointer to the on-disk inode within that buffer. 373 */
450 *
451 * If a non-zero error is returned, then the contents of bpp and dipp are
452 * undefined.
453 */
454int
455xfs_imap_to_bp(
456 struct xfs_mount *mp,
457 struct xfs_trans *tp,
458 struct xfs_imap *imap,
459 struct xfs_dinode **dipp,
460 struct xfs_buf **bpp,
461 uint buf_flags,
462 uint iget_flags)
463{
464 struct xfs_buf *bp;
465 int error;
466 374
467 buf_flags |= XBF_UNMAPPED; 375 if (!try_lock) {
468 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 376 for (j = (i - 1); j >= 0 && !try_lock; j--) {
469 (int)imap->im_len, buf_flags, &bp, 377 lp = (xfs_log_item_t *)ips[j]->i_itemp;
470 &xfs_inode_buf_ops); 378 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
471 if (error) { 379 try_lock++;
472 if (error == EAGAIN) { 380 }
473 ASSERT(buf_flags & XBF_TRYLOCK); 381 }
474 return error;
475 } 382 }
476 383
477 if (error == EFSCORRUPTED && 384 /*
478 (iget_flags & XFS_IGET_UNTRUSTED)) 385 * If any of the previous locks we have locked is in the AIL,
479 return XFS_ERROR(EINVAL); 386 * we must TRY to get the second and subsequent locks. If
480 387 * we can't get any, we must release all we have
481 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", 388 * and try again.
482 __func__, error); 389 */
483 return error;
484 }
485
486 *bpp = bp;
487 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
488 return 0;
489}
490
491/*
492 * Move inode type and inode format specific information from the
493 * on-disk inode to the in-core inode. For fifos, devs, and sockets
494 * this means set if_rdev to the proper value. For files, directories,
495 * and symlinks this means to bring in the in-line data or extent
496 * pointers. For a file in B-tree format, only the root is immediately
497 * brought in-core. The rest will be in-lined in if_extents when it
498 * is first referenced (see xfs_iread_extents()).
499 */
500STATIC int
501xfs_iformat(
502 xfs_inode_t *ip,
503 xfs_dinode_t *dip)
504{
505 xfs_attr_shortform_t *atp;
506 int size;
507 int error = 0;
508 xfs_fsize_t di_size;
509
510 if (unlikely(be32_to_cpu(dip->di_nextents) +
511 be16_to_cpu(dip->di_anextents) >
512 be64_to_cpu(dip->di_nblocks))) {
513 xfs_warn(ip->i_mount,
514 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
515 (unsigned long long)ip->i_ino,
516 (int)(be32_to_cpu(dip->di_nextents) +
517 be16_to_cpu(dip->di_anextents)),
518 (unsigned long long)
519 be64_to_cpu(dip->di_nblocks));
520 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
521 ip->i_mount, dip);
522 return XFS_ERROR(EFSCORRUPTED);
523 }
524
525 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
526 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
527 (unsigned long long)ip->i_ino,
528 dip->di_forkoff);
529 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
530 ip->i_mount, dip);
531 return XFS_ERROR(EFSCORRUPTED);
532 }
533
534 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
535 !ip->i_mount->m_rtdev_targp)) {
536 xfs_warn(ip->i_mount,
537 "corrupt dinode %Lu, has realtime flag set.",
538 ip->i_ino);
539 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
540 XFS_ERRLEVEL_LOW, ip->i_mount, dip);
541 return XFS_ERROR(EFSCORRUPTED);
542 }
543
544 switch (ip->i_d.di_mode & S_IFMT) {
545 case S_IFIFO:
546 case S_IFCHR:
547 case S_IFBLK:
548 case S_IFSOCK:
549 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
550 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
551 ip->i_mount, dip);
552 return XFS_ERROR(EFSCORRUPTED);
553 }
554 ip->i_d.di_size = 0;
555 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
556 break;
557
558	case S_IFREG:
559	case S_IFLNK:
560	case S_IFDIR:
561		switch (dip->di_format) {
562		case XFS_DINODE_FMT_LOCAL:
563			/*
564			 * no local regular files yet
65			 */
566			if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
567				xfs_warn(ip->i_mount,
568			"corrupt inode %Lu (local format for regular file).",
569					(unsigned long long) ip->i_ino);
570				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
571						     XFS_ERRLEVEL_LOW,
572						     ip->i_mount, dip);
573				return XFS_ERROR(EFSCORRUPTED);
574			}
575
576			di_size = be64_to_cpu(dip->di_size);
577			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
578				xfs_warn(ip->i_mount,
579			"corrupt inode %Lu (bad size %Ld for local inode).",
580					(unsigned long long) ip->i_ino,
581					(long long) di_size);
582				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
583						     XFS_ERRLEVEL_LOW,
584						     ip->i_mount, dip);
585				return XFS_ERROR(EFSCORRUPTED);
586			}
587
588			size = (int)di_size;
589			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
590			break;
591		case XFS_DINODE_FMT_EXTENTS:
592			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
593			break;
594		case XFS_DINODE_FMT_BTREE:
595			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
596			break;
597		default:
598			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
599					 ip->i_mount);
600			return XFS_ERROR(EFSCORRUPTED);
601		}
602		break;
603
604	default:
605		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
606		return XFS_ERROR(EFSCORRUPTED);
607	}
608	if (error) {
609		return error;
610	}

390
391		if (try_lock) {
392			/* try_lock must be 0 if i is 0. */
393			/*
394			 * try_lock means we have an inode locked
395			 * that is in the AIL.
396			 */
397			ASSERT(i != 0);
398			if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
399				attempts++;
400
401				/*
402				 * Unlock all previous guys and try again.
403				 * xfs_iunlock will try to push the tail
404				 * if the inode is in the AIL.
405				 */
406
407				for(j = i - 1; j >= 0; j--) {
408
409					/*
410					 * Check to see if we've already
411					 * unlocked this one.
412					 * Not the first one going back,
413					 * and the inode ptr is the same.
414					 */
415					if ((j != (i - 1)) && ips[j] ==
416								ips[j+1])
417						continue;
418
419					xfs_iunlock(ips[j], lock_mode);
420				}
421
422				if ((attempts % 5) == 0) {
423					delay(1); /* Don't just spin the CPU */
424	#ifdef DEBUG
425					xfs_lock_delays++;
426	#endif
427				}
428				i = 0;
429				try_lock = 0;
430				goto again;
431			}
432		} else {
433			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
434		}
435	}
611 if (!XFS_DFORK_Q(dip))
612 return 0;
613
614 ASSERT(ip->i_afp == NULL);
615 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
616
617 switch (dip->di_aformat) {
618 case XFS_DINODE_FMT_LOCAL:
619 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
620 size = be16_to_cpu(atp->hdr.totsize);
621
622 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
623 xfs_warn(ip->i_mount,
624 "corrupt inode %Lu (bad attr fork size %Ld).",
625 (unsigned long long) ip->i_ino,
626 (long long) size);
627 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
628 XFS_ERRLEVEL_LOW,
629 ip->i_mount, dip);
630 return XFS_ERROR(EFSCORRUPTED);
631 }
632
633		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
634		break;
635	case XFS_DINODE_FMT_EXTENTS:
636		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
637		break;
638	case XFS_DINODE_FMT_BTREE:
639		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
640		break;
641	default:
642		error = XFS_ERROR(EFSCORRUPTED);
643		break;
644	}
645	if (error) {
646		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
647		ip->i_afp = NULL;
648		xfs_idestroy_fork(ip, XFS_DATA_FORK);
649	}
650	return error;
651}

436
437	#ifdef DEBUG
438	if (attempts) {
439		if (attempts < 5) xfs_small_retries++;
440		else if (attempts < 100) xfs_middle_retries++;
441		else xfs_lots_retries++;
442	} else {
443		xfs_locked_n++;
444	}
445	#endif
446	}
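The retry loop on the new side is a standard ordered-trylock pattern: take the locks in a global order (here, inode number), trylock each subsequent one, and on contention drop everything and restart with a small backoff. A minimal user-space sketch of the same idea, using POSIX mutexes and caller-sorted address order in place of inode order (all names here are illustrative, not part of XFS):

    #include <pthread.h>
    #include <sched.h>

    /*
     * Lock a set of mutexes; the caller passes the array already sorted
     * (the user-space analogue of sorting by inode number).  Trylock
     * everything after the first, and on contention unlock all previous
     * ones and start over.
     */
    static void lock_all(pthread_mutex_t **m, int n)
    {
            int i, j;

    again:
            pthread_mutex_lock(m[0]);
            for (i = 1; i < n; i++) {
                    if (pthread_mutex_trylock(m[i]) != 0) {
                            /* Unlock all previous guys and try again. */
                            for (j = i - 1; j >= 0; j--)
                                    pthread_mutex_unlock(m[j]);
                            sched_yield();  /* don't just spin the CPU */
                            goto again;
                    }
            }
    }

Because every caller acquires in the same global order, the trylock-and-restart path cannot deadlock; it can only retry.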
652
653	/*
654	 * The file is in-lined in the on-disk inode.
655	 * If it fits into if_inline_data, then copy
656	 * it there, otherwise allocate a buffer for it
657	 * and copy the data there.  Either way, set
658	 * if_data to point at the data.
659	 * If we allocate a buffer for the data, make
660	 * sure that its size is a multiple of 4 and
661	 * record the real size in i_real_bytes.
662	 */
663	STATIC int
664	xfs_iformat_local(
665		xfs_inode_t	*ip,
666		xfs_dinode_t	*dip,
667		int		whichfork,
668		int		size)
669	{
670		xfs_ifork_t	*ifp;
671		int		real_size;
672

447
448	/*
449	 * xfs_lock_two_inodes() can only be used to lock one type of lock
450	 * at a time - the iolock or the ilock, but not both at once. If
451	 * we lock both at once, lockdep will report false positives saying
452	 * we have violated locking orders.
453	 */
454	void
455	xfs_lock_two_inodes(
456		xfs_inode_t	*ip0,
457		xfs_inode_t	*ip1,
458		uint		lock_mode)
459	{
460		xfs_inode_t	*temp;
461		int		attempts = 0;
462		xfs_log_item_t	*lp;
673 /*
674 * If the size is unreasonable, then something
675 * is wrong and we just bail out rather than crash in
676 * kmem_alloc() or memcpy() below.
677 */
678 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
679 xfs_warn(ip->i_mount,
680 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
681 (unsigned long long) ip->i_ino, size,
682 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
683 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
684 ip->i_mount, dip);
685 return XFS_ERROR(EFSCORRUPTED);
686 }
687 ifp = XFS_IFORK_PTR(ip, whichfork);
688 real_size = 0;
689 if (size == 0)
690 ifp->if_u1.if_data = NULL;
691 else if (size <= sizeof(ifp->if_u2.if_inline_data))
692 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
693 else {
694 real_size = roundup(size, 4);
695 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
696 }
697 ifp->if_bytes = size;
698 ifp->if_real_bytes = real_size;
699 if (size)
700 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
701 ifp->if_flags &= ~XFS_IFEXTENTS;
702 ifp->if_flags |= XFS_IFINLINE;
703 return 0;
704}
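The storage decision above, small data lives in the inode's inline buffer while anything larger gets a heap allocation rounded up to a 4-byte multiple so it can be logged on word boundaries, is easy to see in isolation. A hedged sketch of the same decision with made-up types (not the kernel structures):

    #include <stdlib.h>
    #include <string.h>

    struct fork {
            char    inline_buf[32]; /* stand-in for if_inline_data */
            char    *data;          /* points at inline_buf or heap */
            int     bytes;          /* valid bytes */
            int     real_bytes;     /* heap allocation size, 0 if inline */
    };

    static int fork_set_local(struct fork *f, const void *src, int size)
    {
            f->real_bytes = 0;
            if (size == 0)
                    f->data = NULL;
            else if (size <= (int)sizeof(f->inline_buf))
                    f->data = f->inline_buf;
            else {
                    /* round up to a 4-byte multiple, as the kernel does */
                    f->real_bytes = (size + 3) & ~3;
                    f->data = malloc(f->real_bytes);
                    if (!f->data)
                            return -1;
            }
            f->bytes = size;
            if (size)
                    memcpy(f->data, src, size);
            return 0;
    }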
463
464	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
465		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
466	ASSERT(ip0->i_ino != ip1->i_ino);

705
706	/*
707	 * The file consists of a set of extents all
708	 * of which fit into the on-disk inode.
709	 * If there are few enough extents to fit into
710	 * the if_inline_ext, then copy them there.
711	 * Otherwise allocate a buffer for them and copy
712	 * them into it.  Either way, set if_extents
713	 * to point at the extents.
714	 */
715STATIC int
716xfs_iformat_extents(
717 xfs_inode_t *ip,
718 xfs_dinode_t *dip,
719 int whichfork)
720{
721 xfs_bmbt_rec_t *dp;
722 xfs_ifork_t *ifp;
723 int nex;
724 int size;
725 int i;
726
727 ifp = XFS_IFORK_PTR(ip, whichfork);
728 nex = XFS_DFORK_NEXTENTS(dip, whichfork);
729 size = nex * (uint)sizeof(xfs_bmbt_rec_t);
730
731 /*
732 * If the number of extents is unreasonable, then something
733 * is wrong and we just bail out rather than crash in
734 * kmem_alloc() or memcpy() below.
735 */
736 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
737 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
738 (unsigned long long) ip->i_ino, nex);
739 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
740 ip->i_mount, dip);
741 return XFS_ERROR(EFSCORRUPTED);
742 }
743
744 ifp->if_real_bytes = 0;
745 if (nex == 0)
746 ifp->if_u1.if_extents = NULL;
747 else if (nex <= XFS_INLINE_EXTS)
748 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
749 else
750 xfs_iext_add(ifp, 0, nex);
751
752 ifp->if_bytes = size;
753 if (size) {
754 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
755 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
756 for (i = 0; i < nex; i++, dp++) {
757 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
758 ep->l0 = get_unaligned_be64(&dp->l0);
759 ep->l1 = get_unaligned_be64(&dp->l1);
760 }
761 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
762 if (whichfork != XFS_DATA_FORK ||
763 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
764 if (unlikely(xfs_check_nostate_extents(
765 ifp, 0, nex))) {
766 XFS_ERROR_REPORT("xfs_iformat_extents(2)",
767 XFS_ERRLEVEL_LOW,
768 ip->i_mount);
769 return XFS_ERROR(EFSCORRUPTED);
770 }
771 }
772 ifp->if_flags |= XFS_IFEXTENTS;
773 return 0;
774}
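The copy loop above deliberately avoids a wholesale memcpy(): each 128-bit on-disk extent record is pulled in as two unaligned big-endian 64-bit words. A user-space approximation of the kernel's get_unaligned_be64() helper, assuming nothing about the alignment of the source pointer:

    #include <stdint.h>
    #include <string.h>

    /* Read a big-endian 64-bit value from a possibly unaligned pointer. */
    static uint64_t read_be64(const void *p)
    {
            unsigned char b[8];
            uint64_t v = 0;
            int i;

            memcpy(b, p, 8);        /* safe for any alignment */
            for (i = 0; i < 8; i++)
                    v = (v << 8) | b[i];
            return v;
    }

    /*
     * Decode one 16-byte on-disk record into two host-endian words,
     * mirroring the ep->l0/ep->l1 assignments above.
     */
    static void decode_rec(const unsigned char *disk, uint64_t *l0, uint64_t *l1)
    {
            *l0 = read_be64(disk);
            *l1 = read_be64(disk + 8);
    }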
467
468	if (ip0->i_ino > ip1->i_ino) {
469		temp = ip0;
470		ip0 = ip1;
471		ip1 = temp;
472	}

775
776	/*
777	 * The file has too many extents to fit into
778	 * the inode, so they are in B-tree format.
779	 * Allocate a buffer for the root of the B-tree
780	 * and copy the root into it.  The i_extents
781 * field will remain NULL until all of the
782 * extents are read in (when they are needed).
783 */
784STATIC int
785xfs_iformat_btree(
786 xfs_inode_t *ip,
787 xfs_dinode_t *dip,
788 int whichfork)
789{
790 struct xfs_mount *mp = ip->i_mount;
791 xfs_bmdr_block_t *dfp;
792 xfs_ifork_t *ifp;
793 /* REFERENCED */
794 int nrecs;
795 int size;
796
797 ifp = XFS_IFORK_PTR(ip, whichfork);
798 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
799 size = XFS_BMAP_BROOT_SPACE(mp, dfp);
800 nrecs = be16_to_cpu(dfp->bb_numrecs);
801
802 /*
803 * blow out if -- fork has less extents than can fit in
804 * fork (fork shouldn't be a btree format), root btree
805 * block has more records than can fit into the fork,
806 * or the number of extents is greater than the number of
807 * blocks.
808 */
809 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
810 XFS_IFORK_MAXEXT(ip, whichfork) ||
811 XFS_BMDR_SPACE_CALC(nrecs) >
812 XFS_DFORK_SIZE(dip, mp, whichfork) ||
813 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
814 xfs_warn(mp, "corrupt inode %Lu (btree).",
815 (unsigned long long) ip->i_ino);
816 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
817 mp, dip);
818 return XFS_ERROR(EFSCORRUPTED);
819 }
820
821 ifp->if_broot_bytes = size;
822 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
823 ASSERT(ifp->if_broot != NULL);
824 /*
825 * Copy and convert from the on-disk structure
826 * to the in-memory structure.
827 */
828 xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
829 ifp->if_broot, size);
830 ifp->if_flags &= ~XFS_IFEXTENTS;
831 ifp->if_flags |= XFS_IFBROOT;
832
833	return 0;
834}
835
836STATIC void
837xfs_dinode_from_disk(
838	xfs_icdinode_t	*to,
839	xfs_dinode_t	*from)
840{
841	to->di_magic = be16_to_cpu(from->di_magic);
842	to->di_mode = be16_to_cpu(from->di_mode);
843	to->di_version = from->di_version;
844	to->di_format = from->di_format;
845	to->di_onlink = be16_to_cpu(from->di_onlink);
846	to->di_uid = be32_to_cpu(from->di_uid);
847	to->di_gid = be32_to_cpu(from->di_gid);
848	to->di_nlink = be32_to_cpu(from->di_nlink);
849	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
850	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
851	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
852	to->di_flushiter = be16_to_cpu(from->di_flushiter);
853	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
854	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
855	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
856	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
857	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
858	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
859	to->di_size = be64_to_cpu(from->di_size);
860	to->di_nblocks = be64_to_cpu(from->di_nblocks);
861	to->di_extsize = be32_to_cpu(from->di_extsize);
862	to->di_nextents = be32_to_cpu(from->di_nextents);
863	to->di_anextents = be16_to_cpu(from->di_anextents);
864	to->di_forkoff = from->di_forkoff;
865	to->di_aformat = from->di_aformat;
866	to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
867	to->di_dmstate = be16_to_cpu(from->di_dmstate);
868	to->di_flags = be16_to_cpu(from->di_flags);
869	to->di_gen = be32_to_cpu(from->di_gen);
870
871	if (to->di_version == 3) {
872		to->di_changecount = be64_to_cpu(from->di_changecount);
873		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
874		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
875		to->di_flags2 = be64_to_cpu(from->di_flags2);
876		to->di_ino = be64_to_cpu(from->di_ino);
877		to->di_lsn = be64_to_cpu(from->di_lsn);
878		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
879		uuid_copy(&to->di_uuid, &from->di_uuid);
880	}
881}
882

473
474 again:
475	xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
476
477	/*
478	 * If the first lock we have locked is in the AIL, we must TRY to get
479	 * the second lock. If we can't get it, we must release the first one
480	 * and try again.
481	 */
482	lp = (xfs_log_item_t *)ip0->i_itemp;
483	if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
484		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
485			xfs_iunlock(ip0, lock_mode);
486			if ((++attempts % 5) == 0)
487				delay(1); /* Don't just spin the CPU */
488			goto again;
489		}
490	} else {
491		xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
492	}
493}
494
495
496void
497__xfs_iflock(
498	struct xfs_inode	*ip)
499{
500	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
501	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
502
503	do {
504		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
505		if (xfs_isiflocked(ip))
506			io_schedule();
507	} while (!xfs_iflock_nowait(ip));
508
509	finish_wait(wq, &wait.wait);
510}
511

883void
884xfs_dinode_to_disk(
885	xfs_dinode_t	*to,
886	xfs_icdinode_t	*from)
887{
888	to->di_magic = cpu_to_be16(from->di_magic);
889	to->di_mode = cpu_to_be16(from->di_mode);
890	to->di_version = from->di_version;
891	to->di_format = from->di_format;
892	to->di_onlink = cpu_to_be16(from->di_onlink);
893	to->di_uid = cpu_to_be32(from->di_uid);
894	to->di_gid = cpu_to_be32(from->di_gid);
895	to->di_nlink = cpu_to_be32(from->di_nlink);
896	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
897	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
898	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
899	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
900	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
901	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
902	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
903	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
904	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
905	to->di_size = cpu_to_be64(from->di_size);
906	to->di_nblocks = cpu_to_be64(from->di_nblocks);
907	to->di_extsize = cpu_to_be32(from->di_extsize);
908	to->di_nextents = cpu_to_be32(from->di_nextents);
909	to->di_anextents = cpu_to_be16(from->di_anextents);
910	to->di_forkoff = from->di_forkoff;
911	to->di_aformat = from->di_aformat;
912	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
913	to->di_dmstate = cpu_to_be16(from->di_dmstate);
914	to->di_flags = cpu_to_be16(from->di_flags);
915	to->di_gen = cpu_to_be32(from->di_gen);
916
917	if (from->di_version == 3) {
918		to->di_changecount = cpu_to_be64(from->di_changecount);
919		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
920		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
921		to->di_flags2 = cpu_to_be64(from->di_flags2);
922		to->di_ino = cpu_to_be64(from->di_ino);
923		to->di_lsn = cpu_to_be64(from->di_lsn);
924		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
925		uuid_copy(&to->di_uuid, &from->di_uuid);
926		to->di_flushiter = 0;
927	} else {
928		to->di_flushiter = cpu_to_be16(from->di_flushiter);
929	}
930}
931
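__xfs_iflock() above is the usual wait-for-bit loop: register as a waiter, re-check the flag, sleep while it is held, and retry the trylock on wakeup so exactly one waiter wins. Outside the kernel the same shape can be sketched with C11 atomics, using a yield loop as a much-simplified stand-in for the waitqueue and io_schedule() machinery:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <sched.h>

    /* Try to take a flag-style lock: set the bit iff it was clear. */
    static bool flag_trylock(atomic_uint *flags, unsigned int bit)
    {
            unsigned int old = atomic_fetch_or(flags, bit);
            return (old & bit) == 0;
    }

    /* Block (here: yield) until we win the bit, like __xfs_iflock(). */
    static void flag_lock(atomic_uint *flags, unsigned int bit)
    {
            do {
                    while (atomic_load(flags) & bit)
                            sched_yield();  /* the kernel would io_schedule() */
            } while (!flag_trylock(flags, bit));
    }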
932 512	STATIC uint
@@ -987,235 +567,50 @@ xfs_dic2xflags(
987 567		(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
988 568	}
989 569
990static bool
991xfs_dinode_verify(
992 struct xfs_mount *mp,
993 struct xfs_inode *ip,
994 struct xfs_dinode *dip)
995{
996 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
997 return false;
998
999 /* only version 3 or greater inodes are extensively verified here */
1000 if (dip->di_version < 3)
1001 return true;
1002
1003 if (!xfs_sb_version_hascrc(&mp->m_sb))
1004 return false;
1005 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
1006 offsetof(struct xfs_dinode, di_crc)))
1007 return false;
1008 if (be64_to_cpu(dip->di_ino) != ip->i_ino)
1009 return false;
1010 if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
1011 return false;
1012 return true;
1013}
1014
1015void
1016xfs_dinode_calc_crc(
1017 struct xfs_mount *mp,
1018 struct xfs_dinode *dip)
1019{
1020 __uint32_t crc;
1021
1022 if (dip->di_version < 3)
1023 return;
1024
1025 ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
1026 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
1027 offsetof(struct xfs_dinode, di_crc));
1028 dip->di_crc = xfs_end_cksum(crc);
1029}
1030
1031/*
1032 * Read the disk inode attributes into the in-core inode structure.
1033 *
1034 * For version 5 superblocks, if we are initialising a new inode and we are not
1035 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
1036 * inode core with a random generation number. If we are keeping inodes around,
1037 * we need to read the inode cluster to get the existing generation number off
1038 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
1039 * format) then log recovery is dependent on the di_flushiter field being
1040 * initialised from the current on-disk value and hence we must also read the
1041 * inode off disk.
1042 */
1043int
1044xfs_iread(
1045	xfs_mount_t	*mp,
1046	xfs_trans_t	*tp,
1047	xfs_inode_t	*ip,
1048	uint		iget_flags)
1049{
1050	xfs_buf_t	*bp;
1051	xfs_dinode_t	*dip;
1052	int		error;
1053
1054	/*
1055	 * Fill in the location information in the in-core inode.
1056	 */
1057	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
1058	if (error)
1059		return error;
1060
1061	/* shortcut IO on inode allocation if possible */
1062	if ((iget_flags & XFS_IGET_CREATE) &&
1063	    xfs_sb_version_hascrc(&mp->m_sb) &&
1064	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1065		/* initialise the on-disk inode core */
1066		memset(&ip->i_d, 0, sizeof(ip->i_d));
1067		ip->i_d.di_magic = XFS_DINODE_MAGIC;
1068		ip->i_d.di_gen = prandom_u32();
1069		if (xfs_sb_version_hascrc(&mp->m_sb)) {
1070			ip->i_d.di_version = 3;
1071			ip->i_d.di_ino = ip->i_ino;
1072			uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
1073		} else
1074			ip->i_d.di_version = 2;
1075		return 0;
1076	}
1077
1078	/*
1079	 * Get pointers to the on-disk inode and the buffer containing it.
1080	 */
1081	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
1082	if (error)
1083		return error;
1084
1085	/* even unallocated inodes are verified */
1086	if (!xfs_dinode_verify(mp, ip, dip)) {
1087		xfs_alert(mp, "%s: validation failed for inode %lld",
1088			  __func__, ip->i_ino);
1089
1090		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
1091		error = XFS_ERROR(EFSCORRUPTED);
1092		goto out_brelse;
1093	}
1094
1095	/*
1096	 * If the on-disk inode is already linked to a directory
1097	 * entry, copy all of the inode into the in-core inode.
1098	 * xfs_iformat() handles copying in the inode format
1099	 * specific information.
1100	 * Otherwise, just get the truly permanent information.
1101	 */
1102	if (dip->di_mode) {
1103		xfs_dinode_from_disk(&ip->i_d, dip);
1104		error = xfs_iformat(ip, dip);
1105		if (error) {
1106#ifdef DEBUG
1107			xfs_alert(mp, "%s: xfs_iformat() returned error %d",
1108				  __func__, error);
1109#endif /* DEBUG */
1110			goto out_brelse;
1111		}
1112	} else {
1113		/*
1114		 * Partial initialisation of the in-core inode. Just the bits
1115		 * that xfs_ialloc won't overwrite or relies on being correct.
1116		 */
1117		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
1118		ip->i_d.di_version = dip->di_version;
1119		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
1120		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
1121
1122		if (dip->di_version == 3) {
1123			ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
1124			uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
1125		}
1126
1127		/*
1128		 * Make sure to pull in the mode here as well in
1129		 * case the inode is released without being used.
1130		 * This ensures that xfs_inactive() will see that
1131		 * the inode is already free and not try to mess
1132		 * with the uninitialized part of it.
1133		 */
1134		ip->i_d.di_mode = 0;
1135	}
1136
1137	/*
1138	 * The inode format changed when we moved the link count and
1139	 * made it 32 bits long.  If this is an old format inode,
1140	 * convert it in memory to look like a new one.  If it gets
1141	 * flushed to disk we will convert back before flushing or
1142	 * logging it.  We zero out the new projid field and the old link
1143	 * count field.  We'll handle clearing the pad field (the remains
1144	 * of the old uuid field) when we actually convert the inode to
1145	 * the new format. We don't change the version number so that we
1146	 * can distinguish this from a real new format inode.
1147	 */
1148	if (ip->i_d.di_version == 1) {
1149		ip->i_d.di_nlink = ip->i_d.di_onlink;
1150		ip->i_d.di_onlink = 0;
1151		xfs_set_projid(ip, 0);
1152	}
1153
1154	ip->i_delayed_blks = 0;
1155
1156	/*
1157	 * Mark the buffer containing the inode as something to keep
1158	 * around for a while.  This helps to keep recently accessed
1159	 * meta-data in-core longer.
1160	 */
1161	xfs_buf_set_ref(bp, XFS_INO_REF);
1162
1163	/*
1164	 * Use xfs_trans_brelse() to release the buffer containing the on-disk
1165	 * inode, because it was acquired with xfs_trans_read_buf() in
1166	 * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
1167	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
1168	 * will only release the buffer if it is not dirty within the
1169	 * transaction.  It will be OK to release the buffer in this case,
1170	 * because inodes on disk are never destroyed and we will be locking the
1171	 * new in-core inode before putting it in the cache where other
1172	 * processes can find it.  Thus we don't have to worry about the inode
1173	 * being changed just because we released the buffer.
1174	 */
1175 out_brelse:
1176	xfs_trans_brelse(tp, bp);
1177	return error;
1178}

570/*
571 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
572 * is allowed, otherwise it has to be an exact match. If a CI match is found,
573 * ci_name->name will point to the actual name (caller must free) or
574 * will be set to NULL if an exact match is found.
575 */
576int
577xfs_lookup(
578	xfs_inode_t		*dp,
579	struct xfs_name		*name,
580	xfs_inode_t		**ipp,
581	struct xfs_name		*ci_name)
582{
583	xfs_ino_t		inum;
584	int			error;
585	uint			lock_mode;
586
587	trace_xfs_lookup(dp, name);
588
589	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
590		return XFS_ERROR(EIO);
591
592	lock_mode = xfs_ilock_map_shared(dp);
593	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
594	xfs_iunlock_map_shared(dp, lock_mode);
595
596	if (error)
597		goto out;
598
599	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
600	if (error)
601		goto out_free_name;
602
603	return 0;
604
605 out_free_name:
606	if (ci_name)
607		kmem_free(ci_name->name);
608 out:
609	*ipp = NULL;
610	return error;
611}
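The removed xfs_dinode_verify()/xfs_dinode_calc_crc() pair above shows the standard self-describing-metadata check: magic number first, then a checksum computed with the checksum field itself excluded from the coverage, then owner identity (inode number and filesystem UUID). A toy version of the checksum-excluding-itself trick, with a hypothetical header layout and a plain multiplicative hash standing in for the kernel's crc32c helpers (endianness conversions are omitted for brevity):

    #include <stdint.h>
    #include <string.h>
    #include <stddef.h>
    #include <stdbool.h>

    struct disk_hdr {               /* hypothetical on-disk header */
            uint32_t        magic;
            uint32_t        crc;    /* covers the block with crc skipped */
            uint64_t        owner;
    };

    #define HDR_MAGIC       0x58465321u

    static uint32_t toy_cksum(const unsigned char *p, size_t len, size_t skip_off)
    {
            uint32_t sum = 0;
            size_t i;

            for (i = 0; i < len; i++) {
                    /* skip the 4 bytes of the crc field itself */
                    if (i >= skip_off && i < skip_off + 4)
                            continue;
                    sum = sum * 31 + p[i];
            }
            return sum;
    }

    static bool hdr_verify(const void *buf, size_t len, uint64_t owner)
    {
            struct disk_hdr hdr;

            memcpy(&hdr, buf, sizeof(hdr));
            if (hdr.magic != HDR_MAGIC)
                    return false;
            if (hdr.crc != toy_cksum(buf, len, offsetof(struct disk_hdr, crc)))
                    return false;
            return hdr.owner == owner;      /* right block, right owner */
    }

Checking magic before the checksum keeps the cheap test first; checking the owner after the checksum catches blocks that are internally valid but belong to a different object.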
1179
1180/*
1181 * Read in extents from a btree-format inode.
1182 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
1183 */
1184int
1185xfs_iread_extents(
1186	xfs_trans_t	*tp,
1187	xfs_inode_t	*ip,
1188	int		whichfork)
1189{
1190	int		error;
1191	xfs_ifork_t	*ifp;
1192	xfs_extnum_t	nextents;
1193
1194	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
1195		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
1196				 ip->i_mount);
1197		return XFS_ERROR(EFSCORRUPTED);
1198	}
1199	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
1200	ifp = XFS_IFORK_PTR(ip, whichfork);
1201
1202	/*
1203	 * We know that the size is valid (it's checked in iformat_btree)
1204	 */
1205	ifp->if_bytes = ifp->if_real_bytes = 0;
1206	ifp->if_flags |= XFS_IFEXTENTS;
1207	xfs_iext_add(ifp, 0, nextents);
1208	error = xfs_bmap_read_extents(tp, ip, whichfork);
1209	if (error) {
1210		xfs_iext_destroy(ifp);
1211		ifp->if_flags &= ~XFS_IFEXTENTS;
1212		return error;
1213	}
1214	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
1215	return 0;
1216}
1217

612
1218 613	/*
1219 614	 * Allocate an inode on disk and return a copy of its in-core version.
1220 615	 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
1221 616	 * appropriately within the inode.  The uid and gid for the inode are
@@ -1295,8 +690,8 @@ xfs_ialloc(
1295 690		ip->i_d.di_onlink = 0;
1296 691		ip->i_d.di_nlink = nlink;
1297 692		ASSERT(ip->i_d.di_nlink == nlink);
1298		ip->i_d.di_uid = current_fsuid();
1299		ip->i_d.di_gid = current_fsgid();
693		ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
694		ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
1300 695		xfs_set_projid(ip, prid);
1301 696		memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1302 697
@@ -1335,7 +730,7 @@ xfs_ialloc(
1335 730	 */
1336 731	if ((irix_sgid_inherit) &&
1337 732	    (ip->i_d.di_mode & S_ISGID) &&
1338	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
733	    (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
1339 734		ip->i_d.di_mode &= ~S_ISGID;
1340 735	}
1341 736
@@ -1467,6 +862,583 @@ xfs_ialloc(
1467 862	}
1468 863
1469 864	/*
865 * Allocates a new inode from disk and returns a pointer to the
866 * incore copy. This routine will internally commit the current
867 * transaction and allocate a new one if the Space Manager needed
868 * to do an allocation to replenish the inode free-list.
869 *
870 * This routine is designed to be called from xfs_create and
871 * xfs_create_dir.
872 *
873 */
874int
875xfs_dir_ialloc(
876 xfs_trans_t **tpp, /* input: current transaction;
877 output: may be a new transaction. */
878	xfs_inode_t	*dp,		/* directory within which to
879					   allocate the inode. */
880 umode_t mode,
881 xfs_nlink_t nlink,
882 xfs_dev_t rdev,
883 prid_t prid, /* project id */
884 int okalloc, /* ok to allocate new space */
885 xfs_inode_t **ipp, /* pointer to inode; it will be
886 locked. */
887 int *committed)
888
889{
890 xfs_trans_t *tp;
891 xfs_trans_t *ntp;
892 xfs_inode_t *ip;
893 xfs_buf_t *ialloc_context = NULL;
894 int code;
895 void *dqinfo;
896 uint tflags;
897
898 tp = *tpp;
899 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
900
901 /*
902 * xfs_ialloc will return a pointer to an incore inode if
903 * the Space Manager has an available inode on the free
904 * list. Otherwise, it will do an allocation and replenish
905 * the freelist. Since we can only do one allocation per
906 * transaction without deadlocks, we will need to commit the
907 * current transaction and start a new one. We will then
908 * need to call xfs_ialloc again to get the inode.
909 *
910 * If xfs_ialloc did an allocation to replenish the freelist,
911 * it returns the bp containing the head of the freelist as
912 * ialloc_context. We will hold a lock on it across the
913 * transaction commit so that no other process can steal
914 * the inode(s) that we've just allocated.
915 */
916 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
917 &ialloc_context, &ip);
918
919 /*
920 * Return an error if we were unable to allocate a new inode.
921 * This should only happen if we run out of space on disk or
922 * encounter a disk error.
923 */
924 if (code) {
925 *ipp = NULL;
926 return code;
927 }
928 if (!ialloc_context && !ip) {
929 *ipp = NULL;
930 return XFS_ERROR(ENOSPC);
931 }
932
933 /*
934 * If the AGI buffer is non-NULL, then we were unable to get an
935 * inode in one operation. We need to commit the current
936 * transaction and call xfs_ialloc() again. It is guaranteed
937 * to succeed the second time.
938 */
939 if (ialloc_context) {
940 struct xfs_trans_res tres;
941
942 /*
943 * Normally, xfs_trans_commit releases all the locks.
944 * We call bhold to hang on to the ialloc_context across
945 * the commit. Holding this buffer prevents any other
946 * processes from doing any allocations in this
947 * allocation group.
948 */
949 xfs_trans_bhold(tp, ialloc_context);
950 /*
951 * Save the log reservation so we can use
952 * them in the next transaction.
953 */
954 tres.tr_logres = xfs_trans_get_log_res(tp);
955 tres.tr_logcount = xfs_trans_get_log_count(tp);
956
957 /*
958 * We want the quota changes to be associated with the next
959 * transaction, NOT this one. So, detach the dqinfo from this
960 * and attach it to the next transaction.
961 */
962 dqinfo = NULL;
963 tflags = 0;
964 if (tp->t_dqinfo) {
965 dqinfo = (void *)tp->t_dqinfo;
966 tp->t_dqinfo = NULL;
967 tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
968 tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
969 }
970
971 ntp = xfs_trans_dup(tp);
972 code = xfs_trans_commit(tp, 0);
973 tp = ntp;
974 if (committed != NULL) {
975 *committed = 1;
976 }
977 /*
978 * If we get an error during the commit processing,
979 * release the buffer that is still held and return
980 * to the caller.
981 */
982 if (code) {
983 xfs_buf_relse(ialloc_context);
984 if (dqinfo) {
985 tp->t_dqinfo = dqinfo;
986 xfs_trans_free_dqinfo(tp);
987 }
988 *tpp = ntp;
989 *ipp = NULL;
990 return code;
991 }
992
993 /*
994 * transaction commit worked ok so we can drop the extra ticket
995 * reference that we gained in xfs_trans_dup()
996 */
997 xfs_log_ticket_put(tp->t_ticket);
998 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
999 code = xfs_trans_reserve(tp, &tres, 0, 0);
1000
1001 /*
1002 * Re-attach the quota info that we detached from prev trx.
1003 */
1004 if (dqinfo) {
1005 tp->t_dqinfo = dqinfo;
1006 tp->t_flags |= tflags;
1007 }
1008
1009 if (code) {
1010 xfs_buf_relse(ialloc_context);
1011 *tpp = ntp;
1012 *ipp = NULL;
1013 return code;
1014 }
1015 xfs_trans_bjoin(tp, ialloc_context);
1016
1017 /*
1018 * Call ialloc again. Since we've locked out all
1019 * other allocations in this allocation group,
1020 * this call should always succeed.
1021 */
1022 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
1023 okalloc, &ialloc_context, &ip);
1024
1025 /*
1026 * If we get an error at this point, return to the caller
1027 * so that the current transaction can be aborted.
1028 */
1029 if (code) {
1030 *tpp = tp;
1031 *ipp = NULL;
1032 return code;
1033 }
1034 ASSERT(!ialloc_context && ip);
1035
1036 } else {
1037 if (committed != NULL)
1038 *committed = 0;
1039 }
1040
1041 *ipp = ip;
1042 *tpp = tp;
1043
1044 return 0;
1045}
1046
1047/*
1048 * Decrement the link count on an inode & log the change.
1049 * If this causes the link count to go to zero, initiate the
1050 * logging activity required to truncate a file.
1051 */
1052int /* error */
1053xfs_droplink(
1054 xfs_trans_t *tp,
1055 xfs_inode_t *ip)
1056{
1057 int error;
1058
1059 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1060
1061 ASSERT (ip->i_d.di_nlink > 0);
1062 ip->i_d.di_nlink--;
1063 drop_nlink(VFS_I(ip));
1064 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1065
1066 error = 0;
1067 if (ip->i_d.di_nlink == 0) {
1068 /*
1069 * We're dropping the last link to this file.
1070 * Move the on-disk inode to the AGI unlinked list.
1071 * From xfs_inactive() we will pull the inode from
1072 * the list and free it.
1073 */
1074 error = xfs_iunlink(tp, ip);
1075 }
1076 return error;
1077}
1078
1079/*
1080 * This gets called when the inode's version needs to be changed from 1 to 2.
1081 * Currently this happens when the nlink field overflows the old 16-bit value
1082 * or when chproj is called to change the project for the first time.
1083 * As a side effect the superblock version will also get rev'd
1084 * to contain the NLINK bit.
1085 */
1086void
1087xfs_bump_ino_vers2(
1088 xfs_trans_t *tp,
1089 xfs_inode_t *ip)
1090{
1091 xfs_mount_t *mp;
1092
1093 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1094 ASSERT(ip->i_d.di_version == 1);
1095
1096 ip->i_d.di_version = 2;
1097 ip->i_d.di_onlink = 0;
1098 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1099 mp = tp->t_mountp;
1100 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
1101 spin_lock(&mp->m_sb_lock);
1102 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
1103 xfs_sb_version_addnlink(&mp->m_sb);
1104 spin_unlock(&mp->m_sb_lock);
1105 xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
1106 } else {
1107 spin_unlock(&mp->m_sb_lock);
1108 }
1109 }
1110 /* Caller must log the inode */
1111}
1112
1113/*
1114 * Increment the link count on an inode & log the change.
1115 */
1116int
1117xfs_bumplink(
1118 xfs_trans_t *tp,
1119 xfs_inode_t *ip)
1120{
1121 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1122
1123 ASSERT(ip->i_d.di_nlink > 0);
1124 ip->i_d.di_nlink++;
1125 inc_nlink(VFS_I(ip));
1126 if ((ip->i_d.di_version == 1) &&
1127 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
1128 /*
1129 * The inode has increased its number of links beyond
1130 * what can fit in an old format inode. It now needs
1131 * to be converted to a version 2 inode with a 32 bit
1132 * link count. If this is the first inode in the file
1133 * system to do this, then we need to bump the superblock
1134 * version number as well.
1135 */
1136 xfs_bump_ino_vers2(tp, ip);
1137 }
1138
1139 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1140 return 0;
1141}
1142
1143int
1144xfs_create(
1145 xfs_inode_t *dp,
1146 struct xfs_name *name,
1147 umode_t mode,
1148 xfs_dev_t rdev,
1149 xfs_inode_t **ipp)
1150{
1151 int is_dir = S_ISDIR(mode);
1152 struct xfs_mount *mp = dp->i_mount;
1153 struct xfs_inode *ip = NULL;
1154 struct xfs_trans *tp = NULL;
1155 int error;
1156 xfs_bmap_free_t free_list;
1157 xfs_fsblock_t first_block;
1158 bool unlock_dp_on_error = false;
1159 uint cancel_flags;
1160 int committed;
1161 prid_t prid;
1162 struct xfs_dquot *udqp = NULL;
1163 struct xfs_dquot *gdqp = NULL;
1164 struct xfs_dquot *pdqp = NULL;
1165 struct xfs_trans_res tres;
1166 uint resblks;
1167
1168 trace_xfs_create(dp, name);
1169
1170 if (XFS_FORCED_SHUTDOWN(mp))
1171 return XFS_ERROR(EIO);
1172
1173 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1174 prid = xfs_get_projid(dp);
1175 else
1176 prid = XFS_PROJID_DEFAULT;
1177
1178 /*
1179 * Make sure that we have allocated dquot(s) on disk.
1180 */
1181 error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1182 xfs_kgid_to_gid(current_fsgid()), prid,
1183 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1184 &udqp, &gdqp, &pdqp);
1185 if (error)
1186 return error;
1187
1188 if (is_dir) {
1189 rdev = 0;
1190 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
1191 tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
1192 tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
1193 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
1194 } else {
1195 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1196 tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
1197 tres.tr_logcount = XFS_CREATE_LOG_COUNT;
1198 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1199 }
1200
1201 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1202
1203 /*
1204 * Initially assume that the file does not exist and
1205 * reserve the resources for that case. If that is not
1206 * the case we'll drop the one we have and get a more
1207 * appropriate transaction later.
1208 */
1209 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
1210 error = xfs_trans_reserve(tp, &tres, resblks, 0);
1211 if (error == ENOSPC) {
1212 /* flush outstanding delalloc blocks and retry */
1213 xfs_flush_inodes(mp);
1214 error = xfs_trans_reserve(tp, &tres, resblks, 0);
1215 }
1216 if (error == ENOSPC) {
1217 /* No space at all so try a "no-allocation" reservation */
1218 resblks = 0;
1219 error = xfs_trans_reserve(tp, &tres, 0, 0);
1220 }
1221 if (error) {
1222 cancel_flags = 0;
1223 goto out_trans_cancel;
1224 }
1225
1226 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1227 unlock_dp_on_error = true;
1228
1229 xfs_bmap_init(&free_list, &first_block);
1230
1231 /*
1232 * Reserve disk quota and the inode.
1233 */
1234 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1235 pdqp, resblks, 1, 0);
1236 if (error)
1237 goto out_trans_cancel;
1238
1239 error = xfs_dir_canenter(tp, dp, name, resblks);
1240 if (error)
1241 goto out_trans_cancel;
1242
1243 /*
1244 * A newly created regular or special file just has one directory
1245	 * entry pointing to it, but a directory also has the "." entry
1246 * pointing to itself.
1247 */
1248 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
1249 prid, resblks > 0, &ip, &committed);
1250 if (error) {
1251 if (error == ENOSPC)
1252 goto out_trans_cancel;
1253 goto out_trans_abort;
1254 }
1255
1256 /*
1257 * Now we join the directory inode to the transaction. We do not do it
1258 * earlier because xfs_dir_ialloc might commit the previous transaction
1259 * (and release all the locks). An error from here on will result in
1260 * the transaction cancel unlocking dp so don't do it explicitly in the
1261 * error path.
1262 */
1263 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1264 unlock_dp_on_error = false;
1265
1266 error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1267 &first_block, &free_list, resblks ?
1268 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1269 if (error) {
1270 ASSERT(error != ENOSPC);
1271 goto out_trans_abort;
1272 }
1273 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1274 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1275
1276 if (is_dir) {
1277 error = xfs_dir_init(tp, ip, dp);
1278 if (error)
1279 goto out_bmap_cancel;
1280
1281 error = xfs_bumplink(tp, dp);
1282 if (error)
1283 goto out_bmap_cancel;
1284 }
1285
1286 /*
1287 * If this is a synchronous mount, make sure that the
1288 * create transaction goes to disk before returning to
1289 * the user.
1290 */
1291 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1292 xfs_trans_set_sync(tp);
1293
1294 /*
1295 * Attach the dquot(s) to the inodes and modify them incore.
1296 * These ids of the inode couldn't have changed since the new
1297 * inode has been locked ever since it was created.
1298 */
1299 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1300
1301 error = xfs_bmap_finish(&tp, &free_list, &committed);
1302 if (error)
1303 goto out_bmap_cancel;
1304
1305 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1306 if (error)
1307 goto out_release_inode;
1308
1309 xfs_qm_dqrele(udqp);
1310 xfs_qm_dqrele(gdqp);
1311 xfs_qm_dqrele(pdqp);
1312
1313 *ipp = ip;
1314 return 0;
1315
1316 out_bmap_cancel:
1317 xfs_bmap_cancel(&free_list);
1318 out_trans_abort:
1319 cancel_flags |= XFS_TRANS_ABORT;
1320 out_trans_cancel:
1321 xfs_trans_cancel(tp, cancel_flags);
1322 out_release_inode:
1323 /*
1324 * Wait until after the current transaction is aborted to
1325 * release the inode. This prevents recursive transactions
1326 * and deadlocks from xfs_inactive.
1327 */
1328 if (ip)
1329 IRELE(ip);
1330
1331 xfs_qm_dqrele(udqp);
1332 xfs_qm_dqrele(gdqp);
1333 xfs_qm_dqrele(pdqp);
1334
1335 if (unlock_dp_on_error)
1336 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1337 return error;
1338}
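xfs_create() above unwinds with the kernel's stacked goto-label idiom: each failure jumps to a label that releases exactly what has been acquired so far, and the labels fall through from the deepest cleanup to the shallowest. A generic sketch of the pattern outside the kernel (resource names are made up):

    #include <stdio.h>
    #include <stdlib.h>

    static int do_create(void)
    {
            FILE *log = NULL;
            char *buf = NULL;
            int error;

            log = fopen("/tmp/example.log", "w");
            if (!log) {
                    error = -1;
                    goto out;
            }

            buf = malloc(4096);
            if (!buf) {
                    error = -2;
                    goto out_close;
            }

            if (fputs("hello\n", log) == EOF) {
                    error = -3;
                    goto out_free;  /* frees buf, then falls into fclose */
            }

            free(buf);
            fclose(log);
            return 0;

    out_free:
            free(buf);
    out_close:
            fclose(log);
    out:
            return error;
    }

Because every acquisition site jumps past the cleanups for resources it never obtained, the single exit path stays correct no matter where the function fails.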
1339
1340int
1341xfs_link(
1342 xfs_inode_t *tdp,
1343 xfs_inode_t *sip,
1344 struct xfs_name *target_name)
1345{
1346 xfs_mount_t *mp = tdp->i_mount;
1347 xfs_trans_t *tp;
1348 int error;
1349 xfs_bmap_free_t free_list;
1350 xfs_fsblock_t first_block;
1351 int cancel_flags;
1352 int committed;
1353 int resblks;
1354
1355 trace_xfs_link(tdp, target_name);
1356
1357 ASSERT(!S_ISDIR(sip->i_d.di_mode));
1358
1359 if (XFS_FORCED_SHUTDOWN(mp))
1360 return XFS_ERROR(EIO);
1361
1362 error = xfs_qm_dqattach(sip, 0);
1363 if (error)
1364 goto std_return;
1365
1366 error = xfs_qm_dqattach(tdp, 0);
1367 if (error)
1368 goto std_return;
1369
1370 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1371 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1372 resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1373 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
1374 if (error == ENOSPC) {
1375 resblks = 0;
1376 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
1377 }
1378 if (error) {
1379 cancel_flags = 0;
1380 goto error_return;
1381 }
1382
1383 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1384
1385 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1386 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1387
1388 /*
1389 * If we are using project inheritance, we only allow hard link
1390 * creation in our tree when the project IDs are the same; else
1391 * the tree quota mechanism could be circumvented.
1392 */
1393 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1394 (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1395 error = XFS_ERROR(EXDEV);
1396 goto error_return;
1397 }
1398
1399 error = xfs_dir_canenter(tp, tdp, target_name, resblks);
1400 if (error)
1401 goto error_return;
1402
1403 xfs_bmap_init(&free_list, &first_block);
1404
1405 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1406 &first_block, &free_list, resblks);
1407 if (error)
1408 goto abort_return;
1409 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1410 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1411
1412 error = xfs_bumplink(tp, sip);
1413 if (error)
1414 goto abort_return;
1415
1416 /*
1417 * If this is a synchronous mount, make sure that the
1418 * link transaction goes to disk before returning to
1419 * the user.
1420 */
1421 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1422 xfs_trans_set_sync(tp);
1423 }
1424
1425 error = xfs_bmap_finish (&tp, &free_list, &committed);
1426 if (error) {
1427 xfs_bmap_cancel(&free_list);
1428 goto abort_return;
1429 }
1430
1431 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1432
1433 abort_return:
1434 cancel_flags |= XFS_TRANS_ABORT;
1435 error_return:
1436 xfs_trans_cancel(tp, cancel_flags);
1437 std_return:
1438 return error;
1439}
1440
1441	/*
1470 1442	 * Free up the underlying blocks past new_size.  The new size must be smaller
1471 1443	 * than the current size.  This routine can be used both for the attribute and
1472 1444	 * data fork, and does not modify the inode size, which is left to the caller.
@@ -1576,10 +1548,7 @@ xfs_itruncate_extents(
1576 1548	 * reference that we gained in xfs_trans_dup()
1577 1549	 */
1578 1550	xfs_log_ticket_put(tp->t_ticket);
1579	error = xfs_trans_reserve(tp, 0,
1580			XFS_ITRUNCATE_LOG_RES(mp), 0,
1581			XFS_TRANS_PERM_LOG_RES,
1582			XFS_ITRUNCATE_LOG_COUNT);
1551	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
1583 1552	if (error)
1584 1553		goto out;
1585 1554	}
@@ -1605,6 +1574,271 @@ out_bmap_cancel:
1605 1574		goto out;
1606 1575	}
1607 1576
1577int
1578xfs_release(
1579 xfs_inode_t *ip)
1580{
1581 xfs_mount_t *mp = ip->i_mount;
1582 int error;
1583
1584 if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
1585 return 0;
1586
1587 /* If this is a read-only mount, don't do this (would generate I/O) */
1588 if (mp->m_flags & XFS_MOUNT_RDONLY)
1589 return 0;
1590
1591 if (!XFS_FORCED_SHUTDOWN(mp)) {
1592 int truncated;
1593
1594 /*
1595 * If we are using filestreams, and we have an unlinked
1596 * file that we are processing the last close on, then nothing
1597 * will be able to reopen and write to this file. Purge this
1598 * inode from the filestreams cache so that it doesn't delay
1599 * teardown of the inode.
1600 */
1601 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1602 xfs_filestream_deassociate(ip);
1603
1604 /*
1605 * If we previously truncated this file and removed old data
1606 * in the process, we want to initiate "early" writeout on
1607 * the last close. This is an attempt to combat the notorious
1608 * NULL files problem which is particularly noticeable from a
1609 * truncate down, buffered (re-)write (delalloc), followed by
1610 * a crash. What we are effectively doing here is
1611 * significantly reducing the time window where we'd otherwise
1612 * be exposed to that problem.
1613 */
1614 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1615 if (truncated) {
1616 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1617 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
1618 error = -filemap_flush(VFS_I(ip)->i_mapping);
1619 if (error)
1620 return error;
1621 }
1622 }
1623 }
1624
1625 if (ip->i_d.di_nlink == 0)
1626 return 0;
1627
1628 if (xfs_can_free_eofblocks(ip, false)) {
1629
1630 /*
1631 * If we can't get the iolock just skip truncating the blocks
1632 * past EOF because we could deadlock with the mmap_sem
1633 * otherwise. We'll get another chance to drop them once the
1634 * last reference to the inode is dropped, so we'll never leak
1635 * blocks permanently.
1636 *
1637 * Further, check if the inode is being opened, written and
1638 * closed frequently and we have delayed allocation blocks
1639 * outstanding (e.g. streaming writes from the NFS server),
1640 * truncating the blocks past EOF will cause fragmentation to
1641 * occur.
1642 *
1643 * In this case don't do the truncation, either, but we have to
1644 * be careful how we detect this case. Blocks beyond EOF show
1645 * up as i_delayed_blks even when the inode is clean, so we
1646 * need to truncate them away first before checking for a dirty
1647 * release. Hence on the first dirty close we will still remove
1648 * the speculative allocation, but after that we will leave it
1649 * in place.
1650 */
1651 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1652 return 0;
1653
1654 error = xfs_free_eofblocks(mp, ip, true);
1655 if (error && error != EAGAIN)
1656 return error;
1657
1658 /* delalloc blocks after truncation means it really is dirty */
1659 if (ip->i_delayed_blks)
1660 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1661 }
1662 return 0;
1663}
1664
1665/*
1666 * xfs_inactive
1667 *
1668 * This is called when the vnode reference count for the vnode
1669 * goes to zero. If the file has been unlinked, then it must
1670 * now be truncated. Also, we clear all of the read-ahead state
1671 * kept for the inode here since the file is now closed.
1672 */
1673int
1674xfs_inactive(
1675 xfs_inode_t *ip)
1676{
1677 xfs_bmap_free_t free_list;
1678 xfs_fsblock_t first_block;
1679 int committed;
1680 struct xfs_trans *tp;
1681 struct xfs_mount *mp;
1682 struct xfs_trans_res *resp;
1683 int error;
1684 int truncate = 0;
1685
1686 /*
1687 * If the inode is already free, then there can be nothing
1688 * to clean up here.
1689 */
1690 if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
1691 ASSERT(ip->i_df.if_real_bytes == 0);
1692 ASSERT(ip->i_df.if_broot_bytes == 0);
1693 return VN_INACTIVE_CACHE;
1694 }
1695
1696 mp = ip->i_mount;
1697
1698 error = 0;
1699
1700 /* If this is a read-only mount, don't do this (would generate I/O) */
1701 if (mp->m_flags & XFS_MOUNT_RDONLY)
1702 goto out;
1703
1704 if (ip->i_d.di_nlink != 0) {
1705 /*
1706 * force is true because we are evicting an inode from the
1707 * cache. Post-eof blocks must be freed, lest we end up with
1708 * broken free space accounting.
1709 */
1710 if (xfs_can_free_eofblocks(ip, true)) {
1711 error = xfs_free_eofblocks(mp, ip, false);
1712 if (error)
1713 return VN_INACTIVE_CACHE;
1714 }
1715 goto out;
1716 }
1717
1718 if (S_ISREG(ip->i_d.di_mode) &&
1719 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1720 ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1721 truncate = 1;
1722
1723 error = xfs_qm_dqattach(ip, 0);
1724 if (error)
1725 return VN_INACTIVE_CACHE;
1726
1727 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1728 resp = (truncate || S_ISLNK(ip->i_d.di_mode)) ?
1729 &M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree;
1730
1731 error = xfs_trans_reserve(tp, resp, 0, 0);
1732 if (error) {
1733 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1734 xfs_trans_cancel(tp, 0);
1735 return VN_INACTIVE_CACHE;
1736 }
1737
1738 xfs_ilock(ip, XFS_ILOCK_EXCL);
1739 xfs_trans_ijoin(tp, ip, 0);
1740
1741 if (S_ISLNK(ip->i_d.di_mode)) {
1742 error = xfs_inactive_symlink(ip, &tp);
1743 if (error)
1744 goto out_cancel;
1745 } else if (truncate) {
1746 ip->i_d.di_size = 0;
1747 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1748
1749 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1750 if (error)
1751 goto out_cancel;
1752
1753 ASSERT(ip->i_d.di_nextents == 0);
1754 }
1755
1756 /*
1757 * If there are attributes associated with the file then blow them away
1758 * now. The code calls a routine that recursively deconstructs the
1759 * attribute fork. We need to just commit the current transaction
1760 * because we can't use it for xfs_attr_inactive().
1761 */
1762 if (ip->i_d.di_anextents > 0) {
1763 ASSERT(ip->i_d.di_forkoff != 0);
1764
1765 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1766 if (error)
1767 goto out_unlock;
1768
1769 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1770
1771 error = xfs_attr_inactive(ip);
1772 if (error)
1773 goto out;
1774
1775 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1776 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
1777 if (error) {
1778 xfs_trans_cancel(tp, 0);
1779 goto out;
1780 }
1781
1782 xfs_ilock(ip, XFS_ILOCK_EXCL);
1783 xfs_trans_ijoin(tp, ip, 0);
1784 }
1785
1786 if (ip->i_afp)
1787 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1788
1789 ASSERT(ip->i_d.di_anextents == 0);
1790
1791 /*
1792 * Free the inode.
1793 */
1794 xfs_bmap_init(&free_list, &first_block);
1795 error = xfs_ifree(tp, ip, &free_list);
1796 if (error) {
1797 /*
1798 * If we fail to free the inode, shut down. The cancel
1799 * might do that, we need to make sure. Otherwise the
1800 * inode might be lost for a long time or forever.
1801 */
1802 if (!XFS_FORCED_SHUTDOWN(mp)) {
1803 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1804 __func__, error);
1805 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1806 }
1807 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1808 } else {
1809 /*
1810 * Credit the quota account(s). The inode is gone.
1811 */
1812 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1813
1814 /*
1815 * Just ignore errors at this point. There is nothing we can
1816 * do except to try to keep going. Make sure it's not a silent
1817 * error.
1818 */
1819 error = xfs_bmap_finish(&tp, &free_list, &committed);
1820 if (error)
1821 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1822 __func__, error);
1823 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1824 if (error)
1825 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1826 __func__, error);
1827 }
1828
1829 /*
1830 * Release the dquots held by inode, if any.
1831 */
1832 xfs_qm_dqdetach(ip);
1833out_unlock:
1834 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1835out:
1836 return VN_INACTIVE_CACHE;
1837out_cancel:
1838 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1839 goto out_unlock;
1840}
1841
1608 1842	/*
1609 1843	 * This is called when the inode's link count goes to 0.
1610 1844	 * We place the on-disk inode on a list in the AGI.  It
@@ -1861,7 +2095,7 @@ xfs_iunlink_remove(
1861 2095	}
1862 2096
1863 2097	/*
1864	 * A big issue when freeing the inode cluster is is that we _cannot_ skip any
2098	 * A big issue when freeing the inode cluster is that we _cannot_ skip any
1865 2099	 * inodes that are in memory - they all must be marked stale and attached to
1866 2100	 * the cluster buffer.
1867 2101	 */
@@ -2094,272 +2328,6 @@ xfs_ifree(
2094 2328	}
2095 2329
2096 2330	/*
2097 * Reallocate the space for if_broot based on the number of records
2098 * being added or deleted as indicated in rec_diff. Move the records
2099 * and pointers in if_broot to fit the new size. When shrinking this
2100 * will eliminate holes between the records and pointers created by
2101 * the caller. When growing this will create holes to be filled in
2102 * by the caller.
2103 *
2104 * The caller must not request to add more records than would fit in
2105 * the on-disk inode root. If the if_broot is currently NULL, then
2106 * if we are adding records one will be allocated.  The caller must also
2107 * not request that the number of records go below zero, although
2108 * it can go to zero.
2109 *
2110 * ip -- the inode whose if_broot area is changing
2111 * ext_diff -- the change in the number of records, positive or negative,
2112 * requested for the if_broot array.
2113 */
2114void
2115xfs_iroot_realloc(
2116 xfs_inode_t *ip,
2117 int rec_diff,
2118 int whichfork)
2119{
2120 struct xfs_mount *mp = ip->i_mount;
2121 int cur_max;
2122 xfs_ifork_t *ifp;
2123 struct xfs_btree_block *new_broot;
2124 int new_max;
2125 size_t new_size;
2126 char *np;
2127 char *op;
2128
2129 /*
2130 * Handle the degenerate case quietly.
2131 */
2132 if (rec_diff == 0) {
2133 return;
2134 }
2135
2136 ifp = XFS_IFORK_PTR(ip, whichfork);
2137 if (rec_diff > 0) {
2138 /*
2139 * If there wasn't any memory allocated before, just
2140 * allocate it now and get out.
2141 */
2142 if (ifp->if_broot_bytes == 0) {
2143 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
2144 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2145 ifp->if_broot_bytes = (int)new_size;
2146 return;
2147 }
2148
2149 /*
2150 * If there is already an existing if_broot, then we need
2151 * to realloc() it and shift the pointers to their new
2152 * location. The records don't change location because
2153 * they are kept butted up against the btree block header.
2154 */
2155 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2156 new_max = cur_max + rec_diff;
2157 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
2158 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2159 XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
2160 KM_SLEEP | KM_NOFS);
2161 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2162 ifp->if_broot_bytes);
2163 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2164 (int)new_size);
2165 ifp->if_broot_bytes = (int)new_size;
2166 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2167 XFS_IFORK_SIZE(ip, whichfork));
2168 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2169 return;
2170 }
2171
2172 /*
2173 * rec_diff is less than 0. In this case, we are shrinking the
2174 * if_broot buffer. It must already exist. If we go to zero
2175 * records, just get rid of the root and clear the status bit.
2176 */
2177 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2178 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2179 new_max = cur_max + rec_diff;
2180 ASSERT(new_max >= 0);
2181 if (new_max > 0)
2182 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
2183 else
2184 new_size = 0;
2185 if (new_size > 0) {
2186 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2187 /*
2188 * First copy over the btree block header.
2189 */
2190 memcpy(new_broot, ifp->if_broot,
2191 XFS_BMBT_BLOCK_LEN(ip->i_mount));
2192 } else {
2193 new_broot = NULL;
2194 ifp->if_flags &= ~XFS_IFBROOT;
2195 }
2196
2197 /*
2198 * Only copy the records and pointers if there are any.
2199 */
2200 if (new_max > 0) {
2201 /*
2202 * First copy the records.
2203 */
2204 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
2205 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
2206 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2207
2208 /*
2209 * Then copy the pointers.
2210 */
2211 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2212 ifp->if_broot_bytes);
2213 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2214 (int)new_size);
2215 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2216 }
2217 kmem_free(ifp->if_broot);
2218 ifp->if_broot = new_broot;
2219 ifp->if_broot_bytes = (int)new_size;
2220 if (ifp->if_broot)
2221 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2222 XFS_IFORK_SIZE(ip, whichfork));
2223 return;
2224}
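xfs_iroot_realloc() above has to cope with a buffer holding two packed regions, records first and block pointers after them, so a plain realloc() is not enough: once the buffer grows, the pointer region must be slid out to its new offset before the hole between the regions can be used. The same two-region move in miniature (the layout and types are illustrative only, not the kernel's):

    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    /* Buffer layout: nrec fixed-size records, then nrec pointers. */
    struct root {
            unsigned char   *buf;
            int             nrec;
    };

    #define REC_SIZE        16
    #define PTR_SIZE        (int)sizeof(uint64_t)

    static int root_grow(struct root *r, int add)
    {
            int old_n = r->nrec, new_n = r->nrec + add;
            unsigned char *nb;

            nb = realloc(r->buf, (size_t)new_n * (REC_SIZE + PTR_SIZE));
            if (!nb)
                    return -1;
            r->buf = nb;

            /*
             * Records stay put; slide the pointer area from its old offset
             * (after old_n records) to its new offset (after new_n records).
             * The regions can overlap, hence memmove(), as in the kernel.
             */
            memmove(nb + (size_t)new_n * REC_SIZE,
                    nb + (size_t)old_n * REC_SIZE,
                    (size_t)old_n * PTR_SIZE);
            r->nrec = new_n;
            return 0;
    }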
2225
2226
2227/*
2228 * This is called when the amount of space needed for if_data
2229 * is increased or decreased. The change in size is indicated by
2230 * the number of bytes that need to be added or deleted in the
2231 * byte_diff parameter.
2232 *
2233 * If the amount of space needed has decreased below the size of the
2234 * inline buffer, then switch to using the inline buffer. Otherwise,
2235 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2236 * to what is needed.
2237 *
2238 * ip -- the inode whose if_data area is changing
2239 * byte_diff -- the change in the number of bytes, positive or negative,
2240 * requested for the if_data array.
2241 */
2242void
2243xfs_idata_realloc(
2244 xfs_inode_t *ip,
2245 int byte_diff,
2246 int whichfork)
2247{
2248 xfs_ifork_t *ifp;
2249 int new_size;
2250 int real_size;
2251
2252 if (byte_diff == 0) {
2253 return;
2254 }
2255
2256 ifp = XFS_IFORK_PTR(ip, whichfork);
2257 new_size = (int)ifp->if_bytes + byte_diff;
2258 ASSERT(new_size >= 0);
2259
2260 if (new_size == 0) {
2261 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2262 kmem_free(ifp->if_u1.if_data);
2263 }
2264 ifp->if_u1.if_data = NULL;
2265 real_size = 0;
2266 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2267 /*
2268 * If the valid extents/data can fit in if_inline_ext/data,
2269 * copy them from the malloc'd vector and free it.
2270 */
2271 if (ifp->if_u1.if_data == NULL) {
2272 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2273 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2274 ASSERT(ifp->if_real_bytes != 0);
2275 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2276 new_size);
2277 kmem_free(ifp->if_u1.if_data);
2278 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2279 }
2280 real_size = 0;
2281 } else {
2282 /*
2283 * Stuck with malloc/realloc.
2284 * For inline data, the underlying buffer must be
2285 * a multiple of 4 bytes in size so that it can be
2286 * logged and stay on word boundaries. We enforce
2287 * that here.
2288 */
2289 real_size = roundup(new_size, 4);
2290 if (ifp->if_u1.if_data == NULL) {
2291 ASSERT(ifp->if_real_bytes == 0);
2292 ifp->if_u1.if_data = kmem_alloc(real_size,
2293 KM_SLEEP | KM_NOFS);
2294 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2295 /*
2296 * Only do the realloc if the underlying size
2297 * is really changing.
2298 */
2299 if (ifp->if_real_bytes != real_size) {
2300 ifp->if_u1.if_data =
2301 kmem_realloc(ifp->if_u1.if_data,
2302 real_size,
2303 ifp->if_real_bytes,
2304 KM_SLEEP | KM_NOFS);
2305 }
2306 } else {
2307 ASSERT(ifp->if_real_bytes == 0);
2308 ifp->if_u1.if_data = kmem_alloc(real_size,
2309 KM_SLEEP | KM_NOFS);
2310 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2311 ifp->if_bytes);
2312 }
2313 }
2314 ifp->if_real_bytes = real_size;
2315 ifp->if_bytes = new_size;
2316 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2317}
2318
2319void
2320xfs_idestroy_fork(
2321 xfs_inode_t *ip,
2322 int whichfork)
2323{
2324 xfs_ifork_t *ifp;
2325
2326 ifp = XFS_IFORK_PTR(ip, whichfork);
2327 if (ifp->if_broot != NULL) {
2328 kmem_free(ifp->if_broot);
2329 ifp->if_broot = NULL;
2330 }
2331
2332 /*
2333 * If the format is local, then we can't have an extents
2334 * array so just look for an inline data array. If we're
2335 * not local then we may or may not have an extents list,
2336 * so check and free it up if we do.
2337 */
2338 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2339 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2340 (ifp->if_u1.if_data != NULL)) {
2341 ASSERT(ifp->if_real_bytes != 0);
2342 kmem_free(ifp->if_u1.if_data);
2343 ifp->if_u1.if_data = NULL;
2344 ifp->if_real_bytes = 0;
2345 }
2346 } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2347 ((ifp->if_flags & XFS_IFEXTIREC) ||
2348 ((ifp->if_u1.if_extents != NULL) &&
2349 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2350 ASSERT(ifp->if_real_bytes != 0);
2351 xfs_iext_destroy(ifp);
2352 }
2353 ASSERT(ifp->if_u1.if_extents == NULL ||
2354 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2355 ASSERT(ifp->if_real_bytes == 0);
2356 if (whichfork == XFS_ATTR_FORK) {
2357 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2358 ip->i_afp = NULL;
2359 }
2360}
2361
2362/*
2363 * This is called to unpin an inode. The caller must have the inode locked
2364 * in at least shared mode so that the buffer cannot be subsequently pinned
2365 * once someone is waiting for it to be unpinned.
@@ -2402,162 +2370,471 @@ xfs_iunpin_wait(
2402 __xfs_iunpin_wait(ip);
2403}
2404
2405/*
2406 * xfs_iextents_copy()
2407 *
2408 * This is called to copy the REAL extents (as opposed to the delayed
2409 * allocation extents) from the inode into the given buffer. It
2410 * returns the number of bytes copied into the buffer.
2411 *
2412 * If there are no delayed allocation extents, then we can just
2413 * memcpy() the extents into the buffer. Otherwise, we need to
2414 * examine each extent in turn and skip those which are delayed.
2415 */
2416int
2417xfs_iextents_copy(
2418 xfs_inode_t *ip,
2419 xfs_bmbt_rec_t *dp,
2420 int whichfork)
2421{
2422 int copied;
2423 int i;
2424 xfs_ifork_t *ifp;
2425 int nrecs;
2426 xfs_fsblock_t start_block;
2427
2428 ifp = XFS_IFORK_PTR(ip, whichfork);
2429 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2430 ASSERT(ifp->if_bytes > 0);
2431
2432 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2433 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2434 ASSERT(nrecs > 0);
2435
2436 /*
2437 * There are some delayed allocation extents in the
2438 * inode, so copy the extents one at a time and skip
2439 * the delayed ones. There must be at least one
2440 * non-delayed extent.
2441 */
2442 copied = 0;
2443 for (i = 0; i < nrecs; i++) {
2444 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2445 start_block = xfs_bmbt_get_startblock(ep);
2446 if (isnullstartblock(start_block)) {
2447 /*
2448 * It's a delayed allocation extent, so skip it.
2449 */
2450 continue;
2451 }
2452
2453 /* Translate to on disk format */
2454 put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
2455 put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
2456 dp++;
2457 copied++;
2458 }
2459 ASSERT(copied != 0);
2460 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
2461
2462 return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2463}
2373int
2374xfs_remove(
2375 xfs_inode_t *dp,
2376 struct xfs_name *name,
2377 xfs_inode_t *ip)
2378{
2379 xfs_mount_t *mp = dp->i_mount;
2380 xfs_trans_t *tp = NULL;
2381 int is_dir = S_ISDIR(ip->i_d.di_mode);
2382 int error = 0;
2383 xfs_bmap_free_t free_list;
2384 xfs_fsblock_t first_block;
2385 int cancel_flags;
2386 int committed;
2387 int link_zero;
2388 uint resblks;
2389 uint log_count;
2390
2391 trace_xfs_remove(dp, name);
2392
2393 if (XFS_FORCED_SHUTDOWN(mp))
2394 return XFS_ERROR(EIO);
2395
2396 error = xfs_qm_dqattach(dp, 0);
2397 if (error)
2398 goto std_return;
2399
2400 error = xfs_qm_dqattach(ip, 0);
2401 if (error)
2402 goto std_return;
2403
2404 if (is_dir) {
2405 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2406 log_count = XFS_DEFAULT_LOG_COUNT;
2407 } else {
2408 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2409 log_count = XFS_REMOVE_LOG_COUNT;
2410 }
2411 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2412
2413 /*
2414 * We try to get the real space reservation first,
2415 * allowing for directory btree deletion(s) implying
2416 * possible bmap insert(s). If we can't get the space
2417 * reservation then we use 0 instead, and avoid the bmap
2418 * btree insert(s) in the directory code by, if the bmap
2419 * insert tries to happen, instead trimming the LAST
2420 * block from the directory.
2421 */
2422 resblks = XFS_REMOVE_SPACE_RES(mp);
2423 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
2424 if (error == ENOSPC) {
2425 resblks = 0;
2426 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
2427 }
2428 if (error) {
2429 ASSERT(error != ENOSPC);
2430 cancel_flags = 0;
2431 goto out_trans_cancel;
2432 }
2433
2434 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
2435
2436 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2437 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2438
2439 /*
2440 * If we're removing a directory perform some additional validation.
2441 */
2442 if (is_dir) {
2443 ASSERT(ip->i_d.di_nlink >= 2);
2444 if (ip->i_d.di_nlink != 2) {
2445 error = XFS_ERROR(ENOTEMPTY);
2446 goto out_trans_cancel;
2447 }
2448 if (!xfs_dir_isempty(ip)) {
2449 error = XFS_ERROR(ENOTEMPTY);
2450 goto out_trans_cancel;
2451 }
2452 }
2453
2454 xfs_bmap_init(&free_list, &first_block);
2455 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2456 &first_block, &free_list, resblks);
2457 if (error) {
2458 ASSERT(error != ENOENT);
2459 goto out_bmap_cancel;
2460 }
2461 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2462
2463 if (is_dir) {
2464 /*
2465 * Drop the link from ip's "..".
2466 */
2467 error = xfs_droplink(tp, dp);
2468 if (error)
2469 goto out_bmap_cancel;
2470
2471 /*
2472 * Drop the "." link from ip to self.
2473 */
2474 error = xfs_droplink(tp, ip);
2475 if (error)
2476 goto out_bmap_cancel;
2477 } else {
2478 /*
2479 * When removing a non-directory we need to log the parent
2480 * inode here. For a directory this is done implicitly
2481 * by the xfs_droplink call for the ".." entry.
2482 */
2483 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2484 }
2485
2486 /*
2487 * Drop the link from dp to ip.
2488 */
2489 error = xfs_droplink(tp, ip);
2490 if (error)
2491 goto out_bmap_cancel;
2492
2493 /*
2494 * Determine if this is the last link while
2495 * we are in the transaction.
2496 */
2497 link_zero = (ip->i_d.di_nlink == 0);
2498
2499 /*
2500 * If this is a synchronous mount, make sure that the
2501 * remove transaction goes to disk before returning to
2502 * the user.
2503 */
2504 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2505 xfs_trans_set_sync(tp);
2506
2507 error = xfs_bmap_finish(&tp, &free_list, &committed);
2508 if (error)
2509 goto out_bmap_cancel;
2510
2511 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2512 if (error)
2513 goto std_return;
2514
2515 /*
2516 * If we are using filestreams, kill the stream association.
2517 * If the file is still open it may get a new one but that
2518 * will get killed on last close in xfs_close() so we don't
2519 * have to worry about that.
2520 */
2521 if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
2522 xfs_filestream_deassociate(ip);
2523
2524 return 0;
2525
2526 out_bmap_cancel:
2527 xfs_bmap_cancel(&free_list);
2528 cancel_flags |= XFS_TRANS_ABORT;
2529 out_trans_cancel:
2530 xfs_trans_cancel(tp, cancel_flags);
2531 std_return:
2532 return error;
2533}
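The reserve-then-retry step in xfs_remove() is a recurring XFS pattern: ask for the full worst-case block reservation, and on ENOSPC fall back to a zero reservation so the unlink can still proceed on a nearly full filesystem. A condensed, runnable sketch of just that control flow is below; try_reserve() is a toy stand-in for xfs_trans_reserve(), not the kernel API.

/* Sketch of the ENOSPC reservation fallback used by xfs_remove(). */
#include <errno.h>
#include <stdio.h>

/* Toy stand-in: pretend only 0-block reservations succeed. */
static int try_reserve(unsigned blocks)
{
    return blocks > 0 ? ENOSPC : 0;
}

static int reserve_with_fallback(unsigned worst_case, unsigned *resblks)
{
    int error;

    *resblks = worst_case;          /* ask for the full worst case first */
    error = try_reserve(*resblks);
    if (error == ENOSPC) {
        *resblks = 0;               /* retry without a block reservation;
                                     * callers then avoid bmap btree splits */
        error = try_reserve(0);
    }
    return error;                   /* any remaining error is fatal */
}

int main(void)
{
    unsigned resblks;
    int error = reserve_with_fallback(16, &resblks);

    printf("error=%d resblks=%u\n", error, resblks);  /* error=0 resblks=0 */
    return 0;
}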
2464
2465/*
2466 * Each of the following cases stores data into the same region
2467 * of the on-disk inode, so only one of them can be valid at
2468 * any given time. While it is possible to have conflicting formats
2469 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2470 * in EXTENTS format, this can only happen when the fork has
2471 * changed formats after being modified but before being flushed.
2472 * In these cases, the format always takes precedence, because the
2473 * format indicates the current state of the fork.
2474 */
2475/*ARGSUSED*/
2476STATIC void
2477xfs_iflush_fork(
2478 xfs_inode_t *ip,
2479 xfs_dinode_t *dip,
2480 xfs_inode_log_item_t *iip,
2481 int whichfork,
2482 xfs_buf_t *bp)
2483{
2484 char *cp;
2485 xfs_ifork_t *ifp;
2486 xfs_mount_t *mp;
2487 static const short brootflag[2] =
2488 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2489 static const short dataflag[2] =
2490 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2491 static const short extflag[2] =
2492 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2493
2494 if (!iip)
2495 return;
2496 ifp = XFS_IFORK_PTR(ip, whichfork);
2497 /*
2498 * This can happen if we gave up in iformat in an error path,
2499 * for the attribute fork.
2500 */
2501 if (!ifp) {
2502 ASSERT(whichfork == XFS_ATTR_FORK);
2503 return;
2504 }
2505 cp = XFS_DFORK_PTR(dip, whichfork);
2506 mp = ip->i_mount;
2507 switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2508 case XFS_DINODE_FMT_LOCAL:
2509 if ((iip->ili_fields & dataflag[whichfork]) &&
2510 (ifp->if_bytes > 0)) {
2511 ASSERT(ifp->if_u1.if_data != NULL);
2512 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2513 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2514 }
2515 break;
2516
2517 case XFS_DINODE_FMT_EXTENTS:
2518 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2519 !(iip->ili_fields & extflag[whichfork]));
2520 if ((iip->ili_fields & extflag[whichfork]) &&
2521 (ifp->if_bytes > 0)) {
2522 ASSERT(xfs_iext_get_ext(ifp, 0));
2523 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2524 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2525 whichfork);
2526 }
2527 break;
2528
2529 case XFS_DINODE_FMT_BTREE:
2530 if ((iip->ili_fields & brootflag[whichfork]) &&
2531 (ifp->if_broot_bytes > 0)) {
2532 ASSERT(ifp->if_broot != NULL);
2533 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2534 XFS_IFORK_SIZE(ip, whichfork));
2535 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2536 (xfs_bmdr_block_t *)cp,
2537 XFS_DFORK_SIZE(dip, mp, whichfork));
2538 }
2539 break;
2540
2541 case XFS_DINODE_FMT_DEV:
2542 if (iip->ili_fields & XFS_ILOG_DEV) {
2543 ASSERT(whichfork == XFS_DATA_FORK);
2544 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2545 }
2546 break;
2547
2548 case XFS_DINODE_FMT_UUID:
2549 if (iip->ili_fields & XFS_ILOG_UUID) {
2550 ASSERT(whichfork == XFS_DATA_FORK);
2551 memcpy(XFS_DFORK_DPTR(dip),
2552 &ip->i_df.if_u2.if_uuid,
2553 sizeof(uuid_t));
2554 }
2555 break;
2556
2557 default:
2558 ASSERT(0);
2559 break;
2560 }
2561}
2534
2535/*
2536 * Enter all inodes for a rename transaction into a sorted array.
2537 */
2538STATIC void
2539xfs_sort_for_rename(
2540 xfs_inode_t *dp1, /* in: old (source) directory inode */
2541 xfs_inode_t *dp2, /* in: new (target) directory inode */
2542 xfs_inode_t *ip1, /* in: inode of old entry */
2543 xfs_inode_t *ip2, /* in: inode of new entry, if it
2544 already exists, NULL otherwise. */
2545 xfs_inode_t **i_tab,/* out: array of inodes returned, sorted */
2546 int *num_inodes) /* out: number of inodes in array */
2547{
2548 xfs_inode_t *temp;
2549 int i, j;
2550
2551 /*
2552 * i_tab contains a list of pointers to inodes. We initialize
2553 * the table here & we'll sort it. We will then use it to
2554 * order the acquisition of the inode locks.
2555 *
2556 * Note that the table may contain duplicates. e.g., dp1 == dp2.
2557 */
2558 i_tab[0] = dp1;
2559 i_tab[1] = dp2;
2560 i_tab[2] = ip1;
2561 if (ip2) {
2562 *num_inodes = 4;
2563 i_tab[3] = ip2;
2564 } else {
2565 *num_inodes = 3;
2566 i_tab[3] = NULL;
2567 }
2568
2569 /*
2570 * Sort the elements via bubble sort. (Remember, there are at
2571 * most 4 elements to sort, so this is adequate.)
2572 */
2573 for (i = 0; i < *num_inodes; i++) {
2574 for (j = 1; j < *num_inodes; j++) {
2575 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2576 temp = i_tab[j];
2577 i_tab[j] = i_tab[j-1];
2578 i_tab[j-1] = temp;
2579 }
2580 }
2581 }
2582}
2583
2584/*
2585 * xfs_rename
2586 */
2587int
2588xfs_rename(
2589 xfs_inode_t *src_dp,
2590 struct xfs_name *src_name,
2591 xfs_inode_t *src_ip,
2592 xfs_inode_t *target_dp,
2593 struct xfs_name *target_name,
2594 xfs_inode_t *target_ip)
2595{
2596 xfs_trans_t *tp = NULL;
2597 xfs_mount_t *mp = src_dp->i_mount;
2598 int new_parent; /* moving to a new dir */
2599 int src_is_directory; /* src_name is a directory */
2600 int error;
2601 xfs_bmap_free_t free_list;
2602 xfs_fsblock_t first_block;
2603 int cancel_flags;
2604 int committed;
2605 xfs_inode_t *inodes[4];
2606 int spaceres;
2607 int num_inodes;
2608
2609 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2610
2611 new_parent = (src_dp != target_dp);
2612 src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
2613
2614 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
2615 inodes, &num_inodes);
2616
2617 xfs_bmap_init(&free_list, &first_block);
2618 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
2619 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2620 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2621 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
2622 if (error == ENOSPC) {
2623 spaceres = 0;
2624 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
2625 }
2626 if (error) {
2627 xfs_trans_cancel(tp, 0);
2628 goto std_return;
2629 }
2630
2631 /*
2632 * Attach the dquots to the inodes
2633 */
2634 error = xfs_qm_vop_rename_dqattach(inodes);
2635 if (error) {
2636 xfs_trans_cancel(tp, cancel_flags);
2637 goto std_return;
2638 }
2639
2640 /*
2641 * Lock all the participating inodes. Depending upon whether
2642 * the target_name exists in the target directory, and
2643 * whether the target directory is the same as the source
2644 * directory, we can lock from 2 to 4 inodes.
2645 */
2646 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2647
2648 /*
2649 * Join all the inodes to the transaction. From this point on,
2650 * we can rely on either trans_commit or trans_cancel to unlock
2651 * them.
2652 */
2653 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
2654 if (new_parent)
2655 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
2656 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2657 if (target_ip)
2658 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2659
2660 /*
2661 * If we are using project inheritance, we only allow renames
2662 * into our tree when the project IDs are the same; else the
2663 * tree quota mechanism would be circumvented.
2664 */
2665 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2666 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2667 error = XFS_ERROR(EXDEV);
2668 goto error_return;
2669 }
2670
2671 /*
2672 * Set up the target.
2673 */
2674 if (target_ip == NULL) {
2675 /*
2676 * If there's no space reservation, check the entry will
2677 * fit before actually inserting it.
2678 */
2679 error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
2680 if (error)
2681 goto error_return;
2682 /*
2683 * If target does not exist and the rename crosses
2684 * directories, adjust the target directory link count
2685 * to account for the ".." reference from the new entry.
2686 */
2687 error = xfs_dir_createname(tp, target_dp, target_name,
2688 src_ip->i_ino, &first_block,
2689 &free_list, spaceres);
2690 if (error == ENOSPC)
2691 goto error_return;
2692 if (error)
2693 goto abort_return;
2694
2695 xfs_trans_ichgtime(tp, target_dp,
2696 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2697
2698 if (new_parent && src_is_directory) {
2699 error = xfs_bumplink(tp, target_dp);
2700 if (error)
2701 goto abort_return;
2702 }
2703 } else { /* target_ip != NULL */
2704 /*
2705 * If target exists and it's a directory, check that both
2706 * target and source are directories and that target can be
2707 * destroyed, or that neither is a directory.
2708 */
2709 if (S_ISDIR(target_ip->i_d.di_mode)) {
2710 /*
2711 * Make sure target dir is empty.
2712 */
2713 if (!(xfs_dir_isempty(target_ip)) ||
2714 (target_ip->i_d.di_nlink > 2)) {
2715 error = XFS_ERROR(EEXIST);
2716 goto error_return;
2717 }
2718 }
2719
2720 /*
2721 * Link the source inode under the target name.
2722 * If the source inode is a directory and we are moving
2723 * it across directories, its ".." entry will be
2724 * inconsistent until we replace that down below.
2725 *
2726 * In case there is already an entry with the same
2727 * name at the destination directory, remove it first.
2728 */
2729 error = xfs_dir_replace(tp, target_dp, target_name,
2730 src_ip->i_ino,
2731 &first_block, &free_list, spaceres);
2732 if (error)
2733 goto abort_return;
2734
2735 xfs_trans_ichgtime(tp, target_dp,
2736 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2737
2738 /*
2739 * Decrement the link count on the target since the target
2740 * dir no longer points to it.
2741 */
2742 error = xfs_droplink(tp, target_ip);
2743 if (error)
2744 goto abort_return;
2745
2746 if (src_is_directory) {
2747 /*
2748 * Drop the link from the old "." entry.
2749 */
2750 error = xfs_droplink(tp, target_ip);
2751 if (error)
2752 goto abort_return;
2753 }
2754 } /* target_ip != NULL */
2755
2756 /*
2757 * Remove the source.
2758 */
2759 if (new_parent && src_is_directory) {
2760 /*
2761 * Rewrite the ".." entry to point to the new
2762 * directory.
2763 */
2764 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
2765 target_dp->i_ino,
2766 &first_block, &free_list, spaceres);
2767 ASSERT(error != EEXIST);
2768 if (error)
2769 goto abort_return;
2770 }
2771
2772 /*
2773 * We always want to hit the ctime on the source inode.
2774 *
2775 * This isn't strictly required by the standards since the source
2776 * inode isn't really being changed, but old unix file systems did
2777 * it and some incremental backup programs won't work without it.
2778 */
2779 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
2780 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
2781
2782 /*
2783 * Adjust the link count on src_dp. This is necessary when
2784 * renaming a directory, either within one parent when
2785 * the target existed, or across two parent directories.
2786 */
2787 if (src_is_directory && (new_parent || target_ip != NULL)) {
2788
2789 /*
2790 * Decrement link count on src_directory since the
2791 * entry that's moved no longer points to it.
2792 */
2793 error = xfs_droplink(tp, src_dp);
2794 if (error)
2795 goto abort_return;
2796 }
2797
2798 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
2799 &first_block, &free_list, spaceres);
2800 if (error)
2801 goto abort_return;
2802
2803 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2804 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
2805 if (new_parent)
2806 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
2807
2808 /*
2809 * If this is a synchronous mount, make sure that the
2810 * rename transaction goes to disk before returning to
2811 * the user.
2812 */
2813 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2814 xfs_trans_set_sync(tp);
2815 }
2816
2817 error = xfs_bmap_finish(&tp, &free_list, &committed);
2818 if (error) {
2819 xfs_bmap_cancel(&free_list);
2820 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
2821 XFS_TRANS_ABORT));
2822 goto std_return;
2823 }
2824
2825 /*
2826 * trans_commit will unlock src_ip, target_ip & decrement
2827 * the vnode references.
2828 */
2829 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2830
2831 abort_return:
2832 cancel_flags |= XFS_TRANS_ABORT;
2833 error_return:
2834 xfs_bmap_cancel(&free_list);
2835 xfs_trans_cancel(tp, cancel_flags);
2836 std_return:
2837 return error;
2838}
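xfs_sort_for_rename() exists purely to establish a global lock order: rename always acquires its two-to-four inode locks in ascending inode-number order, so two racing renames over the same inodes cannot deadlock. A standalone, runnable sketch of the same bubble sort over at most four entries is below; "struct inode_stub" and its ino field are illustrative stand-ins for the kernel's xfs_inode_t and i_ino.

/* Standalone sketch of the lock-ordering sort in xfs_sort_for_rename(). */
#include <stdio.h>

struct inode_stub { unsigned long long ino; };

static void sort_for_rename(struct inode_stub **tab, int n)
{
    /* bubble sort: at most 4 elements, so O(n^2) is adequate */
    for (int i = 0; i < n; i++)
        for (int j = 1; j < n; j++)
            if (tab[j]->ino < tab[j - 1]->ino) {
                struct inode_stub *t = tab[j];
                tab[j] = tab[j - 1];
                tab[j - 1] = t;
            }
}

int main(void)
{
    struct inode_stub a = {42}, b = {7}, c = {99}, d = {7};
    struct inode_stub *tab[4] = {&a, &b, &c, &d};

    sort_for_rename(tab, 4);        /* duplicates (same dir twice) are fine */
    for (int i = 0; i < 4; i++)
        printf("%llu\n", tab[i]->ino);  /* prints 7 7 42 99 */
    return 0;
}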
2562 2839
2563STATIC int 2840STATIC int
@@ -2816,7 +3093,6 @@ abort_out:
2816 return error;
2817}
2818
2819
2820STATIC int
2821xfs_iflush_int(
2822 struct xfs_inode *ip,
@@ -3004,1072 +3280,3 @@ xfs_iflush_int(
3004corrupt_out:
3005 return XFS_ERROR(EFSCORRUPTED);
3006}
3007
3008/*
3009 * Return a pointer to the extent record at file index idx.
3010 */
3011xfs_bmbt_rec_host_t *
3012xfs_iext_get_ext(
3013 xfs_ifork_t *ifp, /* inode fork pointer */
3014 xfs_extnum_t idx) /* index of target extent */
3015{
3016 ASSERT(idx >= 0);
3017 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3018
3019 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
3020 return ifp->if_u1.if_ext_irec->er_extbuf;
3021 } else if (ifp->if_flags & XFS_IFEXTIREC) {
3022 xfs_ext_irec_t *erp; /* irec pointer */
3023 int erp_idx = 0; /* irec index */
3024 xfs_extnum_t page_idx = idx; /* ext index in target list */
3025
3026 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3027 return &erp->er_extbuf[page_idx];
3028 } else if (ifp->if_bytes) {
3029 return &ifp->if_u1.if_extents[idx];
3030 } else {
3031 return NULL;
3032 }
3033}
3034
3035/*
3036 * Insert new item(s) into the extent records for incore inode
3037 * fork 'ifp'. 'count' new items are inserted at index 'idx'.
3038 */
3039void
3040xfs_iext_insert(
3041 xfs_inode_t *ip, /* incore inode pointer */
3042 xfs_extnum_t idx, /* starting index of new items */
3043 xfs_extnum_t count, /* number of inserted items */
3044 xfs_bmbt_irec_t *new, /* items to insert */
3045 int state) /* type of extent conversion */
3046{
3047 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3048 xfs_extnum_t i; /* extent record index */
3049
3050 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
3051
3052 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3053 xfs_iext_add(ifp, idx, count);
3054 for (i = idx; i < idx + count; i++, new++)
3055 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
3056}
3057
3058/*
3059 * This is called when the amount of space required for incore file
3060 * extents needs to be increased. The ext_diff parameter stores the
3061 * number of new extents being added and the idx parameter contains
3062 * the extent index where the new extents will be added. If the new
3063 * extents are being appended, then we just need to (re)allocate and
3064 * initialize the space. Otherwise, if the new extents are being
3065 * inserted into the middle of the existing entries, a bit more work
3066 * is required to make room for the new extents to be inserted. The
3067 * caller is responsible for filling in the new extent entries upon
3068 * return.
3069 */
3070void
3071xfs_iext_add(
3072 xfs_ifork_t *ifp, /* inode fork pointer */
3073 xfs_extnum_t idx, /* index to begin adding exts */
3074 int ext_diff) /* number of extents to add */
3075{
3076 int byte_diff; /* new bytes being added */
3077 int new_size; /* size of extents after adding */
3078 xfs_extnum_t nextents; /* number of extents in file */
3079
3080 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3081 ASSERT((idx >= 0) && (idx <= nextents));
3082 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
3083 new_size = ifp->if_bytes + byte_diff;
3084 /*
3085 * If the new number of extents (nextents + ext_diff)
3086 * fits inside the inode, then continue to use the inline
3087 * extent buffer.
3088 */
3089 if (nextents + ext_diff <= XFS_INLINE_EXTS) {
3090 if (idx < nextents) {
3091 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
3092 &ifp->if_u2.if_inline_ext[idx],
3093 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
3094 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
3095 }
3096 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3097 ifp->if_real_bytes = 0;
3098 }
3099 /*
3100 * Otherwise use a linear (direct) extent list.
3101 * If the extents are currently inside the inode,
3102 * xfs_iext_realloc_direct will switch us from
3103 * inline to direct extent allocation mode.
3104 */
3105 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
3106 xfs_iext_realloc_direct(ifp, new_size);
3107 if (idx < nextents) {
3108 memmove(&ifp->if_u1.if_extents[idx + ext_diff],
3109 &ifp->if_u1.if_extents[idx],
3110 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
3111 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
3112 }
3113 }
3114 /* Indirection array */
3115 else {
3116 xfs_ext_irec_t *erp;
3117 int erp_idx = 0;
3118 int page_idx = idx;
3119
3120 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
3121 if (ifp->if_flags & XFS_IFEXTIREC) {
3122 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
3123 } else {
3124 xfs_iext_irec_init(ifp);
3125 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3126 erp = ifp->if_u1.if_ext_irec;
3127 }
3128 /* Extents fit in target extent page */
3129 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
3130 if (page_idx < erp->er_extcount) {
3131 memmove(&erp->er_extbuf[page_idx + ext_diff],
3132 &erp->er_extbuf[page_idx],
3133 (erp->er_extcount - page_idx) *
3134 sizeof(xfs_bmbt_rec_t));
3135 memset(&erp->er_extbuf[page_idx], 0, byte_diff);
3136 }
3137 erp->er_extcount += ext_diff;
3138 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3139 }
3140 /* Insert a new extent page */
3141 else if (erp) {
3142 xfs_iext_add_indirect_multi(ifp,
3143 erp_idx, page_idx, ext_diff);
3144 }
3145 /*
3146 * If extent(s) are being appended to the last page in
3147 * the indirection array and the new extent(s) don't fit
3148 * in the page, then erp is NULL and erp_idx is set to
3149 * the next index needed in the indirection array.
3150 */
3151 else {
3152 int count = ext_diff;
3153
3154 while (count) {
3155 erp = xfs_iext_irec_new(ifp, erp_idx);
3156 erp->er_extcount = count;
3157 count -= MIN(count, (int)XFS_LINEAR_EXTS);
3158 if (count) {
3159 erp_idx++;
3160 }
3161 }
3162 }
3163 }
3164 ifp->if_bytes = new_size;
3165}
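The function above picks one of three in-core layouts based purely on the post-insert extent count: the inline buffer, a single direct buffer, or the indirection array. A runnable sketch of just that decision is below, using the published thresholds (2 inline records, 4 KiB of 16-byte direct records); the macro names mirror the kernel constants but the code is a simplification, not the kernel logic.

/* Sketch of the three-tier layout choice made by xfs_iext_add(). */
#include <stdio.h>

#define IEXT_BUFSZ   4096
#define REC_SIZE     16                         /* sizeof(xfs_bmbt_rec_t) */
#define INLINE_EXTS  2                          /* XFS_INLINE_EXTS */
#define LINEAR_EXTS  (IEXT_BUFSZ / REC_SIZE)    /* XFS_LINEAR_EXTS = 256 */

static const char *layout_for(int nextents)
{
    if (nextents <= INLINE_EXTS)
        return "inline buffer in the inode fork";
    if (nextents <= LINEAR_EXTS)
        return "single direct (linear) extent buffer";
    return "indirection array of 4 KiB extent pages";
}

int main(void)
{
    int probes[] = { 1, 2, 3, 256, 257, 10000 };

    for (unsigned i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
        printf("%5d extents -> %s\n", probes[i], layout_for(probes[i]));
    return 0;
}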
3166
3167/*
3168 * This is called when incore extents are being added to the indirection
3169 * array and the new extents do not fit in the target extent list. The
3170 * erp_idx parameter contains the irec index for the target extent list
3171 * in the indirection array, and the idx parameter contains the extent
3172 * index within the list. The number of extents being added is stored
3173 * in the count parameter.
3174 *
3175 * |-------| |-------|
3176 * | | | | idx - number of extents before idx
3177 * | idx | | count |
3178 * | | | | count - number of extents being inserted at idx
3179 * |-------| |-------|
3180 * | count | | nex2 | nex2 - number of extents after idx + count
3181 * |-------| |-------|
3182 */
3183void
3184xfs_iext_add_indirect_multi(
3185 xfs_ifork_t *ifp, /* inode fork pointer */
3186 int erp_idx, /* target extent irec index */
3187 xfs_extnum_t idx, /* index within target list */
3188 int count) /* new extents being added */
3189{
3190 int byte_diff; /* new bytes being added */
3191 xfs_ext_irec_t *erp; /* pointer to irec entry */
3192 xfs_extnum_t ext_diff; /* number of extents to add */
3193 xfs_extnum_t ext_cnt; /* new extents still needed */
3194 xfs_extnum_t nex2; /* extents after idx + count */
3195 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */
3196 int nlists; /* number of irec's (lists) */
3197
3198 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3199 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3200 nex2 = erp->er_extcount - idx;
3201 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3202
3203 /*
3204 * Save second part of target extent list
3205 * (all extents past idx). */
3206 if (nex2) {
3207 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3208 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3209 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3210 erp->er_extcount -= nex2;
3211 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3212 memset(&erp->er_extbuf[idx], 0, byte_diff);
3213 }
3214
3215 /*
3216 * Add the new extents to the end of the target
3217 * list, then allocate new irec record(s) and
3218 * extent buffer(s) as needed to store the rest
3219 * of the new extents.
3220 */
3221 ext_cnt = count;
3222 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3223 if (ext_diff) {
3224 erp->er_extcount += ext_diff;
3225 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3226 ext_cnt -= ext_diff;
3227 }
3228 while (ext_cnt) {
3229 erp_idx++;
3230 erp = xfs_iext_irec_new(ifp, erp_idx);
3231 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3232 erp->er_extcount = ext_diff;
3233 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3234 ext_cnt -= ext_diff;
3235 }
3236
3237 /* Add nex2 extents back to indirection array */
3238 if (nex2) {
3239 xfs_extnum_t ext_avail;
3240 int i;
3241
3242 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3243 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3244 i = 0;
3245 /*
3246 * If nex2 extents fit in the current page, append
3247 * nex2_ep after the new extents.
3248 */
3249 if (nex2 <= ext_avail) {
3250 i = erp->er_extcount;
3251 }
3252 /*
3253 * Otherwise, check if space is available in the
3254 * next page.
3255 */
3256 else if ((erp_idx < nlists - 1) &&
3257 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3258 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3259 erp_idx++;
3260 erp++;
3261 /* Create a hole for nex2 extents */
3262 memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3263 erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3264 }
3265 /*
3266 * Final choice, create a new extent page for
3267 * nex2 extents.
3268 */
3269 else {
3270 erp_idx++;
3271 erp = xfs_iext_irec_new(ifp, erp_idx);
3272 }
3273 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
3274 kmem_free(nex2_ep);
3275 erp->er_extcount += nex2;
3276 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
3277 }
3278}
3279
3280/*
3281 * This is called when the amount of space required for incore file
3282 * extents needs to be decreased. The ext_diff parameter stores the
3283 * number of extents to be removed and the idx parameter contains
3284 * the extent index where the extents will be removed from.
3285 *
3286 * If the amount of space needed has decreased below the linear
3287 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
3288 * extent array. Otherwise, use kmem_realloc() to adjust the
3289 * size to what is needed.
3290 */
3291void
3292xfs_iext_remove(
3293 xfs_inode_t *ip, /* incore inode pointer */
3294 xfs_extnum_t idx, /* index to begin removing exts */
3295 int ext_diff, /* number of extents to remove */
3296 int state) /* type of extent conversion */
3297{
3298 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3299 xfs_extnum_t nextents; /* number of extents in file */
3300 int new_size; /* size of extents after removal */
3301
3302 trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
3303
3304 ASSERT(ext_diff > 0);
3305 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3306 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
3307
3308 if (new_size == 0) {
3309 xfs_iext_destroy(ifp);
3310 } else if (ifp->if_flags & XFS_IFEXTIREC) {
3311 xfs_iext_remove_indirect(ifp, idx, ext_diff);
3312 } else if (ifp->if_real_bytes) {
3313 xfs_iext_remove_direct(ifp, idx, ext_diff);
3314 } else {
3315 xfs_iext_remove_inline(ifp, idx, ext_diff);
3316 }
3317 ifp->if_bytes = new_size;
3318}
3319
3320/*
3321 * This removes ext_diff extents from the inline buffer, beginning
3322 * at extent index idx.
3323 */
3324void
3325xfs_iext_remove_inline(
3326 xfs_ifork_t *ifp, /* inode fork pointer */
3327 xfs_extnum_t idx, /* index to begin removing exts */
3328 int ext_diff) /* number of extents to remove */
3329{
3330 int nextents; /* number of extents in file */
3331
3332 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3333 ASSERT(idx < XFS_INLINE_EXTS);
3334 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3335 ASSERT(((nextents - ext_diff) > 0) &&
3336 (nextents - ext_diff) < XFS_INLINE_EXTS);
3337
3338 if (idx + ext_diff < nextents) {
3339 memmove(&ifp->if_u2.if_inline_ext[idx],
3340 &ifp->if_u2.if_inline_ext[idx + ext_diff],
3341 (nextents - (idx + ext_diff)) *
3342 sizeof(xfs_bmbt_rec_t));
3343 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
3344 0, ext_diff * sizeof(xfs_bmbt_rec_t));
3345 } else {
3346 memset(&ifp->if_u2.if_inline_ext[idx], 0,
3347 ext_diff * sizeof(xfs_bmbt_rec_t));
3348 }
3349}
3350
3351/*
3352 * This removes ext_diff extents from a linear (direct) extent list,
3353 * beginning at extent index idx. If the extents are being removed
3354 * from the end of the list (ie. truncate) then we just need to re-
3355 * allocate the list to remove the extra space. Otherwise, if the
3356 * extents are being removed from the middle of the existing extent
3357 * entries, then we first need to move the extent records beginning
3358 * at idx + ext_diff up in the list to overwrite the records being
3359 * removed, then remove the extra space via kmem_realloc.
3360 */
3361void
3362xfs_iext_remove_direct(
3363 xfs_ifork_t *ifp, /* inode fork pointer */
3364 xfs_extnum_t idx, /* index to begin removing exts */
3365 int ext_diff) /* number of extents to remove */
3366{
3367 xfs_extnum_t nextents; /* number of extents in file */
3368 int new_size; /* size of extents after removal */
3369
3370 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3371 new_size = ifp->if_bytes -
3372 (ext_diff * sizeof(xfs_bmbt_rec_t));
3373 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3374
3375 if (new_size == 0) {
3376 xfs_iext_destroy(ifp);
3377 return;
3378 }
3379 /* Move extents up in the list (if needed) */
3380 if (idx + ext_diff < nextents) {
3381 memmove(&ifp->if_u1.if_extents[idx],
3382 &ifp->if_u1.if_extents[idx + ext_diff],
3383 (nextents - (idx + ext_diff)) *
3384 sizeof(xfs_bmbt_rec_t));
3385 }
3386 memset(&ifp->if_u1.if_extents[nextents - ext_diff],
3387 0, ext_diff * sizeof(xfs_bmbt_rec_t));
3388 /*
3389 * Reallocate the direct extent list. If the extents
3390 * will fit inside the inode then xfs_iext_realloc_direct
3391 * will switch from direct to inline extent allocation
3392 * mode for us.
3393 */
3394 xfs_iext_realloc_direct(ifp, new_size);
3395 ifp->if_bytes = new_size;
3396}
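The middle-removal step above is the classic shift-and-shrink idiom: memmove the tail of the array down over the removed records, zero the vacated slots, then reallocate smaller. A minimal sketch of that step alone, with "rec" as a simplified stand-in for xfs_bmbt_rec_t:

/* Sketch of the middle-removal step in xfs_iext_remove_direct(). */
#include <string.h>

typedef struct { unsigned long long l0, l1; } rec;

static void remove_range(rec *list, int nextents, int idx, int count)
{
    if (idx + count < nextents)             /* removing from the middle */
        memmove(&list[idx], &list[idx + count],
                (nextents - (idx + count)) * sizeof(rec));
    /* zero the now-unused tail slots, as the kernel code does */
    memset(&list[nextents - count], 0, count * sizeof(rec));
    /* the caller then shrinks the buffer to (nextents - count) records */
}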
3397
3398/*
3399 * This is called when incore extents are being removed from the
3400 * indirection array and the extents being removed span multiple extent
3401 * buffers. The idx parameter contains the file extent index where we
3402 * want to begin removing extents, and the count parameter contains
3403 * how many extents need to be removed.
3404 *
3405 * |-------| |-------|
3406 * | nex1 | | | nex1 - number of extents before idx
3407 * |-------| | count |
3408 * | | | | count - number of extents being removed at idx
3409 * | count | |-------|
3410 * | | | nex2 | nex2 - number of extents after idx + count
3411 * |-------| |-------|
3412 */
3413void
3414xfs_iext_remove_indirect(
3415 xfs_ifork_t *ifp, /* inode fork pointer */
3416 xfs_extnum_t idx, /* index to begin removing extents */
3417 int count) /* number of extents to remove */
3418{
3419 xfs_ext_irec_t *erp; /* indirection array pointer */
3420 int erp_idx = 0; /* indirection array index */
3421 xfs_extnum_t ext_cnt; /* extents left to remove */
3422 xfs_extnum_t ext_diff; /* extents to remove in current list */
3423 xfs_extnum_t nex1; /* number of extents before idx */
3424 xfs_extnum_t nex2; /* extents after idx + count */
3425 int page_idx = idx; /* index in target extent list */
3426
3427 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3428 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3429 ASSERT(erp != NULL);
3430 nex1 = page_idx;
3431 ext_cnt = count;
3432 while (ext_cnt) {
3433 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
3434 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
3435 /*
3436 * Check for deletion of entire list;
3437 * xfs_iext_irec_remove() updates extent offsets.
3438 */
3439 if (ext_diff == erp->er_extcount) {
3440 xfs_iext_irec_remove(ifp, erp_idx);
3441 ext_cnt -= ext_diff;
3442 nex1 = 0;
3443 if (ext_cnt) {
3444 ASSERT(erp_idx < ifp->if_real_bytes /
3445 XFS_IEXT_BUFSZ);
3446 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3447 nex1 = 0;
3448 continue;
3449 } else {
3450 break;
3451 }
3452 }
3453 /* Move extents up (if needed) */
3454 if (nex2) {
3455 memmove(&erp->er_extbuf[nex1],
3456 &erp->er_extbuf[nex1 + ext_diff],
3457 nex2 * sizeof(xfs_bmbt_rec_t));
3458 }
3459 /* Zero out rest of page */
3460 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
3461 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
3462 /* Update remaining counters */
3463 erp->er_extcount -= ext_diff;
3464 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
3465 ext_cnt -= ext_diff;
3466 nex1 = 0;
3467 erp_idx++;
3468 erp++;
3469 }
3470 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
3471 xfs_iext_irec_compact(ifp);
3472}
3473
3474/*
3475 * Create, destroy, or resize a linear (direct) block of extents.
3476 */
3477void
3478xfs_iext_realloc_direct(
3479 xfs_ifork_t *ifp, /* inode fork pointer */
3480 int new_size) /* new size of extents */
3481{
3482 int rnew_size; /* real new size of extents */
3483
3484 rnew_size = new_size;
3485
3486 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
3487 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
3488 (new_size != ifp->if_real_bytes)));
3489
3490 /* Free extent records */
3491 if (new_size == 0) {
3492 xfs_iext_destroy(ifp);
3493 }
3494 /* Resize direct extent list and zero any new bytes */
3495 else if (ifp->if_real_bytes) {
3496 /* Check if extents will fit inside the inode */
3497 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
3498 xfs_iext_direct_to_inline(ifp, new_size /
3499 (uint)sizeof(xfs_bmbt_rec_t));
3500 ifp->if_bytes = new_size;
3501 return;
3502 }
3503 if (!is_power_of_2(new_size)) {
3504 rnew_size = roundup_pow_of_two(new_size);
3505 }
3506 if (rnew_size != ifp->if_real_bytes) {
3507 ifp->if_u1.if_extents =
3508 kmem_realloc(ifp->if_u1.if_extents,
3509 rnew_size,
3510 ifp->if_real_bytes, KM_NOFS);
3511 }
3512 if (rnew_size > ifp->if_real_bytes) {
3513 memset(&ifp->if_u1.if_extents[ifp->if_bytes /
3514 (uint)sizeof(xfs_bmbt_rec_t)], 0,
3515 rnew_size - ifp->if_real_bytes);
3516 }
3517 }
3518 /*
3519 * Switch from the inline extent buffer to a direct
3520 * extent list. Be sure to include the inline extent
3521 * bytes in new_size.
3522 */
3523 else {
3524 new_size += ifp->if_bytes;
3525 if (!is_power_of_2(new_size)) {
3526 rnew_size = roundup_pow_of_two(new_size);
3527 }
3528 xfs_iext_inline_to_direct(ifp, rnew_size);
3529 }
3530 ifp->if_real_bytes = rnew_size;
3531 ifp->if_bytes = new_size;
3532}
3533
3534/*
3535 * Switch from linear (direct) extent records to inline buffer.
3536 */
3537void
3538xfs_iext_direct_to_inline(
3539 xfs_ifork_t *ifp, /* inode fork pointer */
3540 xfs_extnum_t nextents) /* number of extents in file */
3541{
3542 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3543 ASSERT(nextents <= XFS_INLINE_EXTS);
3544 /*
3545 * The inline buffer was zeroed when we switched
3546 * from inline to direct extent allocation mode,
3547 * so we don't need to clear it here.
3548 */
3549 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
3550 nextents * sizeof(xfs_bmbt_rec_t));
3551 kmem_free(ifp->if_u1.if_extents);
3552 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3553 ifp->if_real_bytes = 0;
3554}
3555
3556/*
3557 * Switch from inline buffer to linear (direct) extent records.
3558 * new_size should already be rounded up to the next power of 2
3559 * by the caller (when appropriate), so use new_size as it is.
3560 * However, since new_size may be rounded up, we can't update
3561 * if_bytes here. It is the caller's responsibility to update
3562 * if_bytes upon return.
3563 */
3564void
3565xfs_iext_inline_to_direct(
3566 xfs_ifork_t *ifp, /* inode fork pointer */
3567 int new_size) /* number of extents in file */
3568{
3569 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
3570 memset(ifp->if_u1.if_extents, 0, new_size);
3571 if (ifp->if_bytes) {
3572 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
3573 ifp->if_bytes);
3574 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3575 sizeof(xfs_bmbt_rec_t));
3576 }
3577 ifp->if_real_bytes = new_size;
3578}
3579
3580/*
3581 * Resize an extent indirection array to new_size bytes.
3582 */
3583STATIC void
3584xfs_iext_realloc_indirect(
3585 xfs_ifork_t *ifp, /* inode fork pointer */
3586 int new_size) /* new indirection array size */
3587{
3588 int nlists; /* number of irec's (ex lists) */
3589 int size; /* current indirection array size */
3590
3591 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3592 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3593 size = nlists * sizeof(xfs_ext_irec_t);
3594 ASSERT(ifp->if_real_bytes);
3595 ASSERT((new_size >= 0) && (new_size != size));
3596 if (new_size == 0) {
3597 xfs_iext_destroy(ifp);
3598 } else {
3599 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
3600 kmem_realloc(ifp->if_u1.if_ext_irec,
3601 new_size, size, KM_NOFS);
3602 }
3603}
3604
3605/*
3606 * Switch from indirection array to linear (direct) extent allocations.
3607 */
3608STATIC void
3609xfs_iext_indirect_to_direct(
3610 xfs_ifork_t *ifp) /* inode fork pointer */
3611{
3612 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
3613 xfs_extnum_t nextents; /* number of extents in file */
3614 int size; /* size of file extents */
3615
3616 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3617 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3618 ASSERT(nextents <= XFS_LINEAR_EXTS);
3619 size = nextents * sizeof(xfs_bmbt_rec_t);
3620
3621 xfs_iext_irec_compact_pages(ifp);
3622 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
3623
3624 ep = ifp->if_u1.if_ext_irec->er_extbuf;
3625 kmem_free(ifp->if_u1.if_ext_irec);
3626 ifp->if_flags &= ~XFS_IFEXTIREC;
3627 ifp->if_u1.if_extents = ep;
3628 ifp->if_bytes = size;
3629 if (nextents < XFS_LINEAR_EXTS) {
3630 xfs_iext_realloc_direct(ifp, size);
3631 }
3632}
3633
3634/*
3635 * Free incore file extents.
3636 */
3637void
3638xfs_iext_destroy(
3639 xfs_ifork_t *ifp) /* inode fork pointer */
3640{
3641 if (ifp->if_flags & XFS_IFEXTIREC) {
3642 int erp_idx;
3643 int nlists;
3644
3645 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3646 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
3647 xfs_iext_irec_remove(ifp, erp_idx);
3648 }
3649 ifp->if_flags &= ~XFS_IFEXTIREC;
3650 } else if (ifp->if_real_bytes) {
3651 kmem_free(ifp->if_u1.if_extents);
3652 } else if (ifp->if_bytes) {
3653 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3654 sizeof(xfs_bmbt_rec_t));
3655 }
3656 ifp->if_u1.if_extents = NULL;
3657 ifp->if_real_bytes = 0;
3658 ifp->if_bytes = 0;
3659}
3660
3661/*
3662 * Return a pointer to the extent record for file system block bno.
3663 */
3664xfs_bmbt_rec_host_t * /* pointer to found extent record */
3665xfs_iext_bno_to_ext(
3666 xfs_ifork_t *ifp, /* inode fork pointer */
3667 xfs_fileoff_t bno, /* block number to search for */
3668 xfs_extnum_t *idxp) /* index of target extent */
3669{
3670 xfs_bmbt_rec_host_t *base; /* pointer to first extent */
3671 xfs_filblks_t blockcount = 0; /* number of blocks in extent */
3672 xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
3673 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
3674 int high; /* upper boundary in search */
3675 xfs_extnum_t idx = 0; /* index of target extent */
3676 int low; /* lower boundary in search */
3677 xfs_extnum_t nextents; /* number of file extents */
3678 xfs_fileoff_t startoff = 0; /* start offset of extent */
3679
3680 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3681 if (nextents == 0) {
3682 *idxp = 0;
3683 return NULL;
3684 }
3685 low = 0;
3686 if (ifp->if_flags & XFS_IFEXTIREC) {
3687 /* Find target extent list */
3688 int erp_idx = 0;
3689 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
3690 base = erp->er_extbuf;
3691 high = erp->er_extcount - 1;
3692 } else {
3693 base = ifp->if_u1.if_extents;
3694 high = nextents - 1;
3695 }
3696 /* Binary search extent records */
3697 while (low <= high) {
3698 idx = (low + high) >> 1;
3699 ep = base + idx;
3700 startoff = xfs_bmbt_get_startoff(ep);
3701 blockcount = xfs_bmbt_get_blockcount(ep);
3702 if (bno < startoff) {
3703 high = idx - 1;
3704 } else if (bno >= startoff + blockcount) {
3705 low = idx + 1;
3706 } else {
3707 /* Convert back to file-based extent index */
3708 if (ifp->if_flags & XFS_IFEXTIREC) {
3709 idx += erp->er_extoff;
3710 }
3711 *idxp = idx;
3712 return ep;
3713 }
3714 }
3715 /* Convert back to file-based extent index */
3716 if (ifp->if_flags & XFS_IFEXTIREC) {
3717 idx += erp->er_extoff;
3718 }
3719 if (bno >= startoff + blockcount) {
3720 if (++idx == nextents) {
3721 ep = NULL;
3722 } else {
3723 ep = xfs_iext_get_ext(ifp, idx);
3724 }
3725 }
3726 *idxp = idx;
3727 return ep;
3728}
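The binary search above works on ranges rather than keys: extents are sorted, non-overlapping (startoff, blockcount) pairs, and a probe block either falls inside the midpoint extent or steers the search left or right. A self-contained sketch of the same search over simplified types, returning the index of the extent containing bno or of the first extent after it (an assumed simplification of the kernel's pointer-returning variant):

/* Sketch of the range binary search in xfs_iext_bno_to_ext(). */
typedef struct { unsigned long long startoff, blockcount; } ext;

static int bno_to_ext(const ext *base, int nextents, unsigned long long bno)
{
    int low = 0, high = nextents - 1, idx = 0;

    while (low <= high) {
        idx = (low + high) >> 1;
        if (bno < base[idx].startoff)
            high = idx - 1;                         /* search left half */
        else if (bno >= base[idx].startoff + base[idx].blockcount)
            low = idx + 1;                          /* search right half */
        else
            return idx;                             /* bno inside this extent */
    }
    /* not found: step past an extent that ends at or before bno */
    if (nextents && bno >= base[idx].startoff + base[idx].blockcount)
        idx++;
    return idx;     /* index of the first extent at or after bno */
}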
3729
3730/*
3731 * Return a pointer to the indirection array entry containing the
3732 * extent record for filesystem block bno. Store the index of the
3733 * target irec in *erp_idxp.
3734 */
3735xfs_ext_irec_t * /* pointer to found extent record */
3736xfs_iext_bno_to_irec(
3737 xfs_ifork_t *ifp, /* inode fork pointer */
3738 xfs_fileoff_t bno, /* block number to search for */
3739 int *erp_idxp) /* irec index of target ext list */
3740{
3741 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
3742 xfs_ext_irec_t *erp_next; /* next indirection array entry */
3743 int erp_idx; /* indirection array index */
3744 int nlists; /* number of extent irec's (lists) */
3745 int high; /* binary search upper limit */
3746 int low; /* binary search lower limit */
3747
3748 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3749 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3750 erp_idx = 0;
3751 low = 0;
3752 high = nlists - 1;
3753 while (low <= high) {
3754 erp_idx = (low + high) >> 1;
3755 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3756 erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
3757 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
3758 high = erp_idx - 1;
3759 } else if (erp_next && bno >=
3760 xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
3761 low = erp_idx + 1;
3762 } else {
3763 break;
3764 }
3765 }
3766 *erp_idxp = erp_idx;
3767 return erp;
3768}
3769
3770/*
3771 * Return a pointer to the indirection array entry containing the
3772 * extent record at file extent index *idxp. Store the index of the
3773 * target irec in *erp_idxp and store the page index of the target
3774 * extent record in *idxp.
3775 */
3776xfs_ext_irec_t *
3777xfs_iext_idx_to_irec(
3778 xfs_ifork_t *ifp, /* inode fork pointer */
3779 xfs_extnum_t *idxp, /* extent index (file -> page) */
3780 int *erp_idxp, /* pointer to target irec */
3781 int realloc) /* new bytes were just added */
3782{
3783 xfs_ext_irec_t *prev; /* pointer to previous irec */
3784 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */
3785 int erp_idx; /* indirection array index */
3786 int nlists; /* number of irec's (ex lists) */
3787 int high; /* binary search upper limit */
3788 int low; /* binary search lower limit */
3789 xfs_extnum_t page_idx = *idxp; /* extent index in target list */
3790
3791 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3792 ASSERT(page_idx >= 0);
3793 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3794 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3795
3796 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3797 erp_idx = 0;
3798 low = 0;
3799 high = nlists - 1;
3800
3801 /* Binary search extent irec's */
3802 while (low <= high) {
3803 erp_idx = (low + high) >> 1;
3804 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3805 prev = erp_idx > 0 ? erp - 1 : NULL;
3806 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
3807 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
3808 high = erp_idx - 1;
3809 } else if (page_idx > erp->er_extoff + erp->er_extcount ||
3810 (page_idx == erp->er_extoff + erp->er_extcount &&
3811 !realloc)) {
3812 low = erp_idx + 1;
3813 } else if (page_idx == erp->er_extoff + erp->er_extcount &&
3814 erp->er_extcount == XFS_LINEAR_EXTS) {
3815 ASSERT(realloc);
3816 page_idx = 0;
3817 erp_idx++;
3818 erp = erp_idx < nlists ? erp + 1 : NULL;
3819 break;
3820 } else {
3821 page_idx -= erp->er_extoff;
3822 break;
3823 }
3824 }
3825 *idxp = page_idx;
3826 *erp_idxp = erp_idx;
3827 return erp;
3828}
3829
3830/*
3831 * Allocate and initialize an indirection array once the space needed
3832 * for incore extents increases above XFS_IEXT_BUFSZ.
3833 */
3834void
3835xfs_iext_irec_init(
3836 xfs_ifork_t *ifp) /* inode fork pointer */
3837{
3838 xfs_ext_irec_t *erp; /* indirection array pointer */
3839 xfs_extnum_t nextents; /* number of extents in file */
3840
3841 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3842 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3843 ASSERT(nextents <= XFS_LINEAR_EXTS);
3844
3845 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
3846
3847 if (nextents == 0) {
3848 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3849 } else if (!ifp->if_real_bytes) {
3850 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
3851 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
3852 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
3853 }
3854 erp->er_extbuf = ifp->if_u1.if_extents;
3855 erp->er_extcount = nextents;
3856 erp->er_extoff = 0;
3857
3858 ifp->if_flags |= XFS_IFEXTIREC;
3859 ifp->if_real_bytes = XFS_IEXT_BUFSZ;
3860 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
3861 ifp->if_u1.if_ext_irec = erp;
3862
3863 return;
3864}
3865
3866/*
3867 * Allocate and initialize a new entry in the indirection array.
3868 */
3869xfs_ext_irec_t *
3870xfs_iext_irec_new(
3871 xfs_ifork_t *ifp, /* inode fork pointer */
3872 int erp_idx) /* index for new irec */
3873{
3874 xfs_ext_irec_t *erp; /* indirection array pointer */
3875 int i; /* loop counter */
3876 int nlists; /* number of irec's (ex lists) */
3877
3878 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3879 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3880
3881 /* Resize indirection array */
3882 xfs_iext_realloc_indirect(ifp, ++nlists *
3883 sizeof(xfs_ext_irec_t));
3884 /*
3885 * Move records down in the array so the
3886 * new page can use erp_idx.
3887 */
3888 erp = ifp->if_u1.if_ext_irec;
3889 for (i = nlists - 1; i > erp_idx; i--) {
3890 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
3891 }
3892 ASSERT(i == erp_idx);
3893
3894 /* Initialize new extent record */
3895 erp = ifp->if_u1.if_ext_irec;
3896 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3897 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3898 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
3899 erp[erp_idx].er_extcount = 0;
3900 erp[erp_idx].er_extoff = erp_idx > 0 ?
3901 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
3902 return &erp[erp_idx];
3903}
3904
3905/*
3906 * Remove a record from the indirection array.
3907 */
3908void
3909xfs_iext_irec_remove(
3910 xfs_ifork_t *ifp, /* inode fork pointer */
3911 int erp_idx) /* irec index to remove */
3912{
3913 xfs_ext_irec_t *erp; /* indirection array pointer */
3914 int i; /* loop counter */
3915 int nlists; /* number of irec's (ex lists) */
3916
3917 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3918 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3919 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3920 if (erp->er_extbuf) {
3921 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
3922 -erp->er_extcount);
3923 kmem_free(erp->er_extbuf);
3924 }
3925 /* Compact extent records */
3926 erp = ifp->if_u1.if_ext_irec;
3927 for (i = erp_idx; i < nlists - 1; i++) {
3928 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
3929 }
3930 /*
3931 * Manually free the last extent record from the indirection
3932 * array. A call to xfs_iext_realloc_indirect() with a size
3933 * of zero would result in a call to xfs_iext_destroy() which
3934 * would in turn call this function again, creating a nasty
3935 * infinite loop.
3936 */
3937 if (--nlists) {
3938 xfs_iext_realloc_indirect(ifp,
3939 nlists * sizeof(xfs_ext_irec_t));
3940 } else {
3941 kmem_free(ifp->if_u1.if_ext_irec);
3942 }
3943 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3944}
3945
3946/*
3947 * This is called to clean up large amounts of unused memory allocated
3948 * by the indirection array. Before compacting anything though, verify
3949 * that the indirection array is still needed and switch back to the
3950 * linear extent list (or even the inline buffer) if possible. The
3951 * compaction policy is as follows:
3952 *
3953 * Full Compaction: Extents fit into a single page (or inline buffer)
3954 * Partial Compaction: Extents occupy less than 50% of allocated space
3955 * No Compaction: Extents occupy at least 50% of allocated space
3956 */
3957void
3958xfs_iext_irec_compact(
3959 xfs_ifork_t *ifp) /* inode fork pointer */
3960{
3961 xfs_extnum_t nextents; /* number of extents in file */
3962 int nlists; /* number of irec's (ex lists) */
3963
3964 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3965 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3966 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3967
3968 if (nextents == 0) {
3969 xfs_iext_destroy(ifp);
3970 } else if (nextents <= XFS_INLINE_EXTS) {
3971 xfs_iext_indirect_to_direct(ifp);
3972 xfs_iext_direct_to_inline(ifp, nextents);
3973 } else if (nextents <= XFS_LINEAR_EXTS) {
3974 xfs_iext_indirect_to_direct(ifp);
3975 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
3976 xfs_iext_irec_compact_pages(ifp);
3977 }
3978}
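The policy in the comment above reduces to four threshold checks on the extent count versus the allocated capacity. A compact sketch of that decision table, using the same constants as earlier (2 inline records, 256 records per 4 KiB page); the enum names are illustrative, not kernel identifiers:

/* Sketch of the compaction policy in xfs_iext_irec_compact(). */
enum action { DESTROY, TO_INLINE, TO_DIRECT, MERGE_PAGES, NOTHING };

static enum action compact_policy(int nextents, int nlists)
{
    if (nextents == 0)
        return DESTROY;                     /* no extents: free everything */
    if (nextents <= 2)                      /* XFS_INLINE_EXTS */
        return TO_INLINE;                   /* full compaction to inline */
    if (nextents <= 256)                    /* XFS_LINEAR_EXTS */
        return TO_DIRECT;                   /* full compaction to one list */
    if (nextents < (nlists * 256) >> 1)     /* under 50% occupancy */
        return MERGE_PAGES;                 /* partial: merge neighbor pages */
    return NOTHING;                         /* at least 50% full: leave it */
}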
3979
3980/*
3981 * Combine extents from neighboring extent pages.
3982 */
3983void
3984xfs_iext_irec_compact_pages(
3985 xfs_ifork_t *ifp) /* inode fork pointer */
3986{
3987 xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */
3988 int erp_idx = 0; /* indirection array index */
3989 int nlists; /* number of irec's (ex lists) */
3990
3991 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3992 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3993 while (erp_idx < nlists - 1) {
3994 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3995 erp_next = erp + 1;
3996 if (erp_next->er_extcount <=
3997 (XFS_LINEAR_EXTS - erp->er_extcount)) {
3998 memcpy(&erp->er_extbuf[erp->er_extcount],
3999 erp_next->er_extbuf, erp_next->er_extcount *
4000 sizeof(xfs_bmbt_rec_t));
4001 erp->er_extcount += erp_next->er_extcount;
4002 /*
4003 * Free page before removing extent record
4004 * so er_extoffs don't get modified in
4005 * xfs_iext_irec_remove.
4006 */
4007 kmem_free(erp_next->er_extbuf);
4008 erp_next->er_extbuf = NULL;
4009 xfs_iext_irec_remove(ifp, erp_idx + 1);
4010 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4011 } else {
4012 erp_idx++;
4013 }
4014 }
4015}
4016
4017/*
4018 * This is called to update the er_extoff field in the indirection
4019 * array when extents have been added or removed from one of the
4020 * extent lists. erp_idx contains the irec index to begin updating
4021 * at and ext_diff contains the number of extents that were added
4022 * or removed.
4023 */
4024void
4025xfs_iext_irec_update_extoffs(
4026 xfs_ifork_t *ifp, /* inode fork pointer */
4027 int erp_idx, /* irec index to update */
4028 int ext_diff) /* number of new extents */
4029{
4030 int i; /* loop counter */
4031 int nlists; /* number of irec's (ex lists) */
4032
4033 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4034 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4035 for (i = erp_idx; i < nlists; i++) {
4036 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
4037 }
4038}
4039
4040/*
4041 * Test whether it is appropriate to check an inode for and free post EOF
4042 * blocks. The 'force' parameter determines whether we should also consider
4043 * regular files that are marked preallocated or append-only.
4044 */
4045bool
4046xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
4047{
4048 /* prealloc/delalloc exists only on regular files */
4049 if (!S_ISREG(ip->i_d.di_mode))
4050 return false;
4051
4052 /*
4053 * Zero sized files with no cached pages and delalloc blocks will not
4054 * have speculative prealloc/delalloc blocks to remove.
4055 */
4056 if (VFS_I(ip)->i_size == 0 &&
4057 VN_CACHED(VFS_I(ip)) == 0 &&
4058 ip->i_delayed_blks == 0)
4059 return false;
4060
4061 /* If we haven't read in the extent list, then don't do it now. */
4062 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
4063 return false;
4064
4065 /*
4066 * Do not free real preallocated or append-only files unless the file
4067 * has delalloc blocks and we are forced to remove them.
4068 */
4069 if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
4070 if (!force || ip->i_delayed_blks == 0)
4071 return false;
4072
4073 return true;
4074}
4075
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b55fd347ab5b..4a91358c1470 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -18,225 +18,15 @@
18#ifndef __XFS_INODE_H__
19#define __XFS_INODE_H__
20
21struct posix_acl;
22struct xfs_dinode;
23struct xfs_inode;
24
25/*
26 * Fork identifiers.
27 */
28#define XFS_DATA_FORK 0
29#define XFS_ATTR_FORK 1
30
31/*
32 * The following xfs_ext_irec_t struct introduces a second (top) level
33 * to the in-core extent allocation scheme. These structs are allocated
34 * in a contiguous block, creating an indirection array where each entry
35 * (irec) contains a pointer to a buffer of in-core extent records which
36 * it manages. Each extent buffer is 4k in size, since 4k is the system
37 * page size on Linux i386 and systems with larger page sizes don't seem
38 * to gain much, if anything, by using their native page size as the
39 * extent buffer size. Also, using 4k extent buffers everywhere provides
40 * a consistent interface for CXFS across different platforms.
41 *
42 * There is currently no limit on the number of irec's (extent lists)
43 * allowed, so heavily fragmented files may require an indirection array
44 * which spans multiple system pages of memory. The number of extents
45 * which would require this amount of contiguous memory is very large
46 * and should not cause problems in the foreseeable future. However,
47 * if the memory needed for the contiguous array ever becomes a problem,
48 * it is possible that a third level of indirection may be required.
49 */
50typedef struct xfs_ext_irec {
51 xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
52 xfs_extnum_t er_extoff; /* extent offset in file */
53 xfs_extnum_t er_extcount; /* number of extents in page/block */
54} xfs_ext_irec_t;
55 23
56/* 24/*
57 * File incore extent information, present for each of data & attr forks. 25 * Kernel only inode definitions
58 */ 26 */
59#define XFS_IEXT_BUFSZ 4096
60#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
61#define XFS_INLINE_EXTS 2
62#define XFS_INLINE_DATA 32
63typedef struct xfs_ifork {
64 int if_bytes; /* bytes in if_u1 */
65 int if_real_bytes; /* bytes allocated in if_u1 */
66 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */
69 union {
70 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
71 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
72 char *if_data; /* inline file data */
73 } if_u1;
74 union {
75 xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
76 /* very small file extents */
77 char if_inline_data[XFS_INLINE_DATA];
78 /* very small file data */
79 xfs_dev_t if_rdev; /* dev number if special */
80 uuid_t if_uuid; /* mount point value */
81 } if_u2;
82} xfs_ifork_t;
83
84/*
85 * Inode location information. Stored in the inode and passed to
86 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
87 */
88struct xfs_imap {
89 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
90 ushort im_len; /* length in BBs of inode chunk */
91 ushort im_boffset; /* inode offset in block in bytes */
92};
93
94/*
95 * This is the xfs in-core inode structure.
96 * Most of the on-disk inode is embedded in the i_d field.
97 *
98 * The extent pointers/inline file space, however, are managed
99 * separately. The memory for this information is pointed to by
100 * the if_u1 unions depending on the type of the data.
101 * This is used to linearize the array of extents for fast in-core
102 * access. This is used until the file's number of extents
103 * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
104 * are accessed through the buffer cache.
105 *
106 * Other state kept in the in-core inode is used for identification,
107 * locking, transactional updating, etc of the inode.
108 *
109 * Generally, we do not want to hold the i_rlock while holding the
110 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
111 *
112 * xfs_iptr_t contains all the inode fields up to and including the
113 * i_mnext and i_mprev fields; it is used as a marker in the inode
114 * chain off the mount structure by xfs_sync calls.
115 */
116
117typedef struct xfs_ictimestamp {
118 __int32_t t_sec; /* timestamp seconds */
119 __int32_t t_nsec; /* timestamp nanoseconds */
120} xfs_ictimestamp_t;
121
122/*
123 * NOTE: This structure must be kept identical to struct xfs_dinode
124 * in xfs_dinode.h except for the endianness annotations.
125 */
126typedef struct xfs_icdinode {
127 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
128 __uint16_t di_mode; /* mode and type of file */
129 __int8_t di_version; /* inode version */
130 __int8_t di_format; /* format of di_c data */
131 __uint16_t di_onlink; /* old number of links to file */
132 __uint32_t di_uid; /* owner's user id */
133 __uint32_t di_gid; /* owner's group id */
134 __uint32_t di_nlink; /* number of links to file */
135 __uint16_t di_projid_lo; /* lower part of owner's project id */
136 __uint16_t di_projid_hi; /* higher part of owner's project id */
137 __uint8_t di_pad[6]; /* unused, zeroed space */
138 __uint16_t di_flushiter; /* incremented on flush */
139 xfs_ictimestamp_t di_atime; /* time last accessed */
140 xfs_ictimestamp_t di_mtime; /* time last modified */
141 xfs_ictimestamp_t di_ctime; /* time created/inode modified */
142 xfs_fsize_t di_size; /* number of bytes in file */
143 xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
144 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
145 xfs_extnum_t di_nextents; /* number of extents in data fork */
146 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
147 __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
148 __int8_t di_aformat; /* format of attr fork's data */
149 __uint32_t di_dmevmask; /* DMIG event mask */
150 __uint16_t di_dmstate; /* DMIG state info */
151 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
152 __uint32_t di_gen; /* generation number */
153
154 /* di_next_unlinked is the only non-core field in the old dinode */
155 xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
156
157 /* start of the extended dinode, writable fields */
158 __uint32_t di_crc; /* CRC of the inode */
159 __uint64_t di_changecount; /* number of attribute changes */
160 xfs_lsn_t di_lsn; /* flush sequence */
161 __uint64_t di_flags2; /* more random flags */
162 __uint8_t di_pad2[16]; /* more padding for future expansion */
163
164 /* fields only written to during inode creation */
165 xfs_ictimestamp_t di_crtime; /* time created */
166 xfs_ino_t di_ino; /* inode number */
167 uuid_t di_uuid; /* UUID of the filesystem */
168
169 /* structure must be padded to 64 bit alignment */
170} xfs_icdinode_t;
171
172static inline uint xfs_icdinode_size(int version)
173{
174 if (version == 3)
175 return sizeof(struct xfs_icdinode);
176 return offsetof(struct xfs_icdinode, di_next_unlinked);
177}
178
179/*
180 * Flags for xfs_ichgtime().
181 */
182#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
183#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
184#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
185
186/*
187 * Per-fork incore inode flags.
188 */
189#define XFS_IFINLINE 0x01 /* Inline data is read in */
190#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
191#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
192#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
193
194/*
195 * Fork handling.
196 */
197
198#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
199#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
200
201#define XFS_IFORK_PTR(ip,w) \
202 ((w) == XFS_DATA_FORK ? \
203 &(ip)->i_df : \
204 (ip)->i_afp)
205#define XFS_IFORK_DSIZE(ip) \
206 (XFS_IFORK_Q(ip) ? \
207 XFS_IFORK_BOFF(ip) : \
208 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
209#define XFS_IFORK_ASIZE(ip) \
210 (XFS_IFORK_Q(ip) ? \
211 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
212 XFS_IFORK_BOFF(ip) : \
213 0)
214#define XFS_IFORK_SIZE(ip,w) \
215 ((w) == XFS_DATA_FORK ? \
216 XFS_IFORK_DSIZE(ip) : \
217 XFS_IFORK_ASIZE(ip))
218#define XFS_IFORK_FORMAT(ip,w) \
219 ((w) == XFS_DATA_FORK ? \
220 (ip)->i_d.di_format : \
221 (ip)->i_d.di_aformat)
222#define XFS_IFORK_FMT_SET(ip,w,n) \
223 ((w) == XFS_DATA_FORK ? \
224 ((ip)->i_d.di_format = (n)) : \
225 ((ip)->i_d.di_aformat = (n)))
226#define XFS_IFORK_NEXTENTS(ip,w) \
227 ((w) == XFS_DATA_FORK ? \
228 (ip)->i_d.di_nextents : \
229 (ip)->i_d.di_anextents)
230#define XFS_IFORK_NEXT_SET(ip,w,n) \
231 ((w) == XFS_DATA_FORK ? \
232 ((ip)->i_d.di_nextents = (n)) : \
233 ((ip)->i_d.di_anextents = (n)))
234#define XFS_IFORK_MAXEXT(ip, w) \
235 (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
236
237
238#ifdef __KERNEL__
239 27
28struct xfs_dinode;
29struct xfs_inode;
240struct xfs_buf; 30struct xfs_buf;
241struct xfs_bmap_free; 31struct xfs_bmap_free;
242struct xfs_bmbt_irec; 32struct xfs_bmbt_irec;
@@ -525,9 +315,21 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
525 ((pip)->i_d.di_mode & S_ISGID)) 315 ((pip)->i_d.di_mode & S_ISGID))
526 316
527 317
528/* 318int xfs_release(struct xfs_inode *ip);
529 * xfs_inode.c prototypes. 319int xfs_inactive(struct xfs_inode *ip);
530 */ 320int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
321 struct xfs_inode **ipp, struct xfs_name *ci_name);
322int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
323 umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
324int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
325 struct xfs_inode *ip);
326int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
327 struct xfs_name *target_name);
328int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
329 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
330 struct xfs_name *target_name,
331 struct xfs_inode *target_ip);
332
531void xfs_ilock(xfs_inode_t *, uint); 333void xfs_ilock(xfs_inode_t *, uint);
532int xfs_ilock_nowait(xfs_inode_t *, uint); 334int xfs_ilock_nowait(xfs_inode_t *, uint);
533void xfs_iunlock(xfs_inode_t *, uint); 335void xfs_iunlock(xfs_inode_t *, uint);
@@ -548,13 +350,28 @@ int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
548int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 350int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
549 351
550void xfs_iext_realloc(xfs_inode_t *, int, int); 352void xfs_iext_realloc(xfs_inode_t *, int, int);
353
551void xfs_iunpin_wait(xfs_inode_t *); 354void xfs_iunpin_wait(xfs_inode_t *);
355#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
356
552int xfs_iflush(struct xfs_inode *, struct xfs_buf **); 357int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
553void xfs_lock_inodes(xfs_inode_t **, int, uint); 358void xfs_lock_inodes(xfs_inode_t **, int, uint);
554void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 359void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
555 360
556xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); 361xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
557 362
363int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
364 xfs_nlink_t, xfs_dev_t, prid_t, int,
365 struct xfs_inode **, int *);
366int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
367int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
368void xfs_bump_ino_vers2(struct xfs_trans *, struct xfs_inode *);
369
370/* from xfs_file.c */
371int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
372int xfs_iozero(struct xfs_inode *, loff_t, size_t);
373
374
558#define IHOLD(ip) \ 375#define IHOLD(ip) \
559do { \ 376do { \
560 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 377 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
@@ -568,65 +385,6 @@ do { \
568 iput(VFS_I(ip)); \ 385 iput(VFS_I(ip)); \
569} while (0) 386} while (0)
570 387
571#endif /* __KERNEL__ */
572
573/*
574 * Flags for xfs_iget()
575 */
576#define XFS_IGET_CREATE 0x1
577#define XFS_IGET_UNTRUSTED 0x2
578#define XFS_IGET_DONTCACHE 0x4
579
580int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
581 struct xfs_imap *, struct xfs_dinode **,
582 struct xfs_buf **, uint, uint);
583int xfs_iread(struct xfs_mount *, struct xfs_trans *,
584 struct xfs_inode *, uint);
585void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
586void xfs_dinode_to_disk(struct xfs_dinode *,
587 struct xfs_icdinode *);
588void xfs_idestroy_fork(struct xfs_inode *, int);
589void xfs_idata_realloc(struct xfs_inode *, int, int);
590void xfs_iroot_realloc(struct xfs_inode *, int, int);
591int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
592int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
593
594xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
595void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t,
596 xfs_bmbt_irec_t *, int);
597void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
598void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
599void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int);
600void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
601void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
602void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
603void xfs_iext_realloc_direct(xfs_ifork_t *, int);
604void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
605void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
606void xfs_iext_destroy(xfs_ifork_t *);
607xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *);
608xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *);
609xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int);
610void xfs_iext_irec_init(xfs_ifork_t *);
611xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int);
612void xfs_iext_irec_remove(xfs_ifork_t *, int);
613void xfs_iext_irec_compact(xfs_ifork_t *);
614void xfs_iext_irec_compact_pages(xfs_ifork_t *);
615void xfs_iext_irec_compact_full(xfs_ifork_t *);
616void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
617bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
618
619#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
620
621#if defined(DEBUG)
622void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
623#else
624#define xfs_inobp_check(mp, bp)
625#endif /* DEBUG */
626
627extern struct kmem_zone *xfs_ifork_zone;
628extern struct kmem_zone *xfs_inode_zone; 388extern struct kmem_zone *xfs_inode_zone;
629extern struct kmem_zone *xfs_ili_zone;
630extern const struct xfs_buf_ops xfs_inode_buf_ops;
631 389
632#endif /* __XFS_INODE_H__ */ 390#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
new file mode 100644
index 000000000000..63382d37f565
--- /dev/null
+++ b/fs/xfs/xfs_inode_buf.c
@@ -0,0 +1,481 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_format.h"
21#include "xfs_log.h"
22#include "xfs_trans.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_bmap_btree.h"
27#include "xfs_ialloc_btree.h"
28#include "xfs_dinode.h"
29#include "xfs_inode.h"
30#include "xfs_error.h"
31#include "xfs_cksum.h"
32#include "xfs_icache.h"
33#include "xfs_ialloc.h"
34
35/*
36 * Check that none of the inodes in the buffer have a next
37 * unlinked field of 0.
38 */
39#if defined(DEBUG)
40void
41xfs_inobp_check(
42 xfs_mount_t *mp,
43 xfs_buf_t *bp)
44{
45 int i;
46 int j;
47 xfs_dinode_t *dip;
48
49 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
50
51 for (i = 0; i < j; i++) {
52 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
53 i * mp->m_sb.sb_inodesize);
54 if (!dip->di_next_unlinked) {
55 xfs_alert(mp,
56 "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
57 i, (long long)bp->b_bn);
58 }
59 }
60}
61#endif
62
63/*
64 * If we are doing readahead on an inode buffer, we might be in log recovery
65 * reading an inode allocation buffer that hasn't yet been replayed, and hence
66 * has not had the inode cores stamped into it. Hence for readahead, the buffer
67 * may be invalid.
68 *
69 * If the readahead buffer is invalid, we don't want to mark it with an error,
70 * but we do want to clear the DONE status of the buffer so that a followup read
71 * will re-read it from disk. This ensures that we don't get unnecessary
72 * warnings during log recovery or unnecessary panics on debug
73 * kernels.
74 */
75static void
76xfs_inode_buf_verify(
77 struct xfs_buf *bp,
78 bool readahead)
79{
80 struct xfs_mount *mp = bp->b_target->bt_mount;
81 int i;
82 int ni;
83
84 /*
85 * Validate the magic number and version of every inode in the buffer
86 */
87 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
88 for (i = 0; i < ni; i++) {
89 int di_ok;
90 xfs_dinode_t *dip;
91
92 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
93 (i << mp->m_sb.sb_inodelog));
94 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
95 XFS_DINODE_GOOD_VERSION(dip->di_version);
96 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
97 XFS_ERRTAG_ITOBP_INOTOBP,
98 XFS_RANDOM_ITOBP_INOTOBP))) {
99 if (readahead) {
100 bp->b_flags &= ~XBF_DONE;
101 return;
102 }
103
104 xfs_buf_ioerror(bp, EFSCORRUPTED);
105 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
106 mp, dip);
107#ifdef DEBUG
108 xfs_alert(mp,
109 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
110 (unsigned long long)bp->b_bn, i,
111 be16_to_cpu(dip->di_magic));
112#endif
113 }
114 }
115 xfs_inobp_check(mp, bp);
116}
117
118
119static void
120xfs_inode_buf_read_verify(
121 struct xfs_buf *bp)
122{
123 xfs_inode_buf_verify(bp, false);
124}
125
126static void
127xfs_inode_buf_readahead_verify(
128 struct xfs_buf *bp)
129{
130 xfs_inode_buf_verify(bp, true);
131}
132
133static void
134xfs_inode_buf_write_verify(
135 struct xfs_buf *bp)
136{
137 xfs_inode_buf_verify(bp, false);
138}
139
140const struct xfs_buf_ops xfs_inode_buf_ops = {
141 .verify_read = xfs_inode_buf_read_verify,
142 .verify_write = xfs_inode_buf_write_verify,
143};
144
145const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
146 .verify_read = xfs_inode_buf_readahead_verify,
147 .verify_write = xfs_inode_buf_write_verify,
148};
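/*
 * A userspace sketch of the dispatch set up by the two ops tables
 * above: both share one write verifier and differ only in how a bad
 * read is reported. The normal table raises an error; the readahead
 * table silently clears the "done" state so a later read retries.
 * struct vbuf and its fields are invented for illustration.
 */
#include <assert.h>
#include <stdbool.h>

struct vbuf {
	bool	done;		/* contents considered valid */
	int	error;		/* sticky error code */
	bool	corrupt;	/* what the check would find */
};

struct vbuf_ops {
	void	(*verify_read)(struct vbuf *);
};

static void read_verify(struct vbuf *b)
{
	if (b->corrupt)
		b->error = 117;		/* stand-in for EFSCORRUPTED */
}

static void readahead_verify(struct vbuf *b)
{
	if (b->corrupt)
		b->done = false;	/* no error: just force a re-read */
}

static const struct vbuf_ops normal_ops = { read_verify };
static const struct vbuf_ops ra_ops = { readahead_verify };

int main(void)
{
	struct vbuf b = { true, 0, true };

	ra_ops.verify_read(&b);
	assert(b.error == 0 && !b.done);	/* readahead stays quiet */
	normal_ops.verify_read(&b);
	assert(b.error != 0);			/* normal read errors out */
	return 0;
}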
149
150
151/*
152 * This routine is called to map an inode to the buffer containing the on-disk
153 * version of the inode. It returns a pointer to the buffer containing the
154 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
155 * pointer to the on-disk inode within that buffer.
156 *
157 * If a non-zero error is returned, then the contents of bpp and dipp are
158 * undefined.
159 */
160int
161xfs_imap_to_bp(
162 struct xfs_mount *mp,
163 struct xfs_trans *tp,
164 struct xfs_imap *imap,
165 struct xfs_dinode **dipp,
166 struct xfs_buf **bpp,
167 uint buf_flags,
168 uint iget_flags)
169{
170 struct xfs_buf *bp;
171 int error;
172
173 buf_flags |= XBF_UNMAPPED;
174 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
175 (int)imap->im_len, buf_flags, &bp,
176 &xfs_inode_buf_ops);
177 if (error) {
178 if (error == EAGAIN) {
179 ASSERT(buf_flags & XBF_TRYLOCK);
180 return error;
181 }
182
183 if (error == EFSCORRUPTED &&
184 (iget_flags & XFS_IGET_UNTRUSTED))
185 return XFS_ERROR(EINVAL);
186
187 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
188 __func__, error);
189 return error;
190 }
191
192 *bpp = bp;
193 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
194 return 0;
195}
196
197void
198xfs_dinode_from_disk(
199 xfs_icdinode_t *to,
200 xfs_dinode_t *from)
201{
202 to->di_magic = be16_to_cpu(from->di_magic);
203 to->di_mode = be16_to_cpu(from->di_mode);
 204 to->di_version = from->di_version;
205 to->di_format = from->di_format;
206 to->di_onlink = be16_to_cpu(from->di_onlink);
207 to->di_uid = be32_to_cpu(from->di_uid);
208 to->di_gid = be32_to_cpu(from->di_gid);
209 to->di_nlink = be32_to_cpu(from->di_nlink);
210 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
211 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
212 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
213 to->di_flushiter = be16_to_cpu(from->di_flushiter);
214 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
215 to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
216 to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
217 to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
218 to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
219 to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
220 to->di_size = be64_to_cpu(from->di_size);
221 to->di_nblocks = be64_to_cpu(from->di_nblocks);
222 to->di_extsize = be32_to_cpu(from->di_extsize);
223 to->di_nextents = be32_to_cpu(from->di_nextents);
224 to->di_anextents = be16_to_cpu(from->di_anextents);
225 to->di_forkoff = from->di_forkoff;
226 to->di_aformat = from->di_aformat;
227 to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
228 to->di_dmstate = be16_to_cpu(from->di_dmstate);
229 to->di_flags = be16_to_cpu(from->di_flags);
230 to->di_gen = be32_to_cpu(from->di_gen);
231
232 if (to->di_version == 3) {
233 to->di_changecount = be64_to_cpu(from->di_changecount);
234 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
235 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
236 to->di_flags2 = be64_to_cpu(from->di_flags2);
237 to->di_ino = be64_to_cpu(from->di_ino);
238 to->di_lsn = be64_to_cpu(from->di_lsn);
239 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
240 uuid_copy(&to->di_uuid, &from->di_uuid);
241 }
242}
243
244void
245xfs_dinode_to_disk(
246 xfs_dinode_t *to,
247 xfs_icdinode_t *from)
248{
249 to->di_magic = cpu_to_be16(from->di_magic);
250 to->di_mode = cpu_to_be16(from->di_mode);
 251 to->di_version = from->di_version;
252 to->di_format = from->di_format;
253 to->di_onlink = cpu_to_be16(from->di_onlink);
254 to->di_uid = cpu_to_be32(from->di_uid);
255 to->di_gid = cpu_to_be32(from->di_gid);
256 to->di_nlink = cpu_to_be32(from->di_nlink);
257 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
258 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
259 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
260 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
261 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
262 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
263 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
264 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
265 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
266 to->di_size = cpu_to_be64(from->di_size);
267 to->di_nblocks = cpu_to_be64(from->di_nblocks);
268 to->di_extsize = cpu_to_be32(from->di_extsize);
269 to->di_nextents = cpu_to_be32(from->di_nextents);
270 to->di_anextents = cpu_to_be16(from->di_anextents);
271 to->di_forkoff = from->di_forkoff;
272 to->di_aformat = from->di_aformat;
273 to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
274 to->di_dmstate = cpu_to_be16(from->di_dmstate);
275 to->di_flags = cpu_to_be16(from->di_flags);
276 to->di_gen = cpu_to_be32(from->di_gen);
277
278 if (from->di_version == 3) {
279 to->di_changecount = cpu_to_be64(from->di_changecount);
280 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
281 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
282 to->di_flags2 = cpu_to_be64(from->di_flags2);
283 to->di_ino = cpu_to_be64(from->di_ino);
284 to->di_lsn = cpu_to_be64(from->di_lsn);
285 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
286 uuid_copy(&to->di_uuid, &from->di_uuid);
287 to->di_flushiter = 0;
288 } else {
289 to->di_flushiter = cpu_to_be16(from->di_flushiter);
290 }
291}
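/*
 * The two converters above are inverses field by field, so a
 * from-disk/to-disk round trip reproduces the on-disk bytes. A minimal
 * sketch for one 32-bit field; htobe32/be32toh come from glibc's
 * <endian.h> (substitute the usual shift-based helpers elsewhere), and
 * the struct/field names here are invented.
 */
#include <assert.h>
#include <endian.h>
#include <stdint.h>

struct disk_rec { uint32_t d_gen; };	/* big-endian on disk */
struct core_rec { uint32_t c_gen; };	/* CPU-endian in memory */

static void rec_from_disk(struct core_rec *to, const struct disk_rec *from)
{
	to->c_gen = be32toh(from->d_gen);
}

static void rec_to_disk(struct disk_rec *to, const struct core_rec *from)
{
	to->d_gen = htobe32(from->c_gen);
}

int main(void)
{
	struct disk_rec d = { htobe32(0xdeadbeef) }, d2;
	struct core_rec c;

	rec_from_disk(&c, &d);
	rec_to_disk(&d2, &c);
	assert(d2.d_gen == d.d_gen);	/* round trip is lossless */
	return 0;
}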
292
293static bool
294xfs_dinode_verify(
295 struct xfs_mount *mp,
296 struct xfs_inode *ip,
297 struct xfs_dinode *dip)
298{
299 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
300 return false;
301
302 /* only version 3 or greater inodes are extensively verified here */
303 if (dip->di_version < 3)
304 return true;
305
306 if (!xfs_sb_version_hascrc(&mp->m_sb))
307 return false;
308 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
309 offsetof(struct xfs_dinode, di_crc)))
310 return false;
311 if (be64_to_cpu(dip->di_ino) != ip->i_ino)
312 return false;
313 if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
314 return false;
315 return true;
316}
317
318void
319xfs_dinode_calc_crc(
320 struct xfs_mount *mp,
321 struct xfs_dinode *dip)
322{
323 __uint32_t crc;
324
325 if (dip->di_version < 3)
326 return;
327
328 ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
329 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
330 offsetof(struct xfs_dinode, di_crc));
331 dip->di_crc = xfs_end_cksum(crc);
332}
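/*
 * The pattern above checksums the whole inode while treating the bytes
 * of di_crc itself as zero, then stores the result in that hole; the
 * verifier in xfs_dinode_verify() recomputes the same way. A userspace
 * sketch of the idea, using a trivial byte sum in place of the kernel's
 * crc32c:
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct rec { uint32_t a; uint32_t crc; uint32_t b; };

/* Sum all bytes, reading the 4 bytes at crc_off as zero. */
static uint32_t cksum(const void *p, size_t len, size_t crc_off)
{
	const uint8_t *c = p;
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < len; i++)
		if (i < crc_off || i >= crc_off + 4)
			sum += c[i];
	return sum;
}

int main(void)
{
	struct rec r = { 7, 0, 9 };
	size_t off = offsetof(struct rec, crc);

	r.crc = cksum(&r, sizeof(r), off);		/* calc side */
	assert(cksum(&r, sizeof(r), off) == r.crc);	/* verify side */
	return 0;
}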
333
334/*
335 * Read the disk inode attributes into the in-core inode structure.
336 *
337 * For version 5 superblocks, if we are initialising a new inode and we are not
 338 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
339 * inode core with a random generation number. If we are keeping inodes around,
340 * we need to read the inode cluster to get the existing generation number off
341 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
342 * format) then log recovery is dependent on the di_flushiter field being
343 * initialised from the current on-disk value and hence we must also read the
344 * inode off disk.
345 */
346int
347xfs_iread(
348 xfs_mount_t *mp,
349 xfs_trans_t *tp,
350 xfs_inode_t *ip,
351 uint iget_flags)
352{
353 xfs_buf_t *bp;
354 xfs_dinode_t *dip;
355 int error;
356
357 /*
358 * Fill in the location information in the in-core inode.
359 */
360 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
361 if (error)
362 return error;
363
364 /* shortcut IO on inode allocation if possible */
365 if ((iget_flags & XFS_IGET_CREATE) &&
366 xfs_sb_version_hascrc(&mp->m_sb) &&
367 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
368 /* initialise the on-disk inode core */
369 memset(&ip->i_d, 0, sizeof(ip->i_d));
370 ip->i_d.di_magic = XFS_DINODE_MAGIC;
371 ip->i_d.di_gen = prandom_u32();
372 if (xfs_sb_version_hascrc(&mp->m_sb)) {
373 ip->i_d.di_version = 3;
374 ip->i_d.di_ino = ip->i_ino;
375 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
376 } else
377 ip->i_d.di_version = 2;
378 return 0;
379 }
380
381 /*
382 * Get pointers to the on-disk inode and the buffer containing it.
383 */
384 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
385 if (error)
386 return error;
387
388 /* even unallocated inodes are verified */
389 if (!xfs_dinode_verify(mp, ip, dip)) {
 390 xfs_alert(mp, "%s: validation failed for inode %lld",
391 __func__, ip->i_ino);
392
393 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
394 error = XFS_ERROR(EFSCORRUPTED);
395 goto out_brelse;
396 }
397
398 /*
399 * If the on-disk inode is already linked to a directory
400 * entry, copy all of the inode into the in-core inode.
401 * xfs_iformat_fork() handles copying in the inode format
402 * specific information.
403 * Otherwise, just get the truly permanent information.
404 */
405 if (dip->di_mode) {
406 xfs_dinode_from_disk(&ip->i_d, dip);
407 error = xfs_iformat_fork(ip, dip);
408 if (error) {
409#ifdef DEBUG
 410 xfs_alert(mp, "%s: xfs_iformat_fork() returned error %d",
411 __func__, error);
412#endif /* DEBUG */
413 goto out_brelse;
414 }
415 } else {
416 /*
417 * Partial initialisation of the in-core inode. Just the bits
418 * that xfs_ialloc won't overwrite or relies on being correct.
419 */
420 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
421 ip->i_d.di_version = dip->di_version;
422 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
423 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
424
425 if (dip->di_version == 3) {
426 ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
427 uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
428 }
429
430 /*
431 * Make sure to pull in the mode here as well in
432 * case the inode is released without being used.
433 * This ensures that xfs_inactive() will see that
434 * the inode is already free and not try to mess
435 * with the uninitialized part of it.
436 */
437 ip->i_d.di_mode = 0;
438 }
439
440 /*
441 * The inode format changed when we moved the link count and
442 * made it 32 bits long. If this is an old format inode,
443 * convert it in memory to look like a new one. If it gets
444 * flushed to disk we will convert back before flushing or
445 * logging it. We zero out the new projid field and the old link
446 * count field. We'll handle clearing the pad field (the remains
447 * of the old uuid field) when we actually convert the inode to
448 * the new format. We don't change the version number so that we
449 * can distinguish this from a real new format inode.
450 */
451 if (ip->i_d.di_version == 1) {
452 ip->i_d.di_nlink = ip->i_d.di_onlink;
453 ip->i_d.di_onlink = 0;
454 xfs_set_projid(ip, 0);
455 }
456
457 ip->i_delayed_blks = 0;
458
459 /*
460 * Mark the buffer containing the inode as something to keep
461 * around for a while. This helps to keep recently accessed
462 * meta-data in-core longer.
463 */
464 xfs_buf_set_ref(bp, XFS_INO_REF);
465
466 /*
467 * Use xfs_trans_brelse() to release the buffer containing the on-disk
468 * inode, because it was acquired with xfs_trans_read_buf() in
469 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
470 * brelse(). If we're within a transaction, then xfs_trans_brelse()
471 * will only release the buffer if it is not dirty within the
472 * transaction. It will be OK to release the buffer in this case,
473 * because inodes on disk are never destroyed and we will be locking the
474 * new in-core inode before putting it in the cache where other
475 * processes can find it. Thus we don't have to worry about the inode
476 * being changed just because we released the buffer.
477 */
478 out_brelse:
479 xfs_trans_brelse(tp, bp);
480 return error;
481}
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
new file mode 100644
index 000000000000..abba0ae8cf2d
--- /dev/null
+++ b/fs/xfs/xfs_inode_buf.h
@@ -0,0 +1,53 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_INODE_BUF_H__
19#define __XFS_INODE_BUF_H__
20
21struct xfs_inode;
22struct xfs_dinode;
23struct xfs_icdinode;
24
25/*
26 * Inode location information. Stored in the inode and passed to
27 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
28 */
29struct xfs_imap {
30 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
31 ushort im_len; /* length in BBs of inode chunk */
32 ushort im_boffset; /* inode offset in block in bytes */
33};
34
35int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
36 struct xfs_imap *, struct xfs_dinode **,
37 struct xfs_buf **, uint, uint);
38int xfs_iread(struct xfs_mount *, struct xfs_trans *,
39 struct xfs_inode *, uint);
40void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
41void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
42void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
43
44#if defined(DEBUG)
45void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
46#else
47#define xfs_inobp_check(mp, bp)
48#endif /* DEBUG */
49
50extern const struct xfs_buf_ops xfs_inode_buf_ops;
51extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
52
53#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
new file mode 100644
index 000000000000..02f1083955bb
--- /dev/null
+++ b/fs/xfs/xfs_inode_fork.c
@@ -0,0 +1,1920 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <linux/log2.h>
19
20#include "xfs.h"
21#include "xfs_fs.h"
22#include "xfs_format.h"
23#include "xfs_log.h"
24#include "xfs_inum.h"
25#include "xfs_trans.h"
26#include "xfs_trans_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_mount.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dinode.h"
35#include "xfs_inode.h"
36#include "xfs_buf_item.h"
37#include "xfs_inode_item.h"
38#include "xfs_btree.h"
39#include "xfs_alloc.h"
40#include "xfs_ialloc.h"
41#include "xfs_bmap.h"
42#include "xfs_error.h"
43#include "xfs_quota.h"
44#include "xfs_filestream.h"
45#include "xfs_cksum.h"
46#include "xfs_trace.h"
47#include "xfs_icache.h"
48
49kmem_zone_t *xfs_ifork_zone;
50
51STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
52STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
53STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
54
55#ifdef DEBUG
56/*
57 * Make sure that the extents in the given memory buffer
58 * are valid.
59 */
60void
61xfs_validate_extents(
62 xfs_ifork_t *ifp,
63 int nrecs,
64 xfs_exntfmt_t fmt)
65{
66 xfs_bmbt_irec_t irec;
67 xfs_bmbt_rec_host_t rec;
68 int i;
69
70 for (i = 0; i < nrecs; i++) {
71 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
72 rec.l0 = get_unaligned(&ep->l0);
73 rec.l1 = get_unaligned(&ep->l1);
74 xfs_bmbt_get_all(&rec, &irec);
75 if (fmt == XFS_EXTFMT_NOSTATE)
76 ASSERT(irec.br_state == XFS_EXT_NORM);
77 }
78}
79#else /* DEBUG */
80#define xfs_validate_extents(ifp, nrecs, fmt)
81#endif /* DEBUG */
82
83
84/*
85 * Move inode type and inode format specific information from the
86 * on-disk inode to the in-core inode. For fifos, devs, and sockets
87 * this means set if_rdev to the proper value. For files, directories,
88 * and symlinks this means to bring in the in-line data or extent
89 * pointers. For a file in B-tree format, only the root is immediately
90 * brought in-core. The rest will be in-lined in if_extents when it
91 * is first referenced (see xfs_iread_extents()).
92 */
93int
94xfs_iformat_fork(
95 xfs_inode_t *ip,
96 xfs_dinode_t *dip)
97{
98 xfs_attr_shortform_t *atp;
99 int size;
100 int error = 0;
101 xfs_fsize_t di_size;
102
103 if (unlikely(be32_to_cpu(dip->di_nextents) +
104 be16_to_cpu(dip->di_anextents) >
105 be64_to_cpu(dip->di_nblocks))) {
106 xfs_warn(ip->i_mount,
107 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
108 (unsigned long long)ip->i_ino,
109 (int)(be32_to_cpu(dip->di_nextents) +
110 be16_to_cpu(dip->di_anextents)),
111 (unsigned long long)
112 be64_to_cpu(dip->di_nblocks));
113 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
114 ip->i_mount, dip);
115 return XFS_ERROR(EFSCORRUPTED);
116 }
117
118 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
119 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
120 (unsigned long long)ip->i_ino,
121 dip->di_forkoff);
122 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
123 ip->i_mount, dip);
124 return XFS_ERROR(EFSCORRUPTED);
125 }
126
127 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
128 !ip->i_mount->m_rtdev_targp)) {
129 xfs_warn(ip->i_mount,
130 "corrupt dinode %Lu, has realtime flag set.",
131 ip->i_ino);
132 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
133 XFS_ERRLEVEL_LOW, ip->i_mount, dip);
134 return XFS_ERROR(EFSCORRUPTED);
135 }
136
137 switch (ip->i_d.di_mode & S_IFMT) {
138 case S_IFIFO:
139 case S_IFCHR:
140 case S_IFBLK:
141 case S_IFSOCK:
142 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
143 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
144 ip->i_mount, dip);
145 return XFS_ERROR(EFSCORRUPTED);
146 }
147 ip->i_d.di_size = 0;
148 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
149 break;
150
151 case S_IFREG:
152 case S_IFLNK:
153 case S_IFDIR:
154 switch (dip->di_format) {
155 case XFS_DINODE_FMT_LOCAL:
156 /*
157 * no local regular files yet
158 */
159 if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
160 xfs_warn(ip->i_mount,
161 "corrupt inode %Lu (local format for regular file).",
162 (unsigned long long) ip->i_ino);
163 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
164 XFS_ERRLEVEL_LOW,
165 ip->i_mount, dip);
166 return XFS_ERROR(EFSCORRUPTED);
167 }
168
169 di_size = be64_to_cpu(dip->di_size);
170 if (unlikely(di_size < 0 ||
171 di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
172 xfs_warn(ip->i_mount,
173 "corrupt inode %Lu (bad size %Ld for local inode).",
174 (unsigned long long) ip->i_ino,
175 (long long) di_size);
176 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
177 XFS_ERRLEVEL_LOW,
178 ip->i_mount, dip);
179 return XFS_ERROR(EFSCORRUPTED);
180 }
181
182 size = (int)di_size;
183 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
184 break;
185 case XFS_DINODE_FMT_EXTENTS:
186 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
187 break;
188 case XFS_DINODE_FMT_BTREE:
189 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
190 break;
191 default:
192 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
193 ip->i_mount);
194 return XFS_ERROR(EFSCORRUPTED);
195 }
196 break;
197
198 default:
199 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
200 return XFS_ERROR(EFSCORRUPTED);
201 }
202 if (error) {
203 return error;
204 }
205 if (!XFS_DFORK_Q(dip))
206 return 0;
207
208 ASSERT(ip->i_afp == NULL);
209 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
210
211 switch (dip->di_aformat) {
212 case XFS_DINODE_FMT_LOCAL:
213 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
214 size = be16_to_cpu(atp->hdr.totsize);
215
216 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
217 xfs_warn(ip->i_mount,
218 "corrupt inode %Lu (bad attr fork size %Ld).",
219 (unsigned long long) ip->i_ino,
220 (long long) size);
221 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
222 XFS_ERRLEVEL_LOW,
223 ip->i_mount, dip);
224 return XFS_ERROR(EFSCORRUPTED);
225 }
226
227 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
228 break;
229 case XFS_DINODE_FMT_EXTENTS:
230 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
231 break;
232 case XFS_DINODE_FMT_BTREE:
233 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
234 break;
235 default:
236 error = XFS_ERROR(EFSCORRUPTED);
237 break;
238 }
239 if (error) {
240 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
241 ip->i_afp = NULL;
242 xfs_idestroy_fork(ip, XFS_DATA_FORK);
243 }
244 return error;
245}
246
247/*
248 * The file is in-lined in the on-disk inode.
249 * If it fits into if_inline_data, then copy
250 * it there, otherwise allocate a buffer for it
251 * and copy the data there. Either way, set
252 * if_data to point at the data.
253 * If we allocate a buffer for the data, make
254 * sure that its size is a multiple of 4 and
 255 * record the real size in if_real_bytes.
256 */
257STATIC int
258xfs_iformat_local(
259 xfs_inode_t *ip,
260 xfs_dinode_t *dip,
261 int whichfork,
262 int size)
263{
264 xfs_ifork_t *ifp;
265 int real_size;
266
267 /*
268 * If the size is unreasonable, then something
269 * is wrong and we just bail out rather than crash in
270 * kmem_alloc() or memcpy() below.
271 */
272 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
273 xfs_warn(ip->i_mount,
274 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
275 (unsigned long long) ip->i_ino, size,
276 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
277 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
278 ip->i_mount, dip);
279 return XFS_ERROR(EFSCORRUPTED);
280 }
281 ifp = XFS_IFORK_PTR(ip, whichfork);
282 real_size = 0;
283 if (size == 0)
284 ifp->if_u1.if_data = NULL;
285 else if (size <= sizeof(ifp->if_u2.if_inline_data))
286 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
287 else {
288 real_size = roundup(size, 4);
289 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
290 }
291 ifp->if_bytes = size;
292 ifp->if_real_bytes = real_size;
293 if (size)
294 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
295 ifp->if_flags &= ~XFS_IFEXTENTS;
296 ifp->if_flags |= XFS_IFINLINE;
297 return 0;
298}
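/*
 * The function above picks one of three homes for local-format data:
 * NULL when empty, the inline area when it fits, otherwise a heap
 * buffer rounded up to a 4-byte multiple so the logged region stays
 * word aligned. A compact userspace sketch of that placement rule
 * (INLINE_BYTES and struct fork_model are invented stand-ins):
 */
#include <assert.h>
#include <stdlib.h>

#define INLINE_BYTES 32		/* stand-in for XFS_INLINE_DATA */

struct fork_model {
	char	inline_data[INLINE_BYTES];
	char	*data;		/* where the bytes actually live */
	int	bytes;		/* valid bytes */
	int	real_bytes;	/* heap allocation size, 0 if inline */
};

static void fork_place(struct fork_model *f, int size)
{
	f->bytes = size;
	if (size == 0) {
		f->data = NULL;
		f->real_bytes = 0;
	} else if (size <= INLINE_BYTES) {
		f->data = f->inline_data;
		f->real_bytes = 0;
	} else {
		f->real_bytes = (size + 3) & ~3;	/* roundup(size, 4) */
		f->data = malloc(f->real_bytes);
	}
}

int main(void)
{
	struct fork_model f;

	fork_place(&f, 10);
	assert(f.data == f.inline_data && f.real_bytes == 0);
	fork_place(&f, 33);
	assert(f.real_bytes == 36);	/* rounded to word multiple */
	free(f.data);
	return 0;
}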
299
300/*
301 * The file consists of a set of extents all
302 * of which fit into the on-disk inode.
303 * If there are few enough extents to fit into
304 * the if_inline_ext, then copy them there.
305 * Otherwise allocate a buffer for them and copy
306 * them into it. Either way, set if_extents
307 * to point at the extents.
308 */
309STATIC int
310xfs_iformat_extents(
311 xfs_inode_t *ip,
312 xfs_dinode_t *dip,
313 int whichfork)
314{
315 xfs_bmbt_rec_t *dp;
316 xfs_ifork_t *ifp;
317 int nex;
318 int size;
319 int i;
320
321 ifp = XFS_IFORK_PTR(ip, whichfork);
322 nex = XFS_DFORK_NEXTENTS(dip, whichfork);
323 size = nex * (uint)sizeof(xfs_bmbt_rec_t);
324
325 /*
326 * If the number of extents is unreasonable, then something
327 * is wrong and we just bail out rather than crash in
328 * kmem_alloc() or memcpy() below.
329 */
330 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
331 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
332 (unsigned long long) ip->i_ino, nex);
333 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
334 ip->i_mount, dip);
335 return XFS_ERROR(EFSCORRUPTED);
336 }
337
338 ifp->if_real_bytes = 0;
339 if (nex == 0)
340 ifp->if_u1.if_extents = NULL;
341 else if (nex <= XFS_INLINE_EXTS)
342 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
343 else
344 xfs_iext_add(ifp, 0, nex);
345
346 ifp->if_bytes = size;
347 if (size) {
348 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
349 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
350 for (i = 0; i < nex; i++, dp++) {
351 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
352 ep->l0 = get_unaligned_be64(&dp->l0);
353 ep->l1 = get_unaligned_be64(&dp->l1);
354 }
355 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
356 if (whichfork != XFS_DATA_FORK ||
357 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
358 if (unlikely(xfs_check_nostate_extents(
359 ifp, 0, nex))) {
360 XFS_ERROR_REPORT("xfs_iformat_extents(2)",
361 XFS_ERRLEVEL_LOW,
362 ip->i_mount);
363 return XFS_ERROR(EFSCORRUPTED);
364 }
365 }
366 ifp->if_flags |= XFS_IFEXTENTS;
367 return 0;
368}
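/*
 * The copy loop above uses get_unaligned_be64() because on-disk extent
 * records carry no alignment guarantee once embedded in the inode
 * literal area. A portable userspace equivalent via memcpy plus byte
 * swapping (be64toh is glibc <endian.h>):
 */
#include <assert.h>
#include <endian.h>
#include <stdint.h>
#include <string.h>

static uint64_t load_be64_unaligned(const void *p)
{
	uint64_t v;

	memcpy(&v, p, sizeof(v));	/* safe at any alignment */
	return be64toh(v);
}

int main(void)
{
	unsigned char buf[9] = { 0xff,	/* force misalignment */
		0, 0, 0, 0, 0, 0, 0x12, 0x34 };

	assert(load_be64_unaligned(buf + 1) == 0x1234);
	return 0;
}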
369
370/*
371 * The file has too many extents to fit into
372 * the inode, so they are in B-tree format.
373 * Allocate a buffer for the root of the B-tree
374 * and copy the root into it. The i_extents
375 * field will remain NULL until all of the
376 * extents are read in (when they are needed).
377 */
378STATIC int
379xfs_iformat_btree(
380 xfs_inode_t *ip,
381 xfs_dinode_t *dip,
382 int whichfork)
383{
384 struct xfs_mount *mp = ip->i_mount;
385 xfs_bmdr_block_t *dfp;
386 xfs_ifork_t *ifp;
387 /* REFERENCED */
388 int nrecs;
389 int size;
390
391 ifp = XFS_IFORK_PTR(ip, whichfork);
392 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
393 size = XFS_BMAP_BROOT_SPACE(mp, dfp);
394 nrecs = be16_to_cpu(dfp->bb_numrecs);
395
396 /*
 397 * blow out if -- fork has fewer extents than can fit in
398 * fork (fork shouldn't be a btree format), root btree
399 * block has more records than can fit into the fork,
400 * or the number of extents is greater than the number of
401 * blocks.
402 */
403 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
404 XFS_IFORK_MAXEXT(ip, whichfork) ||
405 XFS_BMDR_SPACE_CALC(nrecs) >
406 XFS_DFORK_SIZE(dip, mp, whichfork) ||
407 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
408 xfs_warn(mp, "corrupt inode %Lu (btree).",
409 (unsigned long long) ip->i_ino);
410 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
411 mp, dip);
412 return XFS_ERROR(EFSCORRUPTED);
413 }
414
415 ifp->if_broot_bytes = size;
416 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
417 ASSERT(ifp->if_broot != NULL);
418 /*
419 * Copy and convert from the on-disk structure
420 * to the in-memory structure.
421 */
422 xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
423 ifp->if_broot, size);
424 ifp->if_flags &= ~XFS_IFEXTENTS;
425 ifp->if_flags |= XFS_IFBROOT;
426
427 return 0;
428}
429
430/*
431 * Read in extents from a btree-format inode.
432 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
433 */
434int
435xfs_iread_extents(
436 xfs_trans_t *tp,
437 xfs_inode_t *ip,
438 int whichfork)
439{
440 int error;
441 xfs_ifork_t *ifp;
442 xfs_extnum_t nextents;
443
444 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
445 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
446 ip->i_mount);
447 return XFS_ERROR(EFSCORRUPTED);
448 }
449 nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
450 ifp = XFS_IFORK_PTR(ip, whichfork);
451
452 /*
453 * We know that the size is valid (it's checked in iformat_btree)
454 */
455 ifp->if_bytes = ifp->if_real_bytes = 0;
456 ifp->if_flags |= XFS_IFEXTENTS;
457 xfs_iext_add(ifp, 0, nextents);
458 error = xfs_bmap_read_extents(tp, ip, whichfork);
459 if (error) {
460 xfs_iext_destroy(ifp);
461 ifp->if_flags &= ~XFS_IFEXTENTS;
462 return error;
463 }
464 xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
465 return 0;
466}
467/*
468 * Reallocate the space for if_broot based on the number of records
469 * being added or deleted as indicated in rec_diff. Move the records
470 * and pointers in if_broot to fit the new size. When shrinking this
471 * will eliminate holes between the records and pointers created by
472 * the caller. When growing this will create holes to be filled in
473 * by the caller.
474 *
475 * The caller must not request to add more records than would fit in
 476 * the on-disk inode root. If if_broot is currently NULL and we are
 477 * adding records, one will be allocated. The caller must also
478 * not request that the number of records go below zero, although
479 * it can go to zero.
480 *
481 * ip -- the inode whose if_broot area is changing
 482 * rec_diff -- the change in the number of records, positive or negative,
483 * requested for the if_broot array.
484 */
485void
486xfs_iroot_realloc(
487 xfs_inode_t *ip,
488 int rec_diff,
489 int whichfork)
490{
491 struct xfs_mount *mp = ip->i_mount;
492 int cur_max;
493 xfs_ifork_t *ifp;
494 struct xfs_btree_block *new_broot;
495 int new_max;
496 size_t new_size;
497 char *np;
498 char *op;
499
500 /*
501 * Handle the degenerate case quietly.
502 */
503 if (rec_diff == 0) {
504 return;
505 }
506
507 ifp = XFS_IFORK_PTR(ip, whichfork);
508 if (rec_diff > 0) {
509 /*
510 * If there wasn't any memory allocated before, just
511 * allocate it now and get out.
512 */
513 if (ifp->if_broot_bytes == 0) {
514 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
515 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
516 ifp->if_broot_bytes = (int)new_size;
517 return;
518 }
519
520 /*
521 * If there is already an existing if_broot, then we need
522 * to realloc() it and shift the pointers to their new
523 * location. The records don't change location because
524 * they are kept butted up against the btree block header.
525 */
526 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
527 new_max = cur_max + rec_diff;
528 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
529 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
530 XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
531 KM_SLEEP | KM_NOFS);
532 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
533 ifp->if_broot_bytes);
534 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
535 (int)new_size);
536 ifp->if_broot_bytes = (int)new_size;
537 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
538 XFS_IFORK_SIZE(ip, whichfork));
539 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
540 return;
541 }
542
543 /*
544 * rec_diff is less than 0. In this case, we are shrinking the
545 * if_broot buffer. It must already exist. If we go to zero
546 * records, just get rid of the root and clear the status bit.
547 */
548 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
549 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
550 new_max = cur_max + rec_diff;
551 ASSERT(new_max >= 0);
552 if (new_max > 0)
553 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
554 else
555 new_size = 0;
556 if (new_size > 0) {
557 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
558 /*
559 * First copy over the btree block header.
560 */
561 memcpy(new_broot, ifp->if_broot,
562 XFS_BMBT_BLOCK_LEN(ip->i_mount));
563 } else {
564 new_broot = NULL;
565 ifp->if_flags &= ~XFS_IFBROOT;
566 }
567
568 /*
569 * Only copy the records and pointers if there are any.
570 */
571 if (new_max > 0) {
572 /*
573 * First copy the records.
574 */
575 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
576 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
577 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
578
579 /*
580 * Then copy the pointers.
581 */
582 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
583 ifp->if_broot_bytes);
584 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
585 (int)new_size);
586 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
587 }
588 kmem_free(ifp->if_broot);
589 ifp->if_broot = new_broot;
590 ifp->if_broot_bytes = (int)new_size;
591 if (ifp->if_broot)
592 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
593 XFS_IFORK_SIZE(ip, whichfork));
594 return;
595}
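/*
 * In the grow path above only the pointer array moves: records stay
 * butted against the block header, so once realloc() has preserved the
 * prefix, a single memmove() relocates the pointers to the end of the
 * larger buffer. A userspace sketch of that layout; the HDR/REC/PTR
 * sizes and helper names are invented for illustration.
 */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define HDR 8
#define REC 16
#define PTR 8

static size_t root_size(int nmax)	{ return HDR + nmax * (REC + PTR); }
static size_t ptr_off(int nmax)		{ return HDR + nmax * REC; }

/* Grow an existing root from cur_max to new_max records. */
static char *root_grow(char *root, int cur_max, int new_max)
{
	root = realloc(root, root_size(new_max));
	/* recs keep their place; only the pointers shift to the new tail */
	memmove(root + ptr_off(new_max), root + ptr_off(cur_max),
		cur_max * PTR);
	return root;
}

int main(void)
{
	char *root = calloc(1, root_size(2));

	root[ptr_off(2)] = 0x5a;	/* first pointer byte */
	root = root_grow(root, 2, 4);
	assert(root[ptr_off(4)] == 0x5a);	/* pointer followed the move */
	free(root);
	return 0;
}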
596
597
598/*
599 * This is called when the amount of space needed for if_data
600 * is increased or decreased. The change in size is indicated by
601 * the number of bytes that need to be added or deleted in the
602 * byte_diff parameter.
603 *
604 * If the amount of space needed has decreased below the size of the
605 * inline buffer, then switch to using the inline buffer. Otherwise,
606 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
607 * to what is needed.
608 *
609 * ip -- the inode whose if_data area is changing
610 * byte_diff -- the change in the number of bytes, positive or negative,
611 * requested for the if_data array.
612 */
613void
614xfs_idata_realloc(
615 xfs_inode_t *ip,
616 int byte_diff,
617 int whichfork)
618{
619 xfs_ifork_t *ifp;
620 int new_size;
621 int real_size;
622
623 if (byte_diff == 0) {
624 return;
625 }
626
627 ifp = XFS_IFORK_PTR(ip, whichfork);
628 new_size = (int)ifp->if_bytes + byte_diff;
629 ASSERT(new_size >= 0);
630
631 if (new_size == 0) {
632 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
633 kmem_free(ifp->if_u1.if_data);
634 }
635 ifp->if_u1.if_data = NULL;
636 real_size = 0;
637 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
638 /*
639 * If the valid extents/data can fit in if_inline_ext/data,
640 * copy them from the malloc'd vector and free it.
641 */
642 if (ifp->if_u1.if_data == NULL) {
643 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
644 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
645 ASSERT(ifp->if_real_bytes != 0);
646 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
647 new_size);
648 kmem_free(ifp->if_u1.if_data);
649 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
650 }
651 real_size = 0;
652 } else {
653 /*
654 * Stuck with malloc/realloc.
655 * For inline data, the underlying buffer must be
656 * a multiple of 4 bytes in size so that it can be
657 * logged and stay on word boundaries. We enforce
658 * that here.
659 */
660 real_size = roundup(new_size, 4);
661 if (ifp->if_u1.if_data == NULL) {
662 ASSERT(ifp->if_real_bytes == 0);
663 ifp->if_u1.if_data = kmem_alloc(real_size,
664 KM_SLEEP | KM_NOFS);
665 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
666 /*
667 * Only do the realloc if the underlying size
668 * is really changing.
669 */
670 if (ifp->if_real_bytes != real_size) {
671 ifp->if_u1.if_data =
672 kmem_realloc(ifp->if_u1.if_data,
673 real_size,
674 ifp->if_real_bytes,
675 KM_SLEEP | KM_NOFS);
676 }
677 } else {
678 ASSERT(ifp->if_real_bytes == 0);
679 ifp->if_u1.if_data = kmem_alloc(real_size,
680 KM_SLEEP | KM_NOFS);
681 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
682 ifp->if_bytes);
683 }
684 }
685 ifp->if_real_bytes = real_size;
686 ifp->if_bytes = new_size;
687 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
688}
689
690void
691xfs_idestroy_fork(
692 xfs_inode_t *ip,
693 int whichfork)
694{
695 xfs_ifork_t *ifp;
696
697 ifp = XFS_IFORK_PTR(ip, whichfork);
698 if (ifp->if_broot != NULL) {
699 kmem_free(ifp->if_broot);
700 ifp->if_broot = NULL;
701 }
702
703 /*
704 * If the format is local, then we can't have an extents
705 * array so just look for an inline data array. If we're
706 * not local then we may or may not have an extents list,
707 * so check and free it up if we do.
708 */
709 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
710 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
711 (ifp->if_u1.if_data != NULL)) {
712 ASSERT(ifp->if_real_bytes != 0);
713 kmem_free(ifp->if_u1.if_data);
714 ifp->if_u1.if_data = NULL;
715 ifp->if_real_bytes = 0;
716 }
717 } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
718 ((ifp->if_flags & XFS_IFEXTIREC) ||
719 ((ifp->if_u1.if_extents != NULL) &&
720 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
721 ASSERT(ifp->if_real_bytes != 0);
722 xfs_iext_destroy(ifp);
723 }
724 ASSERT(ifp->if_u1.if_extents == NULL ||
725 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
726 ASSERT(ifp->if_real_bytes == 0);
727 if (whichfork == XFS_ATTR_FORK) {
728 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
729 ip->i_afp = NULL;
730 }
731}
732
733/*
734 * xfs_iextents_copy()
735 *
736 * This is called to copy the REAL extents (as opposed to the delayed
737 * allocation extents) from the inode into the given buffer. It
738 * returns the number of bytes copied into the buffer.
739 *
 740 * Delayed allocation extents are never flushed to disk, so each
 741 * extent is examined in turn and the delayed ones are skipped;
 742 * only the real extents are copied into the buffer.
743 */
744int
745xfs_iextents_copy(
746 xfs_inode_t *ip,
747 xfs_bmbt_rec_t *dp,
748 int whichfork)
749{
750 int copied;
751 int i;
752 xfs_ifork_t *ifp;
753 int nrecs;
754 xfs_fsblock_t start_block;
755
756 ifp = XFS_IFORK_PTR(ip, whichfork);
757 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
758 ASSERT(ifp->if_bytes > 0);
759
760 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
761 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
762 ASSERT(nrecs > 0);
763
764 /*
 765 * There may be delayed allocation extents in the
766 * inode, so copy the extents one at a time and skip
767 * the delayed ones. There must be at least one
768 * non-delayed extent.
769 */
770 copied = 0;
771 for (i = 0; i < nrecs; i++) {
772 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
773 start_block = xfs_bmbt_get_startblock(ep);
774 if (isnullstartblock(start_block)) {
775 /*
776 * It's a delayed allocation extent, so skip it.
777 */
778 continue;
779 }
780
781 /* Translate to on disk format */
782 put_unaligned_be64(ep->l0, &dp->l0);
783 put_unaligned_be64(ep->l1, &dp->l1);
784 dp++;
785 copied++;
786 }
787 ASSERT(copied != 0);
788 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
789
790 return (copied * (uint)sizeof(xfs_bmbt_rec_t));
791}
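/*
 * The loop above is a filtered copy: records whose start block is a
 * "null" (delayed-allocation) marker are skipped, everything else is
 * written out in order. A userspace sketch where a high bit plays the
 * role of isnullstartblock() (DELAYED_BIT is an invented stand-in):
 */
#include <assert.h>
#include <stdint.h>

#define DELAYED_BIT (1ull << 63)	/* stand-in null-startblock marker */

/* Copy real extents from src to dst; returns the number copied. */
static int copy_real_extents(const uint64_t *src, int n, uint64_t *dst)
{
	int i, copied = 0;

	for (i = 0; i < n; i++) {
		if (src[i] & DELAYED_BIT)
			continue;	/* delalloc: skip */
		dst[copied++] = src[i];
	}
	return copied;
}

int main(void)
{
	uint64_t src[3] = { 100, DELAYED_BIT | 5, 200 }, dst[3];

	assert(copy_real_extents(src, 3, dst) == 2);
	assert(dst[0] == 100 && dst[1] == 200);
	return 0;
}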
792
793/*
794 * Each of the following cases stores data into the same region
795 * of the on-disk inode, so only one of them can be valid at
796 * any given time. While it is possible to have conflicting formats
797 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
798 * in EXTENTS format, this can only happen when the fork has
799 * changed formats after being modified but before being flushed.
800 * In these cases, the format always takes precedence, because the
801 * format indicates the current state of the fork.
802 */
803void
804xfs_iflush_fork(
805 xfs_inode_t *ip,
806 xfs_dinode_t *dip,
807 xfs_inode_log_item_t *iip,
808 int whichfork,
809 xfs_buf_t *bp)
810{
811 char *cp;
812 xfs_ifork_t *ifp;
813 xfs_mount_t *mp;
814 static const short brootflag[2] =
815 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
816 static const short dataflag[2] =
817 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
818 static const short extflag[2] =
819 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
820
821 if (!iip)
822 return;
823 ifp = XFS_IFORK_PTR(ip, whichfork);
824 /*
825 * This can happen if we gave up in iformat in an error path,
826 * for the attribute fork.
827 */
828 if (!ifp) {
829 ASSERT(whichfork == XFS_ATTR_FORK);
830 return;
831 }
832 cp = XFS_DFORK_PTR(dip, whichfork);
833 mp = ip->i_mount;
834 switch (XFS_IFORK_FORMAT(ip, whichfork)) {
835 case XFS_DINODE_FMT_LOCAL:
836 if ((iip->ili_fields & dataflag[whichfork]) &&
837 (ifp->if_bytes > 0)) {
838 ASSERT(ifp->if_u1.if_data != NULL);
839 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
840 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
841 }
842 break;
843
844 case XFS_DINODE_FMT_EXTENTS:
845 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
846 !(iip->ili_fields & extflag[whichfork]));
847 if ((iip->ili_fields & extflag[whichfork]) &&
848 (ifp->if_bytes > 0)) {
849 ASSERT(xfs_iext_get_ext(ifp, 0));
850 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
851 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
852 whichfork);
853 }
854 break;
855
856 case XFS_DINODE_FMT_BTREE:
857 if ((iip->ili_fields & brootflag[whichfork]) &&
858 (ifp->if_broot_bytes > 0)) {
859 ASSERT(ifp->if_broot != NULL);
860 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
861 XFS_IFORK_SIZE(ip, whichfork));
862 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
863 (xfs_bmdr_block_t *)cp,
864 XFS_DFORK_SIZE(dip, mp, whichfork));
865 }
866 break;
867
868 case XFS_DINODE_FMT_DEV:
869 if (iip->ili_fields & XFS_ILOG_DEV) {
870 ASSERT(whichfork == XFS_DATA_FORK);
871 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
872 }
873 break;
874
875 case XFS_DINODE_FMT_UUID:
876 if (iip->ili_fields & XFS_ILOG_UUID) {
877 ASSERT(whichfork == XFS_DATA_FORK);
878 memcpy(XFS_DFORK_DPTR(dip),
879 &ip->i_df.if_u2.if_uuid,
880 sizeof(uuid_t));
881 }
882 break;
883
884 default:
885 ASSERT(0);
886 break;
887 }
888}
889
890/*
891 * Return a pointer to the extent record at file index idx.
892 */
893xfs_bmbt_rec_host_t *
894xfs_iext_get_ext(
895 xfs_ifork_t *ifp, /* inode fork pointer */
896 xfs_extnum_t idx) /* index of target extent */
897{
898 ASSERT(idx >= 0);
899 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
900
901 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
902 return ifp->if_u1.if_ext_irec->er_extbuf;
903 } else if (ifp->if_flags & XFS_IFEXTIREC) {
904 xfs_ext_irec_t *erp; /* irec pointer */
905 int erp_idx = 0; /* irec index */
906 xfs_extnum_t page_idx = idx; /* ext index in target list */
907
908 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
909 return &erp->er_extbuf[page_idx];
910 } else if (ifp->if_bytes) {
911 return &ifp->if_u1.if_extents[idx];
912 } else {
913 return NULL;
914 }
915}
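/*
 * --- Illustrative aside (editor's sketch, not part of the patch) ---
 * How a flat extent index resolves through the indirection array: walk
 * the per-page counts until the index lands inside a page.  The real
 * code does this with the binary search in xfs_iext_idx_to_irec()
 * further below; the types here are hypothetical.
 */
struct page_of_recs {
	int count;		/* records in use in this page */
	int recs[256];		/* 256 = extents per 4k page */
};

static int *flat_lookup(struct page_of_recs *pages, int npages, int idx)
{
	for (int p = 0; p < npages; p++) {
		if (idx < pages[p].count)
			return &pages[p].recs[idx];
		idx -= pages[p].count;	/* consume this page's records */
	}
	return 0;	/* index is past the last extent */
}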
916
917/*
918 * Insert new item(s) into the extent records for incore inode
919 * fork 'ifp'. 'count' new items are inserted at index 'idx'.
920 */
921void
922xfs_iext_insert(
923 xfs_inode_t *ip, /* incore inode pointer */
924 xfs_extnum_t idx, /* starting index of new items */
925 xfs_extnum_t count, /* number of inserted items */
926 xfs_bmbt_irec_t *new, /* items to insert */
927 int state) /* type of extent conversion */
928{
929 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
930 xfs_extnum_t i; /* extent record index */
931
932 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
933
934 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
935 xfs_iext_add(ifp, idx, count);
936 for (i = idx; i < idx + count; i++, new++)
937 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
938}
939
940/*
941 * This is called when the amount of space required for incore file
942 * extents needs to be increased. The ext_diff parameter stores the
943 * number of new extents being added and the idx parameter contains
944 * the extent index where the new extents will be added. If the new
945 * extents are being appended, then we just need to (re)allocate and
946 * initialize the space. Otherwise, if the new extents are being
947 * inserted into the middle of the existing entries, a bit more work
948 * is required to make room for the new extents to be inserted. The
949 * caller is responsible for filling in the new extent entries upon
950 * return.
951 */
952void
953xfs_iext_add(
954 xfs_ifork_t *ifp, /* inode fork pointer */
955 xfs_extnum_t idx, /* index to begin adding exts */
956 int ext_diff) /* number of extents to add */
957{
958 int byte_diff; /* new bytes being added */
959 int new_size; /* size of extents after adding */
960 xfs_extnum_t nextents; /* number of extents in file */
961
962 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
963 ASSERT((idx >= 0) && (idx <= nextents));
964 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
965 new_size = ifp->if_bytes + byte_diff;
966 /*
967 * If the new number of extents (nextents + ext_diff)
968 * fits inside the inode, then continue to use the inline
969 * extent buffer.
970 */
971 if (nextents + ext_diff <= XFS_INLINE_EXTS) {
972 if (idx < nextents) {
973 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
974 &ifp->if_u2.if_inline_ext[idx],
975 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
976 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
977 }
978 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
979 ifp->if_real_bytes = 0;
980 }
981 /*
982 * Otherwise use a linear (direct) extent list.
983 * If the extents are currently inside the inode,
984 * xfs_iext_realloc_direct will switch us from
985 * inline to direct extent allocation mode.
986 */
987 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
988 xfs_iext_realloc_direct(ifp, new_size);
989 if (idx < nextents) {
990 memmove(&ifp->if_u1.if_extents[idx + ext_diff],
991 &ifp->if_u1.if_extents[idx],
992 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
993 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
994 }
995 }
996 /* Indirection array */
997 else {
998 xfs_ext_irec_t *erp;
999 int erp_idx = 0;
1000 int page_idx = idx;
1001
1002 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
1003 if (ifp->if_flags & XFS_IFEXTIREC) {
1004 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
1005 } else {
1006 xfs_iext_irec_init(ifp);
1007 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1008 erp = ifp->if_u1.if_ext_irec;
1009 }
1010 /* Extents fit in target extent page */
1011 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
1012 if (page_idx < erp->er_extcount) {
1013 memmove(&erp->er_extbuf[page_idx + ext_diff],
1014 &erp->er_extbuf[page_idx],
1015 (erp->er_extcount - page_idx) *
1016 sizeof(xfs_bmbt_rec_t));
1017 memset(&erp->er_extbuf[page_idx], 0, byte_diff);
1018 }
1019 erp->er_extcount += ext_diff;
1020 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
1021 }
1022 /* Insert a new extent page */
1023 else if (erp) {
1024 xfs_iext_add_indirect_multi(ifp,
1025 erp_idx, page_idx, ext_diff);
1026 }
1027 /*
1028 * If extent(s) are being appended to the last page in
1029 * the indirection array and the new extent(s) don't fit
1030 * in the page, then erp is NULL and erp_idx is set to
1031 * the next index needed in the indirection array.
1032 */
1033 else {
1034 int count = ext_diff;
1035
1036 while (count) {
1037 erp = xfs_iext_irec_new(ifp, erp_idx);
1038 erp->er_extcount = count;
1039 count -= MIN(count, (int)XFS_LINEAR_EXTS);
1040 if (count) {
1041 erp_idx++;
1042 }
1043 }
1044 }
1045 }
1046 ifp->if_bytes = new_size;
1047}
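/*
 * --- Illustrative aside (editor's sketch, not part of the patch) ---
 * The three storage modes chosen above, expressed as a standalone
 * decision function.  256 is XFS_LINEAR_EXTS for 16-byte records in a
 * 4k buffer (4096 / 16); 2 is XFS_INLINE_EXTS.
 */
enum fork_mode { FORK_INLINE, FORK_DIRECT, FORK_INDIRECT };

static enum fork_mode storage_mode(int nextents)
{
	if (nextents <= 2)		/* fits in if_u2.if_inline_ext */
		return FORK_INLINE;
	if (nextents <= 4096 / 16)	/* fits in one linear buffer */
		return FORK_DIRECT;
	return FORK_INDIRECT;		/* needs the indirection array */
}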
1048
1049/*
1050 * This is called when incore extents are being added to the indirection
1051 * array and the new extents do not fit in the target extent list. The
1052 * erp_idx parameter contains the irec index for the target extent list
1053 * in the indirection array, and the idx parameter contains the extent
1054 * index within the list. The number of extents being added is stored
1055 * in the count parameter.
1056 *
1057 * |-------| |-------|
1058 * | | | | idx - number of extents before idx
1059 * | idx | | count |
1060 * | | | | count - number of extents being inserted at idx
1061 * |-------| |-------|
1062 * | count | | nex2 | nex2 - number of extents after idx + count
1063 * |-------| |-------|
1064 */
1065void
1066xfs_iext_add_indirect_multi(
1067 xfs_ifork_t *ifp, /* inode fork pointer */
1068 int erp_idx, /* target extent irec index */
1069 xfs_extnum_t idx, /* index within target list */
1070 int count) /* new extents being added */
1071{
1072 int byte_diff; /* new bytes being added */
1073 xfs_ext_irec_t *erp; /* pointer to irec entry */
1074 xfs_extnum_t ext_diff; /* number of extents to add */
1075 xfs_extnum_t ext_cnt; /* new extents still needed */
1076 xfs_extnum_t nex2; /* extents after idx + count */
1077 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */
1078 int nlists; /* number of irec's (lists) */
1079
1080 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1081 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1082 nex2 = erp->er_extcount - idx;
1083 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1084
1085	/* Save the second part of the target extent list,
1086	 * i.e. all extents past idx + count.
1087	 */
1088 if (nex2) {
1089 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
1090 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
1091 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
1092 erp->er_extcount -= nex2;
1093 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
1094 memset(&erp->er_extbuf[idx], 0, byte_diff);
1095 }
1096
1097 /*
1098 * Add the new extents to the end of the target
1099 * list, then allocate new irec record(s) and
1100 * extent buffer(s) as needed to store the rest
1101 * of the new extents.
1102 */
1103 ext_cnt = count;
1104 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
1105 if (ext_diff) {
1106 erp->er_extcount += ext_diff;
1107 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
1108 ext_cnt -= ext_diff;
1109 }
1110 while (ext_cnt) {
1111 erp_idx++;
1112 erp = xfs_iext_irec_new(ifp, erp_idx);
1113 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
1114 erp->er_extcount = ext_diff;
1115 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
1116 ext_cnt -= ext_diff;
1117 }
1118
1119 /* Add nex2 extents back to indirection array */
1120 if (nex2) {
1121 xfs_extnum_t ext_avail;
1122 int i;
1123
1124 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
1125 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
1126 i = 0;
1127 /*
1128 * If nex2 extents fit in the current page, append
1129 * nex2_ep after the new extents.
1130 */
1131 if (nex2 <= ext_avail) {
1132 i = erp->er_extcount;
1133 }
1134 /*
1135 * Otherwise, check if space is available in the
1136 * next page.
1137 */
1138 else if ((erp_idx < nlists - 1) &&
1139 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
1140 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
1141 erp_idx++;
1142 erp++;
1143 /* Create a hole for nex2 extents */
1144 memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
1145 erp->er_extcount * sizeof(xfs_bmbt_rec_t));
1146 }
1147 /*
1148 * Final choice, create a new extent page for
1149 * nex2 extents.
1150 */
1151 else {
1152 erp_idx++;
1153 erp = xfs_iext_irec_new(ifp, erp_idx);
1154 }
1155 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
1156 kmem_free(nex2_ep);
1157 erp->er_extcount += nex2;
1158 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
1159 }
1160}
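/*
 * --- Illustrative aside (editor's sketch, not part of the patch) ---
 * Distributing 'count' new records across fixed-size pages: first top
 * up the current page, then fill fresh ones -- the same loop structure
 * as the ext_cnt loop above.  PAGE stands in for XFS_LINEAR_EXTS.
 */
static int min_int(int a, int b) { return a < b ? a : b; }

static int pages_needed(int room_in_current, int count)
{
	enum { PAGE = 256 };
	int newpages = 0;

	count -= min_int(count, room_in_current);	/* top up current page */
	while (count > 0) {
		newpages++;				/* allocate a fresh page */
		count -= min_int(count, PAGE);
	}
	return newpages;
}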
1161
1162/*
1163 * This is called when the amount of space required for incore file
1164 * extents needs to be decreased. The ext_diff parameter stores the
1165 * number of extents to be removed and the idx parameter contains
1166 * the extent index where the extents will be removed from.
1167 *
1168 * If the amount of space needed has decreased below the linear
1169 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
1170 * extent array. Otherwise, use kmem_realloc() to adjust the
1171 * size to what is needed.
1172 */
1173void
1174xfs_iext_remove(
1175 xfs_inode_t *ip, /* incore inode pointer */
1176 xfs_extnum_t idx, /* index to begin removing exts */
1177 int ext_diff, /* number of extents to remove */
1178 int state) /* type of extent conversion */
1179{
1180 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
1181 xfs_extnum_t nextents; /* number of extents in file */
1182 int new_size; /* size of extents after removal */
1183
1184 trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
1185
1186 ASSERT(ext_diff > 0);
1187 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1188 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
1189
1190 if (new_size == 0) {
1191 xfs_iext_destroy(ifp);
1192 } else if (ifp->if_flags & XFS_IFEXTIREC) {
1193 xfs_iext_remove_indirect(ifp, idx, ext_diff);
1194 } else if (ifp->if_real_bytes) {
1195 xfs_iext_remove_direct(ifp, idx, ext_diff);
1196 } else {
1197 xfs_iext_remove_inline(ifp, idx, ext_diff);
1198 }
1199 ifp->if_bytes = new_size;
1200}
1201
1202/*
1203 * This removes ext_diff extents from the inline buffer, beginning
1204 * at extent index idx.
1205 */
1206void
1207xfs_iext_remove_inline(
1208 xfs_ifork_t *ifp, /* inode fork pointer */
1209 xfs_extnum_t idx, /* index to begin removing exts */
1210 int ext_diff) /* number of extents to remove */
1211{
1212 int nextents; /* number of extents in file */
1213
1214 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
1215 ASSERT(idx < XFS_INLINE_EXTS);
1216 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1217 ASSERT(((nextents - ext_diff) > 0) &&
1218 (nextents - ext_diff) < XFS_INLINE_EXTS);
1219
1220 if (idx + ext_diff < nextents) {
1221 memmove(&ifp->if_u2.if_inline_ext[idx],
1222 &ifp->if_u2.if_inline_ext[idx + ext_diff],
1223 (nextents - (idx + ext_diff)) *
1224 sizeof(xfs_bmbt_rec_t));
1225 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
1226 0, ext_diff * sizeof(xfs_bmbt_rec_t));
1227 } else {
1228 memset(&ifp->if_u2.if_inline_ext[idx], 0,
1229 ext_diff * sizeof(xfs_bmbt_rec_t));
1230 }
1231}
1232
1233/*
1234 * This removes ext_diff extents from a linear (direct) extent list,
1235 * beginning at extent index idx. If the extents are being removed
1236 * from the end of the list (i.e. truncate), then we just need to re-
1237 * allocate the list to remove the extra space. Otherwise, if the
1238 * extents are being removed from the middle of the existing extent
1239 * entries, then we first need to move the extent records beginning
1240 * at idx + ext_diff up in the list to overwrite the records being
1241 * removed, then remove the extra space via kmem_realloc.
1242 */
1243void
1244xfs_iext_remove_direct(
1245 xfs_ifork_t *ifp, /* inode fork pointer */
1246 xfs_extnum_t idx, /* index to begin removing exts */
1247 int ext_diff) /* number of extents to remove */
1248{
1249 xfs_extnum_t nextents; /* number of extents in file */
1250 int new_size; /* size of extents after removal */
1251
1252 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
1253 new_size = ifp->if_bytes -
1254 (ext_diff * sizeof(xfs_bmbt_rec_t));
1255 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1256
1257 if (new_size == 0) {
1258 xfs_iext_destroy(ifp);
1259 return;
1260 }
1261 /* Move extents up in the list (if needed) */
1262 if (idx + ext_diff < nextents) {
1263 memmove(&ifp->if_u1.if_extents[idx],
1264 &ifp->if_u1.if_extents[idx + ext_diff],
1265 (nextents - (idx + ext_diff)) *
1266 sizeof(xfs_bmbt_rec_t));
1267 }
1268 memset(&ifp->if_u1.if_extents[nextents - ext_diff],
1269 0, ext_diff * sizeof(xfs_bmbt_rec_t));
1270 /*
1271 * Reallocate the direct extent list. If the extents
1272 * will fit inside the inode then xfs_iext_realloc_direct
1273 * will switch from direct to inline extent allocation
1274 * mode for us.
1275 */
1276 xfs_iext_realloc_direct(ifp, new_size);
1277 ifp->if_bytes = new_size;
1278}
1279
1280/*
1281 * This is called when incore extents are being removed from the
1282 * indirection array and the extents being removed span multiple extent
1283 * buffers. The idx parameter contains the file extent index where we
1284 * want to begin removing extents, and the count parameter contains
1285 * how many extents need to be removed.
1286 *
1287 * |-------| |-------|
1288 * | nex1 | | | nex1 - number of extents before idx
1289 * |-------| | count |
1290 * | | | | count - number of extents being removed at idx
1291 * | count | |-------|
1292 * | | | nex2 | nex2 - number of extents after idx + count
1293 * |-------| |-------|
1294 */
1295void
1296xfs_iext_remove_indirect(
1297 xfs_ifork_t *ifp, /* inode fork pointer */
1298 xfs_extnum_t idx, /* index to begin removing extents */
1299 int count) /* number of extents to remove */
1300{
1301 xfs_ext_irec_t *erp; /* indirection array pointer */
1302 int erp_idx = 0; /* indirection array index */
1303 xfs_extnum_t ext_cnt; /* extents left to remove */
1304 xfs_extnum_t ext_diff; /* extents to remove in current list */
1305 xfs_extnum_t nex1; /* number of extents before idx */
1306 xfs_extnum_t nex2; /* extents after idx + count */
1307 int page_idx = idx; /* index in target extent list */
1308
1309 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1310 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
1311 ASSERT(erp != NULL);
1312 nex1 = page_idx;
1313 ext_cnt = count;
1314 while (ext_cnt) {
1315 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
1316 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
1317 /*
1318 * Check for deletion of entire list;
1319 * xfs_iext_irec_remove() updates extent offsets.
1320 */
1321 if (ext_diff == erp->er_extcount) {
1322 xfs_iext_irec_remove(ifp, erp_idx);
1323 ext_cnt -= ext_diff;
1324 nex1 = 0;
1325 if (ext_cnt) {
1326 ASSERT(erp_idx < ifp->if_real_bytes /
1327 XFS_IEXT_BUFSZ);
1328 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1329 nex1 = 0;
1330 continue;
1331 } else {
1332 break;
1333 }
1334 }
1335 /* Move extents up (if needed) */
1336 if (nex2) {
1337 memmove(&erp->er_extbuf[nex1],
1338 &erp->er_extbuf[nex1 + ext_diff],
1339 nex2 * sizeof(xfs_bmbt_rec_t));
1340 }
1341 /* Zero out rest of page */
1342 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
1343 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
1344 /* Update remaining counters */
1345 erp->er_extcount -= ext_diff;
1346 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
1347 ext_cnt -= ext_diff;
1348 nex1 = 0;
1349 erp_idx++;
1350 erp++;
1351 }
1352 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
1353 xfs_iext_irec_compact(ifp);
1354}
1355
1356/*
1357 * Create, destroy, or resize a linear (direct) block of extents.
1358 */
1359void
1360xfs_iext_realloc_direct(
1361 xfs_ifork_t *ifp, /* inode fork pointer */
1362 int new_size) /* new size of extents */
1363{
1364 int rnew_size; /* real new size of extents */
1365
1366 rnew_size = new_size;
1367
1368 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
1369 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
1370 (new_size != ifp->if_real_bytes)));
1371
1372 /* Free extent records */
1373 if (new_size == 0) {
1374 xfs_iext_destroy(ifp);
1375 }
1376 /* Resize direct extent list and zero any new bytes */
1377 else if (ifp->if_real_bytes) {
1378 /* Check if extents will fit inside the inode */
1379 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
1380 xfs_iext_direct_to_inline(ifp, new_size /
1381 (uint)sizeof(xfs_bmbt_rec_t));
1382 ifp->if_bytes = new_size;
1383 return;
1384 }
1385		if (!is_power_of_2(new_size)) {
1386 rnew_size = roundup_pow_of_two(new_size);
1387 }
1388 if (rnew_size != ifp->if_real_bytes) {
1389 ifp->if_u1.if_extents =
1390 kmem_realloc(ifp->if_u1.if_extents,
1391 rnew_size,
1392 ifp->if_real_bytes, KM_NOFS);
1393 }
1394 if (rnew_size > ifp->if_real_bytes) {
1395 memset(&ifp->if_u1.if_extents[ifp->if_bytes /
1396 (uint)sizeof(xfs_bmbt_rec_t)], 0,
1397 rnew_size - ifp->if_real_bytes);
1398 }
1399 }
1400 /*
1401 * Switch from the inline extent buffer to a direct
1402 * extent list. Be sure to include the inline extent
1403 * bytes in new_size.
1404 */
1405 else {
1406 new_size += ifp->if_bytes;
1407 if (!is_power_of_2(new_size)) {
1408 rnew_size = roundup_pow_of_two(new_size);
1409 }
1410 xfs_iext_inline_to_direct(ifp, rnew_size);
1411 }
1412 ifp->if_real_bytes = rnew_size;
1413 ifp->if_bytes = new_size;
1414}
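/*
 * --- Illustrative aside (editor's sketch, not part of the patch) ---
 * The power-of-two growth policy used above, as a standalone helper
 * (cf. roundup_pow_of_two()).  E.g. 9 records * 16 bytes = 144 bytes,
 * which rounds up to a 256-byte allocation.  Valid for n >= 1.
 */
static unsigned int round_up_pow2(unsigned int n)
{
	unsigned int r = 1;

	while (r < n)		/* double until we cover n */
		r <<= 1;
	return r;
}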
1415
1416/*
1417 * Switch from linear (direct) extent records to inline buffer.
1418 */
1419void
1420xfs_iext_direct_to_inline(
1421 xfs_ifork_t *ifp, /* inode fork pointer */
1422 xfs_extnum_t nextents) /* number of extents in file */
1423{
1424 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
1425 ASSERT(nextents <= XFS_INLINE_EXTS);
1426 /*
1427 * The inline buffer was zeroed when we switched
1428 * from inline to direct extent allocation mode,
1429 * so we don't need to clear it here.
1430 */
1431 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
1432 nextents * sizeof(xfs_bmbt_rec_t));
1433 kmem_free(ifp->if_u1.if_extents);
1434 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
1435 ifp->if_real_bytes = 0;
1436}
1437
1438/*
1439 * Switch from inline buffer to linear (direct) extent records.
1440 * new_size should already be rounded up to the next power of 2
1441 * by the caller (when appropriate), so use new_size as it is.
1442 * However, since new_size may be rounded up, we can't update
1443 * if_bytes here. It is the caller's responsibility to update
1444 * if_bytes upon return.
1445 */
1446void
1447xfs_iext_inline_to_direct(
1448 xfs_ifork_t *ifp, /* inode fork pointer */
1449 int new_size) /* number of extents in file */
1450{
1451 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
1452 memset(ifp->if_u1.if_extents, 0, new_size);
1453 if (ifp->if_bytes) {
1454 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
1455 ifp->if_bytes);
1456 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
1457 sizeof(xfs_bmbt_rec_t));
1458 }
1459 ifp->if_real_bytes = new_size;
1460}
1461
1462/*
1463 * Resize an extent indirection array to new_size bytes.
1464 */
1465STATIC void
1466xfs_iext_realloc_indirect(
1467 xfs_ifork_t *ifp, /* inode fork pointer */
1468 int new_size) /* new indirection array size */
1469{
1470 int nlists; /* number of irec's (ex lists) */
1471 int size; /* current indirection array size */
1472
1473 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1474 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1475 size = nlists * sizeof(xfs_ext_irec_t);
1476 ASSERT(ifp->if_real_bytes);
1477 ASSERT((new_size >= 0) && (new_size != size));
1478 if (new_size == 0) {
1479 xfs_iext_destroy(ifp);
1480 } else {
1481 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
1482 kmem_realloc(ifp->if_u1.if_ext_irec,
1483 new_size, size, KM_NOFS);
1484 }
1485}
1486
1487/*
1488 * Switch from indirection array to linear (direct) extent allocations.
1489 */
1490STATIC void
1491xfs_iext_indirect_to_direct(
1492 xfs_ifork_t *ifp) /* inode fork pointer */
1493{
1494 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
1495 xfs_extnum_t nextents; /* number of extents in file */
1496 int size; /* size of file extents */
1497
1498 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1499 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1500 ASSERT(nextents <= XFS_LINEAR_EXTS);
1501 size = nextents * sizeof(xfs_bmbt_rec_t);
1502
1503 xfs_iext_irec_compact_pages(ifp);
1504 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
1505
1506 ep = ifp->if_u1.if_ext_irec->er_extbuf;
1507 kmem_free(ifp->if_u1.if_ext_irec);
1508 ifp->if_flags &= ~XFS_IFEXTIREC;
1509 ifp->if_u1.if_extents = ep;
1510 ifp->if_bytes = size;
1511 if (nextents < XFS_LINEAR_EXTS) {
1512 xfs_iext_realloc_direct(ifp, size);
1513 }
1514}
1515
1516/*
1517 * Free incore file extents.
1518 */
1519void
1520xfs_iext_destroy(
1521 xfs_ifork_t *ifp) /* inode fork pointer */
1522{
1523 if (ifp->if_flags & XFS_IFEXTIREC) {
1524 int erp_idx;
1525 int nlists;
1526
1527 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1528 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
1529 xfs_iext_irec_remove(ifp, erp_idx);
1530 }
1531 ifp->if_flags &= ~XFS_IFEXTIREC;
1532 } else if (ifp->if_real_bytes) {
1533 kmem_free(ifp->if_u1.if_extents);
1534 } else if (ifp->if_bytes) {
1535 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
1536 sizeof(xfs_bmbt_rec_t));
1537 }
1538 ifp->if_u1.if_extents = NULL;
1539 ifp->if_real_bytes = 0;
1540 ifp->if_bytes = 0;
1541}
1542
1543/*
1544 * Return a pointer to the extent record for file system block bno.
1545 */
1546xfs_bmbt_rec_host_t * /* pointer to found extent record */
1547xfs_iext_bno_to_ext(
1548 xfs_ifork_t *ifp, /* inode fork pointer */
1549 xfs_fileoff_t bno, /* block number to search for */
1550 xfs_extnum_t *idxp) /* index of target extent */
1551{
1552 xfs_bmbt_rec_host_t *base; /* pointer to first extent */
1553 xfs_filblks_t blockcount = 0; /* number of blocks in extent */
1554 xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
1555 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
1556 int high; /* upper boundary in search */
1557 xfs_extnum_t idx = 0; /* index of target extent */
1558 int low; /* lower boundary in search */
1559 xfs_extnum_t nextents; /* number of file extents */
1560 xfs_fileoff_t startoff = 0; /* start offset of extent */
1561
1562 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1563 if (nextents == 0) {
1564 *idxp = 0;
1565 return NULL;
1566 }
1567 low = 0;
1568 if (ifp->if_flags & XFS_IFEXTIREC) {
1569 /* Find target extent list */
1570 int erp_idx = 0;
1571 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
1572 base = erp->er_extbuf;
1573 high = erp->er_extcount - 1;
1574 } else {
1575 base = ifp->if_u1.if_extents;
1576 high = nextents - 1;
1577 }
1578 /* Binary search extent records */
1579 while (low <= high) {
1580 idx = (low + high) >> 1;
1581 ep = base + idx;
1582 startoff = xfs_bmbt_get_startoff(ep);
1583 blockcount = xfs_bmbt_get_blockcount(ep);
1584 if (bno < startoff) {
1585 high = idx - 1;
1586 } else if (bno >= startoff + blockcount) {
1587 low = idx + 1;
1588 } else {
1589 /* Convert back to file-based extent index */
1590 if (ifp->if_flags & XFS_IFEXTIREC) {
1591 idx += erp->er_extoff;
1592 }
1593 *idxp = idx;
1594 return ep;
1595 }
1596 }
1597 /* Convert back to file-based extent index */
1598 if (ifp->if_flags & XFS_IFEXTIREC) {
1599 idx += erp->er_extoff;
1600 }
1601 if (bno >= startoff + blockcount) {
1602 if (++idx == nextents) {
1603 ep = NULL;
1604 } else {
1605 ep = xfs_iext_get_ext(ifp, idx);
1606 }
1607 }
1608 *idxp = idx;
1609 return ep;
1610}
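/*
 * --- Illustrative aside (editor's sketch, not part of the patch) ---
 * The range binary search above, reduced to its core.  Each extent
 * covers the half-open range [startoff, startoff + blockcount); we
 * return the index of the extent holding 'bno', or -1 if none does.
 */
struct ext { unsigned long long startoff, blockcount; };

static int find_ext(const struct ext *e, int n, unsigned long long bno)
{
	int low = 0, high = n - 1;

	while (low <= high) {
		int mid = (low + high) >> 1;

		if (bno < e[mid].startoff)
			high = mid - 1;
		else if (bno >= e[mid].startoff + e[mid].blockcount)
			low = mid + 1;
		else
			return mid;	/* bno falls inside this extent */
	}
	return -1;	/* bno is in a hole or past EOF */
}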
1611
1612/*
1613 * Return a pointer to the indirection array entry containing the
1614 * extent record for filesystem block bno. Store the index of the
1615 * target irec in *erp_idxp.
1616 */
1617xfs_ext_irec_t * /* pointer to found extent record */
1618xfs_iext_bno_to_irec(
1619 xfs_ifork_t *ifp, /* inode fork pointer */
1620 xfs_fileoff_t bno, /* block number to search for */
1621 int *erp_idxp) /* irec index of target ext list */
1622{
1623 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
1624 xfs_ext_irec_t *erp_next; /* next indirection array entry */
1625 int erp_idx; /* indirection array index */
1626 int nlists; /* number of extent irec's (lists) */
1627 int high; /* binary search upper limit */
1628 int low; /* binary search lower limit */
1629
1630 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1631 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1632 erp_idx = 0;
1633 low = 0;
1634 high = nlists - 1;
1635 while (low <= high) {
1636 erp_idx = (low + high) >> 1;
1637 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1638 erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
1639 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
1640 high = erp_idx - 1;
1641 } else if (erp_next && bno >=
1642 xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
1643 low = erp_idx + 1;
1644 } else {
1645 break;
1646 }
1647 }
1648 *erp_idxp = erp_idx;
1649 return erp;
1650}
1651
1652/*
1653 * Return a pointer to the indirection array entry containing the
1654 * extent record at file extent index *idxp. Store the index of the
1655 * target irec in *erp_idxp and store the page index of the target
1656 * extent record in *idxp.
1657 */
1658xfs_ext_irec_t *
1659xfs_iext_idx_to_irec(
1660 xfs_ifork_t *ifp, /* inode fork pointer */
1661 xfs_extnum_t *idxp, /* extent index (file -> page) */
1662 int *erp_idxp, /* pointer to target irec */
1663 int realloc) /* new bytes were just added */
1664{
1665 xfs_ext_irec_t *prev; /* pointer to previous irec */
1666 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */
1667 int erp_idx; /* indirection array index */
1668 int nlists; /* number of irec's (ex lists) */
1669 int high; /* binary search upper limit */
1670 int low; /* binary search lower limit */
1671 xfs_extnum_t page_idx = *idxp; /* extent index in target list */
1672
1673 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1674 ASSERT(page_idx >= 0);
1675 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
1676 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
1677
1678 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1679 erp_idx = 0;
1680 low = 0;
1681 high = nlists - 1;
1682
1683 /* Binary search extent irec's */
1684 while (low <= high) {
1685 erp_idx = (low + high) >> 1;
1686 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1687 prev = erp_idx > 0 ? erp - 1 : NULL;
1688 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
1689 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
1690 high = erp_idx - 1;
1691 } else if (page_idx > erp->er_extoff + erp->er_extcount ||
1692 (page_idx == erp->er_extoff + erp->er_extcount &&
1693 !realloc)) {
1694 low = erp_idx + 1;
1695 } else if (page_idx == erp->er_extoff + erp->er_extcount &&
1696 erp->er_extcount == XFS_LINEAR_EXTS) {
1697 ASSERT(realloc);
1698 page_idx = 0;
1699 erp_idx++;
1700 erp = erp_idx < nlists ? erp + 1 : NULL;
1701 break;
1702 } else {
1703 page_idx -= erp->er_extoff;
1704 break;
1705 }
1706 }
1707 *idxp = page_idx;
1708 *erp_idxp = erp_idx;
1709	return erp;
1710}
1711
1712/*
1713 * Allocate and initialize an indirection array once the space needed
1714 * for incore extents increases above XFS_IEXT_BUFSZ.
1715 */
1716void
1717xfs_iext_irec_init(
1718 xfs_ifork_t *ifp) /* inode fork pointer */
1719{
1720 xfs_ext_irec_t *erp; /* indirection array pointer */
1721 xfs_extnum_t nextents; /* number of extents in file */
1722
1723 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
1724 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1725 ASSERT(nextents <= XFS_LINEAR_EXTS);
1726
1727 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
1728
1729 if (nextents == 0) {
1730 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
1731 } else if (!ifp->if_real_bytes) {
1732 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
1733 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
1734 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
1735 }
1736 erp->er_extbuf = ifp->if_u1.if_extents;
1737 erp->er_extcount = nextents;
1738 erp->er_extoff = 0;
1739
1740 ifp->if_flags |= XFS_IFEXTIREC;
1741 ifp->if_real_bytes = XFS_IEXT_BUFSZ;
1742 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
1743 ifp->if_u1.if_ext_irec = erp;
1744
1745 return;
1746}
1747
1748/*
1749 * Allocate and initialize a new entry in the indirection array.
1750 */
1751xfs_ext_irec_t *
1752xfs_iext_irec_new(
1753 xfs_ifork_t *ifp, /* inode fork pointer */
1754 int erp_idx) /* index for new irec */
1755{
1756 xfs_ext_irec_t *erp; /* indirection array pointer */
1757 int i; /* loop counter */
1758 int nlists; /* number of irec's (ex lists) */
1759
1760 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1761 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1762
1763 /* Resize indirection array */
1764 xfs_iext_realloc_indirect(ifp, ++nlists *
1765 sizeof(xfs_ext_irec_t));
1766 /*
1767 * Move records down in the array so the
1768 * new page can use erp_idx.
1769 */
1770 erp = ifp->if_u1.if_ext_irec;
1771 for (i = nlists - 1; i > erp_idx; i--) {
1772 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
1773 }
1774 ASSERT(i == erp_idx);
1775
1776 /* Initialize new extent record */
1777 erp = ifp->if_u1.if_ext_irec;
1778 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
1779 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
1780 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
1781 erp[erp_idx].er_extcount = 0;
1782 erp[erp_idx].er_extoff = erp_idx > 0 ?
1783 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
1784 return (&erp[erp_idx]);
1785}
1786
1787/*
1788 * Remove a record from the indirection array.
1789 */
1790void
1791xfs_iext_irec_remove(
1792 xfs_ifork_t *ifp, /* inode fork pointer */
1793 int erp_idx) /* irec index to remove */
1794{
1795 xfs_ext_irec_t *erp; /* indirection array pointer */
1796 int i; /* loop counter */
1797 int nlists; /* number of irec's (ex lists) */
1798
1799 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1800 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1801 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1802 if (erp->er_extbuf) {
1803 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
1804 -erp->er_extcount);
1805 kmem_free(erp->er_extbuf);
1806 }
1807 /* Compact extent records */
1808 erp = ifp->if_u1.if_ext_irec;
1809 for (i = erp_idx; i < nlists - 1; i++) {
1810 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
1811 }
1812 /*
1813 * Manually free the last extent record from the indirection
1814 * array. A call to xfs_iext_realloc_indirect() with a size
1815 * of zero would result in a call to xfs_iext_destroy() which
1816 * would in turn call this function again, creating a nasty
1817 * infinite loop.
1818 */
1819 if (--nlists) {
1820 xfs_iext_realloc_indirect(ifp,
1821 nlists * sizeof(xfs_ext_irec_t));
1822 } else {
1823 kmem_free(ifp->if_u1.if_ext_irec);
1824 }
1825 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
1826}
1827
1828/*
1829 * This is called to clean up large amounts of unused memory allocated
1830 * by the indirection array. Before compacting anything though, verify
1831 * that the indirection array is still needed and switch back to the
1832 * linear extent list (or even the inline buffer) if possible. The
1833 * compaction policy is as follows:
1834 *
1835 * Full Compaction: Extents fit into a single page (or inline buffer)
1836 * Partial Compaction: Extents occupy less than 50% of allocated space
1837 * No Compaction: Extents occupy at least 50% of allocated space
1838 */
1839void
1840xfs_iext_irec_compact(
1841 xfs_ifork_t *ifp) /* inode fork pointer */
1842{
1843 xfs_extnum_t nextents; /* number of extents in file */
1844 int nlists; /* number of irec's (ex lists) */
1845
1846 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1847 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1848 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1849
1850 if (nextents == 0) {
1851 xfs_iext_destroy(ifp);
1852 } else if (nextents <= XFS_INLINE_EXTS) {
1853 xfs_iext_indirect_to_direct(ifp);
1854 xfs_iext_direct_to_inline(ifp, nextents);
1855 } else if (nextents <= XFS_LINEAR_EXTS) {
1856 xfs_iext_indirect_to_direct(ifp);
1857 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
1858 xfs_iext_irec_compact_pages(ifp);
1859 }
1860}
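/*
 * --- Illustrative aside (editor's sketch, not part of the patch) ---
 * The compaction policy above as a pure decision function.  LINEAR
 * stands in for XFS_LINEAR_EXTS (extents per 4k page) and INL for
 * XFS_INLINE_EXTS.
 */
enum action { DESTROY, TO_INLINE, TO_DIRECT, MERGE_PAGES, LEAVE };

static enum action compact_policy(int nextents, int npages)
{
	enum { INL = 2, LINEAR = 256 };

	if (nextents == 0)
		return DESTROY;
	if (nextents <= INL)
		return TO_INLINE;		/* full compaction */
	if (nextents <= LINEAR)
		return TO_DIRECT;		/* full compaction */
	if (nextents < (npages * LINEAR) / 2)
		return MERGE_PAGES;		/* partial compaction */
	return LEAVE;				/* >= 50% utilized */
}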
1861
1862/*
1863 * Combine extents from neighboring extent pages.
1864 */
1865void
1866xfs_iext_irec_compact_pages(
1867 xfs_ifork_t *ifp) /* inode fork pointer */
1868{
1869 xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */
1870 int erp_idx = 0; /* indirection array index */
1871 int nlists; /* number of irec's (ex lists) */
1872
1873 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1874 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1875 while (erp_idx < nlists - 1) {
1876 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1877 erp_next = erp + 1;
1878 if (erp_next->er_extcount <=
1879 (XFS_LINEAR_EXTS - erp->er_extcount)) {
1880 memcpy(&erp->er_extbuf[erp->er_extcount],
1881 erp_next->er_extbuf, erp_next->er_extcount *
1882 sizeof(xfs_bmbt_rec_t));
1883 erp->er_extcount += erp_next->er_extcount;
1884 /*
1885 * Free page before removing extent record
1886 * so er_extoffs don't get modified in
1887 * xfs_iext_irec_remove.
1888 */
1889 kmem_free(erp_next->er_extbuf);
1890 erp_next->er_extbuf = NULL;
1891 xfs_iext_irec_remove(ifp, erp_idx + 1);
1892 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1893 } else {
1894 erp_idx++;
1895 }
1896 }
1897}
1898
1899/*
1900 * This is called to update the er_extoff field in the indirection
1901 * array when extents have been added or removed from one of the
1902 * extent lists. erp_idx contains the irec index to begin updating
1903 * at and ext_diff contains the number of extents that were added
1904 * or removed.
1905 */
1906void
1907xfs_iext_irec_update_extoffs(
1908 xfs_ifork_t *ifp, /* inode fork pointer */
1909 int erp_idx, /* irec index to update */
1910 int ext_diff) /* number of new extents */
1911{
1912 int i; /* loop counter */
1913	int		nlists;		/* number of irec's (ex lists) */
1914
1915 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1916 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1917 for (i = erp_idx; i < nlists; i++) {
1918 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
1919 }
1920}
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h
new file mode 100644
index 000000000000..28661a0d9058
--- /dev/null
+++ b/fs/xfs/xfs_inode_fork.h
@@ -0,0 +1,171 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_INODE_FORK_H__
19#define __XFS_INODE_FORK_H__
20
21struct xfs_inode_log_item;
22
23/*
24 * The following xfs_ext_irec_t struct introduces a second (top) level
25 * to the in-core extent allocation scheme. These structs are allocated
26 * in a contiguous block, creating an indirection array where each entry
27 * (irec) contains a pointer to a buffer of in-core extent records which
28 * it manages. Each extent buffer is 4k in size, since 4k is the system
29 * page size on Linux i386 and systems with larger page sizes don't seem
30 * to gain much, if anything, by using their native page size as the
31 * extent buffer size. Also, using 4k extent buffers everywhere provides
32 * a consistent interface for CXFS across different platforms.
33 *
34 * There is currently no limit on the number of irec's (extent lists)
35 * allowed, so heavily fragmented files may require an indirection array
36 * which spans multiple system pages of memory. The number of extents
37 * which would require this amount of contiguous memory is very large
38 * and should not cause problems in the foreseeable future. However,
39 * if the memory needed for the contiguous array ever becomes a problem,
40 * it is possible that a third level of indirection may be required.
41 */
42typedef struct xfs_ext_irec {
43 xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
44 xfs_extnum_t er_extoff; /* extent offset in file */
45 xfs_extnum_t er_extcount; /* number of extents in page/block */
46} xfs_ext_irec_t;
47
48/*
49 * File incore extent information, present for each of data & attr forks.
50 */
51#define XFS_IEXT_BUFSZ 4096
52#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
53#define XFS_INLINE_EXTS 2
54#define XFS_INLINE_DATA 32
55typedef struct xfs_ifork {
56 int if_bytes; /* bytes in if_u1 */
57 int if_real_bytes; /* bytes allocated in if_u1 */
58 struct xfs_btree_block *if_broot; /* file's incore btree root */
59 short if_broot_bytes; /* bytes allocated for root */
60 unsigned char if_flags; /* per-fork flags */
61 union {
62 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
63 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
64 char *if_data; /* inline file data */
65 } if_u1;
66 union {
67 xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
68 /* very small file extents */
69 char if_inline_data[XFS_INLINE_DATA];
70 /* very small file data */
71 xfs_dev_t if_rdev; /* dev number if special */
72 uuid_t if_uuid; /* mount point value */
73 } if_u2;
74} xfs_ifork_t;
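/*
 * --- Illustrative aside (editor's sketch, not part of the patch) ---
 * The constants above pin down the mode thresholds.  Assuming the
 * usual 16-byte xfs_bmbt_rec_t: XFS_LINEAR_EXTS = 4096 / 16 = 256
 * extents per buffer, and 2 extents live inline in if_u2.  A C89-style
 * compile-time check of that arithmetic:
 */
typedef char check_linear_exts[(4096 / 16 == 256) ? 1 : -1];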
75
76/*
77 * Per-fork incore inode flags.
78 */
79#define XFS_IFINLINE 0x01 /* Inline data is read in */
80#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
81#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
82#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
83
84/*
85 * Fork handling.
86 */
87
88#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
89#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
90
91#define XFS_IFORK_PTR(ip,w) \
92 ((w) == XFS_DATA_FORK ? \
93 &(ip)->i_df : \
94 (ip)->i_afp)
95#define XFS_IFORK_DSIZE(ip) \
96 (XFS_IFORK_Q(ip) ? \
97 XFS_IFORK_BOFF(ip) : \
98 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
99#define XFS_IFORK_ASIZE(ip) \
100 (XFS_IFORK_Q(ip) ? \
101 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
102 XFS_IFORK_BOFF(ip) : \
103 0)
104#define XFS_IFORK_SIZE(ip,w) \
105 ((w) == XFS_DATA_FORK ? \
106 XFS_IFORK_DSIZE(ip) : \
107 XFS_IFORK_ASIZE(ip))
108#define XFS_IFORK_FORMAT(ip,w) \
109 ((w) == XFS_DATA_FORK ? \
110 (ip)->i_d.di_format : \
111 (ip)->i_d.di_aformat)
112#define XFS_IFORK_FMT_SET(ip,w,n) \
113 ((w) == XFS_DATA_FORK ? \
114 ((ip)->i_d.di_format = (n)) : \
115 ((ip)->i_d.di_aformat = (n)))
116#define XFS_IFORK_NEXTENTS(ip,w) \
117 ((w) == XFS_DATA_FORK ? \
118 (ip)->i_d.di_nextents : \
119 (ip)->i_d.di_anextents)
120#define XFS_IFORK_NEXT_SET(ip,w,n) \
121 ((w) == XFS_DATA_FORK ? \
122 ((ip)->i_d.di_nextents = (n)) : \
123 ((ip)->i_d.di_anextents = (n)))
124#define XFS_IFORK_MAXEXT(ip, w) \
125 (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
126
127int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
128void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
129 struct xfs_inode_log_item *, int,
130 struct xfs_buf *);
131void xfs_idestroy_fork(struct xfs_inode *, int);
132void xfs_idata_realloc(struct xfs_inode *, int, int);
133void xfs_iroot_realloc(struct xfs_inode *, int, int);
134int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
135int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
136 int);
137
138struct xfs_bmbt_rec_host *
139 xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
140void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
141 struct xfs_bmbt_irec *, int);
142void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
143void xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
144 xfs_extnum_t, int);
145void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
146void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
147void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
148void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
149void xfs_iext_realloc_direct(struct xfs_ifork *, int);
150void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
151void xfs_iext_inline_to_direct(struct xfs_ifork *, int);
152void xfs_iext_destroy(struct xfs_ifork *);
153struct xfs_bmbt_rec_host *
154 xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
155struct xfs_ext_irec *
156 xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
157struct xfs_ext_irec *
158 xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
159 int);
160void xfs_iext_irec_init(struct xfs_ifork *);
161struct xfs_ext_irec *
162 xfs_iext_irec_new(struct xfs_ifork *, int);
163void xfs_iext_irec_remove(struct xfs_ifork *, int);
164void xfs_iext_irec_compact(struct xfs_ifork *);
165void xfs_iext_irec_compact_pages(struct xfs_ifork *);
166void xfs_iext_irec_compact_full(struct xfs_ifork *);
167void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
168
169extern struct kmem_zone *xfs_ifork_zone;
170
171#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f76ff52e43c0..378081109844 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -47,32 +47,44 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
  * inode core, and possibly one for the inode data/extents/b-tree root
  * and one for the inode attribute data/extents/b-tree root.
  */
-STATIC uint
+STATIC void
 xfs_inode_item_size(
-	struct xfs_log_item	*lip)
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
 {
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	struct xfs_inode	*ip = iip->ili_inode;
-	uint			nvecs = 2;
+
+	*nvecs += 2;
+	*nbytes += sizeof(struct xfs_inode_log_format) +
+		   xfs_icdinode_size(ip->i_d.di_version);
 
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
 		    ip->i_d.di_nextents > 0 &&
-		    ip->i_df.if_bytes > 0)
-			nvecs++;
+		    ip->i_df.if_bytes > 0) {
+			/* worst case, doesn't subtract delalloc extents */
+			*nbytes += XFS_IFORK_DSIZE(ip);
+			*nvecs += 1;
+		}
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
 		if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
-		    ip->i_df.if_broot_bytes > 0)
-			nvecs++;
+		    ip->i_df.if_broot_bytes > 0) {
+			*nbytes += ip->i_df.if_broot_bytes;
+			*nvecs += 1;
+		}
 		break;
 
 	case XFS_DINODE_FMT_LOCAL:
 		if ((iip->ili_fields & XFS_ILOG_DDATA) &&
-		    ip->i_df.if_bytes > 0)
-			nvecs++;
+		    ip->i_df.if_bytes > 0) {
+			*nbytes += roundup(ip->i_df.if_bytes, 4);
+			*nvecs += 1;
+		}
 		break;
 
 	case XFS_DINODE_FMT_DEV:
@@ -85,7 +97,7 @@ xfs_inode_item_size(
 	}
 
 	if (!XFS_IFORK_Q(ip))
-		return nvecs;
+		return;
 
 
 	/*
@@ -95,28 +107,33 @@ xfs_inode_item_size(
 	case XFS_DINODE_FMT_EXTENTS:
 		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
 		    ip->i_d.di_anextents > 0 &&
-		    ip->i_afp->if_bytes > 0)
-			nvecs++;
+		    ip->i_afp->if_bytes > 0) {
+			/* worst case, doesn't subtract unused space */
+			*nbytes += XFS_IFORK_ASIZE(ip);
+			*nvecs += 1;
+		}
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
 		if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
-		    ip->i_afp->if_broot_bytes > 0)
-			nvecs++;
+		    ip->i_afp->if_broot_bytes > 0) {
+			*nbytes += ip->i_afp->if_broot_bytes;
+			*nvecs += 1;
+		}
 		break;
 
 	case XFS_DINODE_FMT_LOCAL:
 		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
-		    ip->i_afp->if_bytes > 0)
-			nvecs++;
+		    ip->i_afp->if_bytes > 0) {
+			*nbytes += roundup(ip->i_afp->if_bytes, 4);
+			*nvecs += 1;
+		}
 		break;
 
 	default:
 		ASSERT(0);
 		break;
 	}
-
-	return nvecs;
 }
 
 /*
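/*
 * --- Illustrative aside (editor's sketch, not part of the patch) ---
 * The rewritten size calculation accumulates worst-case totals through
 * out-parameters instead of returning a vector count.  A minimal sketch
 * of that calling pattern; the payload and core sizes are hypothetical.
 */
static void item_size(int payload, int *nvecs, int *nbytes)
{
	*nvecs += 2;			/* format header + inode core */
	*nbytes += 176;			/* hypothetical core size */
	if (payload > 0) {
		*nvecs += 1;
		*nbytes += (payload + 3) & ~3;	/* roundup(payload, 4) */
	}
}

/* a caller sums every dirty item before reserving log space, e.g.:
 *	int nvecs = 0, nbytes = 0;
 *	item_size(len, &nvecs, &nbytes);
 */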
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 779812fb3d80..dce4d656768c 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -18,123 +18,13 @@
 #ifndef __XFS_INODE_ITEM_H__
 #define __XFS_INODE_ITEM_H__
 
-/*
- * This is the structure used to lay out an inode log item in the
- * log. The size of the inline data/extents/b-tree root to be logged
- * (if any) is indicated in the ilf_dsize field. Changes to this structure
- * must be added on to the end.
- */
-typedef struct xfs_inode_log_format {
-	__uint16_t		ilf_type;	/* inode log item type */
-	__uint16_t		ilf_size;	/* size of this item */
-	__uint32_t		ilf_fields;	/* flags for fields logged */
-	__uint16_t		ilf_asize;	/* size of attr d/ext/root */
-	__uint16_t		ilf_dsize;	/* size of data/ext/root */
-	__uint64_t		ilf_ino;	/* inode number */
-	union {
-		__uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
-		uuid_t		ilfu_uuid;	/* mount point value */
-	} ilf_u;
-	__int64_t		ilf_blkno;	/* blkno of inode buffer */
-	__int32_t		ilf_len;	/* len of inode buffer */
-	__int32_t		ilf_boffset;	/* off of inode in buffer */
-} xfs_inode_log_format_t;
-
-typedef struct xfs_inode_log_format_32 {
-	__uint16_t		ilf_type;	/* inode log item type */
-	__uint16_t		ilf_size;	/* size of this item */
-	__uint32_t		ilf_fields;	/* flags for fields logged */
-	__uint16_t		ilf_asize;	/* size of attr d/ext/root */
-	__uint16_t		ilf_dsize;	/* size of data/ext/root */
-	__uint64_t		ilf_ino;	/* inode number */
-	union {
-		__uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
-		uuid_t		ilfu_uuid;	/* mount point value */
-	} ilf_u;
-	__int64_t		ilf_blkno;	/* blkno of inode buffer */
-	__int32_t		ilf_len;	/* len of inode buffer */
-	__int32_t		ilf_boffset;	/* off of inode in buffer */
-} __attribute__((packed)) xfs_inode_log_format_32_t;
-
-typedef struct xfs_inode_log_format_64 {
-	__uint16_t		ilf_type;	/* inode log item type */
-	__uint16_t		ilf_size;	/* size of this item */
-	__uint32_t		ilf_fields;	/* flags for fields logged */
-	__uint16_t		ilf_asize;	/* size of attr d/ext/root */
-	__uint16_t		ilf_dsize;	/* size of data/ext/root */
-	__uint32_t		ilf_pad;	/* pad for 64 bit boundary */
-	__uint64_t		ilf_ino;	/* inode number */
-	union {
-		__uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
-		uuid_t		ilfu_uuid;	/* mount point value */
-	} ilf_u;
-	__int64_t		ilf_blkno;	/* blkno of inode buffer */
-	__int32_t		ilf_len;	/* len of inode buffer */
-	__int32_t		ilf_boffset;	/* off of inode in buffer */
-} xfs_inode_log_format_64_t;
-
-/*
- * Flags for xfs_trans_log_inode flags field.
- */
-#define	XFS_ILOG_CORE	0x001	/* log standard inode fields */
-#define	XFS_ILOG_DDATA	0x002	/* log i_df.if_data */
-#define	XFS_ILOG_DEXT	0x004	/* log i_df.if_extents */
-#define	XFS_ILOG_DBROOT	0x008	/* log i_df.i_broot */
-#define	XFS_ILOG_DEV	0x010	/* log the dev field */
-#define	XFS_ILOG_UUID	0x020	/* log the uuid field */
-#define	XFS_ILOG_ADATA	0x040	/* log i_af.if_data */
-#define	XFS_ILOG_AEXT	0x080	/* log i_af.if_extents */
-#define	XFS_ILOG_ABROOT	0x100	/* log i_af.i_broot */
-
-
-/*
- * The timestamps are dirty, but not necessarily anything else in the inode
- * core. Unlike the other fields above this one must never make it to disk
- * in the ilf_fields of the inode_log_format, but is purely store in-memory in
- * ili_fields in the inode_log_item.
- */
-#define XFS_ILOG_TIMESTAMP	0x4000
-
-#define	XFS_ILOG_NONCORE	(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
-				 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
-				 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
-				 XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
-
-#define	XFS_ILOG_DFORK		(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
-				 XFS_ILOG_DBROOT)
-
-#define	XFS_ILOG_AFORK		(XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-				 XFS_ILOG_ABROOT)
-
-#define	XFS_ILOG_ALL		(XFS_ILOG_CORE | XFS_ILOG_DDATA | \
-				 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
-				 XFS_ILOG_DEV | XFS_ILOG_UUID | \
-				 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-				 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
-
-static inline int xfs_ilog_fbroot(int w)
-{
-	return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
-}
-
-static inline int xfs_ilog_fext(int w)
-{
-	return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
-}
-
-static inline int xfs_ilog_fdata(int w)
-{
-	return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
-}
-
-#ifdef __KERNEL__
+/* kernel only definitions */
 
 struct xfs_buf;
 struct xfs_bmbt_rec;
 struct xfs_inode;
 struct xfs_mount;
 
-
 typedef struct xfs_inode_log_item {
 	xfs_log_item_t		ili_item;	   /* common portion */
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
@@ -151,7 +41,6 @@ typedef struct xfs_inode_log_item {
 	xfs_inode_log_format_t	ili_format;	   /* logged structure */
 } xfs_inode_log_item_t;
 
-
 static inline int xfs_inode_clean(xfs_inode_t *ip)
 {
 	return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
@@ -165,6 +54,6 @@ extern void xfs_iflush_abort(struct xfs_inode *, bool);
 extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
 					 xfs_inode_log_format_t *);
 
-#endif	/* __KERNEL__ */
+extern struct kmem_zone	*xfs_ili_zone;
 
 #endif	/* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6e2bca5d44d6..668e8f4ccf5e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -32,17 +33,16 @@
32#include "xfs_error.h" 33#include "xfs_error.h"
33#include "xfs_attr.h" 34#include "xfs_attr.h"
34#include "xfs_bmap.h" 35#include "xfs_bmap.h"
36#include "xfs_bmap_util.h"
35#include "xfs_buf_item.h" 37#include "xfs_buf_item.h"
36#include "xfs_utils.h"
37#include "xfs_dfrag.h"
38#include "xfs_fsops.h" 38#include "xfs_fsops.h"
39#include "xfs_vnodeops.h"
40#include "xfs_discard.h" 39#include "xfs_discard.h"
41#include "xfs_quota.h" 40#include "xfs_quota.h"
42#include "xfs_inode_item.h" 41#include "xfs_inode_item.h"
43#include "xfs_export.h" 42#include "xfs_export.h"
44#include "xfs_trace.h" 43#include "xfs_trace.h"
45#include "xfs_icache.h" 44#include "xfs_icache.h"
45#include "xfs_symlink.h"
46 46
47#include <linux/capability.h> 47#include <linux/capability.h>
48#include <linux/dcache.h> 48#include <linux/dcache.h>
@@ -71,7 +71,7 @@ xfs_find_handle(
71 int hsize; 71 int hsize;
72 xfs_handle_t handle; 72 xfs_handle_t handle;
73 struct inode *inode; 73 struct inode *inode;
74 struct fd f = {0}; 74 struct fd f = {NULL};
75 struct path path; 75 struct path path;
76 int error; 76 int error;
77 struct xfs_inode *ip; 77 struct xfs_inode *ip;
@@ -350,6 +350,40 @@ xfs_readlink_by_handle(
350 return error; 350 return error;
351} 351}
352 352
353int
354xfs_set_dmattrs(
355 xfs_inode_t *ip,
356 u_int evmask,
357 u_int16_t state)
358{
359 xfs_mount_t *mp = ip->i_mount;
360 xfs_trans_t *tp;
361 int error;
362
363 if (!capable(CAP_SYS_ADMIN))
364 return XFS_ERROR(EPERM);
365
366 if (XFS_FORCED_SHUTDOWN(mp))
367 return XFS_ERROR(EIO);
368
369 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
370 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
371 if (error) {
372 xfs_trans_cancel(tp, 0);
373 return error;
374 }
375 xfs_ilock(ip, XFS_ILOCK_EXCL);
376 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
377
378 ip->i_d.di_dmevmask = evmask;
379 ip->i_d.di_dmstate = state;
380
381 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
382 error = xfs_trans_commit(tp, 0);
383
384 return error;
385}
386
353STATIC int 387STATIC int
354xfs_fssetdm_by_handle( 388xfs_fssetdm_by_handle(
355 struct file *parfilp, 389 struct file *parfilp,
@@ -422,12 +456,9 @@ xfs_attrlist_by_handle(
422 if (IS_ERR(dentry)) 456 if (IS_ERR(dentry))
423 return PTR_ERR(dentry); 457 return PTR_ERR(dentry);
424 458
425 kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); 459 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
426 if (!kbuf) { 460 if (!kbuf)
427 kbuf = kmem_zalloc_large(al_hreq.buflen); 461 goto out_dput;
428 if (!kbuf)
429 goto out_dput;
430 }
431 462
432 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; 463 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
433 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, 464 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -438,12 +469,9 @@ xfs_attrlist_by_handle(
438 if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) 469 if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
439 error = -EFAULT; 470 error = -EFAULT;
440 471
441 out_kfree: 472out_kfree:
442 if (is_vmalloc_addr(kbuf)) 473 kmem_free(kbuf);
443 kmem_free_large(kbuf); 474out_dput:
444 else
445 kmem_free(kbuf);
446 out_dput:
447 dput(dentry); 475 dput(dentry);
448 return error; 476 return error;
449} 477}
@@ -461,12 +489,9 @@ xfs_attrmulti_attr_get(
461 489
462 if (*len > XATTR_SIZE_MAX) 490 if (*len > XATTR_SIZE_MAX)
463 return EINVAL; 491 return EINVAL;
464 kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL); 492 kbuf = kmem_zalloc_large(*len, KM_SLEEP);
465 if (!kbuf) { 493 if (!kbuf)
466 kbuf = kmem_zalloc_large(*len); 494 return ENOMEM;
467 if (!kbuf)
468 return ENOMEM;
469 }
470 495
471 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 496 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
472 if (error) 497 if (error)
@@ -475,11 +500,8 @@ xfs_attrmulti_attr_get(
475 if (copy_to_user(ubuf, kbuf, *len)) 500 if (copy_to_user(ubuf, kbuf, *len))
476 error = EFAULT; 501 error = EFAULT;
477 502
478 out_kfree: 503out_kfree:
479 if (is_vmalloc_addr(kbuf)) 504 kmem_free(kbuf);
480 kmem_free_large(kbuf);
481 else
482 kmem_free(kbuf);
483 return error; 505 return error;
484} 506}
485 507
@@ -967,7 +989,7 @@ xfs_ioctl_setattr(
967 * first do an error checking pass. 989 * first do an error checking pass.
968 */ 990 */
969 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); 991 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
970 code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 992 code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
971 if (code) 993 if (code)
972 goto error_return; 994 goto error_return;
973 995
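
The two hunks above convert xfs_trans_reserve() from open-coded (log space, log count, flags) arguments to a pointer into a per-mount reservation table, M_RES(mp)->tr_ichange. A minimal userspace model of that API shape (struct and member names mirror the kernel's struct xfs_trans_res; the byte values are invented for illustration) shows why the call sites shrink:

#include <stdio.h>

/* Assumed shape of struct xfs_trans_res. */
struct trans_res {
	unsigned int	tr_logres;	/* log space needed, in bytes */
	int		tr_logcount;	/* permanent log reservation count */
	int		tr_logflags;
};

/* Stand-in for the per-mount table that M_RES(mp) points at. */
struct trans_resv {
	struct trans_res tr_ichange;	/* inode change transactions */
	struct trans_res tr_write;	/* write transactions */
};

/* Callers now name a reservation; only the variable block and
 * realtime extent counts stay per-call. */
static int trans_reserve(const struct trans_res *resp,
			 unsigned int blocks, unsigned int rtextents)
{
	printf("reserve %u log bytes, %u blocks, %u rtextents\n",
	       resp->tr_logres, blocks, rtextents);
	return 0;
}

int main(void)
{
	struct trans_resv resv = {
		.tr_ichange = { .tr_logres = 4096,  .tr_logcount = 1 },
		.tr_write   = { .tr_logres = 16384, .tr_logcount = 2 },
	};

	/* was: xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0) */
	trans_reserve(&resv.tr_ichange, 0, 0);
	return 0;
}
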
@@ -981,15 +1003,22 @@ xfs_ioctl_setattr(
981 * to the file owner ID, except in cases where the 1003 * to the file owner ID, except in cases where the
982 * CAP_FSETID capability is applicable. 1004 * CAP_FSETID capability is applicable.
983 */ 1005 */
984 if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) { 1006 if (!inode_owner_or_capable(VFS_I(ip))) {
985 code = XFS_ERROR(EPERM); 1007 code = XFS_ERROR(EPERM);
986 goto error_return; 1008 goto error_return;
987 } 1009 }
988 1010
989 /* 1011 /*
990 * Do a quota reservation only if projid is actually going to change. 1012 * Do a quota reservation only if projid is actually going to change.
1013 * Only allow changing of projid from init_user_ns since it is a
1014 * non user namespace aware identifier.
991 */ 1015 */
992 if (mask & FSX_PROJID) { 1016 if (mask & FSX_PROJID) {
1017 if (current_user_ns() != &init_user_ns) {
1018 code = XFS_ERROR(EINVAL);
1019 goto error_return;
1020 }
1021
993 if (XFS_IS_QUOTA_RUNNING(mp) && 1022 if (XFS_IS_QUOTA_RUNNING(mp) &&
994 XFS_IS_PQUOTA_ON(mp) && 1023 XFS_IS_PQUOTA_ON(mp) &&
995 xfs_get_projid(ip) != fa->fsx_projid) { 1024 xfs_get_projid(ip) != fa->fsx_projid) {
@@ -1103,7 +1132,7 @@ xfs_ioctl_setattr(
1103 * cleared upon successful return from chown() 1132 * cleared upon successful return from chown()
1104 */ 1133 */
1105 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && 1134 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
1106 !capable(CAP_FSETID)) 1135 !inode_capable(VFS_I(ip), CAP_FSETID))
1107 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); 1136 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
1108 1137
1109 /* 1138 /*
@@ -1328,6 +1357,75 @@ xfs_ioc_getbmapx(
1328 return 0; 1357 return 0;
1329} 1358}
1330 1359
1360int
1361xfs_ioc_swapext(
1362 xfs_swapext_t *sxp)
1363{
1364 xfs_inode_t *ip, *tip;
1365 struct fd f, tmp;
1366 int error = 0;
1367
1368 /* Pull information for the target fd */
1369 f = fdget((int)sxp->sx_fdtarget);
1370 if (!f.file) {
1371 error = XFS_ERROR(EINVAL);
1372 goto out;
1373 }
1374
1375 if (!(f.file->f_mode & FMODE_WRITE) ||
1376 !(f.file->f_mode & FMODE_READ) ||
1377 (f.file->f_flags & O_APPEND)) {
1378 error = XFS_ERROR(EBADF);
1379 goto out_put_file;
1380 }
1381
1382 tmp = fdget((int)sxp->sx_fdtmp);
1383 if (!tmp.file) {
1384 error = XFS_ERROR(EINVAL);
1385 goto out_put_file;
1386 }
1387
1388 if (!(tmp.file->f_mode & FMODE_WRITE) ||
1389 !(tmp.file->f_mode & FMODE_READ) ||
1390 (tmp.file->f_flags & O_APPEND)) {
1391 error = XFS_ERROR(EBADF);
1392 goto out_put_tmp_file;
1393 }
1394
1395 if (IS_SWAPFILE(file_inode(f.file)) ||
1396 IS_SWAPFILE(file_inode(tmp.file))) {
1397 error = XFS_ERROR(EINVAL);
1398 goto out_put_tmp_file;
1399 }
1400
1401 ip = XFS_I(file_inode(f.file));
1402 tip = XFS_I(file_inode(tmp.file));
1403
1404 if (ip->i_mount != tip->i_mount) {
1405 error = XFS_ERROR(EINVAL);
1406 goto out_put_tmp_file;
1407 }
1408
1409 if (ip->i_ino == tip->i_ino) {
1410 error = XFS_ERROR(EINVAL);
1411 goto out_put_tmp_file;
1412 }
1413
1414 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1415 error = XFS_ERROR(EIO);
1416 goto out_put_tmp_file;
1417 }
1418
1419 error = xfs_swap_extents(ip, tip, sxp);
1420
1421 out_put_tmp_file:
1422 fdput(tmp);
1423 out_put_file:
1424 fdput(f);
1425 out:
1426 return error;
1427}
1428
1331/* 1429/*
1332 * Note: some of the ioctl's return positive numbers as a 1430 * Note: some of the ioctl's return positive numbers as a
1333 * byte count indicating success, such as readlink_by_handle. 1431 * byte count indicating success, such as readlink_by_handle.
@@ -1472,7 +1570,7 @@ xfs_file_ioctl(
1472 error = mnt_want_write_file(filp); 1570 error = mnt_want_write_file(filp);
1473 if (error) 1571 if (error)
1474 return error; 1572 return error;
1475 error = xfs_swapext(&sxp); 1573 error = xfs_ioc_swapext(&sxp);
1476 mnt_drop_write_file(filp); 1574 mnt_drop_write_file(filp);
1477 return -error; 1575 return -error;
1478 } 1576 }
@@ -1610,23 +1708,23 @@ xfs_file_ioctl(
1610 return -error; 1708 return -error;
1611 1709
1612 case XFS_IOC_FREE_EOFBLOCKS: { 1710 case XFS_IOC_FREE_EOFBLOCKS: {
1613 struct xfs_eofblocks eofb; 1711 struct xfs_fs_eofblocks eofb;
1712 struct xfs_eofblocks keofb;
1614 1713
1615 if (copy_from_user(&eofb, arg, sizeof(eofb))) 1714 if (!capable(CAP_SYS_ADMIN))
1616 return -XFS_ERROR(EFAULT); 1715 return -EPERM;
1617 1716
1618 if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) 1717 if (mp->m_flags & XFS_MOUNT_RDONLY)
1619 return -XFS_ERROR(EINVAL); 1718 return -XFS_ERROR(EROFS);
1620 1719
1621 if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) 1720 if (copy_from_user(&eofb, arg, sizeof(eofb)))
1622 return -XFS_ERROR(EINVAL); 1721 return -XFS_ERROR(EFAULT);
1623 1722
1624 if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || 1723 error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
1625 memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) 1724 if (error)
1626 return -XFS_ERROR(EINVAL); 1725 return -error;
1627 1726
1628 error = xfs_icache_free_eofblocks(mp, &eofb); 1727 return -xfs_icache_free_eofblocks(mp, &keofb);
1629 return -error;
1630 } 1728 }
1631 1729
1632 default: 1730 default:
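
The reworked XFS_IOC_FREE_EOFBLOCKS handler above now demands CAP_SYS_ADMIN and a writable mount, and moves the flag and padding validation into xfs_fs_eofblocks_from_user(). A hedged sketch of a userspace caller; it assumes an xfsprogs installation recent enough that <xfs/xfs.h> ships struct xfs_fs_eofblocks, XFS_EOFBLOCKS_VERSION and the ioctl number:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* XFS_IOC_FREE_EOFBLOCKS, struct xfs_fs_eofblocks */

int main(int argc, char **argv)
{
	struct xfs_fs_eofblocks eofb;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <xfs-mount-point>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* The pad fields must be zero or the kernel rejects the call. */
	memset(&eofb, 0, sizeof(eofb));
	eofb.eof_version = XFS_EOFBLOCKS_VERSION;
	eofb.eof_flags = 0;	/* no uid/gid/prid filtering */

	if (ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb) < 0)
		perror("XFS_IOC_FREE_EOFBLOCKS");

	close(fd);
	return 0;
}
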
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index d56173b34a2a..77c02c7900b6 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -27,6 +27,10 @@ xfs_ioc_space(
27 unsigned int cmd, 27 unsigned int cmd,
28 xfs_flock64_t *bf); 28 xfs_flock64_t *bf);
29 29
30int
31xfs_ioc_swapext(
32 xfs_swapext_t *sxp);
33
30extern int 34extern int
31xfs_find_handle( 35xfs_find_handle(
32 unsigned int cmd, 36 unsigned int cmd,
@@ -82,4 +86,10 @@ xfs_file_compat_ioctl(
82 unsigned int cmd, 86 unsigned int cmd,
83 unsigned long arg); 87 unsigned long arg);
84 88
89extern int
90xfs_set_dmattrs(
91 struct xfs_inode *ip,
92 u_int evmask,
93 u_int16_t state);
94
85#endif 95#endif
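
The header now exports xfs_ioc_swapext(), whose real consumer is xfs_fsr's online defragmenter. A rough, illustrative userspace sketch of the calling convention follows; struct xfs_swapext, XFS_SX_VERSION and both ioctl numbers are assumed to come from xfsprogs' <xfs/xfs.h>, and xfs_swap_extents() applies further constraints this sketch glosses over (the sx_stat snapshot must still match the target, and the extent formats must be compatible), so treat it as a shape reference only:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <xfs/xfs.h>	/* XFS_IOC_SWAPEXT, XFS_IOC_FSBULKSTAT_SINGLE */

/* Snapshot the kernel's bulkstat view of the target inode; the swap
 * is rejected if the file changes between this and the SWAPEXT call. */
static int bulkstat_single(int fd, __u64 ino, struct xfs_bstat *bs)
{
	struct xfs_fsop_bulkreq req;
	__s32 count = 0;

	memset(&req, 0, sizeof(req));
	req.lastip = &ino;
	req.icount = 1;
	req.ubuffer = bs;
	req.ocount = &count;
	return ioctl(fd, XFS_IOC_FSBULKSTAT_SINGLE, &req);
}

int main(int argc, char **argv)
{
	struct xfs_swapext sx;
	struct stat st;
	int fd, tfd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <file> <tmpfile>\n", argv[0]);
		return 1;
	}

	/* Both fds must be read-write without O_APPEND, per the checks
	 * in xfs_ioc_swapext() above. */
	fd = open(argv[1], O_RDWR);
	tfd = open(argv[2], O_RDWR);
	if (fd < 0 || tfd < 0 || fstat(fd, &st) < 0) {
		perror("open/fstat");
		return 1;
	}

	memset(&sx, 0, sizeof(sx));
	sx.sx_version = XFS_SX_VERSION;
	sx.sx_fdtarget = fd;
	sx.sx_fdtmp = tfd;
	sx.sx_offset = 0;
	sx.sx_length = st.st_size;
	if (bulkstat_single(fd, st.st_ino, &sx.sx_stat) < 0) {
		perror("XFS_IOC_FSBULKSTAT_SINGLE");
		return 1;
	}

	if (ioctl(fd, XFS_IOC_SWAPEXT, &sx) < 0)
		perror("XFS_IOC_SWAPEXT");
	return 0;
}
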
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c0c66259cc91..f671f7e472ac 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -33,8 +33,6 @@
33#include "xfs_inode.h" 33#include "xfs_inode.h"
34#include "xfs_itable.h" 34#include "xfs_itable.h"
35#include "xfs_error.h" 35#include "xfs_error.h"
36#include "xfs_dfrag.h"
37#include "xfs_vnodeops.h"
38#include "xfs_fsops.h" 36#include "xfs_fsops.h"
39#include "xfs_alloc.h" 37#include "xfs_alloc.h"
40#include "xfs_rtalloc.h" 38#include "xfs_rtalloc.h"
@@ -373,12 +371,9 @@ xfs_compat_attrlist_by_handle(
373 return PTR_ERR(dentry); 371 return PTR_ERR(dentry);
374 372
375 error = -ENOMEM; 373 error = -ENOMEM;
376 kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); 374 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
377 if (!kbuf) { 375 if (!kbuf)
378 kbuf = kmem_zalloc_large(al_hreq.buflen); 376 goto out_dput;
379 if (!kbuf)
380 goto out_dput;
381 }
382 377
383 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; 378 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
384 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, 379 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -389,12 +384,9 @@ xfs_compat_attrlist_by_handle(
389 if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) 384 if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
390 error = -EFAULT; 385 error = -EFAULT;
391 386
392 out_kfree: 387out_kfree:
393 if (is_vmalloc_addr(kbuf)) 388 kmem_free(kbuf);
394 kmem_free_large(kbuf); 389out_dput:
395 else
396 kmem_free(kbuf);
397 out_dput:
398 dput(dentry); 390 dput(dentry);
399 return error; 391 return error;
400} 392}
@@ -644,7 +636,7 @@ xfs_file_compat_ioctl(
644 error = mnt_want_write_file(filp); 636 error = mnt_want_write_file(filp);
645 if (error) 637 if (error)
646 return error; 638 return error;
647 error = xfs_swapext(&sxp); 639 error = xfs_ioc_swapext(&sxp);
648 mnt_drop_write_file(filp); 640 mnt_drop_write_file(filp);
649 return -error; 641 return -error;
650 } 642 }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6a7096422295..8d4d49b6fbf3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -32,13 +33,13 @@
32#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
33#include "xfs_btree.h" 34#include "xfs_btree.h"
34#include "xfs_bmap.h" 35#include "xfs_bmap.h"
36#include "xfs_bmap_util.h"
35#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
36#include "xfs_error.h" 38#include "xfs_error.h"
37#include "xfs_itable.h" 39#include "xfs_itable.h"
38#include "xfs_attr.h" 40#include "xfs_attr.h"
39#include "xfs_buf_item.h" 41#include "xfs_buf_item.h"
40#include "xfs_trans_space.h" 42#include "xfs_trans_space.h"
41#include "xfs_utils.h"
42#include "xfs_iomap.h" 43#include "xfs_iomap.h"
43#include "xfs_trace.h" 44#include "xfs_trace.h"
44#include "xfs_icache.h" 45#include "xfs_icache.h"
@@ -187,10 +188,8 @@ xfs_iomap_write_direct(
187 * Allocate and setup the transaction 188 * Allocate and setup the transaction
188 */ 189 */
189 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 190 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
190 error = xfs_trans_reserve(tp, resblks, 191 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
191 XFS_WRITE_LOG_RES(mp), resrtextents, 192 resblks, resrtextents);
192 XFS_TRANS_PERM_LOG_RES,
193 XFS_WRITE_LOG_COUNT);
194 /* 193 /*
195 * Check for running out of space, note: need lock to return 194 * Check for running out of space, note: need lock to return
196 */ 195 */
@@ -698,10 +697,8 @@ xfs_iomap_write_allocate(
698 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); 697 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
699 tp->t_flags |= XFS_TRANS_RESERVE; 698 tp->t_flags |= XFS_TRANS_RESERVE;
700 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); 699 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
701 error = xfs_trans_reserve(tp, nres, 700 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
702 XFS_WRITE_LOG_RES(mp), 701 nres, 0);
703 0, XFS_TRANS_PERM_LOG_RES,
704 XFS_WRITE_LOG_COUNT);
705 if (error) { 702 if (error) {
706 xfs_trans_cancel(tp, 0); 703 xfs_trans_cancel(tp, 0);
707 return XFS_ERROR(error); 704 return XFS_ERROR(error);
@@ -864,10 +861,8 @@ xfs_iomap_write_unwritten(
864 sb_start_intwrite(mp->m_super); 861 sb_start_intwrite(mp->m_super);
865 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); 862 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
866 tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; 863 tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
867 error = xfs_trans_reserve(tp, resblks, 864 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
868 XFS_WRITE_LOG_RES(mp), 0, 865 resblks, 0);
869 XFS_TRANS_PERM_LOG_RES,
870 XFS_WRITE_LOG_COUNT);
871 if (error) { 866 if (error) {
872 xfs_trans_cancel(tp, 0); 867 xfs_trans_cancel(tp, 0);
873 return XFS_ERROR(error); 868 return XFS_ERROR(error);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 96dda62d497b..2b8952d9199b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_acl.h" 21#include "xfs_acl.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -29,16 +30,19 @@
29#include "xfs_dinode.h" 30#include "xfs_dinode.h"
30#include "xfs_inode.h" 31#include "xfs_inode.h"
31#include "xfs_bmap.h" 32#include "xfs_bmap.h"
33#include "xfs_bmap_util.h"
32#include "xfs_rtalloc.h" 34#include "xfs_rtalloc.h"
33#include "xfs_error.h" 35#include "xfs_error.h"
34#include "xfs_itable.h" 36#include "xfs_itable.h"
35#include "xfs_attr.h" 37#include "xfs_attr.h"
36#include "xfs_buf_item.h" 38#include "xfs_buf_item.h"
37#include "xfs_utils.h"
38#include "xfs_vnodeops.h"
39#include "xfs_inode_item.h" 39#include "xfs_inode_item.h"
40#include "xfs_trace.h" 40#include "xfs_trace.h"
41#include "xfs_icache.h" 41#include "xfs_icache.h"
42#include "xfs_symlink.h"
43#include "xfs_da_btree.h"
44#include "xfs_dir2_format.h"
45#include "xfs_dir2_priv.h"
42 46
43#include <linux/capability.h> 47#include <linux/capability.h>
44#include <linux/xattr.h> 48#include <linux/xattr.h>
@@ -87,10 +91,12 @@ xfs_init_security(
87static void 91static void
88xfs_dentry_to_name( 92xfs_dentry_to_name(
89 struct xfs_name *namep, 93 struct xfs_name *namep,
90 struct dentry *dentry) 94 struct dentry *dentry,
95 int mode)
91{ 96{
92 namep->name = dentry->d_name.name; 97 namep->name = dentry->d_name.name;
93 namep->len = dentry->d_name.len; 98 namep->len = dentry->d_name.len;
99 namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT];
94} 100}
95 101
96STATIC void 102STATIC void
@@ -106,7 +112,7 @@ xfs_cleanup_inode(
106 * xfs_init_security we must back out. 112 * xfs_init_security we must back out.
107 * ENOSPC can hit here, among other things. 113 * ENOSPC can hit here, among other things.
108 */ 114 */
109 xfs_dentry_to_name(&teardown, dentry); 115 xfs_dentry_to_name(&teardown, dentry, 0);
110 116
111 xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); 117 xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
112 iput(inode); 118 iput(inode);
@@ -146,7 +152,7 @@ xfs_vn_mknod(
146 mode &= ~current_umask(); 152 mode &= ~current_umask();
147 } 153 }
148 154
149 xfs_dentry_to_name(&name, dentry); 155 xfs_dentry_to_name(&name, dentry, mode);
150 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); 156 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
151 if (unlikely(error)) 157 if (unlikely(error))
152 goto out_free_acl; 158 goto out_free_acl;
@@ -207,7 +213,7 @@ xfs_vn_lookup(
207 if (dentry->d_name.len >= MAXNAMELEN) 213 if (dentry->d_name.len >= MAXNAMELEN)
208 return ERR_PTR(-ENAMETOOLONG); 214 return ERR_PTR(-ENAMETOOLONG);
209 215
210 xfs_dentry_to_name(&name, dentry); 216 xfs_dentry_to_name(&name, dentry, 0);
211 error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); 217 error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
212 if (unlikely(error)) { 218 if (unlikely(error)) {
213 if (unlikely(error != ENOENT)) 219 if (unlikely(error != ENOENT))
@@ -234,7 +240,7 @@ xfs_vn_ci_lookup(
234 if (dentry->d_name.len >= MAXNAMELEN) 240 if (dentry->d_name.len >= MAXNAMELEN)
235 return ERR_PTR(-ENAMETOOLONG); 241 return ERR_PTR(-ENAMETOOLONG);
236 242
237 xfs_dentry_to_name(&xname, dentry); 243 xfs_dentry_to_name(&xname, dentry, 0);
238 error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); 244 error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
239 if (unlikely(error)) { 245 if (unlikely(error)) {
240 if (unlikely(error != ENOENT)) 246 if (unlikely(error != ENOENT))
@@ -269,7 +275,7 @@ xfs_vn_link(
269 struct xfs_name name; 275 struct xfs_name name;
270 int error; 276 int error;
271 277
272 xfs_dentry_to_name(&name, dentry); 278 xfs_dentry_to_name(&name, dentry, inode->i_mode);
273 279
274 error = xfs_link(XFS_I(dir), XFS_I(inode), &name); 280 error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
275 if (unlikely(error)) 281 if (unlikely(error))
@@ -288,7 +294,7 @@ xfs_vn_unlink(
288 struct xfs_name name; 294 struct xfs_name name;
289 int error; 295 int error;
290 296
291 xfs_dentry_to_name(&name, dentry); 297 xfs_dentry_to_name(&name, dentry, 0);
292 298
293 error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); 299 error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
294 if (error) 300 if (error)
@@ -318,7 +324,7 @@ xfs_vn_symlink(
318 324
319 mode = S_IFLNK | 325 mode = S_IFLNK |
320 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); 326 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
321 xfs_dentry_to_name(&name, dentry); 327 xfs_dentry_to_name(&name, dentry, mode);
322 328
323 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); 329 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
324 if (unlikely(error)) 330 if (unlikely(error))
@@ -350,12 +356,12 @@ xfs_vn_rename(
350 struct xfs_name oname; 356 struct xfs_name oname;
351 struct xfs_name nname; 357 struct xfs_name nname;
352 358
353 xfs_dentry_to_name(&oname, odentry); 359 xfs_dentry_to_name(&oname, odentry, 0);
354 xfs_dentry_to_name(&nname, ndentry); 360 xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
355 361
356 return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), 362 return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
357 XFS_I(ndir), &nname, new_inode ? 363 XFS_I(ndir), &nname, new_inode ?
358 XFS_I(new_inode) : NULL); 364 XFS_I(new_inode) : NULL);
359} 365}
360 366
361/* 367/*
@@ -420,8 +426,8 @@ xfs_vn_getattr(
420 stat->dev = inode->i_sb->s_dev; 426 stat->dev = inode->i_sb->s_dev;
421 stat->mode = ip->i_d.di_mode; 427 stat->mode = ip->i_d.di_mode;
422 stat->nlink = ip->i_d.di_nlink; 428 stat->nlink = ip->i_d.di_nlink;
423 stat->uid = ip->i_d.di_uid; 429 stat->uid = inode->i_uid;
424 stat->gid = ip->i_d.di_gid; 430 stat->gid = inode->i_gid;
425 stat->ino = ip->i_ino; 431 stat->ino = ip->i_ino;
426 stat->atime = inode->i_atime; 432 stat->atime = inode->i_atime;
427 stat->mtime = inode->i_mtime; 433 stat->mtime = inode->i_mtime;
@@ -485,8 +491,8 @@ xfs_setattr_nonsize(
485 int mask = iattr->ia_valid; 491 int mask = iattr->ia_valid;
486 xfs_trans_t *tp; 492 xfs_trans_t *tp;
487 int error; 493 int error;
488 uid_t uid = 0, iuid = 0; 494 kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
489 gid_t gid = 0, igid = 0; 495 kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
490 struct xfs_dquot *udqp = NULL, *gdqp = NULL; 496 struct xfs_dquot *udqp = NULL, *gdqp = NULL;
491 struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; 497 struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL;
492 498
@@ -522,13 +528,13 @@ xfs_setattr_nonsize(
522 uid = iattr->ia_uid; 528 uid = iattr->ia_uid;
523 qflags |= XFS_QMOPT_UQUOTA; 529 qflags |= XFS_QMOPT_UQUOTA;
524 } else { 530 } else {
525 uid = ip->i_d.di_uid; 531 uid = inode->i_uid;
526 } 532 }
527 if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { 533 if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
528 gid = iattr->ia_gid; 534 gid = iattr->ia_gid;
529 qflags |= XFS_QMOPT_GQUOTA; 535 qflags |= XFS_QMOPT_GQUOTA;
530 } else { 536 } else {
531 gid = ip->i_d.di_gid; 537 gid = inode->i_gid;
532 } 538 }
533 539
534 /* 540 /*
@@ -538,14 +544,16 @@ xfs_setattr_nonsize(
538 */ 544 */
539 ASSERT(udqp == NULL); 545 ASSERT(udqp == NULL);
540 ASSERT(gdqp == NULL); 546 ASSERT(gdqp == NULL);
541 error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), 547 error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
542 qflags, &udqp, &gdqp, NULL); 548 xfs_kgid_to_gid(gid),
549 xfs_get_projid(ip),
550 qflags, &udqp, &gdqp, NULL);
543 if (error) 551 if (error)
544 return error; 552 return error;
545 } 553 }
546 554
547 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); 555 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
548 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 556 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
549 if (error) 557 if (error)
550 goto out_dqrele; 558 goto out_dqrele;
551 559
@@ -561,8 +569,8 @@ xfs_setattr_nonsize(
561 * while we didn't have the inode locked, inode's dquot(s) 569 * while we didn't have the inode locked, inode's dquot(s)
562 * would have changed also. 570 * would have changed also.
563 */ 571 */
564 iuid = ip->i_d.di_uid; 572 iuid = inode->i_uid;
565 igid = ip->i_d.di_gid; 573 igid = inode->i_gid;
566 gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; 574 gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
567 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; 575 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
568 576
@@ -571,8 +579,8 @@ xfs_setattr_nonsize(
571 * going to change. 579 * going to change.
572 */ 580 */
573 if (XFS_IS_QUOTA_RUNNING(mp) && 581 if (XFS_IS_QUOTA_RUNNING(mp) &&
574 ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || 582 ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
575 (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { 583 (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
576 ASSERT(tp); 584 ASSERT(tp);
577 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 585 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
578 NULL, capable(CAP_FOWNER) ? 586 NULL, capable(CAP_FOWNER) ?
@@ -602,17 +610,17 @@ xfs_setattr_nonsize(
602 * Change the ownerships and register quota modifications 610 * Change the ownerships and register quota modifications
603 * in the transaction. 611 * in the transaction.
604 */ 612 */
605 if (iuid != uid) { 613 if (!uid_eq(iuid, uid)) {
606 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { 614 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
607 ASSERT(mask & ATTR_UID); 615 ASSERT(mask & ATTR_UID);
608 ASSERT(udqp); 616 ASSERT(udqp);
609 olddquot1 = xfs_qm_vop_chown(tp, ip, 617 olddquot1 = xfs_qm_vop_chown(tp, ip,
610 &ip->i_udquot, udqp); 618 &ip->i_udquot, udqp);
611 } 619 }
612 ip->i_d.di_uid = uid; 620 ip->i_d.di_uid = xfs_kuid_to_uid(uid);
613 inode->i_uid = uid; 621 inode->i_uid = uid;
614 } 622 }
615 if (igid != gid) { 623 if (!gid_eq(igid, gid)) {
616 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { 624 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
617 ASSERT(!XFS_IS_PQUOTA_ON(mp)); 625 ASSERT(!XFS_IS_PQUOTA_ON(mp));
618 ASSERT(mask & ATTR_GID); 626 ASSERT(mask & ATTR_GID);
@@ -620,7 +628,7 @@ xfs_setattr_nonsize(
620 olddquot2 = xfs_qm_vop_chown(tp, ip, 628 olddquot2 = xfs_qm_vop_chown(tp, ip,
621 &ip->i_gdquot, gdqp); 629 &ip->i_gdquot, gdqp);
622 } 630 }
623 ip->i_d.di_gid = gid; 631 ip->i_d.di_gid = xfs_kgid_to_gid(gid);
624 inode->i_gid = gid; 632 inode->i_gid = gid;
625 } 633 }
626 } 634 }
@@ -807,9 +815,7 @@ xfs_setattr_size(
807 goto out_unlock; 815 goto out_unlock;
808 816
809 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 817 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
810 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 818 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
811 XFS_TRANS_PERM_LOG_RES,
812 XFS_ITRUNCATE_LOG_COUNT);
813 if (error) 819 if (error)
814 goto out_trans_cancel; 820 goto out_trans_cancel;
815 821
@@ -932,7 +938,7 @@ xfs_vn_update_time(
932 trace_xfs_update_time(ip); 938 trace_xfs_update_time(ip);
933 939
934 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 940 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
935 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); 941 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
936 if (error) { 942 if (error) {
937 xfs_trans_cancel(tp, 0); 943 xfs_trans_cancel(tp, 0);
938 return -error; 944 return -error;
@@ -1173,8 +1179,8 @@ xfs_setup_inode(
1173 1179
1174 inode->i_mode = ip->i_d.di_mode; 1180 inode->i_mode = ip->i_d.di_mode;
1175 set_nlink(inode, ip->i_d.di_nlink); 1181 set_nlink(inode, ip->i_d.di_nlink);
1176 inode->i_uid = ip->i_d.di_uid; 1182 inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
1177 inode->i_gid = ip->i_d.di_gid; 1183 inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
1178 1184
1179 switch (inode->i_mode & S_IFMT) { 1185 switch (inode->i_mode & S_IFMT) {
1180 case S_IFBLK: 1186 case S_IFBLK:
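
The extra mode argument to xfs_dentry_to_name() exists so directory entries can record the child's file type (the dirent ftype feature pulled in through xfs_dir2_format.h); the lookup and unlink paths pass 0 because the type is irrelevant there. The payoff is visible from plain userspace: readdir() can report d_type without stat()ing every entry. A small runnable demo, where DT_UNKNOWN means the filesystem recorded no type:

#define _DEFAULT_SOURCE		/* expose d_type and the DT_* constants */
#include <dirent.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	DIR *dir = opendir(argc > 1 ? argv[1] : ".");
	struct dirent *d;

	if (!dir) {
		perror("opendir");
		return 1;
	}

	while ((d = readdir(dir)) != NULL) {
		const char *type;

		switch (d->d_type) {
		case DT_REG:	 type = "file";    break;
		case DT_DIR:	 type = "dir";     break;
		case DT_LNK:	 type = "symlink"; break;
		case DT_UNKNOWN: type = "unknown"; break;
		default:	 type = "other";   break;
		}
		printf("%-8s %s\n", type, d->d_name);
	}
	closedir(dir);
	return 0;
}
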
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index ef41c92ce66e..d81fb41205ec 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -27,4 +27,17 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
27 27
28extern void xfs_setup_inode(struct xfs_inode *); 28extern void xfs_setup_inode(struct xfs_inode *);
29 29
30/*
31 * Internal setattr interfaces.
32 */
33#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
34#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if op would block */
35#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
36#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
37#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
38
39extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
40 int flags);
41extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
42
30#endif /* __XFS_IOPS_H__ */ 43#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b93e14b86754..084b3e1741fd 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -495,7 +495,7 @@ xfs_bulkstat(
495 /* 495 /*
496 * Done, we're either out of filesystem or space to put the data. 496 * Done, we're either out of filesystem or space to put the data.
497 */ 497 */
498 kmem_free_large(irbuf); 498 kmem_free(irbuf);
499 *ubcountp = ubelem; 499 *ubcountp = ubelem;
500 /* 500 /*
501 * Found some inodes, return them now and return the error next time. 501 * Found some inodes, return them now and return the error next time.
@@ -541,8 +541,9 @@ xfs_bulkstat_single(
541 * at the expense of the error case. 541 * at the expense of the error case.
542 */ 542 */
543 543
544 ino = (xfs_ino_t)*lastinop; 544 ino = *lastinop;
545 error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res); 545 error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
546 NULL, &res);
546 if (error) { 547 if (error) {
547 /* 548 /*
548 * Special case way failed, do it the "long" way 549 * Special case way failed, do it the "long" way
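
xfs_bulkstat() and xfs_bulkstat_single() back the FSBULKSTAT ioctls used by backup and defrag tools. A short userspace sketch of the batched interface (struct names and the cursor convention are assumed from xfsprogs' <xfs/xfs.h>; lastip is a resume cursor, exactly like *lastinop in the hunk above):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/types.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* XFS_IOC_FSBULKSTAT, struct xfs_fsop_bulkreq */

int main(int argc, char **argv)
{
	struct xfs_bstat buf[64];
	struct xfs_fsop_bulkreq req;
	__u64 lastino = 0;
	__s32 count = 0;
	long total = 0;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <xfs-mount-point>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.lastip = &lastino;	/* advanced by the kernel on each call */
	req.icount = 64;
	req.ubuffer = buf;
	req.ocount = &count;

	/* Each call fills up to icount entries; zero entries means done. */
	while (ioctl(fd, XFS_IOC_FSBULKSTAT, &req) == 0 && count > 0)
		total += count;

	printf("%ld allocated inodes\n", total);
	close(fd);
	return 0;
}
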
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 800f896a6cc4..f9bb590acc0e 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,6 +32,38 @@
32# define XFS_BIG_INUMS 0 32# define XFS_BIG_INUMS 0
33#endif 33#endif
34 34
35/*
36 * Kernel specific type declarations for XFS
37 */
38typedef signed char __int8_t;
39typedef unsigned char __uint8_t;
40typedef signed short int __int16_t;
41typedef unsigned short int __uint16_t;
42typedef signed int __int32_t;
43typedef unsigned int __uint32_t;
44typedef signed long long int __int64_t;
45typedef unsigned long long int __uint64_t;
46
47typedef __uint32_t inst_t; /* an instruction */
48
49typedef __s64 xfs_off_t; /* <file offset> type */
50typedef unsigned long long xfs_ino_t; /* <inode> type */
51typedef __s64 xfs_daddr_t; /* <disk address> type */
52typedef char * xfs_caddr_t; /* <core address> type */
53typedef __u32 xfs_dev_t;
54typedef __u32 xfs_nlink_t;
55
56/* __psint_t is the same size as a pointer */
57#if (BITS_PER_LONG == 32)
58typedef __int32_t __psint_t;
59typedef __uint32_t __psunsigned_t;
60#elif (BITS_PER_LONG == 64)
61typedef __int64_t __psint_t;
62typedef __uint64_t __psunsigned_t;
63#else
64#error BITS_PER_LONG must be 32 or 64
65#endif
66
35#include "xfs_types.h" 67#include "xfs_types.h"
36 68
37#include "kmem.h" 69#include "kmem.h"
@@ -114,8 +146,6 @@
114#define xfs_inherit_sync xfs_params.inherit_sync.val 146#define xfs_inherit_sync xfs_params.inherit_sync.val
115#define xfs_inherit_nodump xfs_params.inherit_nodump.val 147#define xfs_inherit_nodump xfs_params.inherit_nodump.val
116#define xfs_inherit_noatime xfs_params.inherit_noatim.val 148#define xfs_inherit_noatime xfs_params.inherit_noatim.val
117#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val
118#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val
119#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val 149#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
120#define xfs_rotorstep xfs_params.rotorstep.val 150#define xfs_rotorstep xfs_params.rotorstep.val
121#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val 151#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
@@ -159,6 +189,32 @@
159#define MAX(a,b) (max(a,b)) 189#define MAX(a,b) (max(a,b))
160#define howmany(x, y) (((x)+((y)-1))/(y)) 190#define howmany(x, y) (((x)+((y)-1))/(y))
161 191
192/* Kernel uid/gid conversion. These are used to convert to/from the on disk
193 * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
194 * The conversion here is type only, the value will remain the same since we
195 * are converting to the init_user_ns. The uid is later mapped to a particular
196 * user namespace value when crossing the kernel/user boundary.
197 */
198static inline __uint32_t xfs_kuid_to_uid(kuid_t uid)
199{
200 return from_kuid(&init_user_ns, uid);
201}
202
203static inline kuid_t xfs_uid_to_kuid(__uint32_t uid)
204{
205 return make_kuid(&init_user_ns, uid);
206}
207
208static inline __uint32_t xfs_kgid_to_gid(kgid_t gid)
209{
210 return from_kgid(&init_user_ns, gid);
211}
212
213static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
214{
215 return make_kgid(&init_user_ns, gid);
216}
217
162/* 218/*
163 * Various platform dependent calls that don't fit anywhere else 219 * Various platform dependent calls that don't fit anywhere else
164 */ 220 */
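
The conversion helpers added above are deliberately type-only: because XFS always converts against &init_user_ns, the numeric value of a uid survives the round trip, and the kuid_t wrapper exists so the compiler catches accidental mixing of on-disk and kernel ids. A self-contained userspace model of that pattern (kuid_t, make_kuid and from_kuid are re-declared here purely for illustration, not the kernel's definitions):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t val; } kuid_t;	/* opaque wrapper type */

static const struct { int unused; } init_user_ns = { 0 };

static kuid_t make_kuid(const void *ns, uint32_t uid)
{
	(void)ns;		/* identity map in the initial namespace */
	return (kuid_t){ uid };
}

static uint32_t from_kuid(const void *ns, kuid_t kuid)
{
	(void)ns;
	return kuid.val;
}

int main(void)
{
	uint32_t di_uid = 1000;	/* the on-disk value */
	kuid_t kuid = make_kuid(&init_user_ns, di_uid);

	/* Type-only conversion: the value survives the round trip. */
	assert(from_kuid(&init_user_ns, kuid) == di_uid);
	printf("di_uid %u round-trips through kuid_t\n", di_uid);
	return 0;
}
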
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index d852a2b3e1fd..a2dea108071a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -257,7 +257,8 @@ xlog_grant_head_wait(
257 struct xlog *log, 257 struct xlog *log,
258 struct xlog_grant_head *head, 258 struct xlog_grant_head *head,
259 struct xlog_ticket *tic, 259 struct xlog_ticket *tic,
260 int need_bytes) 260 int need_bytes) __releases(&head->lock)
261 __acquires(&head->lock)
261{ 262{
262 list_add_tail(&tic->t_queue, &head->waiters); 263 list_add_tail(&tic->t_queue, &head->waiters);
263 264
@@ -614,7 +615,8 @@ xfs_log_mount(
614 xfs_daddr_t blk_offset, 615 xfs_daddr_t blk_offset,
615 int num_bblks) 616 int num_bblks)
616{ 617{
617 int error; 618 int error = 0;
619 int min_logfsbs;
618 620
619 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 621 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
620 xfs_notice(mp, "Mounting Filesystem"); 622 xfs_notice(mp, "Mounting Filesystem");
@@ -631,6 +633,50 @@ xfs_log_mount(
631 } 633 }
632 634
633 /* 635 /*
636 * Validate the given log space and drop a critical message via syslog
637 * if the log size is too small that would lead to some unexpected
638 * situations in transaction log space reservation stage.
639 *
640 * Note: we can't just reject the mount if the validation fails. This
641 * would mean that people would have to downgrade their kernel just to
642 * remedy the situation as there is no way to grow the log (short of
643 * black magic surgery with xfs_db).
644 *
645 * We can, however, reject mounts for CRC format filesystems, as the
646 * mkfs binary being used to make the filesystem should never create a
647 * filesystem with a log that is too small.
648 */
649 min_logfsbs = xfs_log_calc_minimum_size(mp);
650
651 if (mp->m_sb.sb_logblocks < min_logfsbs) {
652 xfs_warn(mp,
653 "Log size %d blocks too small, minimum size is %d blocks",
654 mp->m_sb.sb_logblocks, min_logfsbs);
655 error = EINVAL;
656 } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
657 xfs_warn(mp,
658 "Log size %d blocks too large, maximum size is %lld blocks",
659 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
660 error = EINVAL;
661 } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
662 xfs_warn(mp,
663 "log size %lld bytes too large, maximum size is %lld bytes",
664 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
665 XFS_MAX_LOG_BYTES);
666 error = EINVAL;
667 }
668 if (error) {
669 if (xfs_sb_version_hascrc(&mp->m_sb)) {
670 xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
671 ASSERT(0);
672 goto out_free_log;
673 }
674 xfs_crit(mp,
675"Log size out of supported range. Continuing onwards, but if log hangs are\n"
676"experienced then please report this message in the bug report.");
677 }
678
679 /*
634 * Initialize the AIL now we have a log. 680 * Initialize the AIL now we have a log.
635 */ 681 */
636 error = xfs_trans_ail_init(mp); 682 error = xfs_trans_ail_init(mp);
@@ -720,7 +766,7 @@ xfs_log_mount_finish(xfs_mount_t *mp)
720 * Unmount record used to have a string "Unmount filesystem--" in the 766 * Unmount record used to have a string "Unmount filesystem--" in the
721 * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). 767 * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
722 * We just write the magic number now since that particular field isn't 768 * We just write the magic number now since that particular field isn't
723 * currently architecture converted and "nUmount" is a bit foo. 769 * currently architecture converted and "Unmount" is a bit foo.
724 * As far as I know, there weren't any dependencies on the old behaviour. 770 * As far as I know, there weren't any dependencies on the old behaviour.
725 */ 771 */
726 772
@@ -1941,7 +1987,7 @@ xlog_print_tic_res(
1941 1987
1942 xfs_alert_tag(mp, XFS_PTAG_LOGRES, 1988 xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1943 "xlog_write: reservation ran out. Need to up reservation"); 1989 "xlog_write: reservation ran out. Need to up reservation");
1944 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1990 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1945} 1991}
1946 1992
1947/* 1993/*
@@ -2044,7 +2090,7 @@ xlog_write_setup_ophdr(
2044 * Set up the parameters of the region copy into the log. This has 2090 * Set up the parameters of the region copy into the log. This has
2045 * to handle region write split across multiple log buffers - this 2091 * to handle region write split across multiple log buffers - this
2046 * state is kept external to this function so that this code can 2092 * state is kept external to this function so that this code can
2047 * can be written in an obvious, self documenting manner. 2093 * be written in an obvious, self documenting manner.
2048 */ 2094 */
2049static int 2095static int
2050xlog_write_setup_copy( 2096xlog_write_setup_copy(
@@ -3391,24 +3437,17 @@ xfs_log_ticket_get(
3391} 3437}
3392 3438
3393/* 3439/*
3394 * Allocate and initialise a new log ticket. 3440 * Figure out the total log space unit (in bytes) that would be
3441 * required for a log ticket.
3395 */ 3442 */
3396struct xlog_ticket * 3443int
3397xlog_ticket_alloc( 3444xfs_log_calc_unit_res(
3398 struct xlog *log, 3445 struct xfs_mount *mp,
3399 int unit_bytes, 3446 int unit_bytes)
3400 int cnt,
3401 char client,
3402 bool permanent,
3403 xfs_km_flags_t alloc_flags)
3404{ 3447{
3405 struct xlog_ticket *tic; 3448 struct xlog *log = mp->m_log;
3406 uint num_headers; 3449 int iclog_space;
3407 int iclog_space; 3450 uint num_headers;
3408
3409 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3410 if (!tic)
3411 return NULL;
3412 3451
3413 /* 3452 /*
3414 * Permanent reservations have up to 'cnt'-1 active log operations 3453 * Permanent reservations have up to 'cnt'-1 active log operations
@@ -3483,20 +3522,43 @@ xlog_ticket_alloc(
3483 unit_bytes += log->l_iclog_hsize; 3522 unit_bytes += log->l_iclog_hsize;
3484 3523
3485 /* for roundoff padding for transaction data and one for commit record */ 3524 /* for roundoff padding for transaction data and one for commit record */
3486 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3525 if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
3487 log->l_mp->m_sb.sb_logsunit > 1) {
3488 /* log su roundoff */ 3526 /* log su roundoff */
3489 unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; 3527 unit_bytes += 2 * mp->m_sb.sb_logsunit;
3490 } else { 3528 } else {
3491 /* BB roundoff */ 3529 /* BB roundoff */
3492 unit_bytes += 2*BBSIZE; 3530 unit_bytes += 2 * BBSIZE;
3493 } 3531 }
3494 3532
3533 return unit_bytes;
3534}
3535
3536/*
3537 * Allocate and initialise a new log ticket.
3538 */
3539struct xlog_ticket *
3540xlog_ticket_alloc(
3541 struct xlog *log,
3542 int unit_bytes,
3543 int cnt,
3544 char client,
3545 bool permanent,
3546 xfs_km_flags_t alloc_flags)
3547{
3548 struct xlog_ticket *tic;
3549 int unit_res;
3550
3551 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3552 if (!tic)
3553 return NULL;
3554
3555 unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
3556
3495 atomic_set(&tic->t_ref, 1); 3557 atomic_set(&tic->t_ref, 1);
3496 tic->t_task = current; 3558 tic->t_task = current;
3497 INIT_LIST_HEAD(&tic->t_queue); 3559 INIT_LIST_HEAD(&tic->t_queue);
3498 tic->t_unit_res = unit_bytes; 3560 tic->t_unit_res = unit_res;
3499 tic->t_curr_res = unit_bytes; 3561 tic->t_curr_res = unit_res;
3500 tic->t_cnt = cnt; 3562 tic->t_cnt = cnt;
3501 tic->t_ocnt = cnt; 3563 tic->t_ocnt = cnt;
3502 tic->t_tid = prandom_u32(); 3564 tic->t_tid = prandom_u32();
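
The new validation in xfs_log_mount() warns about out-of-range logs on v4 filesystems but refuses to mount v5 (CRC) filesystems outright, on the theory that a CRC-era mkfs never produces them. A toy recreation of that decision tree; the limits here are illustrative stand-ins, since the kernel takes them from XFS_MAX_LOG_BLOCKS and XFS_MAX_LOG_BYTES and computes the minimum with xfs_log_calc_minimum_size():

#include <stdio.h>

#define MAX_LOG_BLOCKS	(1024 * 1024)			/* illustrative */
#define MAX_LOG_BYTES	(2ULL * 1024 * 1024 * 1024)	/* illustrative */

/* Returns 0 to continue the mount, -1 to reject it. */
static int log_size_check(unsigned long long logblocks,
			  unsigned int blocksize,
			  unsigned long long min_logfsbs, int has_crc)
{
	int bad = 0;

	if (logblocks < min_logfsbs) {
		fprintf(stderr, "log too small: %llu < %llu blocks\n",
			logblocks, min_logfsbs);
		bad = 1;
	} else if (logblocks > MAX_LOG_BLOCKS ||
		   logblocks * blocksize > MAX_LOG_BYTES) {
		fprintf(stderr, "log too large\n");
		bad = 1;
	}
	if (!bad)
		return 0;
	return has_crc ? -1 : 0;	/* v5 rejects, v4 only warns */
}

int main(void)
{
	/* A 10MB log of 4k blocks against an assumed 512-block minimum. */
	printf("mount %s\n",
	       log_size_check(2560, 4096, 512, 1) ? "rejected" : "continues");
	return 0;
}
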
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index fb630e496c12..1c458487f000 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -18,14 +18,30 @@
18#ifndef __XFS_LOG_H__ 18#ifndef __XFS_LOG_H__
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21#include "xfs_log_format.h"
22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
23#define BLOCK_LSN(lsn) ((uint)(lsn))
24 22
25/* this is used in a spot where we might otherwise double-endian-flip */ 23struct xfs_log_vec {
26#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) 24 struct xfs_log_vec *lv_next; /* next lv in build list */
25 int lv_niovecs; /* number of iovecs in lv */
26 struct xfs_log_iovec *lv_iovecp; /* iovec array */
27 struct xfs_log_item *lv_item; /* owner */
28 char *lv_buf; /* formatted buffer */
29 int lv_buf_len; /* size of formatted buffer */
30 int lv_size; /* size of allocated lv */
31};
32
33#define XFS_LOG_VEC_ORDERED (-1)
34
35/*
36 * Structure used to pass callback function and the function's argument
37 * to the log manager.
38 */
39typedef struct xfs_log_callback {
40 struct xfs_log_callback *cb_next;
41 void (*cb_func)(void *, int);
42 void *cb_arg;
43} xfs_log_callback_t;
27 44
28#ifdef __KERNEL__
29/* 45/*
30 * By comparing each component, we don't have to worry about extra 46 * By comparing each component, we don't have to worry about extra
31 * endian issues in treating two 32 bit numbers as one 64 bit number 47 * endian issues in treating two 32 bit numbers as one 64 bit number
@@ -59,67 +75,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
59 */ 75 */
60#define XFS_LOG_SYNC 0x1 76#define XFS_LOG_SYNC 0x1
61 77
62#endif /* __KERNEL__ */
63
64
65/* Log Clients */
66#define XFS_TRANSACTION 0x69
67#define XFS_VOLUME 0x2
68#define XFS_LOG 0xaa
69
70
71/* Region types for iovec's i_type */
72#define XLOG_REG_TYPE_BFORMAT 1
73#define XLOG_REG_TYPE_BCHUNK 2
74#define XLOG_REG_TYPE_EFI_FORMAT 3
75#define XLOG_REG_TYPE_EFD_FORMAT 4
76#define XLOG_REG_TYPE_IFORMAT 5
77#define XLOG_REG_TYPE_ICORE 6
78#define XLOG_REG_TYPE_IEXT 7
79#define XLOG_REG_TYPE_IBROOT 8
80#define XLOG_REG_TYPE_ILOCAL 9
81#define XLOG_REG_TYPE_IATTR_EXT 10
82#define XLOG_REG_TYPE_IATTR_BROOT 11
83#define XLOG_REG_TYPE_IATTR_LOCAL 12
84#define XLOG_REG_TYPE_QFORMAT 13
85#define XLOG_REG_TYPE_DQUOT 14
86#define XLOG_REG_TYPE_QUOTAOFF 15
87#define XLOG_REG_TYPE_LRHEADER 16
88#define XLOG_REG_TYPE_UNMOUNT 17
89#define XLOG_REG_TYPE_COMMIT 18
90#define XLOG_REG_TYPE_TRANSHDR 19
91#define XLOG_REG_TYPE_ICREATE 20
92#define XLOG_REG_TYPE_MAX 20
93
94typedef struct xfs_log_iovec {
95 void *i_addr; /* beginning address of region */
96 int i_len; /* length in bytes of region */
97 uint i_type; /* type of region */
98} xfs_log_iovec_t;
99
100struct xfs_log_vec {
101 struct xfs_log_vec *lv_next; /* next lv in build list */
102 int lv_niovecs; /* number of iovecs in lv */
103 struct xfs_log_iovec *lv_iovecp; /* iovec array */
104 struct xfs_log_item *lv_item; /* owner */
105 char *lv_buf; /* formatted buffer */
106 int lv_buf_len; /* size of formatted buffer */
107};
108
109#define XFS_LOG_VEC_ORDERED (-1)
110
111/*
112 * Structure used to pass callback function and the function's argument
113 * to the log manager.
114 */
115typedef struct xfs_log_callback {
116 struct xfs_log_callback *cb_next;
117 void (*cb_func)(void *, int);
118 void *cb_arg;
119} xfs_log_callback_t;
120
121
122#ifdef __KERNEL__
123/* Log manager interfaces */ 78/* Log manager interfaces */
124struct xfs_mount; 79struct xfs_mount;
125struct xlog_in_core; 80struct xlog_in_core;
@@ -188,5 +143,4 @@ void xfs_log_work_queue(struct xfs_mount *mp);
188void xfs_log_worker(struct work_struct *work); 143void xfs_log_worker(struct work_struct *work);
189void xfs_log_quiesce(struct xfs_mount *mp); 144void xfs_log_quiesce(struct xfs_mount *mp);
190 145
191#endif
192#endif /* __XFS_LOG_H__ */ 146#endif /* __XFS_LOG_H__ */
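
With the __KERNEL__ guards gone, xfs_log.h carries struct xfs_log_vec and the commit callback type for kernel and userspace (xfsprogs) builds alike. A self-contained model of the xfs_log_callback chain, whose queued (func, arg) pairs fire once the commit record reaches stable storage:

#include <stdio.h>

struct log_callback {
	struct log_callback	*cb_next;
	void			(*cb_func)(void *, int);
	void			*cb_arg;
};

static void run_callbacks(struct log_callback *cb, int aborted)
{
	for (; cb; cb = cb->cb_next)
		cb->cb_func(cb->cb_arg, aborted);
}

static void done(void *arg, int aborted)
{
	printf("%s: %s\n", (const char *)arg,
	       aborted ? "aborted" : "committed");
}

int main(void)
{
	struct log_callback b = { NULL, done, "item B" };
	struct log_callback a = { &b,   done, "item A" };

	run_callbacks(&a, 0);	/* as if the iclog just hit the disk */
	return 0;
}
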
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 02b9cf3f8252..cfe97973ba36 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -80,6 +80,83 @@ xlog_cil_init_post_recovery(
80 log->l_curr_block); 80 log->l_curr_block);
81} 81}
82 82
83STATIC int
84xlog_cil_lv_item_format(
85 struct xfs_log_item *lip,
86 struct xfs_log_vec *lv)
87{
88 int index;
89 char *ptr;
90
91 /* format new vectors into array */
92 lip->li_ops->iop_format(lip, lv->lv_iovecp);
93
94 /* copy data into existing array */
95 ptr = lv->lv_buf;
96 for (index = 0; index < lv->lv_niovecs; index++) {
97 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
98
99 memcpy(ptr, vec->i_addr, vec->i_len);
100 vec->i_addr = ptr;
101 ptr += vec->i_len;
102 }
103
104 /*
105 * some size calculations for log vectors over-estimate, so the caller
106 * doesn't know the amount of space actually used by the item. Return
107 * the byte count to the caller so they can check and store it
108 * appropriately.
109 */
110 return ptr - lv->lv_buf;
111}
112
113/*
114 * Prepare the log item for insertion into the CIL. Calculate the difference in
115 * log space and vectors it will consume, and if it is a new item pin it as
116 * well.
117 */
118STATIC void
119xfs_cil_prepare_item(
120 struct xlog *log,
121 struct xfs_log_vec *lv,
122 struct xfs_log_vec *old_lv,
123 int *diff_len,
124 int *diff_iovecs)
125{
126 /* Account for the new LV being passed in */
127 if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
128 *diff_len += lv->lv_buf_len;
129 *diff_iovecs += lv->lv_niovecs;
130 }
131
132 /*
133 * If there is no old LV, this is the first time we've seen the item in
134 * this CIL context and so we need to pin it. If we are replacing the
135 * old_lv, then remove the space it accounts for and free it.
136 */
137 if (!old_lv)
138 lv->lv_item->li_ops->iop_pin(lv->lv_item);
139 else if (old_lv != lv) {
140 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
141
142 *diff_len -= old_lv->lv_buf_len;
143 *diff_iovecs -= old_lv->lv_niovecs;
144 kmem_free(old_lv);
145 }
146
147 /* attach new log vector to log item */
148 lv->lv_item->li_lv = lv;
149
150 /*
151 * If this is the first time the item is being committed to the
152 * CIL, store the sequence number on the log item so we can
153 * tell in future commits whether this is the first checkpoint
154 * the item is being committed into.
155 */
156 if (!lv->lv_item->li_seq)
157 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
158}
159
83/* 160/*
84 * Format log item into a flat buffers 161 * Format log item into a flat buffers
85 * 162 *
@@ -106,35 +183,39 @@ xlog_cil_init_post_recovery(
106 * format the regions into the iclog as though they are being formatted 183 * format the regions into the iclog as though they are being formatted
107 * directly out of the objects themselves. 184 * directly out of the objects themselves.
108 */ 185 */
109static struct xfs_log_vec * 186static void
110xlog_cil_prepare_log_vecs( 187xlog_cil_insert_format_items(
111 struct xfs_trans *tp) 188 struct xlog *log,
189 struct xfs_trans *tp,
190 int *diff_len,
191 int *diff_iovecs)
112{ 192{
113 struct xfs_log_item_desc *lidp; 193 struct xfs_log_item_desc *lidp;
114 struct xfs_log_vec *lv = NULL;
115 struct xfs_log_vec *ret_lv = NULL;
116 194
117 195
118 /* Bail out if we didn't find a log item. */ 196 /* Bail out if we didn't find a log item. */
119 if (list_empty(&tp->t_items)) { 197 if (list_empty(&tp->t_items)) {
120 ASSERT(0); 198 ASSERT(0);
121 return NULL; 199 return;
122 } 200 }
123 201
124 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 202 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
125 struct xfs_log_vec *new_lv; 203 struct xfs_log_item *lip = lidp->lid_item;
126 void *ptr; 204 struct xfs_log_vec *lv;
127 int index; 205 struct xfs_log_vec *old_lv;
128 int len = 0; 206 int niovecs = 0;
129 uint niovecs; 207 int nbytes = 0;
208 int buf_size;
130 bool ordered = false; 209 bool ordered = false;
131 210
132 /* Skip items which aren't dirty in this transaction. */ 211 /* Skip items which aren't dirty in this transaction. */
133 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 212 if (!(lidp->lid_flags & XFS_LID_DIRTY))
134 continue; 213 continue;
135 214
215 /* get number of vecs and size of data to be stored */
216 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
217
136 /* Skip items that do not have any vectors for writing */ 218 /* Skip items that do not have any vectors for writing */
137 niovecs = IOP_SIZE(lidp->lid_item);
138 if (!niovecs) 219 if (!niovecs)
139 continue; 220 continue;
140 221
@@ -146,109 +227,63 @@ xlog_cil_prepare_log_vecs(
146 if (niovecs == XFS_LOG_VEC_ORDERED) { 227 if (niovecs == XFS_LOG_VEC_ORDERED) {
147 ordered = true; 228 ordered = true;
148 niovecs = 0; 229 niovecs = 0;
230 nbytes = 0;
149 } 231 }
150 232
151 new_lv = kmem_zalloc(sizeof(*new_lv) + 233 /* grab the old item if it exists for reservation accounting */
152 niovecs * sizeof(struct xfs_log_iovec), 234 old_lv = lip->li_lv;
153 KM_SLEEP|KM_NOFS);
154
155 new_lv->lv_item = lidp->lid_item;
156 new_lv->lv_niovecs = niovecs;
157 if (ordered) {
158 /* track as an ordered logvec */
159 new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
160 goto next;
161 }
162
163 /* The allocated iovec region lies beyond the log vector. */
164 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
165 235
166 /* build the vector array and calculate it's length */ 236 /* calc buffer size */
167 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); 237 buf_size = sizeof(struct xfs_log_vec) + nbytes +
168 for (index = 0; index < new_lv->lv_niovecs; index++) 238 niovecs * sizeof(struct xfs_log_iovec);
169 len += new_lv->lv_iovecp[index].i_len;
170 239
171 new_lv->lv_buf_len = len; 240 /* compare to existing item size */
172 new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len, 241 if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
173 KM_SLEEP|KM_NOFS); 242 /* same or smaller, optimise common overwrite case */
174 ptr = new_lv->lv_buf; 243 lv = lip->li_lv;
244 lv->lv_next = NULL;
175 245
176 for (index = 0; index < new_lv->lv_niovecs; index++) { 246 if (ordered)
177 struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index]; 247 goto insert;
178 248
179 memcpy(ptr, vec->i_addr, vec->i_len); 249 /*
180 vec->i_addr = ptr; 250 * set the item up as though it is a new insertion so
181 ptr += vec->i_len; 251 * that the space reservation accounting is correct.
182 } 252 */
183 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); 253 *diff_iovecs -= lv->lv_niovecs;
184 254 *diff_len -= lv->lv_buf_len;
185next:
186 if (!ret_lv)
187 ret_lv = new_lv;
188 else
189 lv->lv_next = new_lv;
190 lv = new_lv;
191 }
192
193 return ret_lv;
194}
195
196/*
197 * Prepare the log item for insertion into the CIL. Calculate the difference in
198 * log space and vectors it will consume, and if it is a new item pin it as
199 * well.
200 */
201STATIC void
202xfs_cil_prepare_item(
203 struct xlog *log,
204 struct xfs_log_vec *lv,
205 int *len,
206 int *diff_iovecs)
207{
208 struct xfs_log_vec *old = lv->lv_item->li_lv;
209 255
210 if (old) { 256 /* Ensure the lv is set up according to ->iop_size */
211 /* existing lv on log item, space used is a delta */ 257 lv->lv_niovecs = niovecs;
212 ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) || 258 lv->lv_buf = (char *)lv + buf_size - nbytes;
213 old->lv_buf_len == XFS_LOG_VEC_ORDERED);
214 259
215 /* 260 lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
216 * If the new item is ordered, keep the old one that is already 261 goto insert;
217 * tracking dirty or ordered regions
218 */
219 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
220 ASSERT(!lv->lv_buf);
221 kmem_free(lv);
222 return;
223 } 262 }
224 263
225 *len += lv->lv_buf_len - old->lv_buf_len; 264 /* allocate new data chunk */
226 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; 265 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
227 kmem_free(old->lv_buf); 266 lv->lv_item = lip;
228 kmem_free(old); 267 lv->lv_size = buf_size;
229 } else { 268 lv->lv_niovecs = niovecs;
230 /* new lv, must pin the log item */ 269 if (ordered) {
231 ASSERT(!lv->lv_item->li_lv); 270 /* track as an ordered logvec */
232 271 ASSERT(lip->li_lv == NULL);
233 if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { 272 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
234 *len += lv->lv_buf_len; 273 goto insert;
235 *diff_iovecs += lv->lv_niovecs;
236 } 274 }
237 IOP_PIN(lv->lv_item);
238 275
239 } 276 /* The allocated iovec region lies beyond the log vector. */
277 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
240 278
241 /* attach new log vector to log item */ 279 /* The allocated data region lies beyond the iovec region */
242 lv->lv_item->li_lv = lv; 280 lv->lv_buf = (char *)lv + buf_size - nbytes;
243 281
244 /* 282 lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
245 * If this is the first time the item is being committed to the 283insert:
246 * CIL, store the sequence number on the log item so we can 284 ASSERT(lv->lv_buf_len <= nbytes);
247 * tell in future commits whether this is the first checkpoint 285 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
248 * the item is being committed into. 286 }
249 */
250 if (!lv->lv_item->li_seq)
251 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
252} 287}
253 288
254/* 289/*
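
The net effect of the refactoring above: a log vector becomes one allocation carrying its header, iovec array and formatted data back to back, and the new lv_size field lets the common relog case reuse that allocation instead of freeing and reallocating on every commit. A self-contained model of the layout and the reuse test (names are simplified stand-ins, not the kernel structures):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct iovec_model { void *i_addr; int i_len; };

struct lv_model {
	int			lv_niovecs;
	struct iovec_model	*lv_iovecp;	/* just past the header */
	char			*lv_buf;	/* data at the tail */
	int			lv_buf_len;
	int			lv_size;	/* whole allocation */
};

static struct lv_model *lv_get(struct lv_model *old, int niovecs, int nbytes)
{
	int buf_size = sizeof(struct lv_model) + nbytes +
		       niovecs * sizeof(struct iovec_model);
	struct lv_model *lv;

	if (old && buf_size <= old->lv_size) {
		lv = old;		/* same or smaller: reuse it */
	} else {
		free(old);		/* the item grew: reallocate */
		lv = calloc(1, buf_size);
		if (!lv)
			return NULL;
		lv->lv_size = buf_size;
	}

	lv->lv_niovecs = niovecs;
	lv->lv_iovecp = (struct iovec_model *)&lv[1];
	lv->lv_buf = (char *)lv + buf_size - nbytes;
	lv->lv_buf_len = nbytes;
	return lv;
}

int main(void)
{
	struct lv_model *lv = lv_get(NULL, 2, 128);

	if (!lv)
		return 1;
	memset(lv->lv_buf, 0, lv->lv_buf_len);
	printf("allocated %d bytes\n", lv->lv_size);

	lv = lv_get(lv, 1, 64);	/* smaller relog reuses the buffer */
	printf("still %d bytes after relog\n", lv->lv_size);
	free(lv);
	return 0;
}
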
@@ -261,53 +296,47 @@ xfs_cil_prepare_item(
261static void 296static void
262xlog_cil_insert_items( 297xlog_cil_insert_items(
263 struct xlog *log, 298 struct xlog *log,
264 struct xfs_log_vec *log_vector, 299 struct xfs_trans *tp)
265 struct xlog_ticket *ticket)
266{ 300{
267 struct xfs_cil *cil = log->l_cilp; 301 struct xfs_cil *cil = log->l_cilp;
268 struct xfs_cil_ctx *ctx = cil->xc_ctx; 302 struct xfs_cil_ctx *ctx = cil->xc_ctx;
269 struct xfs_log_vec *lv; 303 struct xfs_log_item_desc *lidp;
270 int len = 0; 304 int len = 0;
271 int diff_iovecs = 0; 305 int diff_iovecs = 0;
272 int iclog_space; 306 int iclog_space;
273 307
274 ASSERT(log_vector); 308 ASSERT(tp);
275 309
276 /* 310 /*
277 * Do all the accounting aggregation and switching of log vectors
278 * around in a separate loop to the insertion of items into the CIL.
279 * Then we can do a separate loop to update the CIL within a single
280 * lock/unlock pair. This reduces the number of round trips on the CIL
281 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
282 * hold time for the transaction commit.
283 *
284 * If this is the first time the item is being placed into the CIL in
285 * this context, pin it so it can't be written to disk until the CIL is
286 * flushed to the iclog and the iclog written to disk.
287 *
288 * We can do this safely because the context can't checkpoint until we 311 * We can do this safely because the context can't checkpoint until we
289 * are done so it doesn't matter exactly how we update the CIL. 312 * are done so it doesn't matter exactly how we update the CIL.
290 */ 313 */
314 xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
315
316 /*
317 * Now (re-)position everything modified at the tail of the CIL.
318 * We do this here so we only need to take the CIL lock once during
319 * the transaction commit.
320 */
291 spin_lock(&cil->xc_cil_lock); 321 spin_lock(&cil->xc_cil_lock);
292 for (lv = log_vector; lv; ) { 322 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
293 struct xfs_log_vec *next = lv->lv_next; 323 struct xfs_log_item *lip = lidp->lid_item;
294 324
295 ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil)); 325 /* Skip items which aren't dirty in this transaction. */
296 lv->lv_next = NULL; 326 if (!(lidp->lid_flags & XFS_LID_DIRTY))
327 continue;
297 328
298 /* 329 list_move_tail(&lip->li_cil, &cil->xc_cil);
299 * xfs_cil_prepare_item() may free the lv, so move the item on
300 * the CIL first.
301 */
302 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
303 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
304 lv = next;
305 } 330 }
306 331
307 /* account for space used by new iovec headers */ 332 /* account for space used by new iovec headers */
308 len += diff_iovecs * sizeof(xlog_op_header_t); 333 len += diff_iovecs * sizeof(xlog_op_header_t);
309 ctx->nvecs += diff_iovecs; 334 ctx->nvecs += diff_iovecs;
310 335
336 /* attach the transaction to the CIL if it has any busy extents */
337 if (!list_empty(&tp->t_busy))
338 list_splice_init(&tp->t_busy, &ctx->busy_extents);
339
311 /* 340 /*
312 * Now transfer enough transaction reservation to the context ticket 341 * Now transfer enough transaction reservation to the context ticket
313 * for the checkpoint. The context ticket is special - the unit 342 * for the checkpoint. The context ticket is special - the unit
@@ -316,10 +345,8 @@ xlog_cil_insert_items(
316 * during the transaction commit. 345 * during the transaction commit.
317 */ 346 */
318 if (ctx->ticket->t_curr_res == 0) { 347 if (ctx->ticket->t_curr_res == 0) {
319 /* first commit in checkpoint, steal the header reservation */
320 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
321 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; 348 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
322 ticket->t_curr_res -= ctx->ticket->t_unit_res; 349 tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res;
323 } 350 }
324 351
325 /* do we need space for more log record headers? */ 352 /* do we need space for more log record headers? */
@@ -333,10 +360,10 @@ xlog_cil_insert_items(
333 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); 360 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
334 ctx->ticket->t_unit_res += hdrs; 361 ctx->ticket->t_unit_res += hdrs;
335 ctx->ticket->t_curr_res += hdrs; 362 ctx->ticket->t_curr_res += hdrs;
336 ticket->t_curr_res -= hdrs; 363 tp->t_ticket->t_curr_res -= hdrs;
337 ASSERT(ticket->t_curr_res >= len); 364 ASSERT(tp->t_ticket->t_curr_res >= len);
338 } 365 }
339 ticket->t_curr_res -= len; 366 tp->t_ticket->t_curr_res -= len;
340 ctx->space_used += len; 367 ctx->space_used += len;
341 368
342 spin_unlock(&cil->xc_cil_lock); 369 spin_unlock(&cil->xc_cil_lock);
@@ -350,7 +377,6 @@ xlog_cil_free_logvec(
350 377
351 for (lv = log_vector; lv; ) { 378 for (lv = log_vector; lv; ) {
352 struct xfs_log_vec *next = lv->lv_next; 379 struct xfs_log_vec *next = lv->lv_next;
353 kmem_free(lv->lv_buf);
354 kmem_free(lv); 380 kmem_free(lv);
355 lv = next; 381 lv = next;
356 } 382 }
@@ -376,9 +402,9 @@ xlog_cil_committed(
376 xfs_extent_busy_clear(mp, &ctx->busy_extents, 402 xfs_extent_busy_clear(mp, &ctx->busy_extents,
377 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); 403 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
378 404
379 spin_lock(&ctx->cil->xc_cil_lock); 405 spin_lock(&ctx->cil->xc_push_lock);
380 list_del(&ctx->committing); 406 list_del(&ctx->committing);
381 spin_unlock(&ctx->cil->xc_cil_lock); 407 spin_unlock(&ctx->cil->xc_push_lock);
382 408
383 xlog_cil_free_logvec(ctx->lv_chain); 409 xlog_cil_free_logvec(ctx->lv_chain);
384 410
@@ -433,7 +459,7 @@ xlog_cil_push(
433 down_write(&cil->xc_ctx_lock); 459 down_write(&cil->xc_ctx_lock);
434 ctx = cil->xc_ctx; 460 ctx = cil->xc_ctx;
435 461
436 spin_lock(&cil->xc_cil_lock); 462 spin_lock(&cil->xc_push_lock);
437 push_seq = cil->xc_push_seq; 463 push_seq = cil->xc_push_seq;
438 ASSERT(push_seq <= ctx->sequence); 464 ASSERT(push_seq <= ctx->sequence);
439 465
@@ -444,10 +470,10 @@ xlog_cil_push(
444 */ 470 */
445 if (list_empty(&cil->xc_cil)) { 471 if (list_empty(&cil->xc_cil)) {
446 cil->xc_push_seq = 0; 472 cil->xc_push_seq = 0;
447 spin_unlock(&cil->xc_cil_lock); 473 spin_unlock(&cil->xc_push_lock);
448 goto out_skip; 474 goto out_skip;
449 } 475 }
450 spin_unlock(&cil->xc_cil_lock); 476 spin_unlock(&cil->xc_push_lock);
451 477
452 478
 453 /* check for a previously pushed sequence */ 479 /* check for a previously pushed sequence */
@@ -515,9 +541,9 @@ xlog_cil_push(
515 * that higher sequences will wait for us to write out a commit record 541 * that higher sequences will wait for us to write out a commit record
516 * before they do. 542 * before they do.
517 */ 543 */
518 spin_lock(&cil->xc_cil_lock); 544 spin_lock(&cil->xc_push_lock);
519 list_add(&ctx->committing, &cil->xc_committing); 545 list_add(&ctx->committing, &cil->xc_committing);
520 spin_unlock(&cil->xc_cil_lock); 546 spin_unlock(&cil->xc_push_lock);
521 up_write(&cil->xc_ctx_lock); 547 up_write(&cil->xc_ctx_lock);
522 548
523 /* 549 /*
@@ -552,7 +578,7 @@ xlog_cil_push(
552 * order the commit records so replay will get them in the right order. 578 * order the commit records so replay will get them in the right order.
553 */ 579 */
554restart: 580restart:
555 spin_lock(&cil->xc_cil_lock); 581 spin_lock(&cil->xc_push_lock);
556 list_for_each_entry(new_ctx, &cil->xc_committing, committing) { 582 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
557 /* 583 /*
558 * Higher sequences will wait for this one so skip them. 584 * Higher sequences will wait for this one so skip them.
@@ -565,11 +591,11 @@ restart:
565 * It is still being pushed! Wait for the push to 591 * It is still being pushed! Wait for the push to
566 * complete, then start again from the beginning. 592 * complete, then start again from the beginning.
567 */ 593 */
568 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); 594 xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
569 goto restart; 595 goto restart;
570 } 596 }
571 } 597 }
572 spin_unlock(&cil->xc_cil_lock); 598 spin_unlock(&cil->xc_push_lock);
573 599
574 /* xfs_log_done always frees the ticket on error. */ 600 /* xfs_log_done always frees the ticket on error. */
575 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 601 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
@@ -588,10 +614,10 @@ restart:
588 * callbacks to the iclog we can assign the commit LSN to the context 614 * callbacks to the iclog we can assign the commit LSN to the context
589 * and wake up anyone who is waiting for the commit to complete. 615 * and wake up anyone who is waiting for the commit to complete.
590 */ 616 */
591 spin_lock(&cil->xc_cil_lock); 617 spin_lock(&cil->xc_push_lock);
592 ctx->commit_lsn = commit_lsn; 618 ctx->commit_lsn = commit_lsn;
593 wake_up_all(&cil->xc_commit_wait); 619 wake_up_all(&cil->xc_commit_wait);
594 spin_unlock(&cil->xc_cil_lock); 620 spin_unlock(&cil->xc_push_lock);
595 621
596 /* release the hounds! */ 622 /* release the hounds! */
597 return xfs_log_release_iclog(log->l_mp, commit_iclog); 623 return xfs_log_release_iclog(log->l_mp, commit_iclog);
@@ -644,12 +670,12 @@ xlog_cil_push_background(
644 if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) 670 if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
645 return; 671 return;
646 672
647 spin_lock(&cil->xc_cil_lock); 673 spin_lock(&cil->xc_push_lock);
648 if (cil->xc_push_seq < cil->xc_current_sequence) { 674 if (cil->xc_push_seq < cil->xc_current_sequence) {
649 cil->xc_push_seq = cil->xc_current_sequence; 675 cil->xc_push_seq = cil->xc_current_sequence;
650 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); 676 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
651 } 677 }
652 spin_unlock(&cil->xc_cil_lock); 678 spin_unlock(&cil->xc_push_lock);
653 679
654} 680}
655 681
@@ -672,14 +698,14 @@ xlog_cil_push_foreground(
672 * If the CIL is empty or we've already pushed the sequence then 698 * If the CIL is empty or we've already pushed the sequence then
673 * there's no work we need to do. 699 * there's no work we need to do.
674 */ 700 */
675 spin_lock(&cil->xc_cil_lock); 701 spin_lock(&cil->xc_push_lock);
676 if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { 702 if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
677 spin_unlock(&cil->xc_cil_lock); 703 spin_unlock(&cil->xc_push_lock);
678 return; 704 return;
679 } 705 }
680 706
681 cil->xc_push_seq = push_seq; 707 cil->xc_push_seq = push_seq;
682 spin_unlock(&cil->xc_cil_lock); 708 spin_unlock(&cil->xc_push_lock);
683 709
684 /* do the push now */ 710 /* do the push now */
685 xlog_cil_push(log); 711 xlog_cil_push(log);
@@ -706,43 +732,25 @@ xfs_log_commit_cil(
706 int flags) 732 int flags)
707{ 733{
708 struct xlog *log = mp->m_log; 734 struct xlog *log = mp->m_log;
735 struct xfs_cil *cil = log->l_cilp;
709 int log_flags = 0; 736 int log_flags = 0;
710 struct xfs_log_vec *log_vector;
711 737
712 if (flags & XFS_TRANS_RELEASE_LOG_RES) 738 if (flags & XFS_TRANS_RELEASE_LOG_RES)
713 log_flags = XFS_LOG_REL_PERM_RESERV; 739 log_flags = XFS_LOG_REL_PERM_RESERV;
714 740
715 /*
716 * Do all the hard work of formatting items (including memory
717 * allocation) outside the CIL context lock. This prevents stalling CIL
718 * pushes when we are low on memory and a transaction commit spends a
719 * lot of time in memory reclaim.
720 */
721 log_vector = xlog_cil_prepare_log_vecs(tp);
722 if (!log_vector)
723 return ENOMEM;
724
725 /* lock out background commit */ 741 /* lock out background commit */
726 down_read(&log->l_cilp->xc_ctx_lock); 742 down_read(&cil->xc_ctx_lock);
727 if (commit_lsn)
728 *commit_lsn = log->l_cilp->xc_ctx->sequence;
729 743
730 /* xlog_cil_insert_items() destroys log_vector list */ 744 xlog_cil_insert_items(log, tp);
731 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
732 745
733 /* check we didn't blow the reservation */ 746 /* check we didn't blow the reservation */
734 if (tp->t_ticket->t_curr_res < 0) 747 if (tp->t_ticket->t_curr_res < 0)
735 xlog_print_tic_res(log->l_mp, tp->t_ticket); 748 xlog_print_tic_res(mp, tp->t_ticket);
736 749
737 /* attach the transaction to the CIL if it has any busy extents */ 750 tp->t_commit_lsn = cil->xc_ctx->sequence;
738 if (!list_empty(&tp->t_busy)) { 751 if (commit_lsn)
739 spin_lock(&log->l_cilp->xc_cil_lock); 752 *commit_lsn = tp->t_commit_lsn;
740 list_splice_init(&tp->t_busy,
741 &log->l_cilp->xc_ctx->busy_extents);
742 spin_unlock(&log->l_cilp->xc_cil_lock);
743 }
744 753
745 tp->t_commit_lsn = *commit_lsn;
746 xfs_log_done(mp, tp->t_ticket, NULL, log_flags); 754 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
747 xfs_trans_unreserve_and_mod_sb(tp); 755 xfs_trans_unreserve_and_mod_sb(tp);
748 756
@@ -757,11 +765,11 @@ xfs_log_commit_cil(
757 * the log items. This affects (at least) processing of stale buffers, 765 * the log items. This affects (at least) processing of stale buffers,
758 * inodes and EFIs. 766 * inodes and EFIs.
759 */ 767 */
760 xfs_trans_free_items(tp, *commit_lsn, 0); 768 xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
761 769
762 xlog_cil_push_background(log); 770 xlog_cil_push_background(log);
763 771
764 up_read(&log->l_cilp->xc_ctx_lock); 772 up_read(&cil->xc_ctx_lock);
765 return 0; 773 return 0;
766} 774}
767 775
@@ -800,7 +808,7 @@ xlog_cil_force_lsn(
800 * on commits for those as well. 808 * on commits for those as well.
801 */ 809 */
802restart: 810restart:
803 spin_lock(&cil->xc_cil_lock); 811 spin_lock(&cil->xc_push_lock);
804 list_for_each_entry(ctx, &cil->xc_committing, committing) { 812 list_for_each_entry(ctx, &cil->xc_committing, committing) {
805 if (ctx->sequence > sequence) 813 if (ctx->sequence > sequence)
806 continue; 814 continue;
@@ -809,7 +817,7 @@ restart:
809 * It is still being pushed! Wait for the push to 817 * It is still being pushed! Wait for the push to
810 * complete, then start again from the beginning. 818 * complete, then start again from the beginning.
811 */ 819 */
812 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); 820 xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
813 goto restart; 821 goto restart;
814 } 822 }
815 if (ctx->sequence != sequence) 823 if (ctx->sequence != sequence)
@@ -817,7 +825,7 @@ restart:
817 /* found it! */ 825 /* found it! */
818 commit_lsn = ctx->commit_lsn; 826 commit_lsn = ctx->commit_lsn;
819 } 827 }
820 spin_unlock(&cil->xc_cil_lock); 828 spin_unlock(&cil->xc_push_lock);
821 return commit_lsn; 829 return commit_lsn;
822} 830}
823 831
@@ -875,6 +883,7 @@ xlog_cil_init(
875 INIT_LIST_HEAD(&cil->xc_cil); 883 INIT_LIST_HEAD(&cil->xc_cil);
876 INIT_LIST_HEAD(&cil->xc_committing); 884 INIT_LIST_HEAD(&cil->xc_committing);
877 spin_lock_init(&cil->xc_cil_lock); 885 spin_lock_init(&cil->xc_cil_lock);
886 spin_lock_init(&cil->xc_push_lock);
878 init_rwsem(&cil->xc_ctx_lock); 887 init_rwsem(&cil->xc_ctx_lock);
879 init_waitqueue_head(&cil->xc_commit_wait); 888 init_waitqueue_head(&cil->xc_commit_wait);
880 889
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
new file mode 100644
index 000000000000..ca7e28a8ed31
--- /dev/null
+++ b/fs/xfs/xfs_log_format.h
@@ -0,0 +1,856 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LOG_FORMAT_H__
19#define __XFS_LOG_FORMAT_H__
20
21struct xfs_mount;
22struct xfs_trans_res;
23
24/*
25 * On-disk Log Format definitions.
26 *
27 * This file contains all the on-disk format definitions used within the log. It
28 * includes the physical log structure itself, as well as all the log item
 29 * format structures that are written into the log and interpreted by log
 30 * recovery. We start with the physical log format definitions, and then work
 31 * through all the log item definitions and everything they encode into the
32 * log.
33 */
34typedef __uint32_t xlog_tid_t;
35
36#define XLOG_MIN_ICLOGS 2
37#define XLOG_MAX_ICLOGS 8
38#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
39#define XLOG_VERSION_1 1
40#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
41#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
42#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
43#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
44#define XLOG_MAX_RECORD_BSIZE (256*1024)
45#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
46#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
47#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
48#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
49#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
50 (log)->l_mp->m_sb.sb_logsunit)
51#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
52
53#define XLOG_HEADER_SIZE 512
54
55/* Minimum number of transactions that must fit in the log (defined by mkfs) */
56#define XFS_MIN_LOG_FACTOR 3
57
58#define XLOG_REC_SHIFT(log) \
59 BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
60 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
61#define XLOG_TOTAL_REC_SHIFT(log) \
62 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
63 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
64
65/* get lsn fields */
66#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
67#define BLOCK_LSN(lsn) ((uint)(lsn))
68
69/* this is used in a spot where we might otherwise double-endian-flip */
70#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
71
72static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
73{
74 return ((xfs_lsn_t)cycle << 32) | block;
75}
76
77static inline uint xlog_get_cycle(char *ptr)
78{
79 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
80 return be32_to_cpu(*((__be32 *)ptr + 1));
81 else
82 return be32_to_cpu(*(__be32 *)ptr);
83}
84
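A small usage sketch for the LSN helpers above, assuming the macros and inlines are in scope (userspace assert() is used purely for illustration): an LSN packs the cycle number into the high 32 bits and the block number into the low 32 bits, so pack and unpack must round-trip exactly.

#include <assert.h>

static void lsn_roundtrip_example(void)
{
	/* cycle 7, block 4096 */
	xfs_lsn_t lsn = xlog_assign_lsn(7, 4096);

	assert(CYCLE_LSN(lsn) == 7);	/* high 32 bits */
	assert(BLOCK_LSN(lsn) == 4096);	/* low 32 bits */
}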
85/* Log Clients */
86#define XFS_TRANSACTION 0x69
87#define XFS_VOLUME 0x2
88#define XFS_LOG 0xaa
89
90#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
91
92/* Region types for iovec's i_type */
93#define XLOG_REG_TYPE_BFORMAT 1
94#define XLOG_REG_TYPE_BCHUNK 2
95#define XLOG_REG_TYPE_EFI_FORMAT 3
96#define XLOG_REG_TYPE_EFD_FORMAT 4
97#define XLOG_REG_TYPE_IFORMAT 5
98#define XLOG_REG_TYPE_ICORE 6
99#define XLOG_REG_TYPE_IEXT 7
100#define XLOG_REG_TYPE_IBROOT 8
101#define XLOG_REG_TYPE_ILOCAL 9
102#define XLOG_REG_TYPE_IATTR_EXT 10
103#define XLOG_REG_TYPE_IATTR_BROOT 11
104#define XLOG_REG_TYPE_IATTR_LOCAL 12
105#define XLOG_REG_TYPE_QFORMAT 13
106#define XLOG_REG_TYPE_DQUOT 14
107#define XLOG_REG_TYPE_QUOTAOFF 15
108#define XLOG_REG_TYPE_LRHEADER 16
109#define XLOG_REG_TYPE_UNMOUNT 17
110#define XLOG_REG_TYPE_COMMIT 18
111#define XLOG_REG_TYPE_TRANSHDR 19
112#define XLOG_REG_TYPE_ICREATE 20
113#define XLOG_REG_TYPE_MAX 20
114
115/*
116 * Flags to log operation header
117 *
118 * The first write of a new transaction will be preceded with a start
119 * record, XLOG_START_TRANS. Once a transaction is committed, a commit
120 * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
121 * the remainder of the current active in-core log, it is split up into
122 * multiple regions. Each partial region will be marked with a
123 * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
124 *
125 */
126#define XLOG_START_TRANS 0x01 /* Start a new transaction */
127#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
128#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
129#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
130#define XLOG_END_TRANS 0x10 /* End a continued transaction */
131#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
132
133
134typedef struct xlog_op_header {
135 __be32 oh_tid; /* transaction id of operation : 4 b */
136 __be32 oh_len; /* bytes in data region : 4 b */
137 __u8 oh_clientid; /* who sent me this : 1 b */
138 __u8 oh_flags; /* : 1 b */
139 __u16 oh_res2; /* 32 bit align : 2 b */
140} xlog_op_header_t;
141
142/* valid values for h_fmt */
143#define XLOG_FMT_UNKNOWN 0
144#define XLOG_FMT_LINUX_LE 1
145#define XLOG_FMT_LINUX_BE 2
146#define XLOG_FMT_IRIX_BE 3
147
148/* our fmt */
149#ifdef XFS_NATIVE_HOST
150#define XLOG_FMT XLOG_FMT_LINUX_BE
151#else
152#define XLOG_FMT XLOG_FMT_LINUX_LE
153#endif
154
155typedef struct xlog_rec_header {
156 __be32 h_magicno; /* log record (LR) identifier : 4 */
157 __be32 h_cycle; /* write cycle of log : 4 */
158 __be32 h_version; /* LR version : 4 */
159 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
160 __be64 h_lsn; /* lsn of this LR : 8 */
161 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
162 __le32 h_crc; /* crc of log record : 4 */
163 __be32 h_prev_block; /* block number to previous LR : 4 */
164 __be32 h_num_logops; /* number of log operations in this LR : 4 */
165 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
166 /* new fields */
167 __be32 h_fmt; /* format of log record : 4 */
168 uuid_t h_fs_uuid; /* uuid of FS : 16 */
169 __be32 h_size; /* iclog size : 4 */
170} xlog_rec_header_t;
171
172typedef struct xlog_rec_ext_header {
173 __be32 xh_cycle; /* write cycle of log : 4 */
174 __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
175} xlog_rec_ext_header_t;
176
177/*
178 * Quite misnamed, because this union lays out the actual on-disk log buffer.
179 */
180typedef union xlog_in_core2 {
181 xlog_rec_header_t hic_header;
182 xlog_rec_ext_header_t hic_xheader;
183 char hic_sector[XLOG_HEADER_SIZE];
184} xlog_in_core_2_t;
185
186/* not an on-disk structure, but needed by log recovery in userspace */
187typedef struct xfs_log_iovec {
188 void *i_addr; /* beginning address of region */
189 int i_len; /* length in bytes of region */
190 uint i_type; /* type of region */
191} xfs_log_iovec_t;
192
193
194/*
195 * Transaction Header definitions.
196 *
197 * This is the structure written in the log at the head of every transaction. It
198 * identifies the type and id of the transaction, and contains the number of
199 * items logged by the transaction so we know how many to expect during
200 * recovery.
201 *
202 * Do not change the below structure without redoing the code in
203 * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
204 */
205typedef struct xfs_trans_header {
206 uint th_magic; /* magic number */
207 uint th_type; /* transaction type */
208 __int32_t th_tid; /* transaction id (unused) */
209 uint th_num_items; /* num items logged by trans */
210} xfs_trans_header_t;
211
212#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
213
214/*
215 * Log item types.
216 */
217#define XFS_LI_EFI 0x1236
218#define XFS_LI_EFD 0x1237
219#define XFS_LI_IUNLINK 0x1238
220#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
221#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
222#define XFS_LI_DQUOT 0x123d
223#define XFS_LI_QUOTAOFF 0x123e
224#define XFS_LI_ICREATE 0x123f
225
226#define XFS_LI_TYPE_DESC \
227 { XFS_LI_EFI, "XFS_LI_EFI" }, \
228 { XFS_LI_EFD, "XFS_LI_EFD" }, \
229 { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
230 { XFS_LI_INODE, "XFS_LI_INODE" }, \
231 { XFS_LI_BUF, "XFS_LI_BUF" }, \
232 { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
233 { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
234 { XFS_LI_ICREATE, "XFS_LI_ICREATE" }
235
236/*
237 * Transaction types. Used to distinguish types of buffers.
238 */
239#define XFS_TRANS_SETATTR_NOT_SIZE 1
240#define XFS_TRANS_SETATTR_SIZE 2
241#define XFS_TRANS_INACTIVE 3
242#define XFS_TRANS_CREATE 4
243#define XFS_TRANS_CREATE_TRUNC 5
244#define XFS_TRANS_TRUNCATE_FILE 6
245#define XFS_TRANS_REMOVE 7
246#define XFS_TRANS_LINK 8
247#define XFS_TRANS_RENAME 9
248#define XFS_TRANS_MKDIR 10
249#define XFS_TRANS_RMDIR 11
250#define XFS_TRANS_SYMLINK 12
251#define XFS_TRANS_SET_DMATTRS 13
252#define XFS_TRANS_GROWFS 14
253#define XFS_TRANS_STRAT_WRITE 15
254#define XFS_TRANS_DIOSTRAT 16
255/* 17 was XFS_TRANS_WRITE_SYNC */
256#define XFS_TRANS_WRITEID 18
257#define XFS_TRANS_ADDAFORK 19
258#define XFS_TRANS_ATTRINVAL 20
259#define XFS_TRANS_ATRUNCATE 21
260#define XFS_TRANS_ATTR_SET 22
261#define XFS_TRANS_ATTR_RM 23
262#define XFS_TRANS_ATTR_FLAG 24
263#define XFS_TRANS_CLEAR_AGI_BUCKET 25
264#define XFS_TRANS_QM_SBCHANGE 26
265/*
266 * Dummy entries since we use the transaction type to index into the
267 * trans_type[] in xlog_recover_print_trans_head()
268 */
269#define XFS_TRANS_DUMMY1 27
270#define XFS_TRANS_DUMMY2 28
271#define XFS_TRANS_QM_QUOTAOFF 29
272#define XFS_TRANS_QM_DQALLOC 30
273#define XFS_TRANS_QM_SETQLIM 31
274#define XFS_TRANS_QM_DQCLUSTER 32
275#define XFS_TRANS_QM_QINOCREATE 33
276#define XFS_TRANS_QM_QUOTAOFF_END 34
277#define XFS_TRANS_SB_UNIT 35
278#define XFS_TRANS_FSYNC_TS 36
279#define XFS_TRANS_GROWFSRT_ALLOC 37
280#define XFS_TRANS_GROWFSRT_ZERO 38
281#define XFS_TRANS_GROWFSRT_FREE 39
282#define XFS_TRANS_SWAPEXT 40
283#define XFS_TRANS_SB_COUNT 41
284#define XFS_TRANS_CHECKPOINT 42
285#define XFS_TRANS_ICREATE 43
286#define XFS_TRANS_TYPE_MAX 43
287/* new transaction types need to be reflected in xfs_logprint(8) */
288
289#define XFS_TRANS_TYPES \
290 { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
291 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
292 { XFS_TRANS_INACTIVE, "INACTIVE" }, \
293 { XFS_TRANS_CREATE, "CREATE" }, \
294 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
295 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
296 { XFS_TRANS_REMOVE, "REMOVE" }, \
297 { XFS_TRANS_LINK, "LINK" }, \
298 { XFS_TRANS_RENAME, "RENAME" }, \
299 { XFS_TRANS_MKDIR, "MKDIR" }, \
300 { XFS_TRANS_RMDIR, "RMDIR" }, \
301 { XFS_TRANS_SYMLINK, "SYMLINK" }, \
302 { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
303 { XFS_TRANS_GROWFS, "GROWFS" }, \
304 { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
305 { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
306 { XFS_TRANS_WRITEID, "WRITEID" }, \
307 { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
308 { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
309 { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
310 { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
311 { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
312 { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
313 { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
314 { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \
315 { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
316 { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
317 { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
318 { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
319 { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
320 { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
321 { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
322 { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
323 { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
324 { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
325 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
326 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
327 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
328 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
329 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
330 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
331 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
332
333/*
334 * This structure is used to track log items associated with
335 * a transaction. It points to the log item and keeps some
336 * flags to track the state of the log item. It also tracks
337 * the amount of space needed to log the item it describes
338 * once we get to commit processing (see xfs_trans_commit()).
339 */
340struct xfs_log_item_desc {
341 struct xfs_log_item *lid_item;
342 struct list_head lid_trans;
343 unsigned char lid_flags;
344};
345
346#define XFS_LID_DIRTY 0x1
347
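The lid_trans list head is what lets a transaction walk its own item descriptors, as in the xlog_cil_insert_items() hunk earlier in this patch. A minimal sketch of that idiom; process_item is a hypothetical callback, not a kernel function:

static void
for_each_dirty_item(struct xfs_trans *tp,
		    void (*process_item)(struct xfs_log_item *))
{
	struct xfs_log_item_desc *lidp;

	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
		/* clean items carry no changes and are skipped */
		if (!(lidp->lid_flags & XFS_LID_DIRTY))
			continue;
		process_item(lidp->lid_item);
	}
}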
348/*
349 * Values for t_flags.
350 */
351#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
352#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
353#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
354#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
355#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
356#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
357#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
358 count in superblock */
359
360/*
361 * Values for call flags parameter.
362 */
363#define XFS_TRANS_RELEASE_LOG_RES 0x4
364#define XFS_TRANS_ABORT 0x8
365
366/*
367 * Field values for xfs_trans_mod_sb.
368 */
369#define XFS_TRANS_SB_ICOUNT 0x00000001
370#define XFS_TRANS_SB_IFREE 0x00000002
371#define XFS_TRANS_SB_FDBLOCKS 0x00000004
372#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
373#define XFS_TRANS_SB_FREXTENTS 0x00000010
374#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
375#define XFS_TRANS_SB_DBLOCKS 0x00000040
376#define XFS_TRANS_SB_AGCOUNT 0x00000080
377#define XFS_TRANS_SB_IMAXPCT 0x00000100
378#define XFS_TRANS_SB_REXTSIZE 0x00000200
379#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
380#define XFS_TRANS_SB_RBLOCKS 0x00000800
381#define XFS_TRANS_SB_REXTENTS 0x00001000
382#define XFS_TRANS_SB_REXTSLOG 0x00002000
383
384/*
385 * Here we centralize the specification of XFS meta-data buffer
 386 * reference count values. This determines how hard the buffer
387 * cache tries to hold onto the buffer.
388 */
389#define XFS_AGF_REF 4
390#define XFS_AGI_REF 4
391#define XFS_AGFL_REF 3
392#define XFS_INO_BTREE_REF 3
393#define XFS_ALLOC_BTREE_REF 2
394#define XFS_BMAP_BTREE_REF 2
395#define XFS_DIR_BTREE_REF 2
396#define XFS_INO_REF 2
397#define XFS_ATTR_BTREE_REF 1
398#define XFS_DQUOT_REF 1
399
400/*
401 * Flags for xfs_trans_ichgtime().
402 */
403#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
404#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
405#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
406
407
408/*
409 * Inode Log Item Format definitions.
410 *
411 * This is the structure used to lay out an inode log item in the
412 * log. The size of the inline data/extents/b-tree root to be logged
413 * (if any) is indicated in the ilf_dsize field. Changes to this structure
414 * must be added on to the end.
415 */
416typedef struct xfs_inode_log_format {
417 __uint16_t ilf_type; /* inode log item type */
418 __uint16_t ilf_size; /* size of this item */
419 __uint32_t ilf_fields; /* flags for fields logged */
420 __uint16_t ilf_asize; /* size of attr d/ext/root */
421 __uint16_t ilf_dsize; /* size of data/ext/root */
422 __uint64_t ilf_ino; /* inode number */
423 union {
424 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
425 uuid_t ilfu_uuid; /* mount point value */
426 } ilf_u;
427 __int64_t ilf_blkno; /* blkno of inode buffer */
428 __int32_t ilf_len; /* len of inode buffer */
429 __int32_t ilf_boffset; /* off of inode in buffer */
430} xfs_inode_log_format_t;
431
432typedef struct xfs_inode_log_format_32 {
433 __uint16_t ilf_type; /* inode log item type */
434 __uint16_t ilf_size; /* size of this item */
435 __uint32_t ilf_fields; /* flags for fields logged */
436 __uint16_t ilf_asize; /* size of attr d/ext/root */
437 __uint16_t ilf_dsize; /* size of data/ext/root */
438 __uint64_t ilf_ino; /* inode number */
439 union {
440 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
441 uuid_t ilfu_uuid; /* mount point value */
442 } ilf_u;
443 __int64_t ilf_blkno; /* blkno of inode buffer */
444 __int32_t ilf_len; /* len of inode buffer */
445 __int32_t ilf_boffset; /* off of inode in buffer */
446} __attribute__((packed)) xfs_inode_log_format_32_t;
447
448typedef struct xfs_inode_log_format_64 {
449 __uint16_t ilf_type; /* inode log item type */
450 __uint16_t ilf_size; /* size of this item */
451 __uint32_t ilf_fields; /* flags for fields logged */
452 __uint16_t ilf_asize; /* size of attr d/ext/root */
453 __uint16_t ilf_dsize; /* size of data/ext/root */
454 __uint32_t ilf_pad; /* pad for 64 bit boundary */
455 __uint64_t ilf_ino; /* inode number */
456 union {
457 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
458 uuid_t ilfu_uuid; /* mount point value */
459 } ilf_u;
460 __int64_t ilf_blkno; /* blkno of inode buffer */
461 __int32_t ilf_len; /* len of inode buffer */
462 __int32_t ilf_boffset; /* off of inode in buffer */
463} xfs_inode_log_format_64_t;
464
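The packed 32-bit and padded 64-bit variants above exist because recovery can read a format written by a kernel with different struct padding, so the safe pattern is a field-by-field copy into the native layout rather than a memcpy() of the whole blob. A hedged sketch of that direction (the kernel's actual converter lives with the inode log item code):

#include <string.h>

static void
inode_log_format_from_32(struct xfs_inode_log_format *dst,
			 const struct xfs_inode_log_format_32 *src)
{
	dst->ilf_type = src->ilf_type;
	dst->ilf_size = src->ilf_size;
	dst->ilf_fields = src->ilf_fields;
	dst->ilf_asize = src->ilf_asize;
	dst->ilf_dsize = src->ilf_dsize;
	dst->ilf_ino = src->ilf_ino;
	/* the two ilf_u unions are distinct types; copy the raw bytes */
	memcpy(&dst->ilf_u, &src->ilf_u, sizeof(dst->ilf_u));
	dst->ilf_blkno = src->ilf_blkno;
	dst->ilf_len = src->ilf_len;
	dst->ilf_boffset = src->ilf_boffset;
}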
465/*
466 * Flags for xfs_trans_log_inode flags field.
467 */
468#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
469#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
470#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
471#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
472#define XFS_ILOG_DEV 0x010 /* log the dev field */
473#define XFS_ILOG_UUID 0x020 /* log the uuid field */
474#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
475#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
476#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
477#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
478#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
479
480
481/*
482 * The timestamps are dirty, but not necessarily anything else in the inode
 483 * core. Unlike the other fields above, this one must never make it to disk
 484 * in the ilf_fields of the inode_log_format, but is purely stored in-memory in
485 * ili_fields in the inode_log_item.
486 */
487#define XFS_ILOG_TIMESTAMP 0x4000
488
489#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
490 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
491 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
492 XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
493 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
494
495#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
496 XFS_ILOG_DBROOT)
497
498#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
499 XFS_ILOG_ABROOT)
500
501#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
502 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
503 XFS_ILOG_DEV | XFS_ILOG_UUID | \
504 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
505 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
506 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
507
508static inline int xfs_ilog_fbroot(int w)
509{
510 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
511}
512
513static inline int xfs_ilog_fext(int w)
514{
515 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
516}
517
518static inline int xfs_ilog_fdata(int w)
519{
520 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
521}
522
523/*
524 * Incore version of the on-disk inode core structures. We log this directly
 525 * into the journal in host CPU format (for better or worse) and as such it
 526 * directly mirrors the xfs_dinode structure, as it must contain all the same
527 * information.
528 */
529typedef struct xfs_ictimestamp {
530 __int32_t t_sec; /* timestamp seconds */
531 __int32_t t_nsec; /* timestamp nanoseconds */
532} xfs_ictimestamp_t;
533
534/*
535 * NOTE: This structure must be kept identical to struct xfs_dinode
536 * in xfs_dinode.h except for the endianness annotations.
537 */
538typedef struct xfs_icdinode {
539 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
540 __uint16_t di_mode; /* mode and type of file */
541 __int8_t di_version; /* inode version */
542 __int8_t di_format; /* format of di_c data */
543 __uint16_t di_onlink; /* old number of links to file */
544 __uint32_t di_uid; /* owner's user id */
545 __uint32_t di_gid; /* owner's group id */
546 __uint32_t di_nlink; /* number of links to file */
547 __uint16_t di_projid_lo; /* lower part of owner's project id */
548 __uint16_t di_projid_hi; /* higher part of owner's project id */
549 __uint8_t di_pad[6]; /* unused, zeroed space */
550 __uint16_t di_flushiter; /* incremented on flush */
551 xfs_ictimestamp_t di_atime; /* time last accessed */
552 xfs_ictimestamp_t di_mtime; /* time last modified */
553 xfs_ictimestamp_t di_ctime; /* time created/inode modified */
554 xfs_fsize_t di_size; /* number of bytes in file */
555 xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
556 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
557 xfs_extnum_t di_nextents; /* number of extents in data fork */
558 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
559 __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
560 __int8_t di_aformat; /* format of attr fork's data */
561 __uint32_t di_dmevmask; /* DMIG event mask */
562 __uint16_t di_dmstate; /* DMIG state info */
563 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
564 __uint32_t di_gen; /* generation number */
565
566 /* di_next_unlinked is the only non-core field in the old dinode */
567 xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
568
569 /* start of the extended dinode, writable fields */
570 __uint32_t di_crc; /* CRC of the inode */
571 __uint64_t di_changecount; /* number of attribute changes */
572 xfs_lsn_t di_lsn; /* flush sequence */
573 __uint64_t di_flags2; /* more random flags */
574 __uint8_t di_pad2[16]; /* more padding for future expansion */
575
576 /* fields only written to during inode creation */
577 xfs_ictimestamp_t di_crtime; /* time created */
578 xfs_ino_t di_ino; /* inode number */
579 uuid_t di_uuid; /* UUID of the filesystem */
580
581 /* structure must be padded to 64 bit alignment */
582} xfs_icdinode_t;
583
584static inline uint xfs_icdinode_size(int version)
585{
586 if (version == 3)
587 return sizeof(struct xfs_icdinode);
588 return offsetof(struct xfs_icdinode, di_next_unlinked);
589}
590
591/*
 592 * Buffer Log Format definitions
 593 *
 594 * These are the physical dirty bitmap definitions for the log format structure.
595 */
596#define XFS_BLF_CHUNK 128
597#define XFS_BLF_SHIFT 7
598#define BIT_TO_WORD_SHIFT 5
599#define NBWORD (NBBY * sizeof(unsigned int))
600
601/*
602 * This flag indicates that the buffer contains on disk inodes
603 * and requires special recovery handling.
604 */
605#define XFS_BLF_INODE_BUF (1<<0)
606
607/*
608 * This flag indicates that the buffer should not be replayed
609 * during recovery because its blocks are being freed.
610 */
611#define XFS_BLF_CANCEL (1<<1)
612
613/*
614 * This flag indicates that the buffer contains on disk
615 * user or group dquots and may require special recovery handling.
616 */
617#define XFS_BLF_UDQUOT_BUF (1<<2)
618#define XFS_BLF_PDQUOT_BUF (1<<3)
619#define XFS_BLF_GDQUOT_BUF (1<<4)
620
621/*
622 * This is the structure used to lay out a buf log item in the
623 * log. The data map describes which 128 byte chunks of the buffer
624 * have been logged.
625 */
626#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
627
628typedef struct xfs_buf_log_format {
629 unsigned short blf_type; /* buf log item type indicator */
630 unsigned short blf_size; /* size of this item */
631 ushort blf_flags; /* misc state */
632 ushort blf_len; /* number of blocks in this buf */
633 __int64_t blf_blkno; /* starting blkno of this buf */
634 unsigned int blf_map_size; /* used size of data bitmap in words */
635 unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
636} xfs_buf_log_format_t;
637
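With XFS_BLF_CHUNK at 128 bytes, dirty bit i in blf_data_map covers buffer bytes [i*128, (i+1)*128). A hedged sketch of marking a byte range dirty, not the kernel's actual buf-item logging routine:

static void
blf_mark_dirty(unsigned int *data_map, unsigned int first, unsigned int last)
{
	unsigned int bit;

	/* convert byte offsets to chunk numbers, then set one bit per chunk */
	for (bit = first >> XFS_BLF_SHIFT; bit <= (last >> XFS_BLF_SHIFT); bit++)
		data_map[bit / NBWORD] |= 1U << (bit % NBWORD);
}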
638/*
639 * All buffers now need to tell recovery where the magic number
640 * is so that it can verify and calculate the CRCs on the buffer correctly
641 * once the changes have been replayed into the buffer.
642 *
643 * The type value is held in the upper 5 bits of the blf_flags field, which is
644 * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
645 */
646#define XFS_BLFT_BITS 5
647#define XFS_BLFT_SHIFT 11
648#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
649
650enum xfs_blft {
651 XFS_BLFT_UNKNOWN_BUF = 0,
652 XFS_BLFT_UDQUOT_BUF,
653 XFS_BLFT_PDQUOT_BUF,
654 XFS_BLFT_GDQUOT_BUF,
655 XFS_BLFT_BTREE_BUF,
656 XFS_BLFT_AGF_BUF,
657 XFS_BLFT_AGFL_BUF,
658 XFS_BLFT_AGI_BUF,
659 XFS_BLFT_DINO_BUF,
660 XFS_BLFT_SYMLINK_BUF,
661 XFS_BLFT_DIR_BLOCK_BUF,
662 XFS_BLFT_DIR_DATA_BUF,
663 XFS_BLFT_DIR_FREE_BUF,
664 XFS_BLFT_DIR_LEAF1_BUF,
665 XFS_BLFT_DIR_LEAFN_BUF,
666 XFS_BLFT_DA_NODE_BUF,
667 XFS_BLFT_ATTR_LEAF_BUF,
668 XFS_BLFT_ATTR_RMT_BUF,
669 XFS_BLFT_SB_BUF,
670 XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
671};
672
673static inline void
674xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
675{
676 ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
677 blf->blf_flags &= ~XFS_BLFT_MASK;
678 blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
679}
680
681static inline __uint16_t
682xfs_blft_from_flags(struct xfs_buf_log_format *blf)
683{
684 return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
685}
686
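A short usage sketch for the two helpers above: tag a buf log format as, say, a directory data buffer and read the type back out of blf_flags.

static void blft_roundtrip_example(struct xfs_buf_log_format *blf)
{
	xfs_blft_to_flags(blf, XFS_BLFT_DIR_DATA_BUF);
	ASSERT(xfs_blft_from_flags(blf) == XFS_BLFT_DIR_DATA_BUF);
}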
687/*
688 * EFI/EFD log format definitions
689 */
690typedef struct xfs_extent {
691 xfs_dfsbno_t ext_start;
692 xfs_extlen_t ext_len;
693} xfs_extent_t;
694
695/*
696 * Since an xfs_extent_t has types (start:64, len: 32)
697 * there are different alignments on 32 bit and 64 bit kernels.
698 * So we provide the different variants for use by a
699 * conversion routine.
700 */
701typedef struct xfs_extent_32 {
702 __uint64_t ext_start;
703 __uint32_t ext_len;
704} __attribute__((packed)) xfs_extent_32_t;
705
706typedef struct xfs_extent_64 {
707 __uint64_t ext_start;
708 __uint32_t ext_len;
709 __uint32_t ext_pad;
710} xfs_extent_64_t;
711
712/*
713 * This is the structure used to lay out an efi log item in the
714 * log. The efi_extents field is a variable size array whose
715 * size is given by efi_nextents.
716 */
717typedef struct xfs_efi_log_format {
718 __uint16_t efi_type; /* efi log item type */
719 __uint16_t efi_size; /* size of this item */
720 __uint32_t efi_nextents; /* # extents to free */
721 __uint64_t efi_id; /* efi identifier */
722 xfs_extent_t efi_extents[1]; /* array of extents to free */
723} xfs_efi_log_format_t;
724
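Because efi_extents[1] already contributes one array slot to sizeof, the on-disk size of an EFI carrying n extents adds only n - 1 further entries. A sketch, assuming n >= 1:

#include <stddef.h>

static inline size_t efi_log_format_size(unsigned int nextents)
{
	/* one xfs_extent_t is already counted inside the struct */
	return sizeof(struct xfs_efi_log_format) +
	       (nextents - 1) * sizeof(xfs_extent_t);
}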
725typedef struct xfs_efi_log_format_32 {
726 __uint16_t efi_type; /* efi log item type */
727 __uint16_t efi_size; /* size of this item */
728 __uint32_t efi_nextents; /* # extents to free */
729 __uint64_t efi_id; /* efi identifier */
730 xfs_extent_32_t efi_extents[1]; /* array of extents to free */
731} __attribute__((packed)) xfs_efi_log_format_32_t;
732
733typedef struct xfs_efi_log_format_64 {
734 __uint16_t efi_type; /* efi log item type */
735 __uint16_t efi_size; /* size of this item */
736 __uint32_t efi_nextents; /* # extents to free */
737 __uint64_t efi_id; /* efi identifier */
738 xfs_extent_64_t efi_extents[1]; /* array of extents to free */
739} xfs_efi_log_format_64_t;
740
741/*
742 * This is the structure used to lay out an efd log item in the
743 * log. The efd_extents array is a variable size array whose
 744 * size is given by efd_nextents.
745 */
746typedef struct xfs_efd_log_format {
747 __uint16_t efd_type; /* efd log item type */
748 __uint16_t efd_size; /* size of this item */
749 __uint32_t efd_nextents; /* # of extents freed */
750 __uint64_t efd_efi_id; /* id of corresponding efi */
751 xfs_extent_t efd_extents[1]; /* array of extents freed */
752} xfs_efd_log_format_t;
753
754typedef struct xfs_efd_log_format_32 {
755 __uint16_t efd_type; /* efd log item type */
756 __uint16_t efd_size; /* size of this item */
757 __uint32_t efd_nextents; /* # of extents freed */
758 __uint64_t efd_efi_id; /* id of corresponding efi */
759 xfs_extent_32_t efd_extents[1]; /* array of extents freed */
760} __attribute__((packed)) xfs_efd_log_format_32_t;
761
762typedef struct xfs_efd_log_format_64 {
763 __uint16_t efd_type; /* efd log item type */
764 __uint16_t efd_size; /* size of this item */
765 __uint32_t efd_nextents; /* # of extents freed */
766 __uint64_t efd_efi_id; /* id of corresponding efi */
767 xfs_extent_64_t efd_extents[1]; /* array of extents freed */
768} xfs_efd_log_format_64_t;
769
770/*
771 * Dquot Log format definitions.
772 *
773 * The first two fields must be the type and size fitting into
774 * 32 bits : log_recovery code assumes that.
775 */
776typedef struct xfs_dq_logformat {
777 __uint16_t qlf_type; /* dquot log item type */
778 __uint16_t qlf_size; /* size of this item */
779 xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
780 __int64_t qlf_blkno; /* blkno of dquot buffer */
781 __int32_t qlf_len; /* len of dquot buffer */
782 __uint32_t qlf_boffset; /* off of dquot in buffer */
783} xfs_dq_logformat_t;
784
785/*
786 * log format struct for QUOTAOFF records.
787 * The first two fields must be the type and size fitting into
788 * 32 bits : log_recovery code assumes that.
 789 * We write two LI_QUOTAOFF logitems per quotaoff; the last one keeps a pointer
790 * to the first and ensures that the first logitem is taken out of the AIL
791 * only when the last one is securely committed.
792 */
793typedef struct xfs_qoff_logformat {
794 unsigned short qf_type; /* quotaoff log item type */
795 unsigned short qf_size; /* size of this item */
796 unsigned int qf_flags; /* USR and/or GRP */
797 char qf_pad[12]; /* padding for future */
798} xfs_qoff_logformat_t;
799
800
801/*
802 * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
803 */
804#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
805#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
806#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
807#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
808#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
809#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
810#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
811
812/*
813 * Conversion to and from the combined OQUOTA flag (if necessary)
814 * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
815 */
816#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
817#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
818#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
819#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
820
821#define XFS_ALL_QUOTA_ACCT \
822 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
823#define XFS_ALL_QUOTA_ENFD \
824 (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
825#define XFS_ALL_QUOTA_CHKD \
826 (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
827
828#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
829 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
830 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
831 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
832 XFS_PQUOTA_CHKD)
833
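A hedged sketch of the from-disk direction the comment above refers to: the single on-disk OQUOTA bit fans out to the in-memory project or group bit depending on which "other" quota type is accounted. The authoritative logic lives in xfs_sb_qflags_to_disk()/xfs_sb_qflags_from_disk(); this only illustrates the shape of it, using the header's fixed-width typedefs:

static __uint16_t sb_qflags_from_disk_sketch(__uint16_t qflags)
{
	if (qflags & XFS_OQUOTA_ENFD)
		qflags |= (qflags & XFS_PQUOTA_ACCT) ?
			  XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
	if (qflags & XFS_OQUOTA_CHKD)
		qflags |= (qflags & XFS_PQUOTA_ACCT) ?
			  XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
	/* the combined OQUOTA bits never survive in memory */
	return qflags & ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
}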
834/*
835 * Inode create log item structure
836 *
837 * Log recovery assumes the first two entries are the type and size and they fit
838 * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
839 * decoding can be done correctly.
840 */
841struct xfs_icreate_log {
842 __uint16_t icl_type; /* type of log format structure */
843 __uint16_t icl_size; /* size of log format structure */
844 __be32 icl_ag; /* ag being allocated in */
845 __be32 icl_agbno; /* start block of inode range */
846 __be32 icl_count; /* number of inodes to initialise */
847 __be32 icl_isize; /* size of inodes */
848 __be32 icl_length; /* length of extent to initialise */
849 __be32 icl_gen; /* inode generation number to use */
850};
851
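The recovery assumptions in the comment above (type and size first, together filling 32 bits, 32-bit aligned) can be pinned down with userspace-style compile-time checks; the kernel itself would reach for BUILD_BUG_ON(). A sketch:

#include <stddef.h>

_Static_assert(offsetof(struct xfs_icreate_log, icl_type) == 0,
	       "icl_type must be the first entry");
_Static_assert(offsetof(struct xfs_icreate_log, icl_ag) == 4,
	       "icl_type and icl_size must occupy the first 32 bits");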
852int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
853int xfs_log_calc_minimum_size(struct xfs_mount *);
854
855
856#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b9ea262dd1c2..136654b9400d 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -24,51 +24,13 @@ struct xlog_ticket;
24struct xfs_mount; 24struct xfs_mount;
25 25
26/* 26/*
27 * Macros, structures, prototypes for internal log manager use. 27 * Flags for log structure
28 */ 28 */
29 29#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
30#define XLOG_MIN_ICLOGS 2 30#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
31#define XLOG_MAX_ICLOGS 8 31#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
32#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ 32 shutdown */
33#define XLOG_VERSION_1 1 33#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
34#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
35#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
36#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
37#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
38#define XLOG_MAX_RECORD_BSIZE (256*1024)
39#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
40#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
41#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
42#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
43#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
44 (log)->l_mp->m_sb.sb_logsunit)
45#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
46
47#define XLOG_HEADER_SIZE 512
48
49#define XLOG_REC_SHIFT(log) \
50 BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
51 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
52#define XLOG_TOTAL_REC_SHIFT(log) \
53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
55
56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
57{
58 return ((xfs_lsn_t)cycle << 32) | block;
59}
60
61static inline uint xlog_get_cycle(char *ptr)
62{
63 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
64 return be32_to_cpu(*((__be32 *)ptr + 1));
65 else
66 return be32_to_cpu(*(__be32 *)ptr);
67}
68
69#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
70
71#ifdef __KERNEL__
72 34
73/* 35/*
74 * get client id from packed copy. 36 * get client id from packed copy.
@@ -101,28 +63,8 @@ static inline uint xlog_get_client_id(__be32 i)
101#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ 63#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
102#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ 64#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
103#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ 65#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
104#endif /* __KERNEL__ */
105 66
106/* 67/*
107 * Flags to log operation header
108 *
109 * The first write of a new transaction will be preceded with a start
110 * record, XLOG_START_TRANS. Once a transaction is committed, a commit
111 * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
112 * the remainder of the current active in-core log, it is split up into
113 * multiple regions. Each partial region will be marked with a
114 * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
115 *
116 */
117#define XLOG_START_TRANS 0x01 /* Start a new transaction */
118#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
119#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
120#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
121#define XLOG_END_TRANS 0x10 /* End a continued transaction */
122#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
123
124#ifdef __KERNEL__
125/*
126 * Flags to log ticket 68 * Flags to log ticket
127 */ 69 */
128#define XLOG_TIC_INITED 0x1 /* has been initialized */ 70#define XLOG_TIC_INITED 0x1 /* has been initialized */
@@ -132,22 +74,6 @@ static inline uint xlog_get_client_id(__be32 i)
132 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 74 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
133 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } 75 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
134 76
135#endif /* __KERNEL__ */
136
137#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
138
139/*
140 * Flags for log structure
141 */
142#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
143#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
144#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
145 shutdown */
146#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
147
148typedef __uint32_t xlog_tid_t;
149
150#ifdef __KERNEL__
151/* 77/*
152 * Below are states for covering allocation transactions. 78 * Below are states for covering allocation transactions.
153 * By covering, we mean changing the h_tail_lsn in the last on-disk 79 * By covering, we mean changing the h_tail_lsn in the last on-disk
@@ -223,7 +149,6 @@ typedef __uint32_t xlog_tid_t;
223 149
224#define XLOG_COVER_OPS 5 150#define XLOG_COVER_OPS 5
225 151
226
227/* Ticket reservation region accounting */ 152/* Ticket reservation region accounting */
228#define XLOG_TIC_LEN_MAX 15 153#define XLOG_TIC_LEN_MAX 15
229 154
@@ -258,64 +183,6 @@ typedef struct xlog_ticket {
258 xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ 183 xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
259} xlog_ticket_t; 184} xlog_ticket_t;
260 185
261#endif
262
263
264typedef struct xlog_op_header {
265 __be32 oh_tid; /* transaction id of operation : 4 b */
266 __be32 oh_len; /* bytes in data region : 4 b */
267 __u8 oh_clientid; /* who sent me this : 1 b */
268 __u8 oh_flags; /* : 1 b */
269 __u16 oh_res2; /* 32 bit align : 2 b */
270} xlog_op_header_t;
271
272
273/* valid values for h_fmt */
274#define XLOG_FMT_UNKNOWN 0
275#define XLOG_FMT_LINUX_LE 1
276#define XLOG_FMT_LINUX_BE 2
277#define XLOG_FMT_IRIX_BE 3
278
279/* our fmt */
280#ifdef XFS_NATIVE_HOST
281#define XLOG_FMT XLOG_FMT_LINUX_BE
282#else
283#define XLOG_FMT XLOG_FMT_LINUX_LE
284#endif
285
286typedef struct xlog_rec_header {
287 __be32 h_magicno; /* log record (LR) identifier : 4 */
288 __be32 h_cycle; /* write cycle of log : 4 */
289 __be32 h_version; /* LR version : 4 */
290 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
291 __be64 h_lsn; /* lsn of this LR : 8 */
292 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
293 __le32 h_crc; /* crc of log record : 4 */
294 __be32 h_prev_block; /* block number to previous LR : 4 */
295 __be32 h_num_logops; /* number of log operations in this LR : 4 */
296 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
297 /* new fields */
298 __be32 h_fmt; /* format of log record : 4 */
299 uuid_t h_fs_uuid; /* uuid of FS : 16 */
300 __be32 h_size; /* iclog size : 4 */
301} xlog_rec_header_t;
302
303typedef struct xlog_rec_ext_header {
304 __be32 xh_cycle; /* write cycle of log : 4 */
305 __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
306} xlog_rec_ext_header_t;
307
308#ifdef __KERNEL__
309
310/*
311 * Quite misnamed, because this union lays out the actual on-disk log buffer.
312 */
313typedef union xlog_in_core2 {
314 xlog_rec_header_t hic_header;
315 xlog_rec_ext_header_t hic_xheader;
316 char hic_sector[XLOG_HEADER_SIZE];
317} xlog_in_core_2_t;
318
319/* 186/*
320 * - A log record header is 512 bytes. There is plenty of room to grow the 187 * - A log record header is 512 bytes. There is plenty of room to grow the
321 * xlog_rec_header_t into the reserved space. 188 * xlog_rec_header_t into the reserved space.
@@ -411,14 +278,17 @@ struct xfs_cil {
411 struct xlog *xc_log; 278 struct xlog *xc_log;
412 struct list_head xc_cil; 279 struct list_head xc_cil;
413 spinlock_t xc_cil_lock; 280 spinlock_t xc_cil_lock;
281
282 struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
414 struct xfs_cil_ctx *xc_ctx; 283 struct xfs_cil_ctx *xc_ctx;
415 struct rw_semaphore xc_ctx_lock; 284
285 spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
286 xfs_lsn_t xc_push_seq;
416 struct list_head xc_committing; 287 struct list_head xc_committing;
417 wait_queue_head_t xc_commit_wait; 288 wait_queue_head_t xc_commit_wait;
418 xfs_lsn_t xc_current_sequence; 289 xfs_lsn_t xc_current_sequence;
419 struct work_struct xc_push_work; 290 struct work_struct xc_push_work;
420 xfs_lsn_t xc_push_seq; 291} ____cacheline_aligned_in_smp;
421};
422 292
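The reorganised struct splits the two hot locks onto separate cache lines: xc_cil_lock stays with the item list every committer touches, while the new xc_push_lock owns the push sequence and committing list. A hedged usage sketch of which lock covers what:

static void cil_lock_usage_sketch(struct xfs_cil *cil)
{
	/* committers: add log items under the CIL list lock */
	spin_lock(&cil->xc_cil_lock);
	/* ... list_move_tail() items onto cil->xc_cil ... */
	spin_unlock(&cil->xc_cil_lock);

	/* push worker and waiters: push state under the push lock */
	spin_lock(&cil->xc_push_lock);
	/* ... read/update cil->xc_push_seq, walk cil->xc_committing ... */
	spin_unlock(&cil->xc_push_lock);
}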
423/* 293/*
424 * The amount of log space we allow the CIL to aggregate is difficult to size. 294 * The amount of log space we allow the CIL to aggregate is difficult to size.
@@ -686,6 +556,5 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
686 schedule(); 556 schedule();
687 remove_wait_queue(wq, &wait); 557 remove_wait_queue(wq, &wait);
688} 558}
689#endif /* __KERNEL__ */
690 559
691#endif /* __XFS_LOG_PRIV_H__ */ 560#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7681b19aa5dc..39797490a1f1 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -17,7 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
@@ -41,7 +41,6 @@
41#include "xfs_extfree_item.h" 41#include "xfs_extfree_item.h"
42#include "xfs_trans_priv.h" 42#include "xfs_trans_priv.h"
43#include "xfs_quota.h" 43#include "xfs_quota.h"
44#include "xfs_utils.h"
45#include "xfs_cksum.h" 44#include "xfs_cksum.h"
46#include "xfs_trace.h" 45#include "xfs_trace.h"
47#include "xfs_icache.h" 46#include "xfs_icache.h"
@@ -51,10 +50,12 @@
51#include "xfs_symlink.h" 50#include "xfs_symlink.h"
52#include "xfs_da_btree.h" 51#include "xfs_da_btree.h"
53#include "xfs_dir2_format.h" 52#include "xfs_dir2_format.h"
54#include "xfs_dir2_priv.h" 53#include "xfs_dir2.h"
55#include "xfs_attr_leaf.h" 54#include "xfs_attr_leaf.h"
56#include "xfs_attr_remote.h" 55#include "xfs_attr_remote.h"
57 56
57#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
58
58STATIC int 59STATIC int
59xlog_find_zeroed( 60xlog_find_zeroed(
60 struct xlog *, 61 struct xlog *,
@@ -607,7 +608,7 @@ out:
607 608
608/* 609/*
609 * Head is defined to be the point of the log where the next log write 610 * Head is defined to be the point of the log where the next log write
610 * write could go. This means that incomplete LR writes at the end are 611 * could go. This means that incomplete LR writes at the end are
611 * eliminated when calculating the head. We aren't guaranteed that previous 612 * eliminated when calculating the head. We aren't guaranteed that previous
612 * LR have complete transactions. We only know that a cycle number of 613 * LR have complete transactions. We only know that a cycle number of
613 * current cycle number -1 won't be present in the log if we start writing 614 * current cycle number -1 won't be present in the log if we start writing
@@ -963,6 +964,7 @@ xlog_find_tail(
963 } 964 }
964 if (!found) { 965 if (!found) {
965 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); 966 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
967 xlog_put_bp(bp);
966 ASSERT(0); 968 ASSERT(0);
967 return XFS_ERROR(EIO); 969 return XFS_ERROR(EIO);
968 } 970 }
@@ -1144,7 +1146,8 @@ xlog_find_zeroed(
1144 */ 1146 */
1145 xfs_warn(log->l_mp, 1147 xfs_warn(log->l_mp,
1146 "Log inconsistent or not a log (last==0, first!=1)"); 1148 "Log inconsistent or not a log (last==0, first!=1)");
1147 return XFS_ERROR(EINVAL); 1149 error = XFS_ERROR(EINVAL);
1150 goto bp_err;
1148 } 1151 }
1149 1152
1150 /* we have a partially zeroed log */ 1153 /* we have a partially zeroed log */
@@ -1582,6 +1585,7 @@ xlog_recover_add_to_trans(
1582 "bad number of regions (%d) in inode log format", 1585 "bad number of regions (%d) in inode log format",
1583 in_f->ilf_size); 1586 in_f->ilf_size);
1584 ASSERT(0); 1587 ASSERT(0);
1588 kmem_free(ptr);
1585 return XFS_ERROR(EIO); 1589 return XFS_ERROR(EIO);
1586 } 1590 }
1587 1591
@@ -1766,19 +1770,11 @@ xlog_recover_buffer_pass1(
1766 1770
1767/* 1771/*
1768 * Check to see whether the buffer being recovered has a corresponding 1772 * Check to see whether the buffer being recovered has a corresponding
 1769 * entry in the buffer cancel record table. If it does then return 1 1773 * entry in the buffer cancel record table. If it does, return the cancel
1770 * so that it will be cancelled, otherwise return 0. If the buffer is 1774 * buffer structure to the caller.
1771 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1772 * the refcount on the entry in the table and remove it from the table
1773 * if this is the last reference.
1774 *
1775 * We remove the cancel record from the table when we encounter its
1776 * last occurrence in the log so that if the same buffer is re-used
1777 * again after its last cancellation we actually replay the changes
1778 * made at that point.
1779 */ 1775 */
1780STATIC int 1776STATIC struct xfs_buf_cancel *
1781xlog_check_buffer_cancelled( 1777xlog_peek_buffer_cancelled(
1782 struct xlog *log, 1778 struct xlog *log,
1783 xfs_daddr_t blkno, 1779 xfs_daddr_t blkno,
1784 uint len, 1780 uint len,
@@ -1787,22 +1783,16 @@ xlog_check_buffer_cancelled(
1787 struct list_head *bucket; 1783 struct list_head *bucket;
1788 struct xfs_buf_cancel *bcp; 1784 struct xfs_buf_cancel *bcp;
1789 1785
1790 if (log->l_buf_cancel_table == NULL) { 1786 if (!log->l_buf_cancel_table) {
1791 /* 1787 /* empty table means no cancelled buffers in the log */
1792 * There is nothing in the table built in pass one,
1793 * so this buffer must not be cancelled.
1794 */
1795 ASSERT(!(flags & XFS_BLF_CANCEL)); 1788 ASSERT(!(flags & XFS_BLF_CANCEL));
1796 return 0; 1789 return NULL;
1797 } 1790 }
1798 1791
1799 /*
1800 * Search for an entry in the cancel table that matches our buffer.
1801 */
1802 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); 1792 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1803 list_for_each_entry(bcp, bucket, bc_list) { 1793 list_for_each_entry(bcp, bucket, bc_list) {
1804 if (bcp->bc_blkno == blkno && bcp->bc_len == len) 1794 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1805 goto found; 1795 return bcp;
1806 } 1796 }
1807 1797
1808 /* 1798 /*
@@ -1810,9 +1800,32 @@ xlog_check_buffer_cancelled(
1810 * that the buffer is NOT cancelled. 1800 * that the buffer is NOT cancelled.
1811 */ 1801 */
1812 ASSERT(!(flags & XFS_BLF_CANCEL)); 1802 ASSERT(!(flags & XFS_BLF_CANCEL));
1813 return 0; 1803 return NULL;
1804}
1805
1806/*
1807 * If the buffer is being cancelled then return 1 so that it will be cancelled,
1808 * otherwise return 0. If the buffer is actually a buffer cancel item
1809 * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
1810 * table and remove it from the table if this is the last reference.
1811 *
1812 * We remove the cancel record from the table when we encounter its last
1813 * occurrence in the log so that if the same buffer is re-used again after its
1814 * last cancellation we actually replay the changes made at that point.
1815 */
1816STATIC int
1817xlog_check_buffer_cancelled(
1818 struct xlog *log,
1819 xfs_daddr_t blkno,
1820 uint len,
1821 ushort flags)
1822{
1823 struct xfs_buf_cancel *bcp;
1824
1825 bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
1826 if (!bcp)
1827 return 0;
1814 1828
1815found:
1816 /* 1829 /*
 1817 * We've got a match, so return 1 so that the recovery of this buffer 1830 * We've got a match, so return 1 so that the recovery of this buffer
1818 * is cancelled. If this buffer is actually a buffer cancel log 1831 * is cancelled. If this buffer is actually a buffer cancel log
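
The point of splitting xlog_peek_buffer_cancelled() out of xlog_check_buffer_cancelled() is that the lookup is now side-effect free: read-ahead (added later in this patch) may test a buffer any number of times, while only pass-2 replay is allowed to consume the cancel record's reference. A sketch of the two intended call sites, using the signatures from this hunk:

	/* read-ahead path: look, but leave the refcount alone */
	if (!xlog_peek_buffer_cancelled(log, blkno, len, flags))
		xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, NULL);

	/* replay path: drops a reference if this item is a cancel record */
	if (xlog_check_buffer_cancelled(log, blkno, len, flags))
		return 0;	/* buffer was cancelled; skip replay */
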
@@ -1947,6 +1960,149 @@ xlog_recover_do_inode_buffer(
1947} 1960}
1948 1961
1949/* 1962/*
1963 * V5 filesystems know the age of the buffer on disk being recovered. We can
1964 * have newer objects on disk than we are replaying, and so for these cases we
1965 * don't want to replay the current change as that will make the buffer contents
1966 * temporarily invalid on disk.
1967 *
1968 * The magic number might not match the buffer type we are going to recover
1969 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
 1970 * extract the LSN of the existing object in the buffer based on its current
 1971 * magic number. If we don't recognise the magic number in the buffer, then
 1972 * return an LSN of -1 so that the caller knows it was an unrecognised block and
1973 * so can recover the buffer.
1974 *
1975 * Note: we cannot rely solely on magic number matches to determine that the
1976 * buffer has a valid LSN - we also need to verify that it belongs to this
1977 * filesystem, so we need to extract the object's LSN and compare it to that
1978 * which we read from the superblock. If the UUIDs don't match, then we've got a
1979 * stale metadata block from an old filesystem instance that we need to recover
1980 * over the top of.
1981 */
1982static xfs_lsn_t
1983xlog_recover_get_buf_lsn(
1984 struct xfs_mount *mp,
1985 struct xfs_buf *bp)
1986{
1987 __uint32_t magic32;
1988 __uint16_t magic16;
1989 __uint16_t magicda;
1990 void *blk = bp->b_addr;
1991 uuid_t *uuid;
1992 xfs_lsn_t lsn = -1;
1993
1994 /* v4 filesystems always recover immediately */
1995 if (!xfs_sb_version_hascrc(&mp->m_sb))
1996 goto recover_immediately;
1997
1998 magic32 = be32_to_cpu(*(__be32 *)blk);
1999 switch (magic32) {
2000 case XFS_ABTB_CRC_MAGIC:
2001 case XFS_ABTC_CRC_MAGIC:
2002 case XFS_ABTB_MAGIC:
2003 case XFS_ABTC_MAGIC:
2004 case XFS_IBT_CRC_MAGIC:
2005 case XFS_IBT_MAGIC: {
2006 struct xfs_btree_block *btb = blk;
2007
2008 lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
2009 uuid = &btb->bb_u.s.bb_uuid;
2010 break;
2011 }
2012 case XFS_BMAP_CRC_MAGIC:
2013 case XFS_BMAP_MAGIC: {
2014 struct xfs_btree_block *btb = blk;
2015
2016 lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
2017 uuid = &btb->bb_u.l.bb_uuid;
2018 break;
2019 }
2020 case XFS_AGF_MAGIC:
2021 lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2022 uuid = &((struct xfs_agf *)blk)->agf_uuid;
2023 break;
2024 case XFS_AGFL_MAGIC:
2025 lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2026 uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
2027 break;
2028 case XFS_AGI_MAGIC:
2029 lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2030 uuid = &((struct xfs_agi *)blk)->agi_uuid;
2031 break;
2032 case XFS_SYMLINK_MAGIC:
2033 lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2034 uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
2035 break;
2036 case XFS_DIR3_BLOCK_MAGIC:
2037 case XFS_DIR3_DATA_MAGIC:
2038 case XFS_DIR3_FREE_MAGIC:
2039 lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2040 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
2041 break;
2042 case XFS_ATTR3_RMT_MAGIC:
2043 lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
2044 uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
2045 break;
2046 case XFS_SB_MAGIC:
2047 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
2048 uuid = &((struct xfs_dsb *)blk)->sb_uuid;
2049 break;
2050 default:
2051 break;
2052 }
2053
2054 if (lsn != (xfs_lsn_t)-1) {
2055 if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
2056 goto recover_immediately;
2057 return lsn;
2058 }
2059
2060 magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2061 switch (magicda) {
2062 case XFS_DIR3_LEAF1_MAGIC:
2063 case XFS_DIR3_LEAFN_MAGIC:
2064 case XFS_DA3_NODE_MAGIC:
2065 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2066 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
2067 break;
2068 default:
2069 break;
2070 }
2071
2072 if (lsn != (xfs_lsn_t)-1) {
2073 if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
2074 goto recover_immediately;
2075 return lsn;
2076 }
2077
2078 /*
2079 * We do individual object checks on dquot and inode buffers as they
2080 * have their own individual LSN records. Also, we could have a stale
2081 * buffer here, so we have to at least recognise these buffer types.
2082 *
 2083 * A noted complexity here is inode unlinked list processing - it logs
2084 * the inode directly in the buffer, but we don't know which inodes have
2085 * been modified, and there is no global buffer LSN. Hence we need to
2086 * recover all inode buffer types immediately. This problem will be
2087 * fixed by logical logging of the unlinked list modifications.
2088 */
2089 magic16 = be16_to_cpu(*(__be16 *)blk);
2090 switch (magic16) {
2091 case XFS_DQUOT_MAGIC:
2092 case XFS_DINODE_MAGIC:
2093 goto recover_immediately;
2094 default:
2095 break;
2096 }
2097
2098 /* unknown buffer contents, recover immediately */
2099
2100recover_immediately:
2101 return (xfs_lsn_t)-1;
2102
2103}
2104
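
The LSN just extracted is compared against the LSN of the transaction being replayed; that comparison depends on the 64-bit LSN packing a 32-bit log cycle number above a 32-bit block offset, compared component-wise rather than as a raw integer. A sketch of those semantics, modelled on the kernel's CYCLE_LSN/BLOCK_LSN/XFS_LSN_CMP helpers rather than copied from them:

	typedef long long xfs_lsn_t;

	#define CYCLE_LSN(lsn)	((unsigned int)((lsn) >> 32))	/* log wrap count */
	#define BLOCK_LSN(lsn)	((unsigned int)(lsn))		/* block offset */

	/* <0, 0 or >0: the cycle dominates, the block breaks ties */
	static int lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
	{
		if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
			return CYCLE_LSN(lsn1) < CYCLE_LSN(lsn2) ? -1 : 1;
		if (BLOCK_LSN(lsn1) != BLOCK_LSN(lsn2))
			return BLOCK_LSN(lsn1) < BLOCK_LSN(lsn2) ? -1 : 1;
		return 0;
	}

A buffer whose stamped LSN compares >= the replaying transaction's LSN is already as new as, or newer than, the change being applied, so replay of that change is skipped.
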
2105/*
1950 * Validate the recovered buffer is of the correct type and attach the 2106 * Validate the recovered buffer is of the correct type and attach the
1951 * appropriate buffer operations to them for writeback. Magic numbers are in a 2107 * appropriate buffer operations to them for writeback. Magic numbers are in a
1952 * few places: 2108 * few places:
@@ -1955,7 +2111,7 @@ xlog_recover_do_inode_buffer(
1955 * inside a struct xfs_da_blkinfo at the start of the buffer. 2111 * inside a struct xfs_da_blkinfo at the start of the buffer.
1956 */ 2112 */
1957static void 2113static void
1958xlog_recovery_validate_buf_type( 2114xlog_recover_validate_buf_type(
1959 struct xfs_mount *mp, 2115 struct xfs_mount *mp,
1960 struct xfs_buf *bp, 2116 struct xfs_buf *bp,
1961 xfs_buf_log_format_t *buf_f) 2117 xfs_buf_log_format_t *buf_f)
@@ -2234,7 +2390,7 @@ xlog_recover_do_reg_buffer(
2234 * just avoid the verification stage for non-crc filesystems 2390 * just avoid the verification stage for non-crc filesystems
2235 */ 2391 */
2236 if (xfs_sb_version_hascrc(&mp->m_sb)) 2392 if (xfs_sb_version_hascrc(&mp->m_sb))
2237 xlog_recovery_validate_buf_type(mp, bp, buf_f); 2393 xlog_recover_validate_buf_type(mp, bp, buf_f);
2238} 2394}
2239 2395
2240/* 2396/*
@@ -2366,7 +2522,7 @@ xfs_qm_dqcheck(
2366 2522
2367/* 2523/*
2368 * Perform a dquot buffer recovery. 2524 * Perform a dquot buffer recovery.
2369 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type 2525 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2370 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2526 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2371 * Else, treat it as a regular buffer and do recovery. 2527 * Else, treat it as a regular buffer and do recovery.
2372 */ 2528 */
@@ -2425,20 +2581,22 @@ xlog_recover_do_dquot_buffer(
2425 * over the log during recovery. During the first we build a table of 2581 * over the log during recovery. During the first we build a table of
2426 * those buffers which have been cancelled, and during the second we 2582 * those buffers which have been cancelled, and during the second we
2427 * only replay those buffers which do not have corresponding cancel 2583 * only replay those buffers which do not have corresponding cancel
2428 * records in the table. See xlog_recover_do_buffer_pass[1,2] above 2584 * records in the table. See xlog_recover_buffer_pass[1,2] above
2429 * for more details on the implementation of the table of cancel records. 2585 * for more details on the implementation of the table of cancel records.
2430 */ 2586 */
2431STATIC int 2587STATIC int
2432xlog_recover_buffer_pass2( 2588xlog_recover_buffer_pass2(
2433 struct xlog *log, 2589 struct xlog *log,
2434 struct list_head *buffer_list, 2590 struct list_head *buffer_list,
2435 struct xlog_recover_item *item) 2591 struct xlog_recover_item *item,
2592 xfs_lsn_t current_lsn)
2436{ 2593{
2437 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2594 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2438 xfs_mount_t *mp = log->l_mp; 2595 xfs_mount_t *mp = log->l_mp;
2439 xfs_buf_t *bp; 2596 xfs_buf_t *bp;
2440 int error; 2597 int error;
2441 uint buf_flags; 2598 uint buf_flags;
2599 xfs_lsn_t lsn;
2442 2600
2443 /* 2601 /*
2444 * In this pass we only want to recover all the buffers which have 2602 * In this pass we only want to recover all the buffers which have
@@ -2463,10 +2621,17 @@ xlog_recover_buffer_pass2(
2463 error = bp->b_error; 2621 error = bp->b_error;
2464 if (error) { 2622 if (error) {
2465 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); 2623 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2466 xfs_buf_relse(bp); 2624 goto out_release;
2467 return error;
2468 } 2625 }
2469 2626
2627 /*
2628 * recover the buffer only if we get an LSN from it and it's less than
2629 * the lsn of the transaction we are replaying.
2630 */
2631 lsn = xlog_recover_get_buf_lsn(mp, bp);
2632 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
2633 goto out_release;
2634
2470 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 2635 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2471 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2636 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2472 } else if (buf_f->blf_flags & 2637 } else if (buf_f->blf_flags &
@@ -2476,7 +2641,7 @@ xlog_recover_buffer_pass2(
2476 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2641 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2477 } 2642 }
2478 if (error) 2643 if (error)
2479 return XFS_ERROR(error); 2644 goto out_release;
2480 2645
2481 /* 2646 /*
2482 * Perform delayed write on the buffer. Asynchronous writes will be 2647 * Perform delayed write on the buffer. Asynchronous writes will be
@@ -2505,15 +2670,93 @@ xlog_recover_buffer_pass2(
2505 xfs_buf_delwri_queue(bp, buffer_list); 2670 xfs_buf_delwri_queue(bp, buffer_list);
2506 } 2671 }
2507 2672
2673out_release:
2508 xfs_buf_relse(bp); 2674 xfs_buf_relse(bp);
2509 return error; 2675 return error;
2510} 2676}
2511 2677
2678/*
2679 * Inode fork owner changes
2680 *
2681 * If we have been told that we have to reparent the inode fork, it's because an
2682 * extent swap operation on a CRC enabled filesystem has been done and we are
 2683 * replaying it. We need to walk the BMBT of the appropriate fork and change
 2684 * the owner recorded in its blocks.
2685 *
2686 * The complexity here is that we don't have an inode context to work with, so
2687 * after we've replayed the inode we need to instantiate one. This is where the
2688 * fun begins.
2689 *
2690 * We are in the middle of log recovery, so we can't run transactions. That
2691 * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2692 * that will result in the corresponding iput() running the inode through
2693 * xfs_inactive(). If we've just replayed an inode core that changes the link
2694 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2695 * transactions (bad!).
2696 *
2697 * So, to avoid this, we instantiate an inode directly from the inode core we've
2698 * just recovered. We have the buffer still locked, and all we really need to
2699 * instantiate is the inode core and the forks being modified. We can do this
2700 * manually, then run the inode btree owner change, and then tear down the
2701 * xfs_inode without having to run any transactions at all.
2702 *
 2703 * Also, because we don't have a transaction context available here but need
 2704 * to gather all the buffers we modify for writeback, we pass the buffer_list
 2705 * to the operation for it to use instead.
2706 */
2707
2708STATIC int
2709xfs_recover_inode_owner_change(
2710 struct xfs_mount *mp,
2711 struct xfs_dinode *dip,
2712 struct xfs_inode_log_format *in_f,
2713 struct list_head *buffer_list)
2714{
2715 struct xfs_inode *ip;
2716 int error;
2717
2718 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2719
2720 ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2721 if (!ip)
2722 return ENOMEM;
2723
2724 /* instantiate the inode */
2725 xfs_dinode_from_disk(&ip->i_d, dip);
2726 ASSERT(ip->i_d.di_version >= 3);
2727
2728 error = xfs_iformat_fork(ip, dip);
2729 if (error)
2730 goto out_free_ip;
2731
2732
2733 if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2734 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2735 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2736 ip->i_ino, buffer_list);
2737 if (error)
2738 goto out_free_ip;
2739 }
2740
2741 if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2742 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2743 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2744 ip->i_ino, buffer_list);
2745 if (error)
2746 goto out_free_ip;
2747 }
2748
2749out_free_ip:
2750 xfs_inode_free(ip);
2751 return error;
2752}
2753
2512STATIC int 2754STATIC int
2513xlog_recover_inode_pass2( 2755xlog_recover_inode_pass2(
2514 struct xlog *log, 2756 struct xlog *log,
2515 struct list_head *buffer_list, 2757 struct list_head *buffer_list,
2516 struct xlog_recover_item *item) 2758 struct xlog_recover_item *item,
2759 xfs_lsn_t current_lsn)
2517{ 2760{
2518 xfs_inode_log_format_t *in_f; 2761 xfs_inode_log_format_t *in_f;
2519 xfs_mount_t *mp = log->l_mp; 2762 xfs_mount_t *mp = log->l_mp;
@@ -2560,8 +2803,7 @@ xlog_recover_inode_pass2(
2560 error = bp->b_error; 2803 error = bp->b_error;
2561 if (error) { 2804 if (error) {
2562 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); 2805 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2563 xfs_buf_relse(bp); 2806 goto out_release;
2564 goto error;
2565 } 2807 }
2566 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2808 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2567 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); 2809 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2571,25 +2813,40 @@ xlog_recover_inode_pass2(
2571 * like an inode! 2813 * like an inode!
2572 */ 2814 */
2573 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { 2815 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2574 xfs_buf_relse(bp);
2575 xfs_alert(mp, 2816 xfs_alert(mp,
2576 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", 2817 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2577 __func__, dip, bp, in_f->ilf_ino); 2818 __func__, dip, bp, in_f->ilf_ino);
2578 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2819 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2579 XFS_ERRLEVEL_LOW, mp); 2820 XFS_ERRLEVEL_LOW, mp);
2580 error = EFSCORRUPTED; 2821 error = EFSCORRUPTED;
2581 goto error; 2822 goto out_release;
2582 } 2823 }
2583 dicp = item->ri_buf[1].i_addr; 2824 dicp = item->ri_buf[1].i_addr;
2584 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2825 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2585 xfs_buf_relse(bp);
2586 xfs_alert(mp, 2826 xfs_alert(mp,
2587 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", 2827 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2588 __func__, item, in_f->ilf_ino); 2828 __func__, item, in_f->ilf_ino);
2589 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2829 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2590 XFS_ERRLEVEL_LOW, mp); 2830 XFS_ERRLEVEL_LOW, mp);
2591 error = EFSCORRUPTED; 2831 error = EFSCORRUPTED;
2592 goto error; 2832 goto out_release;
2833 }
2834
2835 /*
2836 * If the inode has an LSN in it, recover the inode only if it's less
2837 * than the lsn of the transaction we are replaying. Note: we still
2838 * need to replay an owner change even though the inode is more recent
 2838 * than the transaction, as there is no guarantee that all the btree
 2839 * blocks are also more recent than this transaction.
2841 */
2842 if (dip->di_version >= 3) {
2843 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
2844
2845 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2846 trace_xfs_log_recover_inode_skip(log, in_f);
2847 error = 0;
2848 goto out_owner_change;
2849 }
2593 } 2850 }
2594 2851
2595 /* 2852 /*
@@ -2610,10 +2867,9 @@ xlog_recover_inode_pass2(
2610 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2867 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2611 /* do nothing */ 2868 /* do nothing */
2612 } else { 2869 } else {
2613 xfs_buf_relse(bp);
2614 trace_xfs_log_recover_inode_skip(log, in_f); 2870 trace_xfs_log_recover_inode_skip(log, in_f);
2615 error = 0; 2871 error = 0;
2616 goto error; 2872 goto out_release;
2617 } 2873 }
2618 } 2874 }
2619 2875
@@ -2625,13 +2881,12 @@ xlog_recover_inode_pass2(
2625 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2881 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2626 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 2882 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2627 XFS_ERRLEVEL_LOW, mp, dicp); 2883 XFS_ERRLEVEL_LOW, mp, dicp);
2628 xfs_buf_relse(bp);
2629 xfs_alert(mp, 2884 xfs_alert(mp,
2630 "%s: Bad regular inode log record, rec ptr 0x%p, " 2885 "%s: Bad regular inode log record, rec ptr 0x%p, "
2631 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2886 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2632 __func__, item, dip, bp, in_f->ilf_ino); 2887 __func__, item, dip, bp, in_f->ilf_ino);
2633 error = EFSCORRUPTED; 2888 error = EFSCORRUPTED;
2634 goto error; 2889 goto out_release;
2635 } 2890 }
2636 } else if (unlikely(S_ISDIR(dicp->di_mode))) { 2891 } else if (unlikely(S_ISDIR(dicp->di_mode))) {
2637 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2892 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2639,19 +2894,17 @@ xlog_recover_inode_pass2(
2639 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2894 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2640 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 2895 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2641 XFS_ERRLEVEL_LOW, mp, dicp); 2896 XFS_ERRLEVEL_LOW, mp, dicp);
2642 xfs_buf_relse(bp);
2643 xfs_alert(mp, 2897 xfs_alert(mp,
2644 "%s: Bad dir inode log record, rec ptr 0x%p, " 2898 "%s: Bad dir inode log record, rec ptr 0x%p, "
2645 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2899 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2646 __func__, item, dip, bp, in_f->ilf_ino); 2900 __func__, item, dip, bp, in_f->ilf_ino);
2647 error = EFSCORRUPTED; 2901 error = EFSCORRUPTED;
2648 goto error; 2902 goto out_release;
2649 } 2903 }
2650 } 2904 }
2651 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2905 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2652 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 2906 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2653 XFS_ERRLEVEL_LOW, mp, dicp); 2907 XFS_ERRLEVEL_LOW, mp, dicp);
2654 xfs_buf_relse(bp);
2655 xfs_alert(mp, 2908 xfs_alert(mp,
2656 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2909 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2657 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2910 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
@@ -2659,29 +2912,27 @@ xlog_recover_inode_pass2(
2659 dicp->di_nextents + dicp->di_anextents, 2912 dicp->di_nextents + dicp->di_anextents,
2660 dicp->di_nblocks); 2913 dicp->di_nblocks);
2661 error = EFSCORRUPTED; 2914 error = EFSCORRUPTED;
2662 goto error; 2915 goto out_release;
2663 } 2916 }
2664 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2917 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2665 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 2918 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2666 XFS_ERRLEVEL_LOW, mp, dicp); 2919 XFS_ERRLEVEL_LOW, mp, dicp);
2667 xfs_buf_relse(bp);
2668 xfs_alert(mp, 2920 xfs_alert(mp,
2669 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2921 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2670 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, 2922 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2671 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2923 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2672 error = EFSCORRUPTED; 2924 error = EFSCORRUPTED;
2673 goto error; 2925 goto out_release;
2674 } 2926 }
2675 isize = xfs_icdinode_size(dicp->di_version); 2927 isize = xfs_icdinode_size(dicp->di_version);
2676 if (unlikely(item->ri_buf[1].i_len > isize)) { 2928 if (unlikely(item->ri_buf[1].i_len > isize)) {
2677 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2929 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2678 XFS_ERRLEVEL_LOW, mp, dicp); 2930 XFS_ERRLEVEL_LOW, mp, dicp);
2679 xfs_buf_relse(bp);
2680 xfs_alert(mp, 2931 xfs_alert(mp,
2681 "%s: Bad inode log record length %d, rec ptr 0x%p", 2932 "%s: Bad inode log record length %d, rec ptr 0x%p",
2682 __func__, item->ri_buf[1].i_len, item); 2933 __func__, item->ri_buf[1].i_len, item);
2683 error = EFSCORRUPTED; 2934 error = EFSCORRUPTED;
2684 goto error; 2935 goto out_release;
2685 } 2936 }
2686 2937
2687 /* The core is in in-core format */ 2938 /* The core is in in-core format */
@@ -2707,7 +2958,7 @@ xlog_recover_inode_pass2(
2707 } 2958 }
2708 2959
2709 if (in_f->ilf_size == 2) 2960 if (in_f->ilf_size == 2)
2710 goto write_inode_buffer; 2961 goto out_owner_change;
2711 len = item->ri_buf[2].i_len; 2962 len = item->ri_buf[2].i_len;
2712 src = item->ri_buf[2].i_addr; 2963 src = item->ri_buf[2].i_addr;
2713 ASSERT(in_f->ilf_size <= 4); 2964 ASSERT(in_f->ilf_size <= 4);
@@ -2768,19 +3019,23 @@ xlog_recover_inode_pass2(
2768 default: 3019 default:
2769 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 3020 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2770 ASSERT(0); 3021 ASSERT(0);
2771 xfs_buf_relse(bp);
2772 error = EIO; 3022 error = EIO;
2773 goto error; 3023 goto out_release;
2774 } 3024 }
2775 } 3025 }
2776 3026
2777write_inode_buffer: 3027out_owner_change:
3028 if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
3029 error = xfs_recover_inode_owner_change(mp, dip, in_f,
3030 buffer_list);
2778 /* re-generate the checksum. */ 3031 /* re-generate the checksum. */
2779 xfs_dinode_calc_crc(log->l_mp, dip); 3032 xfs_dinode_calc_crc(log->l_mp, dip);
2780 3033
2781 ASSERT(bp->b_target->bt_mount == mp); 3034 ASSERT(bp->b_target->bt_mount == mp);
2782 bp->b_iodone = xlog_recover_iodone; 3035 bp->b_iodone = xlog_recover_iodone;
2783 xfs_buf_delwri_queue(bp, buffer_list); 3036 xfs_buf_delwri_queue(bp, buffer_list);
3037
3038out_release:
2784 xfs_buf_relse(bp); 3039 xfs_buf_relse(bp);
2785error: 3040error:
2786 if (need_free) 3041 if (need_free)
@@ -2822,7 +3077,8 @@ STATIC int
2822xlog_recover_dquot_pass2( 3077xlog_recover_dquot_pass2(
2823 struct xlog *log, 3078 struct xlog *log,
2824 struct list_head *buffer_list, 3079 struct list_head *buffer_list,
2825 struct xlog_recover_item *item) 3080 struct xlog_recover_item *item,
3081 xfs_lsn_t current_lsn)
2826{ 3082{
2827 xfs_mount_t *mp = log->l_mp; 3083 xfs_mount_t *mp = log->l_mp;
2828 xfs_buf_t *bp; 3084 xfs_buf_t *bp;
@@ -2896,6 +3152,19 @@ xlog_recover_dquot_pass2(
2896 return XFS_ERROR(EIO); 3152 return XFS_ERROR(EIO);
2897 } 3153 }
2898 3154
3155 /*
3156 * If the dquot has an LSN in it, recover the dquot only if it's less
3157 * than the lsn of the transaction we are replaying.
3158 */
3159 if (xfs_sb_version_hascrc(&mp->m_sb)) {
3160 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3161 xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
3162
3163 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3164 goto out_release;
3165 }
3166 }
3167
2899 memcpy(ddq, recddq, item->ri_buf[1].i_len); 3168 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2900 if (xfs_sb_version_hascrc(&mp->m_sb)) { 3169 if (xfs_sb_version_hascrc(&mp->m_sb)) {
2901 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), 3170 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
@@ -2906,9 +3175,10 @@ xlog_recover_dquot_pass2(
2906 ASSERT(bp->b_target->bt_mount == mp); 3175 ASSERT(bp->b_target->bt_mount == mp);
2907 bp->b_iodone = xlog_recover_iodone; 3176 bp->b_iodone = xlog_recover_iodone;
2908 xfs_buf_delwri_queue(bp, buffer_list); 3177 xfs_buf_delwri_queue(bp, buffer_list);
2909 xfs_buf_relse(bp);
2910 3178
2911 return (0); 3179out_release:
3180 xfs_buf_relse(bp);
3181 return 0;
2912} 3182}
2913 3183
2914/* 3184/*
@@ -3116,6 +3386,106 @@ xlog_recover_free_trans(
3116 kmem_free(trans); 3386 kmem_free(trans);
3117} 3387}
3118 3388
3389STATIC void
3390xlog_recover_buffer_ra_pass2(
3391 struct xlog *log,
3392 struct xlog_recover_item *item)
3393{
3394 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
3395 struct xfs_mount *mp = log->l_mp;
3396
3397 if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3398 buf_f->blf_len, buf_f->blf_flags)) {
3399 return;
3400 }
3401
3402 xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3403 buf_f->blf_len, NULL);
3404}
3405
3406STATIC void
3407xlog_recover_inode_ra_pass2(
3408 struct xlog *log,
3409 struct xlog_recover_item *item)
3410{
3411 struct xfs_inode_log_format ilf_buf;
3412 struct xfs_inode_log_format *ilfp;
3413 struct xfs_mount *mp = log->l_mp;
3414 int error;
3415
3416 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3417 ilfp = item->ri_buf[0].i_addr;
3418 } else {
3419 ilfp = &ilf_buf;
3420 memset(ilfp, 0, sizeof(*ilfp));
3421 error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3422 if (error)
3423 return;
3424 }
3425
3426 if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3427 return;
3428
3429 xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3430 ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3431}
3432
3433STATIC void
3434xlog_recover_dquot_ra_pass2(
3435 struct xlog *log,
3436 struct xlog_recover_item *item)
3437{
3438 struct xfs_mount *mp = log->l_mp;
3439 struct xfs_disk_dquot *recddq;
3440 struct xfs_dq_logformat *dq_f;
3441 uint type;
3442
3443
3444 if (mp->m_qflags == 0)
3445 return;
3446
3447 recddq = item->ri_buf[1].i_addr;
3448 if (recddq == NULL)
3449 return;
3450 if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
3451 return;
3452
3453 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3454 ASSERT(type);
3455 if (log->l_quotaoffs_flag & type)
3456 return;
3457
3458 dq_f = item->ri_buf[0].i_addr;
3459 ASSERT(dq_f);
3460 ASSERT(dq_f->qlf_len == 1);
3461
3462 xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
3463 XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
3464}
3465
3466STATIC void
3467xlog_recover_ra_pass2(
3468 struct xlog *log,
3469 struct xlog_recover_item *item)
3470{
3471 switch (ITEM_TYPE(item)) {
3472 case XFS_LI_BUF:
3473 xlog_recover_buffer_ra_pass2(log, item);
3474 break;
3475 case XFS_LI_INODE:
3476 xlog_recover_inode_ra_pass2(log, item);
3477 break;
3478 case XFS_LI_DQUOT:
3479 xlog_recover_dquot_ra_pass2(log, item);
3480 break;
3481 case XFS_LI_EFI:
3482 case XFS_LI_EFD:
3483 case XFS_LI_QUOTAOFF:
3484 default:
3485 break;
3486 }
3487}
3488
3119STATIC int 3489STATIC int
3120xlog_recover_commit_pass1( 3490xlog_recover_commit_pass1(
3121 struct xlog *log, 3491 struct xlog *log,
@@ -3155,15 +3525,18 @@ xlog_recover_commit_pass2(
3155 3525
3156 switch (ITEM_TYPE(item)) { 3526 switch (ITEM_TYPE(item)) {
3157 case XFS_LI_BUF: 3527 case XFS_LI_BUF:
3158 return xlog_recover_buffer_pass2(log, buffer_list, item); 3528 return xlog_recover_buffer_pass2(log, buffer_list, item,
3529 trans->r_lsn);
3159 case XFS_LI_INODE: 3530 case XFS_LI_INODE:
3160 return xlog_recover_inode_pass2(log, buffer_list, item); 3531 return xlog_recover_inode_pass2(log, buffer_list, item,
3532 trans->r_lsn);
3161 case XFS_LI_EFI: 3533 case XFS_LI_EFI:
3162 return xlog_recover_efi_pass2(log, item, trans->r_lsn); 3534 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
3163 case XFS_LI_EFD: 3535 case XFS_LI_EFD:
3164 return xlog_recover_efd_pass2(log, item); 3536 return xlog_recover_efd_pass2(log, item);
3165 case XFS_LI_DQUOT: 3537 case XFS_LI_DQUOT:
3166 return xlog_recover_dquot_pass2(log, buffer_list, item); 3538 return xlog_recover_dquot_pass2(log, buffer_list, item,
3539 trans->r_lsn);
3167 case XFS_LI_ICREATE: 3540 case XFS_LI_ICREATE:
3168 return xlog_recover_do_icreate_pass2(log, buffer_list, item); 3541 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3169 case XFS_LI_QUOTAOFF: 3542 case XFS_LI_QUOTAOFF:
@@ -3177,6 +3550,26 @@ xlog_recover_commit_pass2(
3177 } 3550 }
3178} 3551}
3179 3552
3553STATIC int
3554xlog_recover_items_pass2(
3555 struct xlog *log,
3556 struct xlog_recover *trans,
3557 struct list_head *buffer_list,
3558 struct list_head *item_list)
3559{
3560 struct xlog_recover_item *item;
3561 int error = 0;
3562
3563 list_for_each_entry(item, item_list, ri_list) {
3564 error = xlog_recover_commit_pass2(log, trans,
3565 buffer_list, item);
3566 if (error)
3567 return error;
3568 }
3569
3570 return error;
3571}
3572
3180/* 3573/*
3181 * Perform the transaction. 3574 * Perform the transaction.
3182 * 3575 *
@@ -3189,9 +3582,16 @@ xlog_recover_commit_trans(
3189 struct xlog_recover *trans, 3582 struct xlog_recover *trans,
3190 int pass) 3583 int pass)
3191{ 3584{
3192 int error = 0, error2; 3585 int error = 0;
3193 xlog_recover_item_t *item; 3586 int error2;
3194 LIST_HEAD (buffer_list); 3587 int items_queued = 0;
3588 struct xlog_recover_item *item;
3589 struct xlog_recover_item *next;
3590 LIST_HEAD (buffer_list);
3591 LIST_HEAD (ra_list);
3592 LIST_HEAD (done_list);
3593
3594 #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
3195 3595
3196 hlist_del(&trans->r_list); 3596 hlist_del(&trans->r_list);
3197 3597
@@ -3199,14 +3599,22 @@ xlog_recover_commit_trans(
3199 if (error) 3599 if (error)
3200 return error; 3600 return error;
3201 3601
3202 list_for_each_entry(item, &trans->r_itemq, ri_list) { 3602 list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
3203 switch (pass) { 3603 switch (pass) {
3204 case XLOG_RECOVER_PASS1: 3604 case XLOG_RECOVER_PASS1:
3205 error = xlog_recover_commit_pass1(log, trans, item); 3605 error = xlog_recover_commit_pass1(log, trans, item);
3206 break; 3606 break;
3207 case XLOG_RECOVER_PASS2: 3607 case XLOG_RECOVER_PASS2:
3208 error = xlog_recover_commit_pass2(log, trans, 3608 xlog_recover_ra_pass2(log, item);
3209 &buffer_list, item); 3609 list_move_tail(&item->ri_list, &ra_list);
3610 items_queued++;
3611 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
3612 error = xlog_recover_items_pass2(log, trans,
3613 &buffer_list, &ra_list);
3614 list_splice_tail_init(&ra_list, &done_list);
3615 items_queued = 0;
3616 }
3617
3210 break; 3618 break;
3211 default: 3619 default:
3212 ASSERT(0); 3620 ASSERT(0);
@@ -3216,9 +3624,19 @@ xlog_recover_commit_trans(
3216 goto out; 3624 goto out;
3217 } 3625 }
3218 3626
3627out:
3628 if (!list_empty(&ra_list)) {
3629 if (!error)
3630 error = xlog_recover_items_pass2(log, trans,
3631 &buffer_list, &ra_list);
3632 list_splice_tail_init(&ra_list, &done_list);
3633 }
3634
3635 if (!list_empty(&done_list))
3636 list_splice_init(&done_list, &trans->r_itemq);
3637
3219 xlog_recover_free_trans(trans); 3638 xlog_recover_free_trans(trans);
3220 3639
3221out:
3222 error2 = xfs_buf_delwri_submit(&buffer_list); 3640 error2 = xfs_buf_delwri_submit(&buffer_list);
3223 return error ? error : error2; 3641 return error ? error : error2;
3224} 3642}
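
Pulling the interleaved old/new columns apart, the rewritten commit loop is a simple prefetch pipeline: each item's buffer read is started asynchronously, and once XLOG_RECOVER_COMMIT_QUEUE_MAX items are queued the whole batch is replayed while the next batch's reads can already be in flight. A condensed sketch of just that pass-2 path (pass-1 handling and delwri submission elided):

	struct xlog_recover_item *item, *next;
	LIST_HEAD(buffer_list);
	LIST_HEAD(ra_list);
	LIST_HEAD(done_list);
	int queued = 0, error = 0;

	list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
		xlog_recover_ra_pass2(log, item);	/* async prefetch */
		list_move_tail(&item->ri_list, &ra_list);
		if (++queued < XLOG_RECOVER_COMMIT_QUEUE_MAX)
			continue;
		/* replay the batch; its buffers should now be cached */
		error = xlog_recover_items_pass2(log, trans,
						 &buffer_list, &ra_list);
		list_splice_tail_init(&ra_list, &done_list);
		queued = 0;
		if (error)
			break;
	}
	if (!error && !list_empty(&ra_list))
		error = xlog_recover_items_pass2(log, trans,
						 &buffer_list, &ra_list);
	list_splice_tail_init(&ra_list, &done_list);
	/* rejoin so xlog_recover_free_trans() frees every item */
	list_splice_init(&done_list, &trans->r_itemq);
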
@@ -3376,7 +3794,7 @@ xlog_recover_process_efi(
3376 } 3794 }
3377 3795
3378 tp = xfs_trans_alloc(mp, 0); 3796 tp = xfs_trans_alloc(mp, 0);
3379 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); 3797 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
3380 if (error) 3798 if (error)
3381 goto abort_error; 3799 goto abort_error;
3382 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); 3800 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
@@ -3482,8 +3900,7 @@ xlog_recover_clear_agi_bucket(
3482 int error; 3900 int error;
3483 3901
3484 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3902 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3485 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 3903 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
3486 0, 0, 0);
3487 if (error) 3904 if (error)
3488 goto out_abort; 3905 goto out_abort;
3489 3906
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
new file mode 100644
index 000000000000..bbcec0bbc12d
--- /dev/null
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (c) 2013 Jie Liu.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_log.h"
21#include "xfs_trans.h"
22#include "xfs_ag.h"
23#include "xfs_sb.h"
24#include "xfs_mount.h"
25#include "xfs_trans_space.h"
26#include "xfs_bmap_btree.h"
27#include "xfs_inode.h"
28#include "xfs_da_btree.h"
29#include "xfs_attr_leaf.h"
30
31/*
32 * Calculate the maximum length in bytes that would be required for a local
 33 * attribute value, as large out-of-line attributes are not logged.
34 */
35STATIC int
36xfs_log_calc_max_attrsetm_res(
37 struct xfs_mount *mp)
38{
39 int size;
40 int nblks;
41
42 size = xfs_attr_leaf_entsize_local_max(mp->m_sb.sb_blocksize) -
43 MAXNAMELEN - 1;
44 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
45 nblks += XFS_B_TO_FSB(mp, size);
46 nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
47
48 return M_RES(mp)->tr_attrsetm.tr_logres +
49 M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
50}
51
52/*
 53 * Iterate over the log space reservation table and return the largest
 54 * reservation, based on the per-transaction values that were pre-calculated
 55 * at mount time.
56 */
57STATIC void
58xfs_log_get_max_trans_res(
59 struct xfs_mount *mp,
60 struct xfs_trans_res *max_resp)
61{
62 struct xfs_trans_res *resp;
63 struct xfs_trans_res *end_resp;
64 int log_space = 0;
65 int attr_space;
66
67 attr_space = xfs_log_calc_max_attrsetm_res(mp);
68
69 resp = (struct xfs_trans_res *)M_RES(mp);
70 end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
71 for (; resp < end_resp; resp++) {
72 int tmp = resp->tr_logcount > 1 ?
73 resp->tr_logres * resp->tr_logcount :
74 resp->tr_logres;
75 if (log_space < tmp) {
76 log_space = tmp;
77 *max_resp = *resp; /* struct copy */
78 }
79 }
80
81 if (attr_space > log_space) {
82 *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
83 max_resp->tr_logres = attr_space;
84 }
85}
86
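
The cast-and-walk above works only because struct xfs_trans_resv is assumed to be a plain sequence of xfs_trans_res members: M_RES(mp) + 1 points one whole table past the start, so stepping an xfs_trans_res pointer between the two bounds visits every reservation in turn. The same idiom in miniature, with hypothetical struct names and the same no-padding assumption:

	struct res	 { int logres; int logcount; };
	struct res_table { struct res write; struct res itruncate; struct res rename; };

	static struct res find_max(const struct res_table *t)
	{
		const struct res *p   = (const struct res *)t;
		const struct res *end = (const struct res *)(t + 1);
		struct res max = *p;

		for (p++; p < end; p++)
			if (p->logres > max.logres)
				max = *p;	/* struct copy, as in the patch */
		return max;
	}
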
87/*
88 * Calculate the minimum valid log size for the given superblock configuration.
89 * Used to calculate the minimum log size at mkfs time, and to determine if
90 * the log is large enough or not at mount time. Returns the minimum size in
91 * filesystem block size units.
92 */
93int
94xfs_log_calc_minimum_size(
95 struct xfs_mount *mp)
96{
97 struct xfs_trans_res tres = {0};
98 int max_logres;
99 int min_logblks = 0;
100 int lsunit = 0;
101
102 xfs_log_get_max_trans_res(mp, &tres);
103
104 max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
105 if (tres.tr_logcount > 1)
106 max_logres *= tres.tr_logcount;
107
108 if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
109 lsunit = BTOBB(mp->m_sb.sb_logsunit);
110
111 /*
112 * Two factors should be taken into account for calculating the minimum
113 * log space.
114 * 1) The fundamental limitation is that no single transaction can be
 115 * larger than half the size of the log.
116 *
117 * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
118 * define, which is set to 3. That means we can definitely fit
 119 * 2 maximally sized transactions in the log. We'll use this same
120 * value here.
121 *
122 * 2) If the lsunit option is specified, a transaction requires 2 LSU
123 * for the reservation because there are two log writes that can
124 * require padding - the transaction data and the commit record which
125 * are written separately and both can require padding to the LSU.
126 * Consider that we can have an active CIL reservation holding 2*LSU,
 127 * but the CIL is not over a push threshold; in this case, if we
 128 * don't have enough log space for at least one new transaction, which
 129 * includes another 2*LSU in the reservation, we will loop forever in
 130 * the log space grant procedure, i.e.
 131 * xlog_grant_head_wait().
132 *
133 * Hence the log size needs to be able to contain two maximally sized
134 * and padded transactions, which is (2 * (2 * LSU + maxlres)).
135 *
 136 * Also, the log size should be a multiple of the log stripe unit, so round
 137 * it up to the lsunit boundary if lsunit is specified.
138 */
139 if (lsunit) {
140 min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
141 2 * lsunit;
142 } else
143 min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
144 min_logblks *= XFS_MIN_LOG_FACTOR;
145
146 return XFS_BB_TO_FSB(mp, min_logblks);
147}
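
A hypothetical worked example of the lsunit branch of the formula above (all numbers illustrative, not taken from any real filesystem):

	/*
	 * max_logres = 256 KiB  ->  BTOBB(256 KiB) = 512 basic blocks (BBs)
	 * lsunit     = 64 KiB   ->  BTOBB(64 KiB)  = 128 BBs
	 *
	 * min_logblks  = roundup(512, 128) + 2 * 128 = 512 + 256 = 768 BBs
	 * min_logblks *= XFS_MIN_LOG_FACTOR (3)      -> 2304 BBs
	 *
	 * With 4 KiB filesystem blocks (8 BBs each) that is 2304 / 8 = 288
	 * filesystem blocks, i.e. a minimum log of 1152 KiB for this
	 * configuration.
	 */
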
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2b0ba3581656..5dcc68019d1b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -17,7 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
@@ -25,8 +25,10 @@
25#include "xfs_trans_priv.h" 25#include "xfs_trans_priv.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h" 27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
29#include "xfs_da_btree.h"
30#include "xfs_dir2_format.h"
31#include "xfs_dir2.h"
30#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 33#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 34#include "xfs_ialloc_btree.h"
@@ -40,7 +42,6 @@
40#include "xfs_error.h" 42#include "xfs_error.h"
41#include "xfs_quota.h" 43#include "xfs_quota.h"
42#include "xfs_fsops.h" 44#include "xfs_fsops.h"
43#include "xfs_utils.h"
44#include "xfs_trace.h" 45#include "xfs_trace.h"
45#include "xfs_icache.h" 46#include "xfs_icache.h"
46#include "xfs_cksum.h" 47#include "xfs_cksum.h"
@@ -59,69 +60,6 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
59#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) 60#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
60#endif 61#endif
61 62
62static const struct {
63 short offset;
64 short type; /* 0 = integer
65 * 1 = binary / string (no translation)
66 */
67} xfs_sb_info[] = {
68 { offsetof(xfs_sb_t, sb_magicnum), 0 },
69 { offsetof(xfs_sb_t, sb_blocksize), 0 },
70 { offsetof(xfs_sb_t, sb_dblocks), 0 },
71 { offsetof(xfs_sb_t, sb_rblocks), 0 },
72 { offsetof(xfs_sb_t, sb_rextents), 0 },
73 { offsetof(xfs_sb_t, sb_uuid), 1 },
74 { offsetof(xfs_sb_t, sb_logstart), 0 },
75 { offsetof(xfs_sb_t, sb_rootino), 0 },
76 { offsetof(xfs_sb_t, sb_rbmino), 0 },
77 { offsetof(xfs_sb_t, sb_rsumino), 0 },
78 { offsetof(xfs_sb_t, sb_rextsize), 0 },
79 { offsetof(xfs_sb_t, sb_agblocks), 0 },
80 { offsetof(xfs_sb_t, sb_agcount), 0 },
81 { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
82 { offsetof(xfs_sb_t, sb_logblocks), 0 },
83 { offsetof(xfs_sb_t, sb_versionnum), 0 },
84 { offsetof(xfs_sb_t, sb_sectsize), 0 },
85 { offsetof(xfs_sb_t, sb_inodesize), 0 },
86 { offsetof(xfs_sb_t, sb_inopblock), 0 },
87 { offsetof(xfs_sb_t, sb_fname[0]), 1 },
88 { offsetof(xfs_sb_t, sb_blocklog), 0 },
89 { offsetof(xfs_sb_t, sb_sectlog), 0 },
90 { offsetof(xfs_sb_t, sb_inodelog), 0 },
91 { offsetof(xfs_sb_t, sb_inopblog), 0 },
92 { offsetof(xfs_sb_t, sb_agblklog), 0 },
93 { offsetof(xfs_sb_t, sb_rextslog), 0 },
94 { offsetof(xfs_sb_t, sb_inprogress), 0 },
95 { offsetof(xfs_sb_t, sb_imax_pct), 0 },
96 { offsetof(xfs_sb_t, sb_icount), 0 },
97 { offsetof(xfs_sb_t, sb_ifree), 0 },
98 { offsetof(xfs_sb_t, sb_fdblocks), 0 },
99 { offsetof(xfs_sb_t, sb_frextents), 0 },
100 { offsetof(xfs_sb_t, sb_uquotino), 0 },
101 { offsetof(xfs_sb_t, sb_gquotino), 0 },
102 { offsetof(xfs_sb_t, sb_qflags), 0 },
103 { offsetof(xfs_sb_t, sb_flags), 0 },
104 { offsetof(xfs_sb_t, sb_shared_vn), 0 },
105 { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
106 { offsetof(xfs_sb_t, sb_unit), 0 },
107 { offsetof(xfs_sb_t, sb_width), 0 },
108 { offsetof(xfs_sb_t, sb_dirblklog), 0 },
109 { offsetof(xfs_sb_t, sb_logsectlog), 0 },
110 { offsetof(xfs_sb_t, sb_logsectsize),0 },
111 { offsetof(xfs_sb_t, sb_logsunit), 0 },
112 { offsetof(xfs_sb_t, sb_features2), 0 },
113 { offsetof(xfs_sb_t, sb_bad_features2), 0 },
114 { offsetof(xfs_sb_t, sb_features_compat), 0 },
115 { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
116 { offsetof(xfs_sb_t, sb_features_incompat), 0 },
117 { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
118 { offsetof(xfs_sb_t, sb_crc), 0 },
119 { offsetof(xfs_sb_t, sb_pad), 0 },
120 { offsetof(xfs_sb_t, sb_pquotino), 0 },
121 { offsetof(xfs_sb_t, sb_lsn), 0 },
122 { sizeof(xfs_sb_t), 0 }
123};
124
125static DEFINE_MUTEX(xfs_uuid_table_mutex); 63static DEFINE_MUTEX(xfs_uuid_table_mutex);
126static int xfs_uuid_table_size; 64static int xfs_uuid_table_size;
127static uuid_t *xfs_uuid_table; 65static uuid_t *xfs_uuid_table;
@@ -197,64 +135,6 @@ xfs_uuid_unmount(
197} 135}
198 136
199 137
200/*
201 * Reference counting access wrappers to the perag structures.
202 * Because we never free per-ag structures, the only thing we
203 * have to protect against changes is the tree structure itself.
204 */
205struct xfs_perag *
206xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
207{
208 struct xfs_perag *pag;
209 int ref = 0;
210
211 rcu_read_lock();
212 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
213 if (pag) {
214 ASSERT(atomic_read(&pag->pag_ref) >= 0);
215 ref = atomic_inc_return(&pag->pag_ref);
216 }
217 rcu_read_unlock();
218 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
219 return pag;
220}
221
222/*
223 * search from @first to find the next perag with the given tag set.
224 */
225struct xfs_perag *
226xfs_perag_get_tag(
227 struct xfs_mount *mp,
228 xfs_agnumber_t first,
229 int tag)
230{
231 struct xfs_perag *pag;
232 int found;
233 int ref;
234
235 rcu_read_lock();
236 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
237 (void **)&pag, first, 1, tag);
238 if (found <= 0) {
239 rcu_read_unlock();
240 return NULL;
241 }
242 ref = atomic_inc_return(&pag->pag_ref);
243 rcu_read_unlock();
244 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
245 return pag;
246}
247
248void
249xfs_perag_put(struct xfs_perag *pag)
250{
251 int ref;
252
253 ASSERT(atomic_read(&pag->pag_ref) > 0);
254 ref = atomic_dec_return(&pag->pag_ref);
255 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
256}
257
258STATIC void 138STATIC void
259__xfs_free_perag( 139__xfs_free_perag(
260 struct rcu_head *head) 140 struct rcu_head *head)
@@ -307,184 +187,6 @@ xfs_sb_validate_fsb_count(
307 return 0; 187 return 0;
308} 188}
309 189
310/*
311 * Check the validity of the SB found.
312 */
313STATIC int
314xfs_mount_validate_sb(
315 xfs_mount_t *mp,
316 xfs_sb_t *sbp,
317 bool check_inprogress,
318 bool check_version)
319{
320
321 /*
322 * If the log device and data device have the
323 * same device number, the log is internal.
324 * Consequently, the sb_logstart should be non-zero. If
325 * we have a zero sb_logstart in this case, we may be trying to mount
326 * a volume filesystem in a non-volume manner.
327 */
328 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
329 xfs_warn(mp, "bad magic number");
330 return XFS_ERROR(EWRONGFS);
331 }
332
333
334 if (!xfs_sb_good_version(sbp)) {
335 xfs_warn(mp, "bad version");
336 return XFS_ERROR(EWRONGFS);
337 }
338
339 if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
340 (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
341 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
342 xfs_notice(mp,
343"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
344 return XFS_ERROR(EFSCORRUPTED);
345 }
346
347 /*
348 * Version 5 superblock feature mask validation. Reject combinations the
349 * kernel cannot support up front before checking anything else. For
350 * write validation, we don't need to check feature masks.
351 */
352 if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
353 xfs_alert(mp,
354"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
355"Use of these features in this kernel is at your own risk!");
356
357 if (xfs_sb_has_compat_feature(sbp,
358 XFS_SB_FEAT_COMPAT_UNKNOWN)) {
359 xfs_warn(mp,
360"Superblock has unknown compatible features (0x%x) enabled.\n"
361"Using a more recent kernel is recommended.",
362 (sbp->sb_features_compat &
363 XFS_SB_FEAT_COMPAT_UNKNOWN));
364 }
365
366 if (xfs_sb_has_ro_compat_feature(sbp,
367 XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
368 xfs_alert(mp,
369"Superblock has unknown read-only compatible features (0x%x) enabled.",
370 (sbp->sb_features_ro_compat &
371 XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
372 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
373 xfs_warn(mp,
374"Attempted to mount read-only compatible filesystem read-write.\n"
375"Filesystem can only be safely mounted read only.");
376 return XFS_ERROR(EINVAL);
377 }
378 }
379 if (xfs_sb_has_incompat_feature(sbp,
380 XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
381 xfs_warn(mp,
382"Superblock has unknown incompatible features (0x%x) enabled.\n"
383"Filesystem can not be safely mounted by this kernel.",
384 (sbp->sb_features_incompat &
385 XFS_SB_FEAT_INCOMPAT_UNKNOWN));
386 return XFS_ERROR(EINVAL);
387 }
388 }
389
390 if (unlikely(
391 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
392 xfs_warn(mp,
393 "filesystem is marked as having an external log; "
394 "specify logdev on the mount command line.");
395 return XFS_ERROR(EINVAL);
396 }
397
398 if (unlikely(
399 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
400 xfs_warn(mp,
401 "filesystem is marked as having an internal log; "
402 "do not specify logdev on the mount command line.");
403 return XFS_ERROR(EINVAL);
404 }
405
406 /*
407 * More sanity checking. Most of these were stolen directly from
408 * xfs_repair.
409 */
410 if (unlikely(
411 sbp->sb_agcount <= 0 ||
412 sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
413 sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
414 sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
415 sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
416 sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
417 sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
418 sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
419 sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
420 sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
421 sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
422 sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
423 sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
424 sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
425 sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
426 sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
427 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
428 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
429 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
430 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
431 sbp->sb_dblocks == 0 ||
432 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
433 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
434 XFS_CORRUPTION_ERROR("SB sanity check failed",
435 XFS_ERRLEVEL_LOW, mp, sbp);
436 return XFS_ERROR(EFSCORRUPTED);
437 }
438
439 /*
440 * Until this is fixed only page-sized or smaller data blocks work.
441 */
442 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
443 xfs_warn(mp,
444 "File system with blocksize %d bytes. "
445 "Only pagesize (%ld) or less will currently work.",
446 sbp->sb_blocksize, PAGE_SIZE);
447 return XFS_ERROR(ENOSYS);
448 }
449
450 /*
451 * Currently only very few inode sizes are supported.
452 */
453 switch (sbp->sb_inodesize) {
454 case 256:
455 case 512:
456 case 1024:
457 case 2048:
458 break;
459 default:
460 xfs_warn(mp, "inode size of %d bytes not supported",
461 sbp->sb_inodesize);
462 return XFS_ERROR(ENOSYS);
463 }
464
465 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
466 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
467 xfs_warn(mp,
468 "file system too large to be mounted on this system.");
469 return XFS_ERROR(EFBIG);
470 }
471
472 if (check_inprogress && sbp->sb_inprogress) {
473 xfs_warn(mp, "Offline file system operation in progress!");
474 return XFS_ERROR(EFSCORRUPTED);
475 }
476
477 /*
478 * Version 1 directory format has never worked on Linux.
479 */
480 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
481 xfs_warn(mp, "file system using version 1 directory format");
482 return XFS_ERROR(ENOSYS);
483 }
484
485 return 0;
486}
487
488int 190int
489xfs_initialize_perag( 191xfs_initialize_perag(
490 xfs_mount_t *mp, 192 xfs_mount_t *mp,
@@ -569,283 +271,15 @@ out_unwind:
569 return error; 271 return error;
570} 272}
571 273
572static void
573xfs_sb_quota_from_disk(struct xfs_sb *sbp)
574{
575 if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
576 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
577 XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
578 if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
579 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
580 XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
581 sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
582}
583
584void
585xfs_sb_from_disk(
586 struct xfs_sb *to,
587 xfs_dsb_t *from)
588{
589 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
590 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
591 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
592 to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
593 to->sb_rextents = be64_to_cpu(from->sb_rextents);
594 memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
595 to->sb_logstart = be64_to_cpu(from->sb_logstart);
596 to->sb_rootino = be64_to_cpu(from->sb_rootino);
597 to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
598 to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
599 to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
600 to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
601 to->sb_agcount = be32_to_cpu(from->sb_agcount);
602 to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
603 to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
604 to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
605 to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
606 to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
607 to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
608 memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
609 to->sb_blocklog = from->sb_blocklog;
610 to->sb_sectlog = from->sb_sectlog;
611 to->sb_inodelog = from->sb_inodelog;
612 to->sb_inopblog = from->sb_inopblog;
613 to->sb_agblklog = from->sb_agblklog;
614 to->sb_rextslog = from->sb_rextslog;
615 to->sb_inprogress = from->sb_inprogress;
616 to->sb_imax_pct = from->sb_imax_pct;
617 to->sb_icount = be64_to_cpu(from->sb_icount);
618 to->sb_ifree = be64_to_cpu(from->sb_ifree);
619 to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
620 to->sb_frextents = be64_to_cpu(from->sb_frextents);
621 to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
622 to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
623 to->sb_qflags = be16_to_cpu(from->sb_qflags);
624 to->sb_flags = from->sb_flags;
625 to->sb_shared_vn = from->sb_shared_vn;
626 to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
627 to->sb_unit = be32_to_cpu(from->sb_unit);
628 to->sb_width = be32_to_cpu(from->sb_width);
629 to->sb_dirblklog = from->sb_dirblklog;
630 to->sb_logsectlog = from->sb_logsectlog;
631 to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
632 to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
633 to->sb_features2 = be32_to_cpu(from->sb_features2);
634 to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
635 to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
636 to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
637 to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
638 to->sb_features_log_incompat =
639 be32_to_cpu(from->sb_features_log_incompat);
640 to->sb_pad = 0;
641 to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
642 to->sb_lsn = be64_to_cpu(from->sb_lsn);
643}
644
645static inline void
646xfs_sb_quota_to_disk(
647 xfs_dsb_t *to,
648 xfs_sb_t *from,
649 __int64_t *fields)
650{
651 __uint16_t qflags = from->sb_qflags;
652
653 if (*fields & XFS_SB_QFLAGS) {
654 /*
655 * The in-core version of sb_qflags do not have
656 * XFS_OQUOTA_* flags, whereas the on-disk version
657 * does. So, convert incore XFS_{PG}QUOTA_* flags
658 * to on-disk XFS_OQUOTA_* flags.
659 */
660 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
661 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
662
663 if (from->sb_qflags &
664 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
665 qflags |= XFS_OQUOTA_ENFD;
666 if (from->sb_qflags &
667 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
668 qflags |= XFS_OQUOTA_CHKD;
669 to->sb_qflags = cpu_to_be16(qflags);
670 *fields &= ~XFS_SB_QFLAGS;
671 }
672}
673
674/*
675 * Copy in core superblock to ondisk one.
676 *
677 * The fields argument is mask of superblock fields to copy.
678 */
679void
680xfs_sb_to_disk(
681 xfs_dsb_t *to,
682 xfs_sb_t *from,
683 __int64_t fields)
684{
685 xfs_caddr_t to_ptr = (xfs_caddr_t)to;
686 xfs_caddr_t from_ptr = (xfs_caddr_t)from;
687 xfs_sb_field_t f;
688 int first;
689 int size;
690
691 ASSERT(fields);
692 if (!fields)
693 return;
694
695 xfs_sb_quota_to_disk(to, from, &fields);
696 while (fields) {
697 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
698 first = xfs_sb_info[f].offset;
699 size = xfs_sb_info[f + 1].offset - first;
700
701 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
702
703 if (size == 1 || xfs_sb_info[f].type == 1) {
704 memcpy(to_ptr + first, from_ptr + first, size);
705 } else {
706 switch (size) {
707 case 2:
708 *(__be16 *)(to_ptr + first) =
709 cpu_to_be16(*(__u16 *)(from_ptr + first));
710 break;
711 case 4:
712 *(__be32 *)(to_ptr + first) =
713 cpu_to_be32(*(__u32 *)(from_ptr + first));
714 break;
715 case 8:
716 *(__be64 *)(to_ptr + first) =
717 cpu_to_be64(*(__u64 *)(from_ptr + first));
718 break;
719 default:
720 ASSERT(0);
721 }
722 }
723
724 fields &= ~(1LL << f);
725 }
726}
727
728static int
729xfs_sb_verify(
730 struct xfs_buf *bp,
731 bool check_version)
732{
733 struct xfs_mount *mp = bp->b_target->bt_mount;
734 struct xfs_sb sb;
735
736 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
737
738 /*
739 * Only check the in progress field for the primary superblock as
740 * mkfs.xfs doesn't clear it from secondary superblocks.
741 */
742 return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
743 check_version);
744}
745
746/*
747 * If the superblock has the CRC feature bit set or the CRC field is non-null,
748 * check that the CRC is valid. We check the CRC field is non-null because a
749 * single bit error could clear the feature bit and unused parts of the
750 * superblock are supposed to be zero. Hence a non-null crc field indicates that
751 * we've potentially lost a feature bit and we should check it anyway.
752 */
753static void
754xfs_sb_read_verify(
755 struct xfs_buf *bp)
756{
757 struct xfs_mount *mp = bp->b_target->bt_mount;
758 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
759 int error;
760
761 /*
762 * open code the version check to avoid needing to convert the entire
763 * superblock from disk order just to check the version number
764 */
765 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
766 (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
767 XFS_SB_VERSION_5) ||
768 dsb->sb_crc != 0)) {
769
770 if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
771 offsetof(struct xfs_sb, sb_crc))) {
772 error = EFSCORRUPTED;
773 goto out_error;
774 }
775 }
776 error = xfs_sb_verify(bp, true);
777
778out_error:
779 if (error) {
780 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
781 xfs_buf_ioerror(bp, error);
782 }
783}
784
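xfs_sb_read_verify() checks the CRC over the superblock sector with the checksum field itself excluded, by passing offsetof(struct xfs_sb, sb_crc) to xfs_verify_cksum(). A dependency-free userspace sketch of the same seal-and-verify idea: zero the 4-byte checksum slot in a scratch copy, run CRC32C (the Castagnoli polynomial XFS uses) over the whole buffer, and compare. The bitwise CRC32C below is slow but correct, and the offset 8 for the crc slot is made up for the example:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return ~crc;
}

/* verify a block (len <= 512 here) whose CRC is stored at byte crc_off */
static int verify_cksum(const uint8_t *buf, size_t len, size_t crc_off)
{
	uint8_t scratch[512];
	uint32_t want, got;

	memcpy(&want, buf + crc_off, sizeof(want));
	memcpy(scratch, buf, len);
	memset(scratch + crc_off, 0, sizeof(uint32_t));	/* exclude the field */
	got = crc32c(0, scratch, len);
	return got == want;
}

int main(void)
{
	uint8_t sector[512] = "pretend superblock";
	uint32_t crc;

	/* seal: compute with the crc slot zeroed, then store it */
	memset(sector + 8, 0, 4);
	crc = crc32c(0, sector, sizeof(sector));
	memcpy(sector + 8, &crc, 4);

	printf("valid=%d\n", verify_cksum(sector, sizeof(sector), 8));
	sector[100] ^= 1;	/* single-bit corruption */
	printf("valid=%d\n", verify_cksum(sector, sizeof(sector), 8));
	return 0;
}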
785/*
786 * We may be probed for a filesystem match, so we may not want to emit
787 * messages when the superblock buffer is not actually an XFS superblock.
 788 * If we find an XFS superblock, then run a normal, noisy mount because we are
789 * really going to mount it and want to know about errors.
790 */
791static void
792xfs_sb_quiet_read_verify(
793 struct xfs_buf *bp)
794{
795 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
796
797
798 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
799 /* XFS filesystem, verify noisily! */
800 xfs_sb_read_verify(bp);
801 return;
802 }
803 /* quietly fail */
804 xfs_buf_ioerror(bp, EWRONGFS);
805}
806
807static void
808xfs_sb_write_verify(
809 struct xfs_buf *bp)
810{
811 struct xfs_mount *mp = bp->b_target->bt_mount;
812 struct xfs_buf_log_item *bip = bp->b_fspriv;
813 int error;
814
815 error = xfs_sb_verify(bp, false);
816 if (error) {
817 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
818 xfs_buf_ioerror(bp, error);
819 return;
820 }
821
822 if (!xfs_sb_version_hascrc(&mp->m_sb))
823 return;
824
825 if (bip)
826 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
827
828 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
829 offsetof(struct xfs_sb, sb_crc));
830}
831
832const struct xfs_buf_ops xfs_sb_buf_ops = {
833 .verify_read = xfs_sb_read_verify,
834 .verify_write = xfs_sb_write_verify,
835};
836
837static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
838 .verify_read = xfs_sb_quiet_read_verify,
839 .verify_write = xfs_sb_write_verify,
840};
841
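The two ops tables differ only in the read verifier: probe reads get the quiet one, real mounts the noisy one, and both share the write verifier. The pattern is a per-buffer vtable of callbacks. A small sketch with hypothetical stand-ins for struct xfs_buf:

#include <stdio.h>

struct buf;
struct buf_ops {
	void (*verify_read)(struct buf *bp);
	void (*verify_write)(struct buf *bp);
};
struct buf {
	const struct buf_ops *ops;
	int error;
	unsigned magic;
};

#define SB_MAGIC 0x58465342u	/* "XFSB" */

static void read_verify(struct buf *bp)
{
	if (bp->magic != SB_MAGIC) {
		fprintf(stderr, "corruption detected\n");	/* noisy */
		bp->error = -1;
	}
}

static void quiet_read_verify(struct buf *bp)
{
	if (bp->magic == SB_MAGIC) {
		read_verify(bp);	/* really ours: verify noisily */
		return;
	}
	bp->error = -1;			/* not ours: fail silently */
}

static void write_verify(struct buf *bp) { (void)bp; }

static const struct buf_ops sb_ops = { read_verify, write_verify };
static const struct buf_ops sb_quiet_ops = { quiet_read_verify, write_verify };

int main(void)
{
	struct buf probe = { &sb_quiet_ops, 0, 0xdeadbeef };

	probe.ops->verify_read(&probe);		/* no message printed */
	printf("probe error=%d\n", probe.error);
	return 0;
}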
842/* 274/*
843 * xfs_readsb 275 * xfs_readsb
844 * 276 *
845 * Does the initial read of the superblock. 277 * Does the initial read of the superblock.
846 */ 278 */
847int 279int
848xfs_readsb(xfs_mount_t *mp, int flags) 280xfs_readsb(
281 struct xfs_mount *mp,
282 int flags)
849{ 283{
850 unsigned int sector_size; 284 unsigned int sector_size;
851 struct xfs_buf *bp; 285 struct xfs_buf *bp;
@@ -884,8 +318,8 @@ reread:
884 * Initialize the mount structure from the superblock. 318 * Initialize the mount structure from the superblock.
885 */ 319 */
886 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 320 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
887
888 xfs_sb_quota_from_disk(&mp->m_sb); 321 xfs_sb_quota_from_disk(&mp->m_sb);
322
889 /* 323 /*
890 * We must be able to do sector-sized and sector-aligned IO. 324 * We must be able to do sector-sized and sector-aligned IO.
891 */ 325 */
@@ -922,107 +356,6 @@ release_buf:
922 return error; 356 return error;
923} 357}
924 358
925
926/*
927 * xfs_mount_common
928 *
929 * Mount initialization code establishing various mount
930 * fields from the superblock associated with the given
931 * mount structure
932 */
933STATIC void
934xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
935{
936 mp->m_agfrotor = mp->m_agirotor = 0;
937 spin_lock_init(&mp->m_agirotor_lock);
938 mp->m_maxagi = mp->m_sb.sb_agcount;
939 mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
940 mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
941 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
942 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
943 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
944 mp->m_blockmask = sbp->sb_blocksize - 1;
945 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
946 mp->m_blockwmask = mp->m_blockwsize - 1;
947
948 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
949 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
950 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
951 mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
952
953 mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
954 mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
955 mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
956 mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
957
958 mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
959 mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
960 mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
961 mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
962
963 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
964 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
965 sbp->sb_inopblock);
966 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
967}
968
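Several of the values xfs_mount_common() derives are shift/mask forms of the superblock geometry: m_agno_log, for instance, is the number of bits needed to hold any AG number, and m_blockmask works only because sb_blocksize is a power of two. A sketch of those two computations, with xfs_highbit32() open-coded via a GCC/clang builtin:

#include <stdint.h>
#include <stdio.h>

static int highbit32(uint32_t v)
{
	return 31 - __builtin_clz(v);	/* undefined for v == 0 */
}

int main(void)
{
	uint32_t agcount = 5;		/* hypothetical: 5 AGs */
	uint32_t blocksize = 4096;

	/* bits needed to address any AG number 0..agcount-1 */
	int agno_log = highbit32(agcount - 1) + 1;
	uint32_t blockmask = blocksize - 1;

	printf("agno_log=%d blockmask=0x%x\n", agno_log, blockmask);
	/* agno_log=3: AG numbers 0..4 fit in 3 bits */
	return 0;
}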
969/*
970 * xfs_initialize_perag_data
971 *
972 * Read in each per-ag structure so we can count up the number of
973 * allocated inodes, free inodes and used filesystem blocks as this
974 * information is no longer persistent in the superblock. Once we have
975 * this information, write it into the in-core superblock structure.
976 */
977STATIC int
978xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
979{
980 xfs_agnumber_t index;
981 xfs_perag_t *pag;
982 xfs_sb_t *sbp = &mp->m_sb;
983 uint64_t ifree = 0;
984 uint64_t ialloc = 0;
985 uint64_t bfree = 0;
986 uint64_t bfreelst = 0;
987 uint64_t btree = 0;
988 int error;
989
990 for (index = 0; index < agcount; index++) {
991 /*
992 * read the agf, then the agi. This gets us
993 * all the information we need and populates the
994 * per-ag structures for us.
995 */
996 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
997 if (error)
998 return error;
999
1000 error = xfs_ialloc_pagi_init(mp, NULL, index);
1001 if (error)
1002 return error;
1003 pag = xfs_perag_get(mp, index);
1004 ifree += pag->pagi_freecount;
1005 ialloc += pag->pagi_count;
1006 bfree += pag->pagf_freeblks;
1007 bfreelst += pag->pagf_flcount;
1008 btree += pag->pagf_btreeblks;
1009 xfs_perag_put(pag);
1010 }
1011 /*
1012 * Overwrite incore superblock counters with just-read data
1013 */
1014 spin_lock(&mp->m_sb_lock);
1015 sbp->sb_ifree = ifree;
1016 sbp->sb_icount = ialloc;
1017 sbp->sb_fdblocks = bfree + bfreelst + btree;
1018 spin_unlock(&mp->m_sb_lock);
1019
1020 /* Fixup the per-cpu counters as well. */
1021 xfs_icsb_reinit_counters(mp);
1022
1023 return 0;
1024}
1025
1026/* 359/*
1027 * Update alignment values based on mount options and sb values 360 * Update alignment values based on mount options and sb values
1028 */ 361 */
@@ -1194,7 +527,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
1194} 527}
1195 528
1196/* 529/*
1197 * Check that the data (and log if separate) are an ok size. 530 * Check that the data (and log if separate) is an ok size.
1198 */ 531 */
1199STATIC int 532STATIC int
1200xfs_check_sizes(xfs_mount_t *mp) 533xfs_check_sizes(xfs_mount_t *mp)
@@ -1264,8 +597,7 @@ xfs_mount_reset_sbqflags(
1264 return 0; 597 return 0;
1265 598
1266 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 599 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
1267 error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), 600 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
1268 0, 0, XFS_DEFAULT_LOG_COUNT);
1269 if (error) { 601 if (error) {
1270 xfs_trans_cancel(tp, 0); 602 xfs_trans_cancel(tp, 0);
1271 xfs_alert(mp, "%s: Superblock update failed!", __func__); 603 xfs_alert(mp, "%s: Superblock update failed!", __func__);
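The xfs_trans_reserve() conversion in this hunk (and repeated in later hunks) replaces per-call log reservation macros with pointers into a per-mount table of precomputed struct xfs_trans_resv entries, reached through M_RES(mp). A sketch of that table-of-reservations design; every name and field here is a hypothetical stand-in, not the kernel API:

#include <stdio.h>

struct trans_res {
	unsigned logres;	/* log space, in bytes */
	int logcount;		/* permanent-transaction roll count */
};

struct trans_resv {		/* computed once at mount time */
	struct trans_res tr_sb;			/* superblock change */
	struct trans_res tr_qm_sbchange;	/* quota flag change */
};

static int trans_reserve(const struct trans_res *res,
			 unsigned blocks, unsigned rtextents)
{
	(void)rtextents;
	printf("reserving %u bytes of log (count %d), %u blocks\n",
	       res->logres, res->logcount, blocks);
	return 0;	/* pretend the grant succeeded */
}

int main(void)
{
	struct trans_resv resv = {
		.tr_sb = { 4096, 1 },
		.tr_qm_sbchange = { 4096, 1 },
	};

	/* call sites name the reservation instead of passing raw numbers */
	return trans_reserve(&resv.tr_qm_sbchange, 0, 0);
}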
@@ -1315,7 +647,7 @@ xfs_mountfs(
1315 uint quotaflags = 0; 647 uint quotaflags = 0;
1316 int error = 0; 648 int error = 0;
1317 649
1318 xfs_mount_common(mp, sbp); 650 xfs_sb_mount_common(mp, sbp);
1319 651
1320 /* 652 /*
1321 * Check for a mismatched features2 values. Older kernels 653 * Check for a mismatched features2 values. Older kernels
@@ -1400,7 +732,7 @@ xfs_mountfs(
1400 xfs_set_inoalignment(mp); 732 xfs_set_inoalignment(mp);
1401 733
1402 /* 734 /*
1403 * Check that the data (and log if separate) are an ok size. 735 * Check that the data (and log if separate) is an ok size.
1404 */ 736 */
1405 error = xfs_check_sizes(mp); 737 error = xfs_check_sizes(mp);
1406 if (error) 738 if (error)
@@ -1738,8 +1070,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
1738 return 0; 1070 return 0;
1739 1071
1740 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); 1072 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
1741 error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, 1073 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1742 XFS_DEFAULT_LOG_COUNT);
1743 if (error) { 1074 if (error) {
1744 xfs_trans_cancel(tp, 0); 1075 xfs_trans_cancel(tp, 0);
1745 return error; 1076 return error;
@@ -1752,49 +1083,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
1752} 1083}
1753 1084
1754/* 1085/*
1755 * xfs_mod_sb() can be used to copy arbitrary changes to the 1086 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
1756 * in-core superblock into the superblock buffer to be logged.
1757 * It does not provide the higher level of locking that is
1758 * needed to protect the in-core superblock from concurrent
1759 * access.
1760 */
1761void
1762xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1763{
1764 xfs_buf_t *bp;
1765 int first;
1766 int last;
1767 xfs_mount_t *mp;
1768 xfs_sb_field_t f;
1769
1770 ASSERT(fields);
1771 if (!fields)
1772 return;
1773 mp = tp->t_mountp;
1774 bp = xfs_trans_getsb(tp, mp, 0);
1775 first = sizeof(xfs_sb_t);
1776 last = 0;
1777
1778 /* translate/copy */
1779
1780 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
1781
1782 /* find modified range */
1783 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1784 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1785 last = xfs_sb_info[f + 1].offset - 1;
1786
1787 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1788 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1789 first = xfs_sb_info[f].offset;
1790
1791 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
1792 xfs_trans_log_buf(tp, bp, first, last);
1793}
1794
1795
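The removed xfs_mod_sb() converted only the requested fields and then logged the smallest byte range covering them: the lowest and highest set bits of the mask pick the first and last modified fields, and the offset table turns those into [first, last]. A sketch of that range computation over the same kind of hypothetical offset table used earlier:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

struct sb { uint16_t a; uint16_t b; uint32_t c; };	/* hypothetical */

static const size_t sb_off[] = { 0, 2, 4, sizeof(struct sb) };

int main(void)
{
	uint64_t fields = 0x6;	/* fields b and c modified */

	int lo = __builtin_ctzll(fields);	/* lowest set bit */
	int hi = 63 - __builtin_clzll(fields);	/* highest set bit */

	size_t first = sb_off[lo];		/* start of first field */
	size_t last = sb_off[hi + 1] - 1;	/* end of last field */

	printf("log bytes %zu..%zu\n", first, last);	/* 2..7 */
	return 0;
}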
1796/*
1797 * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
1798 * a delta to a specified field in the in-core superblock. Simply 1087 * a delta to a specified field in the in-core superblock. Simply
1799 * switch on the field indicated and apply the delta to that field. 1088 * switch on the field indicated and apply the delta to that field.
1800 * Fields are not allowed to dip below zero, so if the delta would 1089 * Fields are not allowed to dip below zero, so if the delta would
@@ -2101,8 +1390,7 @@ xfs_mount_log_sb(
2101 XFS_SB_VERSIONNUM)); 1390 XFS_SB_VERSIONNUM));
2102 1391
2103 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); 1392 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
2104 error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, 1393 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
2105 XFS_DEFAULT_LOG_COUNT);
2106 if (error) { 1394 if (error) {
2107 xfs_trans_cancel(tp, 0); 1395 xfs_trans_cancel(tp, 0);
2108 return error; 1396 return error;
@@ -2260,12 +1548,6 @@ xfs_icsb_init_counters(
2260 if (mp->m_sb_cnts == NULL) 1548 if (mp->m_sb_cnts == NULL)
2261 return -ENOMEM; 1549 return -ENOMEM;
2262 1550
2263#ifdef CONFIG_HOTPLUG_CPU
2264 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
2265 mp->m_icsb_notifier.priority = 0;
2266 register_hotcpu_notifier(&mp->m_icsb_notifier);
2267#endif /* CONFIG_HOTPLUG_CPU */
2268
2269 for_each_online_cpu(i) { 1551 for_each_online_cpu(i) {
2270 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); 1552 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2271 memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); 1553 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
@@ -2278,6 +1560,13 @@ xfs_icsb_init_counters(
2278 * initial balance kicks us off correctly 1560 * initial balance kicks us off correctly
2279 */ 1561 */
2280 mp->m_icsb_counters = -1; 1562 mp->m_icsb_counters = -1;
1563
1564#ifdef CONFIG_HOTPLUG_CPU
1565 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
1566 mp->m_icsb_notifier.priority = 0;
1567 register_hotcpu_notifier(&mp->m_icsb_notifier);
1568#endif /* CONFIG_HOTPLUG_CPU */
1569
2281 return 0; 1570 return 0;
2282} 1571}
2283 1572
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4e374d4a9189..1fa0584b5627 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,45 +18,7 @@
18#ifndef __XFS_MOUNT_H__ 18#ifndef __XFS_MOUNT_H__
19#define __XFS_MOUNT_H__ 19#define __XFS_MOUNT_H__
20 20
21typedef struct xfs_trans_reservations { 21#ifdef __KERNEL__
22 uint tr_write; /* extent alloc trans */
23 uint tr_itruncate; /* truncate trans */
24 uint tr_rename; /* rename trans */
25 uint tr_link; /* link trans */
26 uint tr_remove; /* unlink trans */
27 uint tr_symlink; /* symlink trans */
28 uint tr_create; /* create trans */
29 uint tr_mkdir; /* mkdir trans */
30 uint tr_ifree; /* inode free trans */
31 uint tr_ichange; /* inode update trans */
32 uint tr_growdata; /* fs data section grow trans */
33 uint tr_swrite; /* sync write inode trans */
34 uint tr_addafork; /* cvt inode to attributed trans */
35 uint tr_writeid; /* write setuid/setgid file */
36 uint tr_attrinval; /* attr fork buffer invalidation */
37 uint tr_attrsetm; /* set/create an attribute at mount time */
38 uint tr_attrsetrt; /* set/create an attribute at runtime */
39 uint tr_attrrm; /* remove an attribute */
40 uint tr_clearagi; /* clear bad agi unlinked ino bucket */
41 uint tr_growrtalloc; /* grow realtime allocations */
42 uint tr_growrtzero; /* grow realtime zeroing */
43 uint tr_growrtfree; /* grow realtime freeing */
44 uint tr_qm_sbchange; /* change quota flags */
45 uint tr_qm_setqlim; /* adjust quota limits */
46 uint tr_qm_dqalloc; /* allocate quota on disk */
47 uint tr_qm_quotaoff; /* turn quota off */
48 uint tr_qm_equotaoff;/* end of turn quota off */
49 uint tr_sb; /* modify superblock */
50} xfs_trans_reservations_t;
51
52#ifndef __KERNEL__
53
54#define xfs_daddr_to_agno(mp,d) \
55 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
56#define xfs_daddr_to_agbno(mp,d) \
57 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
58
59#else /* __KERNEL__ */
60 22
61struct xlog; 23struct xlog;
62struct xfs_inode; 24struct xfs_inode;
@@ -174,7 +136,7 @@ typedef struct xfs_mount {
174 int m_ialloc_blks; /* blocks in inode allocation */ 136 int m_ialloc_blks; /* blocks in inode allocation */
175 int m_inoalign_mask;/* mask sb_inoalignmt if used */ 137 int m_inoalign_mask;/* mask sb_inoalignmt if used */
176 uint m_qflags; /* quota status flags */ 138 uint m_qflags; /* quota status flags */
177 xfs_trans_reservations_t m_reservations;/* precomputed res values */ 139 struct xfs_trans_resv m_resv; /* precomputed res values */
178 __uint64_t m_maxicount; /* maximum inode count */ 140 __uint64_t m_maxicount; /* maximum inode count */
179 __uint64_t m_resblks; /* total reserved blocks */ 141 __uint64_t m_resblks; /* total reserved blocks */
180 __uint64_t m_resblks_avail;/* available reserved blocks */ 142 __uint64_t m_resblks_avail;/* available reserved blocks */
@@ -330,14 +292,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
330} 292}
331 293
332/* 294/*
333 * perag get/put wrappers for ref counting
334 */
335struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
336struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
337 int tag);
338void xfs_perag_put(struct xfs_perag *pag);
339
340/*
341 * Per-cpu superblock locking functions 295 * Per-cpu superblock locking functions
342 */ 296 */
343#ifdef HAVE_PERCPU_SB 297#ifdef HAVE_PERCPU_SB
@@ -366,9 +320,63 @@ typedef struct xfs_mod_sb {
366 int64_t msb_delta; /* Change to make to specified field */ 320 int64_t msb_delta; /* Change to make to specified field */
367} xfs_mod_sb_t; 321} xfs_mod_sb_t;
368 322
323/*
324 * Per-ag incore structure, copies of information in agf and agi, to improve the
325 * performance of allocation group selection. This is defined for the kernel
326 * only, and hence is defined here instead of in xfs_ag.h. You need the struct
 327 * xfs_mount to be defined to look up an xfs_perag anyway (via mp->m_perag_tree),
328 * so this doesn't introduce any strange header file dependencies.
329 */
330typedef struct xfs_perag {
331 struct xfs_mount *pag_mount; /* owner filesystem */
332 xfs_agnumber_t pag_agno; /* AG this structure belongs to */
333 atomic_t pag_ref; /* perag reference count */
334 char pagf_init; /* this agf's entry is initialized */
335 char pagi_init; /* this agi's entry is initialized */
336 char pagf_metadata; /* the agf is preferred to be metadata */
337 char pagi_inodeok; /* The agi is ok for inodes */
338 __uint8_t pagf_levels[XFS_BTNUM_AGF];
339 /* # of levels in bno & cnt btree */
340 __uint32_t pagf_flcount; /* count of blocks in freelist */
341 xfs_extlen_t pagf_freeblks; /* total free blocks */
342 xfs_extlen_t pagf_longest; /* longest free space */
343 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
344 xfs_agino_t pagi_freecount; /* number of free inodes */
345 xfs_agino_t pagi_count; /* number of allocated inodes */
346
347 /*
 348 * Inode allocation search optimisation.
 349 * If the pagino matches, the search for new inodes
 350 * doesn't need to search the near ones again straight away.
351 */
352 xfs_agino_t pagl_pagino;
353 xfs_agino_t pagl_leftrec;
354 xfs_agino_t pagl_rightrec;
355 spinlock_t pagb_lock; /* lock for pagb_tree */
356 struct rb_root pagb_tree; /* ordered tree of busy extents */
357
358 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
359
360 spinlock_t pag_ici_lock; /* incore inode cache lock */
361 struct radix_tree_root pag_ici_root; /* incore inode cache root */
362 int pag_ici_reclaimable; /* reclaimable inodes */
363 struct mutex pag_ici_reclaim_lock; /* serialisation point */
364 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
365
366 /* buffer cache index */
367 spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
368 struct rb_root pag_buf_tree; /* ordered tree of active buffers */
369
370 /* for rcu-safe freeing */
371 struct rcu_head rcu_head;
372 int pagb_count; /* pagb slots in use */
373} xfs_perag_t;
374
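pag_ref makes the perag structure reference counted: a lookup takes a reference before handing the structure out, and freeing is only safe once the last reference drops (the kernel additionally defers the actual free through rcu_head). A userspace sketch of the get/put discipline using C11 atomics; the RCU deferral is only noted in a comment:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct perag {
	unsigned agno;
	atomic_int ref;
};

static struct perag *perag_get(struct perag *pag)
{
	atomic_fetch_add(&pag->ref, 1);		/* pin while in use */
	return pag;
}

static void perag_put(struct perag *pag)
{
	if (atomic_fetch_sub(&pag->ref, 1) == 1)
		free(pag);	/* the kernel would defer via RCU instead */
}

int main(void)
{
	struct perag *pag = calloc(1, sizeof(*pag));

	pag->agno = 0;
	perag_get(pag);
	printf("using AG %u, ref=%d\n", pag->agno, atomic_load(&pag->ref));
	perag_put(pag);			/* last reference: freed */
	return 0;
}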
369extern int xfs_log_sbcount(xfs_mount_t *); 375extern int xfs_log_sbcount(xfs_mount_t *);
370extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); 376extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
371extern int xfs_mountfs(xfs_mount_t *mp); 377extern int xfs_mountfs(xfs_mount_t *mp);
378extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
379 xfs_agnumber_t *maxagi);
372 380
373extern void xfs_unmountfs(xfs_mount_t *); 381extern void xfs_unmountfs(xfs_mount_t *);
374extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 382extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
@@ -387,13 +395,4 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
387 395
388#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
389 397
390extern void xfs_sb_calc_crc(struct xfs_buf *);
391extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
392extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
393 xfs_agnumber_t *);
394extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
395extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
396
397extern const struct xfs_buf_ops xfs_sb_buf_ops;
398
399#endif /* __XFS_MOUNT_H__ */ 398#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index d320794d03ce..3e6c2e6c9cd2 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_bit.h" 21#include "xfs_bit.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -37,7 +38,6 @@
37#include "xfs_attr.h" 38#include "xfs_attr.h"
38#include "xfs_buf_item.h" 39#include "xfs_buf_item.h"
39#include "xfs_trans_space.h" 40#include "xfs_trans_space.h"
40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h" 43#include "xfs_icache.h"
@@ -51,8 +51,9 @@
51 */ 51 */
52STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 52STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
53STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 53STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
54STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
55 54
55
56STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
56/* 57/*
57 * We use the batch lookup interface to iterate over the dquots as it 58 * We use the batch lookup interface to iterate over the dquots as it
58 * currently is the only interface into the radix tree code that allows 59 * currently is the only interface into the radix tree code that allows
@@ -203,12 +204,9 @@ xfs_qm_dqpurge(
203 * We move dquots to the freelist as soon as their reference count 204 * We move dquots to the freelist as soon as their reference count
 204 * hits zero, so this one really should be on the freelist here. 205 * hits zero, so this one really should be on the freelist here.
205 */ 206 */
206 mutex_lock(&qi->qi_lru_lock);
207 ASSERT(!list_empty(&dqp->q_lru)); 207 ASSERT(!list_empty(&dqp->q_lru));
208 list_del_init(&dqp->q_lru); 208 list_lru_del(&qi->qi_lru, &dqp->q_lru);
209 qi->qi_lru_count--;
210 XFS_STATS_DEC(xs_qm_dquot_unused); 209 XFS_STATS_DEC(xs_qm_dquot_unused);
211 mutex_unlock(&qi->qi_lru_lock);
212 210
213 xfs_qm_dqdestroy(dqp); 211 xfs_qm_dqdestroy(dqp);
214 212
@@ -680,6 +678,143 @@ xfs_qm_calc_dquots_per_chunk(
680 return ndquots; 678 return ndquots;
681} 679}
682 680
681struct xfs_qm_isolate {
682 struct list_head buffers;
683 struct list_head dispose;
684};
685
686static enum lru_status
687xfs_qm_dquot_isolate(
688 struct list_head *item,
689 spinlock_t *lru_lock,
690 void *arg)
691{
692 struct xfs_dquot *dqp = container_of(item,
693 struct xfs_dquot, q_lru);
694 struct xfs_qm_isolate *isol = arg;
695
696 if (!xfs_dqlock_nowait(dqp))
697 goto out_miss_busy;
698
699 /*
 700 * This dquot has acquired a reference in the meantime; remove it from
701 * the freelist and try again.
702 */
703 if (dqp->q_nrefs) {
704 xfs_dqunlock(dqp);
705 XFS_STATS_INC(xs_qm_dqwants);
706
707 trace_xfs_dqreclaim_want(dqp);
708 list_del_init(&dqp->q_lru);
709 XFS_STATS_DEC(xs_qm_dquot_unused);
710 return LRU_REMOVED;
711 }
712
713 /*
714 * If the dquot is dirty, flush it. If it's already being flushed, just
715 * skip it so there is time for the IO to complete before we try to
716 * reclaim it again on the next LRU pass.
717 */
718 if (!xfs_dqflock_nowait(dqp)) {
719 xfs_dqunlock(dqp);
720 goto out_miss_busy;
721 }
722
723 if (XFS_DQ_IS_DIRTY(dqp)) {
724 struct xfs_buf *bp = NULL;
725 int error;
726
727 trace_xfs_dqreclaim_dirty(dqp);
728
729 /* we have to drop the LRU lock to flush the dquot */
730 spin_unlock(lru_lock);
731
732 error = xfs_qm_dqflush(dqp, &bp);
733 if (error) {
734 xfs_warn(dqp->q_mount, "%s: dquot %p flush failed",
735 __func__, dqp);
736 goto out_unlock_dirty;
737 }
738
739 xfs_buf_delwri_queue(bp, &isol->buffers);
740 xfs_buf_relse(bp);
741 goto out_unlock_dirty;
742 }
743 xfs_dqfunlock(dqp);
744
745 /*
746 * Prevent lookups now that we are past the point of no return.
747 */
748 dqp->dq_flags |= XFS_DQ_FREEING;
749 xfs_dqunlock(dqp);
750
751 ASSERT(dqp->q_nrefs == 0);
752 list_move_tail(&dqp->q_lru, &isol->dispose);
753 XFS_STATS_DEC(xs_qm_dquot_unused);
754 trace_xfs_dqreclaim_done(dqp);
755 XFS_STATS_INC(xs_qm_dqreclaims);
756 return LRU_REMOVED;
757
758out_miss_busy:
759 trace_xfs_dqreclaim_busy(dqp);
760 XFS_STATS_INC(xs_qm_dqreclaim_misses);
761 return LRU_SKIP;
762
763out_unlock_dirty:
764 trace_xfs_dqreclaim_busy(dqp);
765 XFS_STATS_INC(xs_qm_dqreclaim_misses);
766 xfs_dqunlock(dqp);
767 spin_lock(lru_lock);
768 return LRU_RETRY;
769}
770
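xfs_qm_dquot_isolate() above is a callback for list_lru_walk_node(): for each LRU item it returns LRU_REMOVED (item moved to the dispose list), LRU_SKIP (busy, revisit on the next pass), or LRU_RETRY (lock dropped, rescan). A simplified sketch of that walker contract over a plain singly-linked list; unlike the kernel callback, the classifier here does not move items itself:

#include <stdio.h>
#include <stdlib.h>

enum lru_status { LRU_REMOVED, LRU_SKIP, LRU_RETRY };

struct item {
	struct item *next;
	int busy;		/* stand-in for "locked or referenced" */
};

/* classify one item; the real callback also moves it under the lru lock */
static enum lru_status isolate(struct item *it)
{
	return it->busy ? LRU_SKIP : LRU_REMOVED;
}

static unsigned long walk(struct item **lru, struct item **dispose,
			  unsigned long nr_to_scan)
{
	struct item **pp = lru;
	unsigned long freed = 0;

	while (*pp && nr_to_scan--) {
		struct item *it = *pp;

		if (isolate(it) == LRU_REMOVED) {
			*pp = it->next;		/* unlink from the LRU */
			it->next = *dispose;	/* queue for disposal */
			*dispose = it;
			freed++;
		} else {
			pp = &it->next;		/* skip: leave it in place */
		}
	}
	return freed;
}

int main(void)
{
	struct item *lru = NULL, *dispose = NULL;

	for (int i = 0; i < 4; i++) {
		struct item *it = calloc(1, sizeof(*it));

		it->busy = (i == 1);	/* one busy item is skipped */
		it->next = lru;
		lru = it;
	}
	printf("freed=%lu\n", walk(&lru, &dispose, 8));	/* freed=3 */

	while (dispose) {	/* actually free the disposed items */
		struct item *it = dispose;

		dispose = it->next;
		free(it);
	}
	return 0;
}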
771static unsigned long
772xfs_qm_shrink_scan(
773 struct shrinker *shrink,
774 struct shrink_control *sc)
775{
776 struct xfs_quotainfo *qi = container_of(shrink,
777 struct xfs_quotainfo, qi_shrinker);
778 struct xfs_qm_isolate isol;
779 unsigned long freed;
780 int error;
781 unsigned long nr_to_scan = sc->nr_to_scan;
782
783 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
784 return 0;
785
786 INIT_LIST_HEAD(&isol.buffers);
787 INIT_LIST_HEAD(&isol.dispose);
788
789 freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
790 &nr_to_scan);
791
792 error = xfs_buf_delwri_submit(&isol.buffers);
793 if (error)
794 xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
795
796 while (!list_empty(&isol.dispose)) {
797 struct xfs_dquot *dqp;
798
799 dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru);
800 list_del_init(&dqp->q_lru);
801 xfs_qm_dqfree_one(dqp);
802 }
803
804 return freed;
805}
806
807static unsigned long
808xfs_qm_shrink_count(
809 struct shrinker *shrink,
810 struct shrink_control *sc)
811{
812 struct xfs_quotainfo *qi = container_of(shrink,
813 struct xfs_quotainfo, qi_shrinker);
814
815 return list_lru_count_node(&qi->qi_lru, sc->nid);
816}
817
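Together these two functions show the split shrinker API this commit converts to: count_objects() is a cheap estimate of reclaimable objects, and scan_objects() does bounded work and reports how many it freed, replacing the single do-everything xfs_qm_shake(). A sketch of the split with userspace stand-ins for the kernel shrinker structures:

#include <stdio.h>

struct cache { unsigned long nr_items; };

struct shrinker {
	unsigned long (*count_objects)(struct cache *c);
	unsigned long (*scan_objects)(struct cache *c,
				      unsigned long nr_to_scan);
};

static unsigned long cache_count(struct cache *c)
{
	return c->nr_items;		/* cheap: no reclaim work */
}

static unsigned long cache_scan(struct cache *c, unsigned long nr_to_scan)
{
	unsigned long freed = nr_to_scan < c->nr_items ?
			      nr_to_scan : c->nr_items;

	c->nr_items -= freed;		/* "reclaim" that many objects */
	return freed;
}

static const struct shrinker cache_shrinker = { cache_count, cache_scan };

int main(void)
{
	struct cache c = { .nr_items = 100 };

	if (cache_shrinker.count_objects(&c)) {	/* worth scanning? */
		unsigned long freed = cache_shrinker.scan_objects(&c, 32);

		printf("freed %lu, %lu left\n", freed, c.nr_items);
	}
	return 0;
}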
683/* 818/*
684 * This initializes all the quota information that's kept in the 819 * This initializes all the quota information that's kept in the
685 * mount structure 820 * mount structure
@@ -696,11 +831,18 @@ xfs_qm_init_quotainfo(
696 831
697 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 832 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
698 833
834 if ((error = list_lru_init(&qinf->qi_lru))) {
835 kmem_free(qinf);
836 mp->m_quotainfo = NULL;
837 return error;
838 }
839
699 /* 840 /*
700 * See if quotainodes are setup, and if not, allocate them, 841 * See if quotainodes are setup, and if not, allocate them,
701 * and change the superblock accordingly. 842 * and change the superblock accordingly.
702 */ 843 */
703 if ((error = xfs_qm_init_quotainos(mp))) { 844 if ((error = xfs_qm_init_quotainos(mp))) {
845 list_lru_destroy(&qinf->qi_lru);
704 kmem_free(qinf); 846 kmem_free(qinf);
705 mp->m_quotainfo = NULL; 847 mp->m_quotainfo = NULL;
706 return error; 848 return error;
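The new list_lru_init() call also obligates every later failure path to unwind it, which is why list_lru_destroy() now appears in the xfs_qm_init_quotainos() error branch above. A sketch of that reverse-order unwind pattern with hypothetical init/destroy pairs:

#include <stdio.h>
#include <stdlib.h>

static void *g_qinf;		/* owned by the subsystem on success */

static int lru_init(void)     { return 0; }	/* pretend: succeeds */
static void lru_destroy(void) { puts("lru destroyed"); }
static int inos_init(void)    { return -1; }	/* pretend: fails */

static int quotainfo_init(void)
{
	void *qinf = calloc(1, 64);
	int error;

	if (!qinf)
		return -1;

	error = lru_init();
	if (error)
		goto out_free;

	error = inos_init();
	if (error)
		goto out_lru;	/* undo the lru before freeing qinf */

	g_qinf = qinf;
	return 0;

out_lru:
	lru_destroy();
out_free:
	free(qinf);
	return error;
}

int main(void)
{
	printf("init: %d\n", quotainfo_init());
	return 0;
}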
@@ -711,10 +853,6 @@ xfs_qm_init_quotainfo(
711 INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); 853 INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
712 mutex_init(&qinf->qi_tree_lock); 854 mutex_init(&qinf->qi_tree_lock);
713 855
714 INIT_LIST_HEAD(&qinf->qi_lru_list);
715 qinf->qi_lru_count = 0;
716 mutex_init(&qinf->qi_lru_lock);
717
718 /* mutex used to serialize quotaoffs */ 856 /* mutex used to serialize quotaoffs */
719 mutex_init(&qinf->qi_quotaofflock); 857 mutex_init(&qinf->qi_quotaofflock);
720 858
@@ -779,8 +917,10 @@ xfs_qm_init_quotainfo(
779 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; 917 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
780 } 918 }
781 919
782 qinf->qi_shrinker.shrink = xfs_qm_shake; 920 qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
921 qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
783 qinf->qi_shrinker.seeks = DEFAULT_SEEKS; 922 qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
923 qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
784 register_shrinker(&qinf->qi_shrinker); 924 register_shrinker(&qinf->qi_shrinker);
785 return 0; 925 return 0;
786} 926}
@@ -801,6 +941,7 @@ xfs_qm_destroy_quotainfo(
801 ASSERT(qi != NULL); 941 ASSERT(qi != NULL);
802 942
803 unregister_shrinker(&qi->qi_shrinker); 943 unregister_shrinker(&qi->qi_shrinker);
944 list_lru_destroy(&qi->qi_lru);
804 945
805 if (qi->qi_uquotaip) { 946 if (qi->qi_uquotaip) {
806 IRELE(qi->qi_uquotaip); 947 IRELE(qi->qi_uquotaip);
@@ -834,21 +975,52 @@ xfs_qm_qino_alloc(
834 int error; 975 int error;
835 int committed; 976 int committed;
836 977
978 *ip = NULL;
979 /*
 980 * With a superblock that doesn't have a separate pquotino, we
981 * share an inode between gquota and pquota. If the on-disk
982 * superblock has GQUOTA and the filesystem is now mounted
983 * with PQUOTA, just use sb_gquotino for sb_pquotino and
984 * vice-versa.
985 */
986 if (!xfs_sb_version_has_pquotino(&mp->m_sb) &&
987 (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
988 xfs_ino_t ino = NULLFSINO;
989
990 if ((flags & XFS_QMOPT_PQUOTA) &&
991 (mp->m_sb.sb_gquotino != NULLFSINO)) {
992 ino = mp->m_sb.sb_gquotino;
993 ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
994 } else if ((flags & XFS_QMOPT_GQUOTA) &&
995 (mp->m_sb.sb_pquotino != NULLFSINO)) {
996 ino = mp->m_sb.sb_pquotino;
997 ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
998 }
999 if (ino != NULLFSINO) {
1000 error = xfs_iget(mp, NULL, ino, 0, 0, ip);
1001 if (error)
1002 return error;
1003 mp->m_sb.sb_gquotino = NULLFSINO;
1004 mp->m_sb.sb_pquotino = NULLFSINO;
1005 }
1006 }
1007
837 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); 1008 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
838 if ((error = xfs_trans_reserve(tp, 1009 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
839 XFS_QM_QINOCREATE_SPACE_RES(mp), 1010 XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
840 XFS_CREATE_LOG_RES(mp), 0, 1011 if (error) {
841 XFS_TRANS_PERM_LOG_RES,
842 XFS_CREATE_LOG_COUNT))) {
843 xfs_trans_cancel(tp, 0); 1012 xfs_trans_cancel(tp, 0);
844 return error; 1013 return error;
845 } 1014 }
846 1015
847 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); 1016 if (!*ip) {
848 if (error) { 1017 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
849 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1018 &committed);
850 XFS_TRANS_ABORT); 1019 if (error) {
851 return error; 1020 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1021 XFS_TRANS_ABORT);
1022 return error;
1023 }
852 } 1024 }
853 1025
854 /* 1026 /*
@@ -860,21 +1032,25 @@ xfs_qm_qino_alloc(
860 if (flags & XFS_QMOPT_SBVERSION) { 1032 if (flags & XFS_QMOPT_SBVERSION) {
861 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 1033 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
862 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1034 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
863 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == 1035 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
864 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1036 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
865 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); 1037 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
1038 XFS_SB_QFLAGS));
866 1039
867 xfs_sb_version_addquota(&mp->m_sb); 1040 xfs_sb_version_addquota(&mp->m_sb);
868 mp->m_sb.sb_uquotino = NULLFSINO; 1041 mp->m_sb.sb_uquotino = NULLFSINO;
869 mp->m_sb.sb_gquotino = NULLFSINO; 1042 mp->m_sb.sb_gquotino = NULLFSINO;
1043 mp->m_sb.sb_pquotino = NULLFSINO;
870 1044
871 /* qflags will get updated _after_ quotacheck */ 1045 /* qflags will get updated fully _after_ quotacheck */
872 mp->m_sb.sb_qflags = 0; 1046 mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
873 } 1047 }
874 if (flags & XFS_QMOPT_UQUOTA) 1048 if (flags & XFS_QMOPT_UQUOTA)
875 mp->m_sb.sb_uquotino = (*ip)->i_ino; 1049 mp->m_sb.sb_uquotino = (*ip)->i_ino;
876 else 1050 else if (flags & XFS_QMOPT_GQUOTA)
877 mp->m_sb.sb_gquotino = (*ip)->i_ino; 1051 mp->m_sb.sb_gquotino = (*ip)->i_ino;
1052 else
1053 mp->m_sb.sb_pquotino = (*ip)->i_ino;
878 spin_unlock(&mp->m_sb_lock); 1054 spin_unlock(&mp->m_sb_lock);
879 xfs_mod_sb(tp, sbfields); 1055 xfs_mod_sb(tp, sbfields);
880 1056
@@ -1484,11 +1660,10 @@ xfs_qm_init_quotainos(
1484 if (error) 1660 if (error)
1485 goto error_rele; 1661 goto error_rele;
1486 } 1662 }
1487 /* XXX: Use gquotino for now */
1488 if (XFS_IS_PQUOTA_ON(mp) && 1663 if (XFS_IS_PQUOTA_ON(mp) &&
1489 mp->m_sb.sb_gquotino != NULLFSINO) { 1664 mp->m_sb.sb_pquotino != NULLFSINO) {
1490 ASSERT(mp->m_sb.sb_gquotino > 0); 1665 ASSERT(mp->m_sb.sb_pquotino > 0);
1491 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1666 error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
1492 0, 0, &pip); 1667 0, 0, &pip);
1493 if (error) 1668 if (error)
1494 goto error_rele; 1669 goto error_rele;
@@ -1496,7 +1671,8 @@ xfs_qm_init_quotainos(
1496 } else { 1671 } else {
1497 flags |= XFS_QMOPT_SBVERSION; 1672 flags |= XFS_QMOPT_SBVERSION;
1498 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1673 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1499 XFS_SB_GQUOTINO | XFS_SB_QFLAGS); 1674 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
1675 XFS_SB_QFLAGS);
1500 } 1676 }
1501 1677
1502 /* 1678 /*
@@ -1524,9 +1700,8 @@ xfs_qm_init_quotainos(
1524 flags &= ~XFS_QMOPT_SBVERSION; 1700 flags &= ~XFS_QMOPT_SBVERSION;
1525 } 1701 }
1526 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { 1702 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
1527 /* XXX: Use XFS_SB_GQUOTINO for now */
1528 error = xfs_qm_qino_alloc(mp, &pip, 1703 error = xfs_qm_qino_alloc(mp, &pip,
1529 sbflags | XFS_SB_GQUOTINO, 1704 sbflags | XFS_SB_PQUOTINO,
1530 flags | XFS_QMOPT_PQUOTA); 1705 flags | XFS_QMOPT_PQUOTA);
1531 if (error) 1706 if (error)
1532 goto error_rele; 1707 goto error_rele;
@@ -1565,132 +1740,6 @@ xfs_qm_dqfree_one(
1565 xfs_qm_dqdestroy(dqp); 1740 xfs_qm_dqdestroy(dqp);
1566} 1741}
1567 1742
1568STATIC void
1569xfs_qm_dqreclaim_one(
1570 struct xfs_dquot *dqp,
1571 struct list_head *buffer_list,
1572 struct list_head *dispose_list)
1573{
1574 struct xfs_mount *mp = dqp->q_mount;
1575 struct xfs_quotainfo *qi = mp->m_quotainfo;
1576 int error;
1577
1578 if (!xfs_dqlock_nowait(dqp))
1579 goto out_move_tail;
1580
1581 /*
 1582 * This dquot has acquired a reference in the meantime; remove it from
1583 * the freelist and try again.
1584 */
1585 if (dqp->q_nrefs) {
1586 xfs_dqunlock(dqp);
1587
1588 trace_xfs_dqreclaim_want(dqp);
1589 XFS_STATS_INC(xs_qm_dqwants);
1590
1591 list_del_init(&dqp->q_lru);
1592 qi->qi_lru_count--;
1593 XFS_STATS_DEC(xs_qm_dquot_unused);
1594 return;
1595 }
1596
1597 /*
1598 * Try to grab the flush lock. If this dquot is in the process of
1599 * getting flushed to disk, we don't want to reclaim it.
1600 */
1601 if (!xfs_dqflock_nowait(dqp))
1602 goto out_unlock_move_tail;
1603
1604 if (XFS_DQ_IS_DIRTY(dqp)) {
1605 struct xfs_buf *bp = NULL;
1606
1607 trace_xfs_dqreclaim_dirty(dqp);
1608
1609 error = xfs_qm_dqflush(dqp, &bp);
1610 if (error) {
1611 xfs_warn(mp, "%s: dquot %p flush failed",
1612 __func__, dqp);
1613 goto out_unlock_move_tail;
1614 }
1615
1616 xfs_buf_delwri_queue(bp, buffer_list);
1617 xfs_buf_relse(bp);
1618 /*
1619 * Give the dquot another try on the freelist, as the
1620 * flushing will take some time.
1621 */
1622 goto out_unlock_move_tail;
1623 }
1624 xfs_dqfunlock(dqp);
1625
1626 /*
1627 * Prevent lookups now that we are past the point of no return.
1628 */
1629 dqp->dq_flags |= XFS_DQ_FREEING;
1630 xfs_dqunlock(dqp);
1631
1632 ASSERT(dqp->q_nrefs == 0);
1633 list_move_tail(&dqp->q_lru, dispose_list);
1634 qi->qi_lru_count--;
1635 XFS_STATS_DEC(xs_qm_dquot_unused);
1636
1637 trace_xfs_dqreclaim_done(dqp);
1638 XFS_STATS_INC(xs_qm_dqreclaims);
1639 return;
1640
1641 /*
1642 * Move the dquot to the tail of the list so that we don't spin on it.
1643 */
1644out_unlock_move_tail:
1645 xfs_dqunlock(dqp);
1646out_move_tail:
1647 list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
1648 trace_xfs_dqreclaim_busy(dqp);
1649 XFS_STATS_INC(xs_qm_dqreclaim_misses);
1650}
1651
1652STATIC int
1653xfs_qm_shake(
1654 struct shrinker *shrink,
1655 struct shrink_control *sc)
1656{
1657 struct xfs_quotainfo *qi =
1658 container_of(shrink, struct xfs_quotainfo, qi_shrinker);
1659 int nr_to_scan = sc->nr_to_scan;
1660 LIST_HEAD (buffer_list);
1661 LIST_HEAD (dispose_list);
1662 struct xfs_dquot *dqp;
1663 int error;
1664
1665 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
1666 return 0;
1667 if (!nr_to_scan)
1668 goto out;
1669
1670 mutex_lock(&qi->qi_lru_lock);
1671 while (!list_empty(&qi->qi_lru_list)) {
1672 if (nr_to_scan-- <= 0)
1673 break;
1674 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
1675 q_lru);
1676 xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
1677 }
1678 mutex_unlock(&qi->qi_lru_lock);
1679
1680 error = xfs_buf_delwri_submit(&buffer_list);
1681 if (error)
1682 xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
1683
1684 while (!list_empty(&dispose_list)) {
1685 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
1686 list_del_init(&dqp->q_lru);
1687 xfs_qm_dqfree_one(dqp);
1688 }
1689
1690out:
1691 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
1692}
1693
1694/* 1743/*
1695 * Start a transaction and write the incore superblock changes to 1744 * Start a transaction and write the incore superblock changes to
 1696 * disk. The flags parameter indicates which fields have changed. 1745 * disk. The flags parameter indicates which fields have changed.
@@ -1704,8 +1753,7 @@ xfs_qm_write_sb_changes(
1704 int error; 1753 int error;
1705 1754
1706 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1755 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
1707 error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), 1756 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
1708 0, 0, XFS_DEFAULT_LOG_COUNT);
1709 if (error) { 1757 if (error) {
1710 xfs_trans_cancel(tp, 0); 1758 xfs_trans_cancel(tp, 0);
1711 return error; 1759 return error;
@@ -1734,8 +1782,8 @@ xfs_qm_write_sb_changes(
1734int 1782int
1735xfs_qm_vop_dqalloc( 1783xfs_qm_vop_dqalloc(
1736 struct xfs_inode *ip, 1784 struct xfs_inode *ip,
1737 uid_t uid, 1785 xfs_dqid_t uid,
1738 gid_t gid, 1786 xfs_dqid_t gid,
1739 prid_t prid, 1787 prid_t prid,
1740 uint flags, 1788 uint flags,
1741 struct xfs_dquot **O_udqpp, 1789 struct xfs_dquot **O_udqpp,
@@ -1782,7 +1830,7 @@ xfs_qm_vop_dqalloc(
1782 * holding ilock. 1830 * holding ilock.
1783 */ 1831 */
1784 xfs_iunlock(ip, lockflags); 1832 xfs_iunlock(ip, lockflags);
1785 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, 1833 error = xfs_qm_dqget(mp, NULL, uid,
1786 XFS_DQ_USER, 1834 XFS_DQ_USER,
1787 XFS_QMOPT_DQALLOC | 1835 XFS_QMOPT_DQALLOC |
1788 XFS_QMOPT_DOWARN, 1836 XFS_QMOPT_DOWARN,
@@ -1809,7 +1857,7 @@ xfs_qm_vop_dqalloc(
1809 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { 1857 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
1810 if (ip->i_d.di_gid != gid) { 1858 if (ip->i_d.di_gid != gid) {
1811 xfs_iunlock(ip, lockflags); 1859 xfs_iunlock(ip, lockflags);
1812 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, 1860 error = xfs_qm_dqget(mp, NULL, gid,
1813 XFS_DQ_GROUP, 1861 XFS_DQ_GROUP,
1814 XFS_QMOPT_DQALLOC | 1862 XFS_QMOPT_DQALLOC |
1815 XFS_QMOPT_DOWARN, 1863 XFS_QMOPT_DOWARN,
@@ -1943,7 +1991,7 @@ xfs_qm_vop_chown_reserve(
1943 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; 1991 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
1944 1992
1945 if (XFS_IS_UQUOTA_ON(mp) && udqp && 1993 if (XFS_IS_UQUOTA_ON(mp) && udqp &&
1946 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { 1994 ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
1947 udq_delblks = udqp; 1995 udq_delblks = udqp;
1948 /* 1996 /*
1949 * If there are delayed allocation blocks, then we have to 1997 * If there are delayed allocation blocks, then we have to
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 579d6a02a5b6..2b602df9c242 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -49,9 +49,7 @@ typedef struct xfs_quotainfo {
49 struct xfs_inode *qi_uquotaip; /* user quota inode */ 49 struct xfs_inode *qi_uquotaip; /* user quota inode */
50 struct xfs_inode *qi_gquotaip; /* group quota inode */ 50 struct xfs_inode *qi_gquotaip; /* group quota inode */
51 struct xfs_inode *qi_pquotaip; /* project quota inode */ 51 struct xfs_inode *qi_pquotaip; /* project quota inode */
52 struct list_head qi_lru_list; 52 struct list_lru qi_lru;
53 struct mutex qi_lru_lock;
54 int qi_lru_count;
55 int qi_dquots; 53 int qi_dquots;
56 time_t qi_btimelimit; /* limit for blks timer */ 54 time_t qi_btimelimit; /* limit for blks timer */
57 time_t qi_itimelimit; /* limit for inodes timer */ 55 time_t qi_itimelimit; /* limit for inodes timer */
@@ -160,6 +158,8 @@ extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
160 struct fs_disk_quota *); 158 struct fs_disk_quota *);
161extern int xfs_qm_scall_getqstat(struct xfs_mount *, 159extern int xfs_qm_scall_getqstat(struct xfs_mount *,
162 struct fs_quota_stat *); 160 struct fs_quota_stat *);
161extern int xfs_qm_scall_getqstatv(struct xfs_mount *,
162 struct fs_quota_statv *);
163extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); 163extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
164extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); 164extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
165 165
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 437a52d91f6d..3af50ccdfac1 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index e4f8b2d6f38b..8174aad0b388 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -20,6 +20,7 @@
20 20
21#include "xfs.h" 21#include "xfs.h"
22#include "xfs_fs.h" 22#include "xfs_fs.h"
23#include "xfs_format.h"
23#include "xfs_bit.h" 24#include "xfs_bit.h"
24#include "xfs_log.h" 25#include "xfs_log.h"
25#include "xfs_trans.h" 26#include "xfs_trans.h"
@@ -37,7 +38,6 @@
37#include "xfs_error.h" 38#include "xfs_error.h"
38#include "xfs_attr.h" 39#include "xfs_attr.h"
39#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h" 43#include "xfs_icache.h"
@@ -247,9 +247,7 @@ xfs_qm_scall_trunc_qfile(
247 xfs_ilock(ip, XFS_IOLOCK_EXCL); 247 xfs_ilock(ip, XFS_IOLOCK_EXCL);
248 248
249 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); 249 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
250 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 250 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
251 XFS_TRANS_PERM_LOG_RES,
252 XFS_ITRUNCATE_LOG_COUNT);
253 if (error) { 251 if (error) {
254 xfs_trans_cancel(tp, 0); 252 xfs_trans_cancel(tp, 0);
255 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 253 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -296,8 +294,10 @@ xfs_qm_scall_trunc_qfiles(
296 294
297 if (flags & XFS_DQ_USER) 295 if (flags & XFS_DQ_USER)
298 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); 296 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
299 if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) 297 if (flags & XFS_DQ_GROUP)
300 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); 298 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
299 if (flags & XFS_DQ_PROJ)
300 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
301 301
302 return error ? error : error2; 302 return error ? error : error2;
303} 303}
@@ -404,6 +404,7 @@ xfs_qm_scall_quotaon(
404 404
405/* 405/*
406 * Return quota status information, such as uquota-off, enforcements, etc. 406 * Return quota status information, such as uquota-off, enforcements, etc.
 407 * for the Q_XGETQSTAT command.
407 */ 408 */
408int 409int
409xfs_qm_scall_getqstat( 410xfs_qm_scall_getqstat(
@@ -413,8 +414,10 @@ xfs_qm_scall_getqstat(
413 struct xfs_quotainfo *q = mp->m_quotainfo; 414 struct xfs_quotainfo *q = mp->m_quotainfo;
414 struct xfs_inode *uip = NULL; 415 struct xfs_inode *uip = NULL;
415 struct xfs_inode *gip = NULL; 416 struct xfs_inode *gip = NULL;
417 struct xfs_inode *pip = NULL;
416 bool tempuqip = false; 418 bool tempuqip = false;
417 bool tempgqip = false; 419 bool tempgqip = false;
420 bool temppqip = false;
418 421
419 memset(out, 0, sizeof(fs_quota_stat_t)); 422 memset(out, 0, sizeof(fs_quota_stat_t));
420 423
@@ -424,16 +427,106 @@ xfs_qm_scall_getqstat(
424 out->qs_gquota.qfs_ino = NULLFSINO; 427 out->qs_gquota.qfs_ino = NULLFSINO;
425 return (0); 428 return (0);
426 } 429 }
430
431 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
432 (XFS_ALL_QUOTA_ACCT|
433 XFS_ALL_QUOTA_ENFD));
434 if (q) {
435 uip = q->qi_uquotaip;
436 gip = q->qi_gquotaip;
437 pip = q->qi_pquotaip;
438 }
439 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
440 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
441 0, 0, &uip) == 0)
442 tempuqip = true;
443 }
444 if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
445 if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
446 0, 0, &gip) == 0)
447 tempgqip = true;
448 }
449 /*
450 * Q_XGETQSTAT doesn't have room for both group and project quotas.
451 * So, allow the project quota values to be copied out only if
452 * there is no group quota information available.
453 */
454 if (!gip) {
455 if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
456 if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
457 0, 0, &pip) == 0)
458 temppqip = true;
459 }
460 } else
461 pip = NULL;
462 if (uip) {
463 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
464 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
465 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
466 if (tempuqip)
467 IRELE(uip);
468 }
469
470 if (gip) {
471 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
472 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
473 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
474 if (tempgqip)
475 IRELE(gip);
476 }
477 if (pip) {
478 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
479 out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks;
480 out->qs_gquota.qfs_nextents = pip->i_d.di_nextents;
481 if (temppqip)
482 IRELE(pip);
483 }
484 if (q) {
485 out->qs_incoredqs = q->qi_dquots;
486 out->qs_btimelimit = q->qi_btimelimit;
487 out->qs_itimelimit = q->qi_itimelimit;
488 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
489 out->qs_bwarnlimit = q->qi_bwarnlimit;
490 out->qs_iwarnlimit = q->qi_iwarnlimit;
491 }
492 return 0;
493}
494
495/*
496 * Return quota status information, such as uquota-off, enforcements, etc.
 497 * for the Q_XGETQSTATV command, to support a separate project quota field.
498 */
499int
500xfs_qm_scall_getqstatv(
501 struct xfs_mount *mp,
502 struct fs_quota_statv *out)
503{
504 struct xfs_quotainfo *q = mp->m_quotainfo;
505 struct xfs_inode *uip = NULL;
506 struct xfs_inode *gip = NULL;
507 struct xfs_inode *pip = NULL;
508 bool tempuqip = false;
509 bool tempgqip = false;
510 bool temppqip = false;
511
512 if (!xfs_sb_version_hasquota(&mp->m_sb)) {
513 out->qs_uquota.qfs_ino = NULLFSINO;
514 out->qs_gquota.qfs_ino = NULLFSINO;
515 out->qs_pquota.qfs_ino = NULLFSINO;
516 return (0);
517 }
518
427 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & 519 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
428 (XFS_ALL_QUOTA_ACCT| 520 (XFS_ALL_QUOTA_ACCT|
429 XFS_ALL_QUOTA_ENFD)); 521 XFS_ALL_QUOTA_ENFD));
430 out->qs_pad = 0;
431 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 522 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
432 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 523 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
524 out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
433 525
434 if (q) { 526 if (q) {
435 uip = q->qi_uquotaip; 527 uip = q->qi_uquotaip;
436 gip = q->qi_gquotaip; 528 gip = q->qi_gquotaip;
529 pip = q->qi_pquotaip;
437 } 530 }
438 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 531 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
439 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 532 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -445,18 +538,30 @@ xfs_qm_scall_getqstat(
445 0, 0, &gip) == 0) 538 0, 0, &gip) == 0)
446 tempgqip = true; 539 tempgqip = true;
447 } 540 }
541 if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
542 if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
543 0, 0, &pip) == 0)
544 temppqip = true;
545 }
448 if (uip) { 546 if (uip) {
449 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; 547 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
450 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; 548 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
451 if (tempuqip) 549 if (tempuqip)
452 IRELE(uip); 550 IRELE(uip);
453 } 551 }
552
454 if (gip) { 553 if (gip) {
455 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; 554 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
456 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; 555 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
457 if (tempgqip) 556 if (tempgqip)
458 IRELE(gip); 557 IRELE(gip);
459 } 558 }
559 if (pip) {
560 out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks;
561 out->qs_pquota.qfs_nextents = pip->i_d.di_nextents;
562 if (temppqip)
563 IRELE(pip);
564 }
460 if (q) { 565 if (q) {
461 out->qs_incoredqs = q->qi_dquots; 566 out->qs_incoredqs = q->qi_dquots;
462 out->qs_btimelimit = q->qi_btimelimit; 567 out->qs_btimelimit = q->qi_btimelimit;
@@ -515,8 +620,7 @@ xfs_qm_scall_setqlim(
515 xfs_dqunlock(dqp); 620 xfs_dqunlock(dqp);
516 621
517 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 622 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
518 error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), 623 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
519 0, 0, XFS_DEFAULT_LOG_COUNT);
520 if (error) { 624 if (error) {
521 xfs_trans_cancel(tp, 0); 625 xfs_trans_cancel(tp, 0);
522 goto out_rele; 626 goto out_rele;
@@ -650,8 +754,7 @@ xfs_qm_log_quotaoff_end(
650 754
651 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); 755 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
652 756
653 error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp), 757 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
654 0, 0, XFS_DEFAULT_LOG_COUNT);
655 if (error) { 758 if (error) {
656 xfs_trans_cancel(tp, 0); 759 xfs_trans_cancel(tp, 0);
657 return (error); 760 return (error);
@@ -684,8 +787,7 @@ xfs_qm_log_quotaoff(
684 uint oldsbqflag=0; 787 uint oldsbqflag=0;
685 788
686 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); 789 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
687 error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp), 790 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
688 0, 0, XFS_DEFAULT_LOG_COUNT);
689 if (error) 791 if (error)
690 goto error0; 792 goto error0;
691 793
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index b14f42c714b6..e7d84d2d8683 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,267 +18,14 @@
18#ifndef __XFS_QUOTA_H__ 18#ifndef __XFS_QUOTA_H__
19#define __XFS_QUOTA_H__ 19#define __XFS_QUOTA_H__
20 20
21struct xfs_trans; 21#include "xfs_quota_defs.h"
22
23/*
24 * The ondisk form of a dquot structure.
25 */
26#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
27#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
28
29/*
30 * uid_t and gid_t are hard-coded to 32 bits in the inode.
31 * Hence, an 'id' in a dquot is 32 bits..
 32 * Hence, an 'id' in a dquot is 32 bits.
33typedef __uint32_t xfs_dqid_t;
34
35/*
36 * Even though users may not have quota limits occupying all 64-bits,
37 * they may need 64-bit accounting. Hence, 64-bit quota-counters,
38 * and quota-limits. This is a waste in the common case, but hey ...
39 */
40typedef __uint64_t xfs_qcnt_t;
41typedef __uint16_t xfs_qwarncnt_t;
42
43/*
44 * This is the main portion of the on-disk representation of quota
45 * information for a user. This is the q_core of the xfs_dquot_t that
46 * is kept in kernel memory. We pad this with some more expansion room
47 * to construct the on disk structure.
48 */
49typedef struct xfs_disk_dquot {
50 __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
51 __u8 d_version; /* dquot version */
52 __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
53 __be32 d_id; /* user,project,group id */
54 __be64 d_blk_hardlimit;/* absolute limit on disk blks */
55 __be64 d_blk_softlimit;/* preferred limit on disk blks */
56 __be64 d_ino_hardlimit;/* maximum # allocated inodes */
57 __be64 d_ino_softlimit;/* preferred inode limit */
58 __be64 d_bcount; /* disk blocks owned by the user */
59 __be64 d_icount; /* inodes owned by the user */
60 __be32 d_itimer; /* zero if within inode limits if not,
61 this is when we refuse service */
62 __be32 d_btimer; /* similar to above; for disk blocks */
63 __be16 d_iwarns; /* warnings issued wrt num inodes */
64 __be16 d_bwarns; /* warnings issued wrt disk blocks */
65 __be32 d_pad0; /* 64 bit align */
66 __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
67 __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
68 __be64 d_rtbcount; /* realtime blocks owned */
69 __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
70 __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
71 __be16 d_pad;
72} xfs_disk_dquot_t;
73
74/*
75 * This is what goes on disk. This is separated from the xfs_disk_dquot because
76 * carrying the unnecessary padding would be a waste of memory.
77 */
78typedef struct xfs_dqblk {
79 xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
80 char dd_fill[4]; /* filling for posterity */
81
82 /*
83 * These two are only present on filesystems with the CRC bits set.
84 */
85 __be32 dd_crc; /* checksum */
86 __be64 dd_lsn; /* last modification in log */
87 uuid_t dd_uuid; /* location information */
88} xfs_dqblk_t;
89
90#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
91
92/*
93 * flags for q_flags field in the dquot.
94 */
95#define XFS_DQ_USER 0x0001 /* a user quota */
96#define XFS_DQ_PROJ 0x0002 /* project quota */
97#define XFS_DQ_GROUP 0x0004 /* a group quota */
98#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
 99#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
100
101#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
102
103#define XFS_DQ_FLAGS \
104 { XFS_DQ_USER, "USER" }, \
105 { XFS_DQ_PROJ, "PROJ" }, \
106 { XFS_DQ_GROUP, "GROUP" }, \
107 { XFS_DQ_DIRTY, "DIRTY" }, \
108 { XFS_DQ_FREEING, "FREEING" }
109
110/*
111 * We have the possibility of all three quota types being active at once, and
112 * hence free space modification requires modification of all three current
113 * dquots in a single transaction. For this case we need to have a reservation
114 * of at least 3 dquots.
115 *
116 * However, a chmod operation can change both UID and GID in a single
117 * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
118 * modified. Hence for this case we need to reserve space for at least 4 dquots.
119 *
120 * And in the worst case, there's a rename operation that can be modifying up to
121 * 4 inodes with dquots attached to them. In reality, the only inodes that can
122 * have their dquots modified are the source and destination directory inodes
123 * due to directory name creation and removal. That can require space allocation
124 * and/or freeing on both directory inodes, and hence all three dquots on each
125 * inode can be modified. And if the directories are world writeable, all the
126 * dquots can be unique and so 6 dquots can be modified....
127 *
128 * And, of course, we also need to take into account the dquot log format item
129 * used to describe each dquot.
130 */
131#define XFS_DQUOT_LOGRES(mp) \
132 ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
133
134/*
135 * These are the structures used to lay out dquots and quotaoff
136 * records on the log. Quite similar to those of inodes.
137 */
138
139/*
140 * log format struct for dquots.
141 * The first two fields must be the type and size fitting into
 142 * 32 bits: log_recovery code assumes that.
143 */
144typedef struct xfs_dq_logformat {
145 __uint16_t qlf_type; /* dquot log item type */
146 __uint16_t qlf_size; /* size of this item */
147 xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
148 __int64_t qlf_blkno; /* blkno of dquot buffer */
149 __int32_t qlf_len; /* len of dquot buffer */
150 __uint32_t qlf_boffset; /* off of dquot in buffer */
151} xfs_dq_logformat_t;
152
153/*
154 * log format struct for QUOTAOFF records.
155 * The first two fields must be the type and size fitting into
 156 * 32 bits: log_recovery code assumes that.
 157 * We write two LI_QUOTAOFF logitems per quotaoff; the last one keeps a pointer
158 * to the first and ensures that the first logitem is taken out of the AIL
159 * only when the last one is securely committed.
160 */
161typedef struct xfs_qoff_logformat {
162 unsigned short qf_type; /* quotaoff log item type */
163 unsigned short qf_size; /* size of this item */
164 unsigned int qf_flags; /* USR and/or GRP */
165 char qf_pad[12]; /* padding for future */
166} xfs_qoff_logformat_t;
167
168
169/*
 170 * Disk quota status in m_qflags, and also sb_qflags. 16 bits.
171 */
172#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
173#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
174#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
175#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
176#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
177#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
178#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
179
180/*
181 * Conversion to and from the combined OQUOTA flag (if necessary)
182 * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
183 */
184#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
185#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
186#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
187#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
188
189/*
190 * Quota Accounting/Enforcement flags
191 */
192#define XFS_ALL_QUOTA_ACCT \
193 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
194#define XFS_ALL_QUOTA_ENFD \
195 (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
196#define XFS_ALL_QUOTA_CHKD \
197 (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
198
199#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
200#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
201#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
202#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
203#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
204#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
205#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
206
207/*
208 * Incore only flags for quotaoff - these bits get cleared when quota(s)
209 * are in the process of getting turned off. These flags are in m_qflags but
210 * never in sb_qflags.
211 */
212#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
213#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
214#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
215#define XFS_ALL_QUOTA_ACTIVE \
216 (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
22
23/*
24 * Kernel only quota definitions and functions
25 */
26
27struct xfs_trans;
217
218/*
219 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
220 * quota will not be switched off as long as that inode lock is held.
221 */
222#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
223 XFS_GQUOTA_ACTIVE | \
224 XFS_PQUOTA_ACTIVE))
225#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
226 XFS_PQUOTA_ACTIVE))
227#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
228#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
229#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
230
231/*
232 * Flags to tell various functions what to do. Not all of these are meaningful
233 * to a single function. None of these XFS_QMOPT_* flags are meant to have
234 * persistent values (ie. their values can and will change between versions)
235 */
236#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
237#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
238#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
239#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
240#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
241#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
242#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
243#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
244#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
245
246/*
247 * flags to xfs_trans_mod_dquot to indicate which field needs to be
248 * modified.
249 */
250#define XFS_QMOPT_RES_REGBLKS 0x0010000
251#define XFS_QMOPT_RES_RTBLKS 0x0020000
252#define XFS_QMOPT_BCOUNT 0x0040000
253#define XFS_QMOPT_ICOUNT 0x0080000
254#define XFS_QMOPT_RTBCOUNT 0x0100000
255#define XFS_QMOPT_DELBCOUNT 0x0200000
256#define XFS_QMOPT_DELRTBCOUNT 0x0400000
257#define XFS_QMOPT_RES_INOS 0x0800000
258
259/*
260 * flags for dqalloc.
261 */
262#define XFS_QMOPT_INHERIT 0x1000000
263
264/*
265 * flags to xfs_trans_mod_dquot.
266 */
267#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
268#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
269#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
270#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
271#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
272#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
273#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
274#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
275
276
277#define XFS_QMOPT_QUOTALL \
278 (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
279#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
280
281#ifdef __KERNEL__
282/*
283 * This check is done typically without holding the inode lock;
284 * that may seem racy, but it is harmless in the context that it is used.
@@ -301,13 +48,6 @@ typedef struct xfs_qoff_logformat {
301 (XFS_IS_PQUOTA_ON(mp) && \
302 (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
303
304#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
305 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
306 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
307 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
308 XFS_PQUOTA_CHKD)
309
310
311/*
312 * The structure kept inside the xfs_trans_t keeps track of dquot changes
313 * within a transaction and applies them later.
@@ -340,8 +80,9 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
340 struct xfs_mount *, struct xfs_dquot *,
341 struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
342
343extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
344 struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **);
83extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
84 prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
85 struct xfs_dquot **);
345extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
346 struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
347extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
@@ -362,9 +103,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
362
363#else
364static inline int
365xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
366 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp,
367 struct xfs_dquot **pdqp)
106xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
107 prid_t prid, uint flags, struct xfs_dquot **udqp,
108 struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
368{
369 *udqp = NULL;
370 *gdqp = NULL;
@@ -415,5 +156,4 @@ extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
415
416extern const struct xfs_buf_ops xfs_dquot_buf_ops;
417
418#endif /* __KERNEL__ */
419#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h
new file mode 100644
index 000000000000..e6b0d6e1f4f2
--- /dev/null
+++ b/fs/xfs/xfs_quota_defs.h
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_QUOTA_DEFS_H__
19#define __XFS_QUOTA_DEFS_H__
20
21/*
22 * Quota definitions shared between user and kernel source trees.
23 */
24
25/*
26 * Even though users may not have quota limits occupying all 64-bits,
27 * they may need 64-bit accounting. Hence, 64-bit quota-counters,
28 * and quota-limits. This is a waste in the common case, but hey ...
29 */
30typedef __uint64_t xfs_qcnt_t;
31typedef __uint16_t xfs_qwarncnt_t;
32
33/*
34 * flags for q_flags field in the dquot.
35 */
36#define XFS_DQ_USER 0x0001 /* a user quota */
37#define XFS_DQ_PROJ 0x0002 /* project quota */
38#define XFS_DQ_GROUP 0x0004 /* a group quota */
39#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
40#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
41
42#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
43
44#define XFS_DQ_FLAGS \
45 { XFS_DQ_USER, "USER" }, \
46 { XFS_DQ_PROJ, "PROJ" }, \
47 { XFS_DQ_GROUP, "GROUP" }, \
48 { XFS_DQ_DIRTY, "DIRTY" }, \
49 { XFS_DQ_FREEING, "FREEING" }
50
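
The { value, "NAME" } pairs above feed the trace code's flag printing (in the kernel, the __print_flags() tracepoint helper). A stand-alone sketch of the same idea, reusing the flag values defined above:

#include <stdio.h>

#define XFS_DQ_USER    0x0001
#define XFS_DQ_PROJ    0x0002
#define XFS_DQ_GROUP   0x0004
#define XFS_DQ_DIRTY   0x0008
#define XFS_DQ_FREEING 0x0010

static const struct { unsigned int mask; const char *name; } dq_flag_names[] = {
	{ XFS_DQ_USER, "USER" }, { XFS_DQ_PROJ, "PROJ" },
	{ XFS_DQ_GROUP, "GROUP" }, { XFS_DQ_DIRTY, "DIRTY" },
	{ XFS_DQ_FREEING, "FREEING" },
};

/* Render a q_flags word symbolically, the way a tracepoint would. */
static void print_dq_flags(unsigned int flags)
{
	const char *sep = "";
	for (size_t i = 0; i < sizeof(dq_flag_names) / sizeof(dq_flag_names[0]); i++) {
		if (flags & dq_flag_names[i].mask) {
			printf("%s%s", sep, dq_flag_names[i].name);
			sep = "|";
		}
	}
	putchar('\n');
}

int main(void)
{
	print_dq_flags(XFS_DQ_USER | XFS_DQ_DIRTY);	/* prints USER|DIRTY */
	return 0;
}
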
51/*
52 * We have the possibility of all three quota types being active at once, and
53 * hence free space modification requires modification of all three current
54 * dquots in a single transaction. For this case we need to have a reservation
55 * of at least 3 dquots.
56 *
57 * However, a chmod operation can change both UID and GID in a single
58 * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
59 * modified. Hence for this case we need to reserve space for at least 4 dquots.
60 *
61 * And in the worst case, there's a rename operation that can be modifying up to
62 * 4 inodes with dquots attached to them. In reality, the only inodes that can
63 * have their dquots modified are the source and destination directory inodes
64 * due to directory name creation and removal. That can require space allocation
65 * and/or freeing on both directory inodes, and hence all three dquots on each
66 * inode can be modified. And if the directories are world writeable, all the
67 * dquots can be unique and so 6 dquots can be modified....
68 *
69 * And, of course, we also need to take into account the dquot log format item
70 * used to describe each dquot.
71 */
72#define XFS_DQUOT_LOGRES(mp) \
73 ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
74
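
To make the worst case above concrete: six dquots, each logged as a dquot log-format header plus the on-disk dquot core. A hedged user-space illustration of the arithmetic; the two sizes below are assumptions for the sketch, the real values come from sizeof() of the structs named in the macro:

#include <stdio.h>

/* Illustrative sizes only; the real code uses
 * sizeof(struct xfs_dq_logformat) and sizeof(struct xfs_disk_dquot). */
#define DQ_LOGFORMAT_SIZE	24u	/* assumed */
#define DISK_DQUOT_SIZE		104u	/* assumed */

int main(void)
{
	/* Worst case from the comment above: a rename touching two
	 * world-writeable directories, 3 dquot types each = 6 dquots. */
	unsigned int ndquots = 6;
	unsigned int logres = ndquots * (DQ_LOGFORMAT_SIZE + DISK_DQUOT_SIZE);

	printf("worst-case dquot log reservation: %u bytes\n", logres);
	return 0;
}
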
75#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
76#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
77#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
78#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
79#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
80#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
81#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
82
83/*
84 * Incore only flags for quotaoff - these bits get cleared when quota(s)
85 * are in the process of getting turned off. These flags are in m_qflags but
86 * never in sb_qflags.
87 */
88#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
89#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
90#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
91#define XFS_ALL_QUOTA_ACTIVE \
92 (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
93
94/*
95 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
96 * quota will not be switched off as long as that inode lock is held.
97 */
98#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
99 XFS_GQUOTA_ACTIVE | \
100 XFS_PQUOTA_ACTIVE))
101#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
102 XFS_PQUOTA_ACTIVE))
103#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
104#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
105#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
106
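
Note the split this encodes: the *_RUNNING checks earlier test the ACCT bits, while the *_ON checks test the incore ACTIVE bits that quotaoff clears first, which is what makes the inode-lock guarantee work. A small sketch of the two predicates side by side, with the mount structure reduced to the single field used:

#include <stdio.h>

#define XFS_UQUOTA_ACCT   0x0001	/* accounting bit, also on disk */
#define XFS_UQUOTA_ACTIVE 0x1000	/* incore only, cleared by quotaoff */

struct mount { unsigned int m_qflags; };

#define IS_UQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_UQUOTA_ACCT)
#define IS_UQUOTA_ON(mp)	((mp)->m_qflags & XFS_UQUOTA_ACTIVE)

int main(void)
{
	/* Mid-quotaoff: accounting still set, ACTIVE already cleared. */
	struct mount m = { .m_qflags = XFS_UQUOTA_ACCT };

	printf("running=%d on=%d\n",
	       !!IS_UQUOTA_RUNNING(&m), !!IS_UQUOTA_ON(&m));	/* running=1 on=0 */
	return 0;
}
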
107/*
108 * Flags to tell various functions what to do. Not all of these are meaningful
109 * to a single function. None of these XFS_QMOPT_* flags are meant to have
110 * persistent values (ie. their values can and will change between versions)
111 */
112#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
113#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
114#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
115#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
116#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
117#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
118#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
119#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
120#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
121
122/*
123 * flags to xfs_trans_mod_dquot to indicate which field needs to be
124 * modified.
125 */
126#define XFS_QMOPT_RES_REGBLKS 0x0010000
127#define XFS_QMOPT_RES_RTBLKS 0x0020000
128#define XFS_QMOPT_BCOUNT 0x0040000
129#define XFS_QMOPT_ICOUNT 0x0080000
130#define XFS_QMOPT_RTBCOUNT 0x0100000
131#define XFS_QMOPT_DELBCOUNT 0x0200000
132#define XFS_QMOPT_DELRTBCOUNT 0x0400000
133#define XFS_QMOPT_RES_INOS 0x0800000
134
135/*
136 * flags for dqalloc.
137 */
138#define XFS_QMOPT_INHERIT 0x1000000
139
140/*
141 * flags to xfs_trans_mod_dquot.
142 */
143#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
144#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
145#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
146#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
147#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
148#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
149#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
150#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
151
152
153#define XFS_QMOPT_QUOTALL \
154 (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
155#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
156
157#endif /* __XFS_QUOTA_DEFS_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 20e30f93b0c7..1326d81596c2 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -16,8 +16,10 @@
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "xfs.h"
-#include "xfs_sb.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_log.h"
+#include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
@@ -54,6 +56,18 @@ xfs_fs_get_xstate(
 }

 STATIC int
+xfs_fs_get_xstatev(
+ struct super_block *sb,
+ struct fs_quota_statv *fqs)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+  return -ENOSYS;
+ return -xfs_qm_scall_getqstatv(mp, fqs);
+}
+
+STATIC int
 xfs_fs_set_xstate(
  struct super_block *sb,
  unsigned int uflags,
@@ -133,6 +147,7 @@ xfs_fs_set_dqblk(
 }

 const struct quotactl_ops xfs_quotactl_operations = {
+ .get_xstatev = xfs_fs_get_xstatev,
  .get_xstate = xfs_fs_get_xstate,
  .set_xstate = xfs_fs_set_xstate,
  .get_dqblk = xfs_fs_get_dqblk,
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
deleted file mode 100644
index 30ff5f401d28..000000000000
--- a/fs/xfs/xfs_rename.c
+++ /dev/null
@@ -1,346 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_trans.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_dir2.h"
26#include "xfs_mount.h"
27#include "xfs_da_btree.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_dinode.h"
30#include "xfs_inode.h"
31#include "xfs_inode_item.h"
32#include "xfs_bmap.h"
33#include "xfs_error.h"
34#include "xfs_quota.h"
35#include "xfs_utils.h"
36#include "xfs_trans_space.h"
37#include "xfs_vnodeops.h"
38#include "xfs_trace.h"
39
40
41/*
42 * Enter all inodes for a rename transaction into a sorted array.
43 */
44STATIC void
45xfs_sort_for_rename(
46 xfs_inode_t *dp1, /* in: old (source) directory inode */
47 xfs_inode_t *dp2, /* in: new (target) directory inode */
48 xfs_inode_t *ip1, /* in: inode of old entry */
49 xfs_inode_t *ip2, /* in: inode of new entry, if it
50 already exists, NULL otherwise. */
51 xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
52 int *num_inodes) /* out: number of inodes in array */
53{
54 xfs_inode_t *temp;
55 int i, j;
56
57 /*
58 * i_tab contains a list of pointers to inodes. We initialize
59 * the table here & we'll sort it. We will then use it to
60 * order the acquisition of the inode locks.
61 *
62 * Note that the table may contain duplicates. e.g., dp1 == dp2.
63 */
64 i_tab[0] = dp1;
65 i_tab[1] = dp2;
66 i_tab[2] = ip1;
67 if (ip2) {
68 *num_inodes = 4;
69 i_tab[3] = ip2;
70 } else {
71 *num_inodes = 3;
72 i_tab[3] = NULL;
73 }
74
75 /*
76 * Sort the elements via bubble sort. (Remember, there are at
77 * most 4 elements to sort, so this is adequate.)
78 */
79 for (i = 0; i < *num_inodes; i++) {
80 for (j = 1; j < *num_inodes; j++) {
81 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
82 temp = i_tab[j];
83 i_tab[j] = i_tab[j-1];
84 i_tab[j-1] = temp;
85 }
86 }
87 }
88}
89
90/*
91 * xfs_rename
92 */
93int
94xfs_rename(
95 xfs_inode_t *src_dp,
96 struct xfs_name *src_name,
97 xfs_inode_t *src_ip,
98 xfs_inode_t *target_dp,
99 struct xfs_name *target_name,
100 xfs_inode_t *target_ip)
101{
102 xfs_trans_t *tp = NULL;
103 xfs_mount_t *mp = src_dp->i_mount;
104 int new_parent; /* moving to a new dir */
105 int src_is_directory; /* src_name is a directory */
106 int error;
107 xfs_bmap_free_t free_list;
108 xfs_fsblock_t first_block;
109 int cancel_flags;
110 int committed;
111 xfs_inode_t *inodes[4];
112 int spaceres;
113 int num_inodes;
114
115 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
116
117 new_parent = (src_dp != target_dp);
118 src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
119
120 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
121 inodes, &num_inodes);
122
123 xfs_bmap_init(&free_list, &first_block);
124 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
125 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
126 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
127 error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
128 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
129 if (error == ENOSPC) {
130 spaceres = 0;
131 error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
132 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
133 }
134 if (error) {
135 xfs_trans_cancel(tp, 0);
136 goto std_return;
137 }
138
139 /*
140 * Attach the dquots to the inodes
141 */
142 error = xfs_qm_vop_rename_dqattach(inodes);
143 if (error) {
144 xfs_trans_cancel(tp, cancel_flags);
145 goto std_return;
146 }
147
148 /*
149 * Lock all the participating inodes. Depending upon whether
150 * the target_name exists in the target directory, and
151 * whether the target directory is the same as the source
152 * directory, we can lock from 2 to 4 inodes.
153 */
154 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
155
156 /*
157 * Join all the inodes to the transaction. From this point on,
158 * we can rely on either trans_commit or trans_cancel to unlock
159 * them.
160 */
161 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
162 if (new_parent)
163 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
164 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
165 if (target_ip)
166 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
167
168 /*
169 * If we are using project inheritance, we only allow renames
170 * into our tree when the project IDs are the same; else the
171 * tree quota mechanism would be circumvented.
172 */
173 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
174 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
175 error = XFS_ERROR(EXDEV);
176 goto error_return;
177 }
178
179 /*
180 * Set up the target.
181 */
182 if (target_ip == NULL) {
183 /*
184 * If there's no space reservation, check the entry will
185 * fit before actually inserting it.
186 */
187 error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
188 if (error)
189 goto error_return;
190 /*
191 * If target does not exist and the rename crosses
192 * directories, adjust the target directory link count
193 * to account for the ".." reference from the new entry.
194 */
195 error = xfs_dir_createname(tp, target_dp, target_name,
196 src_ip->i_ino, &first_block,
197 &free_list, spaceres);
198 if (error == ENOSPC)
199 goto error_return;
200 if (error)
201 goto abort_return;
202
203 xfs_trans_ichgtime(tp, target_dp,
204 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
205
206 if (new_parent && src_is_directory) {
207 error = xfs_bumplink(tp, target_dp);
208 if (error)
209 goto abort_return;
210 }
211 } else { /* target_ip != NULL */
212 /*
213 * If target exists and it's a directory, check that both
214 * target and source are directories and that target can be
215 * destroyed, or that neither is a directory.
216 */
217 if (S_ISDIR(target_ip->i_d.di_mode)) {
218 /*
219 * Make sure target dir is empty.
220 */
221 if (!(xfs_dir_isempty(target_ip)) ||
222 (target_ip->i_d.di_nlink > 2)) {
223 error = XFS_ERROR(EEXIST);
224 goto error_return;
225 }
226 }
227
228 /*
229 * Link the source inode under the target name.
230 * If the source inode is a directory and we are moving
231 * it across directories, its ".." entry will be
232 * inconsistent until we replace that down below.
233 *
234 * In case there is already an entry with the same
235 * name at the destination directory, remove it first.
236 */
237 error = xfs_dir_replace(tp, target_dp, target_name,
238 src_ip->i_ino,
239 &first_block, &free_list, spaceres);
240 if (error)
241 goto abort_return;
242
243 xfs_trans_ichgtime(tp, target_dp,
244 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
245
246 /*
247 * Decrement the link count on the target since the target
248 * dir no longer points to it.
249 */
250 error = xfs_droplink(tp, target_ip);
251 if (error)
252 goto abort_return;
253
254 if (src_is_directory) {
255 /*
256 * Drop the link from the old "." entry.
257 */
258 error = xfs_droplink(tp, target_ip);
259 if (error)
260 goto abort_return;
261 }
262 } /* target_ip != NULL */
263
264 /*
265 * Remove the source.
266 */
267 if (new_parent && src_is_directory) {
268 /*
269 * Rewrite the ".." entry to point to the new
270 * directory.
271 */
272 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
273 target_dp->i_ino,
274 &first_block, &free_list, spaceres);
275 ASSERT(error != EEXIST);
276 if (error)
277 goto abort_return;
278 }
279
280 /*
281 * We always want to hit the ctime on the source inode.
282 *
283 * This isn't strictly required by the standards since the source
284 * inode isn't really being changed, but old unix file systems did
285 * it and some incremental backup programs won't work without it.
286 */
287 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
288 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
289
290 /*
291 * Adjust the link count on src_dp. This is necessary when
292 * renaming a directory, either within one parent when
293 * the target existed, or across two parent directories.
294 */
295 if (src_is_directory && (new_parent || target_ip != NULL)) {
296
297 /*
298 * Decrement link count on src_directory since the
299 * entry that's moved no longer points to it.
300 */
301 error = xfs_droplink(tp, src_dp);
302 if (error)
303 goto abort_return;
304 }
305
306 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
307 &first_block, &free_list, spaceres);
308 if (error)
309 goto abort_return;
310
311 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
312 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
313 if (new_parent)
314 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
315
316 /*
317 * If this is a synchronous mount, make sure that the
318 * rename transaction goes to disk before returning to
319 * the user.
320 */
321 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
322 xfs_trans_set_sync(tp);
323 }
324
325 error = xfs_bmap_finish(&tp, &free_list, &committed);
326 if (error) {
327 xfs_bmap_cancel(&free_list);
328 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
329 XFS_TRANS_ABORT));
330 goto std_return;
331 }
332
333 /*
334 * trans_commit will unlock src_ip, target_ip & decrement
335 * the vnode references.
336 */
337 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
338
339 abort_return:
340 cancel_flags |= XFS_TRANS_ABORT;
341 error_return:
342 xfs_bmap_cancel(&free_list);
343 xfs_trans_cancel(tp, cancel_flags);
344 std_return:
345 return error;
346}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 98dc670d3ee0..6f9e63c9fc26 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -17,25 +17,24 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_rtalloc.h"
 #include "xfs_fsops.h"
 #include "xfs_error.h"
 #include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
-#include "xfs_utils.h"
 #include "xfs_trace.h"
 #include "xfs_buf.h"
 #include "xfs_icache.h"
@@ -101,10 +100,9 @@ xfs_growfs_rt_alloc(
  /*
   * Reserve space & log for one extent added to the file.
   */
- if ((error = xfs_trans_reserve(tp, resblks,
-   XFS_GROWRTALLOC_LOG_RES(mp), 0,
-   XFS_TRANS_PERM_LOG_RES,
-   XFS_DEFAULT_PERM_LOG_COUNT)))
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+   resblks, 0);
+ if (error)
   goto error_cancel;
  cancelflags = XFS_TRANS_RELEASE_LOG_RES;
  /*
@@ -147,8 +145,9 @@ xfs_growfs_rt_alloc(
  /*
   * Reserve log for one block zeroing.
   */
- if ((error = xfs_trans_reserve(tp, 0,
-   XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
+   0, 0);
+ if (error)
   goto error_cancel;
  /*
   * Lock the bitmap inode.
@@ -736,8 +735,8 @@ xfs_rtallocate_range(
 {
  xfs_rtblock_t end;  /* end of the allocated extent */
  int error;  /* error value */
- xfs_rtblock_t postblock; /* first block allocated > end */
- xfs_rtblock_t preblock; /* first block allocated < start */
+ xfs_rtblock_t postblock = 0; /* first block allocated > end */
+ xfs_rtblock_t preblock = 0; /* first block allocated < start */

  end = start + len - 1;
  /*
@@ -1958,8 +1957,9 @@ xfs_growfs_rt(
   * Start a transaction, get the log reservation.
   */
  tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
- if ((error = xfs_trans_reserve(tp, 0,
-   XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
+   0, 0);
+ if (error)
   goto error_cancel;
  /*
   * Lock out other callers by grabbing the bitmap inode lock.
@@ -2148,7 +2148,7 @@ xfs_rtfree_extent(
  ASSERT(mp->m_rbmip->i_itemp != NULL);
  ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));

-#if defined(__KERNEL__) && defined(DEBUG)
+#ifdef DEBUG
  /*
   * Check to see that this whole range is currently allocated.
   */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index f7f3a359c1c5..b2a1a24c0e2f 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -18,58 +18,11 @@
 #ifndef __XFS_RTALLOC_H__
 #define __XFS_RTALLOC_H__

+/* kernel only definitions and functions */
+
 struct xfs_mount;
 struct xfs_trans;

-/* Min and max rt extent sizes, specified in bytes */
-#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
-#define XFS_DFL_RTEXTSIZE (64 * 1024)  /* 64kB */
-#define XFS_MIN_RTEXTSIZE (4 * 1024)  /* 4kB */
-
-/*
- * Constants for bit manipulations.
- */
-#define XFS_NBBYLOG 3  /* log2(NBBY) */
-#define XFS_WORDLOG 2  /* log2(sizeof(xfs_rtword_t)) */
-#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
-#define XFS_NBWORD (1 << XFS_NBWORDLOG)
-#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
-
-#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
-#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
-#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
-#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
-
-/*
- * Summary and bit manipulation macros.
- */
-#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
-#define XFS_SUMOFFSTOBLOCK(mp,s) \
- (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
-#define XFS_SUMPTR(mp,bp,so) \
- ((xfs_suminfo_t *)((bp)->b_addr + \
-  (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
-
-#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
-#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
-#define XFS_BITTOWORD(mp,bi) \
- ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
-
-#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
-#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
-
-#define XFS_RTLOBIT(w) xfs_lowbit32(w)
-#define XFS_RTHIBIT(w) xfs_highbit32(w)
-
-#if XFS_BIG_BLKNOS
-#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
-#else
-#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
-#endif
-
-
-#ifdef __KERNEL__
-
 #ifdef CONFIG_XFS_RT
 /*
  * Function prototypes for exported functions.
@@ -161,6 +114,4 @@ xfs_rtmount_init(
 # define xfs_rtunmount_inodes(m)
 #endif /* CONFIG_XFS_RT */

-#endif /* __KERNEL__ */
-
 #endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
new file mode 100644
index 000000000000..a5b59d92eb70
--- /dev/null
+++ b/fs/xfs/xfs_sb.c
@@ -0,0 +1,834 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_format.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_da_btree.h"
30#include "xfs_dir2_format.h"
31#include "xfs_dir2.h"
32#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_alloc.h"
40#include "xfs_rtalloc.h"
41#include "xfs_bmap.h"
42#include "xfs_error.h"
43#include "xfs_quota.h"
44#include "xfs_fsops.h"
45#include "xfs_trace.h"
46#include "xfs_cksum.h"
47#include "xfs_buf_item.h"
48
49/*
50 * Physical superblock buffer manipulations. Shared with libxfs in userspace.
51 */
52
53static const struct {
54 short offset;
55 short type; /* 0 = integer
56 * 1 = binary / string (no translation)
57 */
58} xfs_sb_info[] = {
59 { offsetof(xfs_sb_t, sb_magicnum), 0 },
60 { offsetof(xfs_sb_t, sb_blocksize), 0 },
61 { offsetof(xfs_sb_t, sb_dblocks), 0 },
62 { offsetof(xfs_sb_t, sb_rblocks), 0 },
63 { offsetof(xfs_sb_t, sb_rextents), 0 },
64 { offsetof(xfs_sb_t, sb_uuid), 1 },
65 { offsetof(xfs_sb_t, sb_logstart), 0 },
66 { offsetof(xfs_sb_t, sb_rootino), 0 },
67 { offsetof(xfs_sb_t, sb_rbmino), 0 },
68 { offsetof(xfs_sb_t, sb_rsumino), 0 },
69 { offsetof(xfs_sb_t, sb_rextsize), 0 },
70 { offsetof(xfs_sb_t, sb_agblocks), 0 },
71 { offsetof(xfs_sb_t, sb_agcount), 0 },
72 { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
73 { offsetof(xfs_sb_t, sb_logblocks), 0 },
74 { offsetof(xfs_sb_t, sb_versionnum), 0 },
75 { offsetof(xfs_sb_t, sb_sectsize), 0 },
76 { offsetof(xfs_sb_t, sb_inodesize), 0 },
77 { offsetof(xfs_sb_t, sb_inopblock), 0 },
78 { offsetof(xfs_sb_t, sb_fname[0]), 1 },
79 { offsetof(xfs_sb_t, sb_blocklog), 0 },
80 { offsetof(xfs_sb_t, sb_sectlog), 0 },
81 { offsetof(xfs_sb_t, sb_inodelog), 0 },
82 { offsetof(xfs_sb_t, sb_inopblog), 0 },
83 { offsetof(xfs_sb_t, sb_agblklog), 0 },
84 { offsetof(xfs_sb_t, sb_rextslog), 0 },
85 { offsetof(xfs_sb_t, sb_inprogress), 0 },
86 { offsetof(xfs_sb_t, sb_imax_pct), 0 },
87 { offsetof(xfs_sb_t, sb_icount), 0 },
88 { offsetof(xfs_sb_t, sb_ifree), 0 },
89 { offsetof(xfs_sb_t, sb_fdblocks), 0 },
90 { offsetof(xfs_sb_t, sb_frextents), 0 },
91 { offsetof(xfs_sb_t, sb_uquotino), 0 },
92 { offsetof(xfs_sb_t, sb_gquotino), 0 },
93 { offsetof(xfs_sb_t, sb_qflags), 0 },
94 { offsetof(xfs_sb_t, sb_flags), 0 },
95 { offsetof(xfs_sb_t, sb_shared_vn), 0 },
96 { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
97 { offsetof(xfs_sb_t, sb_unit), 0 },
98 { offsetof(xfs_sb_t, sb_width), 0 },
99 { offsetof(xfs_sb_t, sb_dirblklog), 0 },
100 { offsetof(xfs_sb_t, sb_logsectlog), 0 },
101 { offsetof(xfs_sb_t, sb_logsectsize), 0 },
102 { offsetof(xfs_sb_t, sb_logsunit), 0 },
103 { offsetof(xfs_sb_t, sb_features2), 0 },
104 { offsetof(xfs_sb_t, sb_bad_features2), 0 },
105 { offsetof(xfs_sb_t, sb_features_compat), 0 },
106 { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
107 { offsetof(xfs_sb_t, sb_features_incompat), 0 },
108 { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
109 { offsetof(xfs_sb_t, sb_crc), 0 },
110 { offsetof(xfs_sb_t, sb_pad), 0 },
111 { offsetof(xfs_sb_t, sb_pquotino), 0 },
112 { offsetof(xfs_sb_t, sb_lsn), 0 },
113 { sizeof(xfs_sb_t), 0 }
114};
115
116/*
117 * Reference counting access wrappers to the perag structures.
118 * Because we never free per-ag structures, the only thing we
119 * have to protect against changes is the tree structure itself.
120 */
121struct xfs_perag *
122xfs_perag_get(
123 struct xfs_mount *mp,
124 xfs_agnumber_t agno)
125{
126 struct xfs_perag *pag;
127 int ref = 0;
128
129 rcu_read_lock();
130 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
131 if (pag) {
132 ASSERT(atomic_read(&pag->pag_ref) >= 0);
133 ref = atomic_inc_return(&pag->pag_ref);
134 }
135 rcu_read_unlock();
136 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
137 return pag;
138}
139
140/*
141 * search from @first to find the next perag with the given tag set.
142 */
143struct xfs_perag *
144xfs_perag_get_tag(
145 struct xfs_mount *mp,
146 xfs_agnumber_t first,
147 int tag)
148{
149 struct xfs_perag *pag;
150 int found;
151 int ref;
152
153 rcu_read_lock();
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, first, 1, tag);
156 if (found <= 0) {
157 rcu_read_unlock();
158 return NULL;
159 }
160 ref = atomic_inc_return(&pag->pag_ref);
161 rcu_read_unlock();
162 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
163 return pag;
164}
165
166void
167xfs_perag_put(
168 struct xfs_perag *pag)
169{
170 int ref;
171
172 ASSERT(atomic_read(&pag->pag_ref) > 0);
173 ref = atomic_dec_return(&pag->pag_ref);
174 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
175}
176
177/*
178 * Check the validity of the SB found.
179 */
180STATIC int
181xfs_mount_validate_sb(
182 xfs_mount_t *mp,
183 xfs_sb_t *sbp,
184 bool check_inprogress,
185 bool check_version)
186{
187
188 /*
189 * If the log device and data device have the
190 * same device number, the log is internal.
191 * Consequently, the sb_logstart should be non-zero. If
192 * we have a zero sb_logstart in this case, we may be trying to mount
193 * a volume filesystem in a non-volume manner.
194 */
195 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
196 xfs_warn(mp, "bad magic number");
197 return XFS_ERROR(EWRONGFS);
198 }
199
200
201 if (!xfs_sb_good_version(sbp)) {
202 xfs_warn(mp, "bad version");
203 return XFS_ERROR(EWRONGFS);
204 }
205
206 /*
207 * Version 5 superblock feature mask validation. Reject combinations the
208 * kernel cannot support up front before checking anything else. For
209 * write validation, we don't need to check feature masks.
210 */
211 if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
212 xfs_alert(mp,
213"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
214"Use of these features in this kernel is at your own risk!");
215
216 if (xfs_sb_has_compat_feature(sbp,
217 XFS_SB_FEAT_COMPAT_UNKNOWN)) {
218 xfs_warn(mp,
219"Superblock has unknown compatible features (0x%x) enabled.\n"
220"Using a more recent kernel is recommended.",
221 (sbp->sb_features_compat &
222 XFS_SB_FEAT_COMPAT_UNKNOWN));
223 }
224
225 if (xfs_sb_has_ro_compat_feature(sbp,
226 XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
227 xfs_alert(mp,
228"Superblock has unknown read-only compatible features (0x%x) enabled.",
229 (sbp->sb_features_ro_compat &
230 XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
231 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
232 xfs_warn(mp,
233"Attempted to mount read-only compatible filesystem read-write.\n"
234"Filesystem can only be safely mounted read only.");
235 return XFS_ERROR(EINVAL);
236 }
237 }
238 if (xfs_sb_has_incompat_feature(sbp,
239 XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
240 xfs_warn(mp,
241"Superblock has unknown incompatible features (0x%x) enabled.\n"
242"Filesystem can not be safely mounted by this kernel.",
243 (sbp->sb_features_incompat &
244 XFS_SB_FEAT_INCOMPAT_UNKNOWN));
245 return XFS_ERROR(EINVAL);
246 }
247 }
248
249 if (xfs_sb_version_has_pquotino(sbp)) {
250 if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
251 xfs_notice(mp,
252 "Version 5 of Super block has XFS_OQUOTA bits.\n");
253 return XFS_ERROR(EFSCORRUPTED);
254 }
255 } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
256 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
257 xfs_notice(mp,
258"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.\n");
259 return XFS_ERROR(EFSCORRUPTED);
260 }
261
262 if (unlikely(
263 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
264 xfs_warn(mp,
265 "filesystem is marked as having an external log; "
266 "specify logdev on the mount command line.");
267 return XFS_ERROR(EINVAL);
268 }
269
270 if (unlikely(
271 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
272 xfs_warn(mp,
273 "filesystem is marked as having an internal log; "
274 "do not specify logdev on the mount command line.");
275 return XFS_ERROR(EINVAL);
276 }
277
278 /*
279 * More sanity checking. Most of these were stolen directly from
280 * xfs_repair.
281 */
282 if (unlikely(
283 sbp->sb_agcount <= 0 ||
284 sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
285 sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
286 sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
287 sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
288 sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
289 sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
290 sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
291 sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
292 sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
293 sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
294 sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
295 sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
296 sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
297 sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
298 sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
299 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
300 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
301 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
302 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
303 sbp->sb_dblocks == 0 ||
304 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
305 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
306 XFS_CORRUPTION_ERROR("SB sanity check failed",
307 XFS_ERRLEVEL_LOW, mp, sbp);
308 return XFS_ERROR(EFSCORRUPTED);
309 }
310
311 /*
312 * Until this is fixed only page-sized or smaller data blocks work.
313 */
314 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
315 xfs_warn(mp,
316 "File system with blocksize %d bytes. "
317 "Only pagesize (%ld) or less will currently work.",
318 sbp->sb_blocksize, PAGE_SIZE);
319 return XFS_ERROR(ENOSYS);
320 }
321
322 /*
323 * Currently only very few inode sizes are supported.
324 */
325 switch (sbp->sb_inodesize) {
326 case 256:
327 case 512:
328 case 1024:
329 case 2048:
330 break;
331 default:
332 xfs_warn(mp, "inode size of %d bytes not supported",
333 sbp->sb_inodesize);
334 return XFS_ERROR(ENOSYS);
335 }
336
337 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
338 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
339 xfs_warn(mp,
340 "file system too large to be mounted on this system.");
341 return XFS_ERROR(EFBIG);
342 }
343
344 if (check_inprogress && sbp->sb_inprogress) {
345 xfs_warn(mp, "Offline file system operation in progress!");
346 return XFS_ERROR(EFSCORRUPTED);
347 }
348
349 /*
350 * Version 1 directory format has never worked on Linux.
351 */
352 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
353 xfs_warn(mp, "file system using version 1 directory format");
354 return XFS_ERROR(ENOSYS);
355 }
356
357 return 0;
358}
359
360void
361xfs_sb_quota_from_disk(struct xfs_sb *sbp)
362{
363 /*
364 * Older mkfs doesn't initialize quota inodes to NULLFSINO. This
365 * leaves the in-core values with two different ways for a quota
366 * inode to be invalid: 0 and NULLFSINO. Normalize them to the
367 * single value NULLFSINO.
368 *
369 * Note that this change affects only the in-core values. These
370 * values are not written back to disk unless any quota information
371 * is written to the disk. Even in that case, the sb_pquotino field
372 * is not written to disk unless the superblock supports pquotino.
373 */
374 if (sbp->sb_uquotino == 0)
375 sbp->sb_uquotino = NULLFSINO;
376 if (sbp->sb_gquotino == 0)
377 sbp->sb_gquotino = NULLFSINO;
378 if (sbp->sb_pquotino == 0)
379 sbp->sb_pquotino = NULLFSINO;
380
381 /*
382 * We need to do these manipulations only if we are working
383 * with an older version of the on-disk superblock.
384 */
385 if (xfs_sb_version_has_pquotino(sbp))
386 return;
387
388 if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
389 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
390 XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
391 if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
392 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
393 XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
394 sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
395
396 if (sbp->sb_qflags & XFS_PQUOTA_ACCT) {
397 /*
398 * In older versions of the superblock, the on-disk superblock only
399 * has sb_gquotino, while the in-core superblock has both sb_gquotino
400 * and sb_pquotino. Only one of them is supported at any point in
401 * time, so if PQUOTA is set in the disk superblock, copy
402 * sb_gquotino over to sb_pquotino.
403 */
404 sbp->sb_pquotino = sbp->sb_gquotino;
405 sbp->sb_gquotino = NULLFSINO;
406 }
407}
408
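
The legacy-flag folding above is easiest to see with a concrete value. A user-space sketch of the same qflags mapping; the flag values are copied from the quota flag definitions, and the helper name is invented for illustration:

#include <stdio.h>

#define XFS_PQUOTA_ACCT 0x0008
#define XFS_OQUOTA_ENFD 0x0010
#define XFS_OQUOTA_CHKD 0x0020
#define XFS_GQUOTA_ENFD 0x0080
#define XFS_GQUOTA_CHKD 0x0100
#define XFS_PQUOTA_ENFD 0x0200
#define XFS_PQUOTA_CHKD 0x0400

/* Hypothetical user-space mirror of the qflags conversion above. */
static unsigned short qflags_from_disk(unsigned short qflags)
{
	if (qflags & XFS_OQUOTA_ENFD)
		qflags |= (qflags & XFS_PQUOTA_ACCT) ?
			   XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
	if (qflags & XFS_OQUOTA_CHKD)
		qflags |= (qflags & XFS_PQUOTA_ACCT) ?
			   XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
	return qflags & ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
}

int main(void)
{
	/* Project accounting plus legacy "other" enforcement ... */
	unsigned short disk = XFS_PQUOTA_ACCT | XFS_OQUOTA_ENFD;
	/* ... becomes project enforcement in core: prints 0x0208. */
	printf("incore qflags: 0x%04x\n", qflags_from_disk(disk));
	return 0;
}
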
409void
410xfs_sb_from_disk(
411 struct xfs_sb *to,
412 xfs_dsb_t *from)
413{
414 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
415 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
416 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
417 to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
418 to->sb_rextents = be64_to_cpu(from->sb_rextents);
419 memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
420 to->sb_logstart = be64_to_cpu(from->sb_logstart);
421 to->sb_rootino = be64_to_cpu(from->sb_rootino);
422 to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
423 to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
424 to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
425 to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
426 to->sb_agcount = be32_to_cpu(from->sb_agcount);
427 to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
428 to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
429 to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
430 to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
431 to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
432 to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
433 memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
434 to->sb_blocklog = from->sb_blocklog;
435 to->sb_sectlog = from->sb_sectlog;
436 to->sb_inodelog = from->sb_inodelog;
437 to->sb_inopblog = from->sb_inopblog;
438 to->sb_agblklog = from->sb_agblklog;
439 to->sb_rextslog = from->sb_rextslog;
440 to->sb_inprogress = from->sb_inprogress;
441 to->sb_imax_pct = from->sb_imax_pct;
442 to->sb_icount = be64_to_cpu(from->sb_icount);
443 to->sb_ifree = be64_to_cpu(from->sb_ifree);
444 to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
445 to->sb_frextents = be64_to_cpu(from->sb_frextents);
446 to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
447 to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
448 to->sb_qflags = be16_to_cpu(from->sb_qflags);
449 to->sb_flags = from->sb_flags;
450 to->sb_shared_vn = from->sb_shared_vn;
451 to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
452 to->sb_unit = be32_to_cpu(from->sb_unit);
453 to->sb_width = be32_to_cpu(from->sb_width);
454 to->sb_dirblklog = from->sb_dirblklog;
455 to->sb_logsectlog = from->sb_logsectlog;
456 to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
457 to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
458 to->sb_features2 = be32_to_cpu(from->sb_features2);
459 to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
460 to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
461 to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
462 to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
463 to->sb_features_log_incompat =
464 be32_to_cpu(from->sb_features_log_incompat);
465 to->sb_pad = 0;
466 to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
467 to->sb_lsn = be64_to_cpu(from->sb_lsn);
468}
469
470static inline void
471xfs_sb_quota_to_disk(
472 xfs_dsb_t *to,
473 xfs_sb_t *from,
474 __int64_t *fields)
475{
476 __uint16_t qflags = from->sb_qflags;
477
478 /*
479 * We need to do these manipulations only if we are working
480 * with an older version of the on-disk superblock.
481 */
482 if (xfs_sb_version_has_pquotino(from))
483 return;
484
485 if (*fields & XFS_SB_QFLAGS) {
486 /*
487 * The in-core version of sb_qflags does not have
488 * XFS_OQUOTA_* flags, whereas the on-disk version
489 * does. So, convert incore XFS_{PG}QUOTA_* flags
490 * to on-disk XFS_OQUOTA_* flags.
491 */
492 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
493 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
494
495 if (from->sb_qflags &
496 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
497 qflags |= XFS_OQUOTA_ENFD;
498 if (from->sb_qflags &
499 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
500 qflags |= XFS_OQUOTA_CHKD;
501 to->sb_qflags = cpu_to_be16(qflags);
502 *fields &= ~XFS_SB_QFLAGS;
503 }
504
505 /*
506 * GQUOTINO and PQUOTINO cannot be used together in versions of
507 * the superblock that do not have pquotino. from->sb_qflags
508 * tells us which quota is active and should be copied to
509 * disk.
510 */
511 if ((*fields & XFS_SB_GQUOTINO) &&
512 (from->sb_qflags & XFS_GQUOTA_ACCT))
513 to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
514 else if ((*fields & XFS_SB_PQUOTINO) &&
515 (from->sb_qflags & XFS_PQUOTA_ACCT))
516 to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
517
518 *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
519}
520
521/*
522 * Copy the in-core superblock to the on-disk one.
523 *
524 * The fields argument is a mask of superblock fields to copy.
525 */
526void
527xfs_sb_to_disk(
528 xfs_dsb_t *to,
529 xfs_sb_t *from,
530 __int64_t fields)
531{
532 xfs_caddr_t to_ptr = (xfs_caddr_t)to;
533 xfs_caddr_t from_ptr = (xfs_caddr_t)from;
534 xfs_sb_field_t f;
535 int first;
536 int size;
537
538 ASSERT(fields);
539 if (!fields)
540 return;
541
542 xfs_sb_quota_to_disk(to, from, &fields);
543 while (fields) {
544 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
545 first = xfs_sb_info[f].offset;
546 size = xfs_sb_info[f + 1].offset - first;
547
548 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
549
550 if (size == 1 || xfs_sb_info[f].type == 1) {
551 memcpy(to_ptr + first, from_ptr + first, size);
552 } else {
553 switch (size) {
554 case 2:
555 *(__be16 *)(to_ptr + first) =
556 cpu_to_be16(*(__u16 *)(from_ptr + first));
557 break;
558 case 4:
559 *(__be32 *)(to_ptr + first) =
560 cpu_to_be32(*(__u32 *)(from_ptr + first));
561 break;
562 case 8:
563 *(__be64 *)(to_ptr + first) =
564 cpu_to_be64(*(__u64 *)(from_ptr + first));
565 break;
566 default:
567 ASSERT(0);
568 }
569 }
570
571 fields &= ~(1LL << f);
572 }
573}
574
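
Note that xfs_sb_to_disk() never names a field: it walks the set bits of the mask, reads each field's byte offset from xfs_sb_info[], and derives the size from the next entry, which is why the table ends in a sizeof(xfs_sb_t) sentinel. A reduced, self-contained sketch of that table-driven copy; the struct, field enum, and helper here are invented for illustration, and the endian-conversion step is omitted:

#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <stdint.h>

struct sb { uint32_t magic; uint64_t dblocks; uint16_t qflags; };
enum { F_MAGIC, F_DBLOCKS, F_QFLAGS };

/* Offset table with a sizeof() sentinel: field f's size is
 * info[f + 1] - info[f], exactly as with xfs_sb_info[]. */
static const size_t info[] = {
	offsetof(struct sb, magic),
	offsetof(struct sb, dblocks),
	offsetof(struct sb, qflags),
	sizeof(struct sb),		/* sentinel entry */
};

static void copy_fields(struct sb *to, const struct sb *from, unsigned int mask)
{
	while (mask) {
		int f = __builtin_ctz(mask);	/* lowest set bit, like xfs_lowbit64() */
		memcpy((char *)to + info[f], (const char *)from + info[f],
		       info[f + 1] - info[f]);
		mask &= mask - 1;		/* clear that bit */
	}
}

int main(void)
{
	struct sb src = { 0x58465342, 1024, 0x8 };
	struct sb dst = { 0, 0, 0 };

	copy_fields(&dst, &src, (1u << F_MAGIC) | (1u << F_QFLAGS));
	printf("magic=%#x dblocks=%llu qflags=%#x\n",
	       (unsigned int)dst.magic,
	       (unsigned long long)dst.dblocks,
	       (unsigned int)dst.qflags);
	return 0;
}
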
575static int
576xfs_sb_verify(
577 struct xfs_buf *bp,
578 bool check_version)
579{
580 struct xfs_mount *mp = bp->b_target->bt_mount;
581 struct xfs_sb sb;
582
583 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
584
585 /*
586 * Only check the in-progress field for the primary superblock, as
587 * mkfs.xfs doesn't clear it from secondary superblocks.
588 */
589 return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
590 check_version);
591}
592
593/*
594 * If the superblock has the CRC feature bit set or the CRC field is non-null,
595 * check that the CRC is valid. We check the CRC field is non-null because a
596 * single bit error could clear the feature bit and unused parts of the
597 * superblock are supposed to be zero. Hence a non-null crc field indicates that
598 * we've potentially lost a feature bit and we should check it anyway.
599 */
600static void
601xfs_sb_read_verify(
602 struct xfs_buf *bp)
603{
604 struct xfs_mount *mp = bp->b_target->bt_mount;
605 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
606 int error;
607
608 /*
609 * open code the version check to avoid needing to convert the entire
610 * superblock from disk order just to check the version number
611 */
612 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
613 (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
614 XFS_SB_VERSION_5) ||
615 dsb->sb_crc != 0)) {
616
617 if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
618 offsetof(struct xfs_sb, sb_crc))) {
619 error = EFSCORRUPTED;
620 goto out_error;
621 }
622 }
623 error = xfs_sb_verify(bp, true);
624
625out_error:
626 if (error) {
627 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
628 mp, bp->b_addr);
629 xfs_buf_ioerror(bp, error);
630 }
631}
632
633/*
634 * We may be probed for a filesystem match, so we may not want to emit
635 * messages when the superblock buffer is not actually an XFS superblock.
636 * If we find an XFS superblock, then run a normal, noisy mount because we are
637 * really going to mount it and want to know about errors.
638 */
639static void
640xfs_sb_quiet_read_verify(
641 struct xfs_buf *bp)
642{
643 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
644
645
646 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
647 /* XFS filesystem, verify noisily! */
648 xfs_sb_read_verify(bp);
649 return;
650 }
651 /* quietly fail */
652 xfs_buf_ioerror(bp, EWRONGFS);
653}
654
655static void
656xfs_sb_write_verify(
657 struct xfs_buf *bp)
658{
659 struct xfs_mount *mp = bp->b_target->bt_mount;
660 struct xfs_buf_log_item *bip = bp->b_fspriv;
661 int error;
662
663 error = xfs_sb_verify(bp, false);
664 if (error) {
665 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
666 mp, bp->b_addr);
667 xfs_buf_ioerror(bp, error);
668 return;
669 }
670
671 if (!xfs_sb_version_hascrc(&mp->m_sb))
672 return;
673
674 if (bip)
675 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
676
677 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
678 offsetof(struct xfs_sb, sb_crc));
679}
680
681const struct xfs_buf_ops xfs_sb_buf_ops = {
682 .verify_read = xfs_sb_read_verify,
683 .verify_write = xfs_sb_write_verify,
684};
685
686const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
687 .verify_read = xfs_sb_quiet_read_verify,
688 .verify_write = xfs_sb_write_verify,
689};
690
691/*
692 * xfs_sb_mount_common
693 *
694 * Mount initialization code establishing various mount
695 * fields from the superblock associated with the given
696 * mount structure
697 */
698void
699xfs_sb_mount_common(
700 struct xfs_mount *mp,
701 struct xfs_sb *sbp)
702{
703 mp->m_agfrotor = mp->m_agirotor = 0;
704 spin_lock_init(&mp->m_agirotor_lock);
705 mp->m_maxagi = mp->m_sb.sb_agcount;
706 mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
707 mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
708 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
709 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
710 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
711 mp->m_blockmask = sbp->sb_blocksize - 1;
712 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
713 mp->m_blockwmask = mp->m_blockwsize - 1;
714
715 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
716 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
717 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
718 mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
719
720 mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
721 mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
722 mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
723 mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
724
725 mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
726 mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
727 mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
728 mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
729
730 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
731 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
732 sbp->sb_inopblock);
733 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
734}
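Most of these derived fields are log2 bookkeeping. A worked example with a common geometry, 4096-byte blocks (sb_blocklog = 12) and 512-byte sectors (sb_sectlog = 9), given BBSHIFT = 9 and XFS_NBBYLOG = 3 (assumed example values, nothing this patch requires):

	m_blkbit_log = 12 + 3 = 15	/* 2^15 = 32768 bits per block */
	m_blkbb_log  = 12 - 9 = 3	/* 8 basic (512-byte) blocks per fs block */
	m_sectbb_log =  9 - 9 = 0	/* 1 basic block per sector */
	m_blockmask  = 4096 - 1		/* 0xfff: byte offset within a block */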
735
736/*
737 * xfs_initialize_perag_data
738 *
739 * Read in each per-ag structure so we can count up the number of
740 * allocated inodes, free inodes and used filesystem blocks, as this
741 * information is no longer persistent in the superblock. Once we have
742 * this information, write it into the in-core superblock structure.
743 */
744int
745xfs_initialize_perag_data(
746 struct xfs_mount *mp,
747 xfs_agnumber_t agcount)
748{
749 xfs_agnumber_t index;
750 xfs_perag_t *pag;
751 xfs_sb_t *sbp = &mp->m_sb;
752 uint64_t ifree = 0;
753 uint64_t ialloc = 0;
754 uint64_t bfree = 0;
755 uint64_t bfreelst = 0;
756 uint64_t btree = 0;
757 int error;
758
759 for (index = 0; index < agcount; index++) {
760 /*
761 * Read the AGF, then the AGI. This gets us
762 * all the information we need and populates the
763 * per-ag structures for us.
764 */
765 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
766 if (error)
767 return error;
768
769 error = xfs_ialloc_pagi_init(mp, NULL, index);
770 if (error)
771 return error;
772 pag = xfs_perag_get(mp, index);
773 ifree += pag->pagi_freecount;
774 ialloc += pag->pagi_count;
775 bfree += pag->pagf_freeblks;
776 bfreelst += pag->pagf_flcount;
777 btree += pag->pagf_btreeblks;
778 xfs_perag_put(pag);
779 }
780 /*
781 * Overwrite incore superblock counters with just-read data
782 */
783 spin_lock(&mp->m_sb_lock);
784 sbp->sb_ifree = ifree;
785 sbp->sb_icount = ialloc;
786 sbp->sb_fdblocks = bfree + bfreelst + btree;
787 spin_unlock(&mp->m_sb_lock);
788
789 /* Fixup the per-cpu counters as well. */
790 xfs_icsb_reinit_counters(mp);
791
792 return 0;
793}
794
795/*
796 * xfs_mod_sb() can be used to copy arbitrary changes to the
797 * in-core superblock into the superblock buffer to be logged.
798 * It does not provide the higher level of locking that is
799 * needed to protect the in-core superblock from concurrent
800 * access.
801 */
802void
803xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
804{
805 xfs_buf_t *bp;
806 int first;
807 int last;
808 xfs_mount_t *mp;
809 xfs_sb_field_t f;
810
811 ASSERT(fields);
812 if (!fields)
813 return;
814 mp = tp->t_mountp;
815 bp = xfs_trans_getsb(tp, mp, 0);
816 first = sizeof(xfs_sb_t);
817 last = 0;
818
819 /* translate/copy */
820
821 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
822
823 /* find modified range */
824 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
825 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
826 last = xfs_sb_info[f + 1].offset - 1;
827
828 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
829 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
830 first = xfs_sb_info[f].offset;
831
832 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
833 xfs_trans_log_buf(tp, bp, first, last);
834}
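The lowbit/highbit pair collapses the field bitmask into a single contiguous byte range: the lowest set bit selects the first dirtied field and the highest set bit the last, with xfs_sb_info[] supplying each field's on-disk offset. As a hedged illustration (offsets invented for the example): if fields covers sb_icount at offset 128 and sb_ifree at offset 136, and the field after sb_ifree starts at offset 144, then first = 128 and last = 143, so xfs_trans_log_buf() logs bytes 128..143 of the superblock buffer. Any clean fields lying between the lowest and highest set bits get logged as well, since only one range is recorded.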
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 78f9e70b80c7..6835b44f850e 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -26,6 +26,7 @@
26 26
27struct xfs_buf; 27struct xfs_buf;
28struct xfs_mount; 28struct xfs_mount;
29struct xfs_trans;
29 30
30#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ 31#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
31#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ 32#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
@@ -83,11 +84,13 @@ struct xfs_mount;
83#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 84#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
84#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ 85#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
85#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ 86#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
87#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
86 88
87#define XFS_SB_VERSION2_OKREALFBITS \ 89#define XFS_SB_VERSION2_OKREALFBITS \
88 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 90 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
89 XFS_SB_VERSION2_ATTR2BIT | \ 91 XFS_SB_VERSION2_ATTR2BIT | \
90 XFS_SB_VERSION2_PROJID32BIT) 92 XFS_SB_VERSION2_PROJID32BIT | \
93 XFS_SB_VERSION2_FTYPE)
91#define XFS_SB_VERSION2_OKSASHFBITS \ 94#define XFS_SB_VERSION2_OKSASHFBITS \
92 (0) 95 (0)
93#define XFS_SB_VERSION2_OKREALBITS \ 96#define XFS_SB_VERSION2_OKREALBITS \
@@ -354,15 +357,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp)
354 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) 357 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
355 return 0; 358 return 0;
356 359
357#ifdef __KERNEL__
358 if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) 360 if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
359 return 0; 361 return 0;
360#else
361 if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
362 sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
363 return 0;
364#endif
365
366 return 1; 362 return 1;
367 } 363 }
368 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) 364 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
@@ -554,12 +550,13 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
554 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); 550 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
555} 551}
556 552
557static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) 553static inline void xfs_sb_version_addprojid32bit(xfs_sb_t *sbp)
558{ 554{
559 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; 555 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
556 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
557 sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
560} 558}
561 559
562
563/* 560/*
564 * Extended v5 superblock feature masks. These are to be used for new v5 561 * Extended v5 superblock feature masks. These are to be used for new v5
565 * superblock features only. 562 * superblock features only.
@@ -598,7 +595,10 @@ xfs_sb_has_ro_compat_feature(
598 return (sbp->sb_features_ro_compat & feature) != 0; 595 return (sbp->sb_features_ro_compat & feature) != 0;
599} 596}
600 597
601#define XFS_SB_FEAT_INCOMPAT_ALL 0 598#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
599#define XFS_SB_FEAT_INCOMPAT_ALL \
600 (XFS_SB_FEAT_INCOMPAT_FTYPE)
601
602#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL 602#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
603static inline bool 603static inline bool
604xfs_sb_has_incompat_feature( 604xfs_sb_has_incompat_feature(
@@ -618,16 +618,39 @@ xfs_sb_has_incompat_log_feature(
618 return (sbp->sb_features_log_incompat & feature) != 0; 618 return (sbp->sb_features_log_incompat & feature) != 0;
619} 619}
620 620
621static inline bool 621/*
622xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) 622 * V5 superblock specific feature checks
623 */
624static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
623{ 625{
624 return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino); 626 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
627}
628
629static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp)
630{
631 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
632}
633
634static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
635{
636 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
637 xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
638 (xfs_sb_version_hasmorebits(sbp) &&
639 (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
625} 640}
626 641
627/* 642/*
628 * end of superblock version macros 643 * end of superblock version macros
629 */ 644 */
630 645
646static inline bool
647xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
648{
649 return (ino == sbp->sb_uquotino ||
650 ino == sbp->sb_gquotino ||
651 ino == sbp->sb_pquotino);
652}
653
631#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ 654#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
632#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) 655#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
633#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) 656#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
@@ -660,4 +683,23 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
660#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) 683#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
661#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) 684#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
662 685
686/*
687 * perag get/put wrappers for ref counting
688 */
689extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
690extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
691 int tag);
692extern void xfs_perag_put(struct xfs_perag *pag);
693extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
694
695extern void xfs_sb_calc_crc(struct xfs_buf *);
696extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
697extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
698extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
699extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
700extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
701
702extern const struct xfs_buf_ops xfs_sb_buf_ops;
703extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
704
663#endif /* __XFS_SB_H__ */ 705#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1d68ffcdeaa7..15188cc99449 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,12 +17,12 @@
17 */ 17 */
18 18
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_inum.h" 22#include "xfs_inum.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
23#include "xfs_sb.h" 24#include "xfs_sb.h"
24#include "xfs_ag.h" 25#include "xfs_ag.h"
25#include "xfs_dir2.h"
26#include "xfs_alloc.h" 26#include "xfs_alloc.h"
27#include "xfs_quota.h" 27#include "xfs_quota.h"
28#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -40,12 +40,12 @@
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_attr.h" 41#include "xfs_attr.h"
42#include "xfs_buf_item.h" 42#include "xfs_buf_item.h"
43#include "xfs_utils.h"
44#include "xfs_vnodeops.h"
45#include "xfs_log_priv.h" 43#include "xfs_log_priv.h"
46#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
47#include "xfs_filestream.h" 45#include "xfs_filestream.h"
48#include "xfs_da_btree.h" 46#include "xfs_da_btree.h"
47#include "xfs_dir2_format.h"
48#include "xfs_dir2.h"
49#include "xfs_extfree_item.h" 49#include "xfs_extfree_item.h"
50#include "xfs_mru_cache.h" 50#include "xfs_mru_cache.h"
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
@@ -421,12 +421,6 @@ xfs_parseargs(
421 } 421 }
422#endif 422#endif
423 423
424 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
425 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
426 xfs_warn(mp, "cannot mount with both project and group quota");
427 return EINVAL;
428 }
429
430 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 424 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
431 xfs_warn(mp, "sunit and swidth must be specified together"); 425 xfs_warn(mp, "sunit and swidth must be specified together");
432 return EINVAL; 426 return EINVAL;
@@ -556,14 +550,13 @@ xfs_showargs(
556 else if (mp->m_qflags & XFS_UQUOTA_ACCT) 550 else if (mp->m_qflags & XFS_UQUOTA_ACCT)
557 seq_puts(m, "," MNTOPT_UQUOTANOENF); 551 seq_puts(m, "," MNTOPT_UQUOTANOENF);
558 552
559 /* Either project or group quotas can be active, not both */
560
561 if (mp->m_qflags & XFS_PQUOTA_ACCT) { 553 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
562 if (mp->m_qflags & XFS_PQUOTA_ENFD) 554 if (mp->m_qflags & XFS_PQUOTA_ENFD)
563 seq_puts(m, "," MNTOPT_PRJQUOTA); 555 seq_puts(m, "," MNTOPT_PRJQUOTA);
564 else 556 else
565 seq_puts(m, "," MNTOPT_PQUOTANOENF); 557 seq_puts(m, "," MNTOPT_PQUOTANOENF);
566 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { 558 }
559 if (mp->m_qflags & XFS_GQUOTA_ACCT) {
567 if (mp->m_qflags & XFS_GQUOTA_ENFD) 560 if (mp->m_qflags & XFS_GQUOTA_ENFD)
568 seq_puts(m, "," MNTOPT_GRPQUOTA); 561 seq_puts(m, "," MNTOPT_GRPQUOTA);
569 else 562 else
@@ -870,17 +863,17 @@ xfs_init_mount_workqueues(
870 goto out_destroy_unwritten; 863 goto out_destroy_unwritten;
871 864
872 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", 865 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
873 WQ_NON_REENTRANT, 0, mp->m_fsname); 866 0, 0, mp->m_fsname);
874 if (!mp->m_reclaim_workqueue) 867 if (!mp->m_reclaim_workqueue)
875 goto out_destroy_cil; 868 goto out_destroy_cil;
876 869
877 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", 870 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
878 WQ_NON_REENTRANT, 0, mp->m_fsname); 871 0, 0, mp->m_fsname);
879 if (!mp->m_log_workqueue) 872 if (!mp->m_log_workqueue)
880 goto out_destroy_reclaim; 873 goto out_destroy_reclaim;
881 874
882 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", 875 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
883 WQ_NON_REENTRANT, 0, mp->m_fsname); 876 0, 0, mp->m_fsname);
884 if (!mp->m_eofblocks_workqueue) 877 if (!mp->m_eofblocks_workqueue)
885 goto out_destroy_log; 878 goto out_destroy_log;
886 879
@@ -1396,6 +1389,14 @@ xfs_finish_flags(
1396 return XFS_ERROR(EROFS); 1389 return XFS_ERROR(EROFS);
1397 } 1390 }
1398 1391
1392 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
1393 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) &&
1394 !xfs_sb_version_has_pquotino(&mp->m_sb)) {
1395 xfs_warn(mp,
1396 "Super block does not support project and group quota together");
1397 return XFS_ERROR(EINVAL);
1398 }
1399
1399 return 0; 1400 return 0;
1400} 1401}
1401 1402
@@ -1534,19 +1535,21 @@ xfs_fs_mount(
1534 return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); 1535 return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
1535} 1536}
1536 1537
1537static int 1538static long
1538xfs_fs_nr_cached_objects( 1539xfs_fs_nr_cached_objects(
1539 struct super_block *sb) 1540 struct super_block *sb,
1541 int nid)
1540{ 1542{
1541 return xfs_reclaim_inodes_count(XFS_M(sb)); 1543 return xfs_reclaim_inodes_count(XFS_M(sb));
1542} 1544}
1543 1545
1544static void 1546static long
1545xfs_fs_free_cached_objects( 1547xfs_fs_free_cached_objects(
1546 struct super_block *sb, 1548 struct super_block *sb,
1547 int nr_to_scan) 1549 long nr_to_scan,
1550 int nid)
1548{ 1551{
1549 xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); 1552 return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
1550} 1553}
1551 1554
1552static const struct super_operations xfs_super_operations = { 1555static const struct super_operations xfs_super_operations = {
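The nid argument and the long return/count types come from the NUMA-aware shrinker rework that landed in the same cycle. A minimal sketch of how these two hooks are wired up, consistent with the xfs_super_operations table this hunk ends on (other methods elided):

	static const struct super_operations xfs_super_operations = {
		/* ... other methods ... */
		.nr_cached_objects	= xfs_fs_nr_cached_objects,
		.free_cached_objects	= xfs_fs_free_cached_objects,
	};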
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index f4895b662fcb..f622a97a7e33 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -18,201 +18,31 @@
18 */ 18 */
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_types.h" 21#include "xfs_format.h"
22#include "xfs_bit.h" 22#include "xfs_bit.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
29#include "xfs_dir2_format.h"
30#include "xfs_dir2.h"
30#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
31#include "xfs_ialloc_btree.h" 32#include "xfs_ialloc_btree.h"
32#include "xfs_dinode.h" 33#include "xfs_dinode.h"
33#include "xfs_inode.h" 34#include "xfs_inode.h"
34#include "xfs_inode_item.h"
35#include "xfs_itable.h"
36#include "xfs_ialloc.h" 35#include "xfs_ialloc.h"
37#include "xfs_alloc.h" 36#include "xfs_alloc.h"
38#include "xfs_bmap.h" 37#include "xfs_bmap.h"
38#include "xfs_bmap_util.h"
39#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_quota.h" 40#include "xfs_quota.h"
41#include "xfs_utils.h"
42#include "xfs_trans_space.h" 41#include "xfs_trans_space.h"
43#include "xfs_log_priv.h"
44#include "xfs_trace.h" 42#include "xfs_trace.h"
45#include "xfs_symlink.h" 43#include "xfs_symlink.h"
46#include "xfs_cksum.h"
47#include "xfs_buf_item.h" 44#include "xfs_buf_item.h"
48 45
49
50/*
51 * Each contiguous block has a header, so it is not just a simple pathlen
52 * to FSB conversion.
53 */
54int
55xfs_symlink_blocks(
56 struct xfs_mount *mp,
57 int pathlen)
58{
59 int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
60
61 return (pathlen + buflen - 1) / buflen;
62}
63
64static int
65xfs_symlink_hdr_set(
66 struct xfs_mount *mp,
67 xfs_ino_t ino,
68 uint32_t offset,
69 uint32_t size,
70 struct xfs_buf *bp)
71{
72 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
73
74 if (!xfs_sb_version_hascrc(&mp->m_sb))
75 return 0;
76
77 dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
78 dsl->sl_offset = cpu_to_be32(offset);
79 dsl->sl_bytes = cpu_to_be32(size);
80 uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
81 dsl->sl_owner = cpu_to_be64(ino);
82 dsl->sl_blkno = cpu_to_be64(bp->b_bn);
83 bp->b_ops = &xfs_symlink_buf_ops;
84
85 return sizeof(struct xfs_dsymlink_hdr);
86}
87
88/*
89 * Checking of the symlink header is split into two parts. The verifier does
90 * CRC, location and bounds checking; the unpacking function checks the path
91 * parameters and owner.
92 */
93bool
94xfs_symlink_hdr_ok(
95 struct xfs_mount *mp,
96 xfs_ino_t ino,
97 uint32_t offset,
98 uint32_t size,
99 struct xfs_buf *bp)
100{
101 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
102
103 if (offset != be32_to_cpu(dsl->sl_offset))
104 return false;
105 if (size != be32_to_cpu(dsl->sl_bytes))
106 return false;
107 if (ino != be64_to_cpu(dsl->sl_owner))
108 return false;
109
110 /* ok */
111 return true;
112}
113
114static bool
115xfs_symlink_verify(
116 struct xfs_buf *bp)
117{
118 struct xfs_mount *mp = bp->b_target->bt_mount;
119 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
120
121 if (!xfs_sb_version_hascrc(&mp->m_sb))
122 return false;
123 if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
124 return false;
125 if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
126 return false;
127 if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
128 return false;
129 if (be32_to_cpu(dsl->sl_offset) +
130 be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
131 return false;
132 if (dsl->sl_owner == 0)
133 return false;
134
135 return true;
136}
137
138static void
139xfs_symlink_read_verify(
140 struct xfs_buf *bp)
141{
142 struct xfs_mount *mp = bp->b_target->bt_mount;
143
144 /* no verification of non-crc buffers */
145 if (!xfs_sb_version_hascrc(&mp->m_sb))
146 return;
147
148 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
149 offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
150 !xfs_symlink_verify(bp)) {
151 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
152 xfs_buf_ioerror(bp, EFSCORRUPTED);
153 }
154}
155
156static void
157xfs_symlink_write_verify(
158 struct xfs_buf *bp)
159{
160 struct xfs_mount *mp = bp->b_target->bt_mount;
161 struct xfs_buf_log_item *bip = bp->b_fspriv;
162
163 /* no verification of non-crc buffers */
164 if (!xfs_sb_version_hascrc(&mp->m_sb))
165 return;
166
167 if (!xfs_symlink_verify(bp)) {
168 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
169 xfs_buf_ioerror(bp, EFSCORRUPTED);
170 return;
171 }
172
173 if (bip) {
174 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
175 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
176 }
177 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
178 offsetof(struct xfs_dsymlink_hdr, sl_crc));
179}
180
181const struct xfs_buf_ops xfs_symlink_buf_ops = {
182 .verify_read = xfs_symlink_read_verify,
183 .verify_write = xfs_symlink_write_verify,
184};
185
186void
187xfs_symlink_local_to_remote(
188 struct xfs_trans *tp,
189 struct xfs_buf *bp,
190 struct xfs_inode *ip,
191 struct xfs_ifork *ifp)
192{
193 struct xfs_mount *mp = ip->i_mount;
194 char *buf;
195
196 if (!xfs_sb_version_hascrc(&mp->m_sb)) {
197 bp->b_ops = NULL;
198 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
199 return;
200 }
201
202 /*
203 * As this symlink fits in an inode literal area, it must also fit in
204 * the smallest buffer the filesystem supports.
205 */
206 ASSERT(BBTOB(bp->b_length) >=
207 ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
208
209 bp->b_ops = &xfs_symlink_buf_ops;
210
211 buf = bp->b_addr;
212 buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
213 memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
214}
215
216/* ----- Kernel only functions below ----- */ 46/* ----- Kernel only functions below ----- */
217STATIC int 47STATIC int
218xfs_readlink_bmap( 48xfs_readlink_bmap(
@@ -386,8 +216,11 @@ xfs_symlink(
386 /* 216 /*
387 * Make sure that we have allocated dquot(s) on disk. 217 * Make sure that we have allocated dquot(s) on disk.
388 */ 218 */
389 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 219 error = xfs_qm_vop_dqalloc(dp,
390 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); 220 xfs_kuid_to_uid(current_fsuid()),
221 xfs_kgid_to_gid(current_fsgid()), prid,
222 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
223 &udqp, &gdqp, &pdqp);
391 if (error) 224 if (error)
392 goto std_return; 225 goto std_return;
393 226
@@ -402,12 +235,10 @@ xfs_symlink(
402 else 235 else
403 fs_blocks = xfs_symlink_blocks(mp, pathlen); 236 fs_blocks = xfs_symlink_blocks(mp, pathlen);
404 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); 237 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
405 error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, 238 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
406 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
407 if (error == ENOSPC && fs_blocks == 0) { 239 if (error == ENOSPC && fs_blocks == 0) {
408 resblks = 0; 240 resblks = 0;
409 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, 241 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
410 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
411 } 242 }
412 if (error) { 243 if (error) {
413 cancel_flags = 0; 244 cancel_flags = 0;
@@ -533,6 +364,7 @@ xfs_symlink(
533 pathlen -= byte_cnt; 364 pathlen -= byte_cnt;
534 offset += byte_cnt; 365 offset += byte_cnt;
535 366
367 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
536 xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - 368 xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
537 (char *)bp->b_addr); 369 (char *)bp->b_addr);
538 } 370 }
@@ -710,8 +542,8 @@ xfs_inactive_symlink_rmt(
710 * Put an itruncate log reservation in the new transaction 542 * Put an itruncate log reservation in the new transaction
711 * for our caller. 543 * for our caller.
712 */ 544 */
713 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 545 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
714 XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { 546 if (error) {
715 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 547 ASSERT(XFS_FORCED_SHUTDOWN(mp));
716 goto error0; 548 goto error0;
717 } 549 }
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index 374394880c01..99338ba666ac 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -17,50 +17,11 @@
17#ifndef __XFS_SYMLINK_H 17#ifndef __XFS_SYMLINK_H
18#define __XFS_SYMLINK_H 1 18#define __XFS_SYMLINK_H 1
19 19
20struct xfs_mount; 20/* Kernel only symlink definitions */
21struct xfs_trans;
22struct xfs_inode;
23struct xfs_buf;
24struct xfs_ifork;
25struct xfs_name;
26
27#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
28
29struct xfs_dsymlink_hdr {
30 __be32 sl_magic;
31 __be32 sl_offset;
32 __be32 sl_bytes;
33 __be32 sl_crc;
34 uuid_t sl_uuid;
35 __be64 sl_owner;
36 __be64 sl_blkno;
37 __be64 sl_lsn;
38};
39
40/*
41 * The maximum pathlen is 1024 bytes. Since the minimum file system
42 * blocksize is 512 bytes, we can get a max of 3 extents back from
43 * bmapi when crc headers are taken into account.
44 */
45#define XFS_SYMLINK_MAPS 3
46
47#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
48 ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
49 sizeof(struct xfs_dsymlink_hdr) : 0))
50
51int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
52
53void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
54 struct xfs_inode *ip, struct xfs_ifork *ifp);
55
56extern const struct xfs_buf_ops xfs_symlink_buf_ops;
57
58#ifdef __KERNEL__
59 21
60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 22int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
61 const char *target_path, umode_t mode, struct xfs_inode **ipp); 23 const char *target_path, umode_t mode, struct xfs_inode **ipp);
62int xfs_readlink(struct xfs_inode *ip, char *link); 24int xfs_readlink(struct xfs_inode *ip, char *link);
63int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp); 25int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
64 26
65#endif /* __KERNEL__ */
66#endif /* __XFS_SYMLINK_H */ 27#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
new file mode 100644
index 000000000000..01c85e3f6470
--- /dev/null
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -0,0 +1,200 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * Copyright (c) 2012-2013 Red Hat, Inc.
4 * All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_log.h"
23#include "xfs_trans.h"
24#include "xfs_ag.h"
25#include "xfs_sb.h"
26#include "xfs_mount.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_inode.h"
29#include "xfs_error.h"
30#include "xfs_trace.h"
31#include "xfs_symlink.h"
32#include "xfs_cksum.h"
33#include "xfs_buf_item.h"
34
35
36/*
37 * Each contiguous block has a header, so it is not just a simple pathlen
38 * to FSB conversion.
39 */
40int
41xfs_symlink_blocks(
42 struct xfs_mount *mp,
43 int pathlen)
44{
45 int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
46
47 return (pathlen + buflen - 1) / buflen;
48}
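A worked example of why the per-block header matters, assuming a v5 (CRC) filesystem where sizeof(struct xfs_dsymlink_hdr) works out to 56 bytes: with 512-byte blocks, buflen = 512 - 56 = 456, so a maximal 1024-byte path needs (1024 + 456 - 1) / 456 = 3 blocks. That is exactly where the XFS_SYMLINK_MAPS limit of 3, removed from xfs_symlink.h above, comes from; without CRCs the header is elided and two 512-byte blocks suffice.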
49
50int
51xfs_symlink_hdr_set(
52 struct xfs_mount *mp,
53 xfs_ino_t ino,
54 uint32_t offset,
55 uint32_t size,
56 struct xfs_buf *bp)
57{
58 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
59
60 if (!xfs_sb_version_hascrc(&mp->m_sb))
61 return 0;
62
63 dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
64 dsl->sl_offset = cpu_to_be32(offset);
65 dsl->sl_bytes = cpu_to_be32(size);
66 uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
67 dsl->sl_owner = cpu_to_be64(ino);
68 dsl->sl_blkno = cpu_to_be64(bp->b_bn);
69 bp->b_ops = &xfs_symlink_buf_ops;
70
71 return sizeof(struct xfs_dsymlink_hdr);
72}
73
74/*
75 * Checking of the symlink header is split into two parts. The verifier does
76 * CRC, location and bounds checking; the unpacking function checks the path
77 * parameters and owner.
78 */
79bool
80xfs_symlink_hdr_ok(
81 struct xfs_mount *mp,
82 xfs_ino_t ino,
83 uint32_t offset,
84 uint32_t size,
85 struct xfs_buf *bp)
86{
87 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
88
89 if (offset != be32_to_cpu(dsl->sl_offset))
90 return false;
91 if (size != be32_to_cpu(dsl->sl_bytes))
92 return false;
93 if (ino != be64_to_cpu(dsl->sl_owner))
94 return false;
95
96 /* ok */
97 return true;
98}
99
100static bool
101xfs_symlink_verify(
102 struct xfs_buf *bp)
103{
104 struct xfs_mount *mp = bp->b_target->bt_mount;
105 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
106
107 if (!xfs_sb_version_hascrc(&mp->m_sb))
108 return false;
109 if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
110 return false;
111 if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
112 return false;
113 if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
114 return false;
115 if (be32_to_cpu(dsl->sl_offset) +
116 be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
117 return false;
118 if (dsl->sl_owner == 0)
119 return false;
120
121 return true;
122}
123
124static void
125xfs_symlink_read_verify(
126 struct xfs_buf *bp)
127{
128 struct xfs_mount *mp = bp->b_target->bt_mount;
129
130 /* no verification of non-crc buffers */
131 if (!xfs_sb_version_hascrc(&mp->m_sb))
132 return;
133
134 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
135 offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
136 !xfs_symlink_verify(bp)) {
137 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
138 xfs_buf_ioerror(bp, EFSCORRUPTED);
139 }
140}
141
142static void
143xfs_symlink_write_verify(
144 struct xfs_buf *bp)
145{
146 struct xfs_mount *mp = bp->b_target->bt_mount;
147 struct xfs_buf_log_item *bip = bp->b_fspriv;
148
149 /* no verification of non-crc buffers */
150 if (!xfs_sb_version_hascrc(&mp->m_sb))
151 return;
152
153 if (!xfs_symlink_verify(bp)) {
154 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
155 xfs_buf_ioerror(bp, EFSCORRUPTED);
156 return;
157 }
158
159 if (bip) {
160 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
161 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
162 }
163 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
164 offsetof(struct xfs_dsymlink_hdr, sl_crc));
165}
166
167const struct xfs_buf_ops xfs_symlink_buf_ops = {
168 .verify_read = xfs_symlink_read_verify,
169 .verify_write = xfs_symlink_write_verify,
170};
171
172void
173xfs_symlink_local_to_remote(
174 struct xfs_trans *tp,
175 struct xfs_buf *bp,
176 struct xfs_inode *ip,
177 struct xfs_ifork *ifp)
178{
179 struct xfs_mount *mp = ip->i_mount;
180 char *buf;
181
182 if (!xfs_sb_version_hascrc(&mp->m_sb)) {
183 bp->b_ops = NULL;
184 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
185 return;
186 }
187
188 /*
189 * As this symlink fits in an inode literal area, it must also fit in
190 * the smallest buffer the filesystem supports.
191 */
192 ASSERT(BBTOB(bp->b_length) >=
193 ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
194
195 bp->b_ops = &xfs_symlink_buf_ops;
196
197 buf = bp->b_addr;
198 buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
199 memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
200}
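The resulting remote-format buffer on a CRC filesystem is simply the header followed by the payload, i.e. [xfs_dsymlink_hdr][path bytes], with sl_offset/sl_bytes recording where this fragment sits in the overall target path and the CRC computed over the whole buffer by the write verifier above. On a pre-CRC filesystem the payload is copied in at offset zero with no header and no verifier.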
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index b6e3897c1d9f..5d7b3e40705f 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_format.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
23#include "xfs_sb.h" 24#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 35a229981354..5411e01ab452 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -18,7 +18,7 @@
18 */ 18 */
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_types.h" 21#include "xfs_format.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
@@ -49,629 +49,6 @@ kmem_zone_t *xfs_trans_zone;
49kmem_zone_t *xfs_log_item_desc_zone; 49kmem_zone_t *xfs_log_item_desc_zone;
50 50
51/* 51/*
52 * A buffer has a format structure overhead in the log in addition
53 * to the data, so we need to take this into account when reserving
54 * space in a transaction for a buffer. Round the space required up
55 * to a multiple of 128 bytes so that we don't change the historical
56 * reservation that has been used for this overhead.
57 */
58STATIC uint
59xfs_buf_log_overhead(void)
60{
61 return round_up(sizeof(struct xlog_op_header) +
62 sizeof(struct xfs_buf_log_format), 128);
63}
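For example, if the two structures totalled 140 bytes, round_up(140, 128) would give 256; the rounding keeps the per-buffer overhead at the historical 128-byte granularity even if the structure sizes drift slightly.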
64
65/*
66 * Calculate the transaction log reservation per item in bytes.
67 *
68 * The nbufs argument is used to indicate the number of items that
69 * will be changed in a transaction. size is used to tell how many
70 * bytes should be reserved per item.
71 */
72STATIC uint
73xfs_calc_buf_res(
74 uint nbufs,
75 uint size)
76{
77 return nbufs * (size + xfs_buf_log_overhead());
78}
79
80/*
81 * Various log reservation values.
82 *
83 * These are based on the size of the file system block because that is what
84 * most transactions manipulate. Each adds in an additional 128 bytes per
85 * item logged to try to account for the overhead of the transaction mechanism.
86 *
87 * Note: Most of the reservations underestimate the number of allocation
88 * groups into which they could free extents in the xfs_bmap_finish() call.
89 * This is because the number in the worst case is quite high and quite
90 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
91 * extents in only a single AG at a time. This will require changes to the
92 * EFI code as well, however, so that the EFI for the extents not freed is
93 * logged again in each transaction. See SGI PV #261917.
94 *
95 * Reservation functions here avoid a huge stack in xfs_trans_init due to
96 * register overflow from temporaries in the calculations.
97 */
98
99
100/*
101 * In a write transaction we can allocate a maximum of 2
102 * extents. This gives:
103 * the inode getting the new extents: inode size
104 * the inode's bmap btree: max depth * block size
105 * the agfs of the ags from which the extents are allocated: 2 * sector
106 * the superblock free block counter: sector size
107 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
108 * And the bmap_finish transaction can free bmap blocks in a join:
109 * the agfs of the ags containing the blocks: 2 * sector size
110 * the agfls of the ags containing the blocks: 2 * sector size
111 * the super block free block counter: sector size
112 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
113 */
114STATIC uint
115xfs_calc_write_reservation(
116 struct xfs_mount *mp)
117{
118 return XFS_DQUOT_LOGRES(mp) +
119 MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
120 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
121 XFS_FSB_TO_B(mp, 1)) +
122 xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
123 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
124 XFS_FSB_TO_B(mp, 1))),
125 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
126 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
127 XFS_FSB_TO_B(mp, 1))));
128}
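Written out as a formula, with B = block size, S = sector size, I = inode size, D = XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), ALLOCFREE(n) = XFS_ALLOCFREE_LOG_COUNT(mp, n) and buf(n, sz) = xfs_calc_buf_res(n, sz):

	tr_write = XFS_DQUOT_LOGRES
		 + max(buf(1, I) + buf(D, B) + buf(3, S) + buf(ALLOCFREE(2), B),
		       buf(5, S) + buf(ALLOCFREE(2), B))

i.e. the larger of the allocation leg and the bmap_finish free leg, matching the comment above. Most of the xfs_calc_*_reservation() helpers below follow the same max-of-two-legs pattern with different multipliers.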
129
130/*
131 * In truncating a file we free up to two extents at once. We can modify:
132 * the inode being truncated: inode size
133 * the inode's bmap btree: (max depth + 1) * block size
134 * And the bmap_finish transaction can free the blocks and bmap blocks:
135 * the agf for each of the ags: 4 * sector size
136 * the agfl for each of the ags: 4 * sector size
137 * the super block to reflect the freed blocks: sector size
138 * worst case split in allocation btrees per extent assuming 4 extents:
139 * 4 exts * 2 trees * (2 * max depth - 1) * block size
140 * the inode btree: max depth * blocksize
141 * the allocation btrees: 2 trees * (max depth - 1) * block size
142 */
143STATIC uint
144xfs_calc_itruncate_reservation(
145 struct xfs_mount *mp)
146{
147 return XFS_DQUOT_LOGRES(mp) +
148 MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
149 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
150 XFS_FSB_TO_B(mp, 1))),
151 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
152 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
153 XFS_FSB_TO_B(mp, 1)) +
154 xfs_calc_buf_res(5, 0) +
155 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
156 XFS_FSB_TO_B(mp, 1)) +
157 xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
158 mp->m_in_maxlevels, 0)));
159}
160
161/*
162 * In renaming files we can modify:
163 * the four inodes involved: 4 * inode size
164 * the two directory btrees: 2 * (max depth + v2) * dir block size
165 * the two directory bmap btrees: 2 * max depth * block size
166 * And the bmap_finish transaction can free dir and bmap blocks (two sets
167 * of bmap blocks) giving:
168 * the agf for the ags in which the blocks live: 3 * sector size
169 * the agfl for the ags in which the blocks live: 3 * sector size
170 * the superblock for the free block count: sector size
171 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
172 */
173STATIC uint
174xfs_calc_rename_reservation(
175 struct xfs_mount *mp)
176{
177 return XFS_DQUOT_LOGRES(mp) +
178 MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) +
179 xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
180 XFS_FSB_TO_B(mp, 1))),
181 (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
182 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
183 XFS_FSB_TO_B(mp, 1))));
184}
185
186/*
187 * For creating a link to an inode:
188 * the parent directory inode: inode size
189 * the linked inode: inode size
190 * the directory btree could split: (max depth + v2) * dir block size
191 * the directory bmap btree could join or split: (max depth + v2) * blocksize
192 * And the bmap_finish transaction can free some bmap blocks giving:
193 * the agf for the ag in which the blocks live: sector size
194 * the agfl for the ag in which the blocks live: sector size
195 * the superblock for the free block count: sector size
196 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
197 */
198STATIC uint
199xfs_calc_link_reservation(
200 struct xfs_mount *mp)
201{
202 return XFS_DQUOT_LOGRES(mp) +
203 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
204 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
205 XFS_FSB_TO_B(mp, 1))),
206 (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
207 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
208 XFS_FSB_TO_B(mp, 1))));
209}
210
211/*
212 * For removing a directory entry we can modify:
213 * the parent directory inode: inode size
214 * the removed inode: inode size
215 * the directory btree could join: (max depth + v2) * dir block size
216 * the directory bmap btree could join or split: (max depth + v2) * blocksize
217 * And the bmap_finish transaction can free the dir and bmap blocks giving:
218 * the agf for the ag in which the blocks live: 2 * sector size
219 * the agfl for the ag in which the blocks live: 2 * sector size
220 * the superblock for the free block count: sector size
221 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
222 */
223STATIC uint
224xfs_calc_remove_reservation(
225 struct xfs_mount *mp)
226{
227 return XFS_DQUOT_LOGRES(mp) +
228 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
229 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
230 XFS_FSB_TO_B(mp, 1))),
231 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
232 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
233 XFS_FSB_TO_B(mp, 1))));
234}
235
236/*
237 * For create, break it into the two cases that the transaction
238 * covers. We start with the modify case - allocation done by modification
239 * of the state of existing inodes - and the allocation case.
240 */
241
242/*
243 * For create we can modify:
244 * the parent directory inode: inode size
245 * the new inode: inode size
246 * the inode btree entry: block size
247 * the superblock for the nlink flag: sector size
248 * the directory btree: (max depth + v2) * dir block size
249 * the directory inode's bmap btree: (max depth + v2) * block size
250 */
251STATIC uint
252xfs_calc_create_resv_modify(
253 struct xfs_mount *mp)
254{
255 return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
256 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
257 (uint)XFS_FSB_TO_B(mp, 1) +
258 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
259}
260
261/*
262 * For create we can allocate some inodes giving:
263 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
264 * the superblock for the nlink flag: sector size
265 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
266 * the inode btree: max depth * blocksize
267 * the allocation btrees: 2 trees * (max depth - 1) * block size
268 */
269STATIC uint
270xfs_calc_create_resv_alloc(
271 struct xfs_mount *mp)
272{
273 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
274 mp->m_sb.sb_sectsize +
275 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
276 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
277 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
278 XFS_FSB_TO_B(mp, 1));
279}
280
281STATIC uint
282__xfs_calc_create_reservation(
283 struct xfs_mount *mp)
284{
285 return XFS_DQUOT_LOGRES(mp) +
286 MAX(xfs_calc_create_resv_alloc(mp),
287 xfs_calc_create_resv_modify(mp));
288}
289
290/*
291 * For icreate we can allocate some inodes giving:
292 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
293 * the superblock for the nlink flag: sector size
294 * the inode btree: max depth * blocksize
295 * the allocation btrees: 2 trees * (max depth - 1) * block size
296 */
297STATIC uint
298xfs_calc_icreate_resv_alloc(
299 struct xfs_mount *mp)
300{
301 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
302 mp->m_sb.sb_sectsize +
303 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
304 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
305 XFS_FSB_TO_B(mp, 1));
306}
307
308STATIC uint
309xfs_calc_icreate_reservation(xfs_mount_t *mp)
310{
311 return XFS_DQUOT_LOGRES(mp) +
312 MAX(xfs_calc_icreate_resv_alloc(mp),
313 xfs_calc_create_resv_modify(mp));
314}
315
316STATIC uint
317xfs_calc_create_reservation(
318 struct xfs_mount *mp)
319{
320 if (xfs_sb_version_hascrc(&mp->m_sb))
321 return xfs_calc_icreate_reservation(mp);
322 return __xfs_calc_create_reservation(mp);
323
324}
325
326/*
327 * Making a new directory is the same as creating a new file.
328 */
329STATIC uint
330xfs_calc_mkdir_reservation(
331 struct xfs_mount *mp)
332{
333 return xfs_calc_create_reservation(mp);
334}
335
336
337/*
338 * Making a new symlink is the same as creating a new file, but
339 * with the added blocks for remote symlink data which can be up to 1kB in
340 * length (MAXPATHLEN).
341 */
342STATIC uint
343xfs_calc_symlink_reservation(
344 struct xfs_mount *mp)
345{
346 return xfs_calc_create_reservation(mp) +
347 xfs_calc_buf_res(1, MAXPATHLEN);
348}
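Since MAXPATHLEN is 1024 bytes in XFS, this adds exactly one buffer's worth of reservation for the worst-case remote target: 1024 bytes of payload plus the fixed xfs_buf_log_overhead() rounding discussed above.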
349
350/*
351 * In freeing an inode we can modify:
352 * the inode being freed: inode size
353 * the super block free inode counter: sector size
354 * the agi hash list and counters: sector size
355 * the inode btree entry: block size
356 * the on disk inode before ours in the agi hash list: inode cluster size
357 * the inode btree: max depth * blocksize
358 * the allocation btrees: 2 trees * (max depth - 1) * block size
359 */
360STATIC uint
361xfs_calc_ifree_reservation(
362 struct xfs_mount *mp)
363{
364 return XFS_DQUOT_LOGRES(mp) +
365 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
366 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
367 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
368 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
369 XFS_INODE_CLUSTER_SIZE(mp)) +
370 xfs_calc_buf_res(1, 0) +
371 xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
372 mp->m_in_maxlevels, 0) +
373 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
374 XFS_FSB_TO_B(mp, 1));
375}
376
377/*
378 * When only changing the inode we log the inode and possibly the superblock.
379 * We also add a bit of slop for the transaction overhead.
380 */
381STATIC uint
382xfs_calc_ichange_reservation(
383 struct xfs_mount *mp)
384{
385 return XFS_DQUOT_LOGRES(mp) +
386 mp->m_sb.sb_inodesize +
387 mp->m_sb.sb_sectsize +
388 512;
389
390}
391
392/*
393 * Growing the data section of the filesystem.
394 * superblock
395 * agi and agf
396 * allocation btrees
397 */
398STATIC uint
399xfs_calc_growdata_reservation(
400 struct xfs_mount *mp)
401{
402 return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
403 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
404 XFS_FSB_TO_B(mp, 1));
405}
406
407/*
408 * Growing the rt section of the filesystem.
409 * In the first set of transactions (ALLOC) we allocate space to the
410 * bitmap or summary files.
411 * superblock: sector size
412 * agf of the ag from which the extent is allocated: sector size
413 * bmap btree for bitmap/summary inode: max depth * blocksize
414 * bitmap/summary inode: inode size
415 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
416 */
417STATIC uint
418xfs_calc_growrtalloc_reservation(
419 struct xfs_mount *mp)
420{
421 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
422 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
423 XFS_FSB_TO_B(mp, 1)) +
424 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
425 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
426 XFS_FSB_TO_B(mp, 1));
427}
428
429/*
430 * Growing the rt section of the filesystem.
431 * In the second set of transactions (ZERO) we zero the new metadata blocks.
432 * one bitmap/summary block: blocksize
433 */
434STATIC uint
435xfs_calc_growrtzero_reservation(
436 struct xfs_mount *mp)
437{
438 return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
439}
440
441/*
442 * Growing the rt section of the filesystem.
443 * In the third set of transactions (FREE) we update metadata without
444 * allocating any new blocks.
445 * superblock: sector size
446 * bitmap inode: inode size
447 * summary inode: inode size
448 * one bitmap block: blocksize
449 * summary blocks: new summary size
450 */
451STATIC uint
452xfs_calc_growrtfree_reservation(
453 struct xfs_mount *mp)
454{
455 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
456 xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
457 xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
458 xfs_calc_buf_res(1, mp->m_rsumsize);
459}
460
461/*
462 * Logging the inode modification timestamp on a synchronous write.
463 * inode
464 */
465STATIC uint
466xfs_calc_swrite_reservation(
467 struct xfs_mount *mp)
468{
469 return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
470}
471
472/*
473 * Logging the inode mode bits when writing a setuid/setgid file
474 * inode
475 */
476STATIC uint
477xfs_calc_writeid_reservation(xfs_mount_t *mp)
478{
479 return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
480}
481
482/*
483 * Converting the inode from non-attributed to attributed.
484 * the inode being converted: inode size
485 * agf block and superblock (for block allocation)
486 * the new block (directory sized)
487 * bmap blocks for the new directory block
488 * allocation btrees
489 */
490STATIC uint
491xfs_calc_addafork_reservation(
492 struct xfs_mount *mp)
493{
494 return XFS_DQUOT_LOGRES(mp) +
495 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
496 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
497 xfs_calc_buf_res(1, mp->m_dirblksize) +
498 xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
499 XFS_FSB_TO_B(mp, 1)) +
500 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
501 XFS_FSB_TO_B(mp, 1));
502}
503
504/*
505 * Removing the attribute fork of a file
506 * the inode being truncated: inode size
507 * the inode's bmap btree: max depth * block size
508 * And the bmap_finish transaction can free the blocks and bmap blocks:
509 * the agf for each of the ags: 4 * sector size
510 * the agfl for each of the ags: 4 * sector size
511 * the super block to reflect the freed blocks: sector size
512 * worst case split in allocation btrees per extent assuming 4 extents:
513 * 4 exts * 2 trees * (2 * max depth - 1) * block size
514 */
515STATIC uint
516xfs_calc_attrinval_reservation(
517 struct xfs_mount *mp)
518{
519 return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
520 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
521 XFS_FSB_TO_B(mp, 1))),
522 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
523 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
524 XFS_FSB_TO_B(mp, 1))));
525}
526
527/*
528 * Setting an attribute at mount time.
529 * the inode getting the attribute
530 * the superblock for allocations
531 * the agfs extents are allocated from
532 * the attribute btree * max depth
533 * the inode allocation btree
534 * Since attribute transaction space is dependent on the size of the attribute,
535 * the calculation is done partially at mount time and partially at runtime (see
536 * below).
537 */
538STATIC uint
539xfs_calc_attrsetm_reservation(
540 struct xfs_mount *mp)
541{
542 return XFS_DQUOT_LOGRES(mp) +
543 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
544 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
545 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
546}
547
548/*
549 * Setting an attribute at runtime, transaction space unit per block.
550 * the superblock for allocations: sector size
551 * the inode bmap btree could join or split: max depth * block size
552 * Since the runtime attribute transaction space is dependent on the total
553 * blocks needed for the first bmap, here we calculate the space unit for
554 * one block so that the caller can figure out the total space according
555 * to the attribute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp).
556 */
557STATIC uint
558xfs_calc_attrsetrt_reservation(
559 struct xfs_mount *mp)
560{
561 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
562 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
563 XFS_FSB_TO_B(mp, 1));
564}
565
566/*
567 * Removing an attribute.
568 * the inode: inode size
569 * the attribute btree could join: max depth * block size
570 * the inode bmap btree could join or split: max depth * block size
571 * And the bmap_finish transaction can free the attr blocks freed giving:
572 * the agf for the ag in which the blocks live: 2 * sector size
573 * the agfl for the ag in which the blocks live: 2 * sector size
574 * the superblock for the free block count: sector size
575 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
576 */
577STATIC uint
578xfs_calc_attrrm_reservation(
579 struct xfs_mount *mp)
580{
581 return XFS_DQUOT_LOGRES(mp) +
582 MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
583 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
584 XFS_FSB_TO_B(mp, 1)) +
585 (uint)XFS_FSB_TO_B(mp,
586 XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
587 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
588 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
589 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
590 XFS_FSB_TO_B(mp, 1))));
591}
592
593/*
594 * Clearing a bad agino number in an agi hash bucket.
595 */
596STATIC uint
597xfs_calc_clear_agi_bucket_reservation(
598 struct xfs_mount *mp)
599{
600 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
601}
602
603/*
604 * Clearing the quotaflags in the superblock.
605 * the super block for changing quota flags: sector size
606 */
607STATIC uint
608xfs_calc_qm_sbchange_reservation(
609 struct xfs_mount *mp)
610{
611 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
612}
613
614/*
615 * Adjusting quota limits.
616 * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
617 */
618STATIC uint
619xfs_calc_qm_setqlim_reservation(
620 struct xfs_mount *mp)
621{
622 return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
623}
624
625/*
626 * Allocating quota on disk if needed.
627 * the write transaction log space: XFS_WRITE_LOG_RES(mp)
628 * the unit of quota allocation: one system block size
629 */
630STATIC uint
631xfs_calc_qm_dqalloc_reservation(
632 struct xfs_mount *mp)
633{
634 return XFS_WRITE_LOG_RES(mp) +
635 xfs_calc_buf_res(1,
636 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
637}
638
639/*
640 * Turning off quotas.
641 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
642 * the superblock for the quota flags: sector size
643 */
644STATIC uint
645xfs_calc_qm_quotaoff_reservation(
646 struct xfs_mount *mp)
647{
648 return sizeof(struct xfs_qoff_logitem) * 2 +
649 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
650}
651
652/*
653 * End of turning off quotas.
654 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
655 */
656STATIC uint
657xfs_calc_qm_quotaoff_end_reservation(
658 struct xfs_mount *mp)
659{
660 return sizeof(struct xfs_qoff_logitem) * 2;
661}
662
663/*
664 * Syncing the incore super block changes to disk.
665 * the super block to reflect the changes: sector size
666 */
667STATIC uint
668xfs_calc_sb_reservation(
669 struct xfs_mount *mp)
670{
671 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
672}
673
674/*
675 * Initialize the precomputed transaction reservation values 52 * Initialize the precomputed transaction reservation values
676 * in the mount structure. 53 * in the mount structure.
677 */ 54 */
@@ -679,36 +56,7 @@ void
679xfs_trans_init( 56xfs_trans_init(
680 struct xfs_mount *mp) 57 struct xfs_mount *mp)
681{ 58{
682 struct xfs_trans_reservations *resp = &mp->m_reservations; 59 xfs_trans_resv_calc(mp, M_RES(mp));
683
684 resp->tr_write = xfs_calc_write_reservation(mp);
685 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
686 resp->tr_rename = xfs_calc_rename_reservation(mp);
687 resp->tr_link = xfs_calc_link_reservation(mp);
688 resp->tr_remove = xfs_calc_remove_reservation(mp);
689 resp->tr_symlink = xfs_calc_symlink_reservation(mp);
690 resp->tr_create = xfs_calc_create_reservation(mp);
691 resp->tr_mkdir = xfs_calc_mkdir_reservation(mp);
692 resp->tr_ifree = xfs_calc_ifree_reservation(mp);
693 resp->tr_ichange = xfs_calc_ichange_reservation(mp);
694 resp->tr_growdata = xfs_calc_growdata_reservation(mp);
695 resp->tr_swrite = xfs_calc_swrite_reservation(mp);
696 resp->tr_writeid = xfs_calc_writeid_reservation(mp);
697 resp->tr_addafork = xfs_calc_addafork_reservation(mp);
698 resp->tr_attrinval = xfs_calc_attrinval_reservation(mp);
699 resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp);
700 resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp);
701 resp->tr_attrrm = xfs_calc_attrrm_reservation(mp);
702 resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp);
703 resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp);
704 resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp);
705 resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp);
706 resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp);
707 resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp);
708 resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp);
709 resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp);
710 resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp);
711 resp->tr_sb = xfs_calc_sb_reservation(mp);
712} 60}
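All the xfs_calc_* logic removed above moves behind xfs_trans_resv_calc(), with M_RES(mp) presumably resolving to a per-mount reservation structure. A sketch of the new interface as the call sites in this series imply it (not verbatim from xfs_trans_resv.h):

	#define M_RES(mp)	(&(mp)->m_resv)

	void	xfs_trans_resv_calc(struct xfs_mount *mp,
				    struct xfs_trans_resv *resp);

so callers now name a specific reservation, e.g. &M_RES(mp)->tr_symlink, instead of passing raw log-space and log-count numbers to xfs_trans_reserve().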
713 61
714/* 62/*
@@ -744,7 +92,7 @@ _xfs_trans_alloc(
744 atomic_inc(&mp->m_active_trans); 92 atomic_inc(&mp->m_active_trans);
745 93
746 tp = kmem_zone_zalloc(xfs_trans_zone, memflags); 94 tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
747 tp->t_magic = XFS_TRANS_MAGIC; 95 tp->t_magic = XFS_TRANS_HEADER_MAGIC;
748 tp->t_type = type; 96 tp->t_type = type;
749 tp->t_mountp = mp; 97 tp->t_mountp = mp;
750 INIT_LIST_HEAD(&tp->t_items); 98 INIT_LIST_HEAD(&tp->t_items);
@@ -789,7 +137,7 @@ xfs_trans_dup(
789 /* 137 /*
790 * Initialize the new transaction structure. 138 * Initialize the new transaction structure.
791 */ 139 */
792 ntp->t_magic = XFS_TRANS_MAGIC; 140 ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
793 ntp->t_type = tp->t_type; 141 ntp->t_type = tp->t_type;
794 ntp->t_mountp = tp->t_mountp; 142 ntp->t_mountp = tp->t_mountp;
795 INIT_LIST_HEAD(&ntp->t_items); 143 INIT_LIST_HEAD(&ntp->t_items);
@@ -832,12 +180,10 @@ xfs_trans_dup(
832 */ 180 */
833int 181int
834xfs_trans_reserve( 182xfs_trans_reserve(
835 xfs_trans_t *tp, 183 struct xfs_trans *tp,
836 uint blocks, 184 struct xfs_trans_res *resp,
837 uint logspace, 185 uint blocks,
838 uint rtextents, 186 uint rtextents)
839 uint flags,
840 uint logcount)
841{ 187{
842 int error = 0; 188 int error = 0;
843 int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 189 int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
@@ -863,13 +209,15 @@ xfs_trans_reserve(
863 /* 209 /*
864 * Reserve the log space needed for this transaction. 210 * Reserve the log space needed for this transaction.
865 */ 211 */
866 if (logspace > 0) { 212 if (resp->tr_logres > 0) {
867 bool permanent = false; 213 bool permanent = false;
868 214
869 ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace); 215 ASSERT(tp->t_log_res == 0 ||
870 ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount); 216 tp->t_log_res == resp->tr_logres);
217 ASSERT(tp->t_log_count == 0 ||
218 tp->t_log_count == resp->tr_logcount);
871 219
872 if (flags & XFS_TRANS_PERM_LOG_RES) { 220 if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
873 tp->t_flags |= XFS_TRANS_PERM_LOG_RES; 221 tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
874 permanent = true; 222 permanent = true;
875 } else { 223 } else {
@@ -878,20 +226,21 @@ xfs_trans_reserve(
878 } 226 }
879 227
880 if (tp->t_ticket != NULL) { 228 if (tp->t_ticket != NULL) {
881 ASSERT(flags & XFS_TRANS_PERM_LOG_RES); 229 ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
882 error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); 230 error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
883 } else { 231 } else {
884 error = xfs_log_reserve(tp->t_mountp, logspace, 232 error = xfs_log_reserve(tp->t_mountp,
885 logcount, &tp->t_ticket, 233 resp->tr_logres,
886 XFS_TRANSACTION, permanent, 234 resp->tr_logcount,
887 tp->t_type); 235 &tp->t_ticket, XFS_TRANSACTION,
236 permanent, tp->t_type);
888 } 237 }
889 238
890 if (error) 239 if (error)
891 goto undo_blocks; 240 goto undo_blocks;
892 241
893 tp->t_log_res = logspace; 242 tp->t_log_res = resp->tr_logres;
894 tp->t_log_count = logcount; 243 tp->t_log_count = resp->tr_logcount;
895 } 244 }
896 245
897 /* 246 /*
@@ -916,10 +265,10 @@ xfs_trans_reserve(
916 * reservations which have already been performed. 265 * reservations which have already been performed.
917 */ 266 */
918undo_log: 267undo_log:
919 if (logspace > 0) { 268 if (resp->tr_logres > 0) {
920 int log_flags; 269 int log_flags;
921 270
922 if (flags & XFS_TRANS_PERM_LOG_RES) { 271 if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
923 log_flags = XFS_LOG_REL_PERM_RESERV; 272 log_flags = XFS_LOG_REL_PERM_RESERV;
924 } else { 273 } else {
925 log_flags = 0; 274 log_flags = 0;
@@ -1367,10 +716,10 @@ xfs_trans_free_items(
1367 lip->li_desc = NULL; 716 lip->li_desc = NULL;
1368 717
1369 if (commit_lsn != NULLCOMMITLSN) 718 if (commit_lsn != NULLCOMMITLSN)
1370 IOP_COMMITTING(lip, commit_lsn); 719 lip->li_ops->iop_committing(lip, commit_lsn);
1371 if (flags & XFS_TRANS_ABORT) 720 if (flags & XFS_TRANS_ABORT)
1372 lip->li_flags |= XFS_LI_ABORTED; 721 lip->li_flags |= XFS_LI_ABORTED;
1373 IOP_UNLOCK(lip); 722 lip->li_ops->iop_unlock(lip);
1374 723
1375 xfs_trans_free_item_desc(lidp); 724 xfs_trans_free_item_desc(lidp);
1376 } 725 }
@@ -1390,8 +739,11 @@ xfs_log_item_batch_insert(
1390 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ 739 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1391 xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); 740 xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
1392 741
1393 for (i = 0; i < nr_items; i++) 742 for (i = 0; i < nr_items; i++) {
1394 IOP_UNPIN(log_items[i], 0); 743 struct xfs_log_item *lip = log_items[i];
744
745 lip->li_ops->iop_unpin(lip, 0);
746 }
1395} 747}
1396 748
1397/* 749/*
@@ -1401,11 +753,11 @@ xfs_log_item_batch_insert(
1401 * 753 *
1402 * If we are called with the aborted flag set, it is because a log write during 754 * If we are called with the aborted flag set, it is because a log write during
1403 * a CIL checkpoint commit has failed. In this case, all the items in the 755 * a CIL checkpoint commit has failed. In this case, all the items in the
1404 * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which 756 * checkpoint have already gone through iop_committed and iop_unlock, which
1405 * means that checkpoint commit abort handling is treated exactly the same 757 * means that checkpoint commit abort handling is treated exactly the same
1406 * as an iclog write error even though we haven't started any IO yet. Hence in 758 * as an iclog write error even though we haven't started any IO yet. Hence in
1407 * this case all we need to do is IOP_COMMITTED processing, followed by an 759 * this case all we need to do is iop_committed processing, followed by an
1408 * IOP_UNPIN(aborted) call. 760 * iop_unpin(aborted) call.
1409 * 761 *
1410 * The AIL cursor is used to optimise the insert process. If commit_lsn is not 762 * The AIL cursor is used to optimise the insert process. If commit_lsn is not
1411 * at the end of the AIL, the insert cursor avoids the need to walk 763 * at the end of the AIL, the insert cursor avoids the need to walk
@@ -1438,7 +790,7 @@ xfs_trans_committed_bulk(
1438 790
1439 if (aborted) 791 if (aborted)
1440 lip->li_flags |= XFS_LI_ABORTED; 792 lip->li_flags |= XFS_LI_ABORTED;
1441 item_lsn = IOP_COMMITTED(lip, commit_lsn); 793 item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
1442 794
1443 /* item_lsn of -1 means the item needs no further processing */ 795 /* item_lsn of -1 means the item needs no further processing */
1444 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) 796 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
@@ -1450,7 +802,7 @@ xfs_trans_committed_bulk(
1450 */ 802 */
1451 if (aborted) { 803 if (aborted) {
1452 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); 804 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1453 IOP_UNPIN(lip, 1); 805 lip->li_ops->iop_unpin(lip, 1);
1454 continue; 806 continue;
1455 } 807 }
1456 808
@@ -1468,7 +820,7 @@ xfs_trans_committed_bulk(
1468 xfs_trans_ail_update(ailp, lip, item_lsn); 820 xfs_trans_ail_update(ailp, lip, item_lsn);
1469 else 821 else
1470 spin_unlock(&ailp->xa_lock); 822 spin_unlock(&ailp->xa_lock);
1471 IOP_UNPIN(lip, 0); 823 lip->li_ops->iop_unpin(lip, 0);
1472 continue; 824 continue;
1473 } 825 }
1474 826
@@ -1666,7 +1018,7 @@ xfs_trans_roll(
1666 struct xfs_inode *dp) 1018 struct xfs_inode *dp)
1667{ 1019{
1668 struct xfs_trans *trans; 1020 struct xfs_trans *trans;
1669 unsigned int logres, count; 1021 struct xfs_trans_res tres;
1670 int error; 1022 int error;
1671 1023
1672 /* 1024 /*
@@ -1678,8 +1030,8 @@ xfs_trans_roll(
1678 /* 1030 /*
1679 * Copy the critical parameters from one trans to the next. 1031 * Copy the critical parameters from one trans to the next.
1680 */ 1032 */
1681 logres = trans->t_log_res; 1033 tres.tr_logres = trans->t_log_res;
1682 count = trans->t_log_count; 1034 tres.tr_logcount = trans->t_log_count;
1683 *tpp = xfs_trans_dup(trans); 1035 *tpp = xfs_trans_dup(trans);
1684 1036
1685 /* 1037 /*
@@ -1710,8 +1062,8 @@ xfs_trans_roll(
1710 * across this call, or that anything that is locked be logged in 1062 * across this call, or that anything that is locked be logged in
1711 * the prior and the next transactions. 1063 * the prior and the next transactions.
1712 */ 1064 */
1713 error = xfs_trans_reserve(trans, 0, logres, 0, 1065 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
1714 XFS_TRANS_PERM_LOG_RES, count); 1066 error = xfs_trans_reserve(trans, &tres, 0, 0);
1715 /* 1067 /*
1716 * Ensure that the inode is in the new transaction and locked. 1068 * Ensure that the inode is in the new transaction and locked.
1717 */ 1069 */
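Taken together, the hunks above convert xfs_trans_reserve() callers from five loose reservation arguments to a single struct xfs_trans_res; the xfs_trans_roll() conversion is the canonical before/after (assembled here from the hunks themselves, not extra patch content):

	/* before: individual reservation parameters */
	error = xfs_trans_reserve(trans, 0, logres, 0,
				  XFS_TRANS_PERM_LOG_RES, count);

	/* after: parameters carried in a struct xfs_trans_res */
	tres.tr_logres = trans->t_log_res;
	tres.tr_logcount = trans->t_log_count;
	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
	error = xfs_trans_reserve(trans, &tres, 0, 0);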
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 2b4946393e30..09cf40b89e8c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -20,285 +20,9 @@
20 20
21struct xfs_log_item; 21struct xfs_log_item;
22 22
23/* 23#include "xfs_trans_resv.h"
24 * This is the structure written in the log at the head of
25 * every transaction. It identifies the type and id of the
26 * transaction, and contains the number of items logged by
27 * the transaction so we know how many to expect during recovery.
28 *
29 * Do not change the below structure without redoing the code in
30 * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
31 */
32typedef struct xfs_trans_header {
33 uint th_magic; /* magic number */
34 uint th_type; /* transaction type */
35 __int32_t th_tid; /* transaction id (unused) */
36 uint th_num_items; /* num items logged by trans */
37} xfs_trans_header_t;
38
39#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
40
41/*
42 * Log item types.
43 */
44#define XFS_LI_EFI 0x1236
45#define XFS_LI_EFD 0x1237
46#define XFS_LI_IUNLINK 0x1238
47#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
48#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
49#define XFS_LI_DQUOT 0x123d
50#define XFS_LI_QUOTAOFF 0x123e
51#define XFS_LI_ICREATE 0x123f
52
53#define XFS_LI_TYPE_DESC \
54 { XFS_LI_EFI, "XFS_LI_EFI" }, \
55 { XFS_LI_EFD, "XFS_LI_EFD" }, \
56 { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
57 { XFS_LI_INODE, "XFS_LI_INODE" }, \
58 { XFS_LI_BUF, "XFS_LI_BUF" }, \
59 { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
60 { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }
61
62/*
63 * Transaction types. Used to distinguish types of buffers.
64 */
65#define XFS_TRANS_SETATTR_NOT_SIZE 1
66#define XFS_TRANS_SETATTR_SIZE 2
67#define XFS_TRANS_INACTIVE 3
68#define XFS_TRANS_CREATE 4
69#define XFS_TRANS_CREATE_TRUNC 5
70#define XFS_TRANS_TRUNCATE_FILE 6
71#define XFS_TRANS_REMOVE 7
72#define XFS_TRANS_LINK 8
73#define XFS_TRANS_RENAME 9
74#define XFS_TRANS_MKDIR 10
75#define XFS_TRANS_RMDIR 11
76#define XFS_TRANS_SYMLINK 12
77#define XFS_TRANS_SET_DMATTRS 13
78#define XFS_TRANS_GROWFS 14
79#define XFS_TRANS_STRAT_WRITE 15
80#define XFS_TRANS_DIOSTRAT 16
81/* 17 was XFS_TRANS_WRITE_SYNC */
82#define XFS_TRANS_WRITEID 18
83#define XFS_TRANS_ADDAFORK 19
84#define XFS_TRANS_ATTRINVAL 20
85#define XFS_TRANS_ATRUNCATE 21
86#define XFS_TRANS_ATTR_SET 22
87#define XFS_TRANS_ATTR_RM 23
88#define XFS_TRANS_ATTR_FLAG 24
89#define XFS_TRANS_CLEAR_AGI_BUCKET 25
90#define XFS_TRANS_QM_SBCHANGE 26
91/*
92 * Dummy entries since we use the transaction type to index into the
93 * trans_type[] in xlog_recover_print_trans_head()
94 */
95#define XFS_TRANS_DUMMY1 27
96#define XFS_TRANS_DUMMY2 28
97#define XFS_TRANS_QM_QUOTAOFF 29
98#define XFS_TRANS_QM_DQALLOC 30
99#define XFS_TRANS_QM_SETQLIM 31
100#define XFS_TRANS_QM_DQCLUSTER 32
101#define XFS_TRANS_QM_QINOCREATE 33
102#define XFS_TRANS_QM_QUOTAOFF_END 34
103#define XFS_TRANS_SB_UNIT 35
104#define XFS_TRANS_FSYNC_TS 36
105#define XFS_TRANS_GROWFSRT_ALLOC 37
106#define XFS_TRANS_GROWFSRT_ZERO 38
107#define XFS_TRANS_GROWFSRT_FREE 39
108#define XFS_TRANS_SWAPEXT 40
109#define XFS_TRANS_SB_COUNT 41
110#define XFS_TRANS_CHECKPOINT 42
111#define XFS_TRANS_ICREATE 43
112#define XFS_TRANS_TYPE_MAX 43
113/* new transaction types need to be reflected in xfs_logprint(8) */
114
115#define XFS_TRANS_TYPES \
116 { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
117 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
118 { XFS_TRANS_INACTIVE, "INACTIVE" }, \
119 { XFS_TRANS_CREATE, "CREATE" }, \
120 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
121 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
122 { XFS_TRANS_REMOVE, "REMOVE" }, \
123 { XFS_TRANS_LINK, "LINK" }, \
124 { XFS_TRANS_RENAME, "RENAME" }, \
125 { XFS_TRANS_MKDIR, "MKDIR" }, \
126 { XFS_TRANS_RMDIR, "RMDIR" }, \
127 { XFS_TRANS_SYMLINK, "SYMLINK" }, \
128 { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
129 { XFS_TRANS_GROWFS, "GROWFS" }, \
130 { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
131 { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
132 { XFS_TRANS_WRITEID, "WRITEID" }, \
133 { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
134 { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
135 { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
136 { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
137 { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
138 { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
139 { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
140 { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \
141 { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
142 { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
143 { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
144 { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
145 { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
146 { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
147 { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
148 { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
149 { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
150 { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
151 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
152 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
153 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
154 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
155 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
156 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
157 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
158
159/*
160 * This structure is used to track log items associated with
161 * a transaction. It points to the log item and keeps some
162 * flags to track the state of the log item. It also tracks
163 * the amount of space needed to log the item it describes
164 * once we get to commit processing (see xfs_trans_commit()).
165 */
166struct xfs_log_item_desc {
167 struct xfs_log_item *lid_item;
168 struct list_head lid_trans;
169 unsigned char lid_flags;
170};
171
172#define XFS_LID_DIRTY 0x1
173
174#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
175/*
176 * Values for t_flags.
177 */
178#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
179#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
180#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
181#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
182#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
183#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
184#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
185 count in superblock */
186
187/*
188 * Values for call flags parameter.
189 */
190#define XFS_TRANS_RELEASE_LOG_RES 0x4
191#define XFS_TRANS_ABORT 0x8
192
193/*
194 * Field values for xfs_trans_mod_sb.
195 */
196#define XFS_TRANS_SB_ICOUNT 0x00000001
197#define XFS_TRANS_SB_IFREE 0x00000002
198#define XFS_TRANS_SB_FDBLOCKS 0x00000004
199#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
200#define XFS_TRANS_SB_FREXTENTS 0x00000010
201#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
202#define XFS_TRANS_SB_DBLOCKS 0x00000040
203#define XFS_TRANS_SB_AGCOUNT 0x00000080
204#define XFS_TRANS_SB_IMAXPCT 0x00000100
205#define XFS_TRANS_SB_REXTSIZE 0x00000200
206#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
207#define XFS_TRANS_SB_RBLOCKS 0x00000800
208#define XFS_TRANS_SB_REXTENTS 0x00001000
209#define XFS_TRANS_SB_REXTSLOG 0x00002000
210
211
212/*
213 * Per-extent log reservation for the allocation btree changes
214 * involved in freeing or allocating an extent.
215 * 2 trees * (2 blocks/level * max depth - 1)
216 */
217#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
218 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
219
220/*
221 * Per-directory log reservation for any directory change.
222 * dir blocks: (1 btree block per level + data block + free block)
223 * bmap btree: (levels + 2) * max depth
224 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
225 * size, so account for that in the DAENTER macros.
226 */
227#define XFS_DIROP_LOG_COUNT(mp) \
228 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
229 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
230
231 24
232#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) 25/* kernel only transaction subsystem defines */
233#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
234#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
235#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
236#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
237#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
238#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
239#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
240#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
241#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
242#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
243#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
244#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
245#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
246#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
247/*
248 * Logging the inode timestamps on an fsync -- same as SWRITE
249 * as long as SWRITE logs the entire inode core
250 */
251#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
252#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
253#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
254#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
255#define XFS_ATTRSETM_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetm)
256#define XFS_ATTRSETRT_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetrt)
257#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
258#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
259#define XFS_QM_SBCHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_qm_sbchange)
260#define XFS_QM_SETQLIM_LOG_RES(mp) ((mp)->m_reservations.tr_qm_setqlim)
261#define XFS_QM_DQALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_qm_dqalloc)
262#define XFS_QM_QUOTAOFF_LOG_RES(mp) ((mp)->m_reservations.tr_qm_quotaoff)
263#define XFS_QM_QUOTAOFF_END_LOG_RES(mp) ((mp)->m_reservations.tr_qm_equotaoff)
264#define XFS_SB_LOG_RES(mp) ((mp)->m_reservations.tr_sb)
265
266/*
267 * Various log count values.
268 */
269#define XFS_DEFAULT_LOG_COUNT 1
270#define XFS_DEFAULT_PERM_LOG_COUNT 2
271#define XFS_ITRUNCATE_LOG_COUNT 2
272#define XFS_INACTIVE_LOG_COUNT 2
273#define XFS_CREATE_LOG_COUNT 2
274#define XFS_MKDIR_LOG_COUNT 3
275#define XFS_SYMLINK_LOG_COUNT 3
276#define XFS_REMOVE_LOG_COUNT 2
277#define XFS_LINK_LOG_COUNT 2
278#define XFS_RENAME_LOG_COUNT 2
279#define XFS_WRITE_LOG_COUNT 2
280#define XFS_ADDAFORK_LOG_COUNT 2
281#define XFS_ATTRINVAL_LOG_COUNT 1
282#define XFS_ATTRSET_LOG_COUNT 3
283#define XFS_ATTRRM_LOG_COUNT 3
284
285/*
286 * Here we centralize the specification of XFS meta-data buffer
287 * reference count values. This determines how hard the buffer
288 * cache tries to hold onto the buffer.
289 */
290#define XFS_AGF_REF 4
291#define XFS_AGI_REF 4
292#define XFS_AGFL_REF 3
293#define XFS_INO_BTREE_REF 3
294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
298#define XFS_ATTR_BTREE_REF 1
299#define XFS_DQUOT_REF 1
300
301#ifdef __KERNEL__
302 26
303struct xfs_buf; 27struct xfs_buf;
304struct xfs_buftarg; 28struct xfs_buftarg;
@@ -310,6 +34,7 @@ struct xfs_log_iovec;
310struct xfs_log_item_desc; 34struct xfs_log_item_desc;
311struct xfs_mount; 35struct xfs_mount;
312struct xfs_trans; 36struct xfs_trans;
37struct xfs_trans_res;
313struct xfs_dquot_acct; 38struct xfs_dquot_acct;
314struct xfs_busy_extent; 39struct xfs_busy_extent;
315 40
@@ -342,7 +67,7 @@ typedef struct xfs_log_item {
342 { XFS_LI_ABORTED, "ABORTED" } 67 { XFS_LI_ABORTED, "ABORTED" }
343 68
344struct xfs_item_ops { 69struct xfs_item_ops {
345 uint (*iop_size)(xfs_log_item_t *); 70 void (*iop_size)(xfs_log_item_t *, int *, int *);
346 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); 71 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
347 void (*iop_pin)(xfs_log_item_t *); 72 void (*iop_pin)(xfs_log_item_t *);
348 void (*iop_unpin)(xfs_log_item_t *, int remove); 73 void (*iop_unpin)(xfs_log_item_t *, int remove);
@@ -352,17 +77,8 @@ struct xfs_item_ops {
352 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 77 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
353}; 78};
354 79
355#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
356#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
357#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
358#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
359#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list)
360#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
361#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
362#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
363
364/* 80/*
365 * Return values for the IOP_PUSH() routines. 81 * Return values for the iop_push() routines.
366 */ 82 */
367#define XFS_ITEM_SUCCESS 0 83#define XFS_ITEM_SUCCESS 0
368#define XFS_ITEM_PINNED 1 84#define XFS_ITEM_PINNED 1
@@ -446,7 +162,7 @@ typedef struct xfs_trans {
446xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); 162xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
447xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); 163xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
448xfs_trans_t *xfs_trans_dup(xfs_trans_t *); 164xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
449int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, 165int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
450 uint, uint); 166 uint, uint);
451void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); 167void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
452 168
@@ -528,9 +244,4 @@ void xfs_trans_ail_destroy(struct xfs_mount *);
528extern kmem_zone_t *xfs_trans_zone; 244extern kmem_zone_t *xfs_trans_zone;
529extern kmem_zone_t *xfs_log_item_desc_zone; 245extern kmem_zone_t *xfs_log_item_desc_zone;
530 246
531#endif /* __KERNEL__ */
532
533void xfs_trans_init(struct xfs_mount *);
534int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
535
536#endif /* __XFS_TRANS_H__ */ 247#endif /* __XFS_TRANS_H__ */
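With the IOP_*() wrapper macros gone from this header, call sites dereference the ops vector directly; the mechanical pattern used throughout the patch is:

	IOP_UNPIN(lip, 0);			/* old macro form */
	lip->li_ops->iop_unpin(lip, 0);		/* direct call it expanded to */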
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0eda7254305f..21c6d7ddbc06 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -61,20 +61,6 @@ xfs_ail_check(
61#endif /* DEBUG */ 61#endif /* DEBUG */
62 62
63/* 63/*
64 * Return a pointer to the first item in the AIL. If the AIL is empty, then
65 * return NULL.
66 */
67xfs_log_item_t *
68xfs_ail_min(
69 struct xfs_ail *ailp)
70{
71 if (list_empty(&ailp->xa_ail))
72 return NULL;
73
74 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
75}
76
77 /*
78 * Return a pointer to the last item in the AIL. If the AIL is empty, then 64 * Return a pointer to the last item in the AIL. If the AIL is empty, then
79 * return NULL. 65 * return NULL.
80 */ 66 */
@@ -393,11 +379,11 @@ xfsaild_push(
393 int lock_result; 379 int lock_result;
394 380
395 /* 381 /*
396 * Note that IOP_PUSH may unlock and reacquire the AIL lock. We 382 * Note that iop_push may unlock and reacquire the AIL lock. We
397 * rely on the AIL cursor implementation to be able to deal with 383 * rely on the AIL cursor implementation to be able to deal with
398 * the dropped lock. 384 * the dropped lock.
399 */ 385 */
400 lock_result = IOP_PUSH(lip, &ailp->xa_buf_list); 386 lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
401 switch (lock_result) { 387 switch (lock_result) {
402 case XFS_ITEM_SUCCESS: 388 case XFS_ITEM_SUCCESS:
403 XFS_STATS_INC(xs_push_ail_success); 389 XFS_STATS_INC(xs_push_ail_success);
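The xfs_ail_min() removed at the top of this hunk is not dropped; it reappears in xfs_trans_priv.h (below) as a static inline. list_first_entry_or_null() yields NULL for an empty list, so the new one-liner is behaviourally identical to the old open-coded check:

	/* old open-coded form */
	if (list_empty(&ailp->xa_ail))
		return NULL;
	return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);

	/* new inline equivalent */
	return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
					li_ail);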
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index aa5a04b844d6..8c75b8f67270 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -505,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
505 505
506/* 506/*
507 * Mark the buffer as not needing to be unlocked when the buf item's 507 * Mark the buffer as not needing to be unlocked when the buf item's
508 * IOP_UNLOCK() routine is called. The buffer must already be locked 508 * iop_unlock() routine is called. The buffer must already be locked
509 * and associated with the given transaction. 509 * and associated with the given transaction.
510 */ 510 */
511/* ARGSUSED */ 511/* ARGSUSED */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 61407a847b86..54ee3c5dee76 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 53b7c9b0f8f7..c52def0b441c 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,6 +25,9 @@ struct xfs_trans;
25struct xfs_ail; 25struct xfs_ail;
26struct xfs_log_vec; 26struct xfs_log_vec;
27 27
28
29void xfs_trans_init(struct xfs_mount *);
30int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 31void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
29void xfs_trans_del_item(struct xfs_log_item *); 32void xfs_trans_del_item(struct xfs_log_item *);
30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 33void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
@@ -83,6 +86,18 @@ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
83 struct xfs_ail_cursor *cur, 86 struct xfs_ail_cursor *cur,
84 struct xfs_log_item **log_items, int nr_items, 87 struct xfs_log_item **log_items, int nr_items,
85 xfs_lsn_t lsn) __releases(ailp->xa_lock); 88 xfs_lsn_t lsn) __releases(ailp->xa_lock);
89/*
90 * Return a pointer to the first item in the AIL. If the AIL is empty, then
91 * return NULL.
92 */
93static inline struct xfs_log_item *
94xfs_ail_min(
95 struct xfs_ail *ailp)
96{
97 return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
98 li_ail);
99}
100
86static inline void 101static inline void
87xfs_trans_ail_update( 102xfs_trans_ail_update(
88 struct xfs_ail *ailp, 103 struct xfs_ail *ailp,
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
new file mode 100644
index 000000000000..a65a3cc40610
--- /dev/null
+++ b/fs/xfs/xfs_trans_resv.c
@@ -0,0 +1,803 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * Copyright (C) 2010 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_log.h"
23#include "xfs_trans_resv.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_error.h"
29#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dinode.h"
34#include "xfs_inode.h"
35#include "xfs_btree.h"
36#include "xfs_ialloc.h"
37#include "xfs_alloc.h"
38#include "xfs_extent_busy.h"
39#include "xfs_bmap.h"
40#include "xfs_bmap_util.h"
41#include "xfs_quota.h"
42#include "xfs_qm.h"
43#include "xfs_trans_space.h"
44#include "xfs_trace.h"
45
46/*
47 * A buffer has a format structure overhead in the log in addition
48 * to the data, so we need to take this into account when reserving
49 * space in a transaction for a buffer. Round the space required up
50 * to a multiple of 128 bytes so that we don't change the historical
51 * reservation that has been used for this overhead.
52 */
53STATIC uint
54xfs_buf_log_overhead(void)
55{
56 return round_up(sizeof(struct xlog_op_header) +
57 sizeof(struct xfs_buf_log_format), 128);
58}
59
60/*
 61 * Calculate the transaction log reservation per item in bytes.
62 *
63 * The nbufs argument is used to indicate the number of items that
64 * will be changed in a transaction. size is used to tell how many
65 * bytes should be reserved per item.
66 */
67STATIC uint
68xfs_calc_buf_res(
69 uint nbufs,
70 uint size)
71{
72 return nbufs * (size + xfs_buf_log_overhead());
73}
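As a worked example of the formula above, with illustrative numbers only (not the actual structure sizes): suppose the op header plus buf log format come to 100 bytes, which rounds up to a 128-byte overhead; reserving three one-block buffers on a 4096-byte-block filesystem then gives:

	xfs_calc_buf_res(3, 4096) = 3 * (4096 + 128) = 12672 bytes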
74
75/*
76 * Logging inodes is really tricksy. They are logged in memory format,
77 * which means that what we write into the log doesn't directly translate into
78 * the amount of space they use on disk.
79 *
80 * Case in point - btree format forks in memory format use more space than the
81 * on-disk format. In memory, the buffer contains a normal btree block header so
82 * the btree code can treat it as though it is just another generic buffer.
83 * However, when we write it to the inode fork, we don't write all of this
84 * header as it isn't needed. e.g. the root is only ever in the inode, so
85 * there's no need for sibling pointers which would waste 16 bytes of space.
86 *
 87 * Hence when we have an inode with a maximally sized btree format fork, the
88 * amount of information we actually log is greater than the size of the inode
89 * on disk. Hence we need an inode reservation function that calculates all this
90 * correctly. So, we log:
91 *
92 * - log op headers for object
93 * - inode log format object
94 * - the entire inode contents (core + 2 forks)
95 * - two bmap btree block headers
96 */
97STATIC uint
98xfs_calc_inode_res(
99 struct xfs_mount *mp,
100 uint ninodes)
101{
102 return ninodes * (sizeof(struct xlog_op_header) +
103 sizeof(struct xfs_inode_log_format) +
104 mp->m_sb.sb_inodesize +
105 2 * XFS_BMBT_BLOCK_LEN(mp));
106}
107
108/*
109 * Various log reservation values.
110 *
111 * These are based on the size of the file system block because that is what
112 * most transactions manipulate. Each adds in an additional 128 bytes per
113 * item logged to try to account for the overhead of the transaction mechanism.
114 *
115 * Note: Most of the reservations underestimate the number of allocation
116 * groups into which they could free extents in the xfs_bmap_finish() call.
117 * This is because the number in the worst case is quite high and quite
118 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
119 * extents in only a single AG at a time. This will require changes to the
120 * EFI code as well, however, so that the EFI for the extents not freed is
121 * logged again in each transaction. See SGI PV #261917.
122 *
123 * Reservation functions here avoid a huge stack in xfs_trans_init due to
124 * register overflow from temporaries in the calculations.
125 */
126
127
128/*
129 * In a write transaction we can allocate a maximum of 2
130 * extents. This gives:
131 * the inode getting the new extents: inode size
132 * the inode's bmap btree: max depth * block size
133 * the agfs of the ags from which the extents are allocated: 2 * sector
134 * the superblock free block counter: sector size
135 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
136 * And the bmap_finish transaction can free bmap blocks in a join:
137 * the agfs of the ags containing the blocks: 2 * sector size
138 * the agfls of the ags containing the blocks: 2 * sector size
139 * the super block free block counter: sector size
140 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
141 */
142STATIC uint
143xfs_calc_write_reservation(
144 struct xfs_mount *mp)
145{
146 return XFS_DQUOT_LOGRES(mp) +
147 MAX((xfs_calc_inode_res(mp, 1) +
148 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
149 XFS_FSB_TO_B(mp, 1)) +
150 xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
151 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
152 XFS_FSB_TO_B(mp, 1))),
153 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
154 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
155 XFS_FSB_TO_B(mp, 1))));
156}
157
158/*
159 * In truncating a file we free up to two extents at once. We can modify:
160 * the inode being truncated: inode size
161 * the inode's bmap btree: (max depth + 1) * block size
162 * And the bmap_finish transaction can free the blocks and bmap blocks:
163 * the agf for each of the ags: 4 * sector size
164 * the agfl for each of the ags: 4 * sector size
165 * the super block to reflect the freed blocks: sector size
166 * worst case split in allocation btrees per extent assuming 4 extents:
167 * 4 exts * 2 trees * (2 * max depth - 1) * block size
168 * the inode btree: max depth * blocksize
169 * the allocation btrees: 2 trees * (max depth - 1) * block size
170 */
171STATIC uint
172xfs_calc_itruncate_reservation(
173 struct xfs_mount *mp)
174{
175 return XFS_DQUOT_LOGRES(mp) +
176 MAX((xfs_calc_inode_res(mp, 1) +
177 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
178 XFS_FSB_TO_B(mp, 1))),
179 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
180 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
181 XFS_FSB_TO_B(mp, 1)) +
182 xfs_calc_buf_res(5, 0) +
183 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
184 XFS_FSB_TO_B(mp, 1)) +
185 xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
186 mp->m_in_maxlevels, 0)));
187}
188
189/*
190 * In renaming a file we can modify:
191 * the four inodes involved: 4 * inode size
192 * the two directory btrees: 2 * (max depth + v2) * dir block size
193 * the two directory bmap btrees: 2 * max depth * block size
194 * And the bmap_finish transaction can free dir and bmap blocks (two sets
195 * of bmap blocks) giving:
196 * the agf for the ags in which the blocks live: 3 * sector size
197 * the agfl for the ags in which the blocks live: 3 * sector size
198 * the superblock for the free block count: sector size
199 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
200 */
201STATIC uint
202xfs_calc_rename_reservation(
203 struct xfs_mount *mp)
204{
205 return XFS_DQUOT_LOGRES(mp) +
206 MAX((xfs_calc_inode_res(mp, 4) +
207 xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
208 XFS_FSB_TO_B(mp, 1))),
209 (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
210 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
211 XFS_FSB_TO_B(mp, 1))));
212}
213
214/*
215 * For creating a link to an inode:
216 * the parent directory inode: inode size
217 * the linked inode: inode size
218 * the directory btree could split: (max depth + v2) * dir block size
219 * the directory bmap btree could join or split: (max depth + v2) * blocksize
220 * And the bmap_finish transaction can free some bmap blocks giving:
221 * the agf for the ag in which the blocks live: sector size
222 * the agfl for the ag in which the blocks live: sector size
223 * the superblock for the free block count: sector size
224 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
225 */
226STATIC uint
227xfs_calc_link_reservation(
228 struct xfs_mount *mp)
229{
230 return XFS_DQUOT_LOGRES(mp) +
231 MAX((xfs_calc_inode_res(mp, 2) +
232 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
233 XFS_FSB_TO_B(mp, 1))),
234 (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
235 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
236 XFS_FSB_TO_B(mp, 1))));
237}
238
239/*
240 * For removing a directory entry we can modify:
241 * the parent directory inode: inode size
242 * the removed inode: inode size
243 * the directory btree could join: (max depth + v2) * dir block size
244 * the directory bmap btree could join or split: (max depth + v2) * blocksize
245 * And the bmap_finish transaction can free the dir and bmap blocks giving:
246 * the agf for the ag in which the blocks live: 2 * sector size
247 * the agfl for the ag in which the blocks live: 2 * sector size
248 * the superblock for the free block count: sector size
249 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
250 */
251STATIC uint
252xfs_calc_remove_reservation(
253 struct xfs_mount *mp)
254{
255 return XFS_DQUOT_LOGRES(mp) +
256 MAX((xfs_calc_inode_res(mp, 2) +
257 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
258 XFS_FSB_TO_B(mp, 1))),
259 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
260 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
261 XFS_FSB_TO_B(mp, 1))));
262}
263
264/*
265 * For create, break it into the two cases that the transaction
266 * covers. We start with the modify case - allocation done by modification
267 * of the state of existing inodes - and the allocation case.
268 */
269
270/*
271 * For create we can modify:
272 * the parent directory inode: inode size
273 * the new inode: inode size
274 * the inode btree entry: block size
275 * the superblock for the nlink flag: sector size
276 * the directory btree: (max depth + v2) * dir block size
277 * the directory inode's bmap btree: (max depth + v2) * block size
278 */
279STATIC uint
280xfs_calc_create_resv_modify(
281 struct xfs_mount *mp)
282{
283 return xfs_calc_inode_res(mp, 2) +
284 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
285 (uint)XFS_FSB_TO_B(mp, 1) +
286 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
287}
288
289/*
290 * For create we can allocate some inodes giving:
291 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
292 * the superblock for the nlink flag: sector size
293 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
294 * the inode btree: max depth * blocksize
295 * the allocation btrees: 2 trees * (max depth - 1) * block size
296 */
297STATIC uint
298xfs_calc_create_resv_alloc(
299 struct xfs_mount *mp)
300{
301 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
302 mp->m_sb.sb_sectsize +
303 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
304 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
305 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
306 XFS_FSB_TO_B(mp, 1));
307}
308
309STATIC uint
310__xfs_calc_create_reservation(
311 struct xfs_mount *mp)
312{
313 return XFS_DQUOT_LOGRES(mp) +
314 MAX(xfs_calc_create_resv_alloc(mp),
315 xfs_calc_create_resv_modify(mp));
316}
317
318/*
319 * For icreate we can allocate some inodes giving:
320 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
321 * the superblock for the nlink flag: sector size
322 * the inode btree: max depth * blocksize
323 * the allocation btrees: 2 trees * (max depth - 1) * block size
324 */
325STATIC uint
326xfs_calc_icreate_resv_alloc(
327 struct xfs_mount *mp)
328{
329 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
330 mp->m_sb.sb_sectsize +
331 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
332 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
333 XFS_FSB_TO_B(mp, 1));
334}
335
336STATIC uint
337xfs_calc_icreate_reservation(xfs_mount_t *mp)
338{
339 return XFS_DQUOT_LOGRES(mp) +
340 MAX(xfs_calc_icreate_resv_alloc(mp),
341 xfs_calc_create_resv_modify(mp));
342}
343
344STATIC uint
345xfs_calc_create_reservation(
346 struct xfs_mount *mp)
347{
348 if (xfs_sb_version_hascrc(&mp->m_sb))
349 return xfs_calc_icreate_reservation(mp);
350 return __xfs_calc_create_reservation(mp);
351
352}
353
354/*
355 * Making a new directory is the same as creating a new file.
356 */
357STATIC uint
358xfs_calc_mkdir_reservation(
359 struct xfs_mount *mp)
360{
361 return xfs_calc_create_reservation(mp);
362}
363
364
365/*
366 * Making a new symlink is the same as creating a new file, but
367 * with the added blocks for remote symlink data which can be up to 1kB in
368 * length (MAXPATHLEN).
369 */
370STATIC uint
371xfs_calc_symlink_reservation(
372 struct xfs_mount *mp)
373{
374 return xfs_calc_create_reservation(mp) +
375 xfs_calc_buf_res(1, MAXPATHLEN);
376}
377
378/*
379 * In freeing an inode we can modify:
380 * the inode being freed: inode size
381 * the super block free inode counter: sector size
382 * the agi hash list and counters: sector size
383 * the inode btree entry: block size
384 * the on disk inode before ours in the agi hash list: inode cluster size
385 * the inode btree: max depth * blocksize
386 * the allocation btrees: 2 trees * (max depth - 1) * block size
387 */
388STATIC uint
389xfs_calc_ifree_reservation(
390 struct xfs_mount *mp)
391{
392 return XFS_DQUOT_LOGRES(mp) +
393 xfs_calc_inode_res(mp, 1) +
394 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
395 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
396 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
397 XFS_INODE_CLUSTER_SIZE(mp)) +
398 xfs_calc_buf_res(1, 0) +
399 xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
400 mp->m_in_maxlevels, 0) +
401 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
402 XFS_FSB_TO_B(mp, 1));
403}
404
405/*
406 * When only changing the inode we log the inode and possibly the superblock.
407 * We also add a bit of slop for the transaction stuff.
408 */
409STATIC uint
410xfs_calc_ichange_reservation(
411 struct xfs_mount *mp)
412{
413 return XFS_DQUOT_LOGRES(mp) +
414 xfs_calc_inode_res(mp, 1) +
415 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
416
417}
418
419/*
420 * Growing the data section of the filesystem.
421 * superblock
422 * agi and agf
423 * allocation btrees
424 */
425STATIC uint
426xfs_calc_growdata_reservation(
427 struct xfs_mount *mp)
428{
429 return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
430 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
431 XFS_FSB_TO_B(mp, 1));
432}
433
434/*
435 * Growing the rt section of the filesystem.
436 * In the first set of transactions (ALLOC) we allocate space to the
437 * bitmap or summary files.
438 * superblock: sector size
439 * agf of the ag from which the extent is allocated: sector size
440 * bmap btree for bitmap/summary inode: max depth * blocksize
441 * bitmap/summary inode: inode size
442 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
443 */
444STATIC uint
445xfs_calc_growrtalloc_reservation(
446 struct xfs_mount *mp)
447{
448 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
449 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
450 XFS_FSB_TO_B(mp, 1)) +
451 xfs_calc_inode_res(mp, 1) +
452 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
453 XFS_FSB_TO_B(mp, 1));
454}
455
456/*
457 * Growing the rt section of the filesystem.
458 * In the second set of transactions (ZERO) we zero the new metadata blocks.
459 * one bitmap/summary block: blocksize
460 */
461STATIC uint
462xfs_calc_growrtzero_reservation(
463 struct xfs_mount *mp)
464{
465 return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
466}
467
468/*
469 * Growing the rt section of the filesystem.
470 * In the third set of transactions (FREE) we update metadata without
471 * allocating any new blocks.
472 * superblock: sector size
473 * bitmap inode: inode size
474 * summary inode: inode size
475 * one bitmap block: blocksize
476 * summary blocks: new summary size
477 */
478STATIC uint
479xfs_calc_growrtfree_reservation(
480 struct xfs_mount *mp)
481{
482 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
483 xfs_calc_inode_res(mp, 2) +
484 xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
485 xfs_calc_buf_res(1, mp->m_rsumsize);
486}
487
488/*
489 * Logging the inode modification timestamp on a synchronous write.
490 * inode
491 */
492STATIC uint
493xfs_calc_swrite_reservation(
494 struct xfs_mount *mp)
495{
496 return xfs_calc_inode_res(mp, 1);
497}
498
499/*
500 * Logging the inode mode bits when writing a setuid/setgid file
501 * inode
502 */
503STATIC uint
504xfs_calc_writeid_reservation(
505 struct xfs_mount *mp)
506{
507 return xfs_calc_inode_res(mp, 1);
508}
509
510/*
511 * Converting the inode from non-attributed to attributed.
512 * the inode being converted: inode size
513 * agf block and superblock (for block allocation)
514 * the new block (directory sized)
515 * bmap blocks for the new directory block
516 * allocation btrees
517 */
518STATIC uint
519xfs_calc_addafork_reservation(
520 struct xfs_mount *mp)
521{
522 return XFS_DQUOT_LOGRES(mp) +
523 xfs_calc_inode_res(mp, 1) +
524 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
525 xfs_calc_buf_res(1, mp->m_dirblksize) +
526 xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
527 XFS_FSB_TO_B(mp, 1)) +
528 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
529 XFS_FSB_TO_B(mp, 1));
530}
531
532/*
533 * Removing the attribute fork of a file
534 * the inode being truncated: inode size
535 * the inode's bmap btree: max depth * block size
536 * And the bmap_finish transaction can free the blocks and bmap blocks:
537 * the agf for each of the ags: 4 * sector size
538 * the agfl for each of the ags: 4 * sector size
539 * the super block to reflect the freed blocks: sector size
540 * worst case split in allocation btrees per extent assuming 4 extents:
541 * 4 exts * 2 trees * (2 * max depth - 1) * block size
542 */
543STATIC uint
544xfs_calc_attrinval_reservation(
545 struct xfs_mount *mp)
546{
547 return MAX((xfs_calc_inode_res(mp, 1) +
548 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
549 XFS_FSB_TO_B(mp, 1))),
550 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
551 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
552 XFS_FSB_TO_B(mp, 1))));
553}
554
555/*
556 * Setting an attribute at mount time.
557 * the inode getting the attribute
558 * the superblock for allocations
559 * the agfs extents are allocated from
560 * the attribute btree * max depth
561 * the inode allocation btree
562 * Since attribute transaction space is dependent on the size of the attribute,
563 * the calculation is done partially at mount time and partially at runtime (see
564 * below).
565 */
566STATIC uint
567xfs_calc_attrsetm_reservation(
568 struct xfs_mount *mp)
569{
570 return XFS_DQUOT_LOGRES(mp) +
571 xfs_calc_inode_res(mp, 1) +
572 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
573 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
574}
575
576/*
577 * Setting an attribute at runtime, transaction space unit per block.
578 * the superblock for allocations: sector size
579 * the inode bmap btree could join or split: max depth * block size
580 * Since the runtime attribute transaction space is dependent on the total
581 * blocks needed for the 1st bmap, here we calculate the space unit for
582 * one block so that the caller can figure out the total space according
583 * to the attribute extent length in blocks by:
584 * ext * M_RES(mp)->tr_attrsetrt.tr_logres
585 */
586STATIC uint
587xfs_calc_attrsetrt_reservation(
588 struct xfs_mount *mp)
589{
590 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
591 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
592 XFS_FSB_TO_B(mp, 1));
593}
594
595/*
596 * Removing an attribute.
597 * the inode: inode size
598 * the attribute btree could join: max depth * block size
599 * the inode bmap btree could join or split: max depth * block size
600 * And the bmap_finish transaction can free the attr blocks freed giving:
601 * the agf for the ag in which the blocks live: 2 * sector size
602 * the agfl for the ag in which the blocks live: 2 * sector size
603 * the superblock for the free block count: sector size
604 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
605 */
606STATIC uint
607xfs_calc_attrrm_reservation(
608 struct xfs_mount *mp)
609{
610 return XFS_DQUOT_LOGRES(mp) +
611 MAX((xfs_calc_inode_res(mp, 1) +
612 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
613 XFS_FSB_TO_B(mp, 1)) +
614 (uint)XFS_FSB_TO_B(mp,
615 XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
616 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
617 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
618 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
619 XFS_FSB_TO_B(mp, 1))));
620}
621
622/*
623 * Clearing a bad agino number in an agi hash bucket.
624 */
625STATIC uint
626xfs_calc_clear_agi_bucket_reservation(
627 struct xfs_mount *mp)
628{
629 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
630}
631
632/*
633 * Clearing the quotaflags in the superblock.
634 * the super block for changing quota flags: sector size
635 */
636STATIC uint
637xfs_calc_qm_sbchange_reservation(
638 struct xfs_mount *mp)
639{
640 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
641}
642
643/*
644 * Adjusting quota limits.
645 * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
646 */
647STATIC uint
648xfs_calc_qm_setqlim_reservation(
649 struct xfs_mount *mp)
650{
651 return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
652}
653
654/*
655 * Allocating quota on disk if needed.
656 * the write transaction log space: M_RES(mp)->tr_write.tr_logres
657 * the unit of quota allocation: one system block size
658 */
659STATIC uint
660xfs_calc_qm_dqalloc_reservation(
661 struct xfs_mount *mp)
662{
663 ASSERT(M_RES(mp)->tr_write.tr_logres);
664 return M_RES(mp)->tr_write.tr_logres +
665 xfs_calc_buf_res(1,
666 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
667}
668
669/*
670 * Turning off quotas.
671 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
672 * the superblock for the quota flags: sector size
673 */
674STATIC uint
675xfs_calc_qm_quotaoff_reservation(
676 struct xfs_mount *mp)
677{
678 return sizeof(struct xfs_qoff_logitem) * 2 +
679 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
680}
681
682/*
683 * End of turning off quotas.
684 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
685 */
686STATIC uint
687xfs_calc_qm_quotaoff_end_reservation(
688 struct xfs_mount *mp)
689{
690 return sizeof(struct xfs_qoff_logitem) * 2;
691}
692
693/*
694 * Syncing the incore super block changes to disk.
695 * the super block to reflect the changes: sector size
696 */
697STATIC uint
698xfs_calc_sb_reservation(
699 struct xfs_mount *mp)
700{
701 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
702}
703
704void
705xfs_trans_resv_calc(
706 struct xfs_mount *mp,
707 struct xfs_trans_resv *resp)
708{
709 /*
710 * The following transactions are logged in physical format and
711 * require a permanent reservation on space.
712 */
713 resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
714 resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
715 resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
716
717 resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
718 resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
719 resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
720
721 resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
722 resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
723 resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
724
725 resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
726 resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
727 resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
728
729 resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
730 resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
731 resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
732
733 resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
734 resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
735 resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
736
737 resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
738 resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
739 resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
740
741 resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
742 resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
743 resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
744
745 resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
746 resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
747 resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
748
749 resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
750 resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
751 resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
752
753 resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
754 resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
755 resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
756
757 resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
758 resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
759 resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
760
761 resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
762 resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
763 resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
764
765 resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
766 resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
767 resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
768
769 resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
770 resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
771 resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
772
773 /*
774 * The following transactions are logged in logical format with
775 * a default log count.
776 */
777 resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
778 resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
779
780 resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
781 resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
782
783 resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
784 resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
785
786 resp->tr_qm_equotaoff.tr_logres =
787 xfs_calc_qm_quotaoff_end_reservation(mp);
788 resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
789
790 resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
791 resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
792
793 /* The following transactions are logged in logical format */
794 resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
795 resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
796 resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
797 resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
798 resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
799 resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
800 resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
801 resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
802 resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
803}
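A minimal sketch of the intended usage, assuming the mount path invokes the calculation as below (the call site itself is outside this patch): the table is filled once per mount and then indexed per transaction type.

	xfs_trans_resv_calc(mp, M_RES(mp));	/* populate mp->m_resv once */

	/* later, when starting e.g. a write transaction: */
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, blocks, rtextents);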
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
new file mode 100644
index 000000000000..de7de9aaad8a
--- /dev/null
+++ b/fs/xfs/xfs_trans_resv.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_TRANS_RESV_H__
19#define __XFS_TRANS_RESV_H__
20
21struct xfs_mount;
22
23/*
24 * structure for maintaining pre-calculated transaction reservations.
25 */
26struct xfs_trans_res {
27 uint tr_logres; /* log space unit in bytes per log ticket */
28 int tr_logcount; /* number of log operations per log ticket */
29 int tr_logflags; /* log flags, currently only used for indicating
30 * a reservation request is permanent or not */
31};
32
33struct xfs_trans_resv {
34 struct xfs_trans_res tr_write; /* extent alloc trans */
35 struct xfs_trans_res tr_itruncate; /* truncate trans */
36 struct xfs_trans_res tr_rename; /* rename trans */
37 struct xfs_trans_res tr_link; /* link trans */
38 struct xfs_trans_res tr_remove; /* unlink trans */
39 struct xfs_trans_res tr_symlink; /* symlink trans */
40 struct xfs_trans_res tr_create; /* create trans */
41 struct xfs_trans_res tr_mkdir; /* mkdir trans */
42 struct xfs_trans_res tr_ifree; /* inode free trans */
43 struct xfs_trans_res tr_ichange; /* inode update trans */
44 struct xfs_trans_res tr_growdata; /* fs data section grow trans */
45 struct xfs_trans_res tr_swrite; /* sync write inode trans */
46 struct xfs_trans_res tr_addafork; /* add inode attr fork trans */
47 struct xfs_trans_res tr_writeid; /* write setuid/setgid file */
48 struct xfs_trans_res tr_attrinval; /* attr fork buffer
49 * invalidation */
50 struct xfs_trans_res tr_attrsetm; /* set/create an attribute at
51 * mount time */
52 struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at
53 * runtime */
54 struct xfs_trans_res tr_attrrm; /* remove an attribute */
55 struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */
56 struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */
57 struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */
58 struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
59 struct xfs_trans_res tr_qm_sbchange; /* change quota flags */
60 struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
61 struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
62 struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */
63 struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */
64 struct xfs_trans_res tr_sb; /* modify superblock */
65 struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
66};
67
68/* shorthand way of accessing reservation structure */
69#define M_RES(mp) (&(mp)->m_resv)
70
71/*
72 * Per-extent log reservation for the allocation btree changes
73 * involved in freeing or allocating an extent.
74 * 2 trees * (2 blocks/level * max depth - 1) * block size
75 */
76#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
77 ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
78#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
79 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
80
81/*
82 * Per-directory log reservation for any directory change.
83 * dir blocks: (1 btree block per level + data block + free block) * dblock size
84 * bmap btree: (levels + 2) * max depth * block size
85 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
86 * size, so account for that in the DAENTER macros.
87 */
88#define XFS_DIROP_LOG_RES(mp) \
89 (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
90 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
91#define XFS_DIROP_LOG_COUNT(mp) \
92 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
93 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
94
95/*
96 * Various log count values.
97 */
98#define XFS_DEFAULT_LOG_COUNT 1
99#define XFS_DEFAULT_PERM_LOG_COUNT 2
100#define XFS_ITRUNCATE_LOG_COUNT 2
101#define XFS_INACTIVE_LOG_COUNT 2
102#define XFS_CREATE_LOG_COUNT 2
103#define XFS_MKDIR_LOG_COUNT 3
104#define XFS_SYMLINK_LOG_COUNT 3
105#define XFS_REMOVE_LOG_COUNT 2
106#define XFS_LINK_LOG_COUNT 2
107#define XFS_RENAME_LOG_COUNT 2
108#define XFS_WRITE_LOG_COUNT 2
109#define XFS_ADDAFORK_LOG_COUNT 2
110#define XFS_ATTRINVAL_LOG_COUNT 1
111#define XFS_ATTRSET_LOG_COUNT 3
112#define XFS_ATTRRM_LOG_COUNT 3
113
114void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
115
116#endif /* __XFS_TRANS_RESV_H__ */
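For orientation only: a minimal user-space mock of how the pre-calculated table is meant to be consumed via M_RES(). The mount structure, the calc routine, and the numeric values below are simplified stand-ins for illustration, not the kernel definitions.

/*
 * Illustrative mock of the M_RES() access pattern: reservations are
 * computed once at mount time and looked up per transaction type,
 * rather than recalculated at every xfs_trans_reserve() call.
 */
#include <stdio.h>

typedef unsigned int uint;

struct xfs_trans_res {
	uint	tr_logres;
	int	tr_logcount;
	int	tr_logflags;
};

struct xfs_trans_resv {
	struct xfs_trans_res	tr_create;
	/* ... remaining reservations elided ... */
};

struct xfs_mount {
	struct xfs_trans_resv	m_resv;
};

#define M_RES(mp)	(&(mp)->m_resv)

/* stand-in for xfs_trans_resv_calc(): fill in one entry (values assumed) */
static void resv_calc(struct xfs_mount *mp)
{
	M_RES(mp)->tr_create.tr_logres = 73728;
	M_RES(mp)->tr_create.tr_logcount = 2;	/* cf. XFS_CREATE_LOG_COUNT */
}

int main(void)
{
	struct xfs_mount mp = { 0 };

	resv_calc(&mp);		/* done once at mount time */
	printf("create: %u bytes x %d ops\n",
	       M_RES(&mp)->tr_create.tr_logres,
	       M_RES(&mp)->tr_create.tr_logcount);
	return 0;
}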
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 61ba1cfa974c..82bbc34d54a3 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -18,42 +18,7 @@
 #ifndef __XFS_TYPES_H__
 #define __XFS_TYPES_H__
 
-#ifdef __KERNEL__
-
-/*
- * Additional type declarations for XFS
- */
-typedef signed char		__int8_t;
-typedef unsigned char		__uint8_t;
-typedef signed short int	__int16_t;
-typedef unsigned short int	__uint16_t;
-typedef signed int		__int32_t;
-typedef unsigned int		__uint32_t;
-typedef signed long long int	__int64_t;
-typedef unsigned long long int	__uint64_t;
-
-typedef __uint32_t	prid_t;		/* project ID */
-typedef __uint32_t	inst_t;		/* an instruction */
-
-typedef __s64		xfs_off_t;	/* <file offset> type */
-typedef unsigned long long	xfs_ino_t;	/* <inode> type */
-typedef __s64		xfs_daddr_t;	/* <disk address> type */
-typedef char *		xfs_caddr_t;	/* <core address> type */
-typedef __u32		xfs_dev_t;
-typedef __u32		xfs_nlink_t;
-
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
-#endif /* __KERNEL__ */
-
+typedef __uint32_t	prid_t;		/* project ID */
+
 typedef __uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
 typedef __uint32_t	xfs_agino_t;	/* inode # within allocation grp */
@@ -146,6 +111,12 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
 #define XFS_MAX_SECTORSIZE	(1 << XFS_MAX_SECTORSIZE_LOG)
 
 /*
+ * Inode fork identifiers.
+ */
+#define	XFS_DATA_FORK	0
+#define	XFS_ATTR_FORK	1
+
+/*
  * Min numbers of data/attr fork btree root pointers.
  */
 #define	MINDBTPTRS	3
@@ -169,6 +140,23 @@ typedef enum {
 struct xfs_name {
 	const unsigned char	*name;
 	int			len;
+	int			type;
 };
 
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits..
+ */
+typedef __uint32_t	xfs_dqid_t;
+
+/*
+ * Constants for bit manipulations.
+ */
+#define	XFS_NBBYLOG	3		/* log2(NBBY) */
+#define	XFS_WORDLOG	2		/* log2(sizeof(xfs_rtword_t)) */
+#define	XFS_NBWORDLOG	(XFS_NBBYLOG + XFS_WORDLOG)
+#define	XFS_NBWORD	(1 << XFS_NBWORDLOG)
+#define	XFS_WORDMASK	((1 << XFS_WORDLOG) - 1)
+
+
 #endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
deleted file mode 100644
index 0025c78ac03c..000000000000
--- a/fs/xfs/xfs_utils.c
+++ /dev/null
@@ -1,314 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_trans.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_dir2.h"
26#include "xfs_mount.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_dinode.h"
29#include "xfs_inode.h"
30#include "xfs_inode_item.h"
31#include "xfs_bmap.h"
32#include "xfs_error.h"
33#include "xfs_quota.h"
34#include "xfs_itable.h"
35#include "xfs_utils.h"
36
37
38/*
39 * Allocates a new inode from disk and returns a pointer to the
40 * incore copy. This routine will internally commit the current
41 * transaction and allocate a new one if the Space Manager needed
42 * to do an allocation to replenish the inode free-list.
43 *
44 * This routine is designed to be called from xfs_create and
45 * xfs_create_dir.
46 *
47 */
48int
49xfs_dir_ialloc(
50 xfs_trans_t **tpp, /* input: current transaction;
51 output: may be a new transaction. */
52	xfs_inode_t	*dp,	/* directory within which to allocate
53					   the inode. */
54 umode_t mode,
55 xfs_nlink_t nlink,
56 xfs_dev_t rdev,
57 prid_t prid, /* project id */
58 int okalloc, /* ok to allocate new space */
59 xfs_inode_t **ipp, /* pointer to inode; it will be
60 locked. */
61 int *committed)
62
63{
64 xfs_trans_t *tp;
65 xfs_trans_t *ntp;
66 xfs_inode_t *ip;
67 xfs_buf_t *ialloc_context = NULL;
68 int code;
69 uint log_res;
70 uint log_count;
71 void *dqinfo;
72 uint tflags;
73
74 tp = *tpp;
75 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
76
77 /*
78 * xfs_ialloc will return a pointer to an incore inode if
79 * the Space Manager has an available inode on the free
80 * list. Otherwise, it will do an allocation and replenish
81 * the freelist. Since we can only do one allocation per
82 * transaction without deadlocks, we will need to commit the
83 * current transaction and start a new one. We will then
84 * need to call xfs_ialloc again to get the inode.
85 *
86 * If xfs_ialloc did an allocation to replenish the freelist,
87 * it returns the bp containing the head of the freelist as
88 * ialloc_context. We will hold a lock on it across the
89 * transaction commit so that no other process can steal
90 * the inode(s) that we've just allocated.
91 */
92 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
93 &ialloc_context, &ip);
94
95 /*
96 * Return an error if we were unable to allocate a new inode.
97 * This should only happen if we run out of space on disk or
98 * encounter a disk error.
99 */
100 if (code) {
101 *ipp = NULL;
102 return code;
103 }
104 if (!ialloc_context && !ip) {
105 *ipp = NULL;
106 return XFS_ERROR(ENOSPC);
107 }
108
109 /*
110 * If the AGI buffer is non-NULL, then we were unable to get an
111 * inode in one operation. We need to commit the current
112 * transaction and call xfs_ialloc() again. It is guaranteed
113 * to succeed the second time.
114 */
115 if (ialloc_context) {
116 /*
117 * Normally, xfs_trans_commit releases all the locks.
118 * We call bhold to hang on to the ialloc_context across
119 * the commit. Holding this buffer prevents any other
120 * processes from doing any allocations in this
121 * allocation group.
122 */
123 xfs_trans_bhold(tp, ialloc_context);
124 /*
125		 * Save the log reservation and count so we can
126		 * use them in the next transaction.
127 */
128 log_res = xfs_trans_get_log_res(tp);
129 log_count = xfs_trans_get_log_count(tp);
130
131 /*
132 * We want the quota changes to be associated with the next
133 * transaction, NOT this one. So, detach the dqinfo from this
134 * and attach it to the next transaction.
135 */
136 dqinfo = NULL;
137 tflags = 0;
138 if (tp->t_dqinfo) {
139 dqinfo = (void *)tp->t_dqinfo;
140 tp->t_dqinfo = NULL;
141 tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
142 tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
143 }
144
145 ntp = xfs_trans_dup(tp);
146 code = xfs_trans_commit(tp, 0);
147 tp = ntp;
148 if (committed != NULL) {
149 *committed = 1;
150 }
151 /*
152 * If we get an error during the commit processing,
153 * release the buffer that is still held and return
154 * to the caller.
155 */
156 if (code) {
157 xfs_buf_relse(ialloc_context);
158 if (dqinfo) {
159 tp->t_dqinfo = dqinfo;
160 xfs_trans_free_dqinfo(tp);
161 }
162 *tpp = ntp;
163 *ipp = NULL;
164 return code;
165 }
166
167 /*
168 * transaction commit worked ok so we can drop the extra ticket
169 * reference that we gained in xfs_trans_dup()
170 */
171 xfs_log_ticket_put(tp->t_ticket);
172 code = xfs_trans_reserve(tp, 0, log_res, 0,
173 XFS_TRANS_PERM_LOG_RES, log_count);
174 /*
175 * Re-attach the quota info that we detached from prev trx.
176 */
177 if (dqinfo) {
178 tp->t_dqinfo = dqinfo;
179 tp->t_flags |= tflags;
180 }
181
182 if (code) {
183 xfs_buf_relse(ialloc_context);
184 *tpp = ntp;
185 *ipp = NULL;
186 return code;
187 }
188 xfs_trans_bjoin(tp, ialloc_context);
189
190 /*
191 * Call ialloc again. Since we've locked out all
192 * other allocations in this allocation group,
193 * this call should always succeed.
194 */
195 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
196 okalloc, &ialloc_context, &ip);
197
198 /*
199 * If we get an error at this point, return to the caller
200 * so that the current transaction can be aborted.
201 */
202 if (code) {
203 *tpp = tp;
204 *ipp = NULL;
205 return code;
206 }
207 ASSERT(!ialloc_context && ip);
208
209 } else {
210 if (committed != NULL)
211 *committed = 0;
212 }
213
214 *ipp = ip;
215 *tpp = tp;
216
217 return 0;
218}
219
220/*
221 * Decrement the link count on an inode & log the change.
222 * If this causes the link count to go to zero, initiate the
223 * logging activity required to truncate a file.
224 */
225int /* error */
226xfs_droplink(
227 xfs_trans_t *tp,
228 xfs_inode_t *ip)
229{
230 int error;
231
232 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
233
234 ASSERT (ip->i_d.di_nlink > 0);
235 ip->i_d.di_nlink--;
236 drop_nlink(VFS_I(ip));
237 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
238
239 error = 0;
240 if (ip->i_d.di_nlink == 0) {
241 /*
242 * We're dropping the last link to this file.
243 * Move the on-disk inode to the AGI unlinked list.
244 * From xfs_inactive() we will pull the inode from
245 * the list and free it.
246 */
247 error = xfs_iunlink(tp, ip);
248 }
249 return error;
250}
251
252/*
253 * This gets called when the inode's version needs to be changed from 1 to 2.
254 * Currently this happens when the nlink field overflows the old 16-bit value
255 * or when chproj is called to change the project for the first time.
256 * As a side effect the superblock version will also get rev'd
257 * to contain the NLINK bit.
258 */
259void
260xfs_bump_ino_vers2(
261 xfs_trans_t *tp,
262 xfs_inode_t *ip)
263{
264 xfs_mount_t *mp;
265
266 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
267 ASSERT(ip->i_d.di_version == 1);
268
269 ip->i_d.di_version = 2;
270 ip->i_d.di_onlink = 0;
271 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
272 mp = tp->t_mountp;
273 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
274 spin_lock(&mp->m_sb_lock);
275 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
276 xfs_sb_version_addnlink(&mp->m_sb);
277 spin_unlock(&mp->m_sb_lock);
278 xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
279 } else {
280 spin_unlock(&mp->m_sb_lock);
281 }
282 }
283 /* Caller must log the inode */
284}
285
286/*
287 * Increment the link count on an inode & log the change.
288 */
289int
290xfs_bumplink(
291 xfs_trans_t *tp,
292 xfs_inode_t *ip)
293{
294 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
295
296 ASSERT(ip->i_d.di_nlink > 0);
297 ip->i_d.di_nlink++;
298 inc_nlink(VFS_I(ip));
299 if ((ip->i_d.di_version == 1) &&
300 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
301 /*
302 * The inode has increased its number of links beyond
303 * what can fit in an old format inode. It now needs
304 * to be converted to a version 2 inode with a 32 bit
305 * link count. If this is the first inode in the file
306 * system to do this, then we need to bump the superblock
307 * version number as well.
308 */
309 xfs_bump_ino_vers2(tp, ip);
310 }
311
312 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
313 return 0;
314}
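Context for the version bump above: a v1 inode stores its link count in a 16-bit on-disk field, so XFS_MAXLINK_1 is presumably the 16-bit maximum of 65535 (stated here as an assumption). A tiny standalone demonstration of the overflow the conversion guards against:

#include <stdio.h>

int main(void)
{
	unsigned short onlink = 65535;	/* v1 16-bit link count at its max */

	onlink++;	/* wraps to 0 - the overflow xfs_bumplink() avoids */
	printf("after one more link: %u\n", onlink);
	return 0;
}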
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
deleted file mode 100644
index 5eeab4690cfe..000000000000
--- a/fs/xfs/xfs_utils.h
+++ /dev/null
@@ -1,27 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_UTILS_H__
19#define __XFS_UTILS_H__
20
21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t,
22 xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
23extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
24extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
25extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
26
27#endif /* __XFS_UTILS_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
deleted file mode 100644
index dc730ac272be..000000000000
--- a/fs/xfs/xfs_vnodeops.c
+++ /dev/null
@@ -1,1870 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * Copyright (c) 2012 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19
20#include "xfs.h"
21#include "xfs_fs.h"
22#include "xfs_types.h"
23#include "xfs_bit.h"
24#include "xfs_log.h"
25#include "xfs_trans.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h"
30#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dinode.h"
34#include "xfs_inode.h"
35#include "xfs_inode_item.h"
36#include "xfs_itable.h"
37#include "xfs_ialloc.h"
38#include "xfs_alloc.h"
39#include "xfs_bmap.h"
40#include "xfs_acl.h"
41#include "xfs_attr.h"
42#include "xfs_error.h"
43#include "xfs_quota.h"
44#include "xfs_utils.h"
45#include "xfs_rtalloc.h"
46#include "xfs_trans_space.h"
47#include "xfs_log_priv.h"
48#include "xfs_filestream.h"
49#include "xfs_vnodeops.h"
50#include "xfs_trace.h"
51#include "xfs_icache.h"
52#include "xfs_symlink.h"
53
54
55/*
56 * This is called by xfs_inactive to free any blocks beyond eof
57 * when the link count isn't zero and by xfs_dm_punch_hole() when
58 * punching a hole to EOF.
59 */
60int
61xfs_free_eofblocks(
62 xfs_mount_t *mp,
63 xfs_inode_t *ip,
64 bool need_iolock)
65{
66 xfs_trans_t *tp;
67 int error;
68 xfs_fileoff_t end_fsb;
69 xfs_fileoff_t last_fsb;
70 xfs_filblks_t map_len;
71 int nimaps;
72 xfs_bmbt_irec_t imap;
73
74 /*
75 * Figure out if there are any blocks beyond the end
76 * of the file. If not, then there is nothing to do.
77 */
78 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
79 last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
80 if (last_fsb <= end_fsb)
81 return 0;
82 map_len = last_fsb - end_fsb;
83
84 nimaps = 1;
85 xfs_ilock(ip, XFS_ILOCK_SHARED);
86 error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
87 xfs_iunlock(ip, XFS_ILOCK_SHARED);
88
89 if (!error && (nimaps != 0) &&
90 (imap.br_startblock != HOLESTARTBLOCK ||
91 ip->i_delayed_blks)) {
92 /*
93 * Attach the dquots to the inode up front.
94 */
95 error = xfs_qm_dqattach(ip, 0);
96 if (error)
97 return error;
98
99 /*
100 * There are blocks after the end of file.
101 * Free them up now by truncating the file to
102 * its current size.
103 */
104 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
105
106 if (need_iolock) {
107 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
108 xfs_trans_cancel(tp, 0);
109 return EAGAIN;
110 }
111 }
112
113 error = xfs_trans_reserve(tp, 0,
114 XFS_ITRUNCATE_LOG_RES(mp),
115 0, XFS_TRANS_PERM_LOG_RES,
116 XFS_ITRUNCATE_LOG_COUNT);
117 if (error) {
118 ASSERT(XFS_FORCED_SHUTDOWN(mp));
119 xfs_trans_cancel(tp, 0);
120 if (need_iolock)
121 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
122 return error;
123 }
124
125 xfs_ilock(ip, XFS_ILOCK_EXCL);
126 xfs_trans_ijoin(tp, ip, 0);
127
128 /*
129 * Do not update the on-disk file size. If we update the
130 * on-disk file size and then the system crashes before the
131 * contents of the file are flushed to disk then the files
132 * may be full of holes (ie NULL files bug).
133 */
134 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
135 XFS_ISIZE(ip));
136 if (error) {
137 /*
138 * If we get an error at this point we simply don't
139 * bother truncating the file.
140 */
141 xfs_trans_cancel(tp,
142 (XFS_TRANS_RELEASE_LOG_RES |
143 XFS_TRANS_ABORT));
144 } else {
145 error = xfs_trans_commit(tp,
146 XFS_TRANS_RELEASE_LOG_RES);
147 if (!error)
148 xfs_inode_clear_eofblocks_tag(ip);
149 }
150
151 xfs_iunlock(ip, XFS_ILOCK_EXCL);
152 if (need_iolock)
153 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
154 }
155 return error;
156}
157
158int
159xfs_release(
160 xfs_inode_t *ip)
161{
162 xfs_mount_t *mp = ip->i_mount;
163 int error;
164
165 if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
166 return 0;
167
168 /* If this is a read-only mount, don't do this (would generate I/O) */
169 if (mp->m_flags & XFS_MOUNT_RDONLY)
170 return 0;
171
172 if (!XFS_FORCED_SHUTDOWN(mp)) {
173 int truncated;
174
175 /*
176 * If we are using filestreams, and we have an unlinked
177 * file that we are processing the last close on, then nothing
178 * will be able to reopen and write to this file. Purge this
179 * inode from the filestreams cache so that it doesn't delay
180 * teardown of the inode.
181 */
182 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
183 xfs_filestream_deassociate(ip);
184
185 /*
186 * If we previously truncated this file and removed old data
187 * in the process, we want to initiate "early" writeout on
188 * the last close. This is an attempt to combat the notorious
189 * NULL files problem which is particularly noticeable from a
190 * truncate down, buffered (re-)write (delalloc), followed by
191 * a crash. What we are effectively doing here is
192 * significantly reducing the time window where we'd otherwise
193 * be exposed to that problem.
194 */
195 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
196 if (truncated) {
197 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
198 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
199 error = -filemap_flush(VFS_I(ip)->i_mapping);
200 if (error)
201 return error;
202 }
203 }
204 }
205
206 if (ip->i_d.di_nlink == 0)
207 return 0;
208
209 if (xfs_can_free_eofblocks(ip, false)) {
210
211 /*
212 * If we can't get the iolock just skip truncating the blocks
213 * past EOF because we could deadlock with the mmap_sem
214 * otherwise. We'll get another chance to drop them once the
215 * last reference to the inode is dropped, so we'll never leak
216 * blocks permanently.
217 *
218		 * Further, if the inode is being opened, written and
219 * closed frequently and we have delayed allocation blocks
220 * outstanding (e.g. streaming writes from the NFS server),
221 * truncating the blocks past EOF will cause fragmentation to
222 * occur.
223 *
224 * In this case don't do the truncation, either, but we have to
225 * be careful how we detect this case. Blocks beyond EOF show
226 * up as i_delayed_blks even when the inode is clean, so we
227 * need to truncate them away first before checking for a dirty
228 * release. Hence on the first dirty close we will still remove
229 * the speculative allocation, but after that we will leave it
230 * in place.
231 */
232 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
233 return 0;
234
235 error = xfs_free_eofblocks(mp, ip, true);
236 if (error && error != EAGAIN)
237 return error;
238
239 /* delalloc blocks after truncation means it really is dirty */
240 if (ip->i_delayed_blks)
241 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
242 }
243 return 0;
244}
245
246/*
247 * xfs_inactive
248 *
249 * This is called when the reference count for the vnode
250 * goes to zero. If the file has been unlinked, then it must
251 * now be truncated. Also, we clear all of the read-ahead state
252 * kept for the inode here since the file is now closed.
253 */
254int
255xfs_inactive(
256 xfs_inode_t *ip)
257{
258 xfs_bmap_free_t free_list;
259 xfs_fsblock_t first_block;
260 int committed;
261 xfs_trans_t *tp;
262 xfs_mount_t *mp;
263 int error;
264 int truncate = 0;
265
266 /*
267 * If the inode is already free, then there can be nothing
268 * to clean up here.
269 */
270 if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
271 ASSERT(ip->i_df.if_real_bytes == 0);
272 ASSERT(ip->i_df.if_broot_bytes == 0);
273 return VN_INACTIVE_CACHE;
274 }
275
276 mp = ip->i_mount;
277
278 error = 0;
279
280 /* If this is a read-only mount, don't do this (would generate I/O) */
281 if (mp->m_flags & XFS_MOUNT_RDONLY)
282 goto out;
283
284 if (ip->i_d.di_nlink != 0) {
285 /*
286 * force is true because we are evicting an inode from the
287 * cache. Post-eof blocks must be freed, lest we end up with
288 * broken free space accounting.
289 */
290 if (xfs_can_free_eofblocks(ip, true)) {
291 error = xfs_free_eofblocks(mp, ip, false);
292 if (error)
293 return VN_INACTIVE_CACHE;
294 }
295 goto out;
296 }
297
298 if (S_ISREG(ip->i_d.di_mode) &&
299 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
300 ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
301 truncate = 1;
302
303 error = xfs_qm_dqattach(ip, 0);
304 if (error)
305 return VN_INACTIVE_CACHE;
306
307 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
308 error = xfs_trans_reserve(tp, 0,
309 (truncate || S_ISLNK(ip->i_d.di_mode)) ?
310 XFS_ITRUNCATE_LOG_RES(mp) :
311 XFS_IFREE_LOG_RES(mp),
312 0,
313 XFS_TRANS_PERM_LOG_RES,
314 XFS_ITRUNCATE_LOG_COUNT);
315 if (error) {
316 ASSERT(XFS_FORCED_SHUTDOWN(mp));
317 xfs_trans_cancel(tp, 0);
318 return VN_INACTIVE_CACHE;
319 }
320
321 xfs_ilock(ip, XFS_ILOCK_EXCL);
322 xfs_trans_ijoin(tp, ip, 0);
323
324 if (S_ISLNK(ip->i_d.di_mode)) {
325 error = xfs_inactive_symlink(ip, &tp);
326 if (error)
327 goto out_cancel;
328 } else if (truncate) {
329 ip->i_d.di_size = 0;
330 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
331
332 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
333 if (error)
334 goto out_cancel;
335
336 ASSERT(ip->i_d.di_nextents == 0);
337 }
338
339 /*
340 * If there are attributes associated with the file then blow them away
341 * now. The code calls a routine that recursively deconstructs the
342 * attribute fork. We need to just commit the current transaction
343 * because we can't use it for xfs_attr_inactive().
344 */
345 if (ip->i_d.di_anextents > 0) {
346 ASSERT(ip->i_d.di_forkoff != 0);
347
348 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
349 if (error)
350 goto out_unlock;
351
352 xfs_iunlock(ip, XFS_ILOCK_EXCL);
353
354 error = xfs_attr_inactive(ip);
355 if (error)
356 goto out;
357
358 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
359 error = xfs_trans_reserve(tp, 0,
360 XFS_IFREE_LOG_RES(mp),
361 0, XFS_TRANS_PERM_LOG_RES,
362 XFS_INACTIVE_LOG_COUNT);
363 if (error) {
364 xfs_trans_cancel(tp, 0);
365 goto out;
366 }
367
368 xfs_ilock(ip, XFS_ILOCK_EXCL);
369 xfs_trans_ijoin(tp, ip, 0);
370 }
371
372 if (ip->i_afp)
373 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
374
375 ASSERT(ip->i_d.di_anextents == 0);
376
377 /*
378 * Free the inode.
379 */
380 xfs_bmap_init(&free_list, &first_block);
381 error = xfs_ifree(tp, ip, &free_list);
382 if (error) {
383 /*
384 * If we fail to free the inode, shut down. The cancel
385 * might do that, we need to make sure. Otherwise the
386 * inode might be lost for a long time or forever.
387 */
388 if (!XFS_FORCED_SHUTDOWN(mp)) {
389 xfs_notice(mp, "%s: xfs_ifree returned error %d",
390 __func__, error);
391 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
392 }
393 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
394 } else {
395 /*
396 * Credit the quota account(s). The inode is gone.
397 */
398 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
399
400 /*
401 * Just ignore errors at this point. There is nothing we can
402 * do except to try to keep going. Make sure it's not a silent
403 * error.
404 */
405 error = xfs_bmap_finish(&tp, &free_list, &committed);
406 if (error)
407 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
408 __func__, error);
409 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
410 if (error)
411 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
412 __func__, error);
413 }
414
415 /*
416 * Release the dquots held by inode, if any.
417 */
418 xfs_qm_dqdetach(ip);
419out_unlock:
420 xfs_iunlock(ip, XFS_ILOCK_EXCL);
421out:
422 return VN_INACTIVE_CACHE;
423out_cancel:
424 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
425 goto out_unlock;
426}
427
428/*
429 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
430 * is allowed, otherwise it has to be an exact match. If a CI match is found,
431 * ci_name->name will point to the actual name (caller must free) or
432 * will be set to NULL if an exact match is found.
433 */
434int
435xfs_lookup(
436 xfs_inode_t *dp,
437 struct xfs_name *name,
438 xfs_inode_t **ipp,
439 struct xfs_name *ci_name)
440{
441 xfs_ino_t inum;
442 int error;
443 uint lock_mode;
444
445 trace_xfs_lookup(dp, name);
446
447 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
448 return XFS_ERROR(EIO);
449
450 lock_mode = xfs_ilock_map_shared(dp);
451 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
452 xfs_iunlock_map_shared(dp, lock_mode);
453
454 if (error)
455 goto out;
456
457 error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
458 if (error)
459 goto out_free_name;
460
461 return 0;
462
463out_free_name:
464 if (ci_name)
465 kmem_free(ci_name->name);
466out:
467 *ipp = NULL;
468 return error;
469}
470
471int
472xfs_create(
473 xfs_inode_t *dp,
474 struct xfs_name *name,
475 umode_t mode,
476 xfs_dev_t rdev,
477 xfs_inode_t **ipp)
478{
479 int is_dir = S_ISDIR(mode);
480 struct xfs_mount *mp = dp->i_mount;
481 struct xfs_inode *ip = NULL;
482 struct xfs_trans *tp = NULL;
483 int error;
484 xfs_bmap_free_t free_list;
485 xfs_fsblock_t first_block;
486 bool unlock_dp_on_error = false;
487 uint cancel_flags;
488 int committed;
489 prid_t prid;
490 struct xfs_dquot *udqp = NULL;
491 struct xfs_dquot *gdqp = NULL;
492 struct xfs_dquot *pdqp = NULL;
493 uint resblks;
494 uint log_res;
495 uint log_count;
496
497 trace_xfs_create(dp, name);
498
499 if (XFS_FORCED_SHUTDOWN(mp))
500 return XFS_ERROR(EIO);
501
502 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
503 prid = xfs_get_projid(dp);
504 else
505 prid = XFS_PROJID_DEFAULT;
506
507 /*
508 * Make sure that we have allocated dquot(s) on disk.
509 */
510 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
511 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
512 &udqp, &gdqp, &pdqp);
513 if (error)
514 return error;
515
516 if (is_dir) {
517 rdev = 0;
518 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
519 log_res = XFS_MKDIR_LOG_RES(mp);
520 log_count = XFS_MKDIR_LOG_COUNT;
521 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
522 } else {
523 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
524 log_res = XFS_CREATE_LOG_RES(mp);
525 log_count = XFS_CREATE_LOG_COUNT;
526 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
527 }
528
529 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
530
531 /*
532 * Initially assume that the file does not exist and
533 * reserve the resources for that case. If that is not
534 * the case we'll drop the one we have and get a more
535 * appropriate transaction later.
536 */
537 error = xfs_trans_reserve(tp, resblks, log_res, 0,
538 XFS_TRANS_PERM_LOG_RES, log_count);
539 if (error == ENOSPC) {
540 /* flush outstanding delalloc blocks and retry */
541 xfs_flush_inodes(mp);
542 error = xfs_trans_reserve(tp, resblks, log_res, 0,
543 XFS_TRANS_PERM_LOG_RES, log_count);
544 }
545 if (error == ENOSPC) {
546 /* No space at all so try a "no-allocation" reservation */
547 resblks = 0;
548 error = xfs_trans_reserve(tp, 0, log_res, 0,
549 XFS_TRANS_PERM_LOG_RES, log_count);
550 }
551 if (error) {
552 cancel_flags = 0;
553 goto out_trans_cancel;
554 }
555
556 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
557 unlock_dp_on_error = true;
558
559 xfs_bmap_init(&free_list, &first_block);
560
561 /*
562 * Reserve disk quota and the inode.
563 */
564 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
565 pdqp, resblks, 1, 0);
566 if (error)
567 goto out_trans_cancel;
568
569 error = xfs_dir_canenter(tp, dp, name, resblks);
570 if (error)
571 goto out_trans_cancel;
572
573 /*
574 * A newly created regular or special file just has one directory
575	 * entry pointing to it, but a directory also has the "." entry
576 * pointing to itself.
577 */
578 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
579 prid, resblks > 0, &ip, &committed);
580 if (error) {
581 if (error == ENOSPC)
582 goto out_trans_cancel;
583 goto out_trans_abort;
584 }
585
586 /*
587 * Now we join the directory inode to the transaction. We do not do it
588 * earlier because xfs_dir_ialloc might commit the previous transaction
589 * (and release all the locks). An error from here on will result in
590 * the transaction cancel unlocking dp so don't do it explicitly in the
591 * error path.
592 */
593 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
594 unlock_dp_on_error = false;
595
596 error = xfs_dir_createname(tp, dp, name, ip->i_ino,
597 &first_block, &free_list, resblks ?
598 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
599 if (error) {
600 ASSERT(error != ENOSPC);
601 goto out_trans_abort;
602 }
603 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
604 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
605
606 if (is_dir) {
607 error = xfs_dir_init(tp, ip, dp);
608 if (error)
609 goto out_bmap_cancel;
610
611 error = xfs_bumplink(tp, dp);
612 if (error)
613 goto out_bmap_cancel;
614 }
615
616 /*
617 * If this is a synchronous mount, make sure that the
618 * create transaction goes to disk before returning to
619 * the user.
620 */
621 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
622 xfs_trans_set_sync(tp);
623
624 /*
625 * Attach the dquot(s) to the inodes and modify them incore.
626	 * The ids of the inode couldn't have changed since the new
627 * inode has been locked ever since it was created.
628 */
629 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
630
631 error = xfs_bmap_finish(&tp, &free_list, &committed);
632 if (error)
633 goto out_bmap_cancel;
634
635 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
636 if (error)
637 goto out_release_inode;
638
639 xfs_qm_dqrele(udqp);
640 xfs_qm_dqrele(gdqp);
641 xfs_qm_dqrele(pdqp);
642
643 *ipp = ip;
644 return 0;
645
646 out_bmap_cancel:
647 xfs_bmap_cancel(&free_list);
648 out_trans_abort:
649 cancel_flags |= XFS_TRANS_ABORT;
650 out_trans_cancel:
651 xfs_trans_cancel(tp, cancel_flags);
652 out_release_inode:
653 /*
654 * Wait until after the current transaction is aborted to
655 * release the inode. This prevents recursive transactions
656 * and deadlocks from xfs_inactive.
657 */
658 if (ip)
659 IRELE(ip);
660
661 xfs_qm_dqrele(udqp);
662 xfs_qm_dqrele(gdqp);
663 xfs_qm_dqrele(pdqp);
664
665 if (unlock_dp_on_error)
666 xfs_iunlock(dp, XFS_ILOCK_EXCL);
667 return error;
668}
669
670#ifdef DEBUG
671int xfs_locked_n;
672int xfs_small_retries;
673int xfs_middle_retries;
674int xfs_lots_retries;
675int xfs_lock_delays;
676#endif
677
678/*
679 * Bump the subclass so xfs_lock_inodes() acquires each lock with
680 * a different value
681 */
682static inline int
683xfs_lock_inumorder(int lock_mode, int subclass)
684{
685 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
686 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
687 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
688 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
689
690 return lock_mode;
691}
692
693/*
694 * The following routine will lock n inodes in exclusive mode.
695 * We assume the caller calls us with the inodes in i_ino order.
696 *
697 * We need to detect deadlock where an inode that we lock
698 * is in the AIL and we start waiting for another inode that is locked
699 * by a thread in a long running transaction (such as truncate). This can
700 * result in deadlock since the long running trans might need to wait
701 * for the inode we just locked in order to push the tail and free space
702 * in the log.
703 */
704void
705xfs_lock_inodes(
706 xfs_inode_t **ips,
707 int inodes,
708 uint lock_mode)
709{
710 int attempts = 0, i, j, try_lock;
711 xfs_log_item_t *lp;
712
713 ASSERT(ips && (inodes >= 2)); /* we need at least two */
714
715 try_lock = 0;
716 i = 0;
717
718again:
719 for (; i < inodes; i++) {
720 ASSERT(ips[i]);
721
722 if (i && (ips[i] == ips[i-1])) /* Already locked */
723 continue;
724
725 /*
726 * If try_lock is not set yet, make sure all locked inodes
727 * are not in the AIL.
728 * If any are, set try_lock to be used later.
729 */
730
731 if (!try_lock) {
732 for (j = (i - 1); j >= 0 && !try_lock; j--) {
733 lp = (xfs_log_item_t *)ips[j]->i_itemp;
734 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
735 try_lock++;
736 }
737 }
738 }
739
740 /*
741 * If any of the previous locks we have locked is in the AIL,
742 * we must TRY to get the second and subsequent locks. If
743 * we can't get any, we must release all we have
744 * and try again.
745 */
746
747 if (try_lock) {
748 /* try_lock must be 0 if i is 0. */
749 /*
750 * try_lock means we have an inode locked
751 * that is in the AIL.
752 */
753 ASSERT(i != 0);
754 if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
755 attempts++;
756
757 /*
758 * Unlock all previous guys and try again.
759 * xfs_iunlock will try to push the tail
760 * if the inode is in the AIL.
761 */
762
763 for(j = i - 1; j >= 0; j--) {
764
765 /*
766 * Check to see if we've already
767 * unlocked this one.
768 * Not the first one going back,
769 * and the inode ptr is the same.
770 */
771 if ((j != (i - 1)) && ips[j] ==
772 ips[j+1])
773 continue;
774
775 xfs_iunlock(ips[j], lock_mode);
776 }
777
778 if ((attempts % 5) == 0) {
779 delay(1); /* Don't just spin the CPU */
780#ifdef DEBUG
781 xfs_lock_delays++;
782#endif
783 }
784 i = 0;
785 try_lock = 0;
786 goto again;
787 }
788 } else {
789 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
790 }
791 }
792
793#ifdef DEBUG
794 if (attempts) {
795 if (attempts < 5) xfs_small_retries++;
796 else if (attempts < 100) xfs_middle_retries++;
797 else xfs_lots_retries++;
798 } else {
799 xfs_locked_n++;
800 }
801#endif
802}
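The routine above relies on a caller-side contract: inodes arrive sorted by i_ino, with any duplicates adjacent so they are locked once. A minimal sketch of that ordering step, using a mocked inode type rather than the kernel's xfs_inode:

/*
 * Sketch of the caller contract for xfs_lock_inodes(): sort the
 * pointer array by inode number before handing it in.  A simple
 * insertion sort is enough for the handful of inodes rename needs.
 */
#include <stdio.h>

struct inode_mock { unsigned long long i_ino; };

static void sort_by_ino(struct inode_mock **ips, int n)
{
	int i, j;

	for (i = 1; i < n; i++)
		for (j = i; j > 0 && ips[j-1]->i_ino > ips[j]->i_ino; j--) {
			struct inode_mock *tmp = ips[j];
			ips[j] = ips[j-1];
			ips[j-1] = tmp;
		}
}

int main(void)
{
	struct inode_mock a = { 42 }, b = { 7 }, c = { 99 };
	struct inode_mock *ips[] = { &a, &b, &c };
	int i;

	sort_by_ino(ips, 3);	/* then: xfs_lock_inodes(ips, 3, lock_mode) */
	for (i = 0; i < 3; i++)
		printf("%llu\n", ips[i]->i_ino);
	return 0;
}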
803
804/*
805 * xfs_lock_two_inodes() can only be used to lock one type of lock
806 * at a time - the iolock or the ilock, but not both at once. If
807 * we lock both at once, lockdep will report false positives saying
808 * we have violated locking orders.
809 */
810void
811xfs_lock_two_inodes(
812 xfs_inode_t *ip0,
813 xfs_inode_t *ip1,
814 uint lock_mode)
815{
816 xfs_inode_t *temp;
817 int attempts = 0;
818 xfs_log_item_t *lp;
819
820 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
821 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
822 ASSERT(ip0->i_ino != ip1->i_ino);
823
824 if (ip0->i_ino > ip1->i_ino) {
825 temp = ip0;
826 ip0 = ip1;
827 ip1 = temp;
828 }
829
830 again:
831 xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
832
833 /*
834 * If the first lock we have locked is in the AIL, we must TRY to get
835 * the second lock. If we can't get it, we must release the first one
836 * and try again.
837 */
838 lp = (xfs_log_item_t *)ip0->i_itemp;
839 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
840 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
841 xfs_iunlock(ip0, lock_mode);
842 if ((++attempts % 5) == 0)
843 delay(1); /* Don't just spin the CPU */
844 goto again;
845 }
846 } else {
847 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
848 }
849}
850
851int
852xfs_remove(
853 xfs_inode_t *dp,
854 struct xfs_name *name,
855 xfs_inode_t *ip)
856{
857 xfs_mount_t *mp = dp->i_mount;
858 xfs_trans_t *tp = NULL;
859 int is_dir = S_ISDIR(ip->i_d.di_mode);
860 int error = 0;
861 xfs_bmap_free_t free_list;
862 xfs_fsblock_t first_block;
863 int cancel_flags;
864 int committed;
865 int link_zero;
866 uint resblks;
867 uint log_count;
868
869 trace_xfs_remove(dp, name);
870
871 if (XFS_FORCED_SHUTDOWN(mp))
872 return XFS_ERROR(EIO);
873
874 error = xfs_qm_dqattach(dp, 0);
875 if (error)
876 goto std_return;
877
878 error = xfs_qm_dqattach(ip, 0);
879 if (error)
880 goto std_return;
881
882 if (is_dir) {
883 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
884 log_count = XFS_DEFAULT_LOG_COUNT;
885 } else {
886 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
887 log_count = XFS_REMOVE_LOG_COUNT;
888 }
889 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
890
891 /*
892 * We try to get the real space reservation first,
893 * allowing for directory btree deletion(s) implying
894 * possible bmap insert(s). If we can't get the space
895 * reservation then we use 0 instead, and avoid the bmap
896 * btree insert(s) in the directory code by, if the bmap
897 * insert tries to happen, instead trimming the LAST
898 * block from the directory.
899 */
900 resblks = XFS_REMOVE_SPACE_RES(mp);
901 error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
902 XFS_TRANS_PERM_LOG_RES, log_count);
903 if (error == ENOSPC) {
904 resblks = 0;
905 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
906 XFS_TRANS_PERM_LOG_RES, log_count);
907 }
908 if (error) {
909 ASSERT(error != ENOSPC);
910 cancel_flags = 0;
911 goto out_trans_cancel;
912 }
913
914 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
915
916 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
917 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
918
919 /*
920 * If we're removing a directory perform some additional validation.
921 */
922 if (is_dir) {
923 ASSERT(ip->i_d.di_nlink >= 2);
924 if (ip->i_d.di_nlink != 2) {
925 error = XFS_ERROR(ENOTEMPTY);
926 goto out_trans_cancel;
927 }
928 if (!xfs_dir_isempty(ip)) {
929 error = XFS_ERROR(ENOTEMPTY);
930 goto out_trans_cancel;
931 }
932 }
933
934 xfs_bmap_init(&free_list, &first_block);
935 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
936 &first_block, &free_list, resblks);
937 if (error) {
938 ASSERT(error != ENOENT);
939 goto out_bmap_cancel;
940 }
941 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
942
943 if (is_dir) {
944 /*
945 * Drop the link from ip's "..".
946 */
947 error = xfs_droplink(tp, dp);
948 if (error)
949 goto out_bmap_cancel;
950
951 /*
952 * Drop the "." link from ip to self.
953 */
954 error = xfs_droplink(tp, ip);
955 if (error)
956 goto out_bmap_cancel;
957 } else {
958 /*
959 * When removing a non-directory we need to log the parent
960 * inode here. For a directory this is done implicitly
961 * by the xfs_droplink call for the ".." entry.
962 */
963 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
964 }
965
966 /*
967 * Drop the link from dp to ip.
968 */
969 error = xfs_droplink(tp, ip);
970 if (error)
971 goto out_bmap_cancel;
972
973 /*
974 * Determine if this is the last link while
975 * we are in the transaction.
976 */
977 link_zero = (ip->i_d.di_nlink == 0);
978
979 /*
980 * If this is a synchronous mount, make sure that the
981 * remove transaction goes to disk before returning to
982 * the user.
983 */
984 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
985 xfs_trans_set_sync(tp);
986
987 error = xfs_bmap_finish(&tp, &free_list, &committed);
988 if (error)
989 goto out_bmap_cancel;
990
991 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
992 if (error)
993 goto std_return;
994
995 /*
996 * If we are using filestreams, kill the stream association.
997 * If the file is still open it may get a new one but that
998 * will get killed on last close in xfs_close() so we don't
999 * have to worry about that.
1000 */
1001 if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
1002 xfs_filestream_deassociate(ip);
1003
1004 return 0;
1005
1006 out_bmap_cancel:
1007 xfs_bmap_cancel(&free_list);
1008 cancel_flags |= XFS_TRANS_ABORT;
1009 out_trans_cancel:
1010 xfs_trans_cancel(tp, cancel_flags);
1011 std_return:
1012 return error;
1013}
1014
1015int
1016xfs_link(
1017 xfs_inode_t *tdp,
1018 xfs_inode_t *sip,
1019 struct xfs_name *target_name)
1020{
1021 xfs_mount_t *mp = tdp->i_mount;
1022 xfs_trans_t *tp;
1023 int error;
1024 xfs_bmap_free_t free_list;
1025 xfs_fsblock_t first_block;
1026 int cancel_flags;
1027 int committed;
1028 int resblks;
1029
1030 trace_xfs_link(tdp, target_name);
1031
1032 ASSERT(!S_ISDIR(sip->i_d.di_mode));
1033
1034 if (XFS_FORCED_SHUTDOWN(mp))
1035 return XFS_ERROR(EIO);
1036
1037 error = xfs_qm_dqattach(sip, 0);
1038 if (error)
1039 goto std_return;
1040
1041 error = xfs_qm_dqattach(tdp, 0);
1042 if (error)
1043 goto std_return;
1044
1045 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1046 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1047 resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1048 error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
1049 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1050 if (error == ENOSPC) {
1051 resblks = 0;
1052 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
1053 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1054 }
1055 if (error) {
1056 cancel_flags = 0;
1057 goto error_return;
1058 }
1059
1060 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1061
1062 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1063 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1064
1065 /*
1066 * If we are using project inheritance, we only allow hard link
1067 * creation in our tree when the project IDs are the same; else
1068 * the tree quota mechanism could be circumvented.
1069 */
1070 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1071 (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1072 error = XFS_ERROR(EXDEV);
1073 goto error_return;
1074 }
1075
1076 error = xfs_dir_canenter(tp, tdp, target_name, resblks);
1077 if (error)
1078 goto error_return;
1079
1080 xfs_bmap_init(&free_list, &first_block);
1081
1082 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1083 &first_block, &free_list, resblks);
1084 if (error)
1085 goto abort_return;
1086 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1087 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1088
1089 error = xfs_bumplink(tp, sip);
1090 if (error)
1091 goto abort_return;
1092
1093 /*
1094 * If this is a synchronous mount, make sure that the
1095 * link transaction goes to disk before returning to
1096 * the user.
1097 */
1098 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1099 xfs_trans_set_sync(tp);
1100 }
1101
1102 error = xfs_bmap_finish (&tp, &free_list, &committed);
1103 if (error) {
1104 xfs_bmap_cancel(&free_list);
1105 goto abort_return;
1106 }
1107
1108 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1109
1110 abort_return:
1111 cancel_flags |= XFS_TRANS_ABORT;
1112 error_return:
1113 xfs_trans_cancel(tp, cancel_flags);
1114 std_return:
1115 return error;
1116}
1117
1118int
1119xfs_set_dmattrs(
1120 xfs_inode_t *ip,
1121 u_int evmask,
1122 u_int16_t state)
1123{
1124 xfs_mount_t *mp = ip->i_mount;
1125 xfs_trans_t *tp;
1126 int error;
1127
1128 if (!capable(CAP_SYS_ADMIN))
1129 return XFS_ERROR(EPERM);
1130
1131 if (XFS_FORCED_SHUTDOWN(mp))
1132 return XFS_ERROR(EIO);
1133
1134 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
1135 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
1136 if (error) {
1137 xfs_trans_cancel(tp, 0);
1138 return error;
1139 }
1140 xfs_ilock(ip, XFS_ILOCK_EXCL);
1141 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1142
1143 ip->i_d.di_dmevmask = evmask;
1144 ip->i_d.di_dmstate = state;
1145
1146 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1147 error = xfs_trans_commit(tp, 0);
1148
1149 return error;
1150}
1151
1152/*
1153 * xfs_alloc_file_space()
1154 * This routine allocates disk space for the given file.
1155 *
1156 * If alloc_type == 0, this request is for an ALLOCSP type
1157 * request which will change the file size. In this case, no
1158 * DMAPI event will be generated by the call. A TRUNCATE event
1159 * will be generated later by xfs_setattr.
1160 *
1161 * If alloc_type != 0, this request is for a RESVSP type
1162 * request, and a DMAPI DM_EVENT_WRITE will be generated if the
1163 * lower block boundary byte address is less than the file's
1164 * length.
1165 *
1166 * RETURNS:
1167 * 0 on success
1168 * errno on error
1169 *
1170 */
1171STATIC int
1172xfs_alloc_file_space(
1173 xfs_inode_t *ip,
1174 xfs_off_t offset,
1175 xfs_off_t len,
1176 int alloc_type,
1177 int attr_flags)
1178{
1179 xfs_mount_t *mp = ip->i_mount;
1180 xfs_off_t count;
1181 xfs_filblks_t allocated_fsb;
1182 xfs_filblks_t allocatesize_fsb;
1183 xfs_extlen_t extsz, temp;
1184 xfs_fileoff_t startoffset_fsb;
1185 xfs_fsblock_t firstfsb;
1186 int nimaps;
1187 int quota_flag;
1188 int rt;
1189 xfs_trans_t *tp;
1190 xfs_bmbt_irec_t imaps[1], *imapp;
1191 xfs_bmap_free_t free_list;
1192 uint qblocks, resblks, resrtextents;
1193 int committed;
1194 int error;
1195
1196 trace_xfs_alloc_file_space(ip);
1197
1198 if (XFS_FORCED_SHUTDOWN(mp))
1199 return XFS_ERROR(EIO);
1200
1201 error = xfs_qm_dqattach(ip, 0);
1202 if (error)
1203 return error;
1204
1205 if (len <= 0)
1206 return XFS_ERROR(EINVAL);
1207
1208 rt = XFS_IS_REALTIME_INODE(ip);
1209 extsz = xfs_get_extsz_hint(ip);
1210
1211 count = len;
1212 imapp = &imaps[0];
1213 nimaps = 1;
1214 startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
1215 allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1216
1217 /*
1218 * Allocate file space until done or until there is an error
1219 */
1220 while (allocatesize_fsb && !error) {
1221 xfs_fileoff_t s, e;
1222
1223 /*
1224 * Determine space reservations for data/realtime.
1225 */
1226 if (unlikely(extsz)) {
1227 s = startoffset_fsb;
1228 do_div(s, extsz);
1229 s *= extsz;
1230 e = startoffset_fsb + allocatesize_fsb;
1231 if ((temp = do_mod(startoffset_fsb, extsz)))
1232 e += temp;
1233 if ((temp = do_mod(e, extsz)))
1234 e += extsz - temp;
1235 } else {
1236 s = 0;
1237 e = allocatesize_fsb;
1238 }
1239
1240 /*
1241 * The transaction reservation is limited to a 32-bit block
1242 * count, hence we need to limit the number of blocks we are
1243 * trying to reserve to avoid an overflow. We can't allocate
1244 * more than @nimaps extents, and an extent is limited on disk
1245 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1246 */
1247 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1248 if (unlikely(rt)) {
1249 resrtextents = qblocks = resblks;
1250 resrtextents /= mp->m_sb.sb_rextsize;
1251 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1252 quota_flag = XFS_QMOPT_RES_RTBLKS;
1253 } else {
1254 resrtextents = 0;
1255 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1256 quota_flag = XFS_QMOPT_RES_REGBLKS;
1257 }
1258
1259 /*
1260 * Allocate and setup the transaction.
1261 */
1262 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1263 error = xfs_trans_reserve(tp, resblks,
1264 XFS_WRITE_LOG_RES(mp), resrtextents,
1265 XFS_TRANS_PERM_LOG_RES,
1266 XFS_WRITE_LOG_COUNT);
1267 /*
1268 * Check for running out of space
1269 */
1270 if (error) {
1271 /*
1272 * Free the transaction structure.
1273 */
1274 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1275 xfs_trans_cancel(tp, 0);
1276 break;
1277 }
1278 xfs_ilock(ip, XFS_ILOCK_EXCL);
1279 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1280 0, quota_flag);
1281 if (error)
1282 goto error1;
1283
1284 xfs_trans_ijoin(tp, ip, 0);
1285
1286 xfs_bmap_init(&free_list, &firstfsb);
1287 error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1288 allocatesize_fsb, alloc_type, &firstfsb,
1289 0, imapp, &nimaps, &free_list);
1290 if (error) {
1291 goto error0;
1292 }
1293
1294 /*
1295 * Complete the transaction
1296 */
1297 error = xfs_bmap_finish(&tp, &free_list, &committed);
1298 if (error) {
1299 goto error0;
1300 }
1301
1302 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1303 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1304 if (error) {
1305 break;
1306 }
1307
1308 allocated_fsb = imapp->br_blockcount;
1309
1310 if (nimaps == 0) {
1311 error = XFS_ERROR(ENOSPC);
1312 break;
1313 }
1314
1315 startoffset_fsb += allocated_fsb;
1316 allocatesize_fsb -= allocated_fsb;
1317 }
1318
1319 return error;
1320
1321error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1322 xfs_bmap_cancel(&free_list);
1323 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1324
1325error1: /* Just cancel transaction */
1326 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1327 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1328 return error;
1329}
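A worked instance of the extent-size-hint rounding in the allocation loop above, with assumed inputs (extsz = 16, a request starting at fsb 10 for 20 fsbs); the do_div()/do_mod() calls are mirrored with plain C arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long long startoffset_fsb = 10;	/* assumed */
	unsigned long long allocatesize_fsb = 20;	/* assumed */
	unsigned long long extsz = 16;			/* assumed hint */
	unsigned long long s, e, temp;

	s = (startoffset_fsb / extsz) * extsz;	/* round start down: 0 */
	e = startoffset_fsb + allocatesize_fsb;	/* 30 */
	if ((temp = startoffset_fsb % extsz))	/* start misalignment: 10 */
		e += temp;			/* 40 */
	if ((temp = e % extsz))			/* round end up */
		e += extsz - temp;		/* 48 */

	printf("s=%llu e=%llu reservation basis=%llu fsbs\n", s, e, e - s);
	return 0;
}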
1330
1331/*
1332 * Zero file bytes between startoff and endoff inclusive.
1333 * The iolock is held exclusive and no blocks are buffered.
1334 *
1335 * This function is used by xfs_free_file_space() to zero
1336 * partial blocks when the range to free is not block aligned.
1337 * When unreserving space with boundaries that are not block
1338 * aligned we round up the start and round down the end
1339 * boundaries and then use this function to zero the parts of
1340 * the blocks that got dropped during the rounding.
1341 */
1342STATIC int
1343xfs_zero_remaining_bytes(
1344 xfs_inode_t *ip,
1345 xfs_off_t startoff,
1346 xfs_off_t endoff)
1347{
1348 xfs_bmbt_irec_t imap;
1349 xfs_fileoff_t offset_fsb;
1350 xfs_off_t lastoffset;
1351 xfs_off_t offset;
1352 xfs_buf_t *bp;
1353 xfs_mount_t *mp = ip->i_mount;
1354 int nimap;
1355 int error = 0;
1356
1357 /*
1358 * Avoid doing I/O beyond eof - it's not necessary
1359 * since nothing can read beyond eof. The space will
1360 * be zeroed when the file is extended anyway.
1361 */
1362 if (startoff >= XFS_ISIZE(ip))
1363 return 0;
1364
1365 if (endoff > XFS_ISIZE(ip))
1366 endoff = XFS_ISIZE(ip);
1367
1368 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1369 mp->m_rtdev_targp : mp->m_ddev_targp,
1370 BTOBB(mp->m_sb.sb_blocksize), 0);
1371 if (!bp)
1372 return XFS_ERROR(ENOMEM);
1373
1374 xfs_buf_unlock(bp);
1375
1376 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1377 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1378 nimap = 1;
1379 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1380 if (error || nimap < 1)
1381 break;
1382 ASSERT(imap.br_blockcount >= 1);
1383 ASSERT(imap.br_startoff == offset_fsb);
1384 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
1385 if (lastoffset > endoff)
1386 lastoffset = endoff;
1387 if (imap.br_startblock == HOLESTARTBLOCK)
1388 continue;
1389 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1390 if (imap.br_state == XFS_EXT_UNWRITTEN)
1391 continue;
1392 XFS_BUF_UNDONE(bp);
1393 XFS_BUF_UNWRITE(bp);
1394 XFS_BUF_READ(bp);
1395 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
1396 xfsbdstrat(mp, bp);
1397 error = xfs_buf_iowait(bp);
1398 if (error) {
1399 xfs_buf_ioerror_alert(bp,
1400 "xfs_zero_remaining_bytes(read)");
1401 break;
1402 }
1403 memset(bp->b_addr +
1404 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
1405 0, lastoffset - offset + 1);
1406 XFS_BUF_UNDONE(bp);
1407 XFS_BUF_UNREAD(bp);
1408 XFS_BUF_WRITE(bp);
1409 xfsbdstrat(mp, bp);
1410 error = xfs_buf_iowait(bp);
1411 if (error) {
1412 xfs_buf_ioerror_alert(bp,
1413 "xfs_zero_remaining_bytes(write)");
1414 break;
1415 }
1416 }
1417 xfs_buf_free(bp);
1418 return error;
1419}
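To see how this pairs with the rounding in xfs_free_file_space() below, a worked example under assumed values (4096-byte blocks, freeing bytes 100 through 8291): only whole block 1 is unmapped, and the two unaligned edges are zeroed by the routine above.

#include <stdio.h>

int main(void)
{
	unsigned long long bsize = 4096;		/* assumed block size */
	unsigned long long offset = 100, len = 8192;	/* assumed request */
	unsigned long long end = offset + len;		/* 8292 */

	unsigned long long start_fsb = (offset + bsize - 1) / bsize;	/* up: 1 */
	unsigned long long end_fsb = end / bsize;			/* down: 2 */

	if (offset < start_fsb * bsize)		/* leading partial block */
		printf("zero [%llu, %llu]\n", offset, start_fsb * bsize - 1);
	printf("unmap fsbs [%llu, %llu)\n", start_fsb, end_fsb);
	if (end_fsb * bsize < end)		/* trailing partial block */
		printf("zero [%llu, %llu]\n", end_fsb * bsize, end - 1);
	return 0;
}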
1420
1421/*
1422 * xfs_free_file_space()
1423 * This routine frees disk space for the given file.
1424 *
1425 * This routine is only called by xfs_change_file_space
1426 * for an UNRESVSP type call.
1427 *
1428 * RETURNS:
1429 * 0 on success
1430 * errno on error
1431 *
1432 */
1433STATIC int
1434xfs_free_file_space(
1435 xfs_inode_t *ip,
1436 xfs_off_t offset,
1437 xfs_off_t len,
1438 int attr_flags)
1439{
1440 int committed;
1441 int done;
1442 xfs_fileoff_t endoffset_fsb;
1443 int error;
1444 xfs_fsblock_t firstfsb;
1445 xfs_bmap_free_t free_list;
1446 xfs_bmbt_irec_t imap;
1447 xfs_off_t ioffset;
1448 xfs_extlen_t mod=0;
1449 xfs_mount_t *mp;
1450 int nimap;
1451 uint resblks;
1452 xfs_off_t rounding;
1453 int rt;
1454 xfs_fileoff_t startoffset_fsb;
1455 xfs_trans_t *tp;
1456 int need_iolock = 1;
1457
1458 mp = ip->i_mount;
1459
1460 trace_xfs_free_file_space(ip);
1461
1462 error = xfs_qm_dqattach(ip, 0);
1463 if (error)
1464 return error;
1465
1466 error = 0;
1467 if (len <= 0) /* if nothing being freed */
1468 return error;
1469 rt = XFS_IS_REALTIME_INODE(ip);
1470 startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1471 endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1472
1473 if (attr_flags & XFS_ATTR_NOLOCK)
1474 need_iolock = 0;
1475 if (need_iolock) {
1476 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1477 /* wait for the completion of any pending DIOs */
1478 inode_dio_wait(VFS_I(ip));
1479 }
1480
1481 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1482 ioffset = offset & ~(rounding - 1);
1483 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1484 ioffset, -1);
1485 if (error)
1486 goto out_unlock_iolock;
1487 truncate_pagecache_range(VFS_I(ip), ioffset, -1);
1488
1489 /*
1490 * Need to zero the stuff we're not freeing, on disk.
1491 * If it's a realtime file & can't use unwritten extents then we
1492 * actually need to zero the extent edges. Otherwise xfs_bunmapi
1493 * will take care of it for us.
1494 */
1495 if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1496 nimap = 1;
1497 error = xfs_bmapi_read(ip, startoffset_fsb, 1,
1498 &imap, &nimap, 0);
1499 if (error)
1500 goto out_unlock_iolock;
1501 ASSERT(nimap == 0 || nimap == 1);
1502 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1503 xfs_daddr_t block;
1504
1505 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1506 block = imap.br_startblock;
1507 mod = do_div(block, mp->m_sb.sb_rextsize);
1508 if (mod)
1509 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1510 }
1511 nimap = 1;
1512 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
1513 &imap, &nimap, 0);
1514 if (error)
1515 goto out_unlock_iolock;
1516 ASSERT(nimap == 0 || nimap == 1);
1517 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1518 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1519 mod++;
1520 if (mod && (mod != mp->m_sb.sb_rextsize))
1521 endoffset_fsb -= mod;
1522 }
1523 }
1524 if ((done = (endoffset_fsb <= startoffset_fsb)))
1525 /*
1526 * One contiguous piece to clear
1527 */
1528 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
1529 else {
1530 /*
1531 * Some full blocks, possibly two pieces to clear
1532 */
1533 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
1534 error = xfs_zero_remaining_bytes(ip, offset,
1535 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
1536 if (!error &&
1537 XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
1538 error = xfs_zero_remaining_bytes(ip,
1539 XFS_FSB_TO_B(mp, endoffset_fsb),
1540 offset + len - 1);
1541 }

        /*
         * free file space until done or until there is an error
         */
        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
        while (!error && !done) {

                /*
                 * Allocate and set up the transaction.  Allow this
                 * transaction to dip into the reserve blocks to ensure
                 * the freeing of the space succeeds at ENOSPC.
                 */
                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
                tp->t_flags |= XFS_TRANS_RESERVE;
                error = xfs_trans_reserve(tp,
                                          resblks,
                                          XFS_WRITE_LOG_RES(mp),
                                          0,
                                          XFS_TRANS_PERM_LOG_RES,
                                          XFS_WRITE_LOG_COUNT);

                /*
                 * check for running out of space
                 */
                if (error) {
                        /*
                         * Free the transaction structure.
                         */
                        ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                error = xfs_trans_reserve_quota(tp, mp,
                                ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
                                resblks, 0, XFS_QMOPT_RES_REGBLKS);
                if (error)
                        goto error1;

                xfs_trans_ijoin(tp, ip, 0);

                /*
                 * issue the bunmapi() call to free the blocks
                 */
                xfs_bmap_init(&free_list, &firstfsb);
                error = xfs_bunmapi(tp, ip, startoffset_fsb,
                                    endoffset_fsb - startoffset_fsb,
                                    0, 2, &firstfsb, &free_list, &done);
                if (error)
                        goto error0;

                /*
                 * complete the transaction
                 */
                error = xfs_bmap_finish(&tp, &free_list, &committed);
                if (error)
                        goto error0;

                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }

 out_unlock_iolock:
        if (need_iolock)
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return error;

 error0:
        xfs_bmap_cancel(&free_list);
 error1:
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
        xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
                    XFS_ILOCK_EXCL);
        return error;
}

STATIC int
xfs_zero_file_space(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        xfs_off_t               len,
        int                     attr_flags)
{
        struct xfs_mount        *mp = ip->i_mount;
        uint                    granularity;
        xfs_off_t               start_boundary;
        xfs_off_t               end_boundary;
        int                     error;

        granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);

        /*
         * Round the range of extents we are going to convert inwards.  If the
         * offset is aligned, then it doesn't get changed so we zero from the
         * start of the block offset points to.
         */
        start_boundary = round_up(offset, granularity);
        end_boundary = round_down(offset + len, granularity);

        ASSERT(start_boundary >= offset);
        ASSERT(end_boundary <= offset + len);
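        /*
         * Illustrative note (assumed values, not in the original source):
         * with granularity = 4096, offset = 1000 and len = 10000:
         *
         *      start_boundary = round_up(1000, 4096)           = 4096
         *      end_boundary   = round_down(1000 + 10000, 4096) = 8192
         *
         * Bytes [4096, 8192) are converted to unwritten extents below, while
         * the sub-block edges [1000, 4096) and [8192, 11000) are zeroed by
         * xfs_iozero().
         */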

        if (!(attr_flags & XFS_ATTR_NOLOCK))
                xfs_ilock(ip, XFS_IOLOCK_EXCL);

        if (start_boundary < end_boundary - 1) {
                /* punch out the page cache over the conversion range */
                truncate_pagecache_range(VFS_I(ip), start_boundary,
                                         end_boundary - 1);
                /* convert the blocks */
                error = xfs_alloc_file_space(ip, start_boundary,
                                        end_boundary - start_boundary - 1,
                                        XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
                                        attr_flags);
                if (error)
                        goto out_unlock;

                /* We've handled the interior of the range, now for the edges */
                if (start_boundary != offset)
                        error = xfs_iozero(ip, offset, start_boundary - offset);
                if (error)
                        goto out_unlock;

                if (end_boundary != offset + len)
                        error = xfs_iozero(ip, end_boundary,
                                           offset + len - end_boundary);

        } else {
                /*
                 * It's either a sub-granularity range or the range spans
                 * parts of two adjacent blocks.
                 */
                error = xfs_iozero(ip, offset, len);
        }

out_unlock:
        if (!(attr_flags & XFS_ATTR_NOLOCK))
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return error;
}

/*
 * xfs_change_file_space()
 *      This routine allocates or frees disk space for the given file.
 *      The user-specified parameters are checked for alignment and size
 *      limitations.
 *
 * RETURNS:
 *       0 on success
 *      errno on error
 */
int
xfs_change_file_space(
        xfs_inode_t     *ip,
        int             cmd,
        xfs_flock64_t   *bf,
        xfs_off_t       offset,
        int             attr_flags)
{
        xfs_mount_t     *mp = ip->i_mount;
        int             clrprealloc;
        int             error;
        xfs_fsize_t     fsize;
        int             setprealloc;
        xfs_off_t       startoffset;
        xfs_trans_t     *tp;
        struct iattr    iattr;

        if (!S_ISREG(ip->i_d.di_mode))
                return XFS_ERROR(EINVAL);

        switch (bf->l_whence) {
        case 0: /* SEEK_SET */
                break;
        case 1: /* SEEK_CUR */
                bf->l_start += offset;
                break;
        case 2: /* SEEK_END */
                bf->l_start += XFS_ISIZE(ip);
                break;
        default:
                return XFS_ERROR(EINVAL);
        }
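        /*
         * Illustrative note (assumed values, not in the original source):
         * for l_whence = 1 (SEEK_CUR) with l_start = 4096 and a current file
         * position "offset" of 8192, l_start becomes 8192 + 4096 = 12288.
         * After this switch the range is always treated as absolute
         * (SEEK_SET).
         */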

        /*
         * length of <= 0 for resv/unresv/zero is invalid.  length for
         * alloc/free is ignored completely and we have no idea what userspace
         * might have set it to, so set it to zero to allow range
         * checks to pass.
         */
        switch (cmd) {
        case XFS_IOC_ZERO_RANGE:
        case XFS_IOC_RESVSP:
        case XFS_IOC_RESVSP64:
        case XFS_IOC_UNRESVSP:
        case XFS_IOC_UNRESVSP64:
                if (bf->l_len <= 0)
                        return XFS_ERROR(EINVAL);
                break;
        default:
                bf->l_len = 0;
                break;
        }

        if (bf->l_start < 0 ||
            bf->l_start > mp->m_super->s_maxbytes ||
            bf->l_start + bf->l_len < 0 ||
            bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
                return XFS_ERROR(EINVAL);
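        /*
         * Illustrative note (not in the original source): the
         * "l_start + l_len < 0" test catches signed overflow.  For example,
         * an l_start just below s_maxbytes plus a huge l_len wraps negative,
         * so the request fails with EINVAL instead of slipping past the
         * s_maxbytes check.
         */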

        bf->l_whence = 0;

        startoffset = bf->l_start;
        fsize = XFS_ISIZE(ip);

        setprealloc = clrprealloc = 0;
        switch (cmd) {
        case XFS_IOC_ZERO_RANGE:
                error = xfs_zero_file_space(ip, startoffset, bf->l_len,
                                            attr_flags);
                if (error)
                        return error;
                setprealloc = 1;
                break;

        case XFS_IOC_RESVSP:
        case XFS_IOC_RESVSP64:
                error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
                                             XFS_BMAPI_PREALLOC, attr_flags);
                if (error)
                        return error;
                setprealloc = 1;
                break;

        case XFS_IOC_UNRESVSP:
        case XFS_IOC_UNRESVSP64:
                if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
                                                 attr_flags)))
                        return error;
                break;

        case XFS_IOC_ALLOCSP:
        case XFS_IOC_ALLOCSP64:
        case XFS_IOC_FREESP:
        case XFS_IOC_FREESP64:
                /*
                 * These operations actually do IO when extending the file, but
                 * the allocation is done separately to the zeroing that is
                 * done.  This set of operations needs to be serialised against
                 * other IO operations, such as truncate and buffered IO.  We
                 * need to take the IOLOCK here to serialise the allocation and
                 * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
                 * truncate, direct IO) from racing against the transient
                 * allocated but not written state we can have here.
                 */
                xfs_ilock(ip, XFS_IOLOCK_EXCL);
                if (startoffset > fsize) {
                        error = xfs_alloc_file_space(ip, fsize,
                                        startoffset - fsize, 0,
                                        attr_flags | XFS_ATTR_NOLOCK);
                        if (error) {
                                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                                break;
                        }
                }

                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = startoffset;

                error = xfs_setattr_size(ip, &iattr,
                                         attr_flags | XFS_ATTR_NOLOCK);
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);

                if (error)
                        return error;

                clrprealloc = 1;
                break;

        default:
                ASSERT(0);
                return XFS_ERROR(EINVAL);
        }

        /*
         * update the inode timestamp, mode, and prealloc flag bits
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);

        if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
                                       0, 0, 0))) {
                /* ASSERT(0); */
                xfs_trans_cancel(tp, 0);
                return error;
        }

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

        if ((attr_flags & XFS_ATTR_DMI) == 0) {
                ip->i_d.di_mode &= ~S_ISUID;

                /*
                 * Note that we don't have to worry about mandatory
                 * file locking being disabled here because we only
                 * clear the S_ISGID bit if the Group execute bit is
                 * on, but if it was on then mandatory locking wouldn't
                 * have been enabled.
                 */
                if (ip->i_d.di_mode & S_IXGRP)
                        ip->i_d.di_mode &= ~S_ISGID;

                xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        }
        if (setprealloc)
                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
        else if (clrprealloc)
                ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        if (attr_flags & XFS_ATTR_SYNC)
                xfs_trans_set_sync(tp);
        return xfs_trans_commit(tp, 0);
}
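
/*
 * A minimal userspace sketch (illustrative only, not part of this commit)
 * of how xfs_change_file_space() is reached: XFS_IOC_RESVSP64 carries an
 * xfs_flock64_t from the XFS ioctl handler into this function.  Assumes
 * the xfsprogs headers are installed; error handling is elided.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>            /* xfs_flock64_t, XFS_IOC_RESVSP64 */

int reserve_one_meg(const char *path)
{
        xfs_flock64_t   bf = { 0 };
        int             fd, ret;

        bf.l_whence = SEEK_SET;         /* l_start is an absolute offset */
        bf.l_start = 0;
        bf.l_len = 1024 * 1024;         /* must be > 0 for RESVSP */

        fd = open(path, O_RDWR);
        if (fd < 0)
                return -1;
        /* preallocates unwritten blocks without changing the file size */
        ret = ioctl(fd, XFS_IOC_RESVSP64, &bf);
        close(fd);
        return ret;
}
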
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
deleted file mode 100644
index 38c67c34d73f..000000000000
--- a/fs/xfs/xfs_vnodeops.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef _XFS_VNODEOPS_H
-#define _XFS_VNODEOPS_H 1
-
-struct attrlist_cursor_kern;
-struct file;
-struct iattr;
-struct inode;
-struct iovec;
-struct kiocb;
-struct pipe_inode_info;
-struct uio;
-struct xfs_inode;
-
-
-int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
-int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
-#define XFS_ATTR_DMI		0x01	/* invocation from a DMI function */
-#define XFS_ATTR_NONBLOCK	0x02	/* return EAGAIN if operation would block */
-#define XFS_ATTR_NOLOCK		0x04	/* Don't grab any conflicting locks */
-#define XFS_ATTR_NOACL		0x08	/* Don't call xfs_acl_chmod */
-#define XFS_ATTR_SYNC		0x10	/* synchronous operation required */
-
-int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_release(struct xfs_inode *ip);
-int xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
-		struct xfs_inode **ipp, struct xfs_name *ci_name);
-int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode,
-		xfs_dev_t rdev, struct xfs_inode **ipp);
-int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
-		struct xfs_inode *ip);
-int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
-		struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
-int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-		const char *target_path, umode_t mode, struct xfs_inode **ipp);
-int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_change_file_space(struct xfs_inode *ip, int cmd,
-		xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
-int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
-		struct xfs_inode *src_ip, struct xfs_inode *target_dp,
-		struct xfs_name *target_name, struct xfs_inode *target_ip);
-int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
-		unsigned char *value, int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
-		unsigned char *value, int valuelen, int flags);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
-int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
-		int flags, struct attrlist_cursor_kern *cursor);
-
-int xfs_iozero(struct xfs_inode *, loff_t, size_t);
-int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
-
-#endif	/* _XFS_VNODEOPS_H */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 87d3e03878c8..e01f35ea76ba 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -17,13 +17,13 @@
  */
 
 #include "xfs.h"
+#include "xfs_log_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_acl.h"
-#include "xfs_vnodeops.h"
 
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>