Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_file.c | 2
-rw-r--r--  fs/9p/vfs_inode.c | 2
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Makefile | 3
-rw-r--r--  fs/adfs/super.c | 3
-rw-r--r--  fs/affs/affs.h | 20
-rw-r--r--  fs/affs/amigaffs.c | 23
-rw-r--r--  fs/affs/dir.c | 28
-rw-r--r--  fs/affs/inode.c | 2
-rw-r--r--  fs/affs/namei.c | 32
-rw-r--r--  fs/affs/super.c | 9
-rw-r--r--  fs/afs/inode.c | 2
-rw-r--r--  fs/afs/internal.h | 1
-rw-r--r--  fs/afs/rxrpc.c | 12
-rw-r--r--  fs/befs/Makefile | 2
-rw-r--r--  fs/befs/befs.h | 3
-rw-r--r--  fs/befs/btree.c | 93
-rw-r--r--  fs/befs/datastream.c | 87
-rw-r--r--  fs/befs/debug.c | 74
-rw-r--r--  fs/befs/inode.c | 10
-rw-r--r--  fs/befs/io.c | 24
-rw-r--r--  fs/befs/linuxvfs.c | 113
-rw-r--r--  fs/bfs/inode.c | 4
-rw-r--r--  fs/binfmt_elf.c | 13
-rw-r--r--  fs/binfmt_misc.c | 1
-rw-r--r--  fs/bio-integrity.c | 84
-rw-r--r--  fs/bio.c | 3
-rw-r--r--  fs/block_dev.c | 6
-rw-r--r--  fs/btrfs/async-thread.c | 848
-rw-r--r--  fs/btrfs/async-thread.h | 121
-rw-r--r--  fs/btrfs/backref.c | 84
-rw-r--r--  fs/btrfs/btrfs_inode.h | 14
-rw-r--r--  fs/btrfs/compression.c | 2
-rw-r--r--  fs/btrfs/ctree.c | 11
-rw-r--r--  fs/btrfs/ctree.h | 73
-rw-r--r--  fs/btrfs/delayed-inode.c | 6
-rw-r--r--  fs/btrfs/delayed-ref.c | 29
-rw-r--r--  fs/btrfs/dev-replace.c | 79
-rw-r--r--  fs/btrfs/disk-io.c | 281
-rw-r--r--  fs/btrfs/extent-tree.c | 58
-rw-r--r--  fs/btrfs/extent_io.c | 15
-rw-r--r--  fs/btrfs/extent_map.c | 56
-rw-r--r--  fs/btrfs/extent_map.h | 10
-rw-r--r--  fs/btrfs/file.c | 161
-rw-r--r--  fs/btrfs/inode.c | 123
-rw-r--r--  fs/btrfs/ioctl.c | 210
-rw-r--r--  fs/btrfs/ordered-data.c | 68
-rw-r--r--  fs/btrfs/ordered-data.h | 6
-rw-r--r--  fs/btrfs/qgroup.c | 15
-rw-r--r--  fs/btrfs/raid56.c | 21
-rw-r--r--  fs/btrfs/reada.c | 4
-rw-r--r--  fs/btrfs/relocation.c | 2
-rw-r--r--  fs/btrfs/root-tree.c | 3
-rw-r--r--  fs/btrfs/scrub.c | 97
-rw-r--r--  fs/btrfs/send.c | 821
-rw-r--r--  fs/btrfs/super.c | 38
-rw-r--r--  fs/btrfs/sysfs.c | 33
-rw-r--r--  fs/btrfs/sysfs.h | 5
-rw-r--r--  fs/btrfs/transaction.c | 39
-rw-r--r--  fs/btrfs/tree-log.c | 236
-rw-r--r--  fs/btrfs/tree-log.h | 18
-rw-r--r--  fs/btrfs/volumes.c | 46
-rw-r--r--  fs/btrfs/volumes.h | 1
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/cachefiles/namei.c | 4
-rw-r--r--  fs/cachefiles/rdwr.c | 33
-rw-r--r--  fs/ceph/cache.c | 1
-rw-r--r--  fs/ceph/cache.h | 10
-rw-r--r--  fs/ceph/caps.c | 9
-rw-r--r--  fs/ceph/debugfs.c | 5
-rw-r--r--  fs/ceph/dir.c | 53
-rw-r--r--  fs/ceph/export.c | 267
-rw-r--r--  fs/ceph/file.c | 8
-rw-r--r--  fs/ceph/inode.c | 76
-rw-r--r--  fs/ceph/ioctl.c | 5
-rw-r--r--  fs/ceph/locks.c | 98
-rw-r--r--  fs/ceph/mds_client.c | 97
-rw-r--r--  fs/ceph/mds_client.h | 4
-rw-r--r--  fs/ceph/strings.c | 1
-rw-r--r--  fs/ceph/super.c | 1
-rw-r--r--  fs/ceph/super.h | 3
-rw-r--r--  fs/ceph/xattr.c | 48
-rw-r--r--  fs/cifs/cifsfs.c | 5
-rw-r--r--  fs/cifs/file.c | 1
-rw-r--r--  fs/coda/coda_int.h | 2
-rw-r--r--  fs/coda/inode.c | 5
-rw-r--r--  fs/compat.c | 162
-rw-r--r--  fs/compat_binfmt_elf.c | 5
-rw-r--r--  fs/compat_ioctl.c | 5
-rw-r--r--  fs/cramfs/inode.c | 4
-rw-r--r--  fs/dcache.c | 50
-rw-r--r--  fs/debugfs/inode.c | 7
-rw-r--r--  fs/devpts/inode.c | 1
-rw-r--r--  fs/direct-io.c | 19
-rw-r--r--  fs/dlm/ast.c | 3
-rw-r--r--  fs/dlm/dir.c | 4
-rw-r--r--  fs/dlm/dlm_internal.h | 2
-rw-r--r--  fs/dlm/lock.c | 7
-rw-r--r--  fs/dlm/lockspace.c | 8
-rw-r--r--  fs/dlm/member.c | 27
-rw-r--r--  fs/dlm/recover.c | 10
-rw-r--r--  fs/dlm/recoverd.c | 34
-rw-r--r--  fs/drop_caches.c | 16
-rw-r--r--  fs/ecryptfs/inode.c | 2
-rw-r--r--  fs/ecryptfs/super.c | 2
-rw-r--r--  fs/efivarfs/file.c | 13
-rw-r--r--  fs/efs/super.c | 3
-rw-r--r--  fs/exec.c | 34
-rw-r--r--  fs/exofs/inode.c | 2
-rw-r--r--  fs/ext2/acl.c | 1
-rw-r--r--  fs/ext2/ialloc.c | 2
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext2/super.c | 3
-rw-r--r--  fs/ext2/xattr_security.c | 4
-rw-r--r--  fs/ext3/balloc.c | 5
-rw-r--r--  fs/ext3/dir.c | 2
-rw-r--r--  fs/ext3/ialloc.c | 2
-rw-r--r--  fs/ext3/inode.c | 88
-rw-r--r--  fs/ext3/super.c | 4
-rw-r--r--  fs/ext3/xattr_security.c | 5
-rw-r--r--  fs/ext4/ext4.h | 11
-rw-r--r--  fs/ext4/ext4_jbd2.c | 10
-rw-r--r--  fs/ext4/extents.c | 818
-rw-r--r--  fs/ext4/extents_status.c | 28
-rw-r--r--  fs/ext4/extents_status.h | 9
-rw-r--r--  fs/ext4/file.c | 3
-rw-r--r--  fs/ext4/inode.c | 124
-rw-r--r--  fs/ext4/ioctl.c | 24
-rw-r--r--  fs/ext4/mballoc.c | 7
-rw-r--r--  fs/ext4/mballoc.h | 4
-rw-r--r--  fs/ext4/move_extent.c | 5
-rw-r--r--  fs/ext4/namei.c | 480
-rw-r--r--  fs/ext4/super.c | 40
-rw-r--r--  fs/ext4/xattr.c | 59
-rw-r--r--  fs/ext4/xattr.h | 6
-rw-r--r--  fs/f2fs/acl.c | 8
-rw-r--r--  fs/f2fs/checkpoint.c | 208
-rw-r--r--  fs/f2fs/data.c | 106
-rw-r--r--  fs/f2fs/debug.c | 12
-rw-r--r--  fs/f2fs/dir.c | 85
-rw-r--r--  fs/f2fs/f2fs.h | 105
-rw-r--r--  fs/f2fs/file.c | 32
-rw-r--r--  fs/f2fs/gc.c | 16
-rw-r--r--  fs/f2fs/inline.c | 4
-rw-r--r--  fs/f2fs/inode.c | 29
-rw-r--r--  fs/f2fs/namei.c | 9
-rw-r--r--  fs/f2fs/node.c | 334
-rw-r--r--  fs/f2fs/node.h | 25
-rw-r--r--  fs/f2fs/recovery.c | 37
-rw-r--r--  fs/f2fs/segment.c | 222
-rw-r--r--  fs/f2fs/segment.h | 75
-rw-r--r--  fs/f2fs/super.c | 99
-rw-r--r--  fs/f2fs/xattr.c | 7
-rw-r--r--  fs/fat/inode.c | 4
-rw-r--r--  fs/fcntl.c | 37
-rw-r--r--  fs/file.c | 2
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/filesystems.c | 2
-rw-r--r--  fs/freevxfs/vxfs_inode.c | 2
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 2
-rw-r--r--  fs/freevxfs/vxfs_super.c | 1
-rw-r--r--  fs/fs-writeback.c | 33
-rw-r--r--  fs/fuse/cuse.c | 9
-rw-r--r--  fs/fuse/dir.c | 119
-rw-r--r--  fs/fuse/file.c | 287
-rw-r--r--  fs/fuse/fuse_i.h | 22
-rw-r--r--  fs/fuse/inode.c | 32
-rw-r--r--  fs/gfs2/acl.c | 23
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/aops.c | 132
-rw-r--r--  fs/gfs2/bmap.c | 115
-rw-r--r--  fs/gfs2/bmap.h | 2
-rw-r--r--  fs/gfs2/dir.c | 23
-rw-r--r--  fs/gfs2/file.c | 14
-rw-r--r--  fs/gfs2/glock.c | 28
-rw-r--r--  fs/gfs2/glops.c | 2
-rw-r--r--  fs/gfs2/incore.h | 37
-rw-r--r--  fs/gfs2/inode.c | 75
-rw-r--r--  fs/gfs2/lock_dlm.c | 10
-rw-r--r--  fs/gfs2/log.c | 102
-rw-r--r--  fs/gfs2/lops.c | 85
-rw-r--r--  fs/gfs2/lops.h | 5
-rw-r--r--  fs/gfs2/main.c | 4
-rw-r--r--  fs/gfs2/meta_io.c | 14
-rw-r--r--  fs/gfs2/meta_io.h | 3
-rw-r--r--  fs/gfs2/ops_fstype.c | 89
-rw-r--r--  fs/gfs2/quota.c | 18
-rw-r--r--  fs/gfs2/recovery.c | 16
-rw-r--r--  fs/gfs2/recovery.h | 6
-rw-r--r--  fs/gfs2/rgrp.c | 32
-rw-r--r--  fs/gfs2/super.c | 41
-rw-r--r--  fs/gfs2/sys.c | 7
-rw-r--r--  fs/gfs2/trans.c | 29
-rw-r--r--  fs/gfs2/util.c | 101
-rw-r--r--  fs/gfs2/util.h | 31
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfs/super.c | 1
-rw-r--r--  fs/hfsplus/attributes.c | 2
-rw-r--r--  fs/hfsplus/extents.c | 16
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 2
-rw-r--r--  fs/hfsplus/super.c | 3
-rw-r--r--  fs/hostfs/hostfs_kern.c | 2
-rw-r--r--  fs/hpfs/inode.c | 2
-rw-r--r--  fs/hpfs/super.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 17
-rw-r--r--  fs/inode.c | 60
-rw-r--r--  fs/isofs/inode.c | 3
-rw-r--r--  fs/jbd2/commit.c | 77
-rw-r--r--  fs/jbd2/journal.c | 10
-rw-r--r--  fs/jbd2/transaction.c | 46
-rw-r--r--  fs/jffs2/compr_rtime.c | 4
-rw-r--r--  fs/jffs2/fs.c | 13
-rw-r--r--  fs/jffs2/nodelist.h | 2
-rw-r--r--  fs/jffs2/nodemgmt.c | 14
-rw-r--r--  fs/jffs2/super.c | 1
-rw-r--r--  fs/jfs/inode.c | 4
-rw-r--r--  fs/jfs/super.c | 1
-rw-r--r--  fs/kernfs/Kconfig | 7
-rw-r--r--  fs/kernfs/dir.c | 753
-rw-r--r--  fs/kernfs/file.c | 22
-rw-r--r--  fs/kernfs/inode.c | 2
-rw-r--r--  fs/kernfs/kernfs-internal.h | 15
-rw-r--r--  fs/kernfs/mount.c | 39
-rw-r--r--  fs/kernfs/symlink.c | 6
-rw-r--r--  fs/locks.c | 389
-rw-r--r--  fs/logfs/readwrite.c | 2
-rw-r--r--  fs/mbcache.c | 540
-rw-r--r--  fs/minix/inode.c | 5
-rw-r--r--  fs/namei.c | 317
-rw-r--r--  fs/ncpfs/inode.c | 3
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r--  fs/nfs/callback_proc.c | 19
-rw-r--r--  fs/nfs/dir.c | 62
-rw-r--r--  fs/nfs/file.c | 1
-rw-r--r--  fs/nfs/inode.c | 36
-rw-r--r--  fs/nfs/internal.h | 8
-rw-r--r--  fs/nfs/nfs3proc.c | 36
-rw-r--r--  fs/nfs/nfs4_fs.h | 11
-rw-r--r--  fs/nfs/nfs4client.c | 7
-rw-r--r--  fs/nfs/nfs4proc.c | 197
-rw-r--r--  fs/nfs/nfs4state.c | 6
-rw-r--r--  fs/nfs/nfs4super.c | 2
-rw-r--r--  fs/nfs/nfs4xdr.c | 3
-rw-r--r--  fs/nfs/pnfs.c | 17
-rw-r--r--  fs/nfs/proc.c | 25
-rw-r--r--  fs/nfs/super.c | 2
-rw-r--r--  fs/nfs/unlink.c | 35
-rw-r--r--  fs/nfsd/auth.c | 5
-rw-r--r--  fs/nfsd/vfs.c | 2
-rw-r--r--  fs/nilfs2/cpfile.c | 12
-rw-r--r--  fs/nilfs2/dat.c | 12
-rw-r--r--  fs/nilfs2/file.c | 1
-rw-r--r--  fs/nilfs2/inode.c | 6
-rw-r--r--  fs/nilfs2/ioctl.c | 137
-rw-r--r--  fs/nilfs2/sufile.c | 295
-rw-r--r--  fs/nilfs2/sufile.h | 2
-rw-r--r--  fs/nilfs2/super.c | 1
-rw-r--r--  fs/nilfs2/the_nilfs.c | 10
-rw-r--r--  fs/notify/fanotify/fanotify.c | 63
-rw-r--r--  fs/notify/fanotify/fanotify.h | 34
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 197
-rw-r--r--  fs/ntfs/debug.c | 58
-rw-r--r--  fs/ntfs/debug.h | 7
-rw-r--r--  fs/ntfs/inode.c | 2
-rw-r--r--  fs/ntfs/super.c | 30
-rw-r--r--  fs/ocfs2/acl.c | 1
-rw-r--r--  fs/ocfs2/alloc.c | 3
-rw-r--r--  fs/ocfs2/aops.c | 7
-rw-r--r--  fs/ocfs2/aops.h | 5
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 2
-rw-r--r--  fs/ocfs2/cluster/sys.c | 2
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 28
-rw-r--r--  fs/ocfs2/dcache.c | 61
-rw-r--r--  fs/ocfs2/dcache.h | 12
-rw-r--r--  fs/ocfs2/dir.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 27
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 29
-rw-r--r--  fs/ocfs2/dlmglue.c | 44
-rw-r--r--  fs/ocfs2/dlmglue.h | 3
-rw-r--r--  fs/ocfs2/file.c | 69
-rw-r--r--  fs/ocfs2/inode.c | 61
-rw-r--r--  fs/ocfs2/inode.h | 17
-rw-r--r--  fs/ocfs2/ioctl.c | 5
-rw-r--r--  fs/ocfs2/journal.c | 6
-rw-r--r--  fs/ocfs2/journal.h | 11
-rw-r--r--  fs/ocfs2/locks.c | 2
-rw-r--r--  fs/ocfs2/move_extents.c | 7
-rw-r--r--  fs/ocfs2/namei.c | 8
-rw-r--r--  fs/ocfs2/ocfs2.h | 33
-rw-r--r--  fs/ocfs2/quota.h | 2
-rw-r--r--  fs/ocfs2/quota_global.c | 35
-rw-r--r--  fs/ocfs2/stackglue.c | 22
-rw-r--r--  fs/ocfs2/suballoc.c | 29
-rw-r--r--  fs/ocfs2/suballoc.h | 4
-rw-r--r--  fs/ocfs2/super.c | 55
-rw-r--r--  fs/ocfs2/sysfile.c | 3
-rw-r--r--  fs/ocfs2/xattr.c | 35
-rw-r--r--  fs/omfs/inode.c | 2
-rw-r--r--  fs/open.c | 29
-rw-r--r--  fs/openpromfs/inode.c | 1
-rw-r--r--  fs/posix_acl.c | 5
-rw-r--r--  fs/proc/Makefile | 1
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  fs/proc/base.c | 19
-rw-r--r--  fs/proc/fd.c | 6
-rw-r--r--  fs/proc/inode.c | 4
-rw-r--r--  fs/proc/internal.h | 7
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/proc_devtree.c | 241
-rw-r--r--  fs/proc/root.c | 5
-rw-r--r--  fs/proc/stat.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 3
-rw-r--r--  fs/proc/uptime.c | 2
-rw-r--r--  fs/proc/vmcore.c | 3
-rw-r--r--  fs/pstore/inode.c | 1
-rw-r--r--  fs/pstore/platform.c | 1
-rw-r--r--  fs/pstore/ram.c | 19
-rw-r--r--  fs/pstore/ram_core.c | 4
-rw-r--r--  fs/qnx4/inode.c | 1
-rw-r--r--  fs/qnx6/inode.c | 1
-rw-r--r--  fs/quota/Kconfig | 7
-rw-r--r--  fs/quota/dquot.c | 4
-rw-r--r--  fs/read_write.c | 36
-rw-r--r--  fs/reiserfs/dir.c | 6
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/reiserfs/reiserfs.h | 1
-rw-r--r--  fs/reiserfs/super.c | 4
-rw-r--r--  fs/romfs/super.c | 1
-rw-r--r--  fs/squashfs/super.c | 1
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/sysfs/Kconfig | 1
-rw-r--r--  fs/sysfs/dir.c | 44
-rw-r--r--  fs/sysfs/file.c | 23
-rw-r--r--  fs/sysfs/group.c | 7
-rw-r--r--  fs/sysfs/mount.c | 2
-rw-r--r--  fs/sysv/inode.c | 3
-rw-r--r--  fs/timerfd.c | 1
-rw-r--r--  fs/ubifs/file.c | 1
-rw-r--r--  fs/ubifs/super.c | 3
-rw-r--r--  fs/udf/inode.c | 4
-rw-r--r--  fs/udf/super.c | 9
-rw-r--r--  fs/ufs/balloc.c | 12
-rw-r--r--  fs/ufs/ialloc.c | 4
-rw-r--r--  fs/ufs/inode.c | 2
-rw-r--r--  fs/ufs/super.c | 9
-rw-r--r--  fs/xfs/kmem.c | 21
-rw-r--r--  fs/xfs/xfs_acl.c | 2
-rw-r--r--  fs/xfs/xfs_ag.h | 6
-rw-r--r--  fs/xfs/xfs_alloc.c | 45
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 16
-rw-r--r--  fs/xfs/xfs_aops.c | 84
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 17
-rw-r--r--  fs/xfs/xfs_attr_remote.c | 15
-rw-r--r--  fs/xfs/xfs_bmap.c | 193
-rw-r--r--  fs/xfs/xfs_bmap.h | 15
-rw-r--r--  fs/xfs/xfs_bmap_btree.c | 16
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 97
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 2
-rw-r--r--  fs/xfs/xfs_btree.c | 14
-rw-r--r--  fs/xfs/xfs_buf.c | 11
-rw-r--r--  fs/xfs/xfs_buf.h | 14
-rw-r--r--  fs/xfs/xfs_buf_item.c | 19
-rw-r--r--  fs/xfs/xfs_da_btree.c | 19
-rw-r--r--  fs/xfs/xfs_dinode.h | 2
-rw-r--r--  fs/xfs/xfs_dir2.c | 342
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 17
-rw-r--r--  fs/xfs/xfs_dir2_data.c | 20
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 17
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 17
-rw-r--r--  fs/xfs/xfs_dquot.c | 2
-rw-r--r--  fs/xfs/xfs_dquot_buf.c | 11
-rw-r--r--  fs/xfs/xfs_error.c | 27
-rw-r--r--  fs/xfs/xfs_error.h | 1
-rw-r--r--  fs/xfs/xfs_file.c | 27
-rw-r--r--  fs/xfs/xfs_format.h | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 36
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 16
-rw-r--r--  fs/xfs/xfs_inode.c | 123
-rw-r--r--  fs/xfs/xfs_inode.h | 12
-rw-r--r--  fs/xfs/xfs_inode_buf.c | 7
-rw-r--r--  fs/xfs/xfs_iomap.c | 10
-rw-r--r--  fs/xfs/xfs_iops.c | 30
-rw-r--r--  fs/xfs/xfs_linux.h | 2
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_log_cil.c | 55
-rw-r--r--  fs/xfs/xfs_mount.c | 3
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 2
-rw-r--r--  fs/xfs/xfs_sb.c | 17
-rw-r--r--  fs/xfs/xfs_sb.h | 2
-rw-r--r--  fs/xfs/xfs_shared.h | 4
-rw-r--r--  fs/xfs/xfs_super.c | 3
-rw-r--r--  fs/xfs/xfs_symlink.c | 9
-rw-r--r--  fs/xfs/xfs_symlink_remote.c | 16
-rw-r--r--  fs/xfs/xfs_trace.h | 1
-rw-r--r--  fs/xfs/xfs_trans.c | 12
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 11
-rw-r--r--  fs/xfs/xfs_trans_resv.c | 82
-rw-r--r--  fs/xfs/xfs_trans_resv.h | 3
398 files changed, 10504 insertions, 6166 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a16b0ff497ca..d8223209d4b1 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -832,6 +832,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
 
 static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
@@ -839,6 +840,7 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
 	.close = v9fs_mmap_vm_close,
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
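The two v9fs hunks wire the generic faultaround handler into 9p's mmap paths. A hedged aside, not part of the patch: .fault brings in a single page on a hard fault, while .map_pages lets the handler also map pages that are already in the page cache around the faulting address, avoiding a separate fault per neighbouring page. A page-cache-backed filesystem opts in roughly like this:

	/* Illustrative vm_ops table; filemap_fault and filemap_map_pages
	 * are the generic mm helpers named in the hunks above. */
	static const struct vm_operations_struct example_file_vm_ops = {
		.fault		= filemap_fault,	/* hard fault: read one page */
		.map_pages	= filemap_map_pages,	/* also map cached neighbours */
		.page_mkwrite	= v9fs_vm_page_mkwrite,	/* per-fs write-fault hook */
	};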
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index bb7991c7e5c7..53161ec058a7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -451,7 +451,7 @@ void v9fs_evict_inode(struct inode *inode)
 {
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	truncate_inode_pages(inode->i_mapping, 0);
+	truncate_inode_pages_final(inode->i_mapping);
 	clear_inode(inode);
 	filemap_fdatawrite(inode->i_mapping);
 
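The truncate_inode_pages(mapping, 0) to truncate_inode_pages_final(mapping) conversion repeats across this diff (affs and afs below follow the same pattern). My reading, hedged: the _final variant is intended only for ->evict_inode(), where no new pages can be added to the mapping any more, so it can take a cheaper path than the general-purpose truncate. The idiom, sketched with a generic filesystem:

	/* Hedged sketch of the eviction idiom, not any one filesystem's code: */
	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages_final(&inode->i_data);	/* eviction-only variant */
		clear_inode(inode);
		/* per-filesystem teardown follows here */
	}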
diff --git a/fs/Kconfig b/fs/Kconfig
index 7385e54be4b9..312393f32948 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -96,6 +96,7 @@ endif # BLOCK
 menu "Pseudo filesystems"
 
 source "fs/proc/Kconfig"
+source "fs/kernfs/Kconfig"
 source "fs/sysfs/Kconfig"
 
 config TMPFS
diff --git a/fs/Makefile b/fs/Makefile
index 47ac07bb4acc..f9cb9876e466 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -52,7 +52,8 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
+obj-$(CONFIG_KERNFS)		+= kernfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7b3003cb6f1b..9852bdf34d76 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -212,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options)
 
 static int adfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return parse_options(sb, data);
 }
@@ -265,7 +266,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					      sizeof(struct adfs_inode_info),
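sync_filesystem(sb) calls like the one added to adfs_remount() recur throughout this series (affs below does the same). Hedged interpretation: responsibility for flushing dirty state on remount moved out of the generic VFS path and into each ->remount_fs() implementation, so every filesystem now syncs itself before the new flags take effect. The resulting handler shape:

	/* Illustrative remount handler; names are generic, not a real fs: */
	static int example_remount(struct super_block *sb, int *flags, char *data)
	{
		sync_filesystem(sb);		/* flush before flags change */
		*flags |= MS_NODIRATIME;	/* per-fs flag fixups, as adfs does */
		return example_parse_options(sb, data);
	}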
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 3952121f2f28..25b23b1e7f22 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -5,14 +5,6 @@
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 
-/* AmigaOS allows file names with up to 30 characters length.
- * Names longer than that will be silently truncated. If you
- * want to disallow this, comment out the following #define.
- * Creating filesystem objects with longer names will then
- * result in an error (ENAMETOOLONG).
- */
-/*#define AFFS_NO_TRUNCATE */
-
 /* Ugly macros make the code more pretty. */
 
 #define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st))))
@@ -28,7 +20,6 @@
 
 #define AFFS_CACHE_SIZE PAGE_SIZE
 
-#define AFFS_MAX_PREALLOC 32
 #define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2)
 #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
 #define AFFS_AC_MASK (AFFS_AC_SIZE-1)
@@ -118,6 +109,7 @@ struct affs_sb_info {
 #define SF_OFS		0x0200	/* Old filesystem */
 #define SF_PREFIX	0x0400	/* Buffer for prefix is allocated */
 #define SF_VERBOSE	0x0800	/* Talk about fs when mounting */
+#define SF_NO_TRUNCATE	0x1000	/* Don't truncate filenames */
 
 /* short cut to get to the affs specific sb data */
 static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
@@ -137,9 +129,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
 extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
 extern umode_t prot_to_mode(u32 prot);
 extern void mode_to_prot(struct inode *inode);
-extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...);
-extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...);
-extern int affs_check_name(const unsigned char *name, int len);
+extern void affs_error(struct super_block *sb, const char *function,
+		       const char *fmt, ...);
+extern void affs_warning(struct super_block *sb, const char *function,
+			 const char *fmt, ...);
+extern bool affs_nofilenametruncate(const struct dentry *dentry);
+extern int affs_check_name(const unsigned char *name, int len,
+			   bool notruncate);
 extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry);
 
 /* bitmap. c */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index d9a43674cb94..533a322c41c0 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -471,20 +471,27 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
 		   function,ErrorBuffer);
 }
 
+bool
+affs_nofilenametruncate(const struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE;
+
+}
+
 /* Check if the name is valid for a affs object. */
 
 int
-affs_check_name(const unsigned char *name, int len)
+affs_check_name(const unsigned char *name, int len, bool notruncate)
 {
 	int i;
 
-	if (len > 30)
-#ifdef AFFS_NO_TRUNCATE
+	if (len > 30) {
+		if (notruncate)
 		return -ENAMETOOLONG;
-#else
+		else
 		len = 30;
-#endif
-
+	}
 	for (i = 0; i < len; i++) {
 		if (name[i] < ' ' || name[i] == ':'
 		    || (name[i] > 0x7e && name[i] < 0xa0))
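Together with the affs.h and super.c hunks, this turns the old compile-time AFFS_NO_TRUNCATE switch into a per-mount choice: mounting with "nofilenametruncate" sets SF_NO_TRUNCATE, and names longer than 30 characters then fail with -ENAMETOOLONG instead of being silently cut. A hedged sketch of a caller of the new API (the call site is illustrative):

	/* Illustrative caller, not a hunk from this diff: */
	int err = affs_check_name(dentry->d_name.name, dentry->d_name.len,
				  affs_nofilenametruncate(dentry));
	if (err)	/* -ENAMETOOLONG only when "nofilenametruncate" was given */
		return err;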
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index f1eba8c3644e..cbbda476a805 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -52,8 +52,10 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	int		 hash_pos;
 	int		 chain_pos;
 	u32		 ino;
+	int		 error = 0;
 
-	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
+	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",
+		 inode->i_ino, (unsigned long)ctx->pos);
 
 	if (ctx->pos < 2) {
 		file->private_data = (void *)0;
@@ -72,7 +74,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	}
 	dir_bh = affs_bread(sb, inode->i_ino);
 	if (!dir_bh)
-		goto readdir_out;
+		goto out_unlock_dir;
 
 	/* If the directory hasn't changed since the last call to readdir(),
 	 * we can jump directly to where we left off.
@@ -88,7 +90,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 		fh_bh = affs_bread(sb, ino);
 		if (!fh_bh) {
 			affs_error(sb, "readdir","Cannot read block %d", i);
-			return -EIO;
+			error = -EIO;
+			goto out_brelse_dir;
 		}
 		ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 		affs_brelse(fh_bh);
@@ -107,29 +110,34 @@ inside:
 		do {
 			fh_bh = affs_bread(sb, ino);
 			if (!fh_bh) {
-				affs_error(sb, "readdir","Cannot read block %d", ino);
+				affs_error(sb, "readdir",
+					   "Cannot read block %d", ino);
 				break;
 			}
 
 			namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
 			name = AFFS_TAIL(sb, fh_bh)->name + 1;
-			pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
+			pr_debug("AFFS: readdir(): dir_emit(\"%.*s\", "
+				 "ino=%u), hash=%d, f_pos=%x\n",
				 namelen, name, ino, hash_pos, (u32)ctx->pos);
+
 			if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
-				goto readdir_done;
+				goto done;
 			ctx->pos++;
 			ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 			affs_brelse(fh_bh);
 			fh_bh = NULL;
 		} while (ino);
 	}
-readdir_done:
+done:
 	file->f_version = inode->i_version;
 	file->private_data = (void *)(long)ino;
+	affs_brelse(fh_bh);
 
-readdir_out:
+out_brelse_dir:
 	affs_brelse(dir_bh);
-	affs_brelse(fh_bh);
+
+out_unlock_dir:
 	affs_unlock_dir(inode);
-	return 0;
+	return error;
 }
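The readdir rework above is a classic goto-unwind conversion: the early return -EIO used to leave the directory locked and leak dir_bh, and the exit path released buffers regardless of which were actually held. Resources are now released at ordered labels, in reverse order of acquisition. A self-contained sketch of the idiom (block numbers and flow are generic, not AFFS's exact logic):

	/* Hedged skeleton of goto-based unwinding: */
	static int example_readdir(struct inode *inode)
	{
		struct buffer_head *dir_bh, *fh_bh;
		int error = 0;

		affs_lock_dir(inode);
		dir_bh = affs_bread(inode->i_sb, inode->i_ino);
		if (!dir_bh) {
			error = -EIO;
			goto out_unlock_dir;	/* nothing else held yet */
		}
		fh_bh = affs_bread(inode->i_sb, inode->i_ino + 1);	/* illustrative block */
		if (!fh_bh) {
			error = -EIO;
			goto out_brelse_dir;	/* dir_bh is held, release it */
		}
		affs_brelse(fh_bh);
	out_brelse_dir:
		affs_brelse(dir_bh);
	out_unlock_dir:
		affs_unlock_dir(inode);
		return error;
	}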
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0e092d08680e..96df91e8c334 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -259,7 +259,7 @@ affs_evict_inode(struct inode *inode)
 {
 	unsigned long cache_page;
 	pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index c36cbb4537a2..6dae1ccd176d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb)
  * Note: the dentry argument is the parent dentry.
  */
 static inline int
-__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
 	int i;
 
-	i = affs_check_name(qstr->name, qstr->len);
+	i = affs_check_name(qstr->name, qstr->len, notruncate);
 	if (i)
 		return i;
 
@@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 static int
 affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
-	return __affs_hash_dentry(qstr, affs_toupper);
+	return __affs_hash_dentry(qstr, affs_toupper,
+				  affs_nofilenametruncate(dentry));
+
 }
+
 static int
 affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
-	return __affs_hash_dentry(qstr, affs_intl_toupper);
+	return __affs_hash_dentry(qstr, affs_intl_toupper,
+				  affs_nofilenametruncate(dentry));
+
 }
 
 static inline int __affs_compare_dentry(unsigned int len,
-		const char *str, const struct qstr *name, toupper_t toupper)
+		const char *str, const struct qstr *name, toupper_t toupper,
+		bool notruncate)
 {
 	const u8 *aname = str;
 	const u8 *bname = name->name;
@@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len,
 	 * must be valid. 'name' must be validated first.
 	 */
 
-	if (affs_check_name(name->name, name->len))
+	if (affs_check_name(name->name, name->len, notruncate))
 		return 1;
 
 	/*
@@ -126,13 +132,18 @@ static int
 affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(len, str, name, affs_toupper);
+
+	return __affs_compare_dentry(len, str, name, affs_toupper,
+				     affs_nofilenametruncate(parent));
 }
+
 static int
 affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
+	return __affs_compare_dentry(len, str, name, affs_intl_toupper,
+				     affs_nofilenametruncate(parent));
+
 }
 
 /*
@@ -411,7 +422,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
 		 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
 
-	retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len);
+	retval = affs_check_name(new_dentry->d_name.name,
+				 new_dentry->d_name.len,
+				 affs_nofilenametruncate(old_dentry));
+
 	if (retval)
 		return retval;
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d098731b82ff..6d589f28bf9b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -128,7 +128,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
 					      sizeof(struct affs_inode_info),
@@ -163,7 +163,7 @@ static const struct super_operations affs_sops = {
 };
 
 enum {
-	Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect,
+	Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
 	Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
 	Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
 };
@@ -172,6 +172,7 @@ static const match_table_t tokens = {
 	{Opt_bs, "bs=%u"},
 	{Opt_mode, "mode=%o"},
 	{Opt_mufs, "mufs"},
+	{Opt_notruncate, "nofilenametruncate"},
 	{Opt_prefix, "prefix=%s"},
 	{Opt_protect, "protect"},
 	{Opt_reserved, "reserved=%u"},
@@ -233,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
 		case Opt_mufs:
 			*mount_opts |= SF_MUFS;
 			break;
+		case Opt_notruncate:
+			*mount_opts |= SF_NO_TRUNCATE;
+			break;
 		case Opt_prefix:
 			*prefix = match_strdup(&args[0]);
 			if (!*prefix)
@@ -530,6 +534,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 
 	pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
 
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 
 	memcpy(volume, sbi->s_volume, 32);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index ce25d755b7aa..294671288449 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -422,7 +422,7 @@ void afs_evict_inode(struct inode *inode)
 
 	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 
 	afs_give_up_callback(vnode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6621f8008122..be75b500005d 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -75,6 +75,7 @@ struct afs_call {
 	const struct afs_call_type *type;	/* type of call */
 	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
 	wait_queue_head_t	waitq;		/* processes awaiting completion */
+	work_func_t		async_workfn;
 	struct work_struct	async_work;	/* asynchronous work processor */
 	struct work_struct	work;		/* actual work processor */
 	struct sk_buff_head	rx_queue;	/* received packets */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8ad8c2a0703a..ef943df73b8c 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -644,7 +644,7 @@ static void afs_process_async_call(struct work_struct *work)
 
 	/* we can't just delete the call because the work item may be
 	 * queued */
-	PREPARE_WORK(&call->async_work, afs_delete_async_call);
+	call->async_workfn = afs_delete_async_call;
 	queue_work(afs_async_calls, &call->async_work);
 }
 
@@ -663,6 +663,13 @@ void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
 	call->reply_size += len;
 }
 
+static void afs_async_workfn(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+	call->async_workfn(work);
+}
+
 /*
  * accept the backlog of incoming calls
  */
@@ -685,7 +692,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
 		return;
 	}
 
-	INIT_WORK(&call->async_work, afs_process_async_call);
+	call->async_workfn = afs_process_async_call;
+	INIT_WORK(&call->async_work, afs_async_workfn);
 	call->wait_mode = &afs_async_incoming_call;
 	call->type = &afs_RXCMxxxx;
 	init_waitqueue_head(&call->waitq);
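Background for the three rxrpc hunks, hedged: PREPARE_WORK(), which let callers swap a work item's handler after INIT_WORK(), was being removed from the workqueue API around this time. The replacement pattern is to initialise the work item once with a fixed trampoline and select the per-call behaviour through an adjacent function pointer:

	/* Sketch of the trampoline pattern the patch adopts (types simplified): */
	struct example_call {
		work_func_t		async_workfn;	/* current handler for this call */
		struct work_struct	async_work;	/* always runs the trampoline */
	};

	static void example_async_workfn(struct work_struct *work)
	{
		struct example_call *call =
			container_of(work, struct example_call, async_work);

		call->async_workfn(work);	/* dispatch to whatever is current */
	}

	/* setup:    INIT_WORK(&call->async_work, example_async_workfn);
	 * retarget: call->async_workfn = some_other_handler; then queue_work(). */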
diff --git a/fs/befs/Makefile b/fs/befs/Makefile
index 2f370bd7a50d..8b9f66642a83 100644
--- a/fs/befs/Makefile
+++ b/fs/befs/Makefile
@@ -3,5 +3,5 @@
 #
 
 obj-$(CONFIG_BEFS_FS) += befs.o
-
+ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG
 befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index b26642839156..3a7813ab8c95 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -88,8 +88,11 @@ enum befs_err {
 
 /****************************/
 /* debug.c */
+__printf(2, 3)
 void befs_error(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
 void befs_warning(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
 void befs_debug(const struct super_block *sb, const char *fmt, ...);
 
 void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
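The __printf(2, 3) annotations mark argument 2 of each function as a printf-style format string and argument 3 as the start of the variadic arguments, so the compiler can type-check every caller. That is what makes the %Lu-to-%llu and %u-to-%zu format fixes in the befs hunks below show up as warnings rather than silent misprints. A short sketch (the second call is a hypothetical mismatch, not code from this diff):

	/* With the annotation, a mismatched caller warns at compile time: */
	__printf(2, 3)
	void befs_debug(const struct super_block *sb, const char *fmt, ...);

	befs_debug(sb, "node at %llu", (unsigned long long)node_off);	/* ok */
	befs_debug(sb, "node at %lu", node_off);	/* warns if node_off is 64-bit */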
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 74e397db0b8b..a2cd305a993a 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -137,7 +137,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 	struct buffer_head *bh = NULL;
 	befs_disk_btree_super *od_sup = NULL;
 
-	befs_debug(sb, "---> befs_btree_read_super()");
+	befs_debug(sb, "---> %s", __func__);
 
 	bh = befs_read_datastream(sb, ds, 0, NULL);
 
@@ -162,11 +162,11 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 		goto error;
 	}
 
-	befs_debug(sb, "<--- befs_btree_read_super()");
+	befs_debug(sb, "<--- %s", __func__);
 	return BEFS_OK;
 
  error:
-	befs_debug(sb, "<--- befs_btree_read_super() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -195,16 +195,16 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
 {
 	uint off = 0;
 
-	befs_debug(sb, "---> befs_bt_read_node()");
+	befs_debug(sb, "---> %s", __func__);
 
 	if (node->bh)
 		brelse(node->bh);
 
 	node->bh = befs_read_datastream(sb, ds, node_off, &off);
 	if (!node->bh) {
-		befs_error(sb, "befs_bt_read_node() failed to read "
-			   "node at %Lu", node_off);
-		befs_debug(sb, "<--- befs_bt_read_node() ERROR");
+		befs_error(sb, "%s failed to read "
+			   "node at %llu", __func__, node_off);
+		befs_debug(sb, "<--- %s ERROR", __func__);
 
 		return BEFS_ERR;
 	}
@@ -221,7 +221,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
 	node->head.all_key_length =
 	    fs16_to_cpu(sb, node->od_node->all_key_length);
 
-	befs_debug(sb, "<--- befs_btree_read_node()");
+	befs_debug(sb, "<--- %s", __func__);
 	return BEFS_OK;
 }
 
@@ -252,7 +252,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	befs_off_t node_off;
 	int res;
 
-	befs_debug(sb, "---> befs_btree_find() Key: %s", key);
+	befs_debug(sb, "---> %s Key: %s", __func__, key);
 
 	if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
 		befs_error(sb,
@@ -263,7 +263,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	this_node = kmalloc(sizeof (befs_btree_node),
 			    GFP_NOFS);
 	if (!this_node) {
-		befs_error(sb, "befs_btree_find() failed to allocate %u "
+		befs_error(sb, "befs_btree_find() failed to allocate %zu "
 			   "bytes of memory", sizeof (befs_btree_node));
 		goto error;
 	}
@@ -274,7 +274,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	node_off = bt_super.root_node_ptr;
 	if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
 		befs_error(sb, "befs_btree_find() failed to read "
-			   "node at %Lu", node_off);
+			   "node at %llu", node_off);
 		goto error_alloc;
 	}
 
@@ -285,7 +285,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 		/* if no match, go to overflow node */
 		if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
 			befs_error(sb, "befs_btree_find() failed to read "
-				   "node at %Lu", node_off);
+				   "node at %llu", node_off);
 			goto error_alloc;
 		}
 	}
@@ -298,11 +298,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	kfree(this_node);
 
 	if (res != BEFS_BT_MATCH) {
-		befs_debug(sb, "<--- befs_btree_find() Key %s not found", key);
+		befs_debug(sb, "<--- %s Key %s not found", __func__, key);
 		*value = 0;
 		return BEFS_BT_NOT_FOUND;
 	}
-	befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu",
+	befs_debug(sb, "<--- %s Found key %s, value %llu", __func__,
 		   key, *value);
 	return BEFS_OK;
 
@@ -310,7 +310,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 	kfree(this_node);
  error:
 	*value = 0;
-	befs_debug(sb, "<--- befs_btree_find() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -343,7 +343,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 	char *thiskey;
 	fs64 *valarray;
 
-	befs_debug(sb, "---> befs_find_key() %s", findkey);
+	befs_debug(sb, "---> %s %s", __func__, findkey);
 
 	*value = 0;
 
@@ -355,7 +355,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 
 	eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len);
 	if (eq < 0) {
-		befs_debug(sb, "<--- befs_find_key() %s not found", findkey);
+		befs_debug(sb, "<--- %s %s not found", __func__, findkey);
 		return BEFS_BT_NOT_FOUND;
 	}
 
@@ -373,8 +373,8 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 				      findkey_len);
 
 		if (eq == 0) {
-			befs_debug(sb, "<--- befs_find_key() found %s at %d",
-				   thiskey, mid);
+			befs_debug(sb, "<--- %s found %s at %d",
+				   __func__, thiskey, mid);
 
 			*value = fs64_to_cpu(sb, valarray[mid]);
 			return BEFS_BT_MATCH;
@@ -388,7 +388,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 		*value = fs64_to_cpu(sb, valarray[mid + 1]);
 	else
 		*value = fs64_to_cpu(sb, valarray[mid]);
-	befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid);
+	befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid);
 	return BEFS_BT_PARMATCH;
 }
 
@@ -428,7 +428,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 
 	uint key_sum = 0;
 
-	befs_debug(sb, "---> befs_btree_read()");
+	befs_debug(sb, "---> %s", __func__);
 
 	if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
 		befs_error(sb,
@@ -437,7 +437,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 	}
 
 	if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
-		befs_error(sb, "befs_btree_read() failed to allocate %u "
+		befs_error(sb, "befs_btree_read() failed to allocate %zu "
 			   "bytes of memory", sizeof (befs_btree_node));
 		goto error;
 	}
@@ -452,7 +452,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 		kfree(this_node);
 		*value = 0;
 		*keysize = 0;
-		befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY");
+		befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
 		return BEFS_BT_EMPTY;
 	} else if (res == BEFS_ERR) {
 		goto error_alloc;
@@ -467,7 +467,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 		*keysize = 0;
 		*value = 0;
 		befs_debug(sb,
-			   "<--- befs_btree_read() END of keys at %Lu",
+			   "<--- %s END of keys at %llu", __func__,
+			   (unsigned long long)
 			   key_sum + this_node->head.all_key_count);
 		brelse(this_node->bh);
 		kfree(this_node);
@@ -478,8 +479,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 		node_off = this_node->head.right;
 
 		if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
-			befs_error(sb, "befs_btree_read() failed to read "
-				   "node at %Lu", node_off);
+			befs_error(sb, "%s failed to read node at %llu",
+				   __func__, (unsigned long long)node_off);
 			goto error_alloc;
 		}
 	}
@@ -492,11 +493,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 
 	keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen);
 
-	befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen);
+	befs_debug(sb, "Read [%llu,%d]: keysize %d",
+		   (long long unsigned int)node_off, (int)cur_key,
+		   (int)keylen);
 
 	if (bufsize < keylen + 1) {
-		befs_error(sb, "befs_btree_read() keybuf too small (%u) "
-			   "for key of size %d", bufsize, keylen);
+		befs_error(sb, "%s keybuf too small (%zu) "
+			   "for key of size %d", __func__, bufsize, keylen);
 		brelse(this_node->bh);
 		goto error_alloc;
 	};
@@ -506,13 +509,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 	*keysize = keylen;
 	keybuf[keylen] = '\0';
 
-	befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off,
+	befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off,
 		   cur_key, keylen, keybuf, *value);
 
 	brelse(this_node->bh);
 	kfree(this_node);
 
-	befs_debug(sb, "<--- befs_btree_read()");
+	befs_debug(sb, "<--- %s", __func__);
 
 	return BEFS_OK;
 
@@ -522,7 +525,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
  error:
 	*keysize = 0;
 	*value = 0;
-	befs_debug(sb, "<--- befs_btree_read() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -547,26 +550,26 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
 		    befs_off_t * node_off)
 {
 
-	befs_debug(sb, "---> befs_btree_seekleaf()");
+	befs_debug(sb, "---> %s", __func__);
 
 	if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
-		befs_error(sb, "befs_btree_seekleaf() failed to read "
-			   "node at %Lu", *node_off);
+		befs_error(sb, "%s failed to read "
+			   "node at %llu", __func__, *node_off);
 		goto error;
 	}
-	befs_debug(sb, "Seekleaf to root node %Lu", *node_off);
+	befs_debug(sb, "Seekleaf to root node %llu", *node_off);
 
 	if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) {
-		befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY");
+		befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
 		return BEFS_BT_EMPTY;
 	}
 
 	while (!befs_leafnode(this_node)) {
 
 		if (this_node->head.all_key_count == 0) {
-			befs_debug(sb, "befs_btree_seekleaf() encountered "
-				   "an empty interior node: %Lu. Using Overflow "
-				   "node: %Lu", *node_off,
+			befs_debug(sb, "%s encountered "
+				   "an empty interior node: %llu. Using Overflow "
+				   "node: %llu", __func__, *node_off,
 				   this_node->head.overflow);
 			*node_off = this_node->head.overflow;
 		} else {
@@ -574,19 +577,19 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
 			*node_off = fs64_to_cpu(sb, valarray[0]);
 		}
 		if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
-			befs_error(sb, "befs_btree_seekleaf() failed to read "
-				   "node at %Lu", *node_off);
+			befs_error(sb, "%s failed to read "
+				   "node at %llu", __func__, *node_off);
 			goto error;
 		}
 
-		befs_debug(sb, "Seekleaf to child node %Lu", *node_off);
+		befs_debug(sb, "Seekleaf to child node %llu", *node_off);
 	}
-	befs_debug(sb, "Node %Lu is a leaf node", *node_off);
+	befs_debug(sb, "Node %llu is a leaf node", *node_off);
 
 	return BEFS_OK;
 
  error:
-	befs_debug(sb, "<--- befs_btree_seekleaf() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
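Two mechanical conversions run through all of the btree.c hunks, and the datastream.c hunks that follow: hard-coded function names in log strings become "%s"/__func__ so the messages survive renames, and the non-standard %Lu length modifier becomes %llu, with explicit unsigned long long or unsigned long casts for befs's typedef'd block and offset types. Distilled into a hedged sketch:

	/* The logging convention the cleanup converges on (illustrative): */
	befs_debug(sb, "---> %s", __func__);	/* no stale function names */
	befs_debug(sb, "node at %llu", (unsigned long long)node_off);
	befs_debug(sb, "block %lu", (unsigned long)blockno);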
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index 59096b5e0fc7..c467bebd50af 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -52,26 +52,25 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds,
 	befs_block_run run;
 	befs_blocknr_t block;	/* block coresponding to pos */
 
-	befs_debug(sb, "---> befs_read_datastream() %Lu", pos);
+	befs_debug(sb, "---> %s %llu", __func__, pos);
 	block = pos >> BEFS_SB(sb)->block_shift;
 	if (off)
 		*off = pos - (block << BEFS_SB(sb)->block_shift);
 
 	if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) {
 		befs_error(sb, "BeFS: Error finding disk addr of block %lu",
-			   block);
-		befs_debug(sb, "<--- befs_read_datastream() ERROR");
+			   (unsigned long)block);
+		befs_debug(sb, "<--- %s ERROR", __func__);
 		return NULL;
 	}
 	bh = befs_bread_iaddr(sb, run);
 	if (!bh) {
 		befs_error(sb, "BeFS: Error reading block %lu from datastream",
-			   block);
+			   (unsigned long)block);
 		return NULL;
 	}
 
-	befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu",
-		   pos);
+	befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos);
 
 	return bh;
 }
@@ -106,7 +105,8 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
 	} else {
 		befs_error(sb,
 			   "befs_fblock2brun() was asked to find block %lu, "
-			   "which is not mapped by the datastream\n", fblock);
+			   "which is not mapped by the datastream\n",
+			   (unsigned long)fblock);
 		err = BEFS_ERR;
 	}
 	return err;
@@ -128,14 +128,14 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
 	befs_off_t bytes_read = 0;	/* bytes readed */
 	u16 plen;
 	struct buffer_head *bh = NULL;
-	befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len);
+	befs_debug(sb, "---> %s length: %llu", __func__, len);
 
 	while (bytes_read < len) {
 		bh = befs_read_datastream(sb, ds, bytes_read, NULL);
 		if (!bh) {
 			befs_error(sb, "BeFS: Error reading datastream block "
-				   "starting from %Lu", bytes_read);
-			befs_debug(sb, "<--- befs_read_lsymlink() ERROR");
+				   "starting from %llu", bytes_read);
+			befs_debug(sb, "<--- %s ERROR", __func__);
 			return bytes_read;
 
 		}
@@ -146,7 +146,8 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
 		bytes_read += plen;
 	}
 
-	befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read);
+	befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int)
+		   bytes_read);
 	return bytes_read;
 }
 
@@ -169,7 +170,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
 	befs_blocknr_t metablocks;	/* FS metadata blocks */
 	befs_sb_info *befs_sb = BEFS_SB(sb);
 
-	befs_debug(sb, "---> befs_count_blocks()");
+	befs_debug(sb, "---> %s", __func__);
 
 	datablocks = ds->size >> befs_sb->block_shift;
 	if (ds->size & (befs_sb->block_size - 1))
@@ -206,7 +207,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
 	}
 
 	blocks = datablocks + metablocks;
-	befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks);
+	befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks);
 
 	return blocks;
 }
@@ -251,11 +252,11 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
 	befs_blocknr_t max_block =
 	    data->max_direct_range >> BEFS_SB(sb)->block_shift;
 
-	befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno);
+	befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
 
 	if (blockno > max_block) {
-		befs_error(sb, "befs_find_brun_direct() passed block outside of"
-			   "direct region");
+		befs_error(sb, "%s passed block outside of direct region",
+			   __func__);
 		return BEFS_ERR;
 	}
 
@@ -267,13 +268,14 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
 			run->start = array[i].start + offset;
 			run->len = array[i].len - offset;
 
-			befs_debug(sb, "---> befs_find_brun_direct(), "
-				   "found %lu at direct[%d]", blockno, i);
+			befs_debug(sb, "---> %s, "
+				   "found %lu at direct[%d]", __func__,
+				   (unsigned long)blockno, i);
 			return BEFS_OK;
 		}
 	}
 
-	befs_debug(sb, "---> befs_find_brun_direct() ERROR");
+	befs_debug(sb, "---> %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -316,7 +318,7 @@ befs_find_brun_indirect(struct super_block *sb,
 	befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
 	int arraylen = befs_iaddrs_per_block(sb);
 
-	befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno);
+	befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
 
 	indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift;
 	search_blk = blockno - indir_start_blk;
@@ -325,10 +327,9 @@ befs_find_brun_indirect(struct super_block *sb,
 	for (i = 0; i < indirect.len; i++) {
 		indirblock = befs_bread(sb, indirblockno + i);
 		if (indirblock == NULL) {
-			befs_debug(sb,
-				   "---> befs_find_brun_indirect() failed to "
-				   "read disk block %lu from the indirect brun",
-				   indirblockno + i);
+			befs_debug(sb, "---> %s failed to read "
+				   "disk block %lu from the indirect brun",
+				   __func__, (unsigned long)indirblockno + i);
 			return BEFS_ERR;
 		}
 
@@ -348,9 +349,10 @@ befs_find_brun_indirect(struct super_block *sb,
 
 				brelse(indirblock);
 				befs_debug(sb,
-					   "<--- befs_find_brun_indirect() found "
-					   "file block %lu at indirect[%d]",
-					   blockno, j + (i * arraylen));
+					   "<--- %s found file block "
+					   "%lu at indirect[%d]", __func__,
+					   (unsigned long)blockno,
+					   j + (i * arraylen));
 				return BEFS_OK;
 			}
 			sum += len;
@@ -360,10 +362,10 @@ befs_find_brun_indirect(struct super_block *sb,
 	}
 
 	/* Only fallthrough is an error */
-	befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find "
-		   "file block %lu", blockno);
+	befs_error(sb, "BeFS: %s failed to find "
+		   "file block %lu", __func__, (unsigned long)blockno);
 
-	befs_debug(sb, "<--- befs_find_brun_indirect() ERROR");
+	befs_debug(sb, "<--- %s ERROR", __func__);
 	return BEFS_ERR;
 }
 
@@ -444,7 +446,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	size_t diblklen = iblklen * befs_iaddrs_per_block(sb)
 	    * BEFS_DBLINDIR_BRUN_LEN;
 
-	befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno);
+	befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno);
 
 	/* First, discover which of the double_indir->indir blocks
 	 * contains pos. Then figure out how much of pos that
@@ -460,8 +462,9 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb);
 	if (dbl_which_block > data->double_indirect.len) {
 		befs_error(sb, "The double-indirect index calculated by "
-			   "befs_read_brun_dblindirect(), %d, is outside the range "
-			   "of the double-indirect block", dblindir_indx);
+			   "%s, %d, is outside the range "
+			   "of the double-indirect block", __func__,
+			   dblindir_indx);
 		return BEFS_ERR;
 	}
 
@@ -469,10 +472,10 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	    befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) +
 		       dbl_which_block);
 	if (dbl_indir_block == NULL) {
-		befs_error(sb, "befs_read_brun_dblindirect() couldn't read the "
-			   "double-indirect block at blockno %lu",
-			   iaddr2blockno(sb,
-					 &data->double_indirect) +
+		befs_error(sb, "%s couldn't read the "
+			   "double-indirect block at blockno %lu", __func__,
+			   (unsigned long)
+			   iaddr2blockno(sb, &data->double_indirect) +
 			   dbl_which_block);
 		brelse(dbl_indir_block);
 		return BEFS_ERR;
@@ -489,16 +492,16 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	which_block = indir_indx / befs_iaddrs_per_block(sb);
 	if (which_block > indir_run.len) {
 		befs_error(sb, "The indirect index calculated by "
-			   "befs_read_brun_dblindirect(), %d, is outside the range "
-			   "of the indirect block", indir_indx);
+			   "%s, %d, is outside the range "
+			   "of the indirect block", __func__, indir_indx);
 		return BEFS_ERR;
 	}
 
 	indir_block =
 	    befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block);
 	if (indir_block == NULL) {
-		befs_error(sb, "befs_read_brun_dblindirect() couldn't read the "
-			   "indirect block at blockno %lu",
+		befs_error(sb, "%s couldn't read the indirect block "
+			   "at blockno %lu", __func__, (unsigned long)
 			   iaddr2blockno(sb, &indir_run) + which_block);
 		brelse(indir_block);
 		return BEFS_ERR;
@@ -519,7 +522,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
519 run->len -= offset; 522 run->len -= offset;
520 523
521 befs_debug(sb, "Found file block %lu in double_indirect[%d][%d]," 524 befs_debug(sb, "Found file block %lu in double_indirect[%d][%d],"
522 " double_indirect_leftover = %lu", 525 " double_indirect_leftover = %lu", (unsigned long)
523 blockno, dblindir_indx, indir_indx, dblindir_leftover); 526 blockno, dblindir_indx, indir_indx, dblindir_leftover);
524 527
525 return BEFS_OK; 528 return BEFS_OK;
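
Every hunk in this file makes the same two mechanical changes: the hard-coded function name in each trace message is replaced by __func__, and befs_blocknr_t values are cast to unsigned long to match the %lu specifier on both 32-bit and 64-bit builds. The idiom in isolation, as a stand-alone sketch (my_debug, my_blocknr_t and find_block are invented stand-ins, not kernel names):

#include <stdio.h>

/*
 * __func__ is supplied by the compiler, so log messages keep tracking
 * the function even after a rename; the cast pins the typedef'd block
 * number to a type %lu can print portably.
 */
typedef unsigned long long my_blocknr_t;   /* stand-in for befs_blocknr_t */

#define my_debug(fmt, ...) \
        fprintf(stderr, "befs: " fmt "\n", ##__VA_ARGS__)

static int find_block(my_blocknr_t blockno)
{
        /* a literal "find_block()" in the string would rot on rename */
        my_debug("---> %s, find %lu", __func__, (unsigned long)blockno);
        return 0;
}

int main(void)
{
        return find_block(42);
}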
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 622e73775c83..4de7cffcd662 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -10,6 +10,7 @@
10 * debug functions 10 * debug functions
11 */ 11 */
12 12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#ifdef __KERNEL__ 14#ifdef __KERNEL__
14 15
15#include <stdarg.h> 16#include <stdarg.h>
@@ -23,43 +24,30 @@
23 24
24#include "befs.h" 25#include "befs.h"
25 26
26#define ERRBUFSIZE 1024
27
28void 27void
29befs_error(const struct super_block *sb, const char *fmt, ...) 28befs_error(const struct super_block *sb, const char *fmt, ...)
30{ 29{
30 struct va_format vaf;
31 va_list args; 31 va_list args;
32 char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
33 if (err_buf == NULL) {
34 printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
35 return;
36 }
37 32
38 va_start(args, fmt); 33 va_start(args, fmt);
39 vsnprintf(err_buf, ERRBUFSIZE, fmt, args); 34 vaf.fmt = fmt;
35 vaf.va = &args;
36 pr_err("(%s): %pV\n", sb->s_id, &vaf);
40 va_end(args); 37 va_end(args);
41
42 printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf);
43 kfree(err_buf);
44} 38}
45 39
46void 40void
47befs_warning(const struct super_block *sb, const char *fmt, ...) 41befs_warning(const struct super_block *sb, const char *fmt, ...)
48{ 42{
43 struct va_format vaf;
49 va_list args; 44 va_list args;
50 char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
51 if (err_buf == NULL) {
52 printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
53 return;
54 }
55 45
56 va_start(args, fmt); 46 va_start(args, fmt);
57 vsnprintf(err_buf, ERRBUFSIZE, fmt, args); 47 vaf.fmt = fmt;
48 vaf.va = &args;
49 pr_warn("(%s): %pV\n", sb->s_id, &vaf);
58 va_end(args); 50 va_end(args);
59
60 printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf);
61
62 kfree(err_buf);
63} 51}
64 52
65void 53void
@@ -67,25 +55,13 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
67{ 55{
68#ifdef CONFIG_BEFS_DEBUG 56#ifdef CONFIG_BEFS_DEBUG
69 57
58 struct va_format vaf;
70 va_list args; 59 va_list args;
71 char *err_buf = NULL; 60 va_start(args, fmt);
72 61 vaf.fmt = fmt;
73 if (BEFS_SB(sb)->mount_opts.debug) { 62 vaf.va = &args;
74 err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); 63 pr_debug("(%s): %pV\n", sb->s_id, &vaf);
75 if (err_buf == NULL) { 64 va_end(args);
76 printk(KERN_ERR "could not allocate %d bytes\n",
77 ERRBUFSIZE);
78 return;
79 }
80
81 va_start(args, fmt);
82 vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
83 va_end(args);
84
85 printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf);
86
87 kfree(err_buf);
88 }
89 65
90#endif //CONFIG_BEFS_DEBUG 66#endif //CONFIG_BEFS_DEBUG
91} 67}
@@ -109,9 +85,9 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
109 befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); 85 befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid));
110 befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); 86 befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode));
111 befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); 87 befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags));
112 befs_debug(sb, " create_time %Lu", 88 befs_debug(sb, " create_time %llu",
113 fs64_to_cpu(sb, inode->create_time)); 89 fs64_to_cpu(sb, inode->create_time));
114 befs_debug(sb, " last_modified_time %Lu", 90 befs_debug(sb, " last_modified_time %llu",
115 fs64_to_cpu(sb, inode->last_modified_time)); 91 fs64_to_cpu(sb, inode->last_modified_time));
116 92
117 tmp_run = fsrun_to_cpu(sb, inode->parent); 93 tmp_run = fsrun_to_cpu(sb, inode->parent);
@@ -137,7 +113,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
137 tmp_run.allocation_group, tmp_run.start, 113 tmp_run.allocation_group, tmp_run.start,
138 tmp_run.len); 114 tmp_run.len);
139 } 115 }
140 befs_debug(sb, " max_direct_range %Lu", 116 befs_debug(sb, " max_direct_range %llu",
141 fs64_to_cpu(sb, 117 fs64_to_cpu(sb,
142 inode->data.datastream. 118 inode->data.datastream.
143 max_direct_range)); 119 max_direct_range));
@@ -147,7 +123,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
147 tmp_run.allocation_group, 123 tmp_run.allocation_group,
148 tmp_run.start, tmp_run.len); 124 tmp_run.start, tmp_run.len);
149 125
150 befs_debug(sb, " max_indirect_range %Lu", 126 befs_debug(sb, " max_indirect_range %llu",
151 fs64_to_cpu(sb, 127 fs64_to_cpu(sb,
152 inode->data.datastream. 128 inode->data.datastream.
153 max_indirect_range)); 129 max_indirect_range));
@@ -158,12 +134,12 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
158 tmp_run.allocation_group, tmp_run.start, 134 tmp_run.allocation_group, tmp_run.start,
159 tmp_run.len); 135 tmp_run.len);
160 136
161 befs_debug(sb, " max_double_indirect_range %Lu", 137 befs_debug(sb, " max_double_indirect_range %llu",
162 fs64_to_cpu(sb, 138 fs64_to_cpu(sb,
163 inode->data.datastream. 139 inode->data.datastream.
164 max_double_indirect_range)); 140 max_double_indirect_range));
165 141
166 befs_debug(sb, " size %Lu", 142 befs_debug(sb, " size %llu",
167 fs64_to_cpu(sb, inode->data.datastream.size)); 143 fs64_to_cpu(sb, inode->data.datastream.size));
168 } 144 }
169 145
@@ -191,8 +167,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
191 befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); 167 befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size));
192 befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); 168 befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift));
193 169
194 befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks)); 170 befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks));
195 befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks)); 171 befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks));
196 172
197 befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); 173 befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2));
198 befs_debug(sb, " blocks_per_ag %u", 174 befs_debug(sb, " blocks_per_ag %u",
@@ -206,8 +182,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
206 befs_debug(sb, " log_blocks %u, %hu, %hu", 182 befs_debug(sb, " log_blocks %u, %hu, %hu",
207 tmp_run.allocation_group, tmp_run.start, tmp_run.len); 183 tmp_run.allocation_group, tmp_run.start, tmp_run.len);
208 184
209 befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start)); 185 befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start));
210 befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end)); 186 befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end));
211 187
212 befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); 188 befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3));
213 189
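
The befs_error()/befs_warning()/befs_debug() rewrite above drops the ERRBUFSIZE kmalloc/vsnprintf/kfree round-trip in favor of struct va_format and printk's %pV specifier, which formats the wrapped va_list directly, with no intermediate buffer and no allocation-failure path. A user-space analogue of the same forwarding shape, using vfprintf since %pV exists only inside printk (my_error is an invented name):

#include <stdarg.h>
#include <stdio.h>

/*
 * Forward the caller's format and va_list straight to the sink instead
 * of rendering into a temporary heap buffer first.
 */
static void my_error(const char *volume, const char *fmt, ...)
{
        va_list args;

        va_start(args, fmt);
        fprintf(stderr, "befs (%s): ", volume);
        vfprintf(stderr, fmt, args);    /* no ERRBUFSIZE buffer needed */
        fputc('\n', stderr);
        va_end(args);
}

int main(void)
{
        my_error("sda1", "failed to read block %lu", 42UL);
        return 0;
}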
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index 94c17f9a9576..fa4b718de597 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -25,7 +25,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
25 /* check magic header. */ 25 /* check magic header. */
26 if (magic1 != BEFS_INODE_MAGIC1) { 26 if (magic1 != BEFS_INODE_MAGIC1) {
27 befs_error(sb, 27 befs_error(sb,
28 "Inode has a bad magic header - inode = %lu", inode); 28 "Inode has a bad magic header - inode = %lu",
29 (unsigned long)inode);
29 return BEFS_BAD_INODE; 30 return BEFS_BAD_INODE;
30 } 31 }
31 32
@@ -34,8 +35,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
34 */ 35 */
35 if (inode != iaddr2blockno(sb, &ino_num)) { 36 if (inode != iaddr2blockno(sb, &ino_num)) {
36 befs_error(sb, "inode blocknr field disagrees with vfs " 37 befs_error(sb, "inode blocknr field disagrees with vfs "
37 "VFS: %lu, Inode %lu", 38 "VFS: %lu, Inode %lu", (unsigned long)
38 inode, iaddr2blockno(sb, &ino_num)); 39 inode, (unsigned long)iaddr2blockno(sb, &ino_num));
39 return BEFS_BAD_INODE; 40 return BEFS_BAD_INODE;
40 } 41 }
41 42
@@ -44,7 +45,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
44 */ 45 */
45 46
46 if (!(flags & BEFS_INODE_IN_USE)) { 47 if (!(flags & BEFS_INODE_IN_USE)) {
47 befs_error(sb, "inode is not used - inode = %lu", inode); 48 befs_error(sb, "inode is not used - inode = %lu",
49 (unsigned long)inode);
48 return BEFS_BAD_INODE; 50 return BEFS_BAD_INODE;
49 } 51 }
50 52
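
befs_check_inode() accepts an on-disk inode only if its magic matches, its self-reported block number agrees with where the VFS actually read it, and its in-use flag is set; anything else is BEFS_BAD_INODE. The three checks in a stand-alone sketch (struct raw_inode and both constants are illustrative, not the on-disk format):

#include <stdio.h>

#define MY_MAGIC  0x42424242u   /* invented; stands in for BEFS_INODE_MAGIC1 */
#define MY_IN_USE 0x00000001u

struct raw_inode {
        unsigned magic;
        unsigned long blockno;
        unsigned flags;
};

static int check_inode(const struct raw_inode *ri, unsigned long vfs_block)
{
        if (ri->magic != MY_MAGIC)
                return -1;              /* bad magic header */
        if (ri->blockno != vfs_block)
                return -1;              /* blocknr disagrees with the VFS */
        if (!(ri->flags & MY_IN_USE))
                return -1;              /* inode not in use */
        return 0;
}

int main(void)
{
        struct raw_inode ri = { MY_MAGIC, 42, MY_IN_USE };

        return check_inode(&ri, 42);
}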
diff --git a/fs/befs/io.c b/fs/befs/io.c
index ddef98aa255d..0408a3d601d0 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -30,9 +30,9 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
30 befs_blocknr_t block = 0; 30 befs_blocknr_t block = 0;
31 befs_sb_info *befs_sb = BEFS_SB(sb); 31 befs_sb_info *befs_sb = BEFS_SB(sb);
32 32
33 befs_debug(sb, "---> Enter befs_read_iaddr() " 33 befs_debug(sb, "---> Enter %s "
34 "[%u, %hu, %hu]", 34 "[%u, %hu, %hu]", __func__, iaddr.allocation_group,
35 iaddr.allocation_group, iaddr.start, iaddr.len); 35 iaddr.start, iaddr.len);
36 36
37 if (iaddr.allocation_group > befs_sb->num_ags) { 37 if (iaddr.allocation_group > befs_sb->num_ags) {
38 befs_error(sb, "BEFS: Invalid allocation group %u, max is %u", 38 befs_error(sb, "BEFS: Invalid allocation group %u, max is %u",
@@ -42,20 +42,21 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
42 42
43 block = iaddr2blockno(sb, &iaddr); 43 block = iaddr2blockno(sb, &iaddr);
44 44
45 befs_debug(sb, "befs_read_iaddr: offset = %lu", block); 45 befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block);
46 46
47 bh = sb_bread(sb, block); 47 bh = sb_bread(sb, block);
48 48
49 if (bh == NULL) { 49 if (bh == NULL) {
50 befs_error(sb, "Failed to read block %lu", block); 50 befs_error(sb, "Failed to read block %lu",
51 (unsigned long)block);
51 goto error; 52 goto error;
52 } 53 }
53 54
54 befs_debug(sb, "<--- befs_read_iaddr()"); 55 befs_debug(sb, "<--- %s", __func__);
55 return bh; 56 return bh;
56 57
57 error: 58 error:
58 befs_debug(sb, "<--- befs_read_iaddr() ERROR"); 59 befs_debug(sb, "<--- %s ERROR", __func__);
59 return NULL; 60 return NULL;
60} 61}
61 62
@@ -64,20 +65,21 @@ befs_bread(struct super_block *sb, befs_blocknr_t block)
64{ 65{
65 struct buffer_head *bh = NULL; 66 struct buffer_head *bh = NULL;
66 67
67 befs_debug(sb, "---> Enter befs_read() %Lu", block); 68 befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
68 69
69 bh = sb_bread(sb, block); 70 bh = sb_bread(sb, block);
70 71
71 if (bh == NULL) { 72 if (bh == NULL) {
72 befs_error(sb, "Failed to read block %lu", block); 73 befs_error(sb, "Failed to read block %lu",
74 (unsigned long)block);
73 goto error; 75 goto error;
74 } 76 }
75 77
76 befs_debug(sb, "<--- befs_read()"); 78 befs_debug(sb, "<--- %s", __func__);
77 79
78 return bh; 80 return bh;
79 81
80 error: 82 error:
81 befs_debug(sb, "<--- befs_read() ERROR"); 83 befs_debug(sb, "<--- %s ERROR", __func__);
82 return NULL; 84 return NULL;
83} 85}
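
Both helpers in this file follow one control-flow template: trace entry, attempt the read, branch to a single labelled error path on NULL, trace exit on both paths. Sketched in isolation below (read_block and my_bread are stand-ins for sb_bread and befs_bread):

#include <stdio.h>
#include <stdlib.h>

static void *read_block(unsigned long block)
{
        (void)block;            /* a real implementation would seek here */
        return malloc(512);     /* pretend to read one 512-byte block */
}

static void *my_bread(unsigned long block)
{
        void *bh;

        fprintf(stderr, "---> %s %lu\n", __func__, block);
        bh = read_block(block);
        if (bh == NULL)
                goto error;

        fprintf(stderr, "<--- %s\n", __func__);
        return bh;

error:
        /* single exit point for the failure case, traced like success */
        fprintf(stderr, "<--- %s ERROR\n", __func__);
        return NULL;
}

int main(void)
{
        free(my_bread(7));
        return 0;
}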
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 845d2d690ce2..d626756ff721 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -5,6 +5,8 @@
5 * 5 *
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/module.h> 10#include <linux/module.h>
9#include <linux/slab.h> 11#include <linux/slab.h>
10#include <linux/fs.h> 12#include <linux/fs.h>
@@ -39,7 +41,6 @@ static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int)
39static struct inode *befs_iget(struct super_block *, unsigned long); 41static struct inode *befs_iget(struct super_block *, unsigned long);
40static struct inode *befs_alloc_inode(struct super_block *sb); 42static struct inode *befs_alloc_inode(struct super_block *sb);
41static void befs_destroy_inode(struct inode *inode); 43static void befs_destroy_inode(struct inode *inode);
42static int befs_init_inodecache(void);
43static void befs_destroy_inodecache(void); 44static void befs_destroy_inodecache(void);
44static void *befs_follow_link(struct dentry *, struct nameidata *); 45static void *befs_follow_link(struct dentry *, struct nameidata *);
45static void *befs_fast_follow_link(struct dentry *, struct nameidata *); 46static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
@@ -131,26 +132,28 @@ befs_get_block(struct inode *inode, sector_t block,
131 ulong disk_off; 132 ulong disk_off;
132 133
133 befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", 134 befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld",
134 inode->i_ino, block); 135 (unsigned long)inode->i_ino, (long)block);
135 136
136 if (block < 0) { 137 if (block < 0) {
137 befs_error(sb, "befs_get_block() was asked for a block " 138 befs_error(sb, "befs_get_block() was asked for a block "
138 "number less than zero: block %ld in inode %lu", 139 "number less than zero: block %ld in inode %lu",
139 block, inode->i_ino); 140 (long)block, (unsigned long)inode->i_ino);
140 return -EIO; 141 return -EIO;
141 } 142 }
142 143
143 if (create) { 144 if (create) {
144 befs_error(sb, "befs_get_block() was asked to write to " 145 befs_error(sb, "befs_get_block() was asked to write to "
145 "block %ld in inode %lu", block, inode->i_ino); 146 "block %ld in inode %lu", (long)block,
147 (unsigned long)inode->i_ino);
146 return -EPERM; 148 return -EPERM;
147 } 149 }
148 150
149 res = befs_fblock2brun(sb, ds, block, &run); 151 res = befs_fblock2brun(sb, ds, block, &run);
150 if (res != BEFS_OK) { 152 if (res != BEFS_OK) {
151 befs_error(sb, 153 befs_error(sb,
152 "<--- befs_get_block() for inode %lu, block " 154 "<--- %s for inode %lu, block %ld ERROR",
153 "%ld ERROR", inode->i_ino, block); 155 __func__, (unsigned long)inode->i_ino,
156 (long)block);
154 return -EFBIG; 157 return -EFBIG;
155 } 158 }
156 159
@@ -158,8 +161,9 @@ befs_get_block(struct inode *inode, sector_t block,
158 161
159 map_bh(bh_result, inode->i_sb, disk_off); 162 map_bh(bh_result, inode->i_sb, disk_off);
160 163
161 befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, " 164 befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu",
162 "disk address %lu", inode->i_ino, block, disk_off); 165 __func__, (unsigned long)inode->i_ino, (long)block,
166 (unsigned long)disk_off);
163 167
164 return 0; 168 return 0;
165} 169}
@@ -176,15 +180,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
176 char *utfname; 180 char *utfname;
177 const char *name = dentry->d_name.name; 181 const char *name = dentry->d_name.name;
178 182
179 befs_debug(sb, "---> befs_lookup() " 183 befs_debug(sb, "---> %s name %s inode %ld", __func__,
180 "name %s inode %ld", dentry->d_name.name, dir->i_ino); 184 dentry->d_name.name, dir->i_ino);
181 185
182 /* Convert to UTF-8 */ 186 /* Convert to UTF-8 */
183 if (BEFS_SB(sb)->nls) { 187 if (BEFS_SB(sb)->nls) {
184 ret = 188 ret =
185 befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); 189 befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen);
186 if (ret < 0) { 190 if (ret < 0) {
187 befs_debug(sb, "<--- befs_lookup() ERROR"); 191 befs_debug(sb, "<--- %s ERROR", __func__);
188 return ERR_PTR(ret); 192 return ERR_PTR(ret);
189 } 193 }
190 ret = befs_btree_find(sb, ds, utfname, &offset); 194 ret = befs_btree_find(sb, ds, utfname, &offset);
@@ -195,12 +199,12 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
195 } 199 }
196 200
197 if (ret == BEFS_BT_NOT_FOUND) { 201 if (ret == BEFS_BT_NOT_FOUND) {
198 befs_debug(sb, "<--- befs_lookup() %s not found", 202 befs_debug(sb, "<--- %s %s not found", __func__,
199 dentry->d_name.name); 203 dentry->d_name.name);
200 return ERR_PTR(-ENOENT); 204 return ERR_PTR(-ENOENT);
201 205
202 } else if (ret != BEFS_OK || offset == 0) { 206 } else if (ret != BEFS_OK || offset == 0) {
203 befs_warning(sb, "<--- befs_lookup() Error"); 207 befs_warning(sb, "<--- %s Error", __func__);
204 return ERR_PTR(-ENODATA); 208 return ERR_PTR(-ENODATA);
205 } 209 }
206 210
@@ -210,7 +214,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
210 214
211 d_add(dentry, inode); 215 d_add(dentry, inode);
212 216
213 befs_debug(sb, "<--- befs_lookup()"); 217 befs_debug(sb, "<--- %s", __func__);
214 218
215 return NULL; 219 return NULL;
216} 220}
@@ -228,26 +232,25 @@ befs_readdir(struct file *file, struct dir_context *ctx)
228 char keybuf[BEFS_NAME_LEN + 1]; 232 char keybuf[BEFS_NAME_LEN + 1];
229 const char *dirname = file->f_path.dentry->d_name.name; 233 const char *dirname = file->f_path.dentry->d_name.name;
230 234
231 befs_debug(sb, "---> befs_readdir() " 235 befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld",
232 "name %s, inode %ld, ctx->pos %Ld", 236 __func__, dirname, inode->i_ino, ctx->pos);
233 dirname, inode->i_ino, ctx->pos);
234 237
235more: 238more:
236 result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, 239 result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
237 keybuf, &keysize, &value); 240 keybuf, &keysize, &value);
238 241
239 if (result == BEFS_ERR) { 242 if (result == BEFS_ERR) {
240 befs_debug(sb, "<--- befs_readdir() ERROR"); 243 befs_debug(sb, "<--- %s ERROR", __func__);
241 befs_error(sb, "IO error reading %s (inode %lu)", 244 befs_error(sb, "IO error reading %s (inode %lu)",
242 dirname, inode->i_ino); 245 dirname, inode->i_ino);
243 return -EIO; 246 return -EIO;
244 247
245 } else if (result == BEFS_BT_END) { 248 } else if (result == BEFS_BT_END) {
246 befs_debug(sb, "<--- befs_readdir() END"); 249 befs_debug(sb, "<--- %s END", __func__);
247 return 0; 250 return 0;
248 251
249 } else if (result == BEFS_BT_EMPTY) { 252 } else if (result == BEFS_BT_EMPTY) {
250 befs_debug(sb, "<--- befs_readdir() Empty directory"); 253 befs_debug(sb, "<--- %s Empty directory", __func__);
251 return 0; 254 return 0;
252 } 255 }
253 256
@@ -260,7 +263,7 @@ more:
260 result = 263 result =
261 befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); 264 befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
262 if (result < 0) { 265 if (result < 0) {
263 befs_debug(sb, "<--- befs_readdir() ERROR"); 266 befs_debug(sb, "<--- %s ERROR", __func__);
264 return result; 267 return result;
265 } 268 }
266 if (!dir_emit(ctx, nlsname, nlsnamelen, 269 if (!dir_emit(ctx, nlsname, nlsnamelen,
@@ -277,7 +280,7 @@ more:
277 ctx->pos++; 280 ctx->pos++;
278 goto more; 281 goto more;
279 282
280 befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); 283 befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos);
281 284
282 return 0; 285 return 0;
283} 286}
@@ -321,7 +324,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
321 struct inode *inode; 324 struct inode *inode;
322 long ret = -EIO; 325 long ret = -EIO;
323 326
324 befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); 327 befs_debug(sb, "---> %s inode = %lu", __func__, ino);
325 328
326 inode = iget_locked(sb, ino); 329 inode = iget_locked(sb, ino);
327 if (!inode) 330 if (!inode)
@@ -428,7 +431,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
428 } 431 }
429 432
430 brelse(bh); 433 brelse(bh);
431 befs_debug(sb, "<--- befs_read_inode()"); 434 befs_debug(sb, "<--- %s", __func__);
432 unlock_new_inode(inode); 435 unlock_new_inode(inode);
433 return inode; 436 return inode;
434 437
@@ -437,7 +440,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
437 440
438 unacquire_none: 441 unacquire_none:
439 iget_failed(inode); 442 iget_failed(inode);
440 befs_debug(sb, "<--- befs_read_inode() - Bad inode"); 443 befs_debug(sb, "<--- %s - Bad inode", __func__);
441 return ERR_PTR(ret); 444 return ERR_PTR(ret);
442} 445}
443 446
@@ -445,7 +448,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
445 * 448 *
446 * Taken from NFS implementation by Al Viro. 449 * Taken from NFS implementation by Al Viro.
447 */ 450 */
448static int 451static int __init
449befs_init_inodecache(void) 452befs_init_inodecache(void)
450{ 453{
451 befs_inode_cachep = kmem_cache_create("befs_inode_cache", 454 befs_inode_cachep = kmem_cache_create("befs_inode_cache",
@@ -454,11 +457,9 @@ befs_init_inodecache(void)
454 SLAB_MEM_SPREAD), 457 SLAB_MEM_SPREAD),
455 init_once); 458 init_once);
456 if (befs_inode_cachep == NULL) { 459 if (befs_inode_cachep == NULL) {
457 printk(KERN_ERR "befs_init_inodecache: " 460 pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
458 "Couldn't initialize inode slabcache\n");
459 return -ENOMEM; 461 return -ENOMEM;
460 } 462 }
461
462 return 0; 463 return 0;
463} 464}
464 465
@@ -544,16 +545,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
544 */ 545 */
545 int maxlen = in_len + 1; 546 int maxlen = in_len + 1;
546 547
547 befs_debug(sb, "---> utf2nls()"); 548 befs_debug(sb, "---> %s", __func__);
548 549
549 if (!nls) { 550 if (!nls) {
550 befs_error(sb, "befs_utf2nls called with no NLS table loaded"); 551 befs_error(sb, "%s called with no NLS table loaded", __func__);
551 return -EINVAL; 552 return -EINVAL;
552 } 553 }
553 554
554 *out = result = kmalloc(maxlen, GFP_NOFS); 555 *out = result = kmalloc(maxlen, GFP_NOFS);
555 if (!*out) { 556 if (!*out) {
556 befs_error(sb, "befs_utf2nls() cannot allocate memory"); 557 befs_error(sb, "%s cannot allocate memory", __func__);
557 *out_len = 0; 558 *out_len = 0;
558 return -ENOMEM; 559 return -ENOMEM;
559 } 560 }
@@ -575,14 +576,14 @@ befs_utf2nls(struct super_block *sb, const char *in,
575 result[o] = '\0'; 576 result[o] = '\0';
576 *out_len = o; 577 *out_len = o;
577 578
578 befs_debug(sb, "<--- utf2nls()"); 579 befs_debug(sb, "<--- %s", __func__);
579 580
580 return o; 581 return o;
581 582
582 conv_err: 583 conv_err:
583 befs_error(sb, "Name using character set %s contains a character that " 584 befs_error(sb, "Name using character set %s contains a character that "
584 "cannot be converted to unicode.", nls->charset); 585 "cannot be converted to unicode.", nls->charset);
585 befs_debug(sb, "<--- utf2nls()"); 586 befs_debug(sb, "<--- %s", __func__);
586 kfree(result); 587 kfree(result);
587 return -EILSEQ; 588 return -EILSEQ;
588} 589}
@@ -623,16 +624,17 @@ befs_nls2utf(struct super_block *sb, const char *in,
623 * in special cases */ 624 * in special cases */
624 int maxlen = (3 * in_len) + 1; 625 int maxlen = (3 * in_len) + 1;
625 626
626 befs_debug(sb, "---> nls2utf()\n"); 627 befs_debug(sb, "---> %s\n", __func__);
627 628
628 if (!nls) { 629 if (!nls) {
629 befs_error(sb, "befs_nls2utf called with no NLS table loaded."); 630 befs_error(sb, "%s called with no NLS table loaded.",
631 __func__);
630 return -EINVAL; 632 return -EINVAL;
631 } 633 }
632 634
633 *out = result = kmalloc(maxlen, GFP_NOFS); 635 *out = result = kmalloc(maxlen, GFP_NOFS);
634 if (!*out) { 636 if (!*out) {
635 befs_error(sb, "befs_nls2utf() cannot allocate memory"); 637 befs_error(sb, "%s cannot allocate memory", __func__);
636 *out_len = 0; 638 *out_len = 0;
637 return -ENOMEM; 639 return -ENOMEM;
638 } 640 }
@@ -653,14 +655,14 @@ befs_nls2utf(struct super_block *sb, const char *in,
653 result[o] = '\0'; 655 result[o] = '\0';
654 *out_len = o; 656 *out_len = o;
655 657
656 befs_debug(sb, "<--- nls2utf()"); 658 befs_debug(sb, "<--- %s", __func__);
657 659
658 return i; 660 return i;
659 661
660 conv_err: 662 conv_err:
661 befs_error(sb, "Name using charecter set %s contains a charecter that " 663 befs_error(sb, "Name using charecter set %s contains a charecter that "
662 "cannot be converted to unicode.", nls->charset); 664 "cannot be converted to unicode.", nls->charset);
663 befs_debug(sb, "<--- nls2utf()"); 665 befs_debug(sb, "<--- %s", __func__);
664 kfree(result); 666 kfree(result);
665 return -EILSEQ; 667 return -EILSEQ;
666} 668}
@@ -715,8 +717,8 @@ parse_options(char *options, befs_mount_options * opts)
715 if (option >= 0) 717 if (option >= 0)
716 uid = make_kuid(current_user_ns(), option); 718 uid = make_kuid(current_user_ns(), option);
717 if (!uid_valid(uid)) { 719 if (!uid_valid(uid)) {
718 printk(KERN_ERR "BeFS: Invalid uid %d, " 720 pr_err("Invalid uid %d, "
719 "using default\n", option); 721 "using default\n", option);
720 break; 722 break;
721 } 723 }
722 opts->uid = uid; 724 opts->uid = uid;
@@ -729,8 +731,8 @@ parse_options(char *options, befs_mount_options * opts)
729 if (option >= 0) 731 if (option >= 0)
730 gid = make_kgid(current_user_ns(), option); 732 gid = make_kgid(current_user_ns(), option);
731 if (!gid_valid(gid)) { 733 if (!gid_valid(gid)) {
732 printk(KERN_ERR "BeFS: Invalid gid %d, " 734 pr_err("Invalid gid %d, "
733 "using default\n", option); 735 "using default\n", option);
734 break; 736 break;
735 } 737 }
736 opts->gid = gid; 738 opts->gid = gid;
@@ -740,8 +742,8 @@ parse_options(char *options, befs_mount_options * opts)
740 kfree(opts->iocharset); 742 kfree(opts->iocharset);
741 opts->iocharset = match_strdup(&args[0]); 743 opts->iocharset = match_strdup(&args[0]);
742 if (!opts->iocharset) { 744 if (!opts->iocharset) {
743 printk(KERN_ERR "BeFS: allocation failure for " 745 pr_err("allocation failure for "
744 "iocharset string\n"); 746 "iocharset string\n");
745 return 0; 747 return 0;
746 } 748 }
747 break; 749 break;
@@ -749,8 +751,8 @@ parse_options(char *options, befs_mount_options * opts)
749 opts->debug = 1; 751 opts->debug = 1;
750 break; 752 break;
751 default: 753 default:
752 printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" " 754 pr_err("Unrecognized mount option \"%s\" "
753 "or missing value\n", p); 755 "or missing value\n", p);
754 return 0; 756 return 0;
755 } 757 }
756 } 758 }
@@ -791,22 +793,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
791 793
792 save_mount_options(sb, data); 794 save_mount_options(sb, data);
793 795
794 sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); 796 sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
795 if (sb->s_fs_info == NULL) { 797 if (sb->s_fs_info == NULL) {
796 printk(KERN_ERR 798 pr_err("(%s): Unable to allocate memory for private "
797 "BeFS(%s): Unable to allocate memory for private "
798 "portion of superblock. Bailing.\n", sb->s_id); 799 "portion of superblock. Bailing.\n", sb->s_id);
799 goto unacquire_none; 800 goto unacquire_none;
800 } 801 }
801 befs_sb = BEFS_SB(sb); 802 befs_sb = BEFS_SB(sb);
802 memset(befs_sb, 0, sizeof(befs_sb_info));
803 803
804 if (!parse_options((char *) data, &befs_sb->mount_opts)) { 804 if (!parse_options((char *) data, &befs_sb->mount_opts)) {
805 befs_error(sb, "cannot parse mount options"); 805 befs_error(sb, "cannot parse mount options");
806 goto unacquire_priv_sbp; 806 goto unacquire_priv_sbp;
807 } 807 }
808 808
809 befs_debug(sb, "---> befs_fill_super()"); 809 befs_debug(sb, "---> %s", __func__);
810 810
811#ifndef CONFIG_BEFS_RW 811#ifndef CONFIG_BEFS_RW
812 if (!(sb->s_flags & MS_RDONLY)) { 812 if (!(sb->s_flags & MS_RDONLY)) {
@@ -854,7 +854,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
854 goto unacquire_priv_sbp; 854 goto unacquire_priv_sbp;
855 855
856 if( befs_sb->num_blocks > ~((sector_t)0) ) { 856 if( befs_sb->num_blocks > ~((sector_t)0) ) {
857 befs_error(sb, "blocks count: %Lu " 857 befs_error(sb, "blocks count: %llu "
858 "is larger than the host can use", 858 "is larger than the host can use",
859 befs_sb->num_blocks); 859 befs_sb->num_blocks);
860 goto unacquire_priv_sbp; 860 goto unacquire_priv_sbp;
@@ -913,6 +913,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
913static int 913static int
914befs_remount(struct super_block *sb, int *flags, char *data) 914befs_remount(struct super_block *sb, int *flags, char *data)
915{ 915{
916 sync_filesystem(sb);
916 if (!(*flags & MS_RDONLY)) 917 if (!(*flags & MS_RDONLY))
917 return -EINVAL; 918 return -EINVAL;
918 return 0; 919 return 0;
@@ -924,7 +925,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
924 struct super_block *sb = dentry->d_sb; 925 struct super_block *sb = dentry->d_sb;
925 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 926 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
926 927
927 befs_debug(sb, "---> befs_statfs()"); 928 befs_debug(sb, "---> %s", __func__);
928 929
929 buf->f_type = BEFS_SUPER_MAGIC; 930 buf->f_type = BEFS_SUPER_MAGIC;
930 buf->f_bsize = sb->s_blocksize; 931 buf->f_bsize = sb->s_blocksize;
@@ -937,7 +938,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
937 buf->f_fsid.val[1] = (u32)(id >> 32); 938 buf->f_fsid.val[1] = (u32)(id >> 32);
938 buf->f_namelen = BEFS_NAME_LEN; 939 buf->f_namelen = BEFS_NAME_LEN;
939 940
940 befs_debug(sb, "<--- befs_statfs()"); 941 befs_debug(sb, "<--- %s", __func__);
941 942
942 return 0; 943 return 0;
943} 944}
@@ -963,7 +964,7 @@ init_befs_fs(void)
963{ 964{
964 int err; 965 int err;
965 966
966 printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION); 967 pr_info("version: %s\n", BEFS_VERSION);
967 968
968 err = befs_init_inodecache(); 969 err = befs_init_inodecache();
969 if (err) 970 if (err)
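
Most of the message cleanups above lean on the pr_fmt define added at the top of the file: the pr_err()/pr_info() family expands its format through pr_fmt(), so one KBUILD_MODNAME prefix replaces every hand-written "BeFS: " string. A minimal stand-alone imitation (pr_err is re-implemented over fprintf here only to show the expansion; in the kernel it comes from printk.h):

#include <stdio.h>

#define KBUILD_MODNAME "befs"
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define pr_err(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        /* prints "befs: Invalid uid 99, using default" */
        pr_err("Invalid uid %d, using default\n", 99);
        return 0;
}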
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8defc6b3f9a2..7041ac35ace8 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -172,7 +172,7 @@ static void bfs_evict_inode(struct inode *inode)
172 172
173 dprintf("ino=%08lx\n", ino); 173 dprintf("ino=%08lx\n", ino);
174 174
175 truncate_inode_pages(&inode->i_data, 0); 175 truncate_inode_pages_final(&inode->i_data);
176 invalidate_inode_buffers(inode); 176 invalidate_inode_buffers(inode);
177 clear_inode(inode); 177 clear_inode(inode);
178 178
@@ -266,7 +266,7 @@ static void init_once(void *foo)
266 inode_init_once(&bi->vfs_inode); 266 inode_init_once(&bi->vfs_inode);
267} 267}
268 268
269static int init_inodecache(void) 269static int __init init_inodecache(void)
270{ 270{
271 bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", 271 bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
272 sizeof(struct bfs_inode_info), 272 sizeof(struct bfs_inode_info),
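
Tagging init_inodecache() with __init lets the kernel reclaim the function's text once initialization has run, since nothing calls it afterwards. User space has no .init.text reclaim, but the section-placement mechanism behind the annotation can be imitated (my_init and the section name are invented; inspect the binary with objdump -t to see the placement):

#include <stdio.h>

#define my_init __attribute__((section(".init.demo")))

/* lives in a dedicated section, like kernel __init code in .init.text */
static int my_init init_inodecache(void)
{
        puts("cache initialized");
        return 0;
}

int main(void)
{
        return init_inodecache();
}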
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 67be2951b98a..aa3cb626671e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,10 +46,15 @@
46#endif 46#endif
47 47
48static int load_elf_binary(struct linux_binprm *bprm); 48static int load_elf_binary(struct linux_binprm *bprm);
49static int load_elf_library(struct file *);
50static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, 49static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
51 int, int, unsigned long); 50 int, int, unsigned long);
52 51
52#ifdef CONFIG_USELIB
53static int load_elf_library(struct file *);
54#else
55#define load_elf_library NULL
56#endif
57
53/* 58/*
54 * If we don't support core dumping, then supply a NULL so we 59 * If we don't support core dumping, then supply a NULL so we
55 * don't even try. 60 * don't even try.
@@ -579,7 +584,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
579 unsigned long start_code, end_code, start_data, end_data; 584 unsigned long start_code, end_code, start_data, end_data;
580 unsigned long reloc_func_desc __maybe_unused = 0; 585 unsigned long reloc_func_desc __maybe_unused = 0;
581 int executable_stack = EXSTACK_DEFAULT; 586 int executable_stack = EXSTACK_DEFAULT;
582 unsigned long def_flags = 0;
583 struct pt_regs *regs = current_pt_regs(); 587 struct pt_regs *regs = current_pt_regs();
584 struct { 588 struct {
585 struct elfhdr elf_ex; 589 struct elfhdr elf_ex;
@@ -719,9 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
719 if (retval) 723 if (retval)
720 goto out_free_dentry; 724 goto out_free_dentry;
721 725
722 /* OK, This is the point of no return */
723 current->mm->def_flags = def_flags;
724
725 /* Do this immediately, since STACK_TOP as used in setup_arg_pages 726 /* Do this immediately, since STACK_TOP as used in setup_arg_pages
726 may depend on the personality. */ 727 may depend on the personality. */
727 SET_PERSONALITY(loc->elf_ex); 728 SET_PERSONALITY(loc->elf_ex);
@@ -1005,6 +1006,7 @@ out_free_ph:
1005 goto out; 1006 goto out;
1006} 1007}
1007 1008
1009#ifdef CONFIG_USELIB
1008/* This is really simpleminded and specialized - we are loading an 1010/* This is really simpleminded and specialized - we are loading an
1009 a.out library that is given an ELF header. */ 1011 a.out library that is given an ELF header. */
1010static int load_elf_library(struct file *file) 1012static int load_elf_library(struct file *file)
@@ -1083,6 +1085,7 @@ out_free_ph:
1083out: 1085out:
1084 return error; 1086 return error;
1085} 1087}
1088#endif /* #ifdef CONFIG_USELIB */
1086 1089
1087#ifdef CONFIG_ELF_CORE 1090#ifdef CONFIG_ELF_CORE
1088/* 1091/*
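
The binfmt_elf change compiles load_elf_library() only under CONFIG_USELIB and otherwise substitutes NULL, so the handler table simply carries an empty hook that callers must check. The same configure-out pattern in miniature (all names are invented; this is not the kernel's struct linux_binfmt):

#include <stdio.h>

/* #define CONFIG_USELIB */

#ifdef CONFIG_USELIB
static int load_library(const char *name)
{
        printf("loading %s\n", name);
        return 0;
}
#else
#define load_library NULL       /* feature compiled out entirely */
#endif

struct binfmt_ops {
        int (*load_shlib)(const char *name);
};

static const struct binfmt_ops elf_ops = { .load_shlib = load_library };

int main(void)
{
        if (elf_ops.load_shlib)
                return elf_ops.load_shlib("libc.so");
        fprintf(stderr, "uselib support compiled out\n");
        return 0;
}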
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1c740e152f38..b60500300dd7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -656,6 +656,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
656 656
657 mutex_unlock(&root->d_inode->i_mutex); 657 mutex_unlock(&root->d_inode->i_mutex);
658 dput(root); 658 dput(root);
659 break;
659 default: return res; 660 default: return res;
660 } 661 }
661 return count; 662 return count;
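
The single added line above is a missing break: without it, the case that had just finished a successful write fell through into default: return res; and returned the wrong value. Reduced to its essentials (handle() is an invented stand-in):

#include <stdio.h>

static int handle(int cmd, int count, int res)
{
        switch (cmd) {
        case 1:
                /* ... do the work ... */
                break;          /* the added line: stop before default */
        default:
                return res;
        }
        return count;
}

int main(void)
{
        printf("%d\n", handle(1, 10, -1));  /* 10 with break, -1 without */
        return 0;
}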
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4f70f383132c..29696b78d1f4 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -301,25 +301,25 @@ int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
301EXPORT_SYMBOL(bio_integrity_get_tag); 301EXPORT_SYMBOL(bio_integrity_get_tag);
302 302
303/** 303/**
304 * bio_integrity_generate - Generate integrity metadata for a bio 304 * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio
305 * @bio: bio to generate integrity metadata for 305 * @bio: bio to generate/verify integrity metadata for
306 * 306 * @operate: operation selector, 1 for generate, 0 for verify
307 * Description: Generates integrity metadata for a bio by calling the
308 * block device's generation callback function. The bio must have a
309 * bip attached with enough room to accommodate the generated
310 * integrity metadata.
311 */ 307 */
312static void bio_integrity_generate(struct bio *bio) 308static int bio_integrity_generate_verify(struct bio *bio, int operate)
313{ 309{
314 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 310 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
315 struct blk_integrity_exchg bix; 311 struct blk_integrity_exchg bix;
316 struct bio_vec bv; 312 struct bio_vec bv;
317 struct bvec_iter iter; 313 struct bvec_iter iter;
318 sector_t sector = bio->bi_iter.bi_sector; 314 sector_t sector;
319 unsigned int sectors, total; 315 unsigned int sectors, ret = 0;
320 void *prot_buf = bio->bi_integrity->bip_buf; 316 void *prot_buf = bio->bi_integrity->bip_buf;
321 317
322 total = 0; 318 if (operate)
319 sector = bio->bi_iter.bi_sector;
320 else
321 sector = bio->bi_integrity->bip_iter.bi_sector;
322
323 bix.disk_name = bio->bi_bdev->bd_disk->disk_name; 323 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
324 bix.sector_size = bi->sector_size; 324 bix.sector_size = bi->sector_size;
325 325
@@ -330,16 +330,37 @@ static void bio_integrity_generate(struct bio *bio)
330 bix.prot_buf = prot_buf; 330 bix.prot_buf = prot_buf;
331 bix.sector = sector; 331 bix.sector = sector;
332 332
333 bi->generate_fn(&bix); 333 if (operate) {
334 bi->generate_fn(&bix);
335 } else {
336 ret = bi->verify_fn(&bix);
337 if (ret) {
338 kunmap_atomic(kaddr);
339 return ret;
340 }
341 }
334 342
335 sectors = bv.bv_len / bi->sector_size; 343 sectors = bv.bv_len / bi->sector_size;
336 sector += sectors; 344 sector += sectors;
337 prot_buf += sectors * bi->tuple_size; 345 prot_buf += sectors * bi->tuple_size;
338 total += sectors * bi->tuple_size;
339 BUG_ON(total > bio->bi_integrity->bip_iter.bi_size);
340 346
341 kunmap_atomic(kaddr); 347 kunmap_atomic(kaddr);
342 } 348 }
349 return ret;
350}
351
352/**
353 * bio_integrity_generate - Generate integrity metadata for a bio
354 * @bio: bio to generate integrity metadata for
355 *
356 * Description: Generates integrity metadata for a bio by calling the
357 * block device's generation callback function. The bio must have a
358 * bip attached with enough room to accommodate the generated
359 * integrity metadata.
360 */
361static void bio_integrity_generate(struct bio *bio)
362{
363 bio_integrity_generate_verify(bio, 1);
343} 364}
344 365
345static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) 366static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
@@ -454,40 +475,7 @@ EXPORT_SYMBOL(bio_integrity_prep);
454 */ 475 */
455static int bio_integrity_verify(struct bio *bio) 476static int bio_integrity_verify(struct bio *bio)
456{ 477{
457 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 478 return bio_integrity_generate_verify(bio, 0);
458 struct blk_integrity_exchg bix;
459 struct bio_vec *bv;
460 sector_t sector = bio->bi_integrity->bip_iter.bi_sector;
461 unsigned int sectors, ret = 0;
462 void *prot_buf = bio->bi_integrity->bip_buf;
463 int i;
464
465 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
466 bix.sector_size = bi->sector_size;
467
468 bio_for_each_segment_all(bv, bio, i) {
469 void *kaddr = kmap_atomic(bv->bv_page);
470
471 bix.data_buf = kaddr + bv->bv_offset;
472 bix.data_size = bv->bv_len;
473 bix.prot_buf = prot_buf;
474 bix.sector = sector;
475
476 ret = bi->verify_fn(&bix);
477
478 if (ret) {
479 kunmap_atomic(kaddr);
480 return ret;
481 }
482
483 sectors = bv->bv_len / bi->sector_size;
484 sector += sectors;
485 prot_buf += sectors * bi->tuple_size;
486
487 kunmap_atomic(kaddr);
488 }
489
490 return ret;
491} 479}
492 480
493/** 481/**
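
The bio-integrity refactor folds two loops that differed only in the per-sector callback, generate_fn versus verify_fn, into one walker taking an operate flag, with thin wrappers keeping the old entry points. The shape of that refactor with toy data in place of the bio and bvec machinery (all names here are stand-ins):

#include <stdio.h>

static void generate_fn(int chunk) { printf("gen %d\n", chunk); }
static int verify_fn(int chunk) { return chunk == 3 ? -1 : 0; }

/* one walker; the flag selects which callback runs per chunk */
static int generate_verify(const int *chunks, int n, int operate)
{
        int i, ret = 0;

        for (i = 0; i < n; i++) {
                if (operate) {
                        generate_fn(chunks[i]);
                } else {
                        ret = verify_fn(chunks[i]);
                        if (ret)
                                return ret;     /* stop at first bad chunk */
                }
        }
        return ret;
}

/* thin wrappers preserve the original entry points */
static void generate(const int *chunks, int n) { generate_verify(chunks, n, 1); }
static int verify(const int *chunks, int n) { return generate_verify(chunks, n, 0); }

int main(void)
{
        int chunks[] = { 1, 2, 3 };

        generate(chunks, 3);
        return verify(chunks, 3) ? 1 : 0;
}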
diff --git a/fs/bio.c b/fs/bio.c
index 8754e7b6eb49..b1bc722b89aa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -116,7 +116,6 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
116 if (!slab) 116 if (!slab)
117 goto out_unlock; 117 goto out_unlock;
118 118
119 printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
120 bslab->slab = slab; 119 bslab->slab = slab;
121 bslab->slab_ref = 1; 120 bslab->slab_ref = 1;
122 bslab->slab_size = sz; 121 bslab->slab_size = sz;
@@ -1970,7 +1969,7 @@ int bio_associate_current(struct bio *bio)
1970 1969
1971 /* associate blkcg if exists */ 1970 /* associate blkcg if exists */
1972 rcu_read_lock(); 1971 rcu_read_lock();
1973 css = task_css(current, blkio_subsys_id); 1972 css = task_css(current, blkio_cgrp_id);
1974 if (css && css_tryget(css)) 1973 if (css && css_tryget(css))
1975 bio->bi_css = css; 1974 bio->bi_css = css;
1976 rcu_read_unlock(); 1975 rcu_read_unlock();
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e86823a9cbd..ba0d2b05bb78 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -83,7 +83,7 @@ void kill_bdev(struct block_device *bdev)
83{ 83{
84 struct address_space *mapping = bdev->bd_inode->i_mapping; 84 struct address_space *mapping = bdev->bd_inode->i_mapping;
85 85
86 if (mapping->nrpages == 0) 86 if (mapping->nrpages == 0 && mapping->nrshadows == 0)
87 return; 87 return;
88 88
89 invalidate_bh_lrus(); 89 invalidate_bh_lrus();
@@ -419,7 +419,7 @@ static void bdev_evict_inode(struct inode *inode)
419{ 419{
420 struct block_device *bdev = &BDEV_I(inode)->bdev; 420 struct block_device *bdev = &BDEV_I(inode)->bdev;
421 struct list_head *p; 421 struct list_head *p;
422 truncate_inode_pages(&inode->i_data, 0); 422 truncate_inode_pages_final(&inode->i_data);
423 invalidate_inode_buffers(inode); /* is it needed here? */ 423 invalidate_inode_buffers(inode); /* is it needed here? */
424 clear_inode(inode); 424 clear_inode(inode);
425 spin_lock(&bdev_lock); 425 spin_lock(&bdev_lock);
@@ -1523,7 +1523,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1523 ssize_t err; 1523 ssize_t err;
1524 1524
1525 err = generic_write_sync(file, pos, ret); 1525 err = generic_write_sync(file, pos, ret);
1526 if (err < 0 && ret > 0) 1526 if (err < 0)
1527 ret = err; 1527 ret = err;
1528 } 1528 }
1529 blk_finish_plug(&plug); 1529 blk_finish_plug(&plug);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9cc..ecb5832c0967 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * Copyright (C) 2014 Fujitsu. All rights reserved.
3 * 4 *
4 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 6 * modify it under the terms of the GNU General Public
@@ -21,708 +22,313 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
23#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/workqueue.h>
24#include "async-thread.h" 26#include "async-thread.h"
27#include "ctree.h"
28
29#define WORK_DONE_BIT 0
30#define WORK_ORDER_DONE_BIT 1
31#define WORK_HIGH_PRIO_BIT 2
32
33#define NO_THRESHOLD (-1)
34#define DFT_THRESHOLD (32)
35
36struct __btrfs_workqueue {
37 struct workqueue_struct *normal_wq;
38 /* List head pointing to ordered work list */
39 struct list_head ordered_list;
40
41 /* Spinlock for ordered_list */
42 spinlock_t list_lock;
43
44 /* Thresholding related variables */
45 atomic_t pending;
46 int max_active;
47 int current_max;
48 int thresh;
49 unsigned int count;
50 spinlock_t thres_lock;
51};
25 52
26#define WORK_QUEUED_BIT 0 53struct btrfs_workqueue {
27#define WORK_DONE_BIT 1 54 struct __btrfs_workqueue *normal;
28#define WORK_ORDER_DONE_BIT 2 55 struct __btrfs_workqueue *high;
29#define WORK_HIGH_PRIO_BIT 3 56};
30
31/*
32 * container for the kthread task pointer and the list of pending work
33 * One of these is allocated per thread.
34 */
35struct btrfs_worker_thread {
36 /* pool we belong to */
37 struct btrfs_workers *workers;
38
39 /* list of struct btrfs_work that are waiting for service */
40 struct list_head pending;
41 struct list_head prio_pending;
42
43 /* list of worker threads from struct btrfs_workers */
44 struct list_head worker_list;
45
46 /* kthread */
47 struct task_struct *task;
48 57
49 /* number of things on the pending list */ 58static inline struct __btrfs_workqueue
50 atomic_t num_pending; 59*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
60 int thresh)
61{
62 struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
51 63
52 /* reference counter for this struct */ 64 if (unlikely(!ret))
53 atomic_t refs; 65 return NULL;
54 66
55 unsigned long sequence; 67 ret->max_active = max_active;
68 atomic_set(&ret->pending, 0);
69 if (thresh == 0)
70 thresh = DFT_THRESHOLD;
71 /* For low threshold, disabling threshold is a better choice */
72 if (thresh < DFT_THRESHOLD) {
73 ret->current_max = max_active;
74 ret->thresh = NO_THRESHOLD;
75 } else {
76 ret->current_max = 1;
77 ret->thresh = thresh;
78 }
56 79
57 /* protects the pending list. */ 80 if (flags & WQ_HIGHPRI)
58 spinlock_t lock; 81 ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
82 ret->max_active,
83 "btrfs", name);
84 else
85 ret->normal_wq = alloc_workqueue("%s-%s", flags,
86 ret->max_active, "btrfs",
87 name);
88 if (unlikely(!ret->normal_wq)) {
89 kfree(ret);
90 return NULL;
91 }
59 92
60 /* set to non-zero when this thread is already awake and kicking */ 93 INIT_LIST_HEAD(&ret->ordered_list);
61 int working; 94 spin_lock_init(&ret->list_lock);
95 spin_lock_init(&ret->thres_lock);
96 trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
97 return ret;
98}
62 99
63 /* are we currently idle */ 100static inline void
64 int idle; 101__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
65};
66 102
67static int __btrfs_start_workers(struct btrfs_workers *workers); 103struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
104 int flags,
105 int max_active,
106 int thresh)
107{
108 struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
68 109
69/* 110 if (unlikely(!ret))
70 * btrfs_start_workers uses kthread_run, which can block waiting for memory 111 return NULL;
71 * for a very long time. It will actually throttle on page writeback,
72 * and so it may not make progress until after our btrfs worker threads
73 * process all of the pending work structs in their queue
74 *
75 * This means we can't use btrfs_start_workers from inside a btrfs worker
76 * thread that is used as part of cleaning dirty memory, which pretty much
77 * involves all of the worker threads.
78 *
79 * Instead we have a helper queue who never has more than one thread
80 * where we scheduler thread start operations. This worker_start struct
81 * is used to contain the work and hold a pointer to the queue that needs
82 * another worker.
83 */
84struct worker_start {
85 struct btrfs_work work;
86 struct btrfs_workers *queue;
87};
88 112
89static void start_new_worker_func(struct btrfs_work *work) 113 ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
90{ 114 max_active, thresh);
91 struct worker_start *start; 115 if (unlikely(!ret->normal)) {
92 start = container_of(work, struct worker_start, work); 116 kfree(ret);
93 __btrfs_start_workers(start->queue); 117 return NULL;
94 kfree(start); 118 }
95}
96 119
97/* 120 if (flags & WQ_HIGHPRI) {
98 * helper function to move a thread onto the idle list after it 121 ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
99 * has finished some requests. 122 thresh);
100 */ 123 if (unlikely(!ret->high)) {
101static void check_idle_worker(struct btrfs_worker_thread *worker) 124 __btrfs_destroy_workqueue(ret->normal);
102{ 125 kfree(ret);
103 if (!worker->idle && atomic_read(&worker->num_pending) < 126 return NULL;
104 worker->workers->idle_thresh / 2) {
105 unsigned long flags;
106 spin_lock_irqsave(&worker->workers->lock, flags);
107 worker->idle = 1;
108
109 /* the list may be empty if the worker is just starting */
110 if (!list_empty(&worker->worker_list) &&
111 !worker->workers->stopping) {
112 list_move(&worker->worker_list,
113 &worker->workers->idle_list);
114 } 127 }
115 spin_unlock_irqrestore(&worker->workers->lock, flags);
116 } 128 }
129 return ret;
117} 130}
118 131
119/* 132/*
120 * helper function to move a thread off the idle list after new 133 * Hook for threshold which will be called in btrfs_queue_work.
121 * pending work is added. 134 * This hook WILL be called in IRQ handler context,
135 * so workqueue_set_max_active MUST NOT be called in this hook
122 */ 136 */
123static void check_busy_worker(struct btrfs_worker_thread *worker) 137static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
124{ 138{
125 if (worker->idle && atomic_read(&worker->num_pending) >= 139 if (wq->thresh == NO_THRESHOLD)
126 worker->workers->idle_thresh) { 140 return;
127 unsigned long flags; 141 atomic_inc(&wq->pending);
128 spin_lock_irqsave(&worker->workers->lock, flags);
129 worker->idle = 0;
130
131 if (!list_empty(&worker->worker_list) &&
132 !worker->workers->stopping) {
133 list_move_tail(&worker->worker_list,
134 &worker->workers->worker_list);
135 }
136 spin_unlock_irqrestore(&worker->workers->lock, flags);
137 }
138} 142}
139 143
140static void check_pending_worker_creates(struct btrfs_worker_thread *worker) 144/*
145 * Hook for threshold which will be called before executing the work.
146 * This hook is called in kthread context,
147 * so workqueue_set_max_active is called here.
148 */
149static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
141{ 150{
142 struct btrfs_workers *workers = worker->workers; 151 int new_max_active;
143 struct worker_start *start; 152 long pending;
144 unsigned long flags; 153 int need_change = 0;
145 154
146 rmb(); 155 if (wq->thresh == NO_THRESHOLD)
147 if (!workers->atomic_start_pending)
148 return; 156 return;
149 157
150 start = kzalloc(sizeof(*start), GFP_NOFS); 158 atomic_dec(&wq->pending);
151 if (!start) 159 spin_lock(&wq->thres_lock);
152 return; 160 /*
153 161 * Use wq->count to limit the calling frequency of
154 start->work.func = start_new_worker_func; 162 * workqueue_set_max_active.
155 start->queue = workers; 163 */
156 164 wq->count++;
157 spin_lock_irqsave(&workers->lock, flags); 165 wq->count %= (wq->thresh / 4);
158 if (!workers->atomic_start_pending) 166 if (!wq->count)
159 goto out; 167 goto out;
160 168 new_max_active = wq->current_max;
161 workers->atomic_start_pending = 0;
162 if (workers->num_workers + workers->num_workers_starting >=
163 workers->max_workers)
164 goto out;
165
166 workers->num_workers_starting += 1;
167 spin_unlock_irqrestore(&workers->lock, flags);
168 btrfs_queue_worker(workers->atomic_worker_start, &start->work);
169 return;
170 169
170 /*
171 * pending may be changed later, but it's OK since we don't need
172 * it to be very accurate when calculating new_max_active.
173 */
174 pending = atomic_read(&wq->pending);
175 if (pending > wq->thresh)
176 new_max_active++;
177 if (pending < wq->thresh / 2)
178 new_max_active--;
179 new_max_active = clamp_val(new_max_active, 1, wq->max_active);
180 if (new_max_active != wq->current_max) {
181 need_change = 1;
182 wq->current_max = new_max_active;
183 }
171out: 184out:
172 kfree(start); 185 spin_unlock(&wq->thres_lock);
173 spin_unlock_irqrestore(&workers->lock, flags); 186
187 if (need_change) {
188 workqueue_set_max_active(wq->normal_wq, wq->current_max);
189 }
174} 190}
175 191
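
thresh_exec_hook() above rate-limits itself with wq->count (recomputing only every thresh/4 executions) and then nudges the concurrency limit by one step based on the backlog: up when pending exceeds the threshold, down when it falls below half, clamped to [1, max_active]. Only then does it pay for workqueue_set_max_active(). The arithmetic on its own (tune() is an invented name; plain comparisons stand in for the kernel's clamp_val macro):

#include <stdio.h>

static int tune(int current_max, int pending, int thresh, int max_active)
{
        int new_max = current_max;

        if (pending > thresh)
                new_max++;              /* backlog growing: widen */
        if (pending < thresh / 2)
                new_max--;              /* backlog drained: narrow */
        if (new_max < 1)
                new_max = 1;            /* clamp to [1, max_active] */
        if (new_max > max_active)
                new_max = max_active;
        return new_max;
}

int main(void)
{
        printf("%d\n", tune(1, 100, 32, 8));    /* backlog high -> 2 */
        printf("%d\n", tune(2, 4, 32, 8));      /* backlog low  -> 1 */
        return 0;
}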
176static noinline void run_ordered_completions(struct btrfs_workers *workers, 192static void run_ordered_work(struct __btrfs_workqueue *wq)
177 struct btrfs_work *work)
178{ 193{
179 if (!workers->ordered) 194 struct list_head *list = &wq->ordered_list;
180 return; 195 struct btrfs_work *work;
181 196 spinlock_t *lock = &wq->list_lock;
182 set_bit(WORK_DONE_BIT, &work->flags); 197 unsigned long flags;
183
184 spin_lock(&workers->order_lock);
185 198
186 while (1) { 199 while (1) {
187 if (!list_empty(&workers->prio_order_list)) { 200 spin_lock_irqsave(lock, flags);
188 work = list_entry(workers->prio_order_list.next, 201 if (list_empty(list))
189 struct btrfs_work, order_list);
190 } else if (!list_empty(&workers->order_list)) {
191 work = list_entry(workers->order_list.next,
192 struct btrfs_work, order_list);
193 } else {
194 break; 202 break;
195 } 203 work = list_entry(list->next, struct btrfs_work,
204 ordered_list);
196 if (!test_bit(WORK_DONE_BIT, &work->flags)) 205 if (!test_bit(WORK_DONE_BIT, &work->flags))
197 break; 206 break;
198 207
199 /* we are going to call the ordered done function, but 208 /*
209 * we are going to call the ordered done function, but
200 * we leave the work item on the list as a barrier so 210 * we leave the work item on the list as a barrier so
201 * that later work items that are done don't have their 211 * that later work items that are done don't have their
202 * functions called before this one returns 212 * functions called before this one returns
203 */ 213 */
204 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) 214 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
205 break; 215 break;
206 216 trace_btrfs_ordered_sched(work);
207 spin_unlock(&workers->order_lock); 217 spin_unlock_irqrestore(lock, flags);
208
209 work->ordered_func(work); 218 work->ordered_func(work);
210 219
211 /* now take the lock again and drop our item from the list */ 220 /* now take the lock again and drop our item from the list */
212 spin_lock(&workers->order_lock); 221 spin_lock_irqsave(lock, flags);
213 list_del(&work->order_list); 222 list_del(&work->ordered_list);
214 spin_unlock(&workers->order_lock); 223 spin_unlock_irqrestore(lock, flags);
215 224
216 /* 225 /*
217 * we don't want to call the ordered free functions 226 * we don't want to call the ordered free functions
218 * with the lock held though 227 * with the lock held though
219 */ 228 */
220 work->ordered_free(work); 229 work->ordered_free(work);
221 spin_lock(&workers->order_lock); 230 trace_btrfs_all_work_done(work);
222 }
223
224 spin_unlock(&workers->order_lock);
225}
226
227static void put_worker(struct btrfs_worker_thread *worker)
228{
229 if (atomic_dec_and_test(&worker->refs))
230 kfree(worker);
231}
232
233static int try_worker_shutdown(struct btrfs_worker_thread *worker)
234{
235 int freeit = 0;
236
237 spin_lock_irq(&worker->lock);
238 spin_lock(&worker->workers->lock);
239 if (worker->workers->num_workers > 1 &&
240 worker->idle &&
241 !worker->working &&
242 !list_empty(&worker->worker_list) &&
243 list_empty(&worker->prio_pending) &&
244 list_empty(&worker->pending) &&
245 atomic_read(&worker->num_pending) == 0) {
246 freeit = 1;
247 list_del_init(&worker->worker_list);
248 worker->workers->num_workers--;
249 } 231 }
250 spin_unlock(&worker->workers->lock); 232 spin_unlock_irqrestore(lock, flags);
251 spin_unlock_irq(&worker->lock);
252
253 if (freeit)
254 put_worker(worker);
255 return freeit;
256} 233}
257 234
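
run_ordered_work() above drains completions in submission order: finished items are marked DONE, but the head of the ordered list acts as a barrier, so an item that finishes early waits until everything queued before it has had its ordered_func called. The same barrier logic with arrays in place of the lock-protected list (a sketch, not the kernel's data structures):

#include <stdio.h>

#define N 4

static int done[N];     /* set by workers, possibly out of order */
static int next;        /* first entry whose ordered step hasn't run */

static void complete(int i)
{
        done[i] = 1;
        /* drain head-first; entry `next` is the barrier */
        while (next < N && done[next]) {
                printf("ordered step for work %d\n", next);
                next++;
        }
}

int main(void)
{
        complete(1);    /* nothing runs; 0 not done yet */
        complete(0);    /* runs 0 then 1 */
        complete(3);    /* nothing; 2 is the barrier */
        complete(2);    /* runs 2 then 3 */
        return 0;
}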
258static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, 235static void normal_work_helper(struct work_struct *arg)
259 struct list_head *prio_head,
260 struct list_head *head)
261{
262 struct btrfs_work *work = NULL;
263 struct list_head *cur = NULL;
264
265 if (!list_empty(prio_head))
266 cur = prio_head->next;
267
268 smp_mb();
269 if (!list_empty(&worker->prio_pending))
270 goto refill;
271
272 if (!list_empty(head))
273 cur = head->next;
274
275 if (cur)
276 goto out;
277
278refill:
279 spin_lock_irq(&worker->lock);
280 list_splice_tail_init(&worker->prio_pending, prio_head);
281 list_splice_tail_init(&worker->pending, head);
282
283 if (!list_empty(prio_head))
284 cur = prio_head->next;
285 else if (!list_empty(head))
286 cur = head->next;
287 spin_unlock_irq(&worker->lock);
288
289 if (!cur)
290 goto out_fail;
291
292out:
293 work = list_entry(cur, struct btrfs_work, list);
294
295out_fail:
296 return work;
297}
298
299/*
300 * main loop for servicing work items
301 */
302static int worker_loop(void *arg)
303{ 236{
304 struct btrfs_worker_thread *worker = arg;
305 struct list_head head;
306 struct list_head prio_head;
307 struct btrfs_work *work; 237 struct btrfs_work *work;
238 struct __btrfs_workqueue *wq;
239 int need_order = 0;
308 240
309 INIT_LIST_HEAD(&head); 241 work = container_of(arg, struct btrfs_work, normal_work);
310 INIT_LIST_HEAD(&prio_head); 242 /*
311 243 * We should not touch things inside work in the following cases:
312 do { 244 * 1) after work->func() if it has no ordered_free
313again: 245 * Since the struct is freed in work->func().
314 while (1) { 246 * 2) after setting WORK_DONE_BIT
315 247 * The work may be freed in other threads almost instantly.
316 248 * So we save the needed things here.
317 work = get_next_work(worker, &prio_head, &head); 249 */
318 if (!work) 250 if (work->ordered_func)
319 break; 251 need_order = 1;
320 252 wq = work->wq;
321 list_del(&work->list); 253
322 clear_bit(WORK_QUEUED_BIT, &work->flags); 254 trace_btrfs_work_sched(work);
323 255 thresh_exec_hook(wq);
324 work->worker = worker; 256 work->func(work);
325 257 if (need_order) {
326 work->func(work); 258 set_bit(WORK_DONE_BIT, &work->flags);
327 259 run_ordered_work(wq);
328 atomic_dec(&worker->num_pending);
329 /*
330 * unless this is an ordered work queue,
331 * 'work' was probably freed by func above.
332 */
333 run_ordered_completions(worker->workers, work);
334
335 check_pending_worker_creates(worker);
336 cond_resched();
337 }
338
339 spin_lock_irq(&worker->lock);
340 check_idle_worker(worker);
341
342 if (freezing(current)) {
343 worker->working = 0;
344 spin_unlock_irq(&worker->lock);
345 try_to_freeze();
346 } else {
347 spin_unlock_irq(&worker->lock);
348 if (!kthread_should_stop()) {
349 cpu_relax();
350 /*
351 * we've dropped the lock, did someone else
352 * jump_in?
353 */
354 smp_mb();
355 if (!list_empty(&worker->pending) ||
356 !list_empty(&worker->prio_pending))
357 continue;
358
359 /*
360 * this short schedule allows more work to
361 * come in without the queue functions
362 * needing to go through wake_up_process()
363 *
364 * worker->working is still 1, so nobody
365 * is going to try and wake us up
366 */
367 schedule_timeout(1);
368 smp_mb();
369 if (!list_empty(&worker->pending) ||
370 !list_empty(&worker->prio_pending))
371 continue;
372
373 if (kthread_should_stop())
374 break;
375
376 /* still no more work?, sleep for real */
377 spin_lock_irq(&worker->lock);
378 set_current_state(TASK_INTERRUPTIBLE);
379 if (!list_empty(&worker->pending) ||
380 !list_empty(&worker->prio_pending)) {
381 spin_unlock_irq(&worker->lock);
382 set_current_state(TASK_RUNNING);
383 goto again;
384 }
385
386 /*
387 * this makes sure we get a wakeup when someone
388 * adds something new to the queue
389 */
390 worker->working = 0;
391 spin_unlock_irq(&worker->lock);
392
393 if (!kthread_should_stop()) {
394 schedule_timeout(HZ * 120);
395 if (!worker->working &&
396 try_worker_shutdown(worker)) {
397 return 0;
398 }
399 }
400 }
401 __set_current_state(TASK_RUNNING);
402 }
403 } while (!kthread_should_stop());
404 return 0;
405}
406
407/*
408 * this will wait for all the worker threads to shutdown
409 */
410void btrfs_stop_workers(struct btrfs_workers *workers)
411{
412 struct list_head *cur;
413 struct btrfs_worker_thread *worker;
414 int can_stop;
415
416 spin_lock_irq(&workers->lock);
417 workers->stopping = 1;
418 list_splice_init(&workers->idle_list, &workers->worker_list);
419 while (!list_empty(&workers->worker_list)) {
420 cur = workers->worker_list.next;
421 worker = list_entry(cur, struct btrfs_worker_thread,
422 worker_list);
423
424 atomic_inc(&worker->refs);
425 workers->num_workers -= 1;
426 if (!list_empty(&worker->worker_list)) {
427 list_del_init(&worker->worker_list);
428 put_worker(worker);
429 can_stop = 1;
430 } else
431 can_stop = 0;
432 spin_unlock_irq(&workers->lock);
433 if (can_stop)
434 kthread_stop(worker->task);
435 spin_lock_irq(&workers->lock);
436 put_worker(worker);
437 } 260 }
438 spin_unlock_irq(&workers->lock); 261 if (!need_order)
262 trace_btrfs_all_work_done(work);
439} 263}
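
The comment in normal_work_helper() above is the crux of the new model: once work->func() runs, the btrfs_work may already be freed, so anything needed afterwards (the wq pointer, whether ordering is required) must be captured first. A small user-space sketch of the embed-and-container_of pattern the helper relies on (illustrative types and names, not kernel code):

	#include <stddef.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct work { void (*func)(struct work *); };

	struct my_job {
		int payload;
		struct work w;      /* embedded, the way btrfs_work is embedded by its users */
	};

	static void my_func(struct work *w)
	{
		struct my_job *job = container_of(w, struct my_job, w);

		printf("payload = %d\n", job->payload);
		free(job);          /* func may free the whole job... */
	}

	static void run(struct work *w)
	{
		void (*func)(struct work *) = w->func;  /* capture what we need first */

		func(w);            /* ...so 'w' must not be touched after this call */
	}

	int main(void)
	{
		struct my_job *job = malloc(sizeof(*job));

		job->payload = 42;
		job->w.func = my_func;
		run(&job->w);
		return 0;
	}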
-
-/*
- * simple init on struct btrfs_workers
- */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
-			struct btrfs_workers *async_helper)
-{
-	workers->num_workers = 0;
-	workers->num_workers_starting = 0;
-	INIT_LIST_HEAD(&workers->worker_list);
-	INIT_LIST_HEAD(&workers->idle_list);
-	INIT_LIST_HEAD(&workers->order_list);
-	INIT_LIST_HEAD(&workers->prio_order_list);
-	spin_lock_init(&workers->lock);
-	spin_lock_init(&workers->order_lock);
-	workers->max_workers = max;
-	workers->idle_thresh = 32;
-	workers->name = name;
-	workers->ordered = 0;
-	workers->atomic_start_pending = 0;
-	workers->atomic_worker_start = async_helper;
-	workers->stopping = 0;
-}
-
-/*
- * starts new worker threads.  This does not enforce the max worker
- * count in case you need to temporarily go past it.
- */
-static int __btrfs_start_workers(struct btrfs_workers *workers)
-{
-	struct btrfs_worker_thread *worker;
-	int ret = 0;
-
-	worker = kzalloc(sizeof(*worker), GFP_NOFS);
-	if (!worker) {
-		ret = -ENOMEM;
-		goto fail;
-	}
-
-	INIT_LIST_HEAD(&worker->pending);
-	INIT_LIST_HEAD(&worker->prio_pending);
-	INIT_LIST_HEAD(&worker->worker_list);
-	spin_lock_init(&worker->lock);
-
-	atomic_set(&worker->num_pending, 0);
-	atomic_set(&worker->refs, 1);
-	worker->workers = workers;
-	worker->task = kthread_create(worker_loop, worker,
-				      "btrfs-%s-%d", workers->name,
-				      workers->num_workers + 1);
-	if (IS_ERR(worker->task)) {
-		ret = PTR_ERR(worker->task);
-		goto fail;
-	}
-
-	spin_lock_irq(&workers->lock);
-	if (workers->stopping) {
-		spin_unlock_irq(&workers->lock);
-		ret = -EINVAL;
-		goto fail_kthread;
-	}
-	list_add_tail(&worker->worker_list, &workers->idle_list);
-	worker->idle = 1;
-	workers->num_workers++;
-	workers->num_workers_starting--;
-	WARN_ON(workers->num_workers_starting < 0);
-	spin_unlock_irq(&workers->lock);
-
-	wake_up_process(worker->task);
-	return 0;
-
-fail_kthread:
-	kthread_stop(worker->task);
-fail:
-	kfree(worker);
-	spin_lock_irq(&workers->lock);
-	workers->num_workers_starting--;
-	spin_unlock_irq(&workers->lock);
-	return ret;
-}
-
-int btrfs_start_workers(struct btrfs_workers *workers)
-{
-	spin_lock_irq(&workers->lock);
-	workers->num_workers_starting++;
-	spin_unlock_irq(&workers->lock);
-	return __btrfs_start_workers(workers);
-}
-
-/*
- * run through the list and find a worker thread that doesn't have a lot
- * to do right now.  This can return null if we aren't yet at the thread
- * count limit and all of the threads are busy.
- */
-static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
-{
-	struct btrfs_worker_thread *worker;
-	struct list_head *next;
-	int enforce_min;
-
-	enforce_min = (workers->num_workers + workers->num_workers_starting) <
-		workers->max_workers;
-
-	/*
-	 * if we find an idle thread, don't move it to the end of the
-	 * idle list.  This improves the chance that the next submission
-	 * will reuse the same thread, and maybe catch it while it is still
-	 * working
-	 */
-	if (!list_empty(&workers->idle_list)) {
-		next = workers->idle_list.next;
-		worker = list_entry(next, struct btrfs_worker_thread,
-				    worker_list);
-		return worker;
-	}
-	if (enforce_min || list_empty(&workers->worker_list))
-		return NULL;
-
-	/*
-	 * if we pick a busy task, move the task to the end of the list.
-	 * hopefully this will keep things somewhat evenly balanced.
-	 * Do the move in batches based on the sequence number.  This groups
-	 * requests submitted at roughly the same time onto the same worker.
-	 */
-	next = workers->worker_list.next;
-	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
-	worker->sequence++;
-
-	if (worker->sequence % workers->idle_thresh == 0)
-		list_move_tail(next, &workers->worker_list);
-	return worker;
-}
-
-/*
- * selects a worker thread to take the next job.  This will either find
- * an idle worker, start a new worker up to the max count, or just return
- * one of the existing busy workers.
- */
-static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
-{
-	struct btrfs_worker_thread *worker;
-	unsigned long flags;
-	struct list_head *fallback;
-	int ret;
-
-	spin_lock_irqsave(&workers->lock, flags);
-again:
-	worker = next_worker(workers);
-
-	if (!worker) {
-		if (workers->num_workers + workers->num_workers_starting >=
-		    workers->max_workers) {
-			goto fallback;
-		} else if (workers->atomic_worker_start) {
-			workers->atomic_start_pending = 1;
-			goto fallback;
-		} else {
-			workers->num_workers_starting++;
-			spin_unlock_irqrestore(&workers->lock, flags);
-			/* we're below the limit, start another worker */
-			ret = __btrfs_start_workers(workers);
-			spin_lock_irqsave(&workers->lock, flags);
-			if (ret)
-				goto fallback;
-			goto again;
-		}
-	}
-	goto found;
-
-fallback:
-	fallback = NULL;
-	/*
-	 * we have failed to find any workers, just
-	 * return the first one we can find.
-	 */
-	if (!list_empty(&workers->worker_list))
-		fallback = workers->worker_list.next;
-	if (!list_empty(&workers->idle_list))
-		fallback = workers->idle_list.next;
-	BUG_ON(!fallback);
-	worker = list_entry(fallback,
-			    struct btrfs_worker_thread, worker_list);
-found:
-	/*
-	 * this makes sure the worker doesn't exit before it is placed
-	 * onto a busy/idle list
-	 */
-	atomic_inc(&worker->num_pending);
-	spin_unlock_irqrestore(&workers->lock, flags);
-	return worker;
-}
-
-/*
- * btrfs_requeue_work just puts the work item back on the tail of the list
- * it was taken from.  It is intended for use with long running work functions
- * that make some progress and want to give the cpu up for others.
- */
-void btrfs_requeue_work(struct btrfs_work *work)
-{
-	struct btrfs_worker_thread *worker = work->worker;
-	unsigned long flags;
-	int wake = 0;
-
-	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-		return;
-
-	spin_lock_irqsave(&worker->lock, flags);
-	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
-		list_add_tail(&work->list, &worker->prio_pending);
-	else
-		list_add_tail(&work->list, &worker->pending);
-	atomic_inc(&worker->num_pending);
-
-	/* by definition we're busy, take ourselves off the idle
-	 * list
-	 */
-	if (worker->idle) {
-		spin_lock(&worker->workers->lock);
-		worker->idle = 0;
-		list_move_tail(&worker->worker_list,
-			       &worker->workers->worker_list);
-		spin_unlock(&worker->workers->lock);
-	}
-	if (!worker->working) {
-		wake = 1;
-		worker->working = 1;
-	}
-
-	if (wake)
-		wake_up_process(worker->task);
-	spin_unlock_irqrestore(&worker->lock, flags);
-}
-
-void btrfs_set_work_high_prio(struct btrfs_work *work)
-{
-	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
-}
-
-/*
- * places a struct btrfs_work into the pending queue of one of the kthreads
- */
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
-{
-	struct btrfs_worker_thread *worker;
-	unsigned long flags;
-	int wake = 0;
-
-	/* don't requeue something already on a list */
-	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-		return;
-
-	worker = find_worker(workers);
-	if (workers->ordered) {
-		/*
-		 * you're not allowed to do ordered queues from an
-		 * interrupt handler
-		 */
-		spin_lock(&workers->order_lock);
-		if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
-			list_add_tail(&work->order_list,
-				      &workers->prio_order_list);
-		} else {
-			list_add_tail(&work->order_list, &workers->order_list);
-		}
-		spin_unlock(&workers->order_lock);
-	} else {
-		INIT_LIST_HEAD(&work->order_list);
-	}
-
-	spin_lock_irqsave(&worker->lock, flags);
-
-	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
-		list_add_tail(&work->list, &worker->prio_pending);
-	else
-		list_add_tail(&work->list, &worker->pending);
-	check_busy_worker(worker);
-
-	/*
-	 * avoid calling into wake_up_process if this thread has already
-	 * been kicked
-	 */
-	if (!worker->working)
-		wake = 1;
-	worker->working = 1;
-
-	if (wake)
-		wake_up_process(worker->task);
-	spin_unlock_irqrestore(&worker->lock, flags);
-}
+
+void btrfs_init_work(struct btrfs_work *work,
+		     btrfs_func_t func,
+		     btrfs_func_t ordered_func,
+		     btrfs_func_t ordered_free)
+{
+	work->func = func;
+	work->ordered_func = ordered_func;
+	work->ordered_free = ordered_free;
+	INIT_WORK(&work->normal_work, normal_work_helper);
+	INIT_LIST_HEAD(&work->ordered_list);
+	work->flags = 0;
+}
+
+static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
+				      struct btrfs_work *work)
+{
+	unsigned long flags;
+
+	work->wq = wq;
+	thresh_queue_hook(wq);
+	if (work->ordered_func) {
+		spin_lock_irqsave(&wq->list_lock, flags);
+		list_add_tail(&work->ordered_list, &wq->ordered_list);
+		spin_unlock_irqrestore(&wq->list_lock, flags);
+	}
+	queue_work(wq->normal_wq, &work->normal_work);
+	trace_btrfs_work_queued(work);
+}
+
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+		      struct btrfs_work *work)
+{
+	struct __btrfs_workqueue *dest_wq;
+
+	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
+		dest_wq = wq->high;
+	else
+		dest_wq = wq->normal;
+	__btrfs_queue_work(dest_wq, work);
+}
+
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
+{
+	destroy_workqueue(wq->normal_wq);
+	trace_btrfs_workqueue_destroy(wq);
+	kfree(wq);
+}
+
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
+{
+	if (!wq)
+		return;
+	if (wq->high)
+		__btrfs_destroy_workqueue(wq->high);
+	__btrfs_destroy_workqueue(wq->normal);
+	kfree(wq);
+}
+
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
+{
+	wq->normal->max_active = max;
+	if (wq->high)
+		wq->high->max_active = max;
+}
+
+void btrfs_set_work_high_priority(struct btrfs_work *work)
+{
+	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683ed..9c6b66d15fb0 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2014 Fujitsu.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
 #ifndef __BTRFS_ASYNC_THREAD_
 #define __BTRFS_ASYNC_THREAD_
 
-struct btrfs_worker_thread;
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
 
-/*
- * This is similar to a workqueue, but it is meant to spread the operations
- * across all available cpus instead of just the CPU that was used to
- * queue the work.  There is also some batching introduced to try and
- * cut down on context switches.
- *
- * By default threads are added on demand up to 2 * the number of cpus.
- * Changing struct btrfs_workers->max_workers is one way to prevent
- * demand creation of kthreads.
- *
- * the basic model of these worker threads is to embed a btrfs_work
- * structure in your own data struct, and use container_of in a
- * work function to get back to your data struct.
- */
 struct btrfs_work {
-	/*
-	 * func should be set to the function you want called
-	 * your work struct is passed as the only arg
-	 *
-	 * ordered_func must be set for work sent to an ordered work queue,
-	 * and it is called to complete a given work item in the same
-	 * order they were sent to the queue.
-	 */
-	void (*func)(struct btrfs_work *work);
-	void (*ordered_func)(struct btrfs_work *work);
-	void (*ordered_free)(struct btrfs_work *work);
+	btrfs_func_t func;
+	btrfs_func_t ordered_func;
+	btrfs_func_t ordered_free;
 
-	/*
-	 * flags should be set to zero. It is used to make sure the
-	 * struct is only inserted once into the list.
-	 */
+	/* Don't touch things below */
+	struct work_struct normal_work;
+	struct list_head ordered_list;
+	struct __btrfs_workqueue *wq;
 	unsigned long flags;
-
-	/* don't touch these */
-	struct btrfs_worker_thread *worker;
-	struct list_head list;
-	struct list_head order_list;
-};
-
-struct btrfs_workers {
-	/* current number of running workers */
-	int num_workers;
-
-	int num_workers_starting;
-
-	/* max number of workers allowed.  changed by btrfs_start_workers */
-	int max_workers;
-
-	/* once a worker has this many requests or fewer, it is idle */
-	int idle_thresh;
-
-	/* force completions in the order they were queued */
-	int ordered;
-
-	/* more workers required, but in an interrupt handler */
-	int atomic_start_pending;
-
-	/*
-	 * are we allowed to sleep while starting workers or are we required
-	 * to start them at a later time? If we can't sleep, this indicates
-	 * which queue we need to use to schedule thread creation.
-	 */
-	struct btrfs_workers *atomic_worker_start;
-
-	/* list with all the work threads.  The workers on the idle thread
-	 * may be actively servicing jobs, but they haven't yet hit the
-	 * idle thresh limit above.
-	 */
-	struct list_head worker_list;
-	struct list_head idle_list;
-
-	/*
-	 * when operating in ordered mode, this maintains the list
-	 * of work items waiting for completion
-	 */
-	struct list_head order_list;
-	struct list_head prio_order_list;
-
-	/* lock for finding the next worker thread to queue on */
-	spinlock_t lock;
-
-	/* lock for the ordered lists */
-	spinlock_t order_lock;
-
-	/* extra name for this worker, used for current->name */
-	char *name;
-
-	int stopping;
 };
 
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers);
-void btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
-			struct btrfs_workers *async_starter);
-void btrfs_requeue_work(struct btrfs_work *work);
-void btrfs_set_work_high_prio(struct btrfs_work *work);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+					      int flags,
+					      int max_active,
+					      int thresh);
+void btrfs_init_work(struct btrfs_work *work,
+		     btrfs_func_t func,
+		     btrfs_func_t ordered_func,
+		     btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+		      struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
 #endif
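
Putting the new header together: a queue pair (normal plus an optional high-priority sibling) and a per-work priority bit replace the old per-pool priority lists. A rough user-space mock of the routing decision visible in btrfs_queue_work() in the diff above (hypothetical types, for illustration only):

	#include <stdio.h>

	#define WORK_HIGH_PRIO (1u << 2)

	struct queue { const char *name; };

	struct workqueue_pair {
		struct queue *normal;
		struct queue *high;   /* NULL unless created with a high-prio sibling */
	};

	static void queue_work_to(struct queue *q)
	{
		printf("queued on %s queue\n", q->name);
	}

	static void route(struct workqueue_pair *wq, unsigned flags)
	{
		struct queue *dest = wq->normal;

		if ((flags & WORK_HIGH_PRIO) && wq->high)
			dest = wq->high;  /* same rule as btrfs_queue_work() */
		queue_work_to(dest);
	}

	int main(void)
	{
		struct queue n = { "normal" }, h = { "high" };
		struct workqueue_pair wq = { &n, &h };

		route(&wq, 0);
		route(&wq, WORK_HIGH_PRIO);
		return 0;
	}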
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d4..aad7201ad11b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			   struct ulist *parents, struct __prelim_ref *ref,
-			   int level, u64 time_seq, const u64 *extent_item_pos)
+			   int level, u64 time_seq, const u64 *extent_item_pos,
+			   u64 total_refs)
 {
 	int ret = 0;
 	int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
 		ret = btrfs_next_old_leaf(root, path, time_seq);
 
-	while (!ret && count < ref->count) {
+	while (!ret && count < total_refs) {
 		eb = path->nodes[0];
 		slot = path->slots[0];
 
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 			  struct btrfs_path *path, u64 time_seq,
 			  struct __prelim_ref *ref,
 			  struct ulist *parents,
-			  const u64 *extent_item_pos)
+			  const u64 *extent_item_pos, u64 total_refs)
 {
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
@@ -361,7 +362,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	}
 
 	ret = add_all_parents(root, path, parents, ref, level, time_seq,
-			      extent_item_pos);
+			      extent_item_pos, total_refs);
 out:
 	path->lowest_level = 0;
 	btrfs_release_path(path);
@@ -374,7 +375,7 @@ out:
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
-				   const u64 *extent_item_pos)
+				   const u64 *extent_item_pos, u64 total_refs)
 {
 	int err;
 	int ret = 0;
@@ -400,7 +401,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		if (ref->count == 0)
 			continue;
 		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
-					     parents, extent_item_pos);
+					     parents, extent_item_pos,
+					     total_refs);
 		/*
 		 * we can only tolerate ENOENT,otherwise,we should catch error
 		 * and return directly.
@@ -557,7 +559,7 @@ static void __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct list_head *prefs)
+			      struct list_head *prefs, u64 *total_refs)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
@@ -593,6 +595,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		default:
 			BUG_ON(1);
 		}
+		*total_refs += (node->ref_mod * sgn);
 		switch (node->type) {
 		case BTRFS_TREE_BLOCK_REF_KEY: {
 			struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +656,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  */
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
-			     int *info_level, struct list_head *prefs)
+			     int *info_level, struct list_head *prefs,
+			     u64 *total_refs)
 {
 	int ret = 0;
 	int slot;
@@ -677,6 +681,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
+	*total_refs += btrfs_extent_refs(leaf, ei);
 	btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 	ptr = (unsigned long)(ei + 1);
@@ -859,6 +864,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct list_head prefs;
 	struct __prelim_ref *ref;
 	struct extent_inode_elem *eie = NULL;
+	u64 total_refs = 0;
 
 	INIT_LIST_HEAD(&prefs);
 	INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +879,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	if (!trans)
+	if (!trans) {
 		path->search_commit_root = 1;
+		path->skip_locking = 1;
+	}
 
 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
@@ -915,7 +923,7 @@ again:
 			}
 			spin_unlock(&delayed_refs->lock);
 			ret = __add_delayed_refs(head, time_seq,
-						 &prefs_delayed);
+						 &prefs_delayed, &total_refs);
 			mutex_unlock(&head->mutex);
 			if (ret)
 				goto out;
@@ -936,7 +944,8 @@ again:
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
-						&info_level, &prefs);
+						&info_level, &prefs,
+						&total_refs);
 			if (ret)
 				goto out;
 			ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +965,7 @@ again:
 	__merge_refs(&prefs, 1);
 
 	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
-				      extent_item_pos);
+				      extent_item_pos, total_refs);
 	if (ret)
 		goto out;
 
@@ -965,7 +974,7 @@ again:
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		WARN_ON(ref->count < 0);
-		if (ref->count && ref->root_id && ref->parent == 0) {
+		if (roots && ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
@@ -1061,22 +1070,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 			     u64 time_seq, struct ulist **leafs,
 			     const u64 *extent_item_pos)
 {
-	struct ulist *tmp;
 	int ret;
 
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
-		return -ENOMEM;
 	*leafs = ulist_alloc(GFP_NOFS);
-	if (!*leafs) {
-		ulist_free(tmp);
+	if (!*leafs)
 		return -ENOMEM;
-	}
 
 	ret = find_parent_nodes(trans, fs_info, bytenr,
-				time_seq, *leafs, tmp, extent_item_pos);
-	ulist_free(tmp);
-
+				time_seq, *leafs, NULL, extent_item_pos);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1333,38 +1334,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	if (ret < 0)
 		return ret;
 
-	while (1) {
-		u32 nritems;
-		if (path->slots[0] == 0) {
-			btrfs_set_path_blocking(path);
-			ret = btrfs_prev_leaf(fs_info->extent_root, path);
-			if (ret != 0) {
-				if (ret > 0) {
-					pr_debug("logical %llu is not within "
-						 "any extent\n", logical);
-					ret = -ENOENT;
-				}
-				return ret;
-			}
-		} else {
-			path->slots[0]--;
-		}
-		nritems = btrfs_header_nritems(path->nodes[0]);
-		if (nritems == 0) {
-			pr_debug("logical %llu is not within any extent\n",
-				 logical);
-			return -ENOENT;
-		}
-		if (path->slots[0] == nritems)
-			path->slots[0]--;
-
-		btrfs_item_key_to_cpu(path->nodes[0], found_key,
-				      path->slots[0]);
-		if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
-		    found_key->type == BTRFS_METADATA_ITEM_KEY)
-			break;
-	}
-
+	ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		return ret;
+	}
+	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
 		size = fs_info->extent_root->leafsize;
 	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
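
A pattern note on the backref changes above: the refcount ceiling for the parent walk is now accumulated across the delayed-ref and inline-ref scans and then threaded down by value to the resolver. Schematically, with hypothetical helpers rather than the btrfs functions:

	#include <stdio.h>
	#include <stdint.h>

	/* Each scanning stage contributes to one shared total... */
	static void scan_delayed(uint64_t *total_refs) { *total_refs += 2; }
	static void scan_inline(uint64_t *total_refs)  { *total_refs += 3; }

	/* ...and the resolver receives the finished sum by value. */
	static void resolve(uint64_t total_refs)
	{
		for (uint64_t count = 0; count < total_refs; count++)
			printf("visit parent candidate %llu\n",
			       (unsigned long long)count);
	}

	int main(void)
	{
		uint64_t total_refs = 0;

		scan_delayed(&total_refs);
		scan_inline(&total_refs);
		resolve(total_refs);
		return 0;
	}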
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689e..c9a24444ec9a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
 	u64 last_trans;
 
 	/*
-	 * log transid when this inode was last modified
+	 * transid that last logged this inode
 	 */
-	u64 last_sub_trans;
+	u64 logged_trans;
 
 	/*
-	 * transid that last logged this inode
+	 * log transid when this inode was last modified
 	 */
-	u64 logged_trans;
+	int last_sub_trans;
+
+	/* a local copy of root's last_log_commit */
+	int last_log_commit;
 
 	/* total number of bytes pending delalloc, used by stat to calc the
 	 * real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
 	/* flags field from the on disk inode */
 	u32 flags;
 
-	/* a local copy of root's last_log_commit */
-	unsigned long last_log_commit;
-
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such.  outstanding_extents is the number of extent
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b01fb6c527e3..d43c544d3b68 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		rcu_read_lock();
 		page = radix_tree_lookup(&mapping->page_tree, pg_index);
 		rcu_read_unlock();
-		if (page) {
+		if (page && !radix_tree_exceptional_entry(page)) {
 			misses++;
 			if (misses > 4)
 				break;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..88d1b1eedc9c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5376,6 +5376,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 	int advance_right;
 	u64 left_blockptr;
 	u64 right_blockptr;
+	u64 left_gen;
+	u64 right_gen;
 	u64 left_start_ctransid;
 	u64 right_start_ctransid;
 	u64 ctransid;
@@ -5640,7 +5642,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 				right_blockptr = btrfs_node_blockptr(
 						right_path->nodes[right_level],
 						right_path->slots[right_level]);
-				if (left_blockptr == right_blockptr) {
+				left_gen = btrfs_node_ptr_generation(
+						left_path->nodes[left_level],
+						left_path->slots[left_level]);
+				right_gen = btrfs_node_ptr_generation(
+						right_path->nodes[right_level],
+						right_path->slots[right_level]);
+				if (left_blockptr == right_blockptr &&
+				    left_gen == right_gen) {
 					/*
 					 * As we're on a shared block, don't
 					 * allow to go deeper.
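
The extra generation comparison matters because a block number can be reused across transactions, so two trees may point at the same bytenr while the block contents differ. Treating a child as shared only when the (blockptr, generation) pair matches, as the hunk above does, avoids wrongly skipping it. A tiny illustrative check (plain C, hypothetical types):

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdint.h>

	struct node_ptr {
		uint64_t blockptr;   /* disk address of the child block */
		uint64_t generation; /* transid that wrote the child block */
	};

	/* A child is only truly shared if address *and* generation agree. */
	static bool same_child(struct node_ptr a, struct node_ptr b)
	{
		return a.blockptr == b.blockptr && a.generation == b.generation;
	}

	int main(void)
	{
		struct node_ptr left  = { 4096, 100 };
		struct node_ptr stale = { 4096, 99 };  /* same address, older write */

		printf("%d %d\n", same_child(left, left), same_child(left, stale));
		return 0;
	}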
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519f..bc96c03dd259 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FS_STATE_ERROR		0
 #define BTRFS_FS_STATE_REMOUNTING	1
 #define BTRFS_FS_STATE_TRANS_ABORTED	2
+#define BTRFS_FS_STATE_DEV_REPLACING	3
 
 /* Super block flags */
 /* Errors detected */
@@ -1489,6 +1490,7 @@ struct btrfs_fs_info {
 	 */
 	struct list_head ordered_roots;
 
+	struct mutex delalloc_root_mutex;
 	spinlock_t delalloc_root_lock;
 	/* all fs/file tree roots that have delalloc inodes. */
 	struct list_head delalloc_roots;
@@ -1503,28 +1505,27 @@ struct btrfs_fs_info {
 	 * A third pool does submit_bio to avoid deadlocking with the other
 	 * two
 	 */
-	struct btrfs_workers generic_worker;
-	struct btrfs_workers workers;
-	struct btrfs_workers delalloc_workers;
-	struct btrfs_workers flush_workers;
-	struct btrfs_workers endio_workers;
-	struct btrfs_workers endio_meta_workers;
-	struct btrfs_workers endio_raid56_workers;
-	struct btrfs_workers rmw_workers;
-	struct btrfs_workers endio_meta_write_workers;
-	struct btrfs_workers endio_write_workers;
-	struct btrfs_workers endio_freespace_worker;
-	struct btrfs_workers submit_workers;
-	struct btrfs_workers caching_workers;
-	struct btrfs_workers readahead_workers;
+	struct btrfs_workqueue *workers;
+	struct btrfs_workqueue *delalloc_workers;
+	struct btrfs_workqueue *flush_workers;
+	struct btrfs_workqueue *endio_workers;
+	struct btrfs_workqueue *endio_meta_workers;
+	struct btrfs_workqueue *endio_raid56_workers;
+	struct btrfs_workqueue *rmw_workers;
+	struct btrfs_workqueue *endio_meta_write_workers;
+	struct btrfs_workqueue *endio_write_workers;
+	struct btrfs_workqueue *endio_freespace_worker;
+	struct btrfs_workqueue *submit_workers;
+	struct btrfs_workqueue *caching_workers;
+	struct btrfs_workqueue *readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write.  It happens
 	 * for the sys_munmap function call path
 	 */
-	struct btrfs_workers fixup_workers;
-	struct btrfs_workers delayed_workers;
+	struct btrfs_workqueue *fixup_workers;
+	struct btrfs_workqueue *delayed_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
@@ -1604,9 +1605,9 @@ struct btrfs_fs_info {
 	atomic_t scrub_cancel_req;
 	wait_queue_head_t scrub_pause_wait;
 	int scrub_workers_refcnt;
-	struct btrfs_workers scrub_workers;
-	struct btrfs_workers scrub_wr_completion_workers;
-	struct btrfs_workers scrub_nocow_workers;
+	struct btrfs_workqueue *scrub_workers;
+	struct btrfs_workqueue *scrub_wr_completion_workers;
+	struct btrfs_workqueue *scrub_nocow_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
@@ -1647,7 +1648,7 @@ struct btrfs_fs_info {
 	/* qgroup rescan items */
 	struct mutex qgroup_rescan_lock; /* protects the progress item */
 	struct btrfs_key qgroup_rescan_progress;
-	struct btrfs_workers qgroup_rescan_workers;
+	struct btrfs_workqueue *qgroup_rescan_workers;
 	struct completion qgroup_rescan_completion;
 	struct btrfs_work qgroup_rescan_work;
 
@@ -1674,10 +1675,18 @@ struct btrfs_fs_info {
 
 	atomic_t mutually_exclusive_operation_running;
 
+	struct percpu_counter bio_counter;
+	wait_queue_head_t replace_wait;
+
 	struct semaphore uuid_tree_rescan_sem;
 	unsigned int update_uuid_tree_gen:1;
 };
 
+struct btrfs_subvolume_writers {
+	struct percpu_counter	counter;
+	wait_queue_head_t	wait;
+};
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
@@ -1714,11 +1723,15 @@ struct btrfs_root {
 	struct mutex log_mutex;
 	wait_queue_head_t log_writer_wait;
 	wait_queue_head_t log_commit_wait[2];
+	struct list_head log_ctxs[2];
 	atomic_t log_writers;
 	atomic_t log_commit[2];
 	atomic_t log_batch;
-	unsigned long log_transid;
-	unsigned long last_log_commit;
+	int log_transid;
+	/* No matter the commit succeeds or not*/
+	int log_transid_committed;
+	/* Just be updated when the commit succeeds. */
+	int last_log_commit;
 	pid_t log_start_pid;
 	bool log_multiple_pids;
 
@@ -1793,6 +1806,7 @@ struct btrfs_root {
 	spinlock_t root_item_lock;
 	atomic_t refs;
 
+	struct mutex delalloc_mutex;
 	spinlock_t delalloc_lock;
 	/*
 	 * all of the inodes that have delalloc bytes.  It is possible for
@@ -1802,6 +1816,8 @@ struct btrfs_root {
 	struct list_head delalloc_inodes;
 	struct list_head delalloc_root;
 	u64 nr_delalloc_inodes;
+
+	struct mutex ordered_extent_mutex;
 	/*
 	 * this is used by the balancing code to wait for all the pending
 	 * ordered extents
@@ -1822,6 +1838,8 @@ struct btrfs_root {
 	 * manipulation with the read-only status via SUBVOL_SETFLAGS
 	 */
 	int send_in_progress;
+	struct btrfs_subvolume_writers *subv_writers;
+	atomic_t will_be_snapshoted;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3346,6 +3364,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
+
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -3723,7 +3744,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+			       int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4027,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
 
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+
 /* reada.c */
 struct reada_control {
 	struct btrfs_root	*root;		/* tree to prefetch */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..33e561a84013 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
 		return -ENOMEM;
 
 	async_work->delayed_root = delayed_root;
-	async_work->work.func = btrfs_async_run_delayed_root;
-	async_work->work.flags = 0;
+	btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
+			NULL, NULL);
 	async_work->nr = nr;
 
-	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);
+	btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
 	return 0;
 }
 
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf0..31299646024d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
  */
 static struct btrfs_delayed_ref_head *
 find_ref_head(struct rb_root *root, u64 bytenr,
-	      struct btrfs_delayed_ref_head **last, int return_bigger)
+	      int return_bigger)
 {
 	struct rb_node *n;
 	struct btrfs_delayed_ref_head *entry;
-	int cmp = 0;
 
-again:
 	n = root->rb_node;
 	entry = NULL;
 	while (n) {
 		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-		if (last)
-			*last = entry;
 
 		if (bytenr < entry->node.bytenr)
-			cmp = -1;
-		else if (bytenr > entry->node.bytenr)
-			cmp = 1;
-		else
-			cmp = 0;
-
-		if (cmp < 0)
 			n = n->rb_left;
-		else if (cmp > 0)
+		else if (bytenr > entry->node.bytenr)
 			n = n->rb_right;
 		else
 			return entry;
 	}
 	if (entry && return_bigger) {
-		if (cmp > 0) {
+		if (bytenr > entry->node.bytenr) {
 			n = rb_next(&entry->href_node);
 			if (!n)
 				n = rb_first(root);
 			entry = rb_entry(n, struct btrfs_delayed_ref_head,
 					 href_node);
-			bytenr = entry->node.bytenr;
-			return_bigger = 0;
-			goto again;
+			return entry;
 		}
 		return entry;
 	}
@@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
 
 again:
 	start = delayed_refs->run_delayed_start;
-	head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+	head = find_ref_head(&delayed_refs->href_root, start, 1);
 	if (!head && !loop) {
 		delayed_refs->run_delayed_start = 0;
 		start = 0;
 		loop = true;
-		head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+		head = find_ref_head(&delayed_refs->href_root, start, 1);
 		if (!head)
 			return NULL;
 	} else if (!head && loop) {
@@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	ref = btrfs_delayed_node_to_head(update);
 	BUG_ON(existing_ref->is_data != ref->is_data);
 
+	spin_lock(&existing_ref->lock);
 	if (ref->must_insert_reserved) {
 		/* if the extent was freed and then
 		 * reallocated before the delayed ref
@@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	 * only need the lock for this case cause we could be processing it
 	 * currently, for refs we just added we know we're a-ok.
 	 */
-	spin_lock(&existing_ref->lock);
 	existing->ref_mod += update->ref_mod;
 	spin_unlock(&existing_ref->lock);
 }
@@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 	struct btrfs_delayed_ref_root *delayed_refs;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0);
+	return find_ref_head(&delayed_refs->href_root, bytenr, 0);
 }
 
 void btrfs_delayed_ref_exit(void)
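
The simplified find_ref_head() above implements "find the entry at bytenr, or with return_bigger the next entry after it, wrapping to the start". The same logic over a sorted array, as a small self-contained sketch (illustrative only, not the rbtree code):

	#include <stdio.h>
	#include <stdint.h>

	/* Return index of key if present; with return_bigger, the next larger
	 * index instead, wrapping to 0 past the end. Returns -1 when empty. */
	static int find_head(const uint64_t *keys, int n, uint64_t key,
			     int return_bigger)
	{
		int lo = 0, hi = n - 1, last = -1;

		while (lo <= hi) {
			int mid = lo + (hi - lo) / 2;

			last = mid;
			if (key < keys[mid])
				hi = mid - 1;
			else if (key > keys[mid])
				lo = mid + 1;
			else
				return mid;
		}
		if (last >= 0 && return_bigger && key > keys[last])
			last = (last + 1) % n;   /* wrap around, like rb_first() */
		return last;
	}

	int main(void)
	{
		uint64_t keys[] = { 10, 20, 30 };

		printf("%d\n", find_head(keys, 3, 20, 1));  /* exact hit: 1 */
		printf("%d\n", find_head(keys, 3, 25, 1));  /* next bigger: 2 */
		printf("%d\n", find_head(keys, 3, 35, 1));  /* wraps: 0 */
		return 0;
	}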
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b20..9f2290509aca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
 	return ret;
 }
 
+/*
+ * blocked until all flighting bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+	s64 writers;
+	DEFINE_WAIT(wait);
+
+	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+	do {
+		prepare_to_wait(&fs_info->replace_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		writers = percpu_counter_sum(&fs_info->bio_counter);
+		if (writers)
+			schedule();
+		finish_wait(&fs_info->replace_wait, &wait);
+	} while (writers);
+}
+
+/*
+ * we have removed target device, it is safe to allow new bios request.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+	if (waitqueue_active(&fs_info->replace_wait))
+		wake_up(&fs_info->replace_wait);
+}
+
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 				       int scrub_ret)
 {
@@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	src_device = dev_replace->srcdev;
 	btrfs_dev_replace_unlock(dev_replace);
 
-	/* replace old device with new one in mapping tree */
-	if (!scrub_ret)
-		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
-								src_device,
-								tgt_device);
-
 	/*
 	 * flush all outstanding I/O and inode extent mappings before the
 	 * copy operation is declared as being finished
 	 */
-	ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+	ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
 	if (ret) {
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
@@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	WARN_ON(ret);
 
 	/* keep away write_all_supers() during the finishing procedure */
+	mutex_lock(&root->fs_info->chunk_mutex);
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	btrfs_dev_replace_lock(dev_replace);
 	dev_replace->replace_state =
@@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	dev_replace->time_stopped = get_seconds();
 	dev_replace->item_needs_writeback = 1;
 
-	if (scrub_ret) {
+	/* replace old device with new one in mapping tree */
+	if (!scrub_ret) {
+		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+								src_device,
+								tgt_device);
+	} else {
 		printk_in_rcu(KERN_ERR
 			      "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
 			      src_device->missing ? "<missing disk>" :
@@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 			      rcu_str_deref(tgt_device->name), scrub_ret);
 		btrfs_dev_replace_unlock(dev_replace);
 		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+		mutex_unlock(&root->fs_info->chunk_mutex);
 		if (tgt_device)
 			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	fs_info->fs_devices->latest_bdev = tgt_device->bdev;
 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 
+	btrfs_rm_dev_replace_blocked(fs_info);
+
 	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
 
+	btrfs_rm_dev_replace_unblocked(fs_info);
+
 	/*
 	 * this is again a consistent state where no dev_replace procedure
 	 * is running, the target device is part of the filesystem, the
@@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 */
 	btrfs_dev_replace_unlock(dev_replace);
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
 
 	/* write back the superblocks */
 	trans = btrfs_start_transaction(root, 0);
@@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
 		mutex_unlock(&dev_replace->lock_management_lock);
 	}
 }
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+	percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+	percpu_counter_dec(&fs_info->bio_counter);
+
+	if (waitqueue_active(&fs_info->replace_wait))
+		wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+	DEFINE_WAIT(wait);
+again:
+	percpu_counter_inc(&fs_info->bio_counter);
+	if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+		btrfs_bio_counter_dec(fs_info);
+		wait_event(fs_info->replace_wait,
+			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+				     &fs_info->fs_state));
+		goto again;
+	}
+
+}
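
The three bio counter helpers above form a lightweight reader gate: I/O submitters bump a counter that the replace operation can drain behind, and a fs_state bit turns new submitters away while the target device is removed. A condensed user-space analogue with POSIX threads (names are illustrative; the kernel uses a percpu counter and waitqueues instead):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
	static long in_flight;   /* like fs_info->bio_counter */
	static int replacing;    /* like BTRFS_FS_STATE_DEV_REPLACING */

	static void bio_start(void)        /* cf. btrfs_bio_counter_inc_blocked() */
	{
		pthread_mutex_lock(&lock);
		while (replacing)          /* new I/O waits out the replace */
			pthread_cond_wait(&cond, &lock);
		in_flight++;
		pthread_mutex_unlock(&lock);
	}

	static void bio_end(void)          /* cf. btrfs_bio_counter_dec() */
	{
		pthread_mutex_lock(&lock);
		if (--in_flight == 0)
			pthread_cond_broadcast(&cond);
		pthread_mutex_unlock(&lock);
	}

	static void replace_blocked(void)  /* cf. btrfs_rm_dev_replace_blocked() */
	{
		pthread_mutex_lock(&lock);
		replacing = 1;
		while (in_flight)          /* drain everything already started */
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);
	}

	static void replace_unblocked(void) /* cf. btrfs_rm_dev_replace_unblocked() */
	{
		pthread_mutex_lock(&lock);
		replacing = 0;
		pthread_cond_broadcast(&cond);
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		bio_start();
		bio_end();
		replace_blocked();
		replace_unblocked();
		printf("gate cycled cleanly\n");
		return 0;
	}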
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81ea55314b1f..bd0f752b797b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -678,32 +678,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
 
 	fs_info = end_io_wq->info;
 	end_io_wq->error = err;
-	end_io_wq->work.func = end_workqueue_fn;
-	end_io_wq->work.flags = 0;
+	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
 
 	if (bio->bi_rw & REQ_WRITE) {
 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
-			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_meta_write_workers,
+					 &end_io_wq->work);
 		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
-			btrfs_queue_worker(&fs_info->endio_freespace_worker,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_freespace_worker,
+					 &end_io_wq->work);
 		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-			btrfs_queue_worker(&fs_info->endio_raid56_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_raid56_workers,
+					 &end_io_wq->work);
 		else
-			btrfs_queue_worker(&fs_info->endio_write_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_write_workers,
+					 &end_io_wq->work);
 	} else {
 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
-			btrfs_queue_worker(&fs_info->endio_raid56_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_raid56_workers,
+					 &end_io_wq->work);
 		else if (end_io_wq->metadata)
-			btrfs_queue_worker(&fs_info->endio_meta_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_meta_workers,
+					 &end_io_wq->work);
 		else
-			btrfs_queue_worker(&fs_info->endio_workers,
-					   &end_io_wq->work);
+			btrfs_queue_work(fs_info->endio_workers,
+					 &end_io_wq->work);
 	}
 }
 
@@ -738,7 +737,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
 {
 	unsigned long limit = min_t(unsigned long,
-				    info->workers.max_workers,
+				    info->thread_pool_size,
 				    info->fs_devices->open_devices);
 	return 256 * limit;
 }
@@ -811,11 +810,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_start = submit_bio_start;
 	async->submit_bio_done = submit_bio_done;
 
-	async->work.func = run_one_async_start;
-	async->work.ordered_func = run_one_async_done;
-	async->work.ordered_free = run_one_async_free;
+	btrfs_init_work(&async->work, run_one_async_start,
+			run_one_async_done, run_one_async_free);
 
-	async->work.flags = 0;
 	async->bio_flags = bio_flags;
 	async->bio_offset = bio_offset;
 
@@ -824,9 +821,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	atomic_inc(&fs_info->nr_async_submits);
 
 	if (rw & REQ_SYNC)
-		btrfs_set_work_high_prio(&async->work);
+		btrfs_set_work_high_priority(&async->work);
 
-	btrfs_queue_worker(&fs_info->workers, &async->work);
+	btrfs_queue_work(fs_info->workers, &async->work);
 
 	while (atomic_read(&fs_info->async_submit_draining) &&
 	       atomic_read(&fs_info->nr_async_submits)) {
@@ -1149,6 +1146,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1149 } 1146 }
1150} 1147}
1151 1148
1149static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1150{
1151 struct btrfs_subvolume_writers *writers;
1152 int ret;
1153
1154 writers = kmalloc(sizeof(*writers), GFP_NOFS);
1155 if (!writers)
1156 return ERR_PTR(-ENOMEM);
1157
1158 ret = percpu_counter_init(&writers->counter, 0);
1159 if (ret < 0) {
1160 kfree(writers);
1161 return ERR_PTR(ret);
1162 }
1163
1164 init_waitqueue_head(&writers->wait);
1165 return writers;
1166}
1167
1168static void
1169btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1170{
1171 percpu_counter_destroy(&writers->counter);
1172 kfree(writers);
1173}
1174
1152static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, 1175static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 u32 stripesize, struct btrfs_root *root, 1176 u32 stripesize, struct btrfs_root *root,
1154 struct btrfs_fs_info *fs_info, 1177 struct btrfs_fs_info *fs_info,
@@ -1194,16 +1217,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1194 spin_lock_init(&root->log_extents_lock[1]); 1217 spin_lock_init(&root->log_extents_lock[1]);
1195 mutex_init(&root->objectid_mutex); 1218 mutex_init(&root->objectid_mutex);
1196 mutex_init(&root->log_mutex); 1219 mutex_init(&root->log_mutex);
1220 mutex_init(&root->ordered_extent_mutex);
1221 mutex_init(&root->delalloc_mutex);
1197 init_waitqueue_head(&root->log_writer_wait); 1222 init_waitqueue_head(&root->log_writer_wait);
1198 init_waitqueue_head(&root->log_commit_wait[0]); 1223 init_waitqueue_head(&root->log_commit_wait[0]);
1199 init_waitqueue_head(&root->log_commit_wait[1]); 1224 init_waitqueue_head(&root->log_commit_wait[1]);
1225 INIT_LIST_HEAD(&root->log_ctxs[0]);
1226 INIT_LIST_HEAD(&root->log_ctxs[1]);
1200 atomic_set(&root->log_commit[0], 0); 1227 atomic_set(&root->log_commit[0], 0);
1201 atomic_set(&root->log_commit[1], 0); 1228 atomic_set(&root->log_commit[1], 0);
1202 atomic_set(&root->log_writers, 0); 1229 atomic_set(&root->log_writers, 0);
1203 atomic_set(&root->log_batch, 0); 1230 atomic_set(&root->log_batch, 0);
1204 atomic_set(&root->orphan_inodes, 0); 1231 atomic_set(&root->orphan_inodes, 0);
1205 atomic_set(&root->refs, 1); 1232 atomic_set(&root->refs, 1);
1233 atomic_set(&root->will_be_snapshoted, 0);
1206 root->log_transid = 0; 1234 root->log_transid = 0;
1235 root->log_transid_committed = -1;
1207 root->last_log_commit = 0; 1236 root->last_log_commit = 0;
1208 if (fs_info) 1237 if (fs_info)
1209 extent_io_tree_init(&root->dirty_log_pages, 1238 extent_io_tree_init(&root->dirty_log_pages,
@@ -1417,6 +1446,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1417 WARN_ON(root->log_root); 1446 WARN_ON(root->log_root);
1418 root->log_root = log_root; 1447 root->log_root = log_root;
1419 root->log_transid = 0; 1448 root->log_transid = 0;
1449 root->log_transid_committed = -1;
1420 root->last_log_commit = 0; 1450 root->last_log_commit = 0;
1421 return 0; 1451 return 0;
1422} 1452}
@@ -1498,6 +1528,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1498int btrfs_init_fs_root(struct btrfs_root *root) 1528int btrfs_init_fs_root(struct btrfs_root *root)
1499{ 1529{
1500 int ret; 1530 int ret;
1531 struct btrfs_subvolume_writers *writers;
1501 1532
1502 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1533 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1503 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1534 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,6 +1538,13 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1507 goto fail; 1538 goto fail;
1508 } 1539 }
1509 1540
1541 writers = btrfs_alloc_subvolume_writers();
1542 if (IS_ERR(writers)) {
1543 ret = PTR_ERR(writers);
1544 goto fail;
1545 }
1546 root->subv_writers = writers;
1547
1510 btrfs_init_free_ino_ctl(root); 1548 btrfs_init_free_ino_ctl(root);
1511 mutex_init(&root->fs_commit_mutex); 1549 mutex_init(&root->fs_commit_mutex);
1512 spin_lock_init(&root->cache_lock); 1550 spin_lock_init(&root->cache_lock);
@@ -1514,8 +1552,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1514 1552
1515 ret = get_anon_bdev(&root->anon_dev); 1553 ret = get_anon_bdev(&root->anon_dev);
1516 if (ret) 1554 if (ret)
1517 goto fail; 1555 goto free_writers;
1518 return 0; 1556 return 0;
1557
1558free_writers:
1559 btrfs_free_subvolume_writers(root->subv_writers);
1519fail: 1560fail:
1520 kfree(root->free_ino_ctl); 1561 kfree(root->free_ino_ctl);
1521 kfree(root->free_ino_pinned); 1562 kfree(root->free_ino_pinned);
@@ -1990,23 +2031,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
1990/* helper to clean up workers */ 2031/* helper to clean up workers */
1991static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) 2032static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1992{ 2033{
1993 btrfs_stop_workers(&fs_info->generic_worker); 2034 btrfs_destroy_workqueue(fs_info->fixup_workers);
1994 btrfs_stop_workers(&fs_info->fixup_workers); 2035 btrfs_destroy_workqueue(fs_info->delalloc_workers);
1995 btrfs_stop_workers(&fs_info->delalloc_workers); 2036 btrfs_destroy_workqueue(fs_info->workers);
1996 btrfs_stop_workers(&fs_info->workers); 2037 btrfs_destroy_workqueue(fs_info->endio_workers);
1997 btrfs_stop_workers(&fs_info->endio_workers); 2038 btrfs_destroy_workqueue(fs_info->endio_meta_workers);
1998 btrfs_stop_workers(&fs_info->endio_meta_workers); 2039 btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
1999 btrfs_stop_workers(&fs_info->endio_raid56_workers); 2040 btrfs_destroy_workqueue(fs_info->rmw_workers);
2000 btrfs_stop_workers(&fs_info->rmw_workers); 2041 btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2001 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2042 btrfs_destroy_workqueue(fs_info->endio_write_workers);
2002 btrfs_stop_workers(&fs_info->endio_write_workers); 2043 btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2003 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2044 btrfs_destroy_workqueue(fs_info->submit_workers);
2004 btrfs_stop_workers(&fs_info->submit_workers); 2045 btrfs_destroy_workqueue(fs_info->delayed_workers);
2005 btrfs_stop_workers(&fs_info->delayed_workers); 2046 btrfs_destroy_workqueue(fs_info->caching_workers);
2006 btrfs_stop_workers(&fs_info->caching_workers); 2047 btrfs_destroy_workqueue(fs_info->readahead_workers);
2007 btrfs_stop_workers(&fs_info->readahead_workers); 2048 btrfs_destroy_workqueue(fs_info->flush_workers);
2008 btrfs_stop_workers(&fs_info->flush_workers); 2049 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2009 btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
2010} 2050}
2011 2051
2012static void free_root_extent_buffers(struct btrfs_root *root) 2052static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2097,6 +2137,8 @@ int open_ctree(struct super_block *sb,
2097 int err = -EINVAL; 2137 int err = -EINVAL;
2098 int num_backups_tried = 0; 2138 int num_backups_tried = 0;
2099 int backup_index = 0; 2139 int backup_index = 0;
2140 int max_active;
2141 int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2100 bool create_uuid_tree; 2142 bool create_uuid_tree;
2101 bool check_uuid_tree; 2143 bool check_uuid_tree;
2102 2144
@@ -2133,10 +2175,16 @@ int open_ctree(struct super_block *sb,
2133 goto fail_dirty_metadata_bytes; 2175 goto fail_dirty_metadata_bytes;
2134 } 2176 }
2135 2177
2178 ret = percpu_counter_init(&fs_info->bio_counter, 0);
2179 if (ret) {
2180 err = ret;
2181 goto fail_delalloc_bytes;
2182 }
2183
2136 fs_info->btree_inode = new_inode(sb); 2184 fs_info->btree_inode = new_inode(sb);
2137 if (!fs_info->btree_inode) { 2185 if (!fs_info->btree_inode) {
2138 err = -ENOMEM; 2186 err = -ENOMEM;
2139 goto fail_delalloc_bytes; 2187 goto fail_bio_counter;
2140 } 2188 }
2141 2189
2142 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2190 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2159,6 +2207,7 @@ int open_ctree(struct super_block *sb,
2159 spin_lock_init(&fs_info->buffer_lock); 2207 spin_lock_init(&fs_info->buffer_lock);
2160 rwlock_init(&fs_info->tree_mod_log_lock); 2208 rwlock_init(&fs_info->tree_mod_log_lock);
2161 mutex_init(&fs_info->reloc_mutex); 2209 mutex_init(&fs_info->reloc_mutex);
2210 mutex_init(&fs_info->delalloc_root_mutex);
2162 seqlock_init(&fs_info->profiles_lock); 2211 seqlock_init(&fs_info->profiles_lock);
2163 2212
2164 init_completion(&fs_info->kobj_unregister); 2213 init_completion(&fs_info->kobj_unregister);
@@ -2211,6 +2260,7 @@ int open_ctree(struct super_block *sb,
2211 atomic_set(&fs_info->scrub_pause_req, 0); 2260 atomic_set(&fs_info->scrub_pause_req, 0);
2212 atomic_set(&fs_info->scrubs_paused, 0); 2261 atomic_set(&fs_info->scrubs_paused, 0);
2213 atomic_set(&fs_info->scrub_cancel_req, 0); 2262 atomic_set(&fs_info->scrub_cancel_req, 0);
2263 init_waitqueue_head(&fs_info->replace_wait);
2214 init_waitqueue_head(&fs_info->scrub_pause_wait); 2264 init_waitqueue_head(&fs_info->scrub_pause_wait);
2215 fs_info->scrub_workers_refcnt = 0; 2265 fs_info->scrub_workers_refcnt = 0;
2216#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2266#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2458,104 +2508,68 @@ int open_ctree(struct super_block *sb,
2458 goto fail_alloc; 2508 goto fail_alloc;
2459 } 2509 }
2460 2510
2461 btrfs_init_workers(&fs_info->generic_worker, 2511 max_active = fs_info->thread_pool_size;
2462 "genwork", 1, NULL);
2463
2464 btrfs_init_workers(&fs_info->workers, "worker",
2465 fs_info->thread_pool_size,
2466 &fs_info->generic_worker);
2467 2512
2468 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", 2513 fs_info->workers =
2469 fs_info->thread_pool_size, NULL); 2514 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
2515 max_active, 16);
2470 2516
2471 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", 2517 fs_info->delalloc_workers =
2472 fs_info->thread_pool_size, NULL); 2518 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2473 2519
2474 btrfs_init_workers(&fs_info->submit_workers, "submit", 2520 fs_info->flush_workers =
2475 min_t(u64, fs_devices->num_devices, 2521 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2476 fs_info->thread_pool_size), NULL);
2477 2522
2478 btrfs_init_workers(&fs_info->caching_workers, "cache", 2523 fs_info->caching_workers =
2479 fs_info->thread_pool_size, NULL); 2524 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2480 2525
2481 /* a higher idle thresh on the submit workers makes it much more 2526 /*
2527 * a higher idle thresh on the submit workers makes it much more
2482 * likely that bios will be sent down in a sane order to the 2528 * likely that bios will be sent down in a sane order to the
2483 * devices 2529 * devices
2484 */ 2530 */
2485 fs_info->submit_workers.idle_thresh = 64; 2531 fs_info->submit_workers =
2486 2532 btrfs_alloc_workqueue("submit", flags,
2487 fs_info->workers.idle_thresh = 16; 2533 min_t(u64, fs_devices->num_devices,
2488 fs_info->workers.ordered = 1; 2534 max_active), 64);
2489 2535
2490 fs_info->delalloc_workers.idle_thresh = 2; 2536 fs_info->fixup_workers =
2491 fs_info->delalloc_workers.ordered = 1; 2537 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2492
2493 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
2494 &fs_info->generic_worker);
2495 btrfs_init_workers(&fs_info->endio_workers, "endio",
2496 fs_info->thread_pool_size,
2497 &fs_info->generic_worker);
2498 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
2499 fs_info->thread_pool_size,
2500 &fs_info->generic_worker);
2501 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2502 "endio-meta-write", fs_info->thread_pool_size,
2503 &fs_info->generic_worker);
2504 btrfs_init_workers(&fs_info->endio_raid56_workers,
2505 "endio-raid56", fs_info->thread_pool_size,
2506 &fs_info->generic_worker);
2507 btrfs_init_workers(&fs_info->rmw_workers,
2508 "rmw", fs_info->thread_pool_size,
2509 &fs_info->generic_worker);
2510 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2511 fs_info->thread_pool_size,
2512 &fs_info->generic_worker);
2513 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
2514 1, &fs_info->generic_worker);
2515 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
2516 fs_info->thread_pool_size,
2517 &fs_info->generic_worker);
2518 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2519 fs_info->thread_pool_size,
2520 &fs_info->generic_worker);
2521 btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
2522 &fs_info->generic_worker);
2523 2538
2524 /* 2539 /*
2525 * endios are largely parallel and should have a very 2540 * endios are largely parallel and should have a very
2526 * low idle thresh 2541 * low idle thresh
2527 */ 2542 */
2528 fs_info->endio_workers.idle_thresh = 4; 2543 fs_info->endio_workers =
2529 fs_info->endio_meta_workers.idle_thresh = 4; 2544 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2530 fs_info->endio_raid56_workers.idle_thresh = 4; 2545 fs_info->endio_meta_workers =
2531 fs_info->rmw_workers.idle_thresh = 2; 2546 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2532 2547 fs_info->endio_meta_write_workers =
2533 fs_info->endio_write_workers.idle_thresh = 2; 2548 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2534 fs_info->endio_meta_write_workers.idle_thresh = 2; 2549 fs_info->endio_raid56_workers =
2535 fs_info->readahead_workers.idle_thresh = 2; 2550 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2536 2551 fs_info->rmw_workers =
2537 /* 2552 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2538 * btrfs_start_workers can really only fail because of ENOMEM so just 2553 fs_info->endio_write_workers =
2539 * return -ENOMEM if any of these fail. 2554 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2540 */ 2555 fs_info->endio_freespace_worker =
2541 ret = btrfs_start_workers(&fs_info->workers); 2556 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2542 ret |= btrfs_start_workers(&fs_info->generic_worker); 2557 fs_info->delayed_workers =
2543 ret |= btrfs_start_workers(&fs_info->submit_workers); 2558 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2544 ret |= btrfs_start_workers(&fs_info->delalloc_workers); 2559 fs_info->readahead_workers =
2545 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2560 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2546 ret |= btrfs_start_workers(&fs_info->endio_workers); 2561 fs_info->qgroup_rescan_workers =
2547 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2562 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2548 ret |= btrfs_start_workers(&fs_info->rmw_workers); 2563
2549 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); 2564 if (!(fs_info->workers && fs_info->delalloc_workers &&
2550 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2565 fs_info->submit_workers && fs_info->flush_workers &&
2551 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2566 fs_info->endio_workers && fs_info->endio_meta_workers &&
2552 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2567 fs_info->endio_meta_write_workers &&
2553 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2568 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2554 ret |= btrfs_start_workers(&fs_info->caching_workers); 2569 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2555 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2570 fs_info->caching_workers && fs_info->readahead_workers &&
2556 ret |= btrfs_start_workers(&fs_info->flush_workers); 2571 fs_info->fixup_workers && fs_info->delayed_workers &&
2557 ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); 2572 fs_info->qgroup_rescan_workers)) {
2558 if (ret) {
2559 err = -ENOMEM; 2573 err = -ENOMEM;
2560 goto fail_sb_buffer; 2574 goto fail_sb_buffer;
2561 } 2575 }
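
btrfs_alloc_workqueue(), btrfs_queue_work() and btrfs_destroy_workqueue() come from the rewritten fs/btrfs/async-thread.c (see the diffstat at the top); their bodies are not shown in this section. A rough sketch of the allocation side, assuming the wrapper maps onto the stock kernel workqueue API and the fourth argument is a load threshold used to scale concurrency (simplified; the real structure presumably also carries a separate high-priority queue and the ordered-execution hooks):

        struct btrfs_workqueue_sketch {
                struct workqueue_struct *normal_wq;
                int thresh;     /* load threshold for adjusting max_active */
        };

        static struct btrfs_workqueue_sketch *
        sketch_alloc_workqueue(const char *name, unsigned int flags,
                               int max_active, int thresh)
        {
                struct btrfs_workqueue_sketch *wq = kzalloc(sizeof(*wq), GFP_NOFS);

                if (!wq)
                        return NULL;
                wq->thresh = thresh;
                wq->normal_wq = alloc_workqueue("btrfs-%s", flags, max_active,
                                                name);
                if (!wq->normal_wq) {
                        kfree(wq);
                        return NULL;
                }
                return wq;
        }

Returning NULL on failure is what makes the single combined check above ("if (!(fs_info->workers && ...))") sufficient: every allocation failure funnels into one -ENOMEM exit.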
@@ -2963,6 +2977,8 @@ fail_iput:
2963 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2977 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2964 2978
2965 iput(fs_info->btree_inode); 2979 iput(fs_info->btree_inode);
2980fail_bio_counter:
2981 percpu_counter_destroy(&fs_info->bio_counter);
2966fail_delalloc_bytes: 2982fail_delalloc_bytes:
2967 percpu_counter_destroy(&fs_info->delalloc_bytes); 2983 percpu_counter_destroy(&fs_info->delalloc_bytes);
2968fail_dirty_metadata_bytes: 2984fail_dirty_metadata_bytes:
@@ -3244,6 +3260,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3244 /* send down all the barriers */ 3260 /* send down all the barriers */
3245 head = &info->fs_devices->devices; 3261 head = &info->fs_devices->devices;
3246 list_for_each_entry_rcu(dev, head, dev_list) { 3262 list_for_each_entry_rcu(dev, head, dev_list) {
3263 if (dev->missing)
3264 continue;
3247 if (!dev->bdev) { 3265 if (!dev->bdev) {
3248 errors_send++; 3266 errors_send++;
3249 continue; 3267 continue;
@@ -3258,6 +3276,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3258 3276
3259 /* wait for all the barriers */ 3277 /* wait for all the barriers */
3260 list_for_each_entry_rcu(dev, head, dev_list) { 3278 list_for_each_entry_rcu(dev, head, dev_list) {
3279 if (dev->missing)
3280 continue;
3261 if (!dev->bdev) { 3281 if (!dev->bdev) {
3262 errors_wait++; 3282 errors_wait++;
3263 continue; 3283 continue;
@@ -3477,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root)
3477 root->orphan_block_rsv = NULL; 3497 root->orphan_block_rsv = NULL;
3478 if (root->anon_dev) 3498 if (root->anon_dev)
3479 free_anon_bdev(root->anon_dev); 3499 free_anon_bdev(root->anon_dev);
3500 if (root->subv_writers)
3501 btrfs_free_subvolume_writers(root->subv_writers);
3480 free_extent_buffer(root->node); 3502 free_extent_buffer(root->node);
3481 free_extent_buffer(root->commit_root); 3503 free_extent_buffer(root->commit_root);
3482 kfree(root->free_ino_ctl); 3504 kfree(root->free_ino_ctl);
@@ -3610,6 +3632,7 @@ int close_ctree(struct btrfs_root *root)
3610 3632
3611 percpu_counter_destroy(&fs_info->dirty_metadata_bytes); 3633 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3612 percpu_counter_destroy(&fs_info->delalloc_bytes); 3634 percpu_counter_destroy(&fs_info->delalloc_bytes);
3635 percpu_counter_destroy(&fs_info->bio_counter);
3613 bdi_destroy(&fs_info->bdi); 3636 bdi_destroy(&fs_info->bdi);
3614 cleanup_srcu_struct(&fs_info->subvol_srcu); 3637 cleanup_srcu_struct(&fs_info->subvol_srcu);
3615 3638
@@ -3791,9 +3814,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3791 list_move_tail(&root->ordered_root, 3814 list_move_tail(&root->ordered_root,
3792 &fs_info->ordered_roots); 3815 &fs_info->ordered_roots);
3793 3816
3817 spin_unlock(&fs_info->ordered_root_lock);
3794 btrfs_destroy_ordered_extents(root); 3818 btrfs_destroy_ordered_extents(root);
3795 3819
3796 cond_resched_lock(&fs_info->ordered_root_lock); 3820 cond_resched();
3821 spin_lock(&fs_info->ordered_root_lock);
3797 } 3822 }
3798 spin_unlock(&fs_info->ordered_root_lock); 3823 spin_unlock(&fs_info->ordered_root_lock);
3799} 3824}
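
The hunk above stops using cond_resched_lock() because btrfs_destroy_ordered_extents() can block, and blocking under ordered_root_lock (a spinlock) is not allowed. The generic shape of the fix, with hypothetical names (do_blocking_work() stands in for the ordered-extent teardown): splice the list private, park each entry back on the public list, then drop the lock around the blocking work.

        struct foo {
                struct list_head list;
        };

        static void drain_all(spinlock_t *lock, struct list_head *head)
        {
                struct foo *entry;
                LIST_HEAD(splice);

                spin_lock(lock);
                list_splice_init(head, &splice);
                while (!list_empty(&splice)) {
                        entry = list_first_entry(&splice, struct foo, list);
                        /* park it back on the public list before unlocking */
                        list_move_tail(&entry->list, head);
                        spin_unlock(lock);
                        do_blocking_work(entry);        /* may sleep */
                        cond_resched();
                        spin_lock(lock);
                }
                spin_unlock(lock);
        }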
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32312e09f0f5..c6b6a6e3e735 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
549 caching_ctl->block_group = cache; 549 caching_ctl->block_group = cache;
550 caching_ctl->progress = cache->key.objectid; 550 caching_ctl->progress = cache->key.objectid;
551 atomic_set(&caching_ctl->count, 1); 551 atomic_set(&caching_ctl->count, 1);
552 caching_ctl->work.func = caching_thread; 552 btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
553 553
554 spin_lock(&cache->lock); 554 spin_lock(&cache->lock);
555 /* 555 /*
@@ -640,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
640 640
641 btrfs_get_block_group(cache); 641 btrfs_get_block_group(cache);
642 642
643 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); 643 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
644 644
645 return ret; 645 return ret;
646} 646}
@@ -3971,7 +3971,7 @@ static int can_overcommit(struct btrfs_root *root,
3971} 3971}
3972 3972
3973static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 3973static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3974 unsigned long nr_pages) 3974 unsigned long nr_pages, int nr_items)
3975{ 3975{
3976 struct super_block *sb = root->fs_info->sb; 3976 struct super_block *sb = root->fs_info->sb;
3977 3977
@@ -3986,9 +3986,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3986 * the filesystem is readonly (all dirty pages are written to 3986 * the filesystem is readonly (all dirty pages are written to
3987 * the disk). 3987 * the disk).
3988 */ 3988 */
3989 btrfs_start_delalloc_roots(root->fs_info, 0); 3989 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
3990 if (!current->journal_info) 3990 if (!current->journal_info)
3991 btrfs_wait_ordered_roots(root->fs_info, -1); 3991 btrfs_wait_ordered_roots(root->fs_info, nr_items);
3992 } 3992 }
3993} 3993}
3994 3994
@@ -4045,7 +4045,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4045 while (delalloc_bytes && loops < 3) { 4045 while (delalloc_bytes && loops < 3) {
4046 max_reclaim = min(delalloc_bytes, to_reclaim); 4046 max_reclaim = min(delalloc_bytes, to_reclaim);
4047 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4047 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4048 btrfs_writeback_inodes_sb_nr(root, nr_pages); 4048 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4049 /* 4049 /*
4050 * We need to wait for the async pages to actually start before 4050 * We need to wait for the async pages to actually start before
4051 * we do anything. 4051 * we do anything.
@@ -4112,13 +4112,9 @@ static int may_commit_transaction(struct btrfs_root *root,
4112 goto commit; 4112 goto commit;
4113 4113
4114 /* See if there is enough pinned space to make this reservation */ 4114 /* See if there is enough pinned space to make this reservation */
4115 spin_lock(&space_info->lock);
4116 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4115 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4117 bytes) >= 0) { 4116 bytes) >= 0)
4118 spin_unlock(&space_info->lock);
4119 goto commit; 4117 goto commit;
4120 }
4121 spin_unlock(&space_info->lock);
4122 4118
4123 /* 4119 /*
4124 * See if there is some space in the delayed insertion reservation for 4120 * See if there is some space in the delayed insertion reservation for
@@ -4127,16 +4123,13 @@ static int may_commit_transaction(struct btrfs_root *root,
4127 if (space_info != delayed_rsv->space_info) 4123 if (space_info != delayed_rsv->space_info)
4128 return -ENOSPC; 4124 return -ENOSPC;
4129 4125
4130 spin_lock(&space_info->lock);
4131 spin_lock(&delayed_rsv->lock); 4126 spin_lock(&delayed_rsv->lock);
4132 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4127 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4133 bytes - delayed_rsv->size) >= 0) { 4128 bytes - delayed_rsv->size) >= 0) {
4134 spin_unlock(&delayed_rsv->lock); 4129 spin_unlock(&delayed_rsv->lock);
4135 spin_unlock(&space_info->lock);
4136 return -ENOSPC; 4130 return -ENOSPC;
4137 } 4131 }
4138 spin_unlock(&delayed_rsv->lock); 4132 spin_unlock(&delayed_rsv->lock);
4139 spin_unlock(&space_info->lock);
4140 4133
4141commit: 4134commit:
4142 trans = btrfs_join_transaction(root); 4135 trans = btrfs_join_transaction(root);
@@ -4181,7 +4174,7 @@ static int flush_space(struct btrfs_root *root,
4181 break; 4174 break;
4182 case FLUSH_DELALLOC: 4175 case FLUSH_DELALLOC:
4183 case FLUSH_DELALLOC_WAIT: 4176 case FLUSH_DELALLOC_WAIT:
4184 shrink_delalloc(root, num_bytes, orig_bytes, 4177 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4185 state == FLUSH_DELALLOC_WAIT); 4178 state == FLUSH_DELALLOC_WAIT);
4186 break; 4179 break;
4187 case ALLOC_CHUNK: 4180 case ALLOC_CHUNK:
@@ -8938,3 +8931,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8938 range->len = trimmed; 8931 range->len = trimmed;
8939 return ret; 8932 return ret;
8940} 8933}
8934
8935/*
8936 * btrfs_{start,end}_nocow_write() are similar to mnt_{want,drop}_write();
8937 * they are used to prevent tasks from writing data into the page cache
8938 * via nocow before the subvolume is snapshotted, while still flushing the
8939 * data to disk after the snapshot creation.
8940 */
8941void btrfs_end_nocow_write(struct btrfs_root *root)
8942{
8943 percpu_counter_dec(&root->subv_writers->counter);
8944 /*
8945 * Make sure counter is updated before we wake up
8946 * waiters.
8947 */
8948 smp_mb();
8949 if (waitqueue_active(&root->subv_writers->wait))
8950 wake_up(&root->subv_writers->wait);
8951}
8952
8953int btrfs_start_nocow_write(struct btrfs_root *root)
8954{
8955 if (unlikely(atomic_read(&root->will_be_snapshoted)))
8956 return 0;
8957
8958 percpu_counter_inc(&root->subv_writers->counter);
8959 /*
8960 * Make sure counter is updated before we check for snapshot creation.
8961 */
8962 smp_mb();
8963 if (unlikely(atomic_read(&root->will_be_snapshoted))) {
8964 btrfs_end_nocow_write(root);
8965 return 0;
8966 }
8967 return 1;
8968}
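
The snapshot-creation side that pairs with btrfs_start_nocow_write()/btrfs_end_nocow_write() lives in ioctl.c and is not in this section. Presumably it raises will_be_snapshoted and then drains the per-subvolume writer counter before committing the snapshot, along these lines (a sketch under that assumption, not the patch's actual code):

        atomic_inc(&root->will_be_snapshoted);
        smp_mb();       /* pairs with the barrier in btrfs_start_nocow_write() */
        wait_event(root->subv_writers->wait,
                   percpu_counter_sum(&root->subv_writers->counter) == 0);
        /* ... flush delalloc and create the snapshot, then ... */
        atomic_dec(&root->will_be_snapshoted);

Either ordering of the race then resolves safely: a writer that sneaks past the first will_be_snapshoted check bumps the counter before re-checking, so the snapshotter cannot observe a zero counter until that writer has either finished or backed out through btrfs_end_nocow_write().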
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..ae69a00387e7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
229 } 229 }
230} 230}
231 231
232static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 232static struct rb_node *tree_insert(struct rb_root *root,
233 struct rb_node *search_start,
234 u64 offset,
233 struct rb_node *node, 235 struct rb_node *node,
234 struct rb_node ***p_in, 236 struct rb_node ***p_in,
235 struct rb_node **parent_in) 237 struct rb_node **parent_in)
236{ 238{
237 struct rb_node **p = &root->rb_node; 239 struct rb_node **p;
238 struct rb_node *parent = NULL; 240 struct rb_node *parent = NULL;
239 struct tree_entry *entry; 241 struct tree_entry *entry;
240 242
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
244 goto do_insert; 246 goto do_insert;
245 } 247 }
246 248
249 p = search_start ? &search_start : &root->rb_node;
247 while (*p) { 250 while (*p) {
248 parent = *p; 251 parent = *p;
249 entry = rb_entry(parent, struct tree_entry, rb_node); 252 entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
430 433
431 set_state_bits(tree, state, bits); 434 set_state_bits(tree, state, bits);
432 435
433 node = tree_insert(&tree->state, end, &state->rb_node, p, parent); 436 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
434 if (node) { 437 if (node) {
435 struct extent_state *found; 438 struct extent_state *found;
436 found = rb_entry(node, struct extent_state, rb_node); 439 found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
477 prealloc->state = orig->state; 480 prealloc->state = orig->state;
478 orig->start = split; 481 orig->start = split;
479 482
480 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, 483 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
481 NULL, NULL); 484 &prealloc->rb_node, NULL, NULL);
482 if (node) { 485 if (node) {
483 free_extent_state(prealloc); 486 free_extent_state(prealloc);
484 return -EEXIST; 487 return -EEXIST;
@@ -2757,7 +2760,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
2757 2760
2758 if (em_cached && *em_cached) { 2761 if (em_cached && *em_cached) {
2759 em = *em_cached; 2762 em = *em_cached;
2760 if (em->in_tree && start >= em->start && 2763 if (extent_map_in_tree(em) && start >= em->start &&
2761 start < extent_map_end(em)) { 2764 start < extent_map_end(em)) {
2762 atomic_inc(&em->refs); 2765 atomic_inc(&em->refs);
2763 return em; 2766 return em;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..1874aee69c86 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); 51 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
52 if (!em) 52 if (!em)
53 return NULL; 53 return NULL;
54 em->in_tree = 0; 54 RB_CLEAR_NODE(&em->rb_node);
55 em->flags = 0; 55 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE; 56 em->compress_type = BTRFS_COMPRESS_NONE;
57 em->generation = 0; 57 em->generation = 0;
@@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em)
73 return; 73 return;
74 WARN_ON(atomic_read(&em->refs) == 0); 74 WARN_ON(atomic_read(&em->refs) == 0);
75 if (atomic_dec_and_test(&em->refs)) { 75 if (atomic_dec_and_test(&em->refs)) {
76 WARN_ON(em->in_tree); 76 WARN_ON(extent_map_in_tree(em));
77 WARN_ON(!list_empty(&em->list)); 77 WARN_ON(!list_empty(&em->list));
78 kmem_cache_free(extent_map_cache, em); 78 kmem_cache_free(extent_map_cache, em);
79 } 79 }
@@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
99 parent = *p; 99 parent = *p;
100 entry = rb_entry(parent, struct extent_map, rb_node); 100 entry = rb_entry(parent, struct extent_map, rb_node);
101 101
102 WARN_ON(!entry->in_tree);
103
104 if (em->start < entry->start) 102 if (em->start < entry->start)
105 p = &(*p)->rb_left; 103 p = &(*p)->rb_left;
106 else if (em->start >= extent_map_end(entry)) 104 else if (em->start >= extent_map_end(entry))
@@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
128 if (end > entry->start && em->start < extent_map_end(entry)) 126 if (end > entry->start && em->start < extent_map_end(entry))
129 return -EEXIST; 127 return -EEXIST;
130 128
131 em->in_tree = 1;
132 rb_link_node(&em->rb_node, orig_parent, p); 129 rb_link_node(&em->rb_node, orig_parent, p);
133 rb_insert_color(&em->rb_node, root); 130 rb_insert_color(&em->rb_node, root);
134 return 0; 131 return 0;
@@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
153 prev = n; 150 prev = n;
154 prev_entry = entry; 151 prev_entry = entry;
155 152
156 WARN_ON(!entry->in_tree);
157
158 if (offset < entry->start) 153 if (offset < entry->start)
159 n = n->rb_left; 154 n = n->rb_left;
160 else if (offset >= extent_map_end(entry)) 155 else if (offset >= extent_map_end(entry))
@@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
240 em->len += merge->len; 235 em->len += merge->len;
241 em->block_len += merge->block_len; 236 em->block_len += merge->block_len;
242 em->block_start = merge->block_start; 237 em->block_start = merge->block_start;
243 merge->in_tree = 0;
244 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; 238 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
245 em->mod_start = merge->mod_start; 239 em->mod_start = merge->mod_start;
246 em->generation = max(em->generation, merge->generation); 240 em->generation = max(em->generation, merge->generation);
247 241
248 rb_erase(&merge->rb_node, &tree->map); 242 rb_erase(&merge->rb_node, &tree->map);
243 RB_CLEAR_NODE(&merge->rb_node);
249 free_extent_map(merge); 244 free_extent_map(merge);
250 } 245 }
251 } 246 }
@@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
257 em->len += merge->len; 252 em->len += merge->len;
258 em->block_len += merge->block_len; 253 em->block_len += merge->block_len;
259 rb_erase(&merge->rb_node, &tree->map); 254 rb_erase(&merge->rb_node, &tree->map);
260 merge->in_tree = 0; 255 RB_CLEAR_NODE(&merge->rb_node);
261 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; 256 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
262 em->generation = max(em->generation, merge->generation); 257 em->generation = max(em->generation, merge->generation);
263 free_extent_map(merge); 258 free_extent_map(merge);
@@ -319,7 +314,21 @@ out:
319void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) 314void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
320{ 315{
321 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 316 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
322 if (em->in_tree) 317 if (extent_map_in_tree(em))
318 try_merge_map(tree, em);
319}
320
321static inline void setup_extent_mapping(struct extent_map_tree *tree,
322 struct extent_map *em,
323 int modified)
324{
325 atomic_inc(&em->refs);
326 em->mod_start = em->start;
327 em->mod_len = em->len;
328
329 if (modified)
330 list_move(&em->list, &tree->modified_extents);
331 else
323 try_merge_map(tree, em); 332 try_merge_map(tree, em);
324} 333}
325 334
@@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
342 if (ret) 351 if (ret)
343 goto out; 352 goto out;
344 353
345 atomic_inc(&em->refs); 354 setup_extent_mapping(tree, em, modified);
346
347 em->mod_start = em->start;
348 em->mod_len = em->len;
349
350 if (modified)
351 list_move(&em->list, &tree->modified_extents);
352 else
353 try_merge_map(tree, em);
354out: 355out:
355 return ret; 356 return ret;
356} 357}
@@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
434 rb_erase(&em->rb_node, &tree->map); 435 rb_erase(&em->rb_node, &tree->map);
435 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) 436 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
436 list_del_init(&em->list); 437 list_del_init(&em->list);
437 em->in_tree = 0; 438 RB_CLEAR_NODE(&em->rb_node);
438 return ret; 439 return ret;
439} 440}
441
442void replace_extent_mapping(struct extent_map_tree *tree,
443 struct extent_map *cur,
444 struct extent_map *new,
445 int modified)
446{
447 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
448 ASSERT(extent_map_in_tree(cur));
449 if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
450 list_del_init(&cur->list);
451 rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
452 RB_CLEAR_NODE(&cur->rb_node);
453
454 setup_extent_mapping(tree, new, modified);
455}
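
Two rbtree idioms make the in_tree flag removal above work. RB_CLEAR_NODE() parks a node's parent pointer on itself and RB_EMPTY_NODE() tests for that state, so membership needs no side flag; rb_replace_node() swaps one node for another occupying the same key position without an erase/insert pair, so the tree is never rebalanced and never transiently missing the range. A minimal standalone illustration (struct item is hypothetical):

        #include <linux/rbtree.h>

        struct item {
                struct rb_node rb_node;
        };

        static void item_init(struct item *it)
        {
                RB_CLEAR_NODE(&it->rb_node);            /* "not in any tree" */
        }

        static bool item_in_tree(const struct item *it)
        {
                return !RB_EMPTY_NODE(&it->rb_node);
        }

        static void item_swap(struct rb_root *root, struct item *old,
                              struct item *new)
        {
                rb_replace_node(&old->rb_node, &new->rb_node, root);
                RB_CLEAR_NODE(&old->rb_node);           /* old is now out of the tree */
        }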
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f8..e7fd8a56a140 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -33,7 +33,6 @@ struct extent_map {
33 unsigned long flags; 33 unsigned long flags;
34 struct block_device *bdev; 34 struct block_device *bdev;
35 atomic_t refs; 35 atomic_t refs;
36 unsigned int in_tree;
37 unsigned int compress_type; 36 unsigned int compress_type;
38 struct list_head list; 37 struct list_head list;
39}; 38};
@@ -44,6 +43,11 @@ struct extent_map_tree {
44 rwlock_t lock; 43 rwlock_t lock;
45}; 44};
46 45
46static inline int extent_map_in_tree(const struct extent_map *em)
47{
48 return !RB_EMPTY_NODE(&em->rb_node);
49}
50
47static inline u64 extent_map_end(struct extent_map *em) 51static inline u64 extent_map_end(struct extent_map *em)
48{ 52{
49 if (em->start + em->len < em->start) 53 if (em->start + em->len < em->start)
@@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
64int add_extent_mapping(struct extent_map_tree *tree, 68int add_extent_mapping(struct extent_map_tree *tree,
65 struct extent_map *em, int modified); 69 struct extent_map *em, int modified);
66int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); 70int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
71void replace_extent_mapping(struct extent_map_tree *tree,
72 struct extent_map *cur,
73 struct extent_map *new,
74 int modified);
67 75
68struct extent_map *alloc_extent_map(void); 76struct extent_map *alloc_extent_map(void);
69void free_extent_map(struct extent_map *em); 77void free_extent_map(struct extent_map *em);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b8672f09..c660527af838 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -591,7 +591,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
591 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 591 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
592 clear_bit(EXTENT_FLAG_LOGGING, &flags); 592 clear_bit(EXTENT_FLAG_LOGGING, &flags);
593 modified = !list_empty(&em->list); 593 modified = !list_empty(&em->list);
594 remove_extent_mapping(em_tree, em);
595 if (no_splits) 594 if (no_splits)
596 goto next; 595 goto next;
597 596
@@ -622,8 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
622 split->bdev = em->bdev; 621 split->bdev = em->bdev;
623 split->flags = flags; 622 split->flags = flags;
624 split->compress_type = em->compress_type; 623 split->compress_type = em->compress_type;
625 ret = add_extent_mapping(em_tree, split, modified); 624 replace_extent_mapping(em_tree, em, split, modified);
626 BUG_ON(ret); /* Logic error */
627 free_extent_map(split); 625 free_extent_map(split);
628 split = split2; 626 split = split2;
629 split2 = NULL; 627 split2 = NULL;
@@ -661,12 +659,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
661 split->orig_block_len = 0; 659 split->orig_block_len = 0;
662 } 660 }
663 661
664 ret = add_extent_mapping(em_tree, split, modified); 662 if (extent_map_in_tree(em)) {
665 BUG_ON(ret); /* Logic error */ 663 replace_extent_mapping(em_tree, em, split,
664 modified);
665 } else {
666 ret = add_extent_mapping(em_tree, split,
667 modified);
668 ASSERT(ret == 0); /* Logic error */
669 }
666 free_extent_map(split); 670 free_extent_map(split);
667 split = NULL; 671 split = NULL;
668 } 672 }
669next: 673next:
674 if (extent_map_in_tree(em))
675 remove_extent_mapping(em_tree, em);
670 write_unlock(&em_tree->lock); 676 write_unlock(&em_tree->lock);
671 677
672 /* once for us */ 678 /* once for us */
@@ -720,7 +726,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
720 if (drop_cache) 726 if (drop_cache)
721 btrfs_drop_extent_cache(inode, start, end - 1, 0); 727 btrfs_drop_extent_cache(inode, start, end - 1, 0);
722 728
723 if (start >= BTRFS_I(inode)->disk_i_size) 729 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
724 modify_tree = 0; 730 modify_tree = 0;
725 731
726 while (1) { 732 while (1) {
@@ -798,7 +804,10 @@ next_slot:
798 */ 804 */
799 if (start > key.offset && end < extent_end) { 805 if (start > key.offset && end < extent_end) {
800 BUG_ON(del_nr > 0); 806 BUG_ON(del_nr > 0);
801 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 807 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
808 ret = -EINVAL;
809 break;
810 }
802 811
803 memcpy(&new_key, &key, sizeof(new_key)); 812 memcpy(&new_key, &key, sizeof(new_key));
804 new_key.offset = start; 813 new_key.offset = start;
@@ -841,7 +850,10 @@ next_slot:
841 * | -------- extent -------- | 850 * | -------- extent -------- |
842 */ 851 */
843 if (start <= key.offset && end < extent_end) { 852 if (start <= key.offset && end < extent_end) {
844 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 853 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
854 ret = -EINVAL;
855 break;
856 }
845 857
846 memcpy(&new_key, &key, sizeof(new_key)); 858 memcpy(&new_key, &key, sizeof(new_key));
847 new_key.offset = end; 859 new_key.offset = end;
@@ -864,7 +876,10 @@ next_slot:
864 */ 876 */
865 if (start > key.offset && end >= extent_end) { 877 if (start > key.offset && end >= extent_end) {
866 BUG_ON(del_nr > 0); 878 BUG_ON(del_nr > 0);
867 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 879 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
880 ret = -EINVAL;
881 break;
882 }
868 883
869 btrfs_set_file_extent_num_bytes(leaf, fi, 884 btrfs_set_file_extent_num_bytes(leaf, fi,
870 start - key.offset); 885 start - key.offset);
@@ -938,34 +953,42 @@ next_slot:
938 * Set path->slots[0] to first slot, so that after the delete 953 * Set path->slots[0] to first slot, so that after the delete
939 * if items are moved off from our leaf to its immediate left or 954 * if items are moved off from our leaf to its immediate left or
940 * right neighbor leaves, we end up with a correct and adjusted 955 * right neighbor leaves, we end up with a correct and adjusted
941 * path->slots[0] for our insertion. 956 * path->slots[0] for our insertion (if replace_extent != 0).
942 */ 957 */
943 path->slots[0] = del_slot; 958 path->slots[0] = del_slot;
944 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 959 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
945 if (ret) 960 if (ret)
946 btrfs_abort_transaction(trans, root, ret); 961 btrfs_abort_transaction(trans, root, ret);
962 }
947 963
948 leaf = path->nodes[0]; 964 leaf = path->nodes[0];
949 /* 965 /*
950 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that 966 * If btrfs_del_items() was called, it might have deleted a leaf, in
951 * is, its contents got pushed to its neighbors), in which case 967 * which case it unlocked our path, so check path->locks[0] matches a
952 * it means path->locks[0] == 0 968 * write lock.
953 */ 969 */
954 if (!ret && replace_extent && leafs_visited == 1 && 970 if (!ret && replace_extent && leafs_visited == 1 &&
955 path->locks[0] && 971 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
956 btrfs_leaf_free_space(root, leaf) >= 972 path->locks[0] == BTRFS_WRITE_LOCK) &&
957 sizeof(struct btrfs_item) + extent_item_size) { 973 btrfs_leaf_free_space(root, leaf) >=
958 974 sizeof(struct btrfs_item) + extent_item_size) {
959 key.objectid = ino; 975
960 key.type = BTRFS_EXTENT_DATA_KEY; 976 key.objectid = ino;
961 key.offset = start; 977 key.type = BTRFS_EXTENT_DATA_KEY;
962 setup_items_for_insert(root, path, &key, 978 key.offset = start;
963 &extent_item_size, 979 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
964 extent_item_size, 980 struct btrfs_key slot_key;
965 sizeof(struct btrfs_item) + 981
966 extent_item_size, 1); 982 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
967 *key_inserted = 1; 983 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
984 path->slots[0]++;
968 } 985 }
986 setup_items_for_insert(root, path, &key,
987 &extent_item_size,
988 extent_item_size,
989 sizeof(struct btrfs_item) +
990 extent_item_size, 1);
991 *key_inserted = 1;
969 } 992 }
970 993
971 if (!replace_extent || !(*key_inserted)) 994 if (!replace_extent || !(*key_inserted))
@@ -1346,11 +1369,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1346 struct btrfs_ordered_extent *ordered; 1369 struct btrfs_ordered_extent *ordered;
1347 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1370 lock_extent_bits(&BTRFS_I(inode)->io_tree,
1348 start_pos, last_pos, 0, cached_state); 1371 start_pos, last_pos, 0, cached_state);
1349 ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); 1372 ordered = btrfs_lookup_ordered_range(inode, start_pos,
1373 last_pos - start_pos + 1);
1350 if (ordered && 1374 if (ordered &&
1351 ordered->file_offset + ordered->len > start_pos && 1375 ordered->file_offset + ordered->len > start_pos &&
1352 ordered->file_offset <= last_pos) { 1376 ordered->file_offset <= last_pos) {
1353 btrfs_put_ordered_extent(ordered);
1354 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1377 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1355 start_pos, last_pos, 1378 start_pos, last_pos,
1356 cached_state, GFP_NOFS); 1379 cached_state, GFP_NOFS);
@@ -1358,12 +1381,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1358 unlock_page(pages[i]); 1381 unlock_page(pages[i]);
1359 page_cache_release(pages[i]); 1382 page_cache_release(pages[i]);
1360 } 1383 }
1361 ret = btrfs_wait_ordered_range(inode, start_pos, 1384 btrfs_start_ordered_extent(inode, ordered, 1);
1362 last_pos - start_pos + 1); 1385 btrfs_put_ordered_extent(ordered);
1363 if (ret) 1386 return -EAGAIN;
1364 return ret;
1365 else
1366 return -EAGAIN;
1367 } 1387 }
1368 if (ordered) 1388 if (ordered)
1369 btrfs_put_ordered_extent(ordered); 1389 btrfs_put_ordered_extent(ordered);
@@ -1396,8 +1416,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1396 u64 num_bytes; 1416 u64 num_bytes;
1397 int ret; 1417 int ret;
1398 1418
1419 ret = btrfs_start_nocow_write(root);
1420 if (!ret)
1421 return -ENOSPC;
1422
1399 lockstart = round_down(pos, root->sectorsize); 1423 lockstart = round_down(pos, root->sectorsize);
1400 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; 1424 lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
1401 1425
1402 while (1) { 1426 while (1) {
1403 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1427 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1439,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1415 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1439 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1416 if (ret <= 0) { 1440 if (ret <= 0) {
1417 ret = 0; 1441 ret = 0;
1442 btrfs_end_nocow_write(root);
1418 } else { 1443 } else {
1419 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1444 *write_bytes = min_t(size_t, *write_bytes,
1420 EXTENT_DIRTY | EXTENT_DELALLOC | 1445 num_bytes - pos + lockstart);
1421 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1422 NULL, GFP_NOFS);
1423 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1424 } 1446 }
1425 1447
1426 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1448 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
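
The new clamp reads subtly: can_nocow_extent() returns in num_bytes how many bytes starting at lockstart can be written nocow, while the caller's write starts at pos, which is pos - lockstart bytes into that range. The usable length is therefore num_bytes - (pos - lockstart), i.e. num_bytes - pos + lockstart. For example, with a 4096-byte sectorsize, pos = 6144 rounds down to lockstart = 4096; if num_bytes = 8192, at most 8192 - (6144 - 4096) = 6144 bytes may be written without cow.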
@@ -1510,6 +1532,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1510 if (!only_release_metadata) 1532 if (!only_release_metadata)
1511 btrfs_free_reserved_data_space(inode, 1533 btrfs_free_reserved_data_space(inode,
1512 reserve_bytes); 1534 reserve_bytes);
1535 else
1536 btrfs_end_nocow_write(root);
1513 break; 1537 break;
1514 } 1538 }
1515 1539
@@ -1598,6 +1622,9 @@ again:
1598 } 1622 }
1599 1623
1600 release_bytes = 0; 1624 release_bytes = 0;
1625 if (only_release_metadata)
1626 btrfs_end_nocow_write(root);
1627
1601 if (only_release_metadata && copied > 0) { 1628 if (only_release_metadata && copied > 0) {
1602 u64 lockstart = round_down(pos, root->sectorsize); 1629 u64 lockstart = round_down(pos, root->sectorsize);
1603 u64 lockend = lockstart + 1630 u64 lockend = lockstart +
@@ -1624,10 +1651,12 @@ again:
1624 kfree(pages); 1651 kfree(pages);
1625 1652
1626 if (release_bytes) { 1653 if (release_bytes) {
1627 if (only_release_metadata) 1654 if (only_release_metadata) {
1655 btrfs_end_nocow_write(root);
1628 btrfs_delalloc_release_metadata(inode, release_bytes); 1656 btrfs_delalloc_release_metadata(inode, release_bytes);
1629 else 1657 } else {
1630 btrfs_delalloc_release_space(inode, release_bytes); 1658 btrfs_delalloc_release_space(inode, release_bytes);
1659 }
1631 } 1660 }
1632 1661
1633 return num_written ? num_written : ret; 1662 return num_written ? num_written : ret;
@@ -1797,7 +1826,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1797 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1826 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1798 if (num_written > 0) { 1827 if (num_written > 0) {
1799 err = generic_write_sync(file, pos, num_written); 1828 err = generic_write_sync(file, pos, num_written);
1800 if (err < 0 && num_written > 0) 1829 if (err < 0)
1801 num_written = err; 1830 num_written = err;
1802 } 1831 }
1803 1832
@@ -1856,8 +1885,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1856 struct dentry *dentry = file->f_path.dentry; 1885 struct dentry *dentry = file->f_path.dentry;
1857 struct inode *inode = dentry->d_inode; 1886 struct inode *inode = dentry->d_inode;
1858 struct btrfs_root *root = BTRFS_I(inode)->root; 1887 struct btrfs_root *root = BTRFS_I(inode)->root;
1859 int ret = 0;
1860 struct btrfs_trans_handle *trans; 1888 struct btrfs_trans_handle *trans;
1889 struct btrfs_log_ctx ctx;
1890 int ret = 0;
1861 bool full_sync = 0; 1891 bool full_sync = 0;
1862 1892
1863 trace_btrfs_sync_file(file, datasync); 1893 trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1981,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1951 } 1981 }
1952 trans->sync = true; 1982 trans->sync = true;
1953 1983
1954 ret = btrfs_log_dentry_safe(trans, root, dentry); 1984 btrfs_init_log_ctx(&ctx);
1985
1986 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
1955 if (ret < 0) { 1987 if (ret < 0) {
1956 /* Fallthrough and commit/free transaction. */ 1988 /* Fallthrough and commit/free transaction. */
1957 ret = 1; 1989 ret = 1;
@@ -1971,7 +2003,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1971 2003
1972 if (ret != BTRFS_NO_LOG_SYNC) { 2004 if (ret != BTRFS_NO_LOG_SYNC) {
1973 if (!ret) { 2005 if (!ret) {
1974 ret = btrfs_sync_log(trans, root); 2006 ret = btrfs_sync_log(trans, root, &ctx);
1975 if (!ret) { 2007 if (!ret) {
1976 ret = btrfs_end_transaction(trans, root); 2008 ret = btrfs_end_transaction(trans, root);
1977 goto out; 2009 goto out;
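
struct btrfs_log_ctx and btrfs_init_log_ctx() are introduced in tree-log.h (visible in the diffstat) and only their users appear in this section. A plausible minimal definition consistent with these call sites — one context per fsync waiter, queued on root->log_ctxs[] so each waiter can collect the result of the log sub-transaction it joined (a sketch, not necessarily the actual header):

        struct btrfs_log_ctx {
                int log_ret;            /* result propagated back to this waiter */
                int log_transid;        /* log sub-transaction the waiter joined */
                struct list_head list;  /* linked into root->log_ctxs[] */
        };

        static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
        {
                ctx->log_ret = 0;
                ctx->log_transid = 0;
                INIT_LIST_HEAD(&ctx->list);
        }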
@@ -1993,6 +2025,7 @@ out:
1993 2025
1994static const struct vm_operations_struct btrfs_file_vm_ops = { 2026static const struct vm_operations_struct btrfs_file_vm_ops = {
1995 .fault = filemap_fault, 2027 .fault = filemap_fault,
2028 .map_pages = filemap_map_pages,
1996 .page_mkwrite = btrfs_page_mkwrite, 2029 .page_mkwrite = btrfs_page_mkwrite,
1997 .remap_pages = generic_file_remap_pages, 2030 .remap_pages = generic_file_remap_pages,
1998}; 2031};
@@ -2157,6 +2190,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2157 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2190 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2158 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2191 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2159 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2192 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2193 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2160 2194
2161 ret = btrfs_wait_ordered_range(inode, offset, len); 2195 ret = btrfs_wait_ordered_range(inode, offset, len);
2162 if (ret) 2196 if (ret)
@@ -2172,14 +2206,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2172 * entire page. 2206 * entire page.
2173 */ 2207 */
2174 if (same_page && len < PAGE_CACHE_SIZE) { 2208 if (same_page && len < PAGE_CACHE_SIZE) {
2175 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) 2209 if (offset < ino_size)
2176 ret = btrfs_truncate_page(inode, offset, len, 0); 2210 ret = btrfs_truncate_page(inode, offset, len, 0);
2177 mutex_unlock(&inode->i_mutex); 2211 mutex_unlock(&inode->i_mutex);
2178 return ret; 2212 return ret;
2179 } 2213 }
2180 2214
2181 /* zero back part of the first page */ 2215 /* zero back part of the first page */
2182 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2216 if (offset < ino_size) {
2183 ret = btrfs_truncate_page(inode, offset, 0, 0); 2217 ret = btrfs_truncate_page(inode, offset, 0, 0);
2184 if (ret) { 2218 if (ret) {
2185 mutex_unlock(&inode->i_mutex); 2219 mutex_unlock(&inode->i_mutex);
@@ -2188,7 +2222,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2188 } 2222 }
2189 2223
2190 /* zero the front end of the last page */ 2224 /* zero the front end of the last page */
2191 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2225 if (offset + len < ino_size) {
2192 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2226 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2193 if (ret) { 2227 if (ret) {
2194 mutex_unlock(&inode->i_mutex); 2228 mutex_unlock(&inode->i_mutex);
@@ -2277,10 +2311,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2277 2311
2278 trans->block_rsv = &root->fs_info->trans_block_rsv; 2312 trans->block_rsv = &root->fs_info->trans_block_rsv;
2279 2313
2280 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2314 if (cur_offset < ino_size) {
2281 if (ret) { 2315 ret = fill_holes(trans, inode, path, cur_offset,
2282 err = ret; 2316 drop_end);
2283 break; 2317 if (ret) {
2318 err = ret;
2319 break;
2320 }
2284 } 2321 }
2285 2322
2286 cur_offset = drop_end; 2323 cur_offset = drop_end;
@@ -2313,10 +2350,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2313 } 2350 }
2314 2351
2315 trans->block_rsv = &root->fs_info->trans_block_rsv; 2352 trans->block_rsv = &root->fs_info->trans_block_rsv;
2316 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2353 if (cur_offset < ino_size) {
2317 if (ret) { 2354 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2318 err = ret; 2355 if (ret) {
2319 goto out_trans; 2356 err = ret;
2357 goto out_trans;
2358 }
2320 } 2359 }
2321 2360
2322out_trans: 2361out_trans:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3d44486290b..06e9a4152b14 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -864,7 +864,8 @@ static noinline int cow_file_range(struct inode *inode,
864 864
865 if (btrfs_is_free_space_inode(inode)) { 865 if (btrfs_is_free_space_inode(inode)) {
866 WARN_ON_ONCE(1); 866 WARN_ON_ONCE(1);
867 return -EINVAL; 867 ret = -EINVAL;
868 goto out_unlock;
868 } 869 }
869 870
870 num_bytes = ALIGN(end - start + 1, blocksize); 871 num_bytes = ALIGN(end - start + 1, blocksize);
@@ -1075,17 +1076,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1075 async_cow->end = cur_end; 1076 async_cow->end = cur_end;
1076 INIT_LIST_HEAD(&async_cow->extents); 1077 INIT_LIST_HEAD(&async_cow->extents);
1077 1078
1078 async_cow->work.func = async_cow_start; 1079 btrfs_init_work(&async_cow->work, async_cow_start,
1079 async_cow->work.ordered_func = async_cow_submit; 1080 async_cow_submit, async_cow_free);
1080 async_cow->work.ordered_free = async_cow_free;
1081 async_cow->work.flags = 0;
1082 1081
1083 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1082 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1084 PAGE_CACHE_SHIFT; 1083 PAGE_CACHE_SHIFT;
1085 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1084 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1086 1085
1087 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1086 btrfs_queue_work(root->fs_info->delalloc_workers,
1088 &async_cow->work); 1087 &async_cow->work);
1089 1088
1090 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1089 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1091 wait_event(root->fs_info->async_submit_wait, 1090 wait_event(root->fs_info->async_submit_wait,
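
Annotation: this hunk is one instance of a conversion repeated throughout the series: open-coded btrfs_work setup is replaced by btrfs_init_work(), and btrfs_queue_worker() on an embedded struct btrfs_workers by btrfs_queue_work() on a btrfs_workqueue pointer. A minimal before/after sketch (field and helper names are taken from the added and removed lines; nothing else is implied):

    /* before: callers filled in struct btrfs_work by hand */
    work->func         = async_cow_start;   /* main work function */
    work->ordered_func = async_cow_submit;  /* runs in queue order */
    work->ordered_free = async_cow_free;    /* frees the item */
    work->flags        = 0;
    btrfs_queue_worker(&fs_info->delalloc_workers, work);

    /* after: a single initializer; the queue is now a pointer */
    btrfs_init_work(work, async_cow_start, async_cow_submit,
                    async_cow_free);
    btrfs_queue_work(fs_info->delalloc_workers, work);
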
@@ -1843,9 +1842,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1843 1842
1844 SetPageChecked(page); 1843 SetPageChecked(page);
1845 page_cache_get(page); 1844 page_cache_get(page);
1846 fixup->work.func = btrfs_writepage_fixup_worker; 1845 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
1847 fixup->page = page; 1846 fixup->page = page;
1848 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1847 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1849 return -EBUSY; 1848 return -EBUSY;
1850} 1849}
1851 1850
@@ -2239,6 +2238,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2239 return PTR_ERR(root); 2238 return PTR_ERR(root);
2240 } 2239 }
2241 2240
2241 if (btrfs_root_readonly(root)) {
2242 srcu_read_unlock(&fs_info->subvol_srcu, index);
2243 return 0;
2244 }
2245
2242 /* step 2: get inode */ 2246 /* step 2: get inode */
2243 key.objectid = backref->inum; 2247 key.objectid = backref->inum;
2244 key.type = BTRFS_INODE_ITEM_KEY; 2248 key.type = BTRFS_INODE_ITEM_KEY;
@@ -2759,7 +2763,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2759 struct inode *inode = page->mapping->host; 2763 struct inode *inode = page->mapping->host;
2760 struct btrfs_root *root = BTRFS_I(inode)->root; 2764 struct btrfs_root *root = BTRFS_I(inode)->root;
2761 struct btrfs_ordered_extent *ordered_extent = NULL; 2765 struct btrfs_ordered_extent *ordered_extent = NULL;
2762 struct btrfs_workers *workers; 2766 struct btrfs_workqueue *workers;
2763 2767
2764 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2768 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2765 2769
@@ -2768,14 +2772,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2768 end - start + 1, uptodate)) 2772 end - start + 1, uptodate))
2769 return 0; 2773 return 0;
2770 2774
2771 ordered_extent->work.func = finish_ordered_fn; 2775 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
2772 ordered_extent->work.flags = 0;
2773 2776
2774 if (btrfs_is_free_space_inode(inode)) 2777 if (btrfs_is_free_space_inode(inode))
2775 workers = &root->fs_info->endio_freespace_worker; 2778 workers = root->fs_info->endio_freespace_worker;
2776 else 2779 else
2777 workers = &root->fs_info->endio_write_workers; 2780 workers = root->fs_info->endio_write_workers;
2778 btrfs_queue_worker(workers, &ordered_extent->work); 2781 btrfs_queue_work(workers, &ordered_extent->work);
2779 2782
2780 return 0; 2783 return 0;
2781} 2784}
@@ -4593,7 +4596,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
4593 struct rb_node *node; 4596 struct rb_node *node;
4594 4597
4595 ASSERT(inode->i_state & I_FREEING); 4598 ASSERT(inode->i_state & I_FREEING);
4596 truncate_inode_pages(&inode->i_data, 0); 4599 truncate_inode_pages_final(&inode->i_data);
4597 4600
4598 write_lock(&map_tree->lock); 4601 write_lock(&map_tree->lock);
4599 while (!RB_EMPTY_ROOT(&map_tree->map)) { 4602 while (!RB_EMPTY_ROOT(&map_tree->map)) {
@@ -4924,7 +4927,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
4924 struct inode *inode; 4927 struct inode *inode;
4925 u64 objectid = 0; 4928 u64 objectid = 0;
4926 4929
4927 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4930 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
4931 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4928 4932
4929 spin_lock(&root->inode_lock); 4933 spin_lock(&root->inode_lock);
4930again: 4934again:
@@ -5799,6 +5803,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5799 } 5803 }
5800out_unlock: 5804out_unlock:
5801 btrfs_end_transaction(trans, root); 5805 btrfs_end_transaction(trans, root);
5806 btrfs_balance_delayed_items(root);
5802 btrfs_btree_balance_dirty(root); 5807 btrfs_btree_balance_dirty(root);
5803 if (drop_inode) { 5808 if (drop_inode) {
5804 inode_dec_link_count(inode); 5809 inode_dec_link_count(inode);
@@ -5872,6 +5877,7 @@ out_unlock:
5872 inode_dec_link_count(inode); 5877 inode_dec_link_count(inode);
5873 iput(inode); 5878 iput(inode);
5874 } 5879 }
5880 btrfs_balance_delayed_items(root);
5875 btrfs_btree_balance_dirty(root); 5881 btrfs_btree_balance_dirty(root);
5876 return err; 5882 return err;
5877} 5883}
@@ -5930,6 +5936,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5930 } 5936 }
5931 5937
5932 btrfs_end_transaction(trans, root); 5938 btrfs_end_transaction(trans, root);
5939 btrfs_balance_delayed_items(root);
5933fail: 5940fail:
5934 if (drop_inode) { 5941 if (drop_inode) {
5935 inode_dec_link_count(inode); 5942 inode_dec_link_count(inode);
@@ -5996,6 +6003,7 @@ out_fail:
5996 btrfs_end_transaction(trans, root); 6003 btrfs_end_transaction(trans, root);
5997 if (drop_on_err) 6004 if (drop_on_err)
5998 iput(inode); 6005 iput(inode);
6006 btrfs_balance_delayed_items(root);
5999 btrfs_btree_balance_dirty(root); 6007 btrfs_btree_balance_dirty(root);
6000 return err; 6008 return err;
6001} 6009}
@@ -6550,6 +6558,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6550 int ret; 6558 int ret;
6551 struct extent_buffer *leaf; 6559 struct extent_buffer *leaf;
6552 struct btrfs_root *root = BTRFS_I(inode)->root; 6560 struct btrfs_root *root = BTRFS_I(inode)->root;
6561 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6553 struct btrfs_file_extent_item *fi; 6562 struct btrfs_file_extent_item *fi;
6554 struct btrfs_key key; 6563 struct btrfs_key key;
6555 u64 disk_bytenr; 6564 u64 disk_bytenr;
@@ -6626,6 +6635,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6626 6635
6627 if (btrfs_extent_readonly(root, disk_bytenr)) 6636 if (btrfs_extent_readonly(root, disk_bytenr))
6628 goto out; 6637 goto out;
6638
6639 num_bytes = min(offset + *len, extent_end) - offset;
6640 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6641 u64 range_end;
6642
6643 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6644 ret = test_range_bit(io_tree, offset, range_end,
6645 EXTENT_DELALLOC, 0, NULL);
6646 if (ret) {
6647 ret = -EAGAIN;
6648 goto out;
6649 }
6650 }
6651
6629 btrfs_release_path(path); 6652 btrfs_release_path(path);
6630 6653
6631 /* 6654 /*
@@ -6654,7 +6677,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6654 */ 6677 */
6655 disk_bytenr += backref_offset; 6678 disk_bytenr += backref_offset;
6656 disk_bytenr += offset - key.offset; 6679 disk_bytenr += offset - key.offset;
6657 num_bytes = min(offset + *len, extent_end) - offset;
6658 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 6680 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6659 goto out; 6681 goto out;
6660 /* 6682 /*
@@ -7024,10 +7046,9 @@ again:
7024 if (!ret) 7046 if (!ret)
7025 goto out_test; 7047 goto out_test;
7026 7048
7027 ordered->work.func = finish_ordered_fn; 7049 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
7028 ordered->work.flags = 0; 7050 btrfs_queue_work(root->fs_info->endio_write_workers,
7029 btrfs_queue_worker(&root->fs_info->endio_write_workers, 7051 &ordered->work);
7030 &ordered->work);
7031out_test: 7052out_test:
7032 /* 7053 /*
7033 * our bio might span multiple ordered extents. If we haven't 7054 * our bio might span multiple ordered extents. If we haven't
@@ -7404,15 +7425,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7404 smp_mb__after_atomic_inc(); 7425 smp_mb__after_atomic_inc();
7405 7426
7406 /* 7427 /*
7407 * The generic stuff only does filemap_write_and_wait_range, which isn't 7428 * The generic stuff only does filemap_write_and_wait_range, which
7408 * enough if we've written compressed pages to this area, so we need to 7429 * isn't enough if we've written compressed pages to this area, so
7409 * call btrfs_wait_ordered_range to make absolutely sure that any 7430 * we need to flush the dirty pages again to make absolutely sure
7410 * outstanding dirty pages are on disk. 7431 * that any outstanding dirty pages are on disk.
7411 */ 7432 */
7412 count = iov_length(iov, nr_segs); 7433 count = iov_length(iov, nr_segs);
7413 ret = btrfs_wait_ordered_range(inode, offset, count); 7434 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7414 if (ret) 7435 &BTRFS_I(inode)->runtime_flags))
7415 return ret; 7436 filemap_fdatawrite_range(inode->i_mapping, offset, count);
7416 7437
7417 if (rw & WRITE) { 7438 if (rw & WRITE) {
7418 /* 7439 /*
@@ -8404,7 +8425,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8404 work->inode = inode; 8425 work->inode = inode;
8405 work->wait = wait; 8426 work->wait = wait;
8406 work->delay_iput = delay_iput; 8427 work->delay_iput = delay_iput;
8407 work->work.func = btrfs_run_delalloc_work; 8428 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
8408 8429
8409 return work; 8430 return work;
8410} 8431}
@@ -8419,7 +8440,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8419 * some fairly slow code that needs optimization. This walks the list 8440 * some fairly slow code that needs optimization. This walks the list
8420 * of all the inodes with pending delalloc and forces them to disk. 8441 * of all the inodes with pending delalloc and forces them to disk.
8421 */ 8442 */
8422static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8443static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8444 int nr)
8423{ 8445{
8424 struct btrfs_inode *binode; 8446 struct btrfs_inode *binode;
8425 struct inode *inode; 8447 struct inode *inode;
@@ -8431,6 +8453,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8431 INIT_LIST_HEAD(&works); 8453 INIT_LIST_HEAD(&works);
8432 INIT_LIST_HEAD(&splice); 8454 INIT_LIST_HEAD(&splice);
8433 8455
8456 mutex_lock(&root->delalloc_mutex);
8434 spin_lock(&root->delalloc_lock); 8457 spin_lock(&root->delalloc_lock);
8435 list_splice_init(&root->delalloc_inodes, &splice); 8458 list_splice_init(&root->delalloc_inodes, &splice);
8436 while (!list_empty(&splice)) { 8459 while (!list_empty(&splice)) {
@@ -8453,12 +8476,14 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8453 else 8476 else
8454 iput(inode); 8477 iput(inode);
8455 ret = -ENOMEM; 8478 ret = -ENOMEM;
8456 goto out; 8479 break;
8457 } 8480 }
8458 list_add_tail(&work->list, &works); 8481 list_add_tail(&work->list, &works);
8459 btrfs_queue_worker(&root->fs_info->flush_workers, 8482 btrfs_queue_work(root->fs_info->flush_workers,
8460 &work->work); 8483 &work->work);
8461 8484 ret++;
8485 if (nr != -1 && ret >= nr)
8486 break;
8462 cond_resched(); 8487 cond_resched();
8463 spin_lock(&root->delalloc_lock); 8488 spin_lock(&root->delalloc_lock);
8464 } 8489 }
@@ -8468,18 +8493,13 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8468 list_del_init(&work->list); 8493 list_del_init(&work->list);
8469 btrfs_wait_and_free_delalloc_work(work); 8494 btrfs_wait_and_free_delalloc_work(work);
8470 } 8495 }
8471 return 0;
8472out:
8473 list_for_each_entry_safe(work, next, &works, list) {
8474 list_del_init(&work->list);
8475 btrfs_wait_and_free_delalloc_work(work);
8476 }
8477 8496
8478 if (!list_empty_careful(&splice)) { 8497 if (!list_empty_careful(&splice)) {
8479 spin_lock(&root->delalloc_lock); 8498 spin_lock(&root->delalloc_lock);
8480 list_splice_tail(&splice, &root->delalloc_inodes); 8499 list_splice_tail(&splice, &root->delalloc_inodes);
8481 spin_unlock(&root->delalloc_lock); 8500 spin_unlock(&root->delalloc_lock);
8482 } 8501 }
8502 mutex_unlock(&root->delalloc_mutex);
8483 return ret; 8503 return ret;
8484} 8504}
8485 8505
@@ -8490,7 +8510,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8490 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 8510 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8491 return -EROFS; 8511 return -EROFS;
8492 8512
8493 ret = __start_delalloc_inodes(root, delay_iput); 8513 ret = __start_delalloc_inodes(root, delay_iput, -1);
8514 if (ret > 0)
8515 ret = 0;
8494 /* 8516 /*
8495 * the filemap_flush will queue IO into the worker threads, but 8517 * the filemap_flush will queue IO into the worker threads, but
8496 * we have to make sure the IO is actually started and that 8518 * we have to make sure the IO is actually started and that
@@ -8507,7 +8529,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8507 return ret; 8529 return ret;
8508} 8530}
8509 8531
8510int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) 8532int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8533 int nr)
8511{ 8534{
8512 struct btrfs_root *root; 8535 struct btrfs_root *root;
8513 struct list_head splice; 8536 struct list_head splice;
@@ -8518,9 +8541,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8518 8541
8519 INIT_LIST_HEAD(&splice); 8542 INIT_LIST_HEAD(&splice);
8520 8543
8544 mutex_lock(&fs_info->delalloc_root_mutex);
8521 spin_lock(&fs_info->delalloc_root_lock); 8545 spin_lock(&fs_info->delalloc_root_lock);
8522 list_splice_init(&fs_info->delalloc_roots, &splice); 8546 list_splice_init(&fs_info->delalloc_roots, &splice);
8523 while (!list_empty(&splice)) { 8547 while (!list_empty(&splice) && nr) {
8524 root = list_first_entry(&splice, struct btrfs_root, 8548 root = list_first_entry(&splice, struct btrfs_root,
8525 delalloc_root); 8549 delalloc_root);
8526 root = btrfs_grab_fs_root(root); 8550 root = btrfs_grab_fs_root(root);
@@ -8529,15 +8553,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8529 &fs_info->delalloc_roots); 8553 &fs_info->delalloc_roots);
8530 spin_unlock(&fs_info->delalloc_root_lock); 8554 spin_unlock(&fs_info->delalloc_root_lock);
8531 8555
8532 ret = __start_delalloc_inodes(root, delay_iput); 8556 ret = __start_delalloc_inodes(root, delay_iput, nr);
8533 btrfs_put_fs_root(root); 8557 btrfs_put_fs_root(root);
8534 if (ret) 8558 if (ret < 0)
8535 goto out; 8559 goto out;
8536 8560
8561 if (nr != -1) {
8562 nr -= ret;
8563 WARN_ON(nr < 0);
8564 }
8537 spin_lock(&fs_info->delalloc_root_lock); 8565 spin_lock(&fs_info->delalloc_root_lock);
8538 } 8566 }
8539 spin_unlock(&fs_info->delalloc_root_lock); 8567 spin_unlock(&fs_info->delalloc_root_lock);
8540 8568
8569 ret = 0;
8541 atomic_inc(&fs_info->async_submit_draining); 8570 atomic_inc(&fs_info->async_submit_draining);
8542 while (atomic_read(&fs_info->nr_async_submits) || 8571 while (atomic_read(&fs_info->nr_async_submits) ||
8543 atomic_read(&fs_info->async_delalloc_pages)) { 8572 atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8575,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8546 atomic_read(&fs_info->async_delalloc_pages) == 0)); 8575 atomic_read(&fs_info->async_delalloc_pages) == 0));
8547 } 8576 }
8548 atomic_dec(&fs_info->async_submit_draining); 8577 atomic_dec(&fs_info->async_submit_draining);
8549 return 0;
8550out: 8578out:
8551 if (!list_empty_careful(&splice)) { 8579 if (!list_empty_careful(&splice)) {
8552 spin_lock(&fs_info->delalloc_root_lock); 8580 spin_lock(&fs_info->delalloc_root_lock);
8553 list_splice_tail(&splice, &fs_info->delalloc_roots); 8581 list_splice_tail(&splice, &fs_info->delalloc_roots);
8554 spin_unlock(&fs_info->delalloc_root_lock); 8582 spin_unlock(&fs_info->delalloc_root_lock);
8555 } 8583 }
8584 mutex_unlock(&fs_info->delalloc_root_mutex);
8556 return ret; 8585 return ret;
8557} 8586}
8558 8587
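
Annotation: the new nr argument bounds how many delalloc inodes get flushed: -1 keeps the old flush-everything behaviour, and the per-root return value counts the inodes queued so a caller can decrement its budget. A usage sketch (the bounded-flush caller is an assumption based on this interface; only the -1 form appears later in this diff):

    /* flush everything, as BTRFS_IOC_SYNC and relocation do below */
    ret = btrfs_start_delalloc_roots(fs_info, 0, -1);

    /* flush at most nr_items inodes, e.g. to relieve ENOSPC pressure */
    ret = btrfs_start_delalloc_roots(fs_info, 0, nr_items);
    if (ret < 0)
            return ret;
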
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a6d8efa46bfe..0401397b5c92 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,32 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61 61
62#ifdef CONFIG_64BIT
63/* If we have a 32-bit userspace and a 64-bit kernel, then the UAPI
64 * structure sizes disagree: the timespec structure coming from a
65 * 32-bit userspace is 4 bytes too small. We define these alternatives
66 * here to teach the kernel about the 32-bit struct packing.
67 */
68struct btrfs_ioctl_timespec_32 {
69 __u64 sec;
70 __u32 nsec;
71} __attribute__ ((__packed__));
72
73struct btrfs_ioctl_received_subvol_args_32 {
74 char uuid[BTRFS_UUID_SIZE]; /* in */
75 __u64 stransid; /* in */
76 __u64 rtransid; /* out */
77 struct btrfs_ioctl_timespec_32 stime; /* in */
78 struct btrfs_ioctl_timespec_32 rtime; /* out */
79 __u64 flags; /* in */
80 __u64 reserved[16]; /* in */
81} __attribute__ ((__packed__));
82
83#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
84 struct btrfs_ioctl_received_subvol_args_32)
85#endif
86
87
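Annotation: the 4-byte difference comes from alignment, not from the fields themselves. A sketch of the arithmetic (the native UAPI definition is assumed to be the usual unpacked form):

    /* native UAPI form, assumed: */
    struct btrfs_ioctl_timespec { __u64 sec; __u32 nsec; };

    /*
     * x86-64: __u64 is 8-byte aligned -> sizeof == 16 (4 bytes tail pad)
     * i386:   __u64 is 4-byte aligned -> sizeof == 12
     * The packed _32 mirror above reproduces the i386 layout exactly:
     */
    _Static_assert(sizeof(struct btrfs_ioctl_timespec_32) == 12,
                   "must match the 32-bit userspace layout");
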
62static int btrfs_clone(struct inode *src, struct inode *inode, 88static int btrfs_clone(struct inode *src, struct inode *inode,
63 u64 off, u64 olen, u64 olen_aligned, u64 destoff); 89 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
64 90
@@ -585,6 +611,23 @@ fail:
585 return ret; 611 return ret;
586} 612}
587 613
614static void btrfs_wait_nocow_write(struct btrfs_root *root)
615{
616 s64 writers;
617 DEFINE_WAIT(wait);
618
619 do {
620 prepare_to_wait(&root->subv_writers->wait, &wait,
621 TASK_UNINTERRUPTIBLE);
622
623 writers = percpu_counter_sum(&root->subv_writers->counter);
624 if (writers)
625 schedule();
626
627 finish_wait(&root->subv_writers->wait, &wait);
628 } while (writers);
629}
630
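Annotation: btrfs_wait_nocow_write() only works together with a writer side that bumps the per-subvolume counter and wakes the queue; that half lives elsewhere in the series. A sketch of what the writer-side pairing has to look like (the helper names are assumptions; only the will_be_snapshoted and subv_writers fields come from this hunk):

    static inline int start_nocow_write(struct btrfs_root *root)
    {
            if (atomic_read(&root->will_be_snapshoted))
                    return 0;       /* snapshot pending: fall back to COW */
            percpu_counter_inc(&root->subv_writers->counter);
            /* re-check to close the race with the snapshot side */
            if (atomic_read(&root->will_be_snapshoted)) {
                    percpu_counter_dec(&root->subv_writers->counter);
                    wake_up(&root->subv_writers->wait);
                    return 0;
            }
            return 1;
    }

    static inline void end_nocow_write(struct btrfs_root *root)
    {
            percpu_counter_dec(&root->subv_writers->counter);
            if (waitqueue_active(&root->subv_writers->wait))
                    wake_up(&root->subv_writers->wait);
    }
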
588static int create_snapshot(struct btrfs_root *root, struct inode *dir, 631static int create_snapshot(struct btrfs_root *root, struct inode *dir,
589 struct dentry *dentry, char *name, int namelen, 632 struct dentry *dentry, char *name, int namelen,
590 u64 *async_transid, bool readonly, 633 u64 *async_transid, bool readonly,
@@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
598 if (!root->ref_cows) 641 if (!root->ref_cows)
599 return -EINVAL; 642 return -EINVAL;
600 643
644 atomic_inc(&root->will_be_snapshoted);
645 smp_mb__after_atomic_inc();
646 btrfs_wait_nocow_write(root);
647
601 ret = btrfs_start_delalloc_inodes(root, 0); 648 ret = btrfs_start_delalloc_inodes(root, 0);
602 if (ret) 649 if (ret)
603 return ret; 650 goto out;
604 651
605 btrfs_wait_ordered_extents(root, -1); 652 btrfs_wait_ordered_extents(root, -1);
606 653
607 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 654 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
608 if (!pending_snapshot) 655 if (!pending_snapshot) {
609 return -ENOMEM; 656 ret = -ENOMEM;
657 goto out;
658 }
610 659
611 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 660 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
612 BTRFS_BLOCK_RSV_TEMP); 661 BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
623 &pending_snapshot->qgroup_reserved, 672 &pending_snapshot->qgroup_reserved,
624 false); 673 false);
625 if (ret) 674 if (ret)
626 goto out; 675 goto free;
627 676
628 pending_snapshot->dentry = dentry; 677 pending_snapshot->dentry = dentry;
629 pending_snapshot->root = root; 678 pending_snapshot->root = root;
@@ -674,8 +723,10 @@ fail:
674 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, 723 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
675 &pending_snapshot->block_rsv, 724 &pending_snapshot->block_rsv,
676 pending_snapshot->qgroup_reserved); 725 pending_snapshot->qgroup_reserved);
677out: 726free:
678 kfree(pending_snapshot); 727 kfree(pending_snapshot);
728out:
729 atomic_dec(&root->will_be_snapshoted);
679 return ret; 730 return ret;
680} 731}
681 732
@@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root,
884 min_key.type = BTRFS_EXTENT_DATA_KEY; 935 min_key.type = BTRFS_EXTENT_DATA_KEY;
885 min_key.offset = *off; 936 min_key.offset = *off;
886 937
887 path->keep_locks = 1;
888
889 while (1) { 938 while (1) {
939 path->keep_locks = 1;
890 ret = btrfs_search_forward(root, &min_key, path, newer_than); 940 ret = btrfs_search_forward(root, &min_key, path, newer_than);
891 if (ret != 0) 941 if (ret != 0)
892 goto none; 942 goto none;
943 path->keep_locks = 0;
944 btrfs_unlock_up_safe(path, 1);
945process_slot:
893 if (min_key.objectid != ino) 946 if (min_key.objectid != ino)
894 goto none; 947 goto none;
895 if (min_key.type != BTRFS_EXTENT_DATA_KEY) 948 if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root,
908 return 0; 961 return 0;
909 } 962 }
910 963
964 path->slots[0]++;
965 if (path->slots[0] < btrfs_header_nritems(leaf)) {
966 btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
967 goto process_slot;
968 }
969
911 if (min_key.offset == (u64)-1) 970 if (min_key.offset == (u64)-1)
912 goto none; 971 goto none;
913 972
@@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
935 read_unlock(&em_tree->lock); 994 read_unlock(&em_tree->lock);
936 995
937 if (!em) { 996 if (!em) {
997 struct extent_state *cached = NULL;
998 u64 end = start + len - 1;
999
938 /* get the big lock and read metadata off disk */ 1000 /* get the big lock and read metadata off disk */
939 lock_extent(io_tree, start, start + len - 1); 1001 lock_extent_bits(io_tree, start, end, 0, &cached);
940 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 1002 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
941 unlock_extent(io_tree, start, start + len - 1); 1003 unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
942 1004
943 if (IS_ERR(em)) 1005 if (IS_ERR(em))
944 return NULL; 1006 return NULL;
@@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
957 return false; 1019 return false;
958 1020
959 next = defrag_lookup_extent(inode, em->start + em->len); 1021 next = defrag_lookup_extent(inode, em->start + em->len);
960 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) 1022 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
1023 (em->block_start + em->block_len == next->block_start))
961 ret = false; 1024 ret = false;
962 1025
963 free_extent_map(next); 1026 free_extent_map(next);
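Annotation: the added condition marks the next extent as not worth merging when it is already physically contiguous with the current one; rewriting such a pair would cost I/O without improving layout. Numerically (illustrative values):

    /* em:   block_start = 1 MiB, block_len = 128 KiB            */
    /* next: block_start = 1 MiB + 128 KiB                       */
    /* -> already contiguous on disk, nothing to gain: skip it   */
    if (em->block_start + em->block_len == next->block_start)
            ret = false;
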
@@ -1076,10 +1139,12 @@ again:
1076 page_start = page_offset(page); 1139 page_start = page_offset(page);
1077 page_end = page_start + PAGE_CACHE_SIZE - 1; 1140 page_end = page_start + PAGE_CACHE_SIZE - 1;
1078 while (1) { 1141 while (1) {
1079 lock_extent(tree, page_start, page_end); 1142 lock_extent_bits(tree, page_start, page_end,
1143 0, &cached_state);
1080 ordered = btrfs_lookup_ordered_extent(inode, 1144 ordered = btrfs_lookup_ordered_extent(inode,
1081 page_start); 1145 page_start);
1082 unlock_extent(tree, page_start, page_end); 1146 unlock_extent_cached(tree, page_start, page_end,
1147 &cached_state, GFP_NOFS);
1083 if (!ordered) 1148 if (!ordered)
1084 break; 1149 break;
1085 1150
@@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1356 } 1421 }
1357 } 1422 }
1358 1423
1359 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) 1424 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
1360 filemap_flush(inode->i_mapping); 1425 filemap_flush(inode->i_mapping);
1426 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1427 &BTRFS_I(inode)->runtime_flags))
1428 filemap_flush(inode->i_mapping);
1429 }
1361 1430
1362 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 1431 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1363 /* the filemap_flush will queue IO into the worker threads, but 1432 /* the filemap_flush will queue IO into the worker threads, but
@@ -1573,7 +1642,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1573 if (src_inode->i_sb != file_inode(file)->i_sb) { 1642 if (src_inode->i_sb != file_inode(file)->i_sb) {
1574 btrfs_info(BTRFS_I(src_inode)->root->fs_info, 1643 btrfs_info(BTRFS_I(src_inode)->root->fs_info,
1575 "Snapshot src from another FS"); 1644 "Snapshot src from another FS");
1576 ret = -EINVAL; 1645 ret = -EXDEV;
1577 } else if (!inode_owner_or_capable(src_inode)) { 1646 } else if (!inode_owner_or_capable(src_inode)) {
1578 /* 1647 /*
1579 * Subvolume creation is not restricted, but snapshots 1648 * Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1866,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
1797 if (di && !IS_ERR(di)) { 1866 if (di && !IS_ERR(di)) {
1798 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1867 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1799 if (key.objectid == root->root_key.objectid) { 1868 if (key.objectid == root->root_key.objectid) {
1800 ret = -ENOTEMPTY; 1869 ret = -EPERM;
1870 btrfs_err(root->fs_info, "deleting default subvolume "
1871 "%llu is not allowed", key.objectid);
1801 goto out; 1872 goto out;
1802 } 1873 }
1803 btrfs_release_path(path); 1874 btrfs_release_path(path);
@@ -2994,8 +3065,9 @@ process_slot:
2994 new_key.offset + datal, 3065 new_key.offset + datal,
2995 1); 3066 1);
2996 if (ret) { 3067 if (ret) {
2997 btrfs_abort_transaction(trans, root, 3068 if (ret != -EINVAL)
2998 ret); 3069 btrfs_abort_transaction(trans,
3070 root, ret);
2999 btrfs_end_transaction(trans, root); 3071 btrfs_end_transaction(trans, root);
3000 goto out; 3072 goto out;
3001 } 3073 }
@@ -3153,8 +3225,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3153 * decompress into destination's address_space (the file offset 3225 * decompress into destination's address_space (the file offset
3154 * may change, so source mapping won't do), then recompress (or 3226 * may change, so source mapping won't do), then recompress (or
3155 * otherwise reinsert) a subrange. 3227 * otherwise reinsert) a subrange.
3156 * - allow ranges within the same file to be cloned (provided 3228 *
3157 * they don't overlap)? 3229 * - split destination inode's inline extents. The inline extents can
3230 * be either compressed or non-compressed.
3158 */ 3231 */
3159 3232
3160 /* the destination must be opened for writing */ 3233 /* the destination must be opened for writing */
@@ -4353,10 +4426,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
4353 return btrfs_qgroup_wait_for_completion(root->fs_info); 4426 return btrfs_qgroup_wait_for_completion(root->fs_info);
4354} 4427}
4355 4428
4356static long btrfs_ioctl_set_received_subvol(struct file *file, 4429static long _btrfs_ioctl_set_received_subvol(struct file *file,
4357 void __user *arg) 4430 struct btrfs_ioctl_received_subvol_args *sa)
4358{ 4431{
4359 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4360 struct inode *inode = file_inode(file); 4432 struct inode *inode = file_inode(file);
4361 struct btrfs_root *root = BTRFS_I(inode)->root; 4433 struct btrfs_root *root = BTRFS_I(inode)->root;
4362 struct btrfs_root_item *root_item = &root->root_item; 4434 struct btrfs_root_item *root_item = &root->root_item;
@@ -4384,13 +4456,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4384 goto out; 4456 goto out;
4385 } 4457 }
4386 4458
4387 sa = memdup_user(arg, sizeof(*sa));
4388 if (IS_ERR(sa)) {
4389 ret = PTR_ERR(sa);
4390 sa = NULL;
4391 goto out;
4392 }
4393
4394 /* 4459 /*
4395 * 1 - root item 4460 * 1 - root item
4396 * 2 - uuid items (received uuid + subvol uuid) 4461 * 2 - uuid items (received uuid + subvol uuid)
@@ -4444,14 +4509,91 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4444 goto out; 4509 goto out;
4445 } 4510 }
4446 4511
4512out:
4513 up_write(&root->fs_info->subvol_sem);
4514 mnt_drop_write_file(file);
4515 return ret;
4516}
4517
4518#ifdef CONFIG_64BIT
4519static long btrfs_ioctl_set_received_subvol_32(struct file *file,
4520 void __user *arg)
4521{
4522 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
4523 struct btrfs_ioctl_received_subvol_args *args64 = NULL;
4524 int ret = 0;
4525
4526 args32 = memdup_user(arg, sizeof(*args32));
4527 if (IS_ERR(args32)) {
4528 ret = PTR_ERR(args32);
4529 args32 = NULL;
4530 goto out;
4531 }
4532
4533 args64 = kmalloc(sizeof(*args64), GFP_NOFS);
4534 if (!args64) {
4535 /* kmalloc() returns NULL on failure, not an ERR_PTR */
4536 ret = -ENOMEM;
4537 goto out;
4538 }
4539
4540 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
4541 args64->stransid = args32->stransid;
4542 args64->rtransid = args32->rtransid;
4543 args64->stime.sec = args32->stime.sec;
4544 args64->stime.nsec = args32->stime.nsec;
4545 args64->rtime.sec = args32->rtime.sec;
4546 args64->rtime.nsec = args32->rtime.nsec;
4547 args64->flags = args32->flags;
4548
4549 ret = _btrfs_ioctl_set_received_subvol(file, args64);
4550 if (ret)
4551 goto out;
4552
4553 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
4554 args32->stransid = args64->stransid;
4555 args32->rtransid = args64->rtransid;
4556 args32->stime.sec = args64->stime.sec;
4557 args32->stime.nsec = args64->stime.nsec;
4558 args32->rtime.sec = args64->rtime.sec;
4559 args32->rtime.nsec = args64->rtime.nsec;
4560 args32->flags = args64->flags;
4561
4562 ret = copy_to_user(arg, args32, sizeof(*args32));
4563 if (ret)
4564 ret = -EFAULT;
4565
4566out:
4567 kfree(args32);
4568 kfree(args64);
4569 return ret;
4570}
4571#endif
4572
4573static long btrfs_ioctl_set_received_subvol(struct file *file,
4574 void __user *arg)
4575{
4576 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4577 int ret = 0;
4578
4579 sa = memdup_user(arg, sizeof(*sa));
4580 if (IS_ERR(sa)) {
4581 ret = PTR_ERR(sa);
4582 sa = NULL;
4583 goto out;
4584 }
4585
4586 ret = _btrfs_ioctl_set_received_subvol(file, sa);
4587
4588 if (ret)
4589 goto out;
4590
4447 ret = copy_to_user(arg, sa, sizeof(*sa)); 4591 ret = copy_to_user(arg, sa, sizeof(*sa));
4448 if (ret) 4592 if (ret)
4449 ret = -EFAULT; 4593 ret = -EFAULT;
4450 4594
4451out: 4595out:
4452 kfree(sa); 4596 kfree(sa);
4453 up_write(&root->fs_info->subvol_sem);
4454 mnt_drop_write_file(file);
4455 return ret; 4597 return ret;
4456} 4598}
4457 4599
@@ -4746,7 +4888,7 @@ long btrfs_ioctl(struct file *file, unsigned int
4746 case BTRFS_IOC_SYNC: { 4888 case BTRFS_IOC_SYNC: {
4747 int ret; 4889 int ret;
4748 4890
4749 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 4891 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
4750 if (ret) 4892 if (ret)
4751 return ret; 4893 return ret;
4752 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 4894 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4770,6 +4912,10 @@ long btrfs_ioctl(struct file *file, unsigned int
4770 return btrfs_ioctl_balance_progress(root, argp); 4912 return btrfs_ioctl_balance_progress(root, argp);
4771 case BTRFS_IOC_SET_RECEIVED_SUBVOL: 4913 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
4772 return btrfs_ioctl_set_received_subvol(file, argp); 4914 return btrfs_ioctl_set_received_subvol(file, argp);
4915#ifdef CONFIG_64BIT
4916 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
4917 return btrfs_ioctl_set_received_subvol_32(file, argp);
4918#endif
4773 case BTRFS_IOC_SEND: 4919 case BTRFS_IOC_SEND:
4774 return btrfs_ioctl_send(file, argp); 4920 return btrfs_ioctl_send(file, argp);
4775 case BTRFS_IOC_GET_DEV_STATS: 4921 case BTRFS_IOC_GET_DEV_STATS:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e7..a94b05f72869 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
349 if (!uptodate) 349 if (!uptodate)
350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
351 351
352 if (entry->bytes_left == 0) 352 if (entry->bytes_left == 0) {
353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
354 else 354 if (waitqueue_active(&entry->wait))
355 wake_up(&entry->wait);
356 } else {
355 ret = 1; 357 ret = 1;
358 }
356out: 359out:
357 if (!ret && cached && entry) { 360 if (!ret && cached && entry) {
358 *cached = entry; 361 *cached = entry;
@@ -410,10 +413,13 @@ have_entry:
410 if (!uptodate) 413 if (!uptodate)
411 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 414 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
412 415
413 if (entry->bytes_left == 0) 416 if (entry->bytes_left == 0) {
414 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 417 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
415 else 418 if (waitqueue_active(&entry->wait))
419 wake_up(&entry->wait);
420 } else {
416 ret = 1; 421 ret = 1;
422 }
417out: 423out:
418 if (!ret && cached && entry) { 424 if (!ret && cached && entry) {
419 *cached = entry; 425 *cached = entry;
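
Annotation: both hunks add the same lockless-wakeup idiom: waitqueue_active() lets the common no-waiter case skip the waitqueue spinlock inside wake_up(). The idiom is only safe when the condition write is ordered before the check; here tree->lock provides that ordering. In isolation the pairing looks like this sketch (the waiter's predicate is illustrative; the real waiters check their own conditions):

    /* waker, condition already published under tree->lock: */
    if (waitqueue_active(&entry->wait))
            wake_up(&entry->wait);

    /* waiter, elsewhere: */
    wait_event(entry->wait,
               test_bit(BTRFS_ORDERED_IO_DONE, &entry->flags));
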
@@ -424,27 +430,48 @@ out:
424} 430}
425 431
426/* Needs to either be called under a log transaction or the log_mutex */ 432/* Needs to either be called under a log transaction or the log_mutex */
427void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) 433void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list)
428{ 435{
429 struct btrfs_ordered_inode_tree *tree; 436 struct btrfs_ordered_inode_tree *tree;
430 struct btrfs_ordered_extent *ordered; 437 struct btrfs_ordered_extent *ordered;
431 struct rb_node *n; 438 struct rb_node *n;
432 int index = log->log_transid % 2;
433 439
434 tree = &BTRFS_I(inode)->ordered_tree; 440 tree = &BTRFS_I(inode)->ordered_tree;
435 spin_lock_irq(&tree->lock); 441 spin_lock_irq(&tree->lock);
436 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
437 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
438 spin_lock(&log->log_extents_lock[index]); 444 if (!list_empty(&ordered->log_list))
439 if (list_empty(&ordered->log_list)) { 445 continue;
440 list_add_tail(&ordered->log_list, &log->logged_list[index]); 446 list_add_tail(&ordered->log_list, logged_list);
441 atomic_inc(&ordered->refs); 447 atomic_inc(&ordered->refs);
442 }
443 spin_unlock(&log->log_extents_lock[index]);
444 } 448 }
445 spin_unlock_irq(&tree->lock); 449 spin_unlock_irq(&tree->lock);
446} 450}
447 451
452void btrfs_put_logged_extents(struct list_head *logged_list)
453{
454 struct btrfs_ordered_extent *ordered;
455
456 while (!list_empty(logged_list)) {
457 ordered = list_first_entry(logged_list,
458 struct btrfs_ordered_extent,
459 log_list);
460 list_del_init(&ordered->log_list);
461 btrfs_put_ordered_extent(ordered);
462 }
463}
464
465void btrfs_submit_logged_extents(struct list_head *logged_list,
466 struct btrfs_root *log)
467{
468 int index = log->log_transid % 2;
469
470 spin_lock_irq(&log->log_extents_lock[index]);
471 list_splice_tail(logged_list, &log->logged_list[index]);
472 spin_unlock_irq(&log->log_extents_lock[index]);
473}
474
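Annotation: the per-inode logged_list replaces direct insertion into the log root's per-transid array, so extents collected while logging an inode can be dropped on error instead of being published. The intended lifecycle, sketched (do_log_inode_extents is a hypothetical placeholder for the tree-log code that uses these helpers):

    LIST_HEAD(logged_list);

    btrfs_get_logged_extents(inode, &logged_list);  /* pin + collect */
    ret = do_log_inode_extents(trans, root, inode); /* hypothetical */
    if (ret)
            btrfs_put_logged_extents(&logged_list); /* error: just unpin */
    else
            btrfs_submit_logged_extents(&logged_list, log);
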
448void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
449{ 476{
450 struct btrfs_ordered_extent *ordered; 477 struct btrfs_ordered_extent *ordered;
@@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
577 INIT_LIST_HEAD(&splice); 604 INIT_LIST_HEAD(&splice);
578 INIT_LIST_HEAD(&works); 605 INIT_LIST_HEAD(&works);
579 606
580 mutex_lock(&root->fs_info->ordered_operations_mutex); 607 mutex_lock(&root->ordered_extent_mutex);
581 spin_lock(&root->ordered_extent_lock); 608 spin_lock(&root->ordered_extent_lock);
582 list_splice_init(&root->ordered_extents, &splice); 609 list_splice_init(&root->ordered_extents, &splice);
583 while (!list_empty(&splice) && nr) { 610 while (!list_empty(&splice) && nr) {
@@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
588 atomic_inc(&ordered->refs); 615 atomic_inc(&ordered->refs);
589 spin_unlock(&root->ordered_extent_lock); 616 spin_unlock(&root->ordered_extent_lock);
590 617
591 ordered->flush_work.func = btrfs_run_ordered_extent_work; 618 btrfs_init_work(&ordered->flush_work,
619 btrfs_run_ordered_extent_work, NULL, NULL);
592 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
593 btrfs_queue_worker(&root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
594 &ordered->flush_work); 622 &ordered->flush_work);
595 623
596 cond_resched(); 624 cond_resched();
597 spin_lock(&root->ordered_extent_lock); 625 spin_lock(&root->ordered_extent_lock);
@@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
608 btrfs_put_ordered_extent(ordered); 636 btrfs_put_ordered_extent(ordered);
609 cond_resched(); 637 cond_resched();
610 } 638 }
611 mutex_unlock(&root->fs_info->ordered_operations_mutex); 639 mutex_unlock(&root->ordered_extent_mutex);
612 640
613 return count; 641 return count;
614} 642}
@@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
621 649
622 INIT_LIST_HEAD(&splice); 650 INIT_LIST_HEAD(&splice);
623 651
652 mutex_lock(&fs_info->ordered_operations_mutex);
624 spin_lock(&fs_info->ordered_root_lock); 653 spin_lock(&fs_info->ordered_root_lock);
625 list_splice_init(&fs_info->ordered_roots, &splice); 654 list_splice_init(&fs_info->ordered_roots, &splice);
626 while (!list_empty(&splice) && nr) { 655 while (!list_empty(&splice) && nr) {
@@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
643 } 672 }
644 list_splice_tail(&splice, &fs_info->ordered_roots); 673 list_splice_tail(&splice, &fs_info->ordered_roots);
645 spin_unlock(&fs_info->ordered_root_lock); 674 spin_unlock(&fs_info->ordered_root_lock);
675 mutex_unlock(&fs_info->ordered_operations_mutex);
646} 676}
647 677
648/* 678/*
@@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
704 goto out; 734 goto out;
705 } 735 }
706 list_add_tail(&work->list, &works); 736 list_add_tail(&work->list, &works);
707 btrfs_queue_worker(&root->fs_info->flush_workers, 737 btrfs_queue_work(root->fs_info->flush_workers,
708 &work->work); 738 &work->work);
709 739
710 cond_resched(); 740 cond_resched();
711 spin_lock(&root->fs_info->ordered_root_lock); 741 spin_lock(&root->fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac20..246897058efb 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
197 struct inode *inode); 197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 200void btrfs_get_logged_extents(struct inode *inode,
201 struct list_head *logged_list);
202void btrfs_put_logged_extents(struct list_head *logged_list);
203void btrfs_submit_logged_extents(struct list_head *logged_list,
204 struct btrfs_root *log);
201void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 205void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
202void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 206void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
203int __init ordered_data_init(void); 207int __init ordered_data_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d745..2cf905877aaf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1509 ret = qgroup_rescan_init(fs_info, 0, 1); 1509 ret = qgroup_rescan_init(fs_info, 0, 1);
1510 if (!ret) { 1510 if (!ret) {
1511 qgroup_rescan_zero_tracking(fs_info); 1511 qgroup_rescan_zero_tracking(fs_info);
1512 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 1512 btrfs_queue_work(fs_info->qgroup_rescan_workers,
1513 &fs_info->qgroup_rescan_work); 1513 &fs_info->qgroup_rescan_work);
1514 } 1514 }
1515 ret = 0; 1515 ret = 0;
1516 } 1516 }
@@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2095 2095
2096 memset(&fs_info->qgroup_rescan_work, 0, 2096 memset(&fs_info->qgroup_rescan_work, 0,
2097 sizeof(fs_info->qgroup_rescan_work)); 2097 sizeof(fs_info->qgroup_rescan_work));
2098 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; 2098 btrfs_init_work(&fs_info->qgroup_rescan_work,
2099 btrfs_qgroup_rescan_worker, NULL, NULL);
2099 2100
2100 if (ret) { 2101 if (ret) {
2101err: 2102err:
@@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2158 2159
2159 qgroup_rescan_zero_tracking(fs_info); 2160 qgroup_rescan_zero_tracking(fs_info);
2160 2161
2161 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2162 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2162 &fs_info->qgroup_rescan_work); 2163 &fs_info->qgroup_rescan_work);
2163 2164
2164 return 0; 2165 return 0;
2165} 2166}
@@ -2190,6 +2191,6 @@ void
2190btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 2191btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2191{ 2192{
2192 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2193 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2193 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2194 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2194 &fs_info->qgroup_rescan_work); 2195 &fs_info->qgroup_rescan_work);
2195} 2196}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991a..4055291a523e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 rbio->work.flags = 0; 1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
1420 rbio->work.func = rmw_work;
1421 1420
1422 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1421 btrfs_queue_work(rbio->fs_info->rmw_workers,
1423 &rbio->work); 1422 &rbio->work);
1424} 1423}
1425 1424
1426static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1425static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427{ 1426{
1428 rbio->work.flags = 0; 1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
1429 rbio->work.func = read_rebuild_work;
1430 1428
1431 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1429 btrfs_queue_work(rbio->fs_info->rmw_workers,
1432 &rbio->work); 1430 &rbio->work);
1433} 1431}
1434 1432
1435/* 1433/*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1667 plug = container_of(cb, struct btrfs_plug_cb, cb); 1665 plug = container_of(cb, struct btrfs_plug_cb, cb);
1668 1666
1669 if (from_schedule) { 1667 if (from_schedule) {
1670 plug->work.flags = 0; 1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1671 plug->work.func = unplug_work; 1669 btrfs_queue_work(plug->info->rmw_workers,
1672 btrfs_queue_worker(&plug->info->rmw_workers, 1670 &plug->work);
1673 &plug->work);
1674 return; 1671 return;
1675 } 1672 }
1676 run_plug(plug); 1673 run_plug(plug);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3e..30947f923620 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
793 /* FIXME we cannot handle this properly right now */ 793 /* FIXME we cannot handle this properly right now */
794 BUG(); 794 BUG();
795 } 795 }
796 rmw->work.func = reada_start_machine_worker; 796 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
797 rmw->fs_info = fs_info; 797 rmw->fs_info = fs_info;
798 798
799 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); 799 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
800} 800}
801 801
802#ifdef DEBUG 802#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..def428a25b2a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4248,7 +4248,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", 4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4249 rc->block_group->key.objectid, rc->block_group->flags); 4249 rc->block_group->key.objectid, rc->block_group->flags);
4250 4250
4251 ret = btrfs_start_delalloc_roots(fs_info, 0); 4251 ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
4252 if (ret < 0) { 4252 if (ret < 0) {
4253 err = ret; 4253 err = ret;
4254 goto out; 4254 goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059de..38bb47e7d6b1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/err.h>
19#include <linux/uuid.h> 20#include <linux/uuid.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
271 key.offset++; 272 key.offset++;
272 273
273 root = btrfs_read_fs_root(tree_root, &root_key); 274 root = btrfs_read_fs_root(tree_root, &root_key);
274 err = PTR_RET(root); 275 err = PTR_ERR_OR_ZERO(root);
275 if (err && err != -ENOENT) { 276 if (err && err != -ENOENT) {
276 break; 277 break;
277 } else if (err == -ENOENT) { 278 } else if (err == -ENOENT) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282ee..93e6d7172844 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
315 atomic_inc(&fs_info->scrubs_running); 315 atomic_inc(&fs_info->scrubs_running);
316 atomic_inc(&fs_info->scrubs_paused); 316 atomic_inc(&fs_info->scrubs_paused);
317 mutex_unlock(&fs_info->scrub_lock); 317 mutex_unlock(&fs_info->scrub_lock);
318
319 /*
320 * check if @scrubs_running=@scrubs_paused condition
321 * inside wait_event() is not an atomic operation.
322 * which means we may inc/dec @scrub_running/paused
323 * at any time. Let's wake up @scrub_pause_wait as
324 * much as we can to let commit transaction blocked less.
325 */
326 wake_up(&fs_info->scrub_pause_wait);
327
318 atomic_inc(&sctx->workers_pending); 328 atomic_inc(&sctx->workers_pending);
319} 329}
320 330
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
418 sbio->index = i; 428 sbio->index = i;
419 sbio->sctx = sctx; 429 sbio->sctx = sctx;
420 sbio->page_count = 0; 430 sbio->page_count = 0;
421 sbio->work.func = scrub_bio_end_io_worker; 431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
432 NULL, NULL);
422 433
423 if (i != SCRUB_BIOS_PER_SCTX - 1) 434 if (i != SCRUB_BIOS_PER_SCTX - 1)
424 sctx->bios[i]->next_free = i + 1; 435 sctx->bios[i]->next_free = i + 1;
@@ -987,9 +998,10 @@ nodatasum_case:
987 fixup_nodatasum->root = fs_info->extent_root; 998 fixup_nodatasum->root = fs_info->extent_root;
988 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 999 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
989 scrub_pending_trans_workers_inc(sctx); 1000 scrub_pending_trans_workers_inc(sctx);
990 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 1001 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
991 btrfs_queue_worker(&fs_info->scrub_workers, 1002 NULL, NULL);
992 &fixup_nodatasum->work); 1003 btrfs_queue_work(fs_info->scrub_workers,
1004 &fixup_nodatasum->work);
993 goto out; 1005 goto out;
994 } 1006 }
995 1007
@@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1603 sbio->err = err; 1615 sbio->err = err;
1604 sbio->bio = bio; 1616 sbio->bio = bio;
1605 1617
1606 sbio->work.func = scrub_wr_bio_end_io_worker; 1618 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1607 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); 1619 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1608} 1620}
1609 1621
1610static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1622static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
2072 sbio->err = err; 2084 sbio->err = err;
2073 sbio->bio = bio; 2085 sbio->bio = bio;
2074 2086
2075 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 2087 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2076} 2088}
2077 2089
2078static void scrub_bio_end_io_worker(struct btrfs_work *work) 2090static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2686,10 +2698,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2686 2698
2687 wait_event(sctx->list_wait, 2699 wait_event(sctx->list_wait,
2688 atomic_read(&sctx->bios_in_flight) == 0); 2700 atomic_read(&sctx->bios_in_flight) == 0);
2689 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2701 atomic_inc(&fs_info->scrubs_paused);
2702 wake_up(&fs_info->scrub_pause_wait);
2703
2704 /*
2705 * This must be done before we decrease @scrubs_paused, to
2706 * make sure we don't block transaction commit while we are
2707 * waiting for the pending workers to finish.
2708 */
2690 wait_event(sctx->list_wait, 2709 wait_event(sctx->list_wait,
2691 atomic_read(&sctx->workers_pending) == 0); 2710 atomic_read(&sctx->workers_pending) == 0);
2692 scrub_blocked_if_needed(fs_info); 2711 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2712
2713 mutex_lock(&fs_info->scrub_lock);
2714 __scrub_blocked_if_needed(fs_info);
2715 atomic_dec(&fs_info->scrubs_paused);
2716 mutex_unlock(&fs_info->scrub_lock);
2717 wake_up(&fs_info->scrub_pause_wait);
2693 2718
2694 btrfs_put_block_group(cache); 2719 btrfs_put_block_group(cache);
2695 if (ret) 2720 if (ret)
@@ -2757,33 +2782,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2757 int is_dev_replace) 2782 int is_dev_replace)
2758{ 2783{
2759 int ret = 0; 2784 int ret = 0;
2785 int flags = WQ_FREEZABLE | WQ_UNBOUND;
2786 int max_active = fs_info->thread_pool_size;
2760 2787
2761 if (fs_info->scrub_workers_refcnt == 0) { 2788 if (fs_info->scrub_workers_refcnt == 0) {
2762 if (is_dev_replace) 2789 if (is_dev_replace)
2763 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, 2790 fs_info->scrub_workers =
2764 &fs_info->generic_worker); 2791 btrfs_alloc_workqueue("btrfs-scrub", flags,
2792 1, 4);
2765 else 2793 else
2766 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2794 fs_info->scrub_workers =
2767 fs_info->thread_pool_size, 2795 btrfs_alloc_workqueue("btrfs-scrub", flags,
2768 &fs_info->generic_worker); 2796 max_active, 4);
2769 fs_info->scrub_workers.idle_thresh = 4; 2797 if (!fs_info->scrub_workers) {
2770 ret = btrfs_start_workers(&fs_info->scrub_workers); 2798 ret = -ENOMEM;
2771 if (ret)
2772 goto out; 2799 goto out;
2773 btrfs_init_workers(&fs_info->scrub_wr_completion_workers, 2800 }
2774 "scrubwrc", 2801 fs_info->scrub_wr_completion_workers =
2775 fs_info->thread_pool_size, 2802 btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
2776 &fs_info->generic_worker); 2803 max_active, 2);
2777 fs_info->scrub_wr_completion_workers.idle_thresh = 2; 2804 if (!fs_info->scrub_wr_completion_workers) {
2778 ret = btrfs_start_workers( 2805 ret = -ENOMEM;
2779 &fs_info->scrub_wr_completion_workers);
2780 if (ret)
2781 goto out; 2806 goto out;
2782 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, 2807 }
2783 &fs_info->generic_worker); 2808 fs_info->scrub_nocow_workers =
2784 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); 2809 btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
2785 if (ret) 2810 if (!fs_info->scrub_nocow_workers) {
2811 ret = -ENOMEM;
2786 goto out; 2812 goto out;
2813 }
2787 } 2814 }
2788 ++fs_info->scrub_workers_refcnt; 2815 ++fs_info->scrub_workers_refcnt;
2789out: 2816out:
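Annotation: this converts the scrub pools to the new btrfs_workqueue API. As far as this diff shows it, the shape is alloc/queue/destroy, and the final numeric argument appears to map onto the old idle_thresh (an assumption; compare the removed "idle_thresh = 2" with the 2 passed above). Sketch with illustrative names:

    struct btrfs_workqueue *wq;

    wq = btrfs_alloc_workqueue("btrfs-example",  /* name is illustrative */
                               WQ_FREEZABLE | WQ_UNBOUND,
                               max_active, 4 /* thresh, assumed */);
    if (!wq)
            return -ENOMEM;

    btrfs_init_work(&item->work, example_fn, NULL, NULL);
    btrfs_queue_work(wq, &item->work);

    /* ... later, on teardown: */
    btrfs_destroy_workqueue(wq);
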
@@ -2793,9 +2820,9 @@ out:
2793static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 2820static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2794{ 2821{
2795 if (--fs_info->scrub_workers_refcnt == 0) { 2822 if (--fs_info->scrub_workers_refcnt == 0) {
2796 btrfs_stop_workers(&fs_info->scrub_workers); 2823 btrfs_destroy_workqueue(fs_info->scrub_workers);
2797 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); 2824 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
2798 btrfs_stop_workers(&fs_info->scrub_nocow_workers); 2825 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
2799 } 2826 }
2800 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2827 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2801} 2828}
@@ -3106,10 +3133,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3106 nocow_ctx->len = len; 3133 nocow_ctx->len = len;
3107 nocow_ctx->mirror_num = mirror_num; 3134 nocow_ctx->mirror_num = mirror_num;
3108 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3135 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3109 nocow_ctx->work.func = copy_nocow_pages_worker; 3136 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
3110 INIT_LIST_HEAD(&nocow_ctx->inodes); 3137 INIT_LIST_HEAD(&nocow_ctx->inodes);
3111 btrfs_queue_worker(&fs_info->scrub_nocow_workers, 3138 btrfs_queue_work(fs_info->scrub_nocow_workers,
3112 &nocow_ctx->work); 3139 &nocow_ctx->work);
3113 3140
3114 return 0; 3141 return 0;
3115} 3142}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9dde9717c1b9..9b6da9d55f9a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -51,15 +51,18 @@ struct fs_path {
51 struct { 51 struct {
52 char *start; 52 char *start;
53 char *end; 53 char *end;
54 char *prepared;
55 54
56 char *buf; 55 char *buf;
57 int buf_len; 56 unsigned short buf_len:15;
58 unsigned int reversed:1; 57 unsigned short reversed:1;
59 unsigned int virtual_mem:1;
60 char inline_buf[]; 58 char inline_buf[];
61 }; 59 };
62 char pad[PAGE_SIZE]; 60 /*
61 * The average path length does not exceed 200 bytes, so a
62 * 256-byte pad gives better packing in the slab and a higher
63 * chance of satisfying an allocation later during send.
64 */
65 char pad[256];
63 }; 66 };
64}; 67};
65#define FS_PATH_INLINE_SIZE \ 68#define FS_PATH_INLINE_SIZE \
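Annotation: the macro body is cut off by the hunk, but given the union above it has to carve the inline buffer out of the fixed 256-byte footprint; the offsetof form below is an assumption consistent with that, as is the exact-size assert:

    #define FS_PATH_INLINE_SIZE \
            (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))

    /* the pad arm fixes the union's size, so this should hold: */
    _Static_assert(sizeof(struct fs_path) == 256, "slab-friendly fs_path");
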
@@ -109,6 +112,7 @@ struct send_ctx {
109 int cur_inode_deleted; 112 int cur_inode_deleted;
110 u64 cur_inode_size; 113 u64 cur_inode_size;
111 u64 cur_inode_mode; 114 u64 cur_inode_mode;
115 u64 cur_inode_rdev;
112 u64 cur_inode_last_extent; 116 u64 cur_inode_last_extent;
113 117
114 u64 send_progress; 118 u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
120 struct list_head name_cache_list; 124 struct list_head name_cache_list;
121 int name_cache_size; 125 int name_cache_size;
122 126
127 struct file_ra_state ra;
128
123 char *read_buf; 129 char *read_buf;
124 130
125 /* 131 /*
@@ -175,6 +181,47 @@ struct send_ctx {
175 * own move/rename can be performed. 181 * own move/rename can be performed.
176 */ 182 */
177 struct rb_root waiting_dir_moves; 183 struct rb_root waiting_dir_moves;
184
185 /*
186 * A directory that is going to be rm'ed might have a child directory
187 * which is in the pending directory moves index above. In this case,
188 * the directory can only be removed after the move/rename of its child
189 * is performed. Example:
190 *
191 * Parent snapshot:
192 *
193 * . (ino 256)
194 * |-- a/ (ino 257)
195 * |-- b/ (ino 258)
196 * |-- c/ (ino 259)
197 * | |-- x/ (ino 260)
198 * |
199 * |-- y/ (ino 261)
200 *
201 * Send snapshot:
202 *
203 * . (ino 256)
204 * |-- a/ (ino 257)
205 * |-- b/ (ino 258)
206 * |-- YY/ (ino 261)
207 * |-- x/ (ino 260)
208 *
209 * Sequence of steps that lead to the send snapshot:
210 * rm -f /a/b/c/foo.txt
211 * mv /a/b/y /a/b/YY
212 * mv /a/b/c/x /a/b/YY
213 * rmdir /a/b/c
214 *
215 * When the child is processed, its move/rename is delayed until its
216 * parent is processed (as explained above), but all other operations
217 * like update utimes, chown, chgrp, etc, are performed and the paths
218 * that it uses for those operations must use the orphanized name of
219 * its parent (the directory we're going to rm later), so we need to
220 * memorize that name.
221 *
222 * Indexed by the inode number of the directory to be deleted.
223 */
224 struct rb_root orphan_dirs;
178}; 225};
179 226
180struct pending_dir_move { 227struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
189struct waiting_dir_move { 236struct waiting_dir_move {
190 struct rb_node node; 237 struct rb_node node;
191 u64 ino; 238 u64 ino;
239 /*
240 * There might be some directory that could not be removed because it
241 * was waiting for this directory inode to be moved first. Therefore
 242 * after this directory is moved, we can try to rmdir the inode rmdir_ino.
243 */
244 u64 rmdir_ino;
245};
246
247struct orphan_dir_info {
248 struct rb_node node;
249 u64 ino;
250 u64 gen;
192}; 251};
193 252
194struct name_cache_entry { 253struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
214 273
215static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 274static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
216 275
276static struct waiting_dir_move *
277get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
278
279static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
280
217static int need_send_hole(struct send_ctx *sctx) 281static int need_send_hole(struct send_ctx *sctx)
218{ 282{
219 return (sctx->parent_root && !sctx->cur_inode_new && 283 return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
242 if (!p) 306 if (!p)
243 return NULL; 307 return NULL;
244 p->reversed = 0; 308 p->reversed = 0;
245 p->virtual_mem = 0;
246 p->buf = p->inline_buf; 309 p->buf = p->inline_buf;
247 p->buf_len = FS_PATH_INLINE_SIZE; 310 p->buf_len = FS_PATH_INLINE_SIZE;
248 fs_path_reset(p); 311 fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
265{ 328{
266 if (!p) 329 if (!p)
267 return; 330 return;
268 if (p->buf != p->inline_buf) { 331 if (p->buf != p->inline_buf)
269 if (p->virtual_mem) 332 kfree(p->buf);
270 vfree(p->buf);
271 else
272 kfree(p->buf);
273 }
274 kfree(p); 333 kfree(p);
275} 334}
276 335
@@ -292,40 +351,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
292 351
293 path_len = p->end - p->start; 352 path_len = p->end - p->start;
294 old_buf_len = p->buf_len; 353 old_buf_len = p->buf_len;
295 len = PAGE_ALIGN(len); 354
296 355 /*
297 if (p->buf == p->inline_buf) { 356 * The first time the inline_buf does not suffice
298 tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); 357 */
299 if (!tmp_buf) { 358 if (p->buf == p->inline_buf)
300 tmp_buf = vmalloc(len); 359 tmp_buf = kmalloc(len, GFP_NOFS);
301 if (!tmp_buf) 360 else
302 return -ENOMEM; 361 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
303 p->virtual_mem = 1; 362 if (!tmp_buf)
304 } 363 return -ENOMEM;
305 memcpy(tmp_buf, p->buf, p->buf_len); 364 p->buf = tmp_buf;
306 p->buf = tmp_buf; 365 /*
307 p->buf_len = len; 366 * The real size of the buffer is bigger; this lets the fast path
308 } else { 367 * happen most of the time.
309 if (p->virtual_mem) { 368 */
310 tmp_buf = vmalloc(len); 369 p->buf_len = ksize(p->buf);
311 if (!tmp_buf) 370
312 return -ENOMEM;
313 memcpy(tmp_buf, p->buf, p->buf_len);
314 vfree(p->buf);
315 } else {
316 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
317 if (!tmp_buf) {
318 tmp_buf = vmalloc(len);
319 if (!tmp_buf)
320 return -ENOMEM;
321 memcpy(tmp_buf, p->buf, p->buf_len);
322 kfree(p->buf);
323 p->virtual_mem = 1;
324 }
325 }
326 p->buf = tmp_buf;
327 p->buf_len = len;
328 }
329 if (p->reversed) { 371 if (p->reversed) {
330 tmp_buf = p->buf + old_buf_len - path_len - 1; 372 tmp_buf = p->buf + old_buf_len - path_len - 1;
331 p->end = p->buf + p->buf_len - 1; 373 p->end = p->buf + p->buf_len - 1;
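
With paths now capped far below the old PAGE_SIZE-aligned growth, the vmalloc fallback and the virtual_mem flag become dead weight; the rewrite above is a plain kmalloc/krealloc grow, and recording ksize() means a slightly-too-big slab object is used to the full before the next reallocation. The same idea in a userspace sketch, with glibc's malloc_usable_size() standing in for ksize() (nonportable, illustration only):

#include <malloc.h>	/* malloc_usable_size(), glibc-specific */
#include <stdlib.h>
#include <string.h>

struct buf {
	char *data;
	size_t cap;
};

/* Grow to at least `len` bytes; record the real usable size so the
 * next small growth request takes the fast path (cap already large
 * enough) without another reallocation. */
static int ensure_buf(struct buf *b, size_t len)
{
	char *tmp;

	if (len <= b->cap)
		return 0;
	tmp = realloc(b->data, len);
	if (!tmp)
		return -1;
	b->data = tmp;
	b->cap = malloc_usable_size(tmp);	/* analogue of ksize() */
	return 0;
}

int main(void)
{
	struct buf b = { NULL, 0 };

	if (ensure_buf(&b, 300) == 0)
		memcpy(b.data, "hello", 6);
	free(b.data);
	return 0;
}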
@@ -338,7 +380,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
338 return 0; 380 return 0;
339} 381}
340 382
341static int fs_path_prepare_for_add(struct fs_path *p, int name_len) 383static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
384 char **prepared)
342{ 385{
343 int ret; 386 int ret;
344 int new_len; 387 int new_len;
@@ -354,11 +397,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
354 if (p->start != p->end) 397 if (p->start != p->end)
355 *--p->start = '/'; 398 *--p->start = '/';
356 p->start -= name_len; 399 p->start -= name_len;
357 p->prepared = p->start; 400 *prepared = p->start;
358 } else { 401 } else {
359 if (p->start != p->end) 402 if (p->start != p->end)
360 *p->end++ = '/'; 403 *p->end++ = '/';
361 p->prepared = p->end; 404 *prepared = p->end;
362 p->end += name_len; 405 p->end += name_len;
363 *p->end = 0; 406 *p->end = 0;
364 } 407 }
@@ -370,12 +413,12 @@ out:
370static int fs_path_add(struct fs_path *p, const char *name, int name_len) 413static int fs_path_add(struct fs_path *p, const char *name, int name_len)
371{ 414{
372 int ret; 415 int ret;
416 char *prepared;
373 417
374 ret = fs_path_prepare_for_add(p, name_len); 418 ret = fs_path_prepare_for_add(p, name_len, &prepared);
375 if (ret < 0) 419 if (ret < 0)
376 goto out; 420 goto out;
377 memcpy(p->prepared, name, name_len); 421 memcpy(prepared, name, name_len);
378 p->prepared = NULL;
379 422
380out: 423out:
381 return ret; 424 return ret;
@@ -384,12 +427,12 @@ out:
384static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) 427static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
385{ 428{
386 int ret; 429 int ret;
430 char *prepared;
387 431
388 ret = fs_path_prepare_for_add(p, p2->end - p2->start); 432 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
389 if (ret < 0) 433 if (ret < 0)
390 goto out; 434 goto out;
391 memcpy(p->prepared, p2->start, p2->end - p2->start); 435 memcpy(prepared, p2->start, p2->end - p2->start);
392 p->prepared = NULL;
393 436
394out: 437out:
395 return ret; 438 return ret;
@@ -400,13 +443,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
400 unsigned long off, int len) 443 unsigned long off, int len)
401{ 444{
402 int ret; 445 int ret;
446 char *prepared;
403 447
404 ret = fs_path_prepare_for_add(p, len); 448 ret = fs_path_prepare_for_add(p, len, &prepared);
405 if (ret < 0) 449 if (ret < 0)
406 goto out; 450 goto out;
407 451
408 read_extent_buffer(eb, p->prepared, off, len); 452 read_extent_buffer(eb, prepared, off, len);
409 p->prepared = NULL;
410 453
411out: 454out:
412 return ret; 455 return ret;
@@ -915,9 +958,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
915 struct btrfs_dir_item *di; 958 struct btrfs_dir_item *di;
916 struct btrfs_key di_key; 959 struct btrfs_key di_key;
917 char *buf = NULL; 960 char *buf = NULL;
918 char *buf2 = NULL; 961 const int buf_len = PATH_MAX;
919 int buf_len;
920 int buf_virtual = 0;
921 u32 name_len; 962 u32 name_len;
922 u32 data_len; 963 u32 data_len;
923 u32 cur; 964 u32 cur;
@@ -927,7 +968,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
927 int num; 968 int num;
928 u8 type; 969 u8 type;
929 970
930 buf_len = PAGE_SIZE;
931 buf = kmalloc(buf_len, GFP_NOFS); 971 buf = kmalloc(buf_len, GFP_NOFS);
932 if (!buf) { 972 if (!buf) {
933 ret = -ENOMEM; 973 ret = -ENOMEM;
@@ -949,30 +989,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
949 type = btrfs_dir_type(eb, di); 989 type = btrfs_dir_type(eb, di);
950 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 990 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
951 991
992 /*
993 * Path too long
994 */
952 if (name_len + data_len > buf_len) { 995 if (name_len + data_len > buf_len) {
953 buf_len = PAGE_ALIGN(name_len + data_len); 996 ret = -ENAMETOOLONG;
954 if (buf_virtual) { 997 goto out;
955 buf2 = vmalloc(buf_len);
956 if (!buf2) {
957 ret = -ENOMEM;
958 goto out;
959 }
960 vfree(buf);
961 } else {
962 buf2 = krealloc(buf, buf_len, GFP_NOFS);
963 if (!buf2) {
964 buf2 = vmalloc(buf_len);
965 if (!buf2) {
966 ret = -ENOMEM;
967 goto out;
968 }
969 kfree(buf);
970 buf_virtual = 1;
971 }
972 }
973
974 buf = buf2;
975 buf2 = NULL;
976 } 998 }
977 999
978 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1000 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1017,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 } 1017 }
996 1018
997out: 1019out:
998 if (buf_virtual) 1020 kfree(buf);
999 vfree(buf);
1000 else
1001 kfree(buf);
1002 return ret; 1021 return ret;
1003} 1022}
1004 1023
@@ -1292,8 +1311,6 @@ static int find_extent_clone(struct send_ctx *sctx,
1292 extent_item_pos = logical - found_key.objectid; 1311 extent_item_pos = logical - found_key.objectid;
1293 else 1312 else
1294 extent_item_pos = 0; 1313 extent_item_pos = 0;
1295
1296 extent_item_pos = logical - found_key.objectid;
1297 ret = iterate_extent_inodes(sctx->send_root->fs_info, 1314 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1298 found_key.objectid, extent_item_pos, 1, 1315 found_key.objectid, extent_item_pos, 1,
1299 __iterate_backrefs, backref_ctx); 1316 __iterate_backrefs, backref_ctx);
@@ -1418,11 +1435,7 @@ static int gen_unique_name(struct send_ctx *sctx,
1418 while (1) { 1435 while (1) {
1419 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 1436 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1420 ino, gen, idx); 1437 ino, gen, idx);
1421 if (len >= sizeof(tmp)) { 1438 ASSERT(len < sizeof(tmp));
1422 /* should really not happen */
1423 ret = -EOVERFLOW;
1424 goto out;
1425 }
1426 1439
1427 di = btrfs_lookup_dir_item(NULL, sctx->send_root, 1440 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1428 path, BTRFS_FIRST_FREE_OBJECTID, 1441 path, BTRFS_FIRST_FREE_OBJECTID,
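
The EOVERFLOW branch removed above could never trigger: three u64 values print to at most 20 digits each, so the orphan name "o<ino>-<gen>-<idx>" is at most 63 characters, which always fits the on-stack buffer (assumed to be 64 bytes here) together with its NUL. A quick userspace check of that bound:

#include <stdio.h>

int main(void)
{
	char tmp[64];	/* assumed size of the stack buffer in gen_unique_name() */
	int len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
			   ~0ULL, ~0ULL, ~0ULL);

	/* 'o' + 20 digits + '-' + 20 + '-' + 20 = 63 chars, so
	 * len < sizeof(tmp) holds even in the worst case; the runtime
	 * check was dead code, hence the switch to ASSERT(). */
	printf("worst-case len = %d\n", len);	/* prints 63 */
	return 0;
}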
@@ -1898,13 +1911,20 @@ static void name_cache_delete(struct send_ctx *sctx,
1898 1911
1899 nce_head = radix_tree_lookup(&sctx->name_cache, 1912 nce_head = radix_tree_lookup(&sctx->name_cache,
1900 (unsigned long)nce->ino); 1913 (unsigned long)nce->ino);
1901 BUG_ON(!nce_head); 1914 if (!nce_head) {
1915 btrfs_err(sctx->send_root->fs_info,
1916 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
1917 nce->ino, sctx->name_cache_size);
1918 }
1902 1919
1903 list_del(&nce->radix_list); 1920 list_del(&nce->radix_list);
1904 list_del(&nce->list); 1921 list_del(&nce->list);
1905 sctx->name_cache_size--; 1922 sctx->name_cache_size--;
1906 1923
1907 if (list_empty(nce_head)) { 1924 /*
1925 * We may not get to the final release of nce_head if the lookup fails
1926 */
1927 if (nce_head && list_empty(nce_head)) {
1908 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); 1928 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
1909 kfree(nce_head); 1929 kfree(nce_head);
1910 } 1930 }
@@ -1977,7 +1997,6 @@ static void name_cache_free(struct send_ctx *sctx)
1977 */ 1997 */
1978static int __get_cur_name_and_parent(struct send_ctx *sctx, 1998static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 u64 ino, u64 gen, 1999 u64 ino, u64 gen,
1980 int skip_name_cache,
1981 u64 *parent_ino, 2000 u64 *parent_ino,
1982 u64 *parent_gen, 2001 u64 *parent_gen,
1983 struct fs_path *dest) 2002 struct fs_path *dest)
@@ -1987,8 +2006,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1987 struct btrfs_path *path = NULL; 2006 struct btrfs_path *path = NULL;
1988 struct name_cache_entry *nce = NULL; 2007 struct name_cache_entry *nce = NULL;
1989 2008
1990 if (skip_name_cache)
1991 goto get_ref;
1992 /* 2009 /*
1993 * First check if we already did a call to this function with the same 2010 * First check if we already did a call to this function with the same
1994 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 2011 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2033,12 +2050,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2033 goto out_cache; 2050 goto out_cache;
2034 } 2051 }
2035 2052
2036get_ref:
2037 /* 2053 /*
2038 * Depending on whether the inode was already processed or not, use 2054 * Depending on whether the inode was already processed or not, use
2039 * send_root or parent_root for ref lookup. 2055 * send_root or parent_root for ref lookup.
2040 */ 2056 */
2041 if (ino < sctx->send_progress && !skip_name_cache) 2057 if (ino < sctx->send_progress)
2042 ret = get_first_ref(sctx->send_root, ino, 2058 ret = get_first_ref(sctx->send_root, ino,
2043 parent_ino, parent_gen, dest); 2059 parent_ino, parent_gen, dest);
2044 else 2060 else
@@ -2062,8 +2078,6 @@ get_ref:
2062 goto out; 2078 goto out;
2063 ret = 1; 2079 ret = 1;
2064 } 2080 }
2065 if (skip_name_cache)
2066 goto out;
2067 2081
2068out_cache: 2082out_cache:
2069 /* 2083 /*
@@ -2131,9 +2145,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2131 u64 parent_inode = 0; 2145 u64 parent_inode = 0;
2132 u64 parent_gen = 0; 2146 u64 parent_gen = 0;
2133 int stop = 0; 2147 int stop = 0;
2134 u64 start_ino = ino;
2135 u64 start_gen = gen;
2136 int skip_name_cache = 0;
2137 2148
2138 name = fs_path_alloc(); 2149 name = fs_path_alloc();
2139 if (!name) { 2150 if (!name) {
@@ -2141,31 +2152,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2141 goto out; 2152 goto out;
2142 } 2153 }
2143 2154
2144 if (is_waiting_for_move(sctx, ino))
2145 skip_name_cache = 1;
2146
2147again:
2148 dest->reversed = 1; 2155 dest->reversed = 1;
2149 fs_path_reset(dest); 2156 fs_path_reset(dest);
2150 2157
2151 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2158 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2152 fs_path_reset(name); 2159 fs_path_reset(name);
2153 2160
2154 ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, 2161 if (is_waiting_for_rm(sctx, ino)) {
2155 &parent_inode, &parent_gen, name); 2162 ret = gen_unique_name(sctx, ino, gen, name);
2163 if (ret < 0)
2164 goto out;
2165 ret = fs_path_add_path(dest, name);
2166 break;
2167 }
2168
2169 if (is_waiting_for_move(sctx, ino)) {
2170 ret = get_first_ref(sctx->parent_root, ino,
2171 &parent_inode, &parent_gen, name);
2172 } else {
2173 ret = __get_cur_name_and_parent(sctx, ino, gen,
2174 &parent_inode,
2175 &parent_gen, name);
2176 if (ret)
2177 stop = 1;
2178 }
2179
2156 if (ret < 0) 2180 if (ret < 0)
2157 goto out; 2181 goto out;
2158 if (ret)
2159 stop = 1;
2160
2161 if (!skip_name_cache &&
2162 is_waiting_for_move(sctx, parent_inode)) {
2163 ino = start_ino;
2164 gen = start_gen;
2165 stop = 0;
2166 skip_name_cache = 1;
2167 goto again;
2168 }
2169 2182
2170 ret = fs_path_add_path(dest, name); 2183 ret = fs_path_add_path(dest, name);
2171 if (ret < 0) 2184 if (ret < 0)
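
get_cur_path() assembles names root-ward: each loop iteration resolves one ancestor and prepends it, which is what the reversed fs_path mode exists for. A minimal userspace model of prepend-into-a-reversed-buffer (fixed size, no bounds checks, purely illustrative):

#include <stdio.h>
#include <string.h>

struct rpath {
	char buf[256];
	char *start;	/* first byte of the assembled path */
};

static void rpath_init(struct rpath *p)
{
	p->start = p->buf + sizeof(p->buf) - 1;
	*p->start = '\0';
}

static void rpath_prepend(struct rpath *p, const char *name)
{
	size_t len = strlen(name);

	if (*p->start)			/* not the first component */
		*--p->start = '/';
	p->start -= len;
	memcpy(p->start, name, len);
}

int main(void)
{
	struct rpath p;

	rpath_init(&p);
	/* walk ino -> parent -> ... -> root, as get_cur_path() does */
	rpath_prepend(&p, "file");
	rpath_prepend(&p, "dir");
	rpath_prepend(&p, "a");
	printf("%s\n", p.start);	/* a/dir/file */
	return 0;
}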
@@ -2429,10 +2442,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2429 if (!p) 2442 if (!p)
2430 return -ENOMEM; 2443 return -ENOMEM;
2431 2444
2432 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, 2445 if (ino != sctx->cur_ino) {
2433 NULL, &rdev); 2446 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
2434 if (ret < 0) 2447 NULL, NULL, &rdev);
2435 goto out; 2448 if (ret < 0)
2449 goto out;
2450 } else {
2451 gen = sctx->cur_inode_gen;
2452 mode = sctx->cur_inode_mode;
2453 rdev = sctx->cur_inode_rdev;
2454 }
2436 2455
2437 if (S_ISREG(mode)) { 2456 if (S_ISREG(mode)) {
2438 cmd = BTRFS_SEND_C_MKFILE; 2457 cmd = BTRFS_SEND_C_MKFILE;
@@ -2512,17 +2531,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2512 key.objectid = dir; 2531 key.objectid = dir;
2513 key.type = BTRFS_DIR_INDEX_KEY; 2532 key.type = BTRFS_DIR_INDEX_KEY;
2514 key.offset = 0; 2533 key.offset = 0;
2534 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2535 if (ret < 0)
2536 goto out;
2537
2515 while (1) { 2538 while (1) {
2516 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, 2539 eb = path->nodes[0];
2517 1, 0); 2540 slot = path->slots[0];
2518 if (ret < 0) 2541 if (slot >= btrfs_header_nritems(eb)) {
2519 goto out; 2542 ret = btrfs_next_leaf(sctx->send_root, path);
2520 if (!ret) { 2543 if (ret < 0) {
2521 eb = path->nodes[0]; 2544 goto out;
2522 slot = path->slots[0]; 2545 } else if (ret > 0) {
2523 btrfs_item_key_to_cpu(eb, &found_key, slot); 2546 ret = 0;
2547 break;
2548 }
2549 continue;
2524 } 2550 }
2525 if (ret || found_key.objectid != key.objectid || 2551
2552 btrfs_item_key_to_cpu(eb, &found_key, slot);
2553 if (found_key.objectid != key.objectid ||
2526 found_key.type != key.type) { 2554 found_key.type != key.type) {
2527 ret = 0; 2555 ret = 0;
2528 goto out; 2556 goto out;
@@ -2537,8 +2565,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2537 goto out; 2565 goto out;
2538 } 2566 }
2539 2567
2540 key.offset = found_key.offset + 1; 2568 path->slots[0]++;
2541 btrfs_release_path(path);
2542 } 2569 }
2543 2570
2544out: 2571out:
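
This did_create_dir() rewrite is one instance of a pattern the patch applies repeatedly below (can_rmdir, process_all_refs, process_all_new_xattrs): instead of a fresh tree search per item, do one search and walk the slots within the leaf, calling btrfs_next_leaf() only at leaf boundaries. The skeleton, condensed from the hunks themselves with error handling trimmed (not compilable on its own):

ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
	goto out;
while (1) {
	struct extent_buffer *eb = path->nodes[0];
	int slot = path->slots[0];

	if (slot >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;	/* error */
		if (ret > 0) {
			ret = 0;	/* no more leaves */
			break;
		}
		continue;		/* re-read nodes[0]/slots[0] */
	}
	btrfs_item_key_to_cpu(eb, &found_key, slot);
	if (found_key.objectid != key.objectid || found_key.type != key.type)
		break;			/* walked past our key range */

	/* ... process the item at (eb, slot) ... */

	path->slots[0]++;		/* cheap in-leaf advance */
}
btrfs_release_path(path);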
@@ -2590,7 +2617,7 @@ struct recorded_ref {
2590 * everything mixed. So we first record all refs and later process them. 2617 * everything mixed. So we first record all refs and later process them.
2591 * This function is a helper to record one ref. 2618 * This function is a helper to record one ref.
2592 */ 2619 */
2593static int record_ref(struct list_head *head, u64 dir, 2620static int __record_ref(struct list_head *head, u64 dir,
2594 u64 dir_gen, struct fs_path *path) 2621 u64 dir_gen, struct fs_path *path)
2595{ 2622{
2596 struct recorded_ref *ref; 2623 struct recorded_ref *ref;
@@ -2676,12 +2703,78 @@ out:
2676 return ret; 2703 return ret;
2677} 2704}
2678 2705
2706static struct orphan_dir_info *
2707add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2708{
2709 struct rb_node **p = &sctx->orphan_dirs.rb_node;
2710 struct rb_node *parent = NULL;
2711 struct orphan_dir_info *entry, *odi;
2712
2713 odi = kmalloc(sizeof(*odi), GFP_NOFS);
2714 if (!odi)
2715 return ERR_PTR(-ENOMEM);
2716 odi->ino = dir_ino;
2717 odi->gen = 0;
2718
2719 while (*p) {
2720 parent = *p;
2721 entry = rb_entry(parent, struct orphan_dir_info, node);
2722 if (dir_ino < entry->ino) {
2723 p = &(*p)->rb_left;
2724 } else if (dir_ino > entry->ino) {
2725 p = &(*p)->rb_right;
2726 } else {
2727 kfree(odi);
2728 return entry;
2729 }
2730 }
2731
2732 rb_link_node(&odi->node, parent, p);
2733 rb_insert_color(&odi->node, &sctx->orphan_dirs);
2734 return odi;
2735}
2736
2737static struct orphan_dir_info *
2738get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2739{
2740 struct rb_node *n = sctx->orphan_dirs.rb_node;
2741 struct orphan_dir_info *entry;
2742
2743 while (n) {
2744 entry = rb_entry(n, struct orphan_dir_info, node);
2745 if (dir_ino < entry->ino)
2746 n = n->rb_left;
2747 else if (dir_ino > entry->ino)
2748 n = n->rb_right;
2749 else
2750 return entry;
2751 }
2752 return NULL;
2753}
2754
2755static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
2756{
2757 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
2758
2759 return odi != NULL;
2760}
2761
2762static void free_orphan_dir_info(struct send_ctx *sctx,
2763 struct orphan_dir_info *odi)
2764{
2765 if (!odi)
2766 return;
2767 rb_erase(&odi->node, &sctx->orphan_dirs);
2768 kfree(odi);
2769}
2770
2679/* 2771/*
2680 * Returns 1 if a directory can be removed at this point in time. 2772 * Returns 1 if a directory can be removed at this point in time.
2681 * We check this by iterating all dir items and checking if the inode behind 2773 * We check this by iterating all dir items and checking if the inode behind
2682 * the dir item was already processed. 2774 * the dir item was already processed.
2683 */ 2775 */
2684static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) 2776static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2777 u64 send_progress)
2685{ 2778{
2686 int ret = 0; 2779 int ret = 0;
2687 struct btrfs_root *root = sctx->parent_root; 2780 struct btrfs_root *root = sctx->parent_root;
@@ -2704,31 +2797,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2704 key.objectid = dir; 2797 key.objectid = dir;
2705 key.type = BTRFS_DIR_INDEX_KEY; 2798 key.type = BTRFS_DIR_INDEX_KEY;
2706 key.offset = 0; 2799 key.offset = 0;
2800 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2801 if (ret < 0)
2802 goto out;
2707 2803
2708 while (1) { 2804 while (1) {
2709 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 2805 struct waiting_dir_move *dm;
2710 if (ret < 0) 2806
2711 goto out; 2807 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2712 if (!ret) { 2808 ret = btrfs_next_leaf(root, path);
2713 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2809 if (ret < 0)
2714 path->slots[0]); 2810 goto out;
2811 else if (ret > 0)
2812 break;
2813 continue;
2715 } 2814 }
2716 if (ret || found_key.objectid != key.objectid || 2815 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2717 found_key.type != key.type) { 2816 path->slots[0]);
2817 if (found_key.objectid != key.objectid ||
2818 found_key.type != key.type)
2718 break; 2819 break;
2719 }
2720 2820
2721 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 2821 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2722 struct btrfs_dir_item); 2822 struct btrfs_dir_item);
2723 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 2823 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2724 2824
2825 dm = get_waiting_dir_move(sctx, loc.objectid);
2826 if (dm) {
2827 struct orphan_dir_info *odi;
2828
2829 odi = add_orphan_dir_info(sctx, dir);
2830 if (IS_ERR(odi)) {
2831 ret = PTR_ERR(odi);
2832 goto out;
2833 }
2834 odi->gen = dir_gen;
2835 dm->rmdir_ino = dir;
2836 ret = 0;
2837 goto out;
2838 }
2839
2725 if (loc.objectid > send_progress) { 2840 if (loc.objectid > send_progress) {
2726 ret = 0; 2841 ret = 0;
2727 goto out; 2842 goto out;
2728 } 2843 }
2729 2844
2730 btrfs_release_path(path); 2845 path->slots[0]++;
2731 key.offset = found_key.offset + 1;
2732 } 2846 }
2733 2847
2734 ret = 1; 2848 ret = 1;
@@ -2740,19 +2854,9 @@ out:
2740 2854
2741static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) 2855static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
2742{ 2856{
2743 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2857 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
2744 struct waiting_dir_move *entry;
2745 2858
2746 while (n) { 2859 return entry != NULL;
2747 entry = rb_entry(n, struct waiting_dir_move, node);
2748 if (ino < entry->ino)
2749 n = n->rb_left;
2750 else if (ino > entry->ino)
2751 n = n->rb_right;
2752 else
2753 return 1;
2754 }
2755 return 0;
2756} 2860}
2757 2861
2758static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2862static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2765,6 +2869,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2765 if (!dm) 2869 if (!dm)
2766 return -ENOMEM; 2870 return -ENOMEM;
2767 dm->ino = ino; 2871 dm->ino = ino;
2872 dm->rmdir_ino = 0;
2768 2873
2769 while (*p) { 2874 while (*p) {
2770 parent = *p; 2875 parent = *p;
@@ -2784,31 +2889,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2784 return 0; 2889 return 0;
2785} 2890}
2786 2891
2787static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2892static struct waiting_dir_move *
2893get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2788{ 2894{
2789 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2895 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2790 struct waiting_dir_move *entry; 2896 struct waiting_dir_move *entry;
2791 2897
2792 while (n) { 2898 while (n) {
2793 entry = rb_entry(n, struct waiting_dir_move, node); 2899 entry = rb_entry(n, struct waiting_dir_move, node);
2794 if (ino < entry->ino) { 2900 if (ino < entry->ino)
2795 n = n->rb_left; 2901 n = n->rb_left;
2796 } else if (ino > entry->ino) { 2902 else if (ino > entry->ino)
2797 n = n->rb_right; 2903 n = n->rb_right;
2798 } else { 2904 else
2799 rb_erase(&entry->node, &sctx->waiting_dir_moves); 2905 return entry;
2800 kfree(entry);
2801 return 0;
2802 }
2803 } 2906 }
2804 return -ENOENT; 2907 return NULL;
2908}
2909
2910static void free_waiting_dir_move(struct send_ctx *sctx,
2911 struct waiting_dir_move *dm)
2912{
2913 if (!dm)
2914 return;
2915 rb_erase(&dm->node, &sctx->waiting_dir_moves);
2916 kfree(dm);
2805} 2917}
2806 2918
2807static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) 2919static int add_pending_dir_move(struct send_ctx *sctx,
2920 u64 ino,
2921 u64 ino_gen,
2922 u64 parent_ino)
2808{ 2923{
2809 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 2924 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2810 struct rb_node *parent = NULL; 2925 struct rb_node *parent = NULL;
2811 struct pending_dir_move *entry, *pm; 2926 struct pending_dir_move *entry = NULL, *pm;
2812 struct recorded_ref *cur; 2927 struct recorded_ref *cur;
2813 int exists = 0; 2928 int exists = 0;
2814 int ret; 2929 int ret;
@@ -2817,8 +2932,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
2817 if (!pm) 2932 if (!pm)
2818 return -ENOMEM; 2933 return -ENOMEM;
2819 pm->parent_ino = parent_ino; 2934 pm->parent_ino = parent_ino;
2820 pm->ino = sctx->cur_ino; 2935 pm->ino = ino;
2821 pm->gen = sctx->cur_inode_gen; 2936 pm->gen = ino_gen;
2822 INIT_LIST_HEAD(&pm->list); 2937 INIT_LIST_HEAD(&pm->list);
2823 INIT_LIST_HEAD(&pm->update_refs); 2938 INIT_LIST_HEAD(&pm->update_refs);
2824 RB_CLEAR_NODE(&pm->node); 2939 RB_CLEAR_NODE(&pm->node);
@@ -2888,19 +3003,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2888{ 3003{
2889 struct fs_path *from_path = NULL; 3004 struct fs_path *from_path = NULL;
2890 struct fs_path *to_path = NULL; 3005 struct fs_path *to_path = NULL;
3006 struct fs_path *name = NULL;
2891 u64 orig_progress = sctx->send_progress; 3007 u64 orig_progress = sctx->send_progress;
2892 struct recorded_ref *cur; 3008 struct recorded_ref *cur;
3009 u64 parent_ino, parent_gen;
3010 struct waiting_dir_move *dm = NULL;
3011 u64 rmdir_ino = 0;
2893 int ret; 3012 int ret;
2894 3013
3014 name = fs_path_alloc();
2895 from_path = fs_path_alloc(); 3015 from_path = fs_path_alloc();
2896 if (!from_path) 3016 if (!name || !from_path) {
2897 return -ENOMEM; 3017 ret = -ENOMEM;
3018 goto out;
3019 }
2898 3020
2899 sctx->send_progress = pm->ino; 3021 dm = get_waiting_dir_move(sctx, pm->ino);
2900 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); 3022 ASSERT(dm);
3023 rmdir_ino = dm->rmdir_ino;
3024 free_waiting_dir_move(sctx, dm);
3025
3026 ret = get_first_ref(sctx->parent_root, pm->ino,
3027 &parent_ino, &parent_gen, name);
2901 if (ret < 0) 3028 if (ret < 0)
2902 goto out; 3029 goto out;
2903 3030
3031 if (parent_ino == sctx->cur_ino) {
3032 /* child only renamed, not moved */
3033 ASSERT(parent_gen == sctx->cur_inode_gen);
3034 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3035 from_path);
3036 if (ret < 0)
3037 goto out;
3038 ret = fs_path_add_path(from_path, name);
3039 if (ret < 0)
3040 goto out;
3041 } else {
3042 /* child moved and maybe renamed too */
3043 sctx->send_progress = pm->ino;
3044 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
3045 if (ret < 0)
3046 goto out;
3047 }
3048
3049 fs_path_free(name);
3050 name = NULL;
3051
2904 to_path = fs_path_alloc(); 3052 to_path = fs_path_alloc();
2905 if (!to_path) { 3053 if (!to_path) {
2906 ret = -ENOMEM; 3054 ret = -ENOMEM;
@@ -2908,9 +3056,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2908 } 3056 }
2909 3057
2910 sctx->send_progress = sctx->cur_ino + 1; 3058 sctx->send_progress = sctx->cur_ino + 1;
2911 ret = del_waiting_dir_move(sctx, pm->ino);
2912 ASSERT(ret == 0);
2913
2914 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3059 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
2915 if (ret < 0) 3060 if (ret < 0)
2916 goto out; 3061 goto out;
@@ -2919,6 +3064,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2919 if (ret < 0) 3064 if (ret < 0)
2920 goto out; 3065 goto out;
2921 3066
3067 if (rmdir_ino) {
3068 struct orphan_dir_info *odi;
3069
3070 odi = get_orphan_dir_info(sctx, rmdir_ino);
3071 if (!odi) {
3072 /* already deleted */
3073 goto finish;
3074 }
3075 ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
3076 if (ret < 0)
3077 goto out;
3078 if (!ret)
3079 goto finish;
3080
3081 name = fs_path_alloc();
3082 if (!name) {
3083 ret = -ENOMEM;
3084 goto out;
3085 }
3086 ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
3087 if (ret < 0)
3088 goto out;
3089 ret = send_rmdir(sctx, name);
3090 if (ret < 0)
3091 goto out;
3092 free_orphan_dir_info(sctx, odi);
3093 }
3094
3095finish:
2922 ret = send_utimes(sctx, pm->ino, pm->gen); 3096 ret = send_utimes(sctx, pm->ino, pm->gen);
2923 if (ret < 0) 3097 if (ret < 0)
2924 goto out; 3098 goto out;
@@ -2928,12 +3102,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2928 * and old parent(s). 3102 * and old parent(s).
2929 */ 3103 */
2930 list_for_each_entry(cur, &pm->update_refs, list) { 3104 list_for_each_entry(cur, &pm->update_refs, list) {
3105 if (cur->dir == rmdir_ino)
3106 continue;
2931 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3107 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
2932 if (ret < 0) 3108 if (ret < 0)
2933 goto out; 3109 goto out;
2934 } 3110 }
2935 3111
2936out: 3112out:
3113 fs_path_free(name);
2937 fs_path_free(from_path); 3114 fs_path_free(from_path);
2938 fs_path_free(to_path); 3115 fs_path_free(to_path);
2939 sctx->send_progress = orig_progress; 3116 sctx->send_progress = orig_progress;
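
Taken together, the can_rmdir() and apply_dir_move() hunks implement a small handshake for directories that cannot be deleted yet. Condensed to its essentials, using only calls visible in the hunks above (error handling dropped, not compilable standalone):

/* 1) While evaluating a deletion, can_rmdir() finds a child that is
 *    still waiting to be moved: defer the rmdir instead of failing it. */
dm = get_waiting_dir_move(sctx, loc.objectid);
if (dm) {
	odi = add_orphan_dir_info(sctx, dir);	/* remember the orphan dir */
	odi->gen = dir_gen;
	dm->rmdir_ino = dir;			/* link it to the blocker */
	return 0;				/* "cannot rmdir yet" */
}

/* 2) Later, once apply_dir_move() has renamed that child, it retries: */
if (rmdir_ino) {
	odi = get_orphan_dir_info(sctx, rmdir_ino);
	if (odi && can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1) > 0) {
		get_cur_path(sctx, rmdir_ino, odi->gen, name);
		send_rmdir(sctx, name);		/* the deferred deletion */
		free_orphan_dir_info(sctx, odi);
	}
}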
@@ -3005,17 +3182,19 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3005 int ret; 3182 int ret;
3006 u64 ino = parent_ref->dir; 3183 u64 ino = parent_ref->dir;
3007 u64 parent_ino_before, parent_ino_after; 3184 u64 parent_ino_before, parent_ino_after;
3008 u64 new_gen, old_gen; 3185 u64 old_gen;
3009 struct fs_path *path_before = NULL; 3186 struct fs_path *path_before = NULL;
3010 struct fs_path *path_after = NULL; 3187 struct fs_path *path_after = NULL;
3011 int len1, len2; 3188 int len1, len2;
3012 3189 int register_upper_dirs;
3013 if (parent_ref->dir <= sctx->cur_ino) 3190 u64 gen;
3014 return 0;
3015 3191
3016 if (is_waiting_for_move(sctx, ino)) 3192 if (is_waiting_for_move(sctx, ino))
3017 return 1; 3193 return 1;
3018 3194
3195 if (parent_ref->dir <= sctx->cur_ino)
3196 return 0;
3197
3019 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, 3198 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3020 NULL, NULL, NULL, NULL); 3199 NULL, NULL, NULL, NULL);
3021 if (ret == -ENOENT) 3200 if (ret == -ENOENT)
@@ -3023,12 +3202,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3023 else if (ret < 0) 3202 else if (ret < 0)
3024 return ret; 3203 return ret;
3025 3204
3026 ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, 3205 if (parent_ref->dir_gen != old_gen)
3027 NULL, NULL, NULL, NULL);
3028 if (ret < 0)
3029 return ret;
3030
3031 if (new_gen != old_gen)
3032 return 0; 3206 return 0;
3033 3207
3034 path_before = fs_path_alloc(); 3208 path_before = fs_path_alloc();
@@ -3051,7 +3225,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3051 } 3225 }
3052 3226
3053 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3227 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3054 NULL, path_after); 3228 &gen, path_after);
3055 if (ret == -ENOENT) { 3229 if (ret == -ENOENT) {
3056 ret = 0; 3230 ret = 0;
3057 goto out; 3231 goto out;
@@ -3061,13 +3235,67 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3061 3235
3062 len1 = fs_path_len(path_before); 3236 len1 = fs_path_len(path_before);
3063 len2 = fs_path_len(path_after); 3237 len2 = fs_path_len(path_after);
3064 if ((parent_ino_before != parent_ino_after) && (len1 != len2 || 3238 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3065 memcmp(path_before->start, path_after->start, len1))) { 3239 memcmp(path_before->start, path_after->start, len1)) {
3066 ret = 1; 3240 ret = 1;
3067 goto out; 3241 goto out;
3068 } 3242 }
3069 ret = 0; 3243 ret = 0;
3070 3244
3245 /*
3246 * Ok, our new most direct ancestor has a higher inode number but
3247 * wasn't moved/renamed. So maybe some of the new ancestors higher in
3248 * the hierarchy have a higher inode number too *and* were renamed
3249 * or moved - in this case we need to wait for the ancestor's rename
3250 * or move operation before we can do the move/rename for the current
3251 * inode.
3252 */
3253 register_upper_dirs = 0;
3254 ino = parent_ino_after;
3255again:
3256 while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
3257 u64 parent_gen;
3258
3259 fs_path_reset(path_before);
3260 fs_path_reset(path_after);
3261
3262 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3263 &parent_gen, path_after);
3264 if (ret < 0)
3265 goto out;
3266 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3267 NULL, path_before);
3268 if (ret == -ENOENT) {
3269 ret = 0;
3270 break;
3271 } else if (ret < 0) {
3272 goto out;
3273 }
3274
3275 len1 = fs_path_len(path_before);
3276 len2 = fs_path_len(path_after);
3277 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3278 memcmp(path_before->start, path_after->start, len1)) {
3279 ret = 1;
3280 if (register_upper_dirs) {
3281 break;
3282 } else {
3283 register_upper_dirs = 1;
3284 ino = parent_ref->dir;
3285 gen = parent_ref->dir_gen;
3286 goto again;
3287 }
3288 } else if (register_upper_dirs) {
3289 ret = add_pending_dir_move(sctx, ino, gen,
3290 parent_ino_after);
3291 if (ret < 0 && ret != -EEXIST)
3292 goto out;
3293 }
3294
3295 ino = parent_ino_after;
3296 gen = parent_gen;
3297 }
3298
3071out: 3299out:
3072 fs_path_free(path_before); 3300 fs_path_free(path_before);
3073 fs_path_free(path_after); 3301 fs_path_free(path_after);
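
The loop added above generalizes the wait test from the immediate parent to the whole ancestor chain: any not-yet-processed ancestor whose (parent, name) pair differs between the two snapshots forces the current inode to wait. A userspace model of just that test; lookup_ref() is a hypothetical stand-in for get_first_ref(), and the second-pass registration of upper directories is omitted:

#include <stdbool.h>
#include <string.h>

struct ref { unsigned long parent_ino; const char *name; };

/* hypothetical helper: the inode's (parent, name) in one snapshot,
 * where snapshot 0 is the parent tree and 1 is the send tree */
struct ref lookup_ref(int snapshot, unsigned long ino);

static bool must_wait(unsigned long ino, unsigned long cur_ino)
{
	while (ino > cur_ino) {	/* only not-yet-processed ancestors matter */
		struct ref before = lookup_ref(0, ino);
		struct ref after  = lookup_ref(1, ino);

		if (before.parent_ino != after.parent_ino ||
		    strcmp(before.name, after.name) != 0)
			return true;	/* that ancestor must move first */
		ino = after.parent_ino;	/* climb towards the root */
	}
	return false;
}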
@@ -3089,6 +3317,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3089 u64 ow_gen; 3317 u64 ow_gen;
3090 int did_overwrite = 0; 3318 int did_overwrite = 0;
3091 int is_orphan = 0; 3319 int is_orphan = 0;
3320 u64 last_dir_ino_rm = 0;
3092 3321
3093verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 3322verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3094 3323
@@ -3227,9 +3456,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3227 * dirs, we always have one new and one deleted 3456 * dirs, we always have one new and one deleted
3228 * ref. The deleted ref is ignored later. 3457 * ref. The deleted ref is ignored later.
3229 */ 3458 */
3230 if (wait_for_parent_move(sctx, cur)) { 3459 ret = wait_for_parent_move(sctx, cur);
3460 if (ret < 0)
3461 goto out;
3462 if (ret) {
3231 ret = add_pending_dir_move(sctx, 3463 ret = add_pending_dir_move(sctx,
3232 cur->dir); 3464 sctx->cur_ino,
3465 sctx->cur_inode_gen,
3466 cur->dir);
3233 *pending_move = 1; 3467 *pending_move = 1;
3234 } else { 3468 } else {
3235 ret = send_rename(sctx, valid_path, 3469 ret = send_rename(sctx, valid_path,
@@ -3259,7 +3493,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3259 * later, we do this check again and rmdir it then if possible. 3493 * later, we do this check again and rmdir it then if possible.
3260 * See the use of check_dirs for more details. 3494 * See the use of check_dirs for more details.
3261 */ 3495 */
3262 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); 3496 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3497 sctx->cur_ino);
3263 if (ret < 0) 3498 if (ret < 0)
3264 goto out; 3499 goto out;
3265 if (ret) { 3500 if (ret) {
@@ -3350,8 +3585,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3350 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3585 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3351 if (ret < 0) 3586 if (ret < 0)
3352 goto out; 3587 goto out;
3353 } else if (ret == inode_state_did_delete) { 3588 } else if (ret == inode_state_did_delete &&
3354 ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); 3589 cur->dir != last_dir_ino_rm) {
3590 ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
3591 sctx->cur_ino);
3355 if (ret < 0) 3592 if (ret < 0)
3356 goto out; 3593 goto out;
3357 if (ret) { 3594 if (ret) {
@@ -3362,6 +3599,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3362 ret = send_rmdir(sctx, valid_path); 3599 ret = send_rmdir(sctx, valid_path);
3363 if (ret < 0) 3600 if (ret < 0)
3364 goto out; 3601 goto out;
3602 last_dir_ino_rm = cur->dir;
3365 } 3603 }
3366 } 3604 }
3367 } 3605 }
@@ -3375,9 +3613,8 @@ out:
3375 return ret; 3613 return ret;
3376} 3614}
3377 3615
3378static int __record_new_ref(int num, u64 dir, int index, 3616static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
3379 struct fs_path *name, 3617 struct fs_path *name, void *ctx, struct list_head *refs)
3380 void *ctx)
3381{ 3618{
3382 int ret = 0; 3619 int ret = 0;
3383 struct send_ctx *sctx = ctx; 3620 struct send_ctx *sctx = ctx;
@@ -3388,7 +3625,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3388 if (!p) 3625 if (!p)
3389 return -ENOMEM; 3626 return -ENOMEM;
3390 3627
3391 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3628 ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
3392 NULL, NULL); 3629 NULL, NULL);
3393 if (ret < 0) 3630 if (ret < 0)
3394 goto out; 3631 goto out;
@@ -3400,7 +3637,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3400 if (ret < 0) 3637 if (ret < 0)
3401 goto out; 3638 goto out;
3402 3639
3403 ret = record_ref(&sctx->new_refs, dir, gen, p); 3640 ret = __record_ref(refs, dir, gen, p);
3404 3641
3405out: 3642out:
3406 if (ret) 3643 if (ret)
@@ -3408,37 +3645,23 @@ out:
3408 return ret; 3645 return ret;
3409} 3646}
3410 3647
3648static int __record_new_ref(int num, u64 dir, int index,
3649 struct fs_path *name,
3650 void *ctx)
3651{
3652 struct send_ctx *sctx = ctx;
3653 return record_ref(sctx->send_root, num, dir, index, name,
3654 ctx, &sctx->new_refs);
3655}
3656
3657
3411static int __record_deleted_ref(int num, u64 dir, int index, 3658static int __record_deleted_ref(int num, u64 dir, int index,
3412 struct fs_path *name, 3659 struct fs_path *name,
3413 void *ctx) 3660 void *ctx)
3414{ 3661{
3415 int ret = 0;
3416 struct send_ctx *sctx = ctx; 3662 struct send_ctx *sctx = ctx;
3417 struct fs_path *p; 3663 return record_ref(sctx->parent_root, num, dir, index, name,
3418 u64 gen; 3664 ctx, &sctx->deleted_refs);
3419
3420 p = fs_path_alloc();
3421 if (!p)
3422 return -ENOMEM;
3423
3424 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3425 NULL, NULL);
3426 if (ret < 0)
3427 goto out;
3428
3429 ret = get_cur_path(sctx, dir, gen, p);
3430 if (ret < 0)
3431 goto out;
3432 ret = fs_path_add_path(p, name);
3433 if (ret < 0)
3434 goto out;
3435
3436 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3437
3438out:
3439 if (ret)
3440 fs_path_free(p);
3441 return ret;
3442} 3665}
3443 3666
3444static int record_new_ref(struct send_ctx *sctx) 3667static int record_new_ref(struct send_ctx *sctx)
@@ -3619,21 +3842,31 @@ static int process_all_refs(struct send_ctx *sctx,
3619 root = sctx->parent_root; 3842 root = sctx->parent_root;
3620 cb = __record_deleted_ref; 3843 cb = __record_deleted_ref;
3621 } else { 3844 } else {
3622 BUG(); 3845 btrfs_err(sctx->send_root->fs_info,
3846 "Wrong command %d in process_all_refs", cmd);
3847 ret = -EINVAL;
3848 goto out;
3623 } 3849 }
3624 3850
3625 key.objectid = sctx->cmp_key->objectid; 3851 key.objectid = sctx->cmp_key->objectid;
3626 key.type = BTRFS_INODE_REF_KEY; 3852 key.type = BTRFS_INODE_REF_KEY;
3627 key.offset = 0; 3853 key.offset = 0;
3628 while (1) { 3854 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3629 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3855 if (ret < 0)
3630 if (ret < 0) 3856 goto out;
3631 goto out;
3632 if (ret)
3633 break;
3634 3857
3858 while (1) {
3635 eb = path->nodes[0]; 3859 eb = path->nodes[0];
3636 slot = path->slots[0]; 3860 slot = path->slots[0];
3861 if (slot >= btrfs_header_nritems(eb)) {
3862 ret = btrfs_next_leaf(root, path);
3863 if (ret < 0)
3864 goto out;
3865 else if (ret > 0)
3866 break;
3867 continue;
3868 }
3869
3637 btrfs_item_key_to_cpu(eb, &found_key, slot); 3870 btrfs_item_key_to_cpu(eb, &found_key, slot);
3638 3871
3639 if (found_key.objectid != key.objectid || 3872 if (found_key.objectid != key.objectid ||
@@ -3642,11 +3875,10 @@ static int process_all_refs(struct send_ctx *sctx,
3642 break; 3875 break;
3643 3876
3644 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 3877 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3645 btrfs_release_path(path);
3646 if (ret < 0) 3878 if (ret < 0)
3647 goto out; 3879 goto out;
3648 3880
3649 key.offset = found_key.offset + 1; 3881 path->slots[0]++;
3650 } 3882 }
3651 btrfs_release_path(path); 3883 btrfs_release_path(path);
3652 3884
@@ -3927,19 +4159,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3927 key.objectid = sctx->cmp_key->objectid; 4159 key.objectid = sctx->cmp_key->objectid;
3928 key.type = BTRFS_XATTR_ITEM_KEY; 4160 key.type = BTRFS_XATTR_ITEM_KEY;
3929 key.offset = 0; 4161 key.offset = 0;
3930 while (1) { 4162 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3931 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 4163 if (ret < 0)
3932 if (ret < 0) 4164 goto out;
3933 goto out;
3934 if (ret) {
3935 ret = 0;
3936 goto out;
3937 }
3938 4165
4166 while (1) {
3939 eb = path->nodes[0]; 4167 eb = path->nodes[0];
3940 slot = path->slots[0]; 4168 slot = path->slots[0];
3941 btrfs_item_key_to_cpu(eb, &found_key, slot); 4169 if (slot >= btrfs_header_nritems(eb)) {
4170 ret = btrfs_next_leaf(root, path);
4171 if (ret < 0) {
4172 goto out;
4173 } else if (ret > 0) {
4174 ret = 0;
4175 break;
4176 }
4177 continue;
4178 }
3942 4179
4180 btrfs_item_key_to_cpu(eb, &found_key, slot);
3943 if (found_key.objectid != key.objectid || 4181 if (found_key.objectid != key.objectid ||
3944 found_key.type != key.type) { 4182 found_key.type != key.type) {
3945 ret = 0; 4183 ret = 0;
@@ -3951,8 +4189,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3951 if (ret < 0) 4189 if (ret < 0)
3952 goto out; 4190 goto out;
3953 4191
3954 btrfs_release_path(path); 4192 path->slots[0]++;
3955 key.offset = found_key.offset + 1;
3956 } 4193 }
3957 4194
3958out: 4195out:
@@ -3991,6 +4228,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
3991 goto out; 4228 goto out;
3992 4229
3993 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; 4230 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
4231
4232 /* initial readahead */
4233 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4234 file_ra_state_init(&sctx->ra, inode->i_mapping);
4235 btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
4236 last_index - index + 1);
4237
3994 while (index <= last_index) { 4238 while (index <= last_index) {
3995 unsigned cur_len = min_t(unsigned, len, 4239 unsigned cur_len = min_t(unsigned, len,
3996 PAGE_CACHE_SIZE - pg_offset); 4240 PAGE_CACHE_SIZE - pg_offset);
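
Priming file_ra_state before the page loop lets the pages arrive in large batches instead of one synchronous read per page. A rough userspace analogue of the same hint, with posix_fadvise() standing in for btrfs_force_ra():

#include <fcntl.h>
#include <unistd.h>

/* Read [offset, offset+len) after hinting the kernel to read the
 * whole range ahead; the per-chunk reads then mostly hit the cache. */
static int read_range(int fd, off_t offset, size_t len, char *dst)
{
	posix_fadvise(fd, offset, len, POSIX_FADV_WILLNEED);
	return pread(fd, dst, len, offset) == (ssize_t)len ? 0 : -1;
}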
@@ -4763,18 +5007,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4763 ret = apply_children_dir_moves(sctx); 5007 ret = apply_children_dir_moves(sctx);
4764 if (ret) 5008 if (ret)
4765 goto out; 5009 goto out;
5010 /*
5011 * Need to send that every time, no matter if it actually
5012 * changed between the two trees as we have done changes to
5013 * the inode before. If our inode is a directory and it's
5014 * waiting to be moved/renamed, we will send its utimes when
5015 * it's moved/renamed, therefore we don't need to do it here.
5016 */
5017 sctx->send_progress = sctx->cur_ino + 1;
5018 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
5019 if (ret < 0)
5020 goto out;
4766 } 5021 }
4767 5022
4768 /*
4769 * Need to send that every time, no matter if it actually
4770 * changed between the two trees as we have done changes to
4771 * the inode before.
4772 */
4773 sctx->send_progress = sctx->cur_ino + 1;
4774 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4775 if (ret < 0)
4776 goto out;
4777
4778out: 5023out:
4779 return ret; 5024 return ret;
4780} 5025}
@@ -4840,6 +5085,8 @@ static int changed_inode(struct send_ctx *sctx,
4840 sctx->left_path->nodes[0], left_ii); 5085 sctx->left_path->nodes[0], left_ii);
4841 sctx->cur_inode_mode = btrfs_inode_mode( 5086 sctx->cur_inode_mode = btrfs_inode_mode(
4842 sctx->left_path->nodes[0], left_ii); 5087 sctx->left_path->nodes[0], left_ii);
5088 sctx->cur_inode_rdev = btrfs_inode_rdev(
5089 sctx->left_path->nodes[0], left_ii);
4843 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 5090 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4844 ret = send_create_inode_if_needed(sctx); 5091 ret = send_create_inode_if_needed(sctx);
4845 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 5092 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4884,6 +5131,8 @@ static int changed_inode(struct send_ctx *sctx,
4884 sctx->left_path->nodes[0], left_ii); 5131 sctx->left_path->nodes[0], left_ii);
4885 sctx->cur_inode_mode = btrfs_inode_mode( 5132 sctx->cur_inode_mode = btrfs_inode_mode(
4886 sctx->left_path->nodes[0], left_ii); 5133 sctx->left_path->nodes[0], left_ii);
5134 sctx->cur_inode_rdev = btrfs_inode_rdev(
5135 sctx->left_path->nodes[0], left_ii);
4887 ret = send_create_inode_if_needed(sctx); 5136 ret = send_create_inode_if_needed(sctx);
4888 if (ret < 0) 5137 if (ret < 0)
4889 goto out; 5138 goto out;
@@ -5118,6 +5367,7 @@ out:
5118static int full_send_tree(struct send_ctx *sctx) 5367static int full_send_tree(struct send_ctx *sctx)
5119{ 5368{
5120 int ret; 5369 int ret;
5370 struct btrfs_trans_handle *trans = NULL;
5121 struct btrfs_root *send_root = sctx->send_root; 5371 struct btrfs_root *send_root = sctx->send_root;
5122 struct btrfs_key key; 5372 struct btrfs_key key;
5123 struct btrfs_key found_key; 5373 struct btrfs_key found_key;
@@ -5139,6 +5389,19 @@ static int full_send_tree(struct send_ctx *sctx)
5139 key.type = BTRFS_INODE_ITEM_KEY; 5389 key.type = BTRFS_INODE_ITEM_KEY;
5140 key.offset = 0; 5390 key.offset = 0;
5141 5391
5392join_trans:
5393 /*
5394 * We need to make sure the transaction does not get committed
5395 * while we do anything on commit roots. Join a transaction to prevent
5396 * this.
5397 */
5398 trans = btrfs_join_transaction(send_root);
5399 if (IS_ERR(trans)) {
5400 ret = PTR_ERR(trans);
5401 trans = NULL;
5402 goto out;
5403 }
5404
5142 /* 5405 /*
5143 * Make sure the tree has not changed after re-joining. We detect this 5406 * Make sure the tree has not changed after re-joining. We detect this
5144 * by comparing start_ctransid and ctransid. They should always match. 5407 * by comparing start_ctransid and ctransid. They should always match.
@@ -5162,6 +5425,19 @@ static int full_send_tree(struct send_ctx *sctx)
5162 goto out_finish; 5425 goto out_finish;
5163 5426
5164 while (1) { 5427 while (1) {
5428 /*
5429 * When someone wants to commit while we iterate, end the
5430 * joined transaction and rejoin.
5431 */
5432 if (btrfs_should_end_transaction(trans, send_root)) {
5433 ret = btrfs_end_transaction(trans, send_root);
5434 trans = NULL;
5435 if (ret < 0)
5436 goto out;
5437 btrfs_release_path(path);
5438 goto join_trans;
5439 }
5440
5165 eb = path->nodes[0]; 5441 eb = path->nodes[0];
5166 slot = path->slots[0]; 5442 slot = path->slots[0];
5167 btrfs_item_key_to_cpu(eb, &found_key, slot); 5443 btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -5189,6 +5465,12 @@ out_finish:
5189 5465
5190out: 5466out:
5191 btrfs_free_path(path); 5467 btrfs_free_path(path);
5468 if (trans) {
5469 if (!ret)
5470 ret = btrfs_end_transaction(trans, send_root);
5471 else
5472 btrfs_end_transaction(trans, send_root);
5473 }
5192 return ret; 5474 return ret;
5193} 5475}
5194 5476
@@ -5340,6 +5622,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5340 5622
5341 sctx->pending_dir_moves = RB_ROOT; 5623 sctx->pending_dir_moves = RB_ROOT;
5342 sctx->waiting_dir_moves = RB_ROOT; 5624 sctx->waiting_dir_moves = RB_ROOT;
5625 sctx->orphan_dirs = RB_ROOT;
5343 5626
5344 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 5627 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
5345 (arg->clone_sources_count + 1)); 5628 (arg->clone_sources_count + 1));
@@ -5477,6 +5760,16 @@ out:
5477 kfree(dm); 5760 kfree(dm);
5478 } 5761 }
5479 5762
5763 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
5764 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
5765 struct rb_node *n;
5766 struct orphan_dir_info *odi;
5767
5768 n = rb_first(&sctx->orphan_dirs);
5769 odi = rb_entry(n, struct orphan_dir_info, node);
5770 free_orphan_dir_info(sctx, odi);
5771 }
5772
5480 if (sort_clone_roots) { 5773 if (sort_clone_roots) {
5481 for (i = 0; i < sctx->clone_roots_cnt; i++) 5774 for (i = 0; i < sctx->clone_roots_cnt; i++)
5482 btrfs_root_dec_send_in_progress( 5775 btrfs_root_dec_send_in_progress(
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d04db817be5c..9dbf42395153 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1305,13 +1305,6 @@ error_fs_info:
1305 return ERR_PTR(error); 1305 return ERR_PTR(error);
1306} 1306}
1307 1307
1308static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
1309{
1310 spin_lock_irq(&workers->lock);
1311 workers->max_workers = new_limit;
1312 spin_unlock_irq(&workers->lock);
1313}
1314
1315static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, 1308static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1316 int new_pool_size, int old_pool_size) 1309 int new_pool_size, int old_pool_size)
1317{ 1310{
@@ -1323,21 +1316,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1323 btrfs_info(fs_info, "resize thread pool %d -> %d", 1316 btrfs_info(fs_info, "resize thread pool %d -> %d",
1324 old_pool_size, new_pool_size); 1317 old_pool_size, new_pool_size);
1325 1318
1326 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1319 btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
1327 btrfs_set_max_workers(&fs_info->workers, new_pool_size); 1320 btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
1328 btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); 1321 btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
1329 btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); 1322 btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
1330 btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); 1323 btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
1331 btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); 1324 btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
1332 btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); 1325 btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
1333 btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); 1326 new_pool_size);
1334 btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); 1327 btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
1335 btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); 1328 btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
1336 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1329 btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
1337 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1330 btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
1338 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1331 btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
1339 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, 1332 new_pool_size);
1340 new_pool_size);
1341} 1333}
1342 1334
1343static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) 1335static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
@@ -1388,6 +1380,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1388 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1380 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1389 int ret; 1381 int ret;
1390 1382
1383 sync_filesystem(sb);
1391 btrfs_remount_prepare(fs_info); 1384 btrfs_remount_prepare(fs_info);
1392 1385
1393 ret = btrfs_parse_options(root, data); 1386 ret = btrfs_parse_options(root, data);
@@ -1479,6 +1472,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1479 sb->s_flags &= ~MS_RDONLY; 1472 sb->s_flags &= ~MS_RDONLY;
1480 } 1473 }
1481out: 1474out:
1475 wake_up_process(fs_info->transaction_kthread);
1482 btrfs_remount_cleanup(fs_info, old_opts); 1476 btrfs_remount_cleanup(fs_info, old_opts);
1483 return 0; 1477 return 0;
1484 1478
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 865f4cf9a769..c5eb2143dc66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
24#include <linux/kobject.h> 24#include <linux/kobject.h>
25#include <linux/bug.h> 25#include <linux/bug.h>
26#include <linux/genhd.h> 26#include <linux/genhd.h>
27#include <linux/debugfs.h>
27 28
28#include "ctree.h" 29#include "ctree.h"
29#include "disk-io.h" 30#include "disk-io.h"
@@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
599/* /sys/fs/btrfs/ entry */ 600/* /sys/fs/btrfs/ entry */
600static struct kset *btrfs_kset; 601static struct kset *btrfs_kset;
601 602
603/* /sys/kernel/debug/btrfs */
604static struct dentry *btrfs_debugfs_root_dentry;
605
606/* Debugging tunables and exported data */
607u64 btrfs_debugfs_test;
608
602int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 609int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
603{ 610{
604 int error; 611 int error;
@@ -642,27 +649,41 @@ failure:
642 return error; 649 return error;
643} 650}
644 651
652static int btrfs_init_debugfs(void)
653{
654#ifdef CONFIG_DEBUG_FS
655 btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
656 if (!btrfs_debugfs_root_dentry)
657 return -ENOMEM;
658
659 debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
660 &btrfs_debugfs_test);
661#endif
662 return 0;
663}
664
645int btrfs_init_sysfs(void) 665int btrfs_init_sysfs(void)
646{ 666{
647 int ret; 667 int ret;
668
648 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 669 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
649 if (!btrfs_kset) 670 if (!btrfs_kset)
650 return -ENOMEM; 671 return -ENOMEM;
651 672
652 init_feature_attrs(); 673 ret = btrfs_init_debugfs();
674 if (ret)
675 return ret;
653 676
677 init_feature_attrs();
654 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 678 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
655 if (ret) {
656 kset_unregister(btrfs_kset);
657 return ret;
658 }
659 679
660 return 0; 680 return ret;
661} 681}
662 682
663void btrfs_exit_sysfs(void) 683void btrfs_exit_sysfs(void)
664{ 684{
665 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 685 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
666 kset_unregister(btrfs_kset); 686 kset_unregister(btrfs_kset);
687 debugfs_remove_recursive(btrfs_debugfs_root_dentry);
667} 688}
668 689
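
The debugfs additions follow the stock pattern: one directory per subsystem, entries bound directly to variables, recursive removal on exit. A minimal out-of-tree module doing the same, with hypothetical names; only the calls the patch itself uses appear here:

#include <linux/debugfs.h>
#include <linux/module.h>

static struct dentry *demo_root;
static u64 demo_value;

static int __init demo_init(void)
{
	demo_root = debugfs_create_dir("demo", NULL);
	if (!demo_root)
		return -ENOMEM;
	/* read/write from userspace via /sys/kernel/debug/demo/value */
	debugfs_create_u64("value", S_IRUGO | S_IWUGO, demo_root,
			   &demo_value);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove_recursive(demo_root);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");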
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d44..9ab576318a84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
1#ifndef _BTRFS_SYSFS_H_ 1#ifndef _BTRFS_SYSFS_H_
2#define _BTRFS_SYSFS_H_ 2#define _BTRFS_SYSFS_H_
3 3
4/*
5 * Data exported through sysfs
6 */
7extern u64 btrfs_debugfs_test;
8
4enum btrfs_feature_set { 9enum btrfs_feature_set {
5 FEAT_COMPAT, 10 FEAT_COMPAT,
6 FEAT_COMPAT_RO, 11 FEAT_COMPAT_RO,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4a..a04707f740d6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -683,7 +683,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	int lock = (trans->type != TRANS_JOIN_NOLOCK);
 	int err = 0;
 
-	if (--trans->use_count) {
+	if (trans->use_count > 1) {
+		trans->use_count--;
 		trans->block_rsv = trans->orig_rsv;
 		return 0;
 	}
@@ -731,17 +732,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
-		if (throttle) {
-			/*
-			 * We may race with somebody else here so end up having
-			 * to call end_transaction on ourselves again, so inc
-			 * our use_count.
-			 */
-			trans->use_count++;
+		if (throttle)
 			return btrfs_commit_transaction(trans, root);
-		} else {
+		else
 			wake_up_process(info->transaction_kthread);
-		}
 	}
 
 	if (trans->type & __TRANS_FREEZABLE)
@@ -1578,10 +1572,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 
 	trace_btrfs_transaction_commit(root);
 
-	btrfs_scrub_continue(root);
-
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
+	btrfs_scrub_cancel(root->fs_info);
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 }
@@ -1621,7 +1614,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
 	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-		return btrfs_start_delalloc_roots(fs_info, 1);
+		return btrfs_start_delalloc_roots(fs_info, 1, -1);
 	return 0;
 }
 
@@ -1754,7 +1747,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	/* ->aborted might be set after the previous check, so check it */
 	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
 		ret = cur_trans->aborted;
-		goto cleanup_transaction;
+		goto scrub_continue;
 	}
 	/*
 	 * the reloc mutex makes sure that we stop
@@ -1771,7 +1764,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = create_pending_snapshots(trans, root->fs_info);
 	if (ret) {
 		mutex_unlock(&root->fs_info->reloc_mutex);
-		goto cleanup_transaction;
+		goto scrub_continue;
 	}
 
 	/*
@@ -1787,13 +1780,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = btrfs_run_delayed_items(trans, root);
 	if (ret) {
 		mutex_unlock(&root->fs_info->reloc_mutex);
-		goto cleanup_transaction;
+		goto scrub_continue;
 	}
 
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 	if (ret) {
 		mutex_unlock(&root->fs_info->reloc_mutex);
-		goto cleanup_transaction;
+		goto scrub_continue;
 	}
 
 	/*
@@ -1823,7 +1816,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	if (ret) {
 		mutex_unlock(&root->fs_info->tree_log_mutex);
 		mutex_unlock(&root->fs_info->reloc_mutex);
-		goto cleanup_transaction;
+		goto scrub_continue;
 	}
 
 	/*
@@ -1844,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	if (ret) {
 		mutex_unlock(&root->fs_info->tree_log_mutex);
 		mutex_unlock(&root->fs_info->reloc_mutex);
-		goto cleanup_transaction;
+		goto scrub_continue;
 	}
 
 	/*
@@ -1855,7 +1848,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		ret = cur_trans->aborted;
 		mutex_unlock(&root->fs_info->tree_log_mutex);
 		mutex_unlock(&root->fs_info->reloc_mutex);
-		goto cleanup_transaction;
+		goto scrub_continue;
 	}
 
 	btrfs_prepare_extent_commit(trans, root);
@@ -1891,13 +1884,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		btrfs_error(root->fs_info, ret,
 			    "Error while writing out transaction");
 		mutex_unlock(&root->fs_info->tree_log_mutex);
-		goto cleanup_transaction;
+		goto scrub_continue;
 	}
 
 	ret = write_ctree_super(trans, root, 0);
 	if (ret) {
 		mutex_unlock(&root->fs_info->tree_log_mutex);
-		goto cleanup_transaction;
+		goto scrub_continue;
 	}
 
 	/*
@@ -1940,6 +1933,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	return ret;
 
+scrub_continue:
+	btrfs_scrub_continue(root);
 cleanup_transaction:
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
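The first transaction.c hunk is worth dwelling on: the old code decremented use_count before it knew whether the throttle path would hand the handle to btrfs_commit_transaction(), which is why the deleted comment had to explain a compensating trans->use_count++. Testing use_count > 1 first means only a genuinely nested handle is unwound early, and the outermost reference falls through to the real end-of-transaction work. A hedged sketch of that rule, with illustrative names rather than the btrfs structures:

struct handle {
	int use_count;			/* nesting depth of this handle */
};

static int commit(struct handle *h)	/* consumes the handle */
{
	h->use_count = 0;
	return 0;
}

static int end_handle(struct handle *h, int throttle)
{
	if (h->use_count > 1) {
		/* nested caller: drop one level, keep the handle alive */
		h->use_count--;
		return 0;
	}

	/* outermost caller: the handle can be handed off without first
	 * re-incrementing use_count */
	if (throttle)
		return commit(h);

	/* normal end-of-transaction work would happen here */
	return 0;
}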
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..e2f45fc02610 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
  * syncing the tree wait for us to finish
  */
 static int start_log_trans(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root)
+			   struct btrfs_root *root,
+			   struct btrfs_log_ctx *ctx)
 {
+	int index;
 	int ret;
-	int err = 0;
 
 	mutex_lock(&root->log_mutex);
 	if (root->log_root) {
+		if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+		    trans->transid) {
+			ret = -EAGAIN;
+			goto out;
+		}
+
 		if (!root->log_start_pid) {
 			root->log_start_pid = current->pid;
 			root->log_multiple_pids = false;
@@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 
 		atomic_inc(&root->log_batch);
 		atomic_inc(&root->log_writers);
+		if (ctx) {
+			index = root->log_transid % 2;
+			list_add_tail(&ctx->list, &root->log_ctxs[index]);
+			ctx->log_transid = root->log_transid;
+		}
 		mutex_unlock(&root->log_mutex);
 		return 0;
 	}
-	root->log_multiple_pids = false;
-	root->log_start_pid = current->pid;
+
+	ret = 0;
 	mutex_lock(&root->fs_info->tree_log_mutex);
-	if (!root->fs_info->log_root_tree) {
+	if (!root->fs_info->log_root_tree)
 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
-		if (ret)
-			err = ret;
-	}
-	if (err == 0 && !root->log_root) {
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+	if (ret)
+		goto out;
+
+	if (!root->log_root) {
 		ret = btrfs_add_log_tree(trans, root);
 		if (ret)
-			err = ret;
+			goto out;
 	}
-	mutex_unlock(&root->fs_info->tree_log_mutex);
+	root->log_multiple_pids = false;
+	root->log_start_pid = current->pid;
 	atomic_inc(&root->log_batch);
 	atomic_inc(&root->log_writers);
+	if (ctx) {
+		index = root->log_transid % 2;
+		list_add_tail(&ctx->list, &root->log_ctxs[index]);
+		ctx->log_transid = root->log_transid;
+	}
+out:
 	mutex_unlock(&root->log_mutex);
-	return err;
+	return ret;
 }
 
 /*
@@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int wait_log_commit(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, unsigned long transid)
+static void wait_log_commit(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, int transid)
 {
 	DEFINE_WAIT(wait);
 	int index = transid % 2;
@@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
 
-		if (root->fs_info->last_trans_log_full_commit !=
-		    trans->transid && root->log_transid < transid + 2 &&
+		if (root->log_transid_committed < transid &&
 		    atomic_read(&root->log_commit[index]))
 			schedule();
 
 		finish_wait(&root->log_commit_wait[index], &wait);
 		mutex_lock(&root->log_mutex);
-	} while (root->fs_info->last_trans_log_full_commit !=
-		 trans->transid && root->log_transid < transid + 2 &&
+	} while (root->log_transid_committed < transid &&
 		 atomic_read(&root->log_commit[index]));
-	return 0;
 }
 
 static void wait_for_writer(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root)
 {
 	DEFINE_WAIT(wait);
-	while (root->fs_info->last_trans_log_full_commit !=
-	       trans->transid && atomic_read(&root->log_writers)) {
+
+	while (atomic_read(&root->log_writers)) {
 		prepare_to_wait(&root->log_writer_wait,
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
-		if (root->fs_info->last_trans_log_full_commit !=
-		    trans->transid && atomic_read(&root->log_writers))
+		if (atomic_read(&root->log_writers))
 			schedule();
 		mutex_lock(&root->log_mutex);
 		finish_wait(&root->log_writer_wait, &wait);
 	}
 }
 
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+					struct btrfs_log_ctx *ctx)
+{
+	if (!ctx)
+		return;
+
+	mutex_lock(&root->log_mutex);
+	list_del_init(&ctx->list);
+	mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Invoked in log mutex context, or be sure there is no other task which
+ * can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+					     int index, int error)
+{
+	struct btrfs_log_ctx *ctx;
+
+	if (!error) {
+		INIT_LIST_HEAD(&root->log_ctxs[index]);
+		return;
+	}
+
+	list_for_each_entry(ctx, &root->log_ctxs[index], list)
+		ctx->log_ret = error;
+
+	INIT_LIST_HEAD(&root->log_ctxs[index]);
+}
+
 /*
  * btrfs_sync_log does sends a given tree log down to the disk and
  * updates the super blocks to record it. When this call is done,
@@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
  * that has happened.
  */
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
-		   struct btrfs_root *root)
+		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
 {
 	int index1;
 	int index2;
@@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
-	unsigned long log_transid = 0;
+	int log_transid = 0;
+	struct btrfs_log_ctx root_log_ctx;
 	struct blk_plug plug;
 
 	mutex_lock(&root->log_mutex);
-	log_transid = root->log_transid;
-	index1 = root->log_transid % 2;
+	log_transid = ctx->log_transid;
+	if (root->log_transid_committed >= log_transid) {
+		mutex_unlock(&root->log_mutex);
+		return ctx->log_ret;
+	}
+
+	index1 = log_transid % 2;
 	if (atomic_read(&root->log_commit[index1])) {
-		wait_log_commit(trans, root, root->log_transid);
+		wait_log_commit(trans, root, log_transid);
 		mutex_unlock(&root->log_mutex);
-		return 0;
+		return ctx->log_ret;
 	}
+	ASSERT(log_transid == root->log_transid);
 	atomic_set(&root->log_commit[index1], 1);
 
 	/* wait for previous tree log sync to complete */
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-		wait_log_commit(trans, root, root->log_transid - 1);
+		wait_log_commit(trans, root, log_transid - 1);
+
 	while (1) {
 		int batch = atomic_read(&root->log_batch);
 		/* when we're on an ssd, just kick the log commit out */
@@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 
 	/* bail out if we need to do a full commit */
-	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+	if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+	    trans->transid) {
 		ret = -EAGAIN;
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&root->log_mutex);
@@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		blk_finish_plug(&plug);
 		btrfs_abort_transaction(trans, root, ret);
 		btrfs_free_logged_extents(log, log_transid);
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		mutex_unlock(&root->log_mutex);
 		goto out;
 	}
@@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
-	smp_mb();
 	/*
 	 * IO has been started, blocks of the log tree have WRITTEN flag set
 	 * in their headers. new modifications of the log will be written to
@@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	mutex_unlock(&root->log_mutex);
 
+	btrfs_init_log_ctx(&root_log_ctx);
+
 	mutex_lock(&log_root_tree->log_mutex);
 	atomic_inc(&log_root_tree->log_batch);
 	atomic_inc(&log_root_tree->log_writers);
+
+	index2 = log_root_tree->log_transid % 2;
+	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+	root_log_ctx.log_transid = log_root_tree->log_transid;
+
 	mutex_unlock(&log_root_tree->log_mutex);
 
 	ret = update_log_root(trans, log);
@@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 
 	if (ret) {
+		if (!list_empty(&root_log_ctx.list))
+			list_del_init(&root_log_ctx.list);
+
 		blk_finish_plug(&plug);
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		if (ret != -ENOSPC) {
 			btrfs_abort_transaction(trans, root, ret);
 			mutex_unlock(&log_root_tree->log_mutex);
 			goto out;
 		}
-		root->fs_info->last_trans_log_full_commit = trans->transid;
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	index2 = log_root_tree->log_transid % 2;
+	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+		mutex_unlock(&log_root_tree->log_mutex);
+		ret = root_log_ctx.log_ret;
+		goto out;
+	}
+
+	index2 = root_log_ctx.log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		wait_log_commit(trans, log_root_tree,
-				log_root_tree->log_transid);
+				root_log_ctx.log_transid);
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
-		ret = 0;
+		ret = root_log_ctx.log_ret;
 		goto out;
 	}
+	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
 	atomic_set(&log_root_tree->log_commit[index2], 1);
 
 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
 		wait_log_commit(trans, log_root_tree,
-				log_root_tree->log_transid - 1);
+				root_log_ctx.log_transid - 1);
 	}
 
 	wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * now that we've moved on to the tree of log tree roots,
 	 * check the full commit flag again
 	 */
-	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+	if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+	    trans->transid) {
 		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 					 EXTENT_DIRTY | EXTENT_NEW);
 	blk_finish_plug(&plug);
 	if (ret) {
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		btrfs_abort_transaction(trans, root, ret);
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 				btrfs_header_level(log_root_tree->node));
 
 	log_root_tree->log_transid++;
-	smp_mb();
-
 	mutex_unlock(&log_root_tree->log_mutex);
 
 	/*
@@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
 	if (ret) {
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_wake_log_root;
 	}
@@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->log_mutex);
 
 out_wake_log_root:
+	/*
+	 * We needn't get log_mutex here because we are sure all
+	 * the other tasks are blocked.
+	 */
+	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	log_root_tree->log_transid_committed++;
 	atomic_set(&log_root_tree->log_commit[index2], 0);
-	smp_mb();
+	mutex_unlock(&log_root_tree->log_mutex);
+
 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
 		wake_up(&log_root_tree->log_commit_wait[index2]);
 out:
+	/* See above. */
+	btrfs_remove_all_log_ctxs(root, index1, ret);
+
+	mutex_lock(&root->log_mutex);
+	root->log_transid_committed++;
 	atomic_set(&root->log_commit[index1], 0);
-	smp_mb();
+	mutex_unlock(&root->log_mutex);
+
 	if (waitqueue_active(&root->log_commit_wait[index1]))
 		wake_up(&root->log_commit_wait[index1]);
 	return ret;
@@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
 
 static int log_one_extent(struct btrfs_trans_handle *trans,
 			  struct inode *inode, struct btrfs_root *root,
-			  struct extent_map *em, struct btrfs_path *path)
+			  struct extent_map *em, struct btrfs_path *path,
+			  struct list_head *logged_list)
 {
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_file_extent_item *fi;
@@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	u64 extent_offset = em->start - em->orig_start;
 	u64 block_len;
 	int ret;
-	int index = log->log_transid % 2;
 	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 	int extent_inserted = 0;
 
@@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	 * First check and see if our csums are on our outstanding ordered
 	 * extents.
 	 */
-again:
-	spin_lock_irq(&log->log_extents_lock[index]);
-	list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+	list_for_each_entry(ordered, logged_list, log_list) {
 		struct btrfs_ordered_sum *sum;
 
 		if (!mod_len)
 			break;
 
-		if (ordered->inode != inode)
-			continue;
-
 		if (ordered->file_offset + ordered->len <= mod_start ||
 		    mod_start + mod_len <= ordered->file_offset)
 			continue;
@@ -3632,12 +3720,6 @@ again:
 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
 				     &ordered->flags))
 			continue;
-		atomic_inc(&ordered->refs);
-		spin_unlock_irq(&log->log_extents_lock[index]);
-		/*
-		 * we've dropped the lock, we must either break or
-		 * start over after this.
-		 */
 
 		if (ordered->csum_bytes_left) {
 			btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3729,11 @@ again:
 
 		list_for_each_entry(sum, &ordered->list, list) {
 			ret = btrfs_csum_file_blocks(trans, log, sum);
-			if (ret) {
-				btrfs_put_ordered_extent(ordered);
+			if (ret)
 				goto unlocked;
-			}
 		}
-		btrfs_put_ordered_extent(ordered);
-		goto again;
 
 	}
-	spin_unlock_irq(&log->log_extents_lock[index]);
 unlocked:
 
 	if (!mod_len || ret)
@@ -3694,7 +3771,8 @@ unlocked:
 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct inode *inode,
-				     struct btrfs_path *path)
+				     struct btrfs_path *path,
+				     struct list_head *logged_list)
 {
 	struct extent_map *em, *n;
 	struct list_head extents;
@@ -3752,7 +3830,7 @@ process:
 
 		write_unlock(&tree->lock);
 
-		ret = log_one_extent(trans, inode, root, em, path);
+		ret = log_one_extent(trans, inode, root, em, path, logged_list);
 		write_lock(&tree->lock);
 		clear_em_logging(tree, em);
 		free_extent_map(em);
@@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src = NULL;
+	LIST_HEAD(logged_list);
 	u64 last_extent = 0;
 	int err = 0;
 	int ret;
@@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
-	btrfs_get_logged_extents(log, inode);
+	btrfs_get_logged_extents(inode, &logged_list);
 
 	/*
 	 * a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4041,8 @@ log_extents:
 	btrfs_release_path(path);
 	btrfs_release_path(dst_path);
 	if (fast_search) {
-		ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
+		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+						&logged_list);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
@@ -3987,8 +4067,10 @@ log_extents:
 	BTRFS_I(inode)->logged_trans = trans->transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
-	if (err)
-		btrfs_free_logged_extents(log, log->log_transid);
+	if (unlikely(err))
+		btrfs_put_logged_extents(&logged_list);
+	else
+		btrfs_submit_logged_extents(&logged_list, log);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
 	btrfs_free_path(path);
@@ -4079,7 +4161,8 @@ out:
  */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root, struct inode *inode,
-				  struct dentry *parent, int exists_only)
+				  struct dentry *parent, int exists_only,
+				  struct btrfs_log_ctx *ctx)
 {
 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
 	struct super_block *sb;
@@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_no_trans;
 	}
 
-	ret = start_log_trans(trans, root);
+	ret = start_log_trans(trans, root, ctx);
 	if (ret)
-		goto end_trans;
+		goto end_no_trans;
 
 	ret = btrfs_log_inode(trans, root, inode, inode_only);
 	if (ret)
@@ -4166,6 +4249,9 @@ end_trans:
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		ret = 1;
 	}
+
+	if (ret)
+		btrfs_remove_log_ctx(root, ctx);
 	btrfs_end_log_trans(root);
 end_no_trans:
 	return ret;
@@ -4178,12 +4264,14 @@ end_no_trans:
  * data on disk.
  */
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct dentry *dentry)
+			  struct btrfs_root *root, struct dentry *dentry,
+			  struct btrfs_log_ctx *ctx)
 {
 	struct dentry *parent = dget_parent(dentry);
 	int ret;
 
-	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+				     0, ctx);
 	dput(parent);
 
 	return ret;
@@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
 		    root->fs_info->last_trans_committed))
 		return 0;
 
-	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
 }
 
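The common thread in these tree-log.c hunks is the btrfs_log_ctx handoff: every task that starts a log transaction queues a context on the root's per-transid list, exactly one task performs the sync, and the result is fanned out to all queued contexts by btrfs_remove_all_log_ctxs(), while log_transid_committed lets latecomers return the stored result instead of re-waiting. A rough userspace model of that fan-out — pthreads standing in for the kernel's mutex, illustrative names throughout:

#include <pthread.h>
#include <stdio.h>

struct log_ctx { int log_ret; struct log_ctx *next; };

static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct log_ctx *log_ctxs;	/* list of waiting syncers */

/* each syncer registers its context before the commit runs */
static void register_ctx(struct log_ctx *ctx)
{
	pthread_mutex_lock(&log_mutex);
	ctx->next = log_ctxs;
	log_ctxs = ctx;
	pthread_mutex_unlock(&log_mutex);
}

/* committer: propagate one result to every waiter, then empty the list */
static void fan_out_result(int error)
{
	pthread_mutex_lock(&log_mutex);
	for (struct log_ctx *c = log_ctxs; c; c = c->next)
		c->log_ret = error;
	log_ctxs = NULL;
	pthread_mutex_unlock(&log_mutex);
}

int main(void)
{
	struct log_ctx a = {0}, b = {0};

	register_ctx(&a);
	register_ctx(&b);
	fan_out_result(-5);	/* e.g. an -EIO from the single commit */
	printf("a=%d b=%d\n", a.log_ret, b.log_ret);
	return 0;
}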
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..91b145fce333 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,28 @@
 /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
 #define BTRFS_NO_LOG_SYNC 256
 
+struct btrfs_log_ctx {
+	int log_ret;
+	int log_transid;
+	struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+	ctx->log_ret = 0;
+	ctx->log_transid = 0;
+	INIT_LIST_HEAD(&ctx->list);
+}
+
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
-		   struct btrfs_root *root);
+		   struct btrfs_root *root, struct btrfs_log_ctx *ctx);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry,
+			  struct btrfs_log_ctx *ctx);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 const char *name, int name_len,
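Putting the pieces together: given the declarations above, a sync caller (fsync is the obvious one) would presumably allocate the context on the stack, let start_log_trans() queue it via btrfs_log_dentry_safe(), and then pass the same context to btrfs_sync_log() so that a commit performed by another task still returns the right status. A trimmed, illustrative fragment — not the actual fsync path, which has more cases to handle:

static int example_sync(struct btrfs_trans_handle *trans,
			struct btrfs_root *root, struct dentry *dentry)
{
	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);

	/* queues &ctx on root->log_ctxs[] via start_log_trans() */
	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	if (ret == 0)
		/* may wait on another task's commit and return ctx.log_ret */
		ret = btrfs_sync_log(trans, root, &ctx);

	return ret;
}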
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f80..d241130a32fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,7 +415,8 @@ loop_lock:
 		device->running_pending = 1;
 
 		spin_unlock(&device->io_lock);
-		btrfs_requeue_work(&device->work);
+		btrfs_queue_work(fs_info->submit_workers,
+				 &device->work);
 		goto done;
 	}
 	/* unplug every 64 requests just for good measure */
@@ -5263,6 +5264,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 static void btrfs_end_bio(struct bio *bio, int err)
 {
 	struct btrfs_bio *bbio = bio->bi_private;
+	struct btrfs_device *dev = bbio->stripes[0].dev;
 	int is_orig_bio = 0;
 
 	if (err) {
@@ -5270,7 +5272,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		if (err == -EIO || err == -EREMOTEIO) {
 			unsigned int stripe_index =
 				btrfs_io_bio(bio)->stripe_index;
-			struct btrfs_device *dev;
 
 			BUG_ON(stripe_index >= bbio->num_stripes);
 			dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5293,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
 	if (bio == bbio->orig_bio)
 		is_orig_bio = 1;
 
+	btrfs_bio_counter_dec(bbio->fs_info);
+
 	if (atomic_dec_and_test(&bbio->stripes_pending)) {
 		if (!is_orig_bio) {
 			bio_put(bio);
@@ -5328,13 +5331,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
 	}
 }
 
-struct async_sched {
-	struct bio *bio;
-	int rw;
-	struct btrfs_fs_info *info;
-	struct btrfs_work work;
-};
-
 /*
  * see run_scheduled_bios for a description of why bios are collected for
  * async submit.
@@ -5391,8 +5387,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 	spin_unlock(&device->io_lock);
 
 	if (should_queue)
-		btrfs_queue_worker(&root->fs_info->submit_workers,
-				   &device->work);
+		btrfs_queue_work(root->fs_info->submit_workers,
+				 &device->work);
 }
 
 static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5443,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 	}
 #endif
 	bio->bi_bdev = dev->bdev;
+
+	btrfs_bio_counter_inc_noblocked(root->fs_info);
+
 	if (async)
 		btrfs_schedule_bio(root, dev, rw, bio);
 	else
@@ -5515,28 +5514,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	length = bio->bi_iter.bi_size;
 	map_length = length;
 
+	btrfs_bio_counter_inc_blocked(root->fs_info);
 	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
 			      mirror_num, &raid_map);
-	if (ret) /* -ENOMEM */
+	if (ret) {
+		btrfs_bio_counter_dec(root->fs_info);
 		return ret;
+	}
 
 	total_devs = bbio->num_stripes;
 	bbio->orig_bio = first_bio;
 	bbio->private = first_bio->bi_private;
 	bbio->end_io = first_bio->bi_end_io;
+	bbio->fs_info = root->fs_info;
 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
 	if (raid_map) {
 		/* In this case, map_length has been set to the length of
 		   a single stripe; not the whole write */
 		if (rw & WRITE) {
-			return raid56_parity_write(root, bio, bbio,
-						   raid_map, map_length);
+			ret = raid56_parity_write(root, bio, bbio,
						  raid_map, map_length);
 		} else {
-			return raid56_parity_recover(root, bio, bbio,
-						     raid_map, map_length,
-						     mirror_num);
+			ret = raid56_parity_recover(root, bio, bbio,
+						    raid_map, map_length,
+						    mirror_num);
 		}
+		/*
+		 * FIXME, replace dosen't support raid56 yet, please fix
+		 * it in the future.
+		 */
+		btrfs_bio_counter_dec(root->fs_info);
+		return ret;
 	}
 
 	if (map_length < length) {
@@ -5578,6 +5587,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 					  async_submit);
 		dev_nr++;
 	}
+	btrfs_bio_counter_dec(root->fs_info);
 	return 0;
 }
 
@@ -5666,7 +5676,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 	else
 		generate_random_uuid(dev->uuid);
 
-	dev->work.func = pending_bios_fn;
+	btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
 
 	return dev;
 }
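The btrfs_bio_counter_* calls threaded through btrfs_map_bio() and btrfs_end_bio() follow a simple in-flight accounting discipline: increment once per bio before submission, decrement on every error path and in the completion handler, so another task (device replace is the likely consumer, per the FIXME) can wait for the count to drain. A hedged userspace model of the same discipline using C11 atomics, illustrative names only:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int bio_counter;	/* bios currently in flight */

static void bio_counter_inc(void)
{
	atomic_fetch_add(&bio_counter, 1);
}

static void bio_counter_dec(void)
{
	/* the kernel version would also wake any waiter when this hits 0 */
	atomic_fetch_sub(&bio_counter, 1);
}

static void end_bio(void)	/* stands in for btrfs_end_bio() */
{
	bio_counter_dec();
}

int main(void)
{
	bio_counter_inc();	/* submit */
	end_bio();		/* complete */
	printf("in flight: %d\n", atomic_load(&bio_counter));
	return 0;
}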
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
 struct btrfs_bio {
 	atomic_t stripes_pending;
+	struct btrfs_fs_info *fs_info;
 	bio_end_io_t *end_io;
 	struct bio *orig_bio;
 	void *private;
diff --git a/fs/buffer.c b/fs/buffer.c
index 27265a8b43c1..8c53a2b15ecb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3088,7 +3088,7 @@ EXPORT_SYMBOL(submit_bh);
  * until the buffer gets unlocked).
  *
  * ll_rw_block sets b_end_io to simple completion handler that marks
- * the buffer up-to-date (if approriate), unlocks the buffer and wakes
+ * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
  * any waiters.
  *
  * All of the buffers must be for the same device, and must also be a
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ca65f39dc8dc..6494d9f673aa 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -391,12 +391,12 @@ try_again:
 	path.dentry = dir;
 	path_to_graveyard.mnt = cache->mnt;
 	path_to_graveyard.dentry = cache->graveyard;
-	ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
+	ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
 	if (ret < 0) {
 		cachefiles_io_error(cache, "Rename security error %d", ret);
 	} else {
 		ret = vfs_rename(dir->d_inode, rep,
-				 cache->graveyard->d_inode, grave, NULL);
+				 cache->graveyard->d_inode, grave, NULL, 0);
 		if (ret != 0 && ret != -ENOMEM)
 			cachefiles_io_error(cache,
 					    "Rename failed with error %d", ret);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index ebaff368120d..4b1fb5ca65b8 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 		goto nomem_monitor;
 	}
 
-	ret = add_to_page_cache(newpage, bmapping,
-				netpage->index, cachefiles_gfp);
+	ret = add_to_page_cache_lru(newpage, bmapping,
+				    netpage->index, cachefiles_gfp);
 	if (ret == 0)
 		goto installed_new_backing_page;
 	if (ret != -EEXIST)
 		goto nomem_page;
 	}
 
-	/* we've installed a new backing page, so now we need to add it
-	 * to the LRU list and start it reading */
+	/* we've installed a new backing page, so now we need to start
+	 * it reading */
 installed_new_backing_page:
 	_debug("- new %p", newpage);
 
 	backpage = newpage;
 	newpage = NULL;
 
-	lru_cache_add_file(backpage);
-
 read_backing_page:
 	ret = bmapping->a_ops->readpage(NULL, backpage);
 	if (ret < 0)
@@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 			goto nomem;
 		}
 
-		ret = add_to_page_cache(newpage, bmapping,
-					netpage->index, cachefiles_gfp);
+		ret = add_to_page_cache_lru(newpage, bmapping,
+					    netpage->index,
+					    cachefiles_gfp);
 		if (ret == 0)
 			goto installed_new_backing_page;
 		if (ret != -EEXIST)
 			goto nomem;
 	}
 
-	/* we've installed a new backing page, so now we need to add it
-	 * to the LRU list and start it reading */
+	/* we've installed a new backing page, so now we need
+	 * to start it reading */
 	installed_new_backing_page:
 		_debug("- new %p", newpage);
 
 		backpage = newpage;
 		newpage = NULL;
 
-		lru_cache_add_file(backpage);
-
 	reread_backing_page:
 		ret = bmapping->a_ops->readpage(NULL, backpage);
 		if (ret < 0)
@@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 	monitor_backing_page:
 		_debug("- monitor add");
 
-		ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-					cachefiles_gfp);
+		ret = add_to_page_cache_lru(netpage, op->mapping,
+					    netpage->index, cachefiles_gfp);
 		if (ret < 0) {
 			if (ret == -EEXIST) {
 				page_cache_release(netpage);
@@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 			goto nomem;
 		}
 
-		lru_cache_add_file(netpage);
-
 		/* install a monitor */
 		page_cache_get(netpage);
 		monitor->netfs_page = netpage;
@@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 	backing_page_already_uptodate:
 		_debug("- uptodate");
 
-		ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-					cachefiles_gfp);
+		ret = add_to_page_cache_lru(netpage, op->mapping,
					    netpage->index, cachefiles_gfp);
 		if (ret < 0) {
 			if (ret == -EEXIST) {
 				page_cache_release(netpage);
@@ -631,8 +626,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 		fscache_mark_page_cached(op, netpage);
 
-		lru_cache_add_file(netpage);
-
 		/* the netpage is unlocked and marked up to date here */
 		fscache_end_io(op, netpage, 0);
 		page_cache_release(netpage);
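Every hunk in this file makes the same substitution: an add_to_page_cache() call followed later by lru_cache_add_file() collapses into one add_to_page_cache_lru(), which inserts the page into the page cache and onto the LRU together and so removes the window where a cached page sat on no LRU list. Schematically (a sketch only; the variable names stand in for the locals above):

	/* before: two steps, page briefly cached but on no LRU */
	ret = add_to_page_cache(page, mapping, index, gfp);
	if (ret == 0)
		lru_cache_add_file(page);

	/* after: one step performs both insertions */
	ret = add_to_page_cache_lru(page, mapping, index, gfp);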
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 8c44fdd4e1c3..834f9f3723fb 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
 	ci->fscache = fscache_acquire_cookie(fsc->fscache,
 					     &ceph_fscache_inode_object_def,
 					     ci, true);
+	fscache_check_consistency(ci->fscache);
 done:
 	mutex_unlock(&inode->i_mutex);
 
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index da95f61b7a09..5ac591bd012b 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
 void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
 void ceph_queue_revalidate(struct inode *inode);
 
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	fscache_attr_changed(ci->fscache);
+}
+
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
 	fscache_invalidate(ceph_inode(inode)->fscache);
@@ -135,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
 {
 }
 
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
 }
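The header keeps its existing convention: one real implementation when fscache support is compiled in, and an empty inline stub otherwise, so call sites never need #ifdef guards. The generic shape of that pattern, with illustrative names:

#ifdef CONFIG_EXAMPLE_CACHE		/* illustrative config symbol */
void example_update_objectsize(struct inode *inode);
#else
static inline void example_update_objectsize(struct inode *inode)
{
	/* feature compiled out: calls vanish at compile time */
}
#endif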
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 17543383545c..2e5e648eb5c3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -622,8 +622,10 @@ retry:
 
 	if (flags & CEPH_CAP_FLAG_AUTH) {
 		if (ci->i_auth_cap == NULL ||
-		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
 			ci->i_auth_cap = cap;
+			cap->mds_wanted = wanted;
+		}
 		ci->i_cap_exporting_issued = 0;
 	} else {
 		WARN_ON(ci->i_auth_cap == cap);
@@ -885,7 +887,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
 		cap = rb_entry(p, struct ceph_cap, ci_node);
 		if (!__cap_is_valid(cap))
 			continue;
-		mds_wanted |= cap->mds_wanted;
+		if (cap == ci->i_auth_cap)
+			mds_wanted |= cap->mds_wanted;
+		else
+			mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
 	}
 	return mds_wanted;
 }
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6d59006bfa27..16b54aa31f08 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)
 	} else if (req->r_path1) {
 		seq_printf(s, " #%llx/%s", req->r_ino1.ino,
 			   req->r_path1);
+	} else {
+		seq_printf(s, " #%llx", req->r_ino1.ino);
 	}
 
 	if (req->r_old_dentry) {
@@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)
 		path = NULL;
 		spin_lock(&req->r_old_dentry->d_lock);
 		seq_printf(s, " #%llx/%.*s (%s)",
-			   ceph_ino(req->r_old_dentry_dir),
+			   req->r_old_dentry_dir ?
+			   ceph_ino(req->r_old_dentry_dir) : 0,
 			   req->r_old_dentry->d_name.len,
 			   req->r_old_dentry->d_name.name,
 			   path ? path : "");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 45eda6d7a40c..766410a12c2c 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -119,7 +119,8 @@ static int fpos_cmp(loff_t l, loff_t r)
119 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 119 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
120 * the MDS if/when the directory is modified). 120 * the MDS if/when the directory is modified).
121 */ 121 */
122static int __dcache_readdir(struct file *file, struct dir_context *ctx) 122static int __dcache_readdir(struct file *file, struct dir_context *ctx,
123 u32 shared_gen)
123{ 124{
124 struct ceph_file_info *fi = file->private_data; 125 struct ceph_file_info *fi = file->private_data;
125 struct dentry *parent = file->f_dentry; 126 struct dentry *parent = file->f_dentry;
@@ -133,8 +134,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx)
133 last = fi->dentry; 134 last = fi->dentry;
134 fi->dentry = NULL; 135 fi->dentry = NULL;
135 136
136 dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos, 137 dout("__dcache_readdir %p v%u at %llu (last %p)\n",
137 last); 138 dir, shared_gen, ctx->pos, last);
138 139
139 spin_lock(&parent->d_lock); 140 spin_lock(&parent->d_lock);
140 141
@@ -161,7 +162,8 @@ more:
161 goto out_unlock; 162 goto out_unlock;
162 } 163 }
163 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 164 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
164 if (!d_unhashed(dentry) && dentry->d_inode && 165 if (di->lease_shared_gen == shared_gen &&
166 !d_unhashed(dentry) && dentry->d_inode &&
165 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 167 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
166 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 168 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
167 fpos_cmp(ctx->pos, di->offset) <= 0) 169 fpos_cmp(ctx->pos, di->offset) <= 0)
@@ -190,7 +192,7 @@ more:
190 if (last) { 192 if (last) {
191 /* remember our position */ 193 /* remember our position */
192 fi->dentry = last; 194 fi->dentry = last;
193 fi->next_offset = di->offset; 195 fi->next_offset = fpos_off(di->offset);
194 } 196 }
195 dput(dentry); 197 dput(dentry);
196 return 0; 198 return 0;
@@ -252,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
252 int err; 254 int err;
253 u32 ftype; 255 u32 ftype;
254 struct ceph_mds_reply_info_parsed *rinfo; 256 struct ceph_mds_reply_info_parsed *rinfo;
255 const int max_entries = fsc->mount_options->max_readdir;
256 const int max_bytes = fsc->mount_options->max_readdir_bytes;
257 257
258 dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); 258 dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
259 if (fi->flags & CEPH_F_ATEND) 259 if (fi->flags & CEPH_F_ATEND)
@@ -291,8 +291,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
291 ceph_snap(inode) != CEPH_SNAPDIR && 291 ceph_snap(inode) != CEPH_SNAPDIR &&
292 __ceph_dir_is_complete(ci) && 292 __ceph_dir_is_complete(ci) &&
293 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 293 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
294 u32 shared_gen = ci->i_shared_gen;
294 spin_unlock(&ci->i_ceph_lock); 295 spin_unlock(&ci->i_ceph_lock);
295 err = __dcache_readdir(file, ctx); 296 err = __dcache_readdir(file, ctx, shared_gen);
296 if (err != -EAGAIN) 297 if (err != -EAGAIN)
297 return err; 298 return err;
298 } else { 299 } else {
@@ -322,14 +323,16 @@ more:
322 fi->last_readdir = NULL; 323 fi->last_readdir = NULL;
323 } 324 }
324 325
325 /* requery frag tree, as the frag topology may have changed */
326 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
327
328 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 326 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
329 ceph_vinop(inode), frag, fi->last_name); 327 ceph_vinop(inode), frag, fi->last_name);
330 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 328 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
331 if (IS_ERR(req)) 329 if (IS_ERR(req))
332 return PTR_ERR(req); 330 return PTR_ERR(req);
331 err = ceph_alloc_readdir_reply_buffer(req, inode);
332 if (err) {
333 ceph_mdsc_put_request(req);
334 return err;
335 }
333 req->r_inode = inode; 336 req->r_inode = inode;
334 ihold(inode); 337 ihold(inode);
335 req->r_dentry = dget(file->f_dentry); 338 req->r_dentry = dget(file->f_dentry);
@@ -340,9 +343,6 @@ more:
340 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); 343 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
341 req->r_readdir_offset = fi->next_offset; 344 req->r_readdir_offset = fi->next_offset;
342 req->r_args.readdir.frag = cpu_to_le32(frag); 345 req->r_args.readdir.frag = cpu_to_le32(frag);
343 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
344 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
345 req->r_num_caps = max_entries + 1;
346 err = ceph_mdsc_do_request(mdsc, NULL, req); 346 err = ceph_mdsc_do_request(mdsc, NULL, req);
347 if (err < 0) { 347 if (err < 0) {
348 ceph_mdsc_put_request(req); 348 ceph_mdsc_put_request(req);
@@ -369,9 +369,9 @@ more:
369 fi->next_offset = 0; 369 fi->next_offset = 0;
370 off = fi->next_offset; 370 off = fi->next_offset;
371 } 371 }
372 fi->frag = frag;
372 fi->offset = fi->next_offset; 373 fi->offset = fi->next_offset;
373 fi->last_readdir = req; 374 fi->last_readdir = req;
374 fi->frag = frag;
375 375
376 if (req->r_reply_info.dir_end) { 376 if (req->r_reply_info.dir_end) {
377 kfree(fi->last_name); 377 kfree(fi->last_name);
@@ -454,7 +454,7 @@ more:
454 return 0; 454 return 0;
455} 455}
456 456
457static void reset_readdir(struct ceph_file_info *fi) 457static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
458{ 458{
459 if (fi->last_readdir) { 459 if (fi->last_readdir) {
460 ceph_mdsc_put_request(fi->last_readdir); 460 ceph_mdsc_put_request(fi->last_readdir);
@@ -462,7 +462,10 @@ static void reset_readdir(struct ceph_file_info *fi)
462 } 462 }
463 kfree(fi->last_name); 463 kfree(fi->last_name);
464 fi->last_name = NULL; 464 fi->last_name = NULL;
465 fi->next_offset = 2; /* compensate for . and .. */ 465 if (ceph_frag_is_leftmost(frag))
466 fi->next_offset = 2; /* compensate for . and .. */
467 else
468 fi->next_offset = 0;
466 if (fi->dentry) { 469 if (fi->dentry) {
467 dput(fi->dentry); 470 dput(fi->dentry);
468 fi->dentry = NULL; 471 fi->dentry = NULL;
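Only the leftmost fragment of a directory carries the synthetic "." and ".." entries, which is why next_offset starts at 2 there and at 0 in every other fragment. Paraphrasing the helper from include/linux/ceph/ceph_frag.h (a frag keeps its split depth in the top byte and its value bits below it):

static inline int ceph_frag_is_leftmost(__u32 f)
{
	return (f & 0xffffffu) == 0;	/* value bits all zero => leftmost frag */
}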
@@ -474,7 +477,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
474{ 477{
475 struct ceph_file_info *fi = file->private_data; 478 struct ceph_file_info *fi = file->private_data;
476 struct inode *inode = file->f_mapping->host; 479 struct inode *inode = file->f_mapping->host;
477 loff_t old_offset = offset; 480 loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
478 loff_t retval; 481 loff_t retval;
479 482
480 mutex_lock(&inode->i_mutex); 483 mutex_lock(&inode->i_mutex);
@@ -491,7 +494,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
491 goto out; 494 goto out;
492 } 495 }
493 496
494 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { 497 if (offset >= 0) {
495 if (offset != file->f_pos) { 498 if (offset != file->f_pos) {
496 file->f_pos = offset; 499 file->f_pos = offset;
497 file->f_version = 0; 500 file->f_version = 0;
@@ -504,14 +507,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
504 * seek to new frag, or seek prior to current chunk. 507 * seek to new frag, or seek prior to current chunk.
505 */ 508 */
506 if (offset == 0 || 509 if (offset == 0 ||
507 fpos_frag(offset) != fpos_frag(old_offset) || 510 fpos_frag(offset) != fi->frag ||
508 fpos_off(offset) < fi->offset) { 511 fpos_off(offset) < fi->offset) {
509 dout("dir_llseek dropping %p content\n", file); 512 dout("dir_llseek dropping %p content\n", file);
510 reset_readdir(fi); 513 reset_readdir(fi, fpos_frag(offset));
511 } 514 }
512 515
513 /* bump dir_release_count if we did a forward seek */ 516 /* bump dir_release_count if we did a forward seek */
514 if (offset > old_offset) 517 if (fpos_cmp(offset, old_offset) > 0)
515 fi->dir_release_count--; 518 fi->dir_release_count--;
516 } 519 }
517out: 520out:
@@ -812,8 +815,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
812 } 815 }
813 req->r_dentry = dget(dentry); 816 req->r_dentry = dget(dentry);
814 req->r_num_caps = 2; 817 req->r_num_caps = 2;
815 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ 818 req->r_old_dentry = dget(old_dentry);
816 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
817 req->r_locked_dir = dir; 819 req->r_locked_dir = dir;
818 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 820 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
819 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 821 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -911,10 +913,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
911 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); 913 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
912 if (IS_ERR(req)) 914 if (IS_ERR(req))
913 return PTR_ERR(req); 915 return PTR_ERR(req);
916 ihold(old_dir);
914 req->r_dentry = dget(new_dentry); 917 req->r_dentry = dget(new_dentry);
915 req->r_num_caps = 2; 918 req->r_num_caps = 2;
916 req->r_old_dentry = dget(old_dentry); 919 req->r_old_dentry = dget(old_dentry);
917 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 920 req->r_old_dentry_dir = old_dir;
918 req->r_locked_dir = new_dir; 921 req->r_locked_dir = new_dir;
919 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; 922 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
920 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; 923 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
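The ihold() here pairs with the release-path change to ceph_mdsc_release_request() later in this diff: once req->r_old_dentry_dir is set, the request owns a reference on old_dir and drops it with iput(). Sketched:

/*
 * ceph_rename():                          ceph_mdsc_release_request():
 *	ihold(old_dir);                         ...
 *	req->r_old_dentry_dir = old_dir;        iput(req->r_old_dentry_dir);
 *
 * Using the old_dir the caller already holds is simpler than re-deriving
 * it from old_dentry->d_parent, which is what the removed
 * ceph_get_dentry_parent_inode() call did.
 */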
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 16796be53ca5..00d6af6a32ec 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -8,23 +8,6 @@
8#include "mds_client.h" 8#include "mds_client.h"
9 9
10/* 10/*
11 * NFS export support
12 *
13 * NFS re-export of a ceph mount is, at present, only semi-reliable.
14 * The basic issue is that the Ceph architecture doesn't lend itself
15 * well to generating filehandles that will remain valid forever.
16 *
17 * So, we do our best. If you're lucky, your inode will be in the
18 * client's cache. If it's not, and you have a connectable fh, then
19 * the MDS server may be able to find it for you. Otherwise, you get
20 * ESTALE.
21 *
22 * There are ways to make this more reliable, but in the non-connectable fh
23 * case, we won't ever work perfectly, and in the connectable case,
24 * some changes are needed on the MDS side to work better.
25 */
26
27/*
28 * Basic fh 11 * Basic fh
29 */ 12 */
30struct ceph_nfs_fh { 13struct ceph_nfs_fh {
@@ -32,22 +15,12 @@ struct ceph_nfs_fh {
32} __attribute__ ((packed)); 15} __attribute__ ((packed));
33 16
34/* 17/*
35 * Larger 'connectable' fh that includes parent ino and name hash. 18 * Larger fh that includes parent ino.
36 * Use this whenever possible, as it works more reliably.
37 */ 19 */
38struct ceph_nfs_confh { 20struct ceph_nfs_confh {
39 u64 ino, parent_ino; 21 u64 ino, parent_ino;
40 u32 parent_name_hash;
41} __attribute__ ((packed)); 22} __attribute__ ((packed));
42 23
43/*
44 * The presence of @parent_inode here tells us whether NFS wants a
45 * connectable file handle. However, we want to make a connectable
46 * file handle unconditionally so that the MDS gets as much of a hint
47 * as possible. That means we only use @parent_inode to indicate
48 * whether nfsd wants a connectable fh, and whether we should indicate
49 * failure from a too-small @max_len.
50 */
51static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, 24static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
52 struct inode *parent_inode) 25 struct inode *parent_inode)
53{ 26{
@@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
56 struct ceph_nfs_confh *cfh = (void *)rawfh; 29 struct ceph_nfs_confh *cfh = (void *)rawfh;
57 int connected_handle_length = sizeof(*cfh)/4; 30 int connected_handle_length = sizeof(*cfh)/4;
58 int handle_length = sizeof(*fh)/4; 31 int handle_length = sizeof(*fh)/4;
59 struct dentry *dentry;
60 struct dentry *parent;
61 32
62 /* don't re-export snaps */ 33 /* don't re-export snaps */
63 if (ceph_snap(inode) != CEPH_NOSNAP) 34 if (ceph_snap(inode) != CEPH_NOSNAP)
64 return -EINVAL; 35 return -EINVAL;
65 36
66 dentry = d_find_alias(inode); 37 if (parent_inode && (*max_len < connected_handle_length)) {
38 *max_len = connected_handle_length;
39 return FILEID_INVALID;
40 } else if (*max_len < handle_length) {
41 *max_len = handle_length;
42 return FILEID_INVALID;
43 }
67 44
68 /* if we found an alias, generate a connectable fh */ 45 if (parent_inode) {
69 if (*max_len >= connected_handle_length && dentry) { 46 dout("encode_fh %llx with parent %llx\n",
70 dout("encode_fh %p connectable\n", dentry); 47 ceph_ino(inode), ceph_ino(parent_inode));
71 spin_lock(&dentry->d_lock);
72 parent = dentry->d_parent;
73 cfh->ino = ceph_ino(inode); 48 cfh->ino = ceph_ino(inode);
74 cfh->parent_ino = ceph_ino(parent->d_inode); 49 cfh->parent_ino = ceph_ino(parent_inode);
75 cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
76 dentry);
77 *max_len = connected_handle_length; 50 *max_len = connected_handle_length;
78 type = 2; 51 type = FILEID_INO32_GEN_PARENT;
79 spin_unlock(&dentry->d_lock);
80 } else if (*max_len >= handle_length) {
81 if (parent_inode) {
82 /* nfsd wants connectable */
83 *max_len = connected_handle_length;
84 type = FILEID_INVALID;
85 } else {
86 dout("encode_fh %p\n", dentry);
87 fh->ino = ceph_ino(inode);
88 *max_len = handle_length;
89 type = 1;
90 }
91 } else { 52 } else {
53 dout("encode_fh %llx\n", ceph_ino(inode));
54 fh->ino = ceph_ino(inode);
92 *max_len = handle_length; 55 *max_len = handle_length;
93 type = FILEID_INVALID; 56 type = FILEID_INO32_GEN;
94 } 57 }
95 if (dentry)
96 dput(dentry);
97 return type; 58 return type;
98} 59}
99 60
100/* 61static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
101 * convert regular fh to dentry
102 *
103 * FIXME: we should try harder by querying the mds for the ino.
104 */
105static struct dentry *__fh_to_dentry(struct super_block *sb,
106 struct ceph_nfs_fh *fh, int fh_len)
107{ 62{
108 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 63 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
109 struct inode *inode; 64 struct inode *inode;
@@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
111 struct ceph_vino vino; 66 struct ceph_vino vino;
112 int err; 67 int err;
113 68
114 if (fh_len < sizeof(*fh) / 4) 69 vino.ino = ino;
115 return ERR_PTR(-ESTALE);
116
117 dout("__fh_to_dentry %llx\n", fh->ino);
118 vino.ino = fh->ino;
119 vino.snap = CEPH_NOSNAP; 70 vino.snap = CEPH_NOSNAP;
120 inode = ceph_find_inode(sb, vino); 71 inode = ceph_find_inode(sb, vino);
121 if (!inode) { 72 if (!inode) {
@@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
139 90
140 dentry = d_obtain_alias(inode); 91 dentry = d_obtain_alias(inode);
141 if (IS_ERR(dentry)) { 92 if (IS_ERR(dentry)) {
142 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
143 fh->ino, inode);
144 iput(inode); 93 iput(inode);
145 return dentry; 94 return dentry;
146 } 95 }
147 err = ceph_init_dentry(dentry); 96 err = ceph_init_dentry(dentry);
148 if (err < 0) { 97 if (err < 0) {
149 iput(inode); 98 dput(dentry);
150 return ERR_PTR(err); 99 return ERR_PTR(err);
151 } 100 }
152 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); 101 dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
153 return dentry; 102 return dentry;
154} 103}
155 104
156/* 105/*
157 * convert connectable fh to dentry 106 * convert regular fh to dentry
158 */ 107 */
159static struct dentry *__cfh_to_dentry(struct super_block *sb, 108static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
160 struct ceph_nfs_confh *cfh, int fh_len) 109 struct fid *fid,
110 int fh_len, int fh_type)
111{
112 struct ceph_nfs_fh *fh = (void *)fid->raw;
113
114 if (fh_type != FILEID_INO32_GEN &&
115 fh_type != FILEID_INO32_GEN_PARENT)
116 return NULL;
117 if (fh_len < sizeof(*fh) / 4)
118 return NULL;
119
120 dout("fh_to_dentry %llx\n", fh->ino);
121 return __fh_to_dentry(sb, fh->ino);
122}
123
124static struct dentry *__get_parent(struct super_block *sb,
125 struct dentry *child, u64 ino)
161{ 126{
162 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 127 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
128 struct ceph_mds_request *req;
163 struct inode *inode; 129 struct inode *inode;
164 struct dentry *dentry; 130 struct dentry *dentry;
165 struct ceph_vino vino;
166 int err; 131 int err;
167 132
168 if (fh_len < sizeof(*cfh) / 4) 133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
169 return ERR_PTR(-ESTALE); 134 USE_ANY_MDS);
170 135 if (IS_ERR(req))
171 dout("__cfh_to_dentry %llx (%llx/%x)\n", 136 return ERR_CAST(req);
172 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
173
174 vino.ino = cfh->ino;
175 vino.snap = CEPH_NOSNAP;
176 inode = ceph_find_inode(sb, vino);
177 if (!inode) {
178 struct ceph_mds_request *req;
179
180 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
181 USE_ANY_MDS);
182 if (IS_ERR(req))
183 return ERR_CAST(req);
184 137
185 req->r_ino1 = vino; 138 if (child) {
186 req->r_ino2.ino = cfh->parent_ino; 139 req->r_inode = child->d_inode;
187 req->r_ino2.snap = CEPH_NOSNAP; 140 ihold(child->d_inode);
188 req->r_path2 = kmalloc(16, GFP_NOFS); 141 } else {
189 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); 142 req->r_ino1 = (struct ceph_vino) {
190 req->r_num_caps = 1; 143 .ino = ino,
191 err = ceph_mdsc_do_request(mdsc, NULL, req); 144 .snap = CEPH_NOSNAP,
192 inode = req->r_target_inode; 145 };
193 if (inode)
194 ihold(inode);
195 ceph_mdsc_put_request(req);
196 if (!inode)
197 return ERR_PTR(err ? err : -ESTALE);
198 } 146 }
147 req->r_num_caps = 1;
148 err = ceph_mdsc_do_request(mdsc, NULL, req);
149 inode = req->r_target_inode;
150 if (inode)
151 ihold(inode);
152 ceph_mdsc_put_request(req);
153 if (!inode)
154 return ERR_PTR(-ENOENT);
199 155
200 dentry = d_obtain_alias(inode); 156 dentry = d_obtain_alias(inode);
201 if (IS_ERR(dentry)) { 157 if (IS_ERR(dentry)) {
202 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
203 cfh->ino, inode);
204 iput(inode); 158 iput(inode);
205 return dentry; 159 return dentry;
206 } 160 }
207 err = ceph_init_dentry(dentry); 161 err = ceph_init_dentry(dentry);
208 if (err < 0) { 162 if (err < 0) {
209 iput(inode); 163 dput(dentry);
210 return ERR_PTR(err); 164 return ERR_PTR(err);
211 } 165 }
212 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); 166 dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
167 child ? ceph_ino(child->d_inode) : ino,
168 dentry, ceph_vinop(inode));
213 return dentry; 169 return dentry;
214} 170}
215 171
216static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, 172struct dentry *ceph_get_parent(struct dentry *child)
217 int fh_len, int fh_type)
218{ 173{
219 if (fh_type == 1) 174 /* don't re-export snaps */
220 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw, 175 if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
221 fh_len); 176 return ERR_PTR(-EINVAL);
222 else 177
223 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw, 178 dout("get_parent %p ino %llx.%llx\n",
224 fh_len); 179 child, ceph_vinop(child->d_inode));
180 return __get_parent(child->d_sb, child, 0);
225} 181}
226 182
227/* 183/*
228 * get parent, if possible. 184 * convert regular fh to parent
229 *
230 * FIXME: we could do better by querying the mds to discover the
231 * parent.
232 */ 185 */
233static struct dentry *ceph_fh_to_parent(struct super_block *sb, 186static struct dentry *ceph_fh_to_parent(struct super_block *sb,
234 struct fid *fid, 187 struct fid *fid,
235 int fh_len, int fh_type) 188 int fh_len, int fh_type)
236{ 189{
237 struct ceph_nfs_confh *cfh = (void *)fid->raw; 190 struct ceph_nfs_confh *cfh = (void *)fid->raw;
238 struct ceph_vino vino;
239 struct inode *inode;
240 struct dentry *dentry; 191 struct dentry *dentry;
241 int err;
242 192
243 if (fh_type == 1) 193 if (fh_type != FILEID_INO32_GEN_PARENT)
244 return ERR_PTR(-ESTALE); 194 return NULL;
245 if (fh_len < sizeof(*cfh) / 4) 195 if (fh_len < sizeof(*cfh) / 4)
246 return ERR_PTR(-ESTALE); 196 return NULL;
247 197
248 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, 198 dout("fh_to_parent %llx\n", cfh->parent_ino);
249 cfh->parent_name_hash); 199 dentry = __get_parent(sb, NULL, cfh->ino);
200 if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
201 dentry = __fh_to_dentry(sb, cfh->parent_ino);
202 return dentry;
203}
250 204
251 vino.ino = cfh->ino; 205static int ceph_get_name(struct dentry *parent, char *name,
252 vino.snap = CEPH_NOSNAP; 206 struct dentry *child)
253 inode = ceph_find_inode(sb, vino); 207{
254 if (!inode) 208 struct ceph_mds_client *mdsc;
255 return ERR_PTR(-ESTALE); 209 struct ceph_mds_request *req;
210 int err;
256 211
257 dentry = d_obtain_alias(inode); 212 mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
258 if (IS_ERR(dentry)) { 213 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
259 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 214 USE_ANY_MDS);
260 cfh->ino, inode); 215 if (IS_ERR(req))
261 iput(inode); 216 return PTR_ERR(req);
262 return dentry; 217
263 } 218 mutex_lock(&parent->d_inode->i_mutex);
264 err = ceph_init_dentry(dentry); 219
265 if (err < 0) { 220 req->r_inode = child->d_inode;
266 iput(inode); 221 ihold(child->d_inode);
267 return ERR_PTR(err); 222 req->r_ino2 = ceph_vino(parent->d_inode);
223 req->r_locked_dir = parent->d_inode;
224 req->r_num_caps = 2;
225 err = ceph_mdsc_do_request(mdsc, NULL, req);
226
227 mutex_unlock(&parent->d_inode->i_mutex);
228
229 if (!err) {
230 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
231 memcpy(name, rinfo->dname, rinfo->dname_len);
232 name[rinfo->dname_len] = 0;
233 dout("get_name %p ino %llx.%llx name %s\n",
234 child, ceph_vinop(child->d_inode), name);
235 } else {
236 dout("get_name %p ino %llx.%llx err %d\n",
237 child, ceph_vinop(child->d_inode), err);
268 } 238 }
269 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); 239
270 return dentry; 240 ceph_mdsc_put_request(req);
241 return err;
271} 242}
272 243
273const struct export_operations ceph_export_ops = { 244const struct export_operations ceph_export_ops = {
274 .encode_fh = ceph_encode_fh, 245 .encode_fh = ceph_encode_fh,
275 .fh_to_dentry = ceph_fh_to_dentry, 246 .fh_to_dentry = ceph_fh_to_dentry,
276 .fh_to_parent = ceph_fh_to_parent, 247 .fh_to_parent = ceph_fh_to_parent,
248 .get_parent = ceph_get_parent,
249 .get_name = ceph_get_name,
277}; 250};
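With the handle types switched to the generic FILEID_INO32_GEN (1) and FILEID_INO32_GEN_PARENT (2) codes, the encoded handles can be sanity-checked from userspace. A minimal, illustrative program (assumes glibc and a path on a ceph mount; not part of this series):

#define _GNU_SOURCE
#include <fcntl.h>	/* name_to_handle_at(), MAX_HANDLE_SZ */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	struct file_handle *fh;
	int mount_id;

	if (argc != 2)
		return 1;
	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh)
		return 1;
	fh->handle_bytes = MAX_HANDLE_SZ;
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) == -1) {
		perror("name_to_handle_at");
		return 1;
	}
	/* expect 1 (FILEID_INO32_GEN) or 2 (FILEID_INO32_GEN_PARENT) */
	printf("handle_type=%d handle_bytes=%u\n",
	       fh->handle_type, fh->handle_bytes);
	free(fh);
	return 0;
}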
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 09c7afe32e49..66075a4ad979 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -210,7 +210,7 @@ int ceph_open(struct inode *inode, struct file *file)
210 ihold(inode); 210 ihold(inode);
211 211
212 req->r_num_caps = 1; 212 req->r_num_caps = 1;
213 if (flags & (O_CREAT|O_TRUNC)) 213 if (flags & O_CREAT)
214 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 214 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
215 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 215 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
216 iput(parent_inode); 216 iput(parent_inode);
@@ -291,8 +291,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
291 } 291 }
292 err = finish_open(file, dentry, ceph_open, opened); 292 err = finish_open(file, dentry, ceph_open, opened);
293 } 293 }
294
295out_err: 294out_err:
295 if (!req->r_err && req->r_target_inode)
296 ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
296 ceph_mdsc_put_request(req); 297 ceph_mdsc_put_request(req);
297 dout("atomic_open result=%d\n", err); 298 dout("atomic_open result=%d\n", err);
298 return err; 299 return err;
@@ -970,6 +971,7 @@ retry_snap:
970 goto retry_snap; 971 goto retry_snap;
971 } 972 }
972 } else { 973 } else {
974 loff_t old_size = inode->i_size;
973 /* 975 /*
974 * No need to acquire the i_truncate_mutex. Because 976 * No need to acquire the i_truncate_mutex. Because
975 * the MDS revokes Fwb caps before sending truncate 977 * the MDS revokes Fwb caps before sending truncate
@@ -980,6 +982,8 @@ retry_snap:
980 written = generic_file_buffered_write(iocb, iov, nr_segs, 982 written = generic_file_buffered_write(iocb, iov, nr_segs,
981 pos, &iocb->ki_pos, 983 pos, &iocb->ki_pos,
982 count, 0); 984 count, 0);
985 if (inode->i_size > old_size)
986 ceph_fscache_update_objectsize(inode);
983 mutex_unlock(&inode->i_mutex); 987 mutex_unlock(&inode->i_mutex);
984 } 988 }
985 989
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 32d519d8a2e2..0b0728e5be2d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -659,14 +659,6 @@ static int fill_inode(struct inode *inode,
659 le32_to_cpu(info->time_warp_seq), 659 le32_to_cpu(info->time_warp_seq),
660 &ctime, &mtime, &atime); 660 &ctime, &mtime, &atime);
661 661
662 /* only update max_size on auth cap */
663 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
664 ci->i_max_size != le64_to_cpu(info->max_size)) {
665 dout("max_size %lld -> %llu\n", ci->i_max_size,
666 le64_to_cpu(info->max_size));
667 ci->i_max_size = le64_to_cpu(info->max_size);
668 }
669
670 ci->i_layout = info->layout; 662 ci->i_layout = info->layout;
671 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 663 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
672 664
@@ -755,6 +747,14 @@ static int fill_inode(struct inode *inode,
755 ci->i_max_offset = 2; 747 ci->i_max_offset = 2;
756 } 748 }
757no_change: 749no_change:
750 /* only update max_size on auth cap */
751 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
752 ci->i_max_size != le64_to_cpu(info->max_size)) {
753 dout("max_size %lld -> %llu\n", ci->i_max_size,
754 le64_to_cpu(info->max_size));
755 ci->i_max_size = le64_to_cpu(info->max_size);
756 }
757
758 spin_unlock(&ci->i_ceph_lock); 758 spin_unlock(&ci->i_ceph_lock);
759 759
760 /* queue truncate if we saw i_size decrease */ 760 /* queue truncate if we saw i_size decrease */
@@ -1044,10 +1044,59 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1044 session, req->r_request_started, -1, 1044 session, req->r_request_started, -1,
1045 &req->r_caps_reservation); 1045 &req->r_caps_reservation);
1046 if (err < 0) 1046 if (err < 0)
1047 return err; 1047 goto done;
1048 } else { 1048 } else {
1049 WARN_ON_ONCE(1); 1049 WARN_ON_ONCE(1);
1050 } 1050 }
1051
1052 if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
1053 struct qstr dname;
1054 struct dentry *dn, *parent;
1055
1056 BUG_ON(!rinfo->head->is_target);
1057 BUG_ON(req->r_dentry);
1058
1059 parent = d_find_any_alias(dir);
1060 BUG_ON(!parent);
1061
1062 dname.name = rinfo->dname;
1063 dname.len = rinfo->dname_len;
1064 dname.hash = full_name_hash(dname.name, dname.len);
1065 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1066 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1067retry_lookup:
1068 dn = d_lookup(parent, &dname);
1069 dout("d_lookup on parent=%p name=%.*s got %p\n",
1070 parent, dname.len, dname.name, dn);
1071
1072 if (!dn) {
1073 dn = d_alloc(parent, &dname);
1074 dout("d_alloc %p '%.*s' = %p\n", parent,
1075 dname.len, dname.name, dn);
1076 if (dn == NULL) {
1077 dput(parent);
1078 err = -ENOMEM;
1079 goto done;
1080 }
1081 err = ceph_init_dentry(dn);
1082 if (err < 0) {
1083 dput(dn);
1084 dput(parent);
1085 goto done;
1086 }
1087 } else if (dn->d_inode &&
1088 (ceph_ino(dn->d_inode) != vino.ino ||
1089 ceph_snap(dn->d_inode) != vino.snap)) {
1090 dout(" dn %p points to wrong inode %p\n",
1091 dn, dn->d_inode);
1092 d_delete(dn);
1093 dput(dn);
1094 goto retry_lookup;
1095 }
1096
1097 req->r_dentry = dn;
1098 dput(parent);
1099 }
1051 } 1100 }
1052 1101
1053 if (rinfo->head->is_target) { 1102 if (rinfo->head->is_target) {
@@ -1063,7 +1112,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1063 1112
1064 err = fill_inode(in, &rinfo->targeti, NULL, 1113 err = fill_inode(in, &rinfo->targeti, NULL,
1065 session, req->r_request_started, 1114 session, req->r_request_started,
1066 (le32_to_cpu(rinfo->head->result) == 0) ? 1115 (!req->r_aborted && rinfo->head->result == 0) ?
1067 req->r_fmode : -1, 1116 req->r_fmode : -1,
1068 &req->r_caps_reservation); 1117 &req->r_caps_reservation);
1069 if (err < 0) { 1118 if (err < 0) {
@@ -1616,8 +1665,6 @@ static const struct inode_operations ceph_symlink_iops = {
1616 .getxattr = ceph_getxattr, 1665 .getxattr = ceph_getxattr,
1617 .listxattr = ceph_listxattr, 1666 .listxattr = ceph_listxattr,
1618 .removexattr = ceph_removexattr, 1667 .removexattr = ceph_removexattr,
1619 .get_acl = ceph_get_acl,
1620 .set_acl = ceph_set_acl,
1621}; 1668};
1622 1669
1623/* 1670/*
@@ -1627,7 +1674,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1627{ 1674{
1628 struct inode *inode = dentry->d_inode; 1675 struct inode *inode = dentry->d_inode;
1629 struct ceph_inode_info *ci = ceph_inode(inode); 1676 struct ceph_inode_info *ci = ceph_inode(inode);
1630 struct inode *parent_inode;
1631 const unsigned int ia_valid = attr->ia_valid; 1677 const unsigned int ia_valid = attr->ia_valid;
1632 struct ceph_mds_request *req; 1678 struct ceph_mds_request *req;
1633 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; 1679 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1819,9 +1865,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1819 req->r_inode_drop = release; 1865 req->r_inode_drop = release;
1820 req->r_args.setattr.mask = cpu_to_le32(mask); 1866 req->r_args.setattr.mask = cpu_to_le32(mask);
1821 req->r_num_caps = 1; 1867 req->r_num_caps = 1;
1822 parent_inode = ceph_get_dentry_parent_inode(dentry); 1868 err = ceph_mdsc_do_request(mdsc, NULL, req);
1823 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1824 iput(parent_inode);
1825 } 1869 }
1826 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, 1870 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1827 ceph_cap_string(dirtied), mask); 1871 ceph_cap_string(dirtied), mask);
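The same simplification repeats in the ioctl.c and xattr.c hunks below. In each case the removed boilerplate pinned the parent directory across the MDS round trip even though the operation never touches the parent; the before/after shape is:

/* Before:
 *	parent_inode = ceph_get_dentry_parent_inode(dentry);
 *	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
 *	iput(parent_inode);
 *
 * After:
 *	err = ceph_mdsc_do_request(mdsc, NULL, req);
 */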
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index dc66c9e023e4..efbe08289292 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -64,7 +64,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
64static long ceph_ioctl_set_layout(struct file *file, void __user *arg) 64static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
65{ 65{
66 struct inode *inode = file_inode(file); 66 struct inode *inode = file_inode(file);
67 struct inode *parent_inode;
68 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 67 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
69 struct ceph_mds_request *req; 68 struct ceph_mds_request *req;
70 struct ceph_ioctl_layout l; 69 struct ceph_ioctl_layout l;
@@ -121,9 +120,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
121 cpu_to_le32(l.object_size); 120 cpu_to_le32(l.object_size);
122 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); 121 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
123 122
124 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 123 err = ceph_mdsc_do_request(mdsc, NULL, req);
125 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
126 iput(parent_inode);
127 ceph_mdsc_put_request(req); 124 ceph_mdsc_put_request(req);
128 return err; 125 return err;
129} 126}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae6d14e82b0f..d94ba0df9f4d 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -2,11 +2,31 @@
2 2
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/namei.h> 4#include <linux/namei.h>
5#include <linux/random.h>
5 6
6#include "super.h" 7#include "super.h"
7#include "mds_client.h" 8#include "mds_client.h"
8#include <linux/ceph/pagelist.h> 9#include <linux/ceph/pagelist.h>
9 10
11static u64 lock_secret;
12
13static inline u64 secure_addr(void *addr)
14{
15 u64 v = lock_secret ^ (u64)(unsigned long)addr;
16 /*
16 * Set the most significant bit, so that the MDS knows the 'owner'
17 * field alone is sufficient to identify the owner of a lock. (The
18 * old code used both 'owner' and 'pid'.)
20 */
21 v |= (1ULL << 63);
22 return v;
23}
24
25void __init ceph_flock_init(void)
26{
27 get_random_bytes(&lock_secret, sizeof(lock_secret));
28}
29
10/** 30/**
11 * Implement fcntl and flock locking functions. 31 * Implement fcntl and flock locking functions.
12 */ 32 */
@@ -14,11 +34,11 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 int cmd, u8 wait, struct file_lock *fl) 34 int cmd, u8 wait, struct file_lock *fl)
15{ 35{
16 struct inode *inode = file_inode(file); 36 struct inode *inode = file_inode(file);
17 struct ceph_mds_client *mdsc = 37 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
18 ceph_sb_to_client(inode->i_sb)->mdsc;
19 struct ceph_mds_request *req; 38 struct ceph_mds_request *req;
20 int err; 39 int err;
21 u64 length = 0; 40 u64 length = 0;
41 u64 owner;
22 42
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 43 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req)) 44 if (IS_ERR(req))
@@ -32,25 +52,27 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
32 else 52 else
33 length = fl->fl_end - fl->fl_start + 1; 53 length = fl->fl_end - fl->fl_start + 1;
34 54
35 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 55 if (lock_type == CEPH_LOCK_FCNTL)
36 "length: %llu, wait: %d, type: %d", (int)lock_type, 56 owner = secure_addr(fl->fl_owner);
37 (int)operation, (u64)fl->fl_pid, fl->fl_start, 57 else
38 length, wait, fl->fl_type); 58 owner = secure_addr(fl->fl_file);
59
60 dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
61 "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
62 (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
63 wait, fl->fl_type);
39 64
40 req->r_args.filelock_change.rule = lock_type; 65 req->r_args.filelock_change.rule = lock_type;
41 req->r_args.filelock_change.type = cmd; 66 req->r_args.filelock_change.type = cmd;
67 req->r_args.filelock_change.owner = cpu_to_le64(owner);
42 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); 68 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
43 /* This should be adjusted, but I'm not sure if
44 namespaces actually get id numbers*/
45 req->r_args.filelock_change.pid_namespace =
46 cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
47 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); 69 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
48 req->r_args.filelock_change.length = cpu_to_le64(length); 70 req->r_args.filelock_change.length = cpu_to_le64(length);
49 req->r_args.filelock_change.wait = wait; 71 req->r_args.filelock_change.wait = wait;
50 72
51 err = ceph_mdsc_do_request(mdsc, inode, req); 73 err = ceph_mdsc_do_request(mdsc, inode, req);
52 74
53 if ( operation == CEPH_MDS_OP_GETFILELOCK){ 75 if (operation == CEPH_MDS_OP_GETFILELOCK) {
54 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); 76 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
55 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) 77 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
56 fl->fl_type = F_RDLCK; 78 fl->fl_type = F_RDLCK;
@@ -87,14 +109,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
87 u8 wait = 0; 109 u8 wait = 0;
88 u16 op = CEPH_MDS_OP_SETFILELOCK; 110 u16 op = CEPH_MDS_OP_SETFILELOCK;
89 111
90 fl->fl_nspid = get_pid(task_tgid(current)); 112 if (!(fl->fl_flags & FL_POSIX))
91 dout("ceph_lock, fl_pid:%d", fl->fl_pid); 113 return -ENOLCK;
114 /* No mandatory locks */
115 if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
116 return -ENOLCK;
117
118 dout("ceph_lock, fl_owner: %p", fl->fl_owner);
92 119
93 /* set wait bit as appropriate, then build the command as Ceph expects it */ 120 /* set wait bit as appropriate, then build the command as Ceph expects it */
94 if (F_SETLKW == cmd) 121 if (IS_GETLK(cmd))
95 wait = 1;
96 if (F_GETLK == cmd)
97 op = CEPH_MDS_OP_GETFILELOCK; 122 op = CEPH_MDS_OP_GETFILELOCK;
123 else if (IS_SETLKW(cmd))
124 wait = 1;
98 125
99 if (F_RDLCK == fl->fl_type) 126 if (F_RDLCK == fl->fl_type)
100 lock_cmd = CEPH_LOCK_SHARED; 127 lock_cmd = CEPH_LOCK_SHARED;
@@ -105,7 +132,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
105 132
106 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); 133 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
107 if (!err) { 134 if (!err) {
108 if ( op != CEPH_MDS_OP_GETFILELOCK ){ 135 if (op != CEPH_MDS_OP_GETFILELOCK) {
109 dout("mds locked, locking locally"); 136 dout("mds locked, locking locally");
110 err = posix_lock_file(file, fl, NULL); 137 err = posix_lock_file(file, fl, NULL);
111 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { 138 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
@@ -131,20 +158,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
131{ 158{
132 u8 lock_cmd; 159 u8 lock_cmd;
133 int err; 160 int err;
134 u8 wait = 1; 161 u8 wait = 0;
135 162
136 fl->fl_nspid = get_pid(task_tgid(current)); 163 if (!(fl->fl_flags & FL_FLOCK))
137 dout("ceph_flock, fl_pid:%d", fl->fl_pid); 164 return -ENOLCK;
138 165 /* No mandatory locks */
139 /* set wait bit, then clear it out of cmd*/ 166 if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
140 if (cmd & LOCK_NB) 167 return -ENOLCK;
141 wait = 0; 168
142 cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); 169 dout("ceph_flock, fl_file: %p", fl->fl_file);
143 /* set command sequence that Ceph wants to see: 170
144 shared lock, exclusive lock, or unlock */ 171 if (IS_SETLKW(cmd))
145 if (LOCK_SH == cmd) 172 wait = 1;
173
174 if (F_RDLCK == fl->fl_type)
146 lock_cmd = CEPH_LOCK_SHARED; 175 lock_cmd = CEPH_LOCK_SHARED;
147 else if (LOCK_EX == cmd) 176 else if (F_WRLCK == fl->fl_type)
148 lock_cmd = CEPH_LOCK_EXCL; 177 lock_cmd = CEPH_LOCK_EXCL;
149 else 178 else
150 lock_cmd = CEPH_LOCK_UNLOCK; 179 lock_cmd = CEPH_LOCK_UNLOCK;
@@ -280,13 +309,14 @@ int lock_to_ceph_filelock(struct file_lock *lock,
280 struct ceph_filelock *cephlock) 309 struct ceph_filelock *cephlock)
281{ 310{
282 int err = 0; 311 int err = 0;
283
284 cephlock->start = cpu_to_le64(lock->fl_start); 312 cephlock->start = cpu_to_le64(lock->fl_start);
285 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); 313 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
286 cephlock->client = cpu_to_le64(0); 314 cephlock->client = cpu_to_le64(0);
287 cephlock->pid = cpu_to_le64(lock->fl_pid); 315 cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
288 cephlock->pid_namespace = 316 if (lock->fl_flags & FL_POSIX)
289 cpu_to_le64((u64)(unsigned long)lock->fl_nspid); 317 cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
318 else
319 cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file));
290 320
291 switch (lock->fl_type) { 321 switch (lock->fl_type) {
292 case F_RDLCK: 322 case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f4f050a69a48..2b4d093d0563 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/wait.h> 4#include <linux/wait.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/gfp.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
7#include <linux/debugfs.h> 8#include <linux/debugfs.h>
8#include <linux/seq_file.h> 9#include <linux/seq_file.h>
@@ -165,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end,
165 if (num == 0) 166 if (num == 0)
166 goto done; 167 goto done;
167 168
168 /* alloc large array */ 169 BUG_ON(!info->dir_in);
169 info->dir_nr = num;
170 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
171 sizeof(*info->dir_dname) +
172 sizeof(*info->dir_dname_len) +
173 sizeof(*info->dir_dlease),
174 GFP_NOFS);
175 if (info->dir_in == NULL) {
176 err = -ENOMEM;
177 goto out_bad;
178 }
179 info->dir_dname = (void *)(info->dir_in + num); 170 info->dir_dname = (void *)(info->dir_in + num);
180 info->dir_dname_len = (void *)(info->dir_dname + num); 171 info->dir_dname_len = (void *)(info->dir_dname + num);
181 info->dir_dlease = (void *)(info->dir_dname_len + num); 172 info->dir_dlease = (void *)(info->dir_dname_len + num);
173 if ((unsigned long)(info->dir_dlease + num) >
174 (unsigned long)info->dir_in + info->dir_buf_size) {
175 pr_err("dir contents are larger than expected\n");
176 WARN_ON(1);
177 goto bad;
178 }
182 179
180 info->dir_nr = num;
183 while (num) { 181 while (num) {
184 /* dentry */ 182 /* dentry */
185 ceph_decode_need(p, end, sizeof(u32)*2, bad); 183 ceph_decode_need(p, end, sizeof(u32)*2, bad);
@@ -327,7 +325,9 @@ out_bad:
327 325
328static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 326static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
329{ 327{
330 kfree(info->dir_in); 328 if (!info->dir_in)
329 return;
330 free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
331} 331}
332 332
333 333
@@ -512,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref)
512 struct ceph_mds_request *req = container_of(kref, 512 struct ceph_mds_request *req = container_of(kref,
513 struct ceph_mds_request, 513 struct ceph_mds_request,
514 r_kref); 514 r_kref);
515 destroy_reply_info(&req->r_reply_info);
515 if (req->r_request) 516 if (req->r_request)
516 ceph_msg_put(req->r_request); 517 ceph_msg_put(req->r_request);
517 if (req->r_reply) { 518 if (req->r_reply)
518 ceph_msg_put(req->r_reply); 519 ceph_msg_put(req->r_reply);
519 destroy_reply_info(&req->r_reply_info);
520 }
521 if (req->r_inode) { 520 if (req->r_inode) {
522 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 521 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
523 iput(req->r_inode); 522 iput(req->r_inode);
@@ -528,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref)
528 iput(req->r_target_inode); 527 iput(req->r_target_inode);
529 if (req->r_dentry) 528 if (req->r_dentry)
530 dput(req->r_dentry); 529 dput(req->r_dentry);
531 if (req->r_old_dentry) { 530 if (req->r_old_dentry)
531 dput(req->r_old_dentry);
532 if (req->r_old_dentry_dir) {
532 /* 533 /*
533 * track (and drop pins for) r_old_dentry_dir 534 * track (and drop pins for) r_old_dentry_dir
534 * separately, since r_old_dentry's d_parent may have 535 * separately, since r_old_dentry's d_parent may have
@@ -537,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref)
537 */ 538 */
538 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), 539 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
539 CEPH_CAP_PIN); 540 CEPH_CAP_PIN);
540 dput(req->r_old_dentry);
541 iput(req->r_old_dentry_dir); 541 iput(req->r_old_dentry_dir);
542 } 542 }
543 kfree(req->r_path1); 543 kfree(req->r_path1);
@@ -1311,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1311 trim_caps - session->s_trim_caps); 1311 trim_caps - session->s_trim_caps);
1312 session->s_trim_caps = 0; 1312 session->s_trim_caps = 0;
1313 } 1313 }
1314
1315 ceph_add_cap_releases(mdsc, session);
1316 ceph_send_cap_releases(mdsc, session);
1314 return 0; 1317 return 0;
1315} 1318}
1316 1319
@@ -1461,15 +1464,18 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1461 1464
1462 dout("discard_cap_releases mds%d\n", session->s_mds); 1465 dout("discard_cap_releases mds%d\n", session->s_mds);
1463 1466
1464 /* zero out the in-progress message */ 1467 if (!list_empty(&session->s_cap_releases)) {
1465 msg = list_first_entry(&session->s_cap_releases, 1468 /* zero out the in-progress message */
1466 struct ceph_msg, list_head); 1469 msg = list_first_entry(&session->s_cap_releases,
1467 head = msg->front.iov_base; 1470 struct ceph_msg, list_head);
1468 num = le32_to_cpu(head->num); 1471 head = msg->front.iov_base;
1469 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); 1472 num = le32_to_cpu(head->num);
1470 head->num = cpu_to_le32(0); 1473 dout("discard_cap_releases mds%d %p %u\n",
1471 msg->front.iov_len = sizeof(*head); 1474 session->s_mds, msg, num);
1472 session->s_num_cap_releases += num; 1475 head->num = cpu_to_le32(0);
1476 msg->front.iov_len = sizeof(*head);
1477 session->s_num_cap_releases += num;
1478 }
1473 1479
1474 /* requeue completed messages */ 1480 /* requeue completed messages */
1475 while (!list_empty(&session->s_cap_releases_done)) { 1481 while (!list_empty(&session->s_cap_releases_done)) {
@@ -1492,6 +1498,43 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1492 * requests 1498 * requests
1493 */ 1499 */
1494 1500
1501int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1502 struct inode *dir)
1503{
1504 struct ceph_inode_info *ci = ceph_inode(dir);
1505 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1506 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
1507 size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
1508 sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
1509 int order, num_entries;
1510
1511 spin_lock(&ci->i_ceph_lock);
1512 num_entries = ci->i_files + ci->i_subdirs;
1513 spin_unlock(&ci->i_ceph_lock);
1514 num_entries = max(num_entries, 1);
1515 num_entries = min(num_entries, opt->max_readdir);
1516
1517 order = get_order(size * num_entries);
1518 while (order >= 0) {
1519 rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
1520 order);
1521 if (rinfo->dir_in)
1522 break;
1523 order--;
1524 }
1525 if (!rinfo->dir_in)
1526 return -ENOMEM;
1527
1528 num_entries = (PAGE_SIZE << order) / size;
1529 num_entries = min(num_entries, opt->max_readdir);
1530
1531 rinfo->dir_buf_size = PAGE_SIZE << order;
1532 req->r_num_caps = num_entries + 1;
1533 req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
1534 req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
1535 return 0;
1536}
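A worked example of the sizing above, using hypothetical numbers (the true per-entry size depends on sizeof(*rinfo->dir_in) for the build in question):

/* Assume size = 64 bytes per entry and i_files + i_subdirs = 1000:
 *
 *	num_entries = 1000;                     (clamped to [1, max_readdir])
 *	order = get_order(64 * 1000);           -> 4, i.e. a 64 KiB buffer
 *
 * If memory is tight and only an order-2 (16 KiB) allocation succeeds,
 * the entry count is recomputed from what was actually granted:
 *
 *	num_entries = (PAGE_SIZE << 2) / 64;    -> 256 entries
 *
 * so max_entries sent to the MDS never exceeds what the reply buffer can
 * hold, and r_num_caps reserves one cap per entry plus one for the dir.
 */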
1537
1495/* 1538/*
1496 * Create an mds request. 1539 * Create an mds request.
1497 */ 1540 */
@@ -2053,7 +2096,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
2053 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 2096 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
2054 if (req->r_locked_dir) 2097 if (req->r_locked_dir)
2055 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 2098 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
2056 if (req->r_old_dentry) 2099 if (req->r_old_dentry_dir)
2057 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), 2100 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
2058 CEPH_CAP_PIN); 2101 CEPH_CAP_PIN);
2059 2102
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 68288917c737..e90cfccf93bd 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -67,6 +67,7 @@ struct ceph_mds_reply_info_parsed {
67 /* for readdir results */ 67 /* for readdir results */
68 struct { 68 struct {
69 struct ceph_mds_reply_dirfrag *dir_dir; 69 struct ceph_mds_reply_dirfrag *dir_dir;
70 size_t dir_buf_size;
70 int dir_nr; 71 int dir_nr;
71 char **dir_dname; 72 char **dir_dname;
72 u32 *dir_dname_len; 73 u32 *dir_dname_len;
@@ -346,7 +347,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
346 struct dentry *dn); 347 struct dentry *dn);
347 348
348extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); 349extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
349 350extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
351 struct inode *dir);
350extern struct ceph_mds_request * 352extern struct ceph_mds_request *
351ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 353ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
352extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 354extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 4440f447fd3f..51cc23e48111 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -54,6 +54,7 @@ const char *ceph_mds_op_name(int op)
54 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; 54 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
55 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; 55 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
56 case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; 56 case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
57 case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
57 case CEPH_MDS_OP_GETATTR: return "getattr"; 58 case CEPH_MDS_OP_GETATTR: return "getattr";
58 case CEPH_MDS_OP_SETXATTR: return "setxattr"; 59 case CEPH_MDS_OP_SETXATTR: return "setxattr";
59 case CEPH_MDS_OP_SETATTR: return "setattr"; 60 case CEPH_MDS_OP_SETATTR: return "setattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 10a4ccbf38da..06150fd745ac 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1026,6 +1026,7 @@ static int __init init_ceph(void)
1026 if (ret) 1026 if (ret)
1027 goto out; 1027 goto out;
1028 1028
1029 ceph_flock_init();
1029 ceph_xattr_init(); 1030 ceph_xattr_init();
1030 ret = register_filesystem(&ceph_fs_type); 1031 ret = register_filesystem(&ceph_fs_type);
1031 if (ret) 1032 if (ret)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index d8801a95b685..7866cd05a6bb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -577,7 +577,7 @@ struct ceph_file_info {
577 577
578 /* readdir: position within a frag */ 578 /* readdir: position within a frag */
579 unsigned offset; /* offset of last chunk, adjusted for . and .. */ 579 unsigned offset; /* offset of last chunk, adjusted for . and .. */
580 u64 next_offset; /* offset of next chunk (last_name's + 1) */ 580 unsigned next_offset; /* offset of next chunk (last_name's + 1) */
581 char *last_name; /* last entry in previous chunk */ 581 char *last_name; /* last entry in previous chunk */
582 struct dentry *dentry; /* next dentry (for dcache readdir) */ 582 struct dentry *dentry; /* next dentry (for dcache readdir) */
583 int dir_release_count; 583 int dir_release_count;
@@ -871,6 +871,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
871extern const struct export_operations ceph_export_ops; 871extern const struct export_operations ceph_export_ops;
872 872
873/* locks.c */ 873/* locks.c */
874extern __init void ceph_flock_init(void);
874extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 875extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
875extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 876extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
876extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); 877extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a55ec37378c6..c9c2b887381e 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -64,32 +64,48 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
64} 64}
65 65
66static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, 66static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
67 size_t size) 67 size_t size)
68{ 68{
69 int ret; 69 int ret;
70 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); 70 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
71 struct ceph_osd_client *osdc = &fsc->client->osdc; 71 struct ceph_osd_client *osdc = &fsc->client->osdc;
72 s64 pool = ceph_file_layout_pg_pool(ci->i_layout); 72 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
73 const char *pool_name; 73 const char *pool_name;
74 char buf[128];
74 75
75 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); 76 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
76 down_read(&osdc->map_sem); 77 down_read(&osdc->map_sem);
77 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 78 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
78 if (pool_name) 79 if (pool_name) {
79 ret = snprintf(val, size, 80 size_t len = strlen(pool_name);
80 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", 81 ret = snprintf(buf, sizeof(buf),
82 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
81 (unsigned long long)ceph_file_layout_su(ci->i_layout), 83 (unsigned long long)ceph_file_layout_su(ci->i_layout),
82 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 84 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
83 (unsigned long long)ceph_file_layout_object_size(ci->i_layout), 85 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
84 pool_name); 86 if (!size) {
85 else 87 ret += len;
86 ret = snprintf(val, size, 88 } else if (ret + len > size) {
89 ret = -ERANGE;
90 } else {
91 memcpy(val, buf, ret);
92 memcpy(val + ret, pool_name, len);
93 ret += len;
94 }
95 } else {
96 ret = snprintf(buf, sizeof(buf),
87 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", 97 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
88 (unsigned long long)ceph_file_layout_su(ci->i_layout), 98 (unsigned long long)ceph_file_layout_su(ci->i_layout),
89 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 99 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
90 (unsigned long long)ceph_file_layout_object_size(ci->i_layout), 100 (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
91 (unsigned long long)pool); 101 (unsigned long long)pool);
92 102 if (size) {
103 if (ret <= size)
104 memcpy(val, buf, ret);
105 else
106 ret = -ERANGE;
107 }
108 }
93 up_read(&osdc->map_sem); 109 up_read(&osdc->map_sem);
94 return ret; 110 return ret;
95} 111}
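The detour through the local buf[] exists to honor the getxattr(2) contract: size == 0 is a pure size probe, and a too-small buffer must fail with ERANGE rather than truncate. From userspace, against a file on a ceph mount (illustrative only):

#include <sys/xattr.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	ssize_t n;
	char *buf;

	if (argc != 2)
		return 1;
	/* probe: size 0 asks only for the length needed */
	n = getxattr(argv[1], "ceph.file.layout", NULL, 0);
	if (n < 0) {
		perror("getxattr probe");
		return 1;
	}
	buf = malloc(n + 1);
	if (!buf)
		return 1;
	/* fill: a too-small buffer fails with ERANGE, never truncates */
	n = getxattr(argv[1], "ceph.file.layout", buf, n);
	if (n < 0) {
		perror("getxattr");
		return 1;
	}
	buf[n] = '\0';
	printf("%s\n", buf);
	free(buf);
	return 0;
}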
@@ -215,7 +231,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
215 .name_size = sizeof("ceph.dir.layout"), 231 .name_size = sizeof("ceph.dir.layout"),
216 .getxattr_cb = ceph_vxattrcb_layout, 232 .getxattr_cb = ceph_vxattrcb_layout,
217 .readonly = false, 233 .readonly = false,
218 .hidden = false, 234 .hidden = true,
219 .exists_cb = ceph_vxattrcb_layout_exists, 235 .exists_cb = ceph_vxattrcb_layout_exists,
220 }, 236 },
221 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), 237 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
@@ -242,7 +258,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
242 .name_size = sizeof("ceph.file.layout"), 258 .name_size = sizeof("ceph.file.layout"),
243 .getxattr_cb = ceph_vxattrcb_layout, 259 .getxattr_cb = ceph_vxattrcb_layout,
244 .readonly = false, 260 .readonly = false,
245 .hidden = false, 261 .hidden = true,
246 .exists_cb = ceph_vxattrcb_layout_exists, 262 .exists_cb = ceph_vxattrcb_layout_exists,
247 }, 263 },
248 XATTR_LAYOUT_FIELD(file, layout, stripe_unit), 264 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
@@ -842,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
842 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 858 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
843 struct inode *inode = dentry->d_inode; 859 struct inode *inode = dentry->d_inode;
844 struct ceph_inode_info *ci = ceph_inode(inode); 860 struct ceph_inode_info *ci = ceph_inode(inode);
845 struct inode *parent_inode;
846 struct ceph_mds_request *req; 861 struct ceph_mds_request *req;
847 struct ceph_mds_client *mdsc = fsc->mdsc; 862 struct ceph_mds_client *mdsc = fsc->mdsc;
848 int err; 863 int err;
@@ -893,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
893 req->r_data_len = size; 908 req->r_data_len = size;
894 909
895 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 910 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
896 parent_inode = ceph_get_dentry_parent_inode(dentry); 911 err = ceph_mdsc_do_request(mdsc, NULL, req);
897 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
898 iput(parent_inode);
899 ceph_mdsc_put_request(req); 912 ceph_mdsc_put_request(req);
900 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); 913 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
901 914
@@ -1019,7 +1032,6 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
1019 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 1032 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
1020 struct ceph_mds_client *mdsc = fsc->mdsc; 1033 struct ceph_mds_client *mdsc = fsc->mdsc;
1021 struct inode *inode = dentry->d_inode; 1034 struct inode *inode = dentry->d_inode;
1022 struct inode *parent_inode;
1023 struct ceph_mds_request *req; 1035 struct ceph_mds_request *req;
1024 int err; 1036 int err;
1025 1037
@@ -1033,9 +1045,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
1033 req->r_num_caps = 1; 1045 req->r_num_caps = 1;
1034 req->r_path2 = kstrdup(name, GFP_NOFS); 1046 req->r_path2 = kstrdup(name, GFP_NOFS);
1035 1047
1036 parent_inode = ceph_get_dentry_parent_inode(dentry); 1048 err = ceph_mdsc_do_request(mdsc, NULL, req);
1037 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1038 iput(parent_inode);
1039 ceph_mdsc_put_request(req); 1049 ceph_mdsc_put_request(req);
1040 return err; 1050 return err;
1041} 1051}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 849f6132b327..2c70cbe35d39 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -286,7 +286,7 @@ cifs_destroy_inode(struct inode *inode)
286static void 286static void
287cifs_evict_inode(struct inode *inode) 287cifs_evict_inode(struct inode *inode)
288{ 288{
289 truncate_inode_pages(&inode->i_data, 0); 289 truncate_inode_pages_final(&inode->i_data);
290 clear_inode(inode); 290 clear_inode(inode);
291 cifs_fscache_release_inode_cookie(inode); 291 cifs_fscache_release_inode_cookie(inode);
292} 292}
@@ -541,6 +541,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
541 541
542static int cifs_remount(struct super_block *sb, int *flags, char *data) 542static int cifs_remount(struct super_block *sb, int *flags, char *data)
543{ 543{
544 sync_filesystem(sb);
544 *flags |= MS_NODIRATIME; 545 *flags |= MS_NODIRATIME;
545 return 0; 546 return 0;
546} 547}
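The sync_filesystem(sb) call is part of a tree-wide series moving the pre-remount writeback from the VFS into each ->remount_fs() implementation; every converted filesystem follows the same shape (example_remount is a stand-in name):

static int example_remount(struct super_block *sb, int *flags, char *data)
{
	sync_filesystem(sb);	/* flush dirty state before any flags change */
	*flags |= MS_NODIRATIME;
	return 0;
}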
@@ -1005,7 +1006,7 @@ cifs_init_once(void *inode)
1005 init_rwsem(&cifsi->lock_sem); 1006 init_rwsem(&cifsi->lock_sem);
1006} 1007}
1007 1008
1008static int 1009static int __init
1009cifs_init_inodecache(void) 1010cifs_init_inodecache(void)
1010{ 1011{
1011 cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", 1012 cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 834fce759d80..216d7e99f921 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3113,6 +3113,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
3113 3113
3114static struct vm_operations_struct cifs_file_vm_ops = { 3114static struct vm_operations_struct cifs_file_vm_ops = {
3115 .fault = filemap_fault, 3115 .fault = filemap_fault,
3116 .map_pages = filemap_map_pages,
3116 .page_mkwrite = cifs_page_mkwrite, 3117 .page_mkwrite = cifs_page_mkwrite,
3117 .remap_pages = generic_file_remap_pages, 3118 .remap_pages = generic_file_remap_pages,
3118}; 3119};
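The new .map_pages hook lets the fault path map a batch of already-uptodate page-cache pages around the faulting address without taking further faults. For filemap-backed mappings, wiring it up is one line (sketch; example_page_mkwrite stands in for the filesystem's own hook):

static const struct vm_operations_struct example_vm_ops = {
	.fault		= filemap_fault,		/* single-page fault */
	.map_pages	= filemap_map_pages,		/* batch-map cached neighbors */
	.page_mkwrite	= example_page_mkwrite,		/* fs-specific write notify */
	.remap_pages	= generic_file_remap_pages,
};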
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index b7143cf783ac..381c993b1427 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -10,7 +10,7 @@ extern int coda_hard;
10extern int coda_fake_statfs; 10extern int coda_fake_statfs;
11 11
12void coda_destroy_inodecache(void); 12void coda_destroy_inodecache(void);
13int coda_init_inodecache(void); 13int __init coda_init_inodecache(void);
14int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync); 14int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync);
15void coda_sysctl_init(void); 15void coda_sysctl_init(void);
16void coda_sysctl_clean(void); 16void coda_sysctl_clean(void);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 506de34a4ef3..d9c7751f10ac 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -73,7 +73,7 @@ static void init_once(void *foo)
73 inode_init_once(&ei->vfs_inode); 73 inode_init_once(&ei->vfs_inode);
74} 74}
75 75
76int coda_init_inodecache(void) 76int __init coda_init_inodecache(void)
77{ 77{
78 coda_inode_cachep = kmem_cache_create("coda_inode_cache", 78 coda_inode_cachep = kmem_cache_create("coda_inode_cache",
79 sizeof(struct coda_inode_info), 79 sizeof(struct coda_inode_info),
@@ -96,6 +96,7 @@ void coda_destroy_inodecache(void)
96 96
97static int coda_remount(struct super_block *sb, int *flags, char *data) 97static int coda_remount(struct super_block *sb, int *flags, char *data)
98{ 98{
99 sync_filesystem(sb);
99 *flags |= MS_NOATIME; 100 *flags |= MS_NOATIME;
100 return 0; 101 return 0;
101} 102}
@@ -250,7 +251,7 @@ static void coda_put_super(struct super_block *sb)
250 251
251static void coda_evict_inode(struct inode *inode) 252static void coda_evict_inode(struct inode *inode)
252{ 253{
253 truncate_inode_pages(&inode->i_data, 0); 254 truncate_inode_pages_final(&inode->i_data);
254 clear_inode(inode); 255 clear_inode(inode);
255 coda_cache_clear_inode(inode); 256 coda_cache_clear_inode(inode);
256} 257}
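truncate_inode_pages_final() is another tree-wide substitution in ->evict_inode(): beyond truncating, it marks the address space as exiting so late page-cache additions can be caught, which is only valid at final eviction. The converted pattern (example_evict_inode is illustrative):

static void example_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);	/* no new pages after this */
	clear_inode(inode);
	/* ... filesystem-specific teardown ... */
}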
diff --git a/fs/compat.c b/fs/compat.c
index 6af20de2c1a3..ca926ad0430c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -72,8 +72,8 @@ int compat_printk(const char *fmt, ...)
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
  */
-asmlinkage long compat_sys_utime(const char __user *filename,
-		struct compat_utimbuf __user *t)
+COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
+		       struct compat_utimbuf __user *, t)
 {
 	struct timespec tv[2];
 
@@ -87,13 +87,13 @@ asmlinkage long compat_sys_utime(const char __user *filename,
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
+COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
 {
 	struct timespec tv[2];
 
 	if (t) {
-		if (get_compat_timespec(&tv[0], &t[0]) ||
-		    get_compat_timespec(&tv[1], &t[1]))
+		if (compat_get_timespec(&tv[0], &t[0]) ||
+		    compat_get_timespec(&tv[1], &t[1]))
 			return -EFAULT;
 
 		if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
@@ -102,7 +102,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filena
 	return do_utimes(dfd, filename, t ? tv : NULL, flags);
 }
 
-asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
 {
 	struct timespec tv[2];
 
@@ -121,7 +121,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filena
 	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
 {
 	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
@@ -159,8 +159,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 	return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long compat_sys_newstat(const char __user * filename,
-		struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -171,8 +171,8 @@ asmlinkage long compat_sys_newstat(const char __user * filename,
 	return cp_compat_stat(&stat, statbuf);
 }
 
-asmlinkage long compat_sys_newlstat(const char __user * filename,
-		struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -184,9 +184,9 @@ asmlinkage long compat_sys_newlstat(const char __user * filename,
 }
 
 #ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd,
-		const char __user *filename,
-		struct compat_stat __user *statbuf, int flag)
+COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
+		       const char __user *, filename,
+		       struct compat_stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
 	int error;
@@ -198,8 +198,8 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd,
 }
 #endif
 
-asmlinkage long compat_sys_newfstat(unsigned int fd,
-		struct compat_stat __user * statbuf)
+COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
+		       struct compat_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
@@ -247,7 +247,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
  * The following statfs calls are copies of code from fs/statfs.c and
  * should be checked against those from time to time
  */
-asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
 {
 	struct kstatfs tmp;
 	int error = user_statfs(pathname, &tmp);
@@ -256,7 +256,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
 	return error;
 }
 
-asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
 {
 	struct kstatfs tmp;
 	int error = fd_statfs(fd, &tmp);
@@ -298,7 +298,7 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 	return 0;
 }
 
-asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
 {
 	struct kstatfs tmp;
 	int error;
@@ -312,7 +312,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
 	return error;
 }
 
-asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
 {
 	struct kstatfs tmp;
 	int error;
@@ -331,7 +331,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
  * Given how simple this syscall is that apporach is more maintainable
  * than the various conversion hacks.
  */
-asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
+COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
 {
 	struct compat_ustat tmp;
 	struct kstatfs sbuf;
@@ -399,12 +399,28 @@ static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *u
 }
 #endif
 
-asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
-		unsigned long arg)
+static unsigned int
+convert_fcntl_cmd(unsigned int cmd)
+{
+	switch (cmd) {
+	case F_GETLK64:
+		return F_GETLK;
+	case F_SETLK64:
+		return F_SETLK;
+	case F_SETLKW64:
+		return F_SETLKW;
+	}
+
+	return cmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
 {
 	mm_segment_t old_fs;
 	struct flock f;
 	long ret;
+	unsigned int conv_cmd;
 
 	switch (cmd) {
 	case F_GETLK:
@@ -441,16 +457,18 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
 	case F_GETLK64:
 	case F_SETLK64:
 	case F_SETLKW64:
+	case F_GETLKP:
+	case F_SETLKP:
+	case F_SETLKPW:
 		ret = get_compat_flock64(&f, compat_ptr(arg));
 		if (ret != 0)
 			break;
 		old_fs = get_fs();
 		set_fs(KERNEL_DS);
-		ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK :
-				((cmd == F_SETLK64) ? F_SETLK : F_SETLKW),
-				(unsigned long)&f);
+		conv_cmd = convert_fcntl_cmd(cmd);
+		ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
 		set_fs(old_fs);
-		if (cmd == F_GETLK64 && ret == 0) {
+		if ((conv_cmd == F_GETLK || conv_cmd == F_GETLKP) && ret == 0) {
 			/* need to return lock information - see above for commentary */
 			if (f.l_start > COMPAT_LOFF_T_MAX)
 				ret = -EOVERFLOW;
@@ -468,16 +486,22 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
 	return ret;
 }
 
-asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
-		unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
 {
-	if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64))
+	switch (cmd) {
+	case F_GETLK64:
+	case F_SETLK64:
+	case F_SETLKW64:
+	case F_GETLKP:
+	case F_SETLKP:
+	case F_SETLKPW:
 		return -EINVAL;
+	}
 	return compat_sys_fcntl64(fd, cmd, arg);
 }
 
-asmlinkage long
-compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
+COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p)
 {
 	long ret;
 	aio_context_t ctx64;
@@ -496,32 +520,24 @@ compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
 	return ret;
 }
 
-asmlinkage long
-compat_sys_io_getevents(aio_context_t ctx_id,
-				 unsigned long min_nr,
-				 unsigned long nr,
-				 struct io_event __user *events,
-				 struct compat_timespec __user *timeout)
+COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
+		       compat_long_t, min_nr,
+		       compat_long_t, nr,
+		       struct io_event __user *, events,
+		       struct compat_timespec __user *, timeout)
 {
-	long ret;
 	struct timespec t;
 	struct timespec __user *ut = NULL;
 
-	ret = -EFAULT;
-	if (unlikely(!access_ok(VERIFY_WRITE, events,
-				nr * sizeof(struct io_event))))
-		goto out;
 	if (timeout) {
-		if (get_compat_timespec(&t, timeout))
-			goto out;
+		if (compat_get_timespec(&t, timeout))
+			return -EFAULT;
 
 		ut = compat_alloc_user_space(sizeof(*ut));
 		if (copy_to_user(ut, &t, sizeof(t)) )
-			goto out;
+			return -EFAULT;
 	}
-	ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut);
-out:
-	return ret;
+	return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
 }
 
 /* A write operation does a read from user space and vice versa */
@@ -617,8 +633,8 @@ copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
 
 #define MAX_AIO_SUBMITS	(PAGE_SIZE/sizeof(struct iocb *))
 
-asmlinkage long
-compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+		       int, nr, u32 __user *, iocb)
 {
 	struct iocb __user * __user *iocb64;
 	long ret;
@@ -770,10 +786,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
 #define NCPFS_NAME	"ncpfs"
 #define NFS4_NAME	"nfs4"
 
-asmlinkage long compat_sys_mount(const char __user * dev_name,
-				 const char __user * dir_name,
-				 const char __user * type, unsigned long flags,
-				 const void __user * data)
+COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
+		       const char __user *, dir_name,
+		       const char __user *, type, compat_ulong_t, flags,
+		       const void __user *, data)
 {
 	char *kernel_type;
 	unsigned long data_page;
@@ -869,8 +885,8 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long compat_sys_old_readdir(unsigned int fd,
-		struct compat_old_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+		       struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
 	struct fd f = fdget(fd);
@@ -948,8 +964,8 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long compat_sys_getdents(unsigned int fd,
-		struct compat_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
+		       struct compat_linux_dirent __user *, dirent, unsigned int, count)
 {
 	struct fd f;
 	struct compat_linux_dirent __user * lastdirent;
@@ -981,7 +997,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 	return error;
 }
 
-#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
+#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64
 
 struct compat_getdents_callback64 {
 	struct dir_context ctx;
@@ -1033,8 +1049,8 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long compat_sys_getdents64(unsigned int fd,
-		struct linux_dirent64 __user * dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+		       struct linux_dirent64 __user *, dirent, unsigned int, count)
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
@@ -1066,7 +1082,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 	fdput(f);
 	return error;
 }
-#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
+#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
 
 /*
  * Exactly like fs/open.c:sys_open(), except that it doesn't set the
@@ -1287,9 +1303,9 @@ out_nofds:
 	return ret;
 }
 
-asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct compat_timeval __user *tvp)
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timeval __user *, tvp)
 {
 	struct timespec end_time, *to = NULL;
 	struct compat_timeval tv;
@@ -1320,7 +1336,7 @@ struct compat_sel_arg_struct {
 	compat_uptr_t tvp;
 };
 
-asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
+COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
 {
 	struct compat_sel_arg_struct a;
 
@@ -1381,9 +1397,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	return ret;
 }
 
-asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct compat_timespec __user *tsp, void __user *sig)
+COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timespec __user *, tsp, void __user *, sig)
 {
 	compat_size_t sigsetsize = 0;
 	compat_uptr_t up = 0;
@@ -1400,9 +1416,9 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
 		sigsetsize);
 }
 
-asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
-	unsigned int nfds, struct compat_timespec __user *tsp,
-	const compat_sigset_t __user *sigmask, compat_size_t sigsetsize)
+COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+	unsigned int, nfds, struct compat_timespec __user *, tsp,
+	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
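Aside: all the fs/compat.c conversions above replace hand-rolled `asmlinkage long compat_sys_*()` definitions with COMPAT_SYSCALL_DEFINEn(name, type1, arg1, ...). The macro takes alternating type/name pairs and, on 64-bit architectures, guarantees each 32-bit argument is sign- or zero-extended according to its declared compat type before the body runs, instead of trusting whatever the compat entry path left in the upper register halves. Very roughly it expands like this (a simplified sketch; the inner helper name is illustrative, and the real macro in <linux/compat.h> also emits tracing metadata and per-architecture wrappers):

	static inline long C_SYSC_utime(const char __user *filename,
					struct compat_utimbuf __user *t)
	{
		/* ...the original function body goes here... */
		return 0;
	}

	asmlinkage long compat_sys_utime(const char __user *filename,
					 struct compat_utimbuf __user *t)
	{
		/* arguments re-extended per their declared compat types */
		return C_SYSC_utime(filename, t);
	}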
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index a81147e2e4ef..4d24d17bcfc1 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -88,6 +88,11 @@ static void cputime_to_compat_timeval(const cputime_t cputime,
 #define ELF_HWCAP	COMPAT_ELF_HWCAP
 #endif
 
+#ifdef COMPAT_ELF_HWCAP2
+#undef ELF_HWCAP2
+#define ELF_HWCAP2	COMPAT_ELF_HWCAP2
+#endif
+
 #ifdef COMPAT_ARCH_DLINFO
 #undef ARCH_DLINFO
 #define ARCH_DLINFO	COMPAT_ARCH_DLINFO
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3881610b6438..e82289047272 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1538,9 +1538,10 @@ static int compat_ioctl_check_table(unsigned int xcmd)
 	return ioctl_pointer[i] == xcmd;
 }
 
-asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
-				 unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg32)
 {
+	unsigned long arg = arg32;
 	struct fd f = fdget(fd);
 	int error = -EBADF;
 	if (!f.file)
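Aside: note the parameter rename to arg32 with an immediate `unsigned long arg = arg32;`. Declaring the parameter compat_ulong_t makes the widening a well-defined zero-extension rather than an accident of register state (sketch of the hazard being closed, assuming a 64-bit kernel servicing a 32-bit caller):

	/* With a plain 'unsigned long arg' parameter, bits 32-63 arrive as
	 * whatever the 32-bit caller left in the register; declared as
	 * compat_ulong_t, the assignment below is a guaranteed zero-extend. */
	static unsigned long widen(compat_ulong_t arg32)
	{
		return arg32;
	}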
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 06610cf94d57..ddcfe590b8a8 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -195,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 		struct page *page = NULL;
 
 		if (blocknr + i < devsize) {
-			page = read_mapping_page_async(mapping, blocknr + i,
-							NULL);
+			page = read_mapping_page(mapping, blocknr + i, NULL);
 			/* synchronous error? */
 			if (IS_ERR(page))
 				page = NULL;
@@ -244,6 +243,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 
 static int cramfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_RDONLY;
 	return 0;
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index 089f681ac952..40707d88a945 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2483,12 +2483,14 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
 			dentry->d_name.name = dentry->d_iname;
 		} else {
 			/*
-			 * Both are internal. Just copy target to dentry
+			 * Both are internal.
 			 */
-			memcpy(dentry->d_iname, target->d_name.name,
-					target->d_name.len + 1);
-			dentry->d_name.len = target->d_name.len;
-			return;
+			unsigned int i;
+			BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
+			for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
+				swap(((long *) &dentry->d_iname)[i],
+				     ((long *) &target->d_iname)[i]);
+			}
 		}
 	}
 	swap(dentry->d_name.len, target->d_name.len);
@@ -2545,13 +2547,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry,
  * __d_move - move a dentry
  * @dentry: entry to move
  * @target: new dentry
+ * @exchange: exchange the two dentries
  *
  * Update the dcache to reflect the move of a file name. Negative
  * dcache entries should not be moved in this way. Caller must hold
  * rename_lock, the i_mutex of the source and target directories,
  * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
  */
-static void __d_move(struct dentry * dentry, struct dentry * target)
+static void __d_move(struct dentry *dentry, struct dentry *target,
+		     bool exchange)
 {
 	if (!dentry->d_inode)
 		printk(KERN_WARNING "VFS: moving negative dcache entry\n");
@@ -2573,8 +2577,15 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
 	__d_drop(dentry);
 	__d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
 
-	/* Unhash the target: dput() will then get rid of it */
+	/*
+	 * Unhash the target (d_delete() is not usable here). If exchanging
+	 * the two dentries, then rehash onto the other's hash queue.
+	 */
 	__d_drop(target);
+	if (exchange) {
+		__d_rehash(target,
+			   d_hash(dentry->d_parent, dentry->d_name.hash));
+	}
 
 	list_del(&dentry->d_u.d_child);
 	list_del(&target->d_u.d_child);
@@ -2601,6 +2612,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
 	write_seqcount_end(&dentry->d_seq);
 
 	dentry_unlock_parents_for_move(dentry, target);
+	if (exchange)
+		fsnotify_d_move(target);
 	spin_unlock(&target->d_lock);
 	fsnotify_d_move(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -2618,11 +2631,30 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
 void d_move(struct dentry *dentry, struct dentry *target)
 {
 	write_seqlock(&rename_lock);
-	__d_move(dentry, target);
+	__d_move(dentry, target, false);
 	write_sequnlock(&rename_lock);
 }
 EXPORT_SYMBOL(d_move);
 
+/*
+ * d_exchange - exchange two dentries
+ * @dentry1: first dentry
+ * @dentry2: second dentry
+ */
+void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
+{
+	write_seqlock(&rename_lock);
+
+	WARN_ON(!dentry1->d_inode);
+	WARN_ON(!dentry2->d_inode);
+	WARN_ON(IS_ROOT(dentry1));
+	WARN_ON(IS_ROOT(dentry2));
+
+	__d_move(dentry1, dentry2, true);
+
+	write_sequnlock(&rename_lock);
+}
+
 /**
  * d_ancestor - search for an ancestor
  * @p1: ancestor dentry
@@ -2670,7 +2702,7 @@ static struct dentry *__d_unalias(struct inode *inode,
 	m2 = &alias->d_parent->d_inode->i_mutex;
 out_unalias:
 	if (likely(!d_mountpoint(alias))) {
-		__d_move(alias, dentry);
+		__d_move(alias, dentry, false);
 		ret = alias;
 	}
 out_err:
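Aside: the exchange flag threaded through __d_move() and the new d_exchange() helper are the dcache half of cross-rename: instead of leaving the target unhashed for dput(), both dentries are rehashed onto each other's chains and both sides get an fsnotify event. Once the syscall plumbing lands, the feature is driven from userspace roughly like this (sketch; assumes a libc without a renameat2() wrapper and headers that define SYS_renameat2):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef RENAME_EXCHANGE
	#define RENAME_EXCHANGE (1 << 1)	/* atomically swap source and target */
	#endif

	int main(void)
	{
		if (syscall(SYS_renameat2, AT_FDCWD, "a", AT_FDCWD, "b",
			    RENAME_EXCHANGE) != 0) {
			perror("renameat2");
			return 1;
		}
		return 0;
	}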
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 9c0444cccbe1..8c41b52da358 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
 	int err;
 	struct debugfs_fs_info *fsi = sb->s_fs_info;
 
+	sync_filesystem(sb);
 	err = debugfs_parse_options(data, &fsi->mount_opts);
 	if (err)
 		goto fail;
@@ -358,7 +359,7 @@ exit:
  * @name: a pointer to a string containing the name of the file to create.
  * @mode: the permission that the file should have.
  * @parent: a pointer to the parent dentry for this file. This should be a
- *          directory dentry if set. If this paramater is NULL, then the
+ *          directory dentry if set. If this parameter is NULL, then the
  *          file will be created in the root of the debugfs filesystem.
  * @data: a pointer to something that the caller will want to get to later
  *        on. The inode.i_private pointer will point to this value on
@@ -400,7 +401,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
  * @name: a pointer to a string containing the name of the directory to
  *        create.
  * @parent: a pointer to the parent dentry for this file. This should be a
- *          directory dentry if set. If this paramater is NULL, then the
+ *          directory dentry if set. If this parameter is NULL, then the
  *          directory will be created in the root of the debugfs filesystem.
  *
  * This function creates a directory in debugfs with the given name.
@@ -425,7 +426,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
  * @name: a pointer to a string containing the name of the symbolic link to
  *        create.
  * @parent: a pointer to the parent dentry for this symbolic link. This
- *          should be a directory dentry if set. If this paramater is NULL,
+ *          should be a directory dentry if set. If this parameter is NULL,
  *          then the symbolic link will be created in the root of the debugfs
  *          filesystem.
  * @target: a pointer to a string containing the path to the target of the
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index a726b9f29cb7..c71038079b47 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -313,6 +313,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
 
+	sync_filesystem(sb);
 	err = parse_mount_options(data, PARSE_REMOUNT, opts);
 
 	/*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 160a5489a939..31ba0935e32e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -664,7 +664,6 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
 		goto out;
 	sector = start_sector << (sdio->blkbits - 9);
 	nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
-	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
 	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
 	sdio->boundary = 0;
@@ -1194,13 +1193,19 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	}
 
 	/*
-	 * For file extending writes updating i_size before data
-	 * writeouts complete can expose uninitialized blocks. So
-	 * even for AIO, we need to wait for i/o to complete before
-	 * returning in this case.
+	 * For file extending writes updating i_size before data writeouts
+	 * complete can expose uninitialized blocks in dumb filesystems.
+	 * In that case we need to wait for I/O completion even if asked
+	 * for an asynchronous write.
 	 */
-	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
-		(end > i_size_read(inode)));
+	if (is_sync_kiocb(iocb))
+		dio->is_async = false;
+	else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
+		 (rw & WRITE) && end > i_size_read(inode))
+		dio->is_async = false;
+	else
+		dio->is_async = true;
+
 	dio->inode = inode;
 	dio->rw = rw;
 
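Aside: the rewritten predicate reads as three cases: a sync kiocb is never async (the caller waits regardless); an i_size-extending write is forced synchronous unless the filesystem set DIO_ASYNC_EXTEND, promising to handle the i_size update safely at completion; everything else stays async. The same decision in predicate form (sketch that mirrors the hunk above rather than adding behavior):

	static bool dio_should_be_async(struct kiocb *iocb, struct dio *dio,
					int rw, loff_t end, struct inode *inode)
	{
		if (is_sync_kiocb(iocb))
			return false;
		if (!(dio->flags & DIO_ASYNC_EXTEND) &&
		    (rw & WRITE) && end > i_size_read(inode))
			return false;	/* extending write, fs did not opt in */
		return true;
	}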
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 0e90f0c91b93..dcea1e37a1b7 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -14,6 +14,7 @@
14#include "dlm_internal.h" 14#include "dlm_internal.h"
15#include "lock.h" 15#include "lock.h"
16#include "user.h" 16#include "user.h"
17#include "ast.h"
17 18
18static uint64_t dlm_cb_seq; 19static uint64_t dlm_cb_seq;
19static DEFINE_SPINLOCK(dlm_cb_seq_spin); 20static DEFINE_SPINLOCK(dlm_cb_seq_spin);
@@ -308,6 +309,6 @@ void dlm_callback_resume(struct dlm_ls *ls)
 	mutex_unlock(&ls->ls_cb_mutex);
 
 	if (count)
-		log_debug(ls, "dlm_callback_resume %d", count);
+		log_rinfo(ls, "dlm_callback_resume %d", count);
 }
 
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 278a75cda446..d975851a7e1e 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -68,7 +68,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
 	uint16_t namelen;
 	unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0;
 
-	log_debug(ls, "dlm_recover_directory");
+	log_rinfo(ls, "dlm_recover_directory");
 
 	if (dlm_no_directory(ls))
 		goto out_status;
@@ -189,7 +189,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
 	error = 0;
 	dlm_set_recover_status(ls, DLM_RS_DIR);
 
-	log_debug(ls, "dlm_recover_directory %u in %u new",
+	log_rinfo(ls, "dlm_recover_directory %u in %u new",
 		  count, count_add);
  out_free:
 	kfree(last_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index e7665c31f7b1..5eff6ea3e27f 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -65,6 +65,8 @@ struct dlm_mhandle;
 	printk(KERN_ERR "dlm: "fmt"\n" , ##args)
 #define log_error(ls, fmt, args...) \
 	printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+#define log_rinfo(ls, fmt, args...) \
+	printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args);
 
 #define log_debug(ls, fmt, args...) \
 do { \
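Aside: one nit worth flagging in the new macro: its body ends with a semicolon, so every `log_rinfo(...);` call expands to a double semicolon. That is harmless as a bare statement (which is how all the converted call sites below use it), but it would break an unbraced if/else around a call. The conventional statement-safe form would be (sketch of the usual idiom, not what the patch as posted does):

	#define log_rinfo(ls, fmt, args...) \
	do { \
		printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
	} while (0)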
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e223a911a834..83f3d5520307 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -687,6 +687,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
 		log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
 			  from_nodeid, dir_nodeid, our_nodeid, r->res_name);
 		dlm_free_rsb(r);
+		r = NULL;
 		error = -ENOTBLK;
 		goto out_unlock;
 	}
@@ -5462,7 +5463,7 @@ void dlm_recover_purge(struct dlm_ls *ls)
 	up_write(&ls->ls_root_sem);
 
 	if (lkb_count)
-		log_debug(ls, "dlm_recover_purge %u locks for %u nodes",
+		log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes",
 			  lkb_count, nodes_count);
 }
 
@@ -5536,7 +5537,7 @@ void dlm_recover_grant(struct dlm_ls *ls)
 	}
 
 	if (lkb_count)
-		log_debug(ls, "dlm_recover_grant %u locks on %u resources",
+		log_rinfo(ls, "dlm_recover_grant %u locks on %u resources",
 			  lkb_count, rsb_count);
 }
 
@@ -5695,7 +5696,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
 	put_rsb(r);
  out:
 	if (error && error != -EEXIST)
-		log_debug(ls, "dlm_recover_master_copy remote %d %x error %d",
+		log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
 			  from_nodeid, remid, error);
 	rl->rl_result = cpu_to_le32(error);
 	return error;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d5abafd56a6d..04d6398c1f1c 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -190,7 +190,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
 	else
 		kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
 
-	log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving");
+	log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving");
 
 	/* dlm_controld will see the uevent, do the necessary group management
 	   and then write to sysfs to wake us */
@@ -198,7 +198,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
 	error = wait_event_interruptible(ls->ls_uevent_wait,
 			test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
 
-	log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result);
+	log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result);
 
 	if (error)
 		goto out;
@@ -640,7 +640,7 @@ static int new_lockspace(const char *name, const char *cluster,
 
 	dlm_create_debug_file(ls);
 
-	log_debug(ls, "join complete");
+	log_rinfo(ls, "join complete");
 	*lockspace = ls;
 	return 0;
 
@@ -835,7 +835,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 	dlm_clear_members(ls);
 	dlm_clear_members_gone(ls);
 	kfree(ls->ls_node_array);
-	log_debug(ls, "release_lockspace final free");
+	log_rinfo(ls, "release_lockspace final free");
 	kobject_put(&ls->ls_kobj);
 	/* The ls structure will be freed when the kobject is done with */
 
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 476557b54921..9c47f1c14a8b 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -60,18 +60,15 @@ void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
 
 #define SLOT_DEBUG_LINE 128
 
-static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
-			    struct rcom_slot *ro0, struct dlm_slot *array,
-			    int array_size)
+static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
+		      struct rcom_slot *ro0, struct dlm_slot *array,
+		      int array_size)
 {
 	char line[SLOT_DEBUG_LINE];
 	int len = SLOT_DEBUG_LINE - 1;
 	int pos = 0;
 	int ret, i;
 
-	if (!dlm_config.ci_log_debug)
-		return;
-
 	memset(line, 0, sizeof(line));
 
 	if (array) {
@@ -95,7 +92,7 @@ static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
 		}
 	}
 
-	log_debug(ls, "generation %u slots %d%s", gen, num_slots, line);
+	log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line);
 }
 
 int dlm_slots_copy_in(struct dlm_ls *ls)
@@ -129,7 +126,7 @@ int dlm_slots_copy_in(struct dlm_ls *ls)
 		ro->ro_slot = le16_to_cpu(ro->ro_slot);
 	}
 
-	log_debug_slots(ls, gen, num_slots, ro0, NULL, 0);
+	log_slots(ls, gen, num_slots, ro0, NULL, 0);
 
 	list_for_each_entry(memb, &ls->ls_nodes, list) {
 		for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
@@ -274,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
 
 	gen++;
 
-	log_debug_slots(ls, gen, num, NULL, array, array_size);
+	log_slots(ls, gen, num, NULL, array, array_size);
 
 	max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
 		     sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
@@ -447,7 +444,7 @@ static int ping_members(struct dlm_ls *ls)
 			break;
 	}
 	if (error)
-		log_debug(ls, "ping_members aborted %d last nodeid %d",
+		log_rinfo(ls, "ping_members aborted %d last nodeid %d",
 			  error, ls->ls_recover_nodeid);
 	return error;
 }
@@ -539,7 +536,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 	   count as a negative change so the "neg" recovery steps will happen */
 
 	list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
-		log_debug(ls, "prev removed member %d", memb->nodeid);
+		log_rinfo(ls, "prev removed member %d", memb->nodeid);
 		neg++;
 	}
 
@@ -551,10 +548,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 			continue;
 
 		if (!node) {
-			log_debug(ls, "remove member %d", memb->nodeid);
+			log_rinfo(ls, "remove member %d", memb->nodeid);
 		} else {
 			/* removed and re-added */
-			log_debug(ls, "remove member %d comm_seq %u %u",
+			log_rinfo(ls, "remove member %d comm_seq %u %u",
 				  memb->nodeid, memb->comm_seq, node->comm_seq);
 		}
 
@@ -571,7 +568,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 		if (dlm_is_member(ls, node->nodeid))
 			continue;
 		dlm_add_member(ls, node);
-		log_debug(ls, "add member %d", node->nodeid);
+		log_rinfo(ls, "add member %d", node->nodeid);
 	}
 
 	list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -591,7 +588,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 		complete(&ls->ls_members_done);
 	}
 
-	log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
+	log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
 	return error;
 }
 
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index a6bc63f6e31b..eaea789bf97d 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -526,7 +526,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
 	int nodir = dlm_no_directory(ls);
 	int error;
 
-	log_debug(ls, "dlm_recover_masters");
+	log_rinfo(ls, "dlm_recover_masters");
 
 	down_read(&ls->ls_root_sem);
 	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
@@ -552,7 +552,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
 	}
 	up_read(&ls->ls_root_sem);
 
-	log_debug(ls, "dlm_recover_masters %u of %u", count, total);
+	log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
 
 	error = dlm_wait_function(ls, &recover_idr_empty);
  out:
@@ -685,7 +685,7 @@ int dlm_recover_locks(struct dlm_ls *ls)
 	}
 	up_read(&ls->ls_root_sem);
 
-	log_debug(ls, "dlm_recover_locks %d out", count);
+	log_rinfo(ls, "dlm_recover_locks %d out", count);
 
 	error = dlm_wait_function(ls, &recover_list_empty);
  out:
@@ -883,7 +883,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
 	up_read(&ls->ls_root_sem);
 
 	if (count)
-		log_debug(ls, "dlm_recover_rsbs %d done", count);
+		log_rinfo(ls, "dlm_recover_rsbs %d done", count);
 }
 
 /* Create a single list of all root rsb's to be used during recovery */
@@ -950,6 +950,6 @@ void dlm_clear_toss(struct dlm_ls *ls)
 	}
 
 	if (count)
-		log_debug(ls, "dlm_clear_toss %u done", count);
+		log_rinfo(ls, "dlm_clear_toss %u done", count);
 }
 
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 32f9f8926ec3..6859b4bf971e 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -55,7 +55,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 	unsigned long start;
 	int error, neg = 0;
 
-	log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
+	log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
 
 	mutex_lock(&ls->ls_recoverd_active);
 
@@ -76,7 +76,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	error = dlm_recover_members(ls, rv, &neg);
 	if (error) {
-		log_debug(ls, "dlm_recover_members error %d", error);
+		log_rinfo(ls, "dlm_recover_members error %d", error);
 		goto fail;
 	}
 
@@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	error = dlm_recover_members_wait(ls);
 	if (error) {
-		log_debug(ls, "dlm_recover_members_wait error %d", error);
+		log_rinfo(ls, "dlm_recover_members_wait error %d", error);
 		goto fail;
 	}
 
@@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	error = dlm_recover_directory(ls);
 	if (error) {
-		log_debug(ls, "dlm_recover_directory error %d", error);
+		log_rinfo(ls, "dlm_recover_directory error %d", error);
 		goto fail;
 	}
 
@@ -111,11 +111,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	error = dlm_recover_directory_wait(ls);
 	if (error) {
-		log_debug(ls, "dlm_recover_directory_wait error %d", error);
+		log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
 		goto fail;
 	}
 
-	log_debug(ls, "dlm_recover_directory %u out %u messages",
+	log_rinfo(ls, "dlm_recover_directory %u out %u messages",
 		  ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
 
 	/*
@@ -144,7 +144,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	error = dlm_recover_masters(ls);
 	if (error) {
-		log_debug(ls, "dlm_recover_masters error %d", error);
+		log_rinfo(ls, "dlm_recover_masters error %d", error);
 		goto fail;
 	}
 
@@ -154,7 +154,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	error = dlm_recover_locks(ls);
 	if (error) {
-		log_debug(ls, "dlm_recover_locks error %d", error);
+		log_rinfo(ls, "dlm_recover_locks error %d", error);
 		goto fail;
 	}
 
@@ -162,11 +162,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	error = dlm_recover_locks_wait(ls);
 	if (error) {
-		log_debug(ls, "dlm_recover_locks_wait error %d", error);
+		log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
 		goto fail;
 	}
 
-	log_debug(ls, "dlm_recover_locks %u in",
+	log_rinfo(ls, "dlm_recover_locks %u in",
 		  ls->ls_recover_locks_in);
 
 	/*
@@ -186,7 +186,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 		error = dlm_recover_locks_wait(ls);
 		if (error) {
-			log_debug(ls, "dlm_recover_locks_wait error %d", error);
+			log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
 			goto fail;
 		}
 	}
@@ -205,7 +205,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	error = dlm_recover_done_wait(ls);
 	if (error) {
-		log_debug(ls, "dlm_recover_done_wait error %d", error);
+		log_rinfo(ls, "dlm_recover_done_wait error %d", error);
 		goto fail;
 	}
 
@@ -217,25 +217,25 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
 	error = enable_locking(ls, rv->seq);
 	if (error) {
-		log_debug(ls, "enable_locking error %d", error);
+		log_rinfo(ls, "enable_locking error %d", error);
 		goto fail;
 	}
 
 	error = dlm_process_requestqueue(ls);
 	if (error) {
-		log_debug(ls, "dlm_process_requestqueue error %d", error);
+		log_rinfo(ls, "dlm_process_requestqueue error %d", error);
 		goto fail;
 	}
 
 	error = dlm_recover_waiters_post(ls);
 	if (error) {
-		log_debug(ls, "dlm_recover_waiters_post error %d", error);
+		log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
 		goto fail;
 	}
 
 	dlm_recover_grant(ls);
 
-	log_debug(ls, "dlm_recover %llu generation %u done: %u ms",
+	log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
 		  (unsigned long long)rv->seq, ls->ls_generation,
 		  jiffies_to_msecs(jiffies - start));
 	mutex_unlock(&ls->ls_recoverd_active);
@@ -245,7 +245,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 
  fail:
 	dlm_release_root_list(ls);
-	log_debug(ls, "dlm_recover %llu error %d",
+	log_rinfo(ls, "dlm_recover %llu error %d",
 		  (unsigned long long)rv->seq, error);
 	mutex_unlock(&ls->ls_recoverd_active);
 	return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 9fd702f5bfb2..9280202e488c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -59,10 +59,22 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
 	if (ret)
 		return ret;
 	if (write) {
-		if (sysctl_drop_caches & 1)
+		static int stfu;
+
+		if (sysctl_drop_caches & 1) {
 			iterate_supers(drop_pagecache_sb, NULL);
-		if (sysctl_drop_caches & 2)
+			count_vm_event(DROP_PAGECACHE);
+		}
+		if (sysctl_drop_caches & 2) {
 			drop_slab();
+			count_vm_event(DROP_SLAB);
+		}
+		if (!stfu) {
+			pr_info("%s (%d): drop_caches: %d\n",
+				current->comm, task_pid_nr(current),
+				sysctl_drop_caches);
+		}
+		stfu |= sysctl_drop_caches & 4;
 	}
 	return 0;
 }
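Aside: behaviorally, writes to /proc/sys/vm/drop_caches now bump the DROP_PAGECACHE/DROP_SLAB vmstat counters and log the writer on every write until bit 2 (value 4) latches the static `stfu` flag. A minimal userspace driver (sketch; error handling kept short):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* 1|2|4 = drop pagecache + slab; the 4 silences future log
		 * lines (this first write is itself still logged). */
		if (write(fd, "7", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}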
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index b167ca48b8ee..d4a9431ec73c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -641,7 +641,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 	rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
 			lower_new_dir_dentry->d_inode, lower_new_dentry,
-			NULL);
+			NULL, 0);
 	if (rc)
 		goto out_lock;
 	if (target_inode)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index e879cf8ff0b1..afa1b81c3418 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -132,7 +132,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
  */
 static void ecryptfs_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	iput(ecryptfs_inode_to_lower(inode));
 }
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 8dd524f32284..cdb2971192a5 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -21,7 +21,7 @@ static ssize_t efivarfs_file_write(struct file *file,
 	u32 attributes;
 	struct inode *inode = file->f_mapping->host;
 	unsigned long datasize = count - sizeof(attributes);
-	ssize_t bytes = 0;
+	ssize_t bytes;
 	bool set = false;
 
 	if (count < sizeof(attributes))
@@ -33,14 +33,9 @@ static ssize_t efivarfs_file_write(struct file *file,
 	if (attributes & ~(EFI_VARIABLE_MASK))
 		return -EINVAL;
 
-	data = kmalloc(datasize, GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) {
-		bytes = -EFAULT;
-		goto out;
-	}
+	data = memdup_user(userbuf + sizeof(attributes), datasize);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
 
 	bytes = efivar_entry_set_get_size(var, attributes, &datasize,
 					  data, &set);
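Aside: memdup_user() folds the kmalloc/copy_from_user/unwind triple into a single call whose error is carried in the returned pointer. The two shapes side by side (sketch; kernel context assumed, helper names illustrative):

	static void *copy_in_old_style(const void __user *ubuf, size_t len)
	{
		void *buf = kmalloc(len, GFP_KERNEL);

		if (!buf)
			return ERR_PTR(-ENOMEM);
		if (copy_from_user(buf, ubuf, len)) {
			kfree(buf);
			return ERR_PTR(-EFAULT);
		}
		return buf;
	}

	static void *copy_in_new_style(const void __user *ubuf, size_t len)
	{
		return memdup_user(ubuf, len);	/* ERR_PTR on failure */
	}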
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 50215bbd6463..3befcc9f5d63 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -91,7 +91,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	efs_inode_cachep = kmem_cache_create("efs_inode_cache",
 				sizeof(struct efs_inode_info),
@@ -114,6 +114,7 @@ static void destroy_inodecache(void)
 
 static int efs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_RDONLY;
 	return 0;
 }
diff --git a/fs/exec.c b/fs/exec.c
index 3d78fccdd723..9e81c630dfa7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/swap.h>
@@ -97,6 +98,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
 	module_put(fmt->module);
 }
 
+#ifdef CONFIG_USELIB
 /*
  * Note that a shared library must be both readable and executable due to
  * security reasons.
@@ -156,6 +158,7 @@ exit:
 out:
 	return error;
 }
+#endif /* #ifdef CONFIG_USELIB */
 
 #ifdef CONFIG_MMU
 /*
@@ -820,7 +823,7 @@ EXPORT_SYMBOL(read_code);
 static int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
-	struct mm_struct * old_mm, *active_mm;
+	struct mm_struct *old_mm, *active_mm;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
@@ -846,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	tsk->mm->vmacache_seqnum = 0;
+	vmacache_flush(tsk);
 	task_unlock(tsk);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
@@ -1041,7 +1046,7 @@ EXPORT_SYMBOL_GPL(get_task_comm);
  * so that a new one can be started
  */
 
-void set_task_comm(struct task_struct *tsk, char *buf)
+void set_task_comm(struct task_struct *tsk, const char *buf)
 {
 	task_lock(tsk);
 	trace_task_rename(tsk, buf);
@@ -1050,21 +1055,6 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 	perf_event_comm(tsk);
 }
 
-static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
-{
-	int i, ch;
-
-	/* Copies the binary name from after last slash */
-	for (i = 0; (ch = *(fn++)) != '\0';) {
-		if (ch == '/')
-			i = 0; /* overwrite what we wrote */
-		else
-			if (i < len - 1)
-				tcomm[i++] = ch;
-	}
-	tcomm[i] = '\0';
-}
-
 int flush_old_exec(struct linux_binprm * bprm)
 {
 	int retval;
@@ -1078,8 +1068,6 @@ int flush_old_exec(struct linux_binprm * bprm)
1078 goto out; 1068 goto out;
1079 1069
1080 set_mm_exe_file(bprm->mm, bprm->file); 1070 set_mm_exe_file(bprm->mm, bprm->file);
1081
1082 filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
1083 /* 1071 /*
1084 * Release all of the old mmap stuff 1072 * Release all of the old mmap stuff
1085 */ 1073 */
@@ -1122,7 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm)
1122 else 1110 else
1123 set_dumpable(current->mm, suid_dumpable); 1111 set_dumpable(current->mm, suid_dumpable);
1124 1112
1125 set_task_comm(current, bprm->tcomm); 1113 set_task_comm(current, kbasename(bprm->filename));
1126 1114
1127 /* Set the new mm task size. We have to do that late because it may 1115 /* Set the new mm task size. We have to do that late because it may
1128 * depend on TIF_32BIT which is only updated in flush_thread() on 1116 * depend on TIF_32BIT which is only updated in flush_thread() on
@@ -1619,9 +1607,9 @@ SYSCALL_DEFINE3(execve,
1619 return do_execve(getname(filename), argv, envp); 1607 return do_execve(getname(filename), argv, envp);
1620} 1608}
1621#ifdef CONFIG_COMPAT 1609#ifdef CONFIG_COMPAT
1622asmlinkage long compat_sys_execve(const char __user * filename, 1610COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1623 const compat_uptr_t __user * argv, 1611 const compat_uptr_t __user *, argv,
1624 const compat_uptr_t __user * envp) 1612 const compat_uptr_t __user *, envp)
1625{ 1613{
1626 return compat_do_execve(getname(filename), argv, envp); 1614 return compat_do_execve(getname(filename), argv, envp);
1627} 1615}
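The deleted filename_to_taskname() loop open-coded what kbasename() from <linux/string.h> already provides. A small userspace model of that helper, hedged in that the TASK_COMM_LEN truncation done inside set_task_comm() is omitted here:

#include <stdio.h>
#include <string.h>

/* Userspace model of kbasename(): return the component after the last
 * '/', or the whole string when there is no slash. setup_new_exec()
 * now feeds this result straight to set_task_comm() instead of
 * copying through the removed bprm->tcomm buffer.
 */
static const char *kbasename_model(const char *path)
{
	const char *tail = strrchr(path, '/');
	return tail ? tail + 1 : path;
}

int main(void)
{
	printf("%s\n", kbasename_model("/usr/bin/sleep"));	/* sleep */
	printf("%s\n", kbasename_model("busybox"));		/* busybox */
	return 0;
}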
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ee4317faccb1..d1c244d67667 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1486,7 +1486,7 @@ void exofs_evict_inode(struct inode *inode)
1486 struct ore_io_state *ios; 1486 struct ore_io_state *ios;
1487 int ret; 1487 int ret;
1488 1488
1489 truncate_inode_pages(&inode->i_data, 0); 1489 truncate_inode_pages_final(&inode->i_data);
1490 1490
1491 /* TODO: should do better here */ 1491 /* TODO: should do better here */
1492 if (inode->i_nlink || is_bad_inode(inode)) 1492 if (inode->i_nlink || is_bad_inode(inode))
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 1b8001bbe947..27695e6f4e46 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> 4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */ 5 */
6 6
7#include <linux/capability.h>
8#include <linux/init.h> 7#include <linux/init.h>
9#include <linux/sched.h> 8#include <linux/sched.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7cadd823bb31..7d66fb0e4cca 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -284,7 +284,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
284 int best_ndir = inodes_per_group; 284 int best_ndir = inodes_per_group;
285 int best_group = -1; 285 int best_group = -1;
286 286
287 get_random_bytes(&group, sizeof(group)); 287 group = prandom_u32();
288 parent_group = (unsigned)group % ngroups; 288 parent_group = (unsigned)group % ngroups;
289 for (i = 0; i < ngroups; i++) { 289 for (i = 0; i < ngroups; i++) {
290 group = (parent_group + i) % ngroups; 290 group = (parent_group + i) % ngroups;
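find_group_orlov() only needs a cheap, roughly uniform starting point for its round-robin scan, so the cryptographic get_random_bytes() was overkill; ext3/ialloc.c receives the identical change below. A userspace model of the pick, with rand() standing in for prandom_u32():

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Model of the starting-group selection: the random value only seeds
 * where the scan begins, so statistical quality barely matters.
 */
int main(void)
{
	unsigned int ngroups = 128;
	unsigned int parent_group;

	srand(time(NULL));
	parent_group = (unsigned int)rand() % ngroups;	/* prandom_u32() stand-in */
	for (unsigned int i = 0; i < ngroups; i++) {
		unsigned int group = (parent_group + i) % ngroups;
		/* ...score `group` exactly as find_group_orlov() does... */
		(void)group;
	}
	printf("scan starts at group %u\n", parent_group);
	return 0;
}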
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 94ed36849b71..b1d2a4675d42 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -78,7 +78,7 @@ void ext2_evict_inode(struct inode * inode)
78 dquot_drop(inode); 78 dquot_drop(inode);
79 } 79 }
80 80
81 truncate_inode_pages(&inode->i_data, 0); 81 truncate_inode_pages_final(&inode->i_data);
82 82
83 if (want_delete) { 83 if (want_delete) {
84 sb_start_intwrite(inode->i_sb); 84 sb_start_intwrite(inode->i_sb);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 20d6697bd638..3750031cfa2f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -192,7 +192,7 @@ static void init_once(void *foo)
192 inode_init_once(&ei->vfs_inode); 192 inode_init_once(&ei->vfs_inode);
193} 193}
194 194
195static int init_inodecache(void) 195static int __init init_inodecache(void)
196{ 196{
197 ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", 197 ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
198 sizeof(struct ext2_inode_info), 198 sizeof(struct ext2_inode_info),
@@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1254 unsigned long old_sb_flags; 1254 unsigned long old_sb_flags;
1255 int err; 1255 int err;
1256 1256
1257 sync_filesystem(sb);
1257 spin_lock(&sbi->s_lock); 1258 spin_lock(&sbi->s_lock);
1258 1259
1259 /* Store the old options */ 1260 /* Store the old options */
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index cfedb2cb0d8c..c0ebc4db8849 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -42,8 +42,8 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
42 value, size, flags); 42 value, size, flags);
43} 43}
44 44
45int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, 45static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
46 void *fs_info) 46 void *fs_info)
47{ 47{
48 const struct xattr *xattr; 48 const struct xattr *xattr;
49 int err = 0; 49 int err = 0;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 22548f56197b..158b5d4ce067 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1727,10 +1727,7 @@ allocated:
1727 percpu_counter_sub(&sbi->s_freeblocks_counter, num); 1727 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1728 1728
1729 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); 1729 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1730 err = ext3_journal_dirty_metadata(handle, gdp_bh); 1730 fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
1731 if (!fatal)
1732 fatal = err;
1733
1734 if (fatal) 1731 if (fatal)
1735 goto out; 1732 goto out;
1736 1733
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e66e4808719f..17742eed2c16 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -275,7 +275,7 @@ static inline loff_t ext3_get_htree_eof(struct file *filp)
275 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) 275 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
276 * will be invalid once the directory was converted into a dx directory 276 * will be invalid once the directory was converted into a dx directory
277 */ 277 */
278loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) 278static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
279{ 279{
280 struct inode *inode = file->f_mapping->host; 280 struct inode *inode = file->f_mapping->host;
281 int dx_dir = is_dx_dir(inode); 281 int dx_dir = is_dx_dir(inode);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 082afd78b107..a1b810230cc5 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -215,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
215 int best_ndir = inodes_per_group; 215 int best_ndir = inodes_per_group;
216 int best_group = -1; 216 int best_group = -1;
217 217
218 get_random_bytes(&group, sizeof(group)); 218 group = prandom_u32();
219 parent_group = (unsigned)group % ngroups; 219 parent_group = (unsigned)group % ngroups;
220 for (i = 0; i < ngroups; i++) { 220 for (i = 0; i < ngroups; i++) {
221 group = (parent_group + i) % ngroups; 221 group = (parent_group + i) % ngroups;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 384b6ebb655f..f5157d0d1b43 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -228,7 +228,7 @@ void ext3_evict_inode (struct inode *inode)
228 log_wait_commit(journal, commit_tid); 228 log_wait_commit(journal, commit_tid);
229 filemap_write_and_wait(&inode->i_data); 229 filemap_write_and_wait(&inode->i_data);
230 } 230 }
231 truncate_inode_pages(&inode->i_data, 0); 231 truncate_inode_pages_final(&inode->i_data);
232 232
233 ext3_discard_reservation(inode); 233 ext3_discard_reservation(inode);
234 rsv = ei->i_block_alloc_info; 234 rsv = ei->i_block_alloc_info;
@@ -1559,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
1559} 1559}
1560 1560
1561/* 1561/*
1562 * Note that we always start a transaction even if we're not journalling 1562 * Note that whenever we need to map blocks we start a transaction even if
1563 * data. This is to preserve ordering: any hole instantiation within 1563 * we're not journalling data. This is to preserve ordering: any hole
1564 * __block_write_full_page -> ext3_get_block() should be journalled 1564 * instantiation within __block_write_full_page -> ext3_get_block() should be
1565 * along with the data so we don't crash and then get metadata which 1565 * journalled along with the data so we don't crash and then get metadata which
1566 * refers to old data. 1566 * refers to old data.
1567 * 1567 *
1568 * In all journalling modes block_write_full_page() will start the I/O. 1568 * In all journalling modes block_write_full_page() will start the I/O.
1569 * 1569 *
1570 * Problem:
1571 *
1572 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1573 * ext3_writepage()
1574 *
1575 * Similar for:
1576 *
1577 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1578 *
1579 * Same applies to ext3_get_block(). We will deadlock on various things like
1580 * lock_journal and i_truncate_mutex.
1581 *
1582 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1583 * allocations fail.
1584 *
1585 * 16May01: If we're reentered then journal_current_handle() will be
1586 * non-zero. We simply *return*.
1587 *
1588 * 1 July 2001: @@@ FIXME:
1589 * In journalled data mode, a data buffer may be metadata against the
1590 * current transaction. But the same file is part of a shared mapping
1591 * and someone does a writepage() on it.
1592 *
1593 * We will move the buffer onto the async_data list, but *after* it has
1594 * been dirtied. So there's a small window where we have dirty data on
1595 * BJ_Metadata.
1596 *
1597 * Note that this only applies to the last partial page in the file. The
1598 * bit which block_write_full_page() uses prepare/commit for. (That's
1599 * broken code anyway: it's wrong for msync()).
1600 *
1601 * It's a rare case: affects the final partial page, for journalled data
1602 * where the file is subject to bith write() and writepage() in the same
1603 * transction. To fix it we'll need a custom block_write_full_page().
1604 * We'll probably need that anyway for journalling writepage() output.
1605 *
1606 * We don't honour synchronous mounts for writepage(). That would be 1570 * We don't honour synchronous mounts for writepage(). That would be
1607 * disastrous. Any write() or metadata operation will sync the fs for 1571 * disastrous. Any write() or metadata operation will sync the fs for
1608 * us. 1572 * us.
1609 *
1610 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1611 * we don't need to open a transaction here.
1612 */ 1573 */
1613static int ext3_ordered_writepage(struct page *page, 1574static int ext3_ordered_writepage(struct page *page,
1614 struct writeback_control *wbc) 1575 struct writeback_control *wbc)
@@ -1673,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page,
1673 * block_write_full_page() succeeded. Otherwise they are unmapped, 1634 * block_write_full_page() succeeded. Otherwise they are unmapped,
1674 * and generally junk. 1635 * and generally junk.
1675 */ 1636 */
1676 if (ret == 0) { 1637 if (ret == 0)
1677 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, 1638 ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1678 NULL, journal_dirty_data_fn); 1639 NULL, journal_dirty_data_fn);
1679 if (!ret)
1680 ret = err;
1681 }
1682 walk_page_buffers(handle, page_bufs, 0, 1640 walk_page_buffers(handle, page_bufs, 0,
1683 PAGE_CACHE_SIZE, NULL, bput_one); 1641 PAGE_CACHE_SIZE, NULL, bput_one);
1684 err = ext3_journal_stop(handle); 1642 err = ext3_journal_stop(handle);
@@ -1925,6 +1883,8 @@ retry:
1925 * and pretend the write failed... */ 1883 * and pretend the write failed... */
1926 ext3_truncate_failed_direct_write(inode); 1884 ext3_truncate_failed_direct_write(inode);
1927 ret = PTR_ERR(handle); 1885 ret = PTR_ERR(handle);
1886 if (inode->i_nlink)
1887 ext3_orphan_del(NULL, inode);
1928 goto out; 1888 goto out;
1929 } 1889 }
1930 if (inode->i_nlink) 1890 if (inode->i_nlink)
@@ -3212,21 +3172,20 @@ out_brelse:
3212 * 3172 *
3213 * We are called from a few places: 3173 * We are called from a few places:
3214 * 3174 *
3215 * - Within generic_file_write() for O_SYNC files. 3175 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
3216 * Here, there will be no transaction running. We wait for any running 3176 * Here, there will be no transaction running. We wait for any running
3217 * transaction to commit. 3177 * transaction to commit.
3218 * 3178 *
3219 * - Within sys_sync(), kupdate and such. 3179 * - Within flush work (for sys_sync(), kupdate and such).
3220 * We wait on commit, if tol to. 3180 * We wait on commit, if told to.
3221 * 3181 *
3222 * - Within prune_icache() (PF_MEMALLOC == true) 3182 * - Within iput_final() -> write_inode_now()
3223 * Here we simply return. We can't afford to block kswapd on the 3183 * We wait on commit, if told to.
3224 * journal commit.
3225 * 3184 *
3226 * In all cases it is actually safe for us to return without doing anything, 3185 * In all cases it is actually safe for us to return without doing anything,
3227 * because the inode has been copied into a raw inode buffer in 3186 * because the inode has been copied into a raw inode buffer in
3228 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 3187 * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
3229 * knfsd. 3188 * writeback.
3230 * 3189 *
3231 * Note that we are absolutely dependent upon all inode dirtiers doing the 3190 * Note that we are absolutely dependent upon all inode dirtiers doing the
3232 * right thing: they *must* call mark_inode_dirty() after dirtying info in 3191 * right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3238,13 +3197,13 @@ out_brelse:
3238 * stuff(); 3197 * stuff();
3239 * inode->i_size = expr; 3198 * inode->i_size = expr;
3240 * 3199 *
3241 * is in error because a kswapd-driven write_inode() could occur while 3200 * is in error because write_inode() could occur while `stuff()' is running,
3242 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3201 * and the new i_size will be lost. Plus the inode will no longer be on the
3243 * will no longer be on the superblock's dirty inode list. 3202 * superblock's dirty inode list.
3244 */ 3203 */
3245int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) 3204int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3246{ 3205{
3247 if (current->flags & PF_MEMALLOC) 3206 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
3248 return 0; 3207 return 0;
3249 3208
3250 if (ext3_journal_current_handle()) { 3209 if (ext3_journal_current_handle()) {
@@ -3253,7 +3212,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3253 return -EIO; 3212 return -EIO;
3254 } 3213 }
3255 3214
3256 if (wbc->sync_mode != WB_SYNC_ALL) 3215 /*
3216 * No need to force transaction in WB_SYNC_NONE mode. Also
3217 * ext3_sync_fs() will force the commit after everything is
3218 * written.
3219 */
3220 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
3257 return 0; 3221 return 0;
3258 3222
3259 return ext3_force_commit(inode->i_sb); 3223 return ext3_force_commit(inode->i_sb);
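A condensed userspace model of the new ext3_write_inode() short-circuits (a sketch of the decision only, not the locking or journalling around it): only WB_SYNC_ALL writeback that is not driven by sync(2) pays for a journal commit here, because ext3_sync_fs() forces the commit for the sync(2) case itself.

#include <stdio.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

static const char *action(enum sync_mode mode, int for_sync)
{
	if (mode != WB_SYNC_ALL || for_sync)
		return "return 0 (commit forced elsewhere or not needed)";
	return "ext3_force_commit()";
}

int main(void)
{
	printf("WB_SYNC_NONE            -> %s\n", action(WB_SYNC_NONE, 0));
	printf("WB_SYNC_ALL via sync(2) -> %s\n", action(WB_SYNC_ALL, 1));
	printf("WB_SYNC_ALL, write_inode_now -> %s\n", action(WB_SYNC_ALL, 0));
	return 0;
}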
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 37fd31ed16e7..08cdfe5461e3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -527,7 +527,7 @@ static void init_once(void *foo)
527 inode_init_once(&ei->vfs_inode); 527 inode_init_once(&ei->vfs_inode);
528} 528}
529 529
530static int init_inodecache(void) 530static int __init init_inodecache(void)
531{ 531{
532 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", 532 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
533 sizeof(struct ext3_inode_info), 533 sizeof(struct ext3_inode_info),
@@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2649 int i; 2649 int i;
2650#endif 2650#endif
2651 2651
2652 sync_filesystem(sb);
2653
2652 /* Store the original options */ 2654 /* Store the original options */
2653 old_sb_flags = sb->s_flags; 2655 old_sb_flags = sb->s_flags;
2654 old_opts.s_mount_opt = sbi->s_mount_opt; 2656 old_opts.s_mount_opt = sbi->s_mount_opt;
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3387664ad70e..722c2bf9645d 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -43,8 +43,9 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
43 name, value, size, flags); 43 name, value, size, flags);
44} 44}
45 45
46int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, 46static int ext3_initxattrs(struct inode *inode,
47 void *fs_info) 47 const struct xattr *xattr_array,
48 void *fs_info)
48{ 49{
49 const struct xattr *xattr; 50 const struct xattr *xattr;
50 handle_t *handle = fs_info; 51 handle_t *handle = fs_info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d3a534fdc5ff..f1c65dc7cc0a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -31,6 +31,7 @@
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#include <linux/ratelimit.h> 32#include <linux/ratelimit.h>
33#include <crypto/hash.h> 33#include <crypto/hash.h>
34#include <linux/falloc.h>
34#ifdef __KERNEL__ 35#ifdef __KERNEL__
35#include <linux/compat.h> 36#include <linux/compat.h>
36#endif 37#endif
@@ -567,6 +568,8 @@ enum {
567#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 568#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
568 /* Do not put hole in extent cache */ 569 /* Do not put hole in extent cache */
569#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 570#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
571 /* Convert written extents to unwritten */
572#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
570 573
571/* 574/*
572 * The bit position of these flags must not overlap with any of the 575 * The bit position of these flags must not overlap with any of the
@@ -998,6 +1001,8 @@ struct ext4_inode_info {
998#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group 1001#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
999 size of blocksize * 8 1002 size of blocksize * 8
1000 blocks */ 1003 blocks */
1004#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
1005 file systems */
1001 1006
1002#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 1007#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
1003 ~EXT4_MOUNT_##opt 1008 ~EXT4_MOUNT_##opt
@@ -1326,6 +1331,7 @@ struct ext4_sb_info {
1326 struct list_head s_es_lru; 1331 struct list_head s_es_lru;
1327 unsigned long s_es_last_sorted; 1332 unsigned long s_es_last_sorted;
1328 struct percpu_counter s_extent_cache_cnt; 1333 struct percpu_counter s_extent_cache_cnt;
1334 struct mb_cache *s_mb_cache;
1329 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1335 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
1330 1336
1331 /* Ratelimit ext4 messages. */ 1337 /* Ratelimit ext4 messages. */
@@ -2133,8 +2139,6 @@ extern int ext4_writepage_trans_blocks(struct inode *);
2133extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 2139extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
2134extern int ext4_block_truncate_page(handle_t *handle, 2140extern int ext4_block_truncate_page(handle_t *handle,
2135 struct address_space *mapping, loff_t from); 2141 struct address_space *mapping, loff_t from);
2136extern int ext4_block_zero_page_range(handle_t *handle,
2137 struct address_space *mapping, loff_t from, loff_t length);
2138extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 2142extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
2139 loff_t lstart, loff_t lend); 2143 loff_t lstart, loff_t lend);
2140extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2144extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2757,6 +2761,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
2757extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2761extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2758 __u64 start, __u64 len); 2762 __u64 start, __u64 len);
2759extern int ext4_ext_precache(struct inode *inode); 2763extern int ext4_ext_precache(struct inode *inode);
2764extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
2760 2765
2761/* move_extent.c */ 2766/* move_extent.c */
2762extern void ext4_double_down_write_data_sem(struct inode *first, 2767extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2766,6 +2771,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
2766extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, 2771extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2767 __u64 start_orig, __u64 start_donor, 2772 __u64 start_orig, __u64 start_donor,
2768 __u64 len, __u64 *moved_len); 2773 __u64 len, __u64 *moved_len);
2774extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
2775 struct ext4_extent **extent);
2769 2776
2770/* page-io.c */ 2777/* page-io.c */
2771extern int __init ext4_init_pageio(void); 2778extern int __init ext4_init_pageio(void);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3fe29de832c8..c3fb607413ed 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
259 if (WARN_ON_ONCE(err)) { 259 if (WARN_ON_ONCE(err)) {
260 ext4_journal_abort_handle(where, line, __func__, bh, 260 ext4_journal_abort_handle(where, line, __func__, bh,
261 handle, err); 261 handle, err);
262 if (inode == NULL) {
263 pr_err("EXT4: jbd2_journal_dirty_metadata "
264 "failed: handle type %u started at "
265 "line %u, credits %u/%u, errcode %d",
266 handle->h_type,
267 handle->h_line_no,
268 handle->h_requested_credits,
269 handle->h_buffer_credits, err);
270 return err;
271 }
262 ext4_error_inode(inode, where, line, 272 ext4_error_inode(inode, where, line,
263 bh->b_blocknr, 273 bh->b_blocknr,
264 "journal_dirty_metadata failed: " 274 "journal_dirty_metadata failed: "
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74bc2d549c58..82df3ce9874a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -37,7 +37,6 @@
37#include <linux/quotaops.h> 37#include <linux/quotaops.h>
38#include <linux/string.h> 38#include <linux/string.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/falloc.h>
41#include <asm/uaccess.h> 40#include <asm/uaccess.h>
42#include <linux/fiemap.h> 41#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
@@ -1691,7 +1690,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1691 * the extent that was written properly split out and conversion to 1690 * the extent that was written properly split out and conversion to
1692 * initialized is trivial. 1691 * initialized is trivial.
1693 */ 1692 */
1694 if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) 1693 if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2))
1695 return 0; 1694 return 0;
1696 1695
1697 ext1_ee_len = ext4_ext_get_actual_len(ex1); 1696 ext1_ee_len = ext4_ext_get_actual_len(ex1);
@@ -1708,6 +1707,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1708 */ 1707 */
1709 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1708 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1710 return 0; 1709 return 0;
1710 if (ext4_ext_is_uninitialized(ex1) &&
1711 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1712 atomic_read(&EXT4_I(inode)->i_unwritten) ||
1713 (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN)))
1714 return 0;
1711#ifdef AGGRESSIVE_TEST 1715#ifdef AGGRESSIVE_TEST
1712 if (ext1_ee_len >= 4) 1716 if (ext1_ee_len >= 4)
1713 return 0; 1717 return 0;
@@ -1731,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
1731{ 1735{
1732 struct ext4_extent_header *eh; 1736 struct ext4_extent_header *eh;
1733 unsigned int depth, len; 1737 unsigned int depth, len;
1734 int merge_done = 0; 1738 int merge_done = 0, uninit;
1735 1739
1736 depth = ext_depth(inode); 1740 depth = ext_depth(inode);
1737 BUG_ON(path[depth].p_hdr == NULL); 1741 BUG_ON(path[depth].p_hdr == NULL);
@@ -1741,8 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
1741 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1745 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1742 break; 1746 break;
1743 /* merge with next extent! */ 1747 /* merge with next extent! */
1748 uninit = ext4_ext_is_uninitialized(ex);
1744 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1749 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1745 + ext4_ext_get_actual_len(ex + 1)); 1750 + ext4_ext_get_actual_len(ex + 1));
1751 if (uninit)
1752 ext4_ext_mark_uninitialized(ex);
1746 1753
1747 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1754 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1748 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1755 len = (EXT_LAST_EXTENT(eh) - ex - 1)
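The save/re-mark pattern in these merge hunks exists because ext4 encodes the uninitialized (unwritten) flag in the high bit of the 16-bit ee_len field: writing a plain summed length would silently convert the extent to initialized. A self-contained model of the encoding and the fix:

#include <assert.h>
#include <stdint.h>

#define EXT_INIT_MAX_LEN	32768u		/* 1 << 15 */

/* Matches ext4_ext_get_actual_len(): lengths above 32768 belong to
 * uninitialized extents and carry the flag in the high bit.
 */
static uint16_t get_actual_len(uint16_t ee_len)
{
	return ee_len <= EXT_INIT_MAX_LEN ? ee_len : ee_len - EXT_INIT_MAX_LEN;
}

int main(void)
{
	uint16_t ex = 100 + EXT_INIT_MAX_LEN;	/* uninit extent, len 100 */
	int uninit = ex > EXT_INIT_MAX_LEN;	/* remember the flag... */
	uint16_t merged = get_actual_len(ex) + 50;	/* naive new length */

	if (uninit)
		merged += EXT_INIT_MAX_LEN;	/* ...and re-mark, as the fix does */
	assert(get_actual_len(merged) == 150 && merged > EXT_INIT_MAX_LEN);
	return 0;
}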
@@ -1896,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1896 struct ext4_ext_path *npath = NULL; 1903 struct ext4_ext_path *npath = NULL;
1897 int depth, len, err; 1904 int depth, len, err;
1898 ext4_lblk_t next; 1905 ext4_lblk_t next;
1899 int mb_flags = 0; 1906 int mb_flags = 0, uninit;
1900 1907
1901 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1908 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1902 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1909 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1946,9 +1953,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1946 path + depth); 1953 path + depth);
1947 if (err) 1954 if (err)
1948 return err; 1955 return err;
1949 1956 uninit = ext4_ext_is_uninitialized(ex);
1950 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1957 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1951 + ext4_ext_get_actual_len(newext)); 1958 + ext4_ext_get_actual_len(newext));
1959 if (uninit)
1960 ext4_ext_mark_uninitialized(ex);
1952 eh = path[depth].p_hdr; 1961 eh = path[depth].p_hdr;
1953 nearex = ex; 1962 nearex = ex;
1954 goto merge; 1963 goto merge;
@@ -1971,10 +1980,13 @@ prepend:
1971 if (err) 1980 if (err)
1972 return err; 1981 return err;
1973 1982
1983 uninit = ext4_ext_is_uninitialized(ex);
1974 ex->ee_block = newext->ee_block; 1984 ex->ee_block = newext->ee_block;
1975 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); 1985 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
1976 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1986 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1977 + ext4_ext_get_actual_len(newext)); 1987 + ext4_ext_get_actual_len(newext));
1988 if (uninit)
1989 ext4_ext_mark_uninitialized(ex);
1978 eh = path[depth].p_hdr; 1990 eh = path[depth].p_hdr;
1979 nearex = ex; 1991 nearex = ex;
1980 goto merge; 1992 goto merge;
@@ -2585,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2585 ex_ee_block = le32_to_cpu(ex->ee_block); 2597 ex_ee_block = le32_to_cpu(ex->ee_block);
2586 ex_ee_len = ext4_ext_get_actual_len(ex); 2598 ex_ee_len = ext4_ext_get_actual_len(ex);
2587 2599
2600 /*
2601 * If we're starting with an extent other than the last one in the
2602 * node, we need to see if it shares a cluster with the extent to
2603 * the right (towards the end of the file). If its leftmost cluster
2604 * is this extent's rightmost cluster and it is not cluster aligned,
2605 * we'll mark it as a partial that is not to be deallocated.
2606 */
2607
2608 if (ex != EXT_LAST_EXTENT(eh)) {
2609 ext4_fsblk_t current_pblk, right_pblk;
2610 long long current_cluster, right_cluster;
2611
2612 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2613 current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2614 right_pblk = ext4_ext_pblock(ex + 1);
2615 right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2616 if (current_cluster == right_cluster &&
2617 EXT4_PBLK_COFF(sbi, right_pblk))
2618 *partial_cluster = -right_cluster;
2619 }
2620
2588 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2621 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2589 2622
2590 while (ex >= EXT_FIRST_EXTENT(eh) && 2623 while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2710,10 +2743,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2710 err = ext4_ext_correct_indexes(handle, inode, path); 2743 err = ext4_ext_correct_indexes(handle, inode, path);
2711 2744
2712 /* 2745 /*
2713 * Free the partial cluster only if the current extent does not 2746 * If there's a partial cluster and at least one extent remains in
2714 * reference it. Otherwise we might free used cluster. 2747 * the leaf, free the partial cluster if it isn't shared with the
2748 * current extent. If there's a partial cluster and no extents
2749 * remain in the leaf, it can't be freed here. It can only be
2750 * freed when it's possible to determine if it's not shared with
2751 * any other extent - when the next leaf is processed or when space
2752 * removal is complete.
2715 */ 2753 */
2716 if (*partial_cluster > 0 && 2754 if (*partial_cluster > 0 && eh->eh_entries &&
2717 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2755 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2718 *partial_cluster)) { 2756 *partial_cluster)) {
2719 int flags = get_default_free_blocks_flags(inode); 2757 int flags = get_default_free_blocks_flags(inode);
@@ -3569,6 +3607,8 @@ out:
3569 * b> Splits in two extents: Write is happening at either end of the extent 3607 * b> Splits in two extents: Write is happening at either end of the extent
3570 * c> Splits in three extents: Someone is writing in the middle of the extent 3608 * c> Splits in three extents: Someone is writing in the middle of the extent
3571 * 3609 *
3610 * This works the same way in the case of initialized -> unwritten conversion.
3611 *
3572 * One or more index blocks may be needed if the extent tree grows after 3612 * One or more index blocks may be needed if the extent tree grows after
3573 * the uninitialized extent split. To prevent ENOSPC occurring at IO 3613 * the uninitialized extent split. To prevent ENOSPC occurring at IO
3574 * completion, we need to split the uninitialized extent before DIO submit 3614 * completion, we need to split the uninitialized extent before DIO submit
@@ -3579,7 +3619,7 @@ out:
3579 * 3619 *
3580 * Returns the size of uninitialized extent to be written on success. 3620 * Returns the size of uninitialized extent to be written on success.
3581 */ 3621 */
3582static int ext4_split_unwritten_extents(handle_t *handle, 3622static int ext4_split_convert_extents(handle_t *handle,
3583 struct inode *inode, 3623 struct inode *inode,
3584 struct ext4_map_blocks *map, 3624 struct ext4_map_blocks *map,
3585 struct ext4_ext_path *path, 3625 struct ext4_ext_path *path,
@@ -3591,9 +3631,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3591 unsigned int ee_len; 3631 unsigned int ee_len;
3592 int split_flag = 0, depth; 3632 int split_flag = 0, depth;
3593 3633
3594 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3634 ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3595 "block %llu, max_blocks %u\n", inode->i_ino, 3635 __func__, inode->i_ino,
3596 (unsigned long long)map->m_lblk, map->m_len); 3636 (unsigned long long)map->m_lblk, map->m_len);
3597 3637
3598 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3638 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3599 inode->i_sb->s_blocksize_bits; 3639 inode->i_sb->s_blocksize_bits;
@@ -3608,14 +3648,73 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3608 ee_block = le32_to_cpu(ex->ee_block); 3648 ee_block = le32_to_cpu(ex->ee_block);
3609 ee_len = ext4_ext_get_actual_len(ex); 3649 ee_len = ext4_ext_get_actual_len(ex);
3610 3650
3611 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3651 /* Convert to unwritten */
3612 split_flag |= EXT4_EXT_MARK_UNINIT2; 3652 if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3613 if (flags & EXT4_GET_BLOCKS_CONVERT) 3653 split_flag |= EXT4_EXT_DATA_VALID1;
3614 split_flag |= EXT4_EXT_DATA_VALID2; 3654 /* Convert to initialized */
3655 } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3656 split_flag |= ee_block + ee_len <= eof_block ?
3657 EXT4_EXT_MAY_ZEROOUT : 0;
3658 split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
3659 }
3615 flags |= EXT4_GET_BLOCKS_PRE_IO; 3660 flags |= EXT4_GET_BLOCKS_PRE_IO;
3616 return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3661 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3617} 3662}
3618 3663
3664static int ext4_convert_initialized_extents(handle_t *handle,
3665 struct inode *inode,
3666 struct ext4_map_blocks *map,
3667 struct ext4_ext_path *path)
3668{
3669 struct ext4_extent *ex;
3670 ext4_lblk_t ee_block;
3671 unsigned int ee_len;
3672 int depth;
3673 int err = 0;
3674
3675 depth = ext_depth(inode);
3676 ex = path[depth].p_ext;
3677 ee_block = le32_to_cpu(ex->ee_block);
3678 ee_len = ext4_ext_get_actual_len(ex);
3679
3680 ext_debug("%s: inode %lu, logical "
3681 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3682 (unsigned long long)ee_block, ee_len);
3683
3684 if (ee_block != map->m_lblk || ee_len > map->m_len) {
3685 err = ext4_split_convert_extents(handle, inode, map, path,
3686 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3687 if (err < 0)
3688 goto out;
3689 ext4_ext_drop_refs(path);
3690 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3691 if (IS_ERR(path)) {
3692 err = PTR_ERR(path);
3693 goto out;
3694 }
3695 depth = ext_depth(inode);
3696 ex = path[depth].p_ext;
3697 }
3698
3699 err = ext4_ext_get_access(handle, inode, path + depth);
3700 if (err)
3701 goto out;
3702 /* first mark the extent as uninitialized */
3703 ext4_ext_mark_uninitialized(ex);
3704
3705 /* note: ext4_ext_correct_indexes() isn't needed here because
3706 * borders are not changed
3707 */
3708 ext4_ext_try_to_merge(handle, inode, path, ex);
3709
3710 /* Mark modified extent as dirty */
3711 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3712out:
3713 ext4_ext_show_leaf(inode, path);
3714 return err;
3715}
3716
3717
3619static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3718static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3620 struct inode *inode, 3719 struct inode *inode,
3621 struct ext4_map_blocks *map, 3720 struct ext4_map_blocks *map,
@@ -3649,8 +3748,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3649 inode->i_ino, (unsigned long long)ee_block, ee_len, 3748 inode->i_ino, (unsigned long long)ee_block, ee_len,
3650 (unsigned long long)map->m_lblk, map->m_len); 3749 (unsigned long long)map->m_lblk, map->m_len);
3651#endif 3750#endif
3652 err = ext4_split_unwritten_extents(handle, inode, map, path, 3751 err = ext4_split_convert_extents(handle, inode, map, path,
3653 EXT4_GET_BLOCKS_CONVERT); 3752 EXT4_GET_BLOCKS_CONVERT);
3654 if (err < 0) 3753 if (err < 0)
3655 goto out; 3754 goto out;
3656 ext4_ext_drop_refs(path); 3755 ext4_ext_drop_refs(path);
@@ -3851,6 +3950,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3851} 3950}
3852 3951
3853static int 3952static int
3953ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
3954 struct ext4_map_blocks *map,
3955 struct ext4_ext_path *path, int flags,
3956 unsigned int allocated, ext4_fsblk_t newblock)
3957{
3958 int ret = 0;
3959 int err = 0;
3960
3961 /*
3962 * Make sure that the extent is no bigger than we support with
3963 * an uninitialized extent
3964 */
3965 if (map->m_len > EXT_UNINIT_MAX_LEN)
3966 map->m_len = EXT_UNINIT_MAX_LEN / 2;
3967
3968 ret = ext4_convert_initialized_extents(handle, inode, map,
3969 path);
3970 if (ret >= 0) {
3971 ext4_update_inode_fsync_trans(handle, inode, 1);
3972 err = check_eofblocks_fl(handle, inode, map->m_lblk,
3973 path, map->m_len);
3974 } else
3975 err = ret;
3976 map->m_flags |= EXT4_MAP_UNWRITTEN;
3977 if (allocated > map->m_len)
3978 allocated = map->m_len;
3979 map->m_len = allocated;
3980
3981 return err ? err : allocated;
3982}
3983
3984static int
3854ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3985ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3855 struct ext4_map_blocks *map, 3986 struct ext4_map_blocks *map,
3856 struct ext4_ext_path *path, int flags, 3987 struct ext4_ext_path *path, int flags,
@@ -3877,8 +4008,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3877 4008
3878 /* get_block() before submit the IO, split the extent */ 4009 /* get_block() before submit the IO, split the extent */
3879 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 4010 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3880 ret = ext4_split_unwritten_extents(handle, inode, map, 4011 ret = ext4_split_convert_extents(handle, inode, map,
3881 path, flags); 4012 path, flags | EXT4_GET_BLOCKS_CONVERT);
3882 if (ret <= 0) 4013 if (ret <= 0)
3883 goto out; 4014 goto out;
3884 /* 4015 /*
@@ -3993,10 +4124,6 @@ out1:
3993 map->m_pblk = newblock; 4124 map->m_pblk = newblock;
3994 map->m_len = allocated; 4125 map->m_len = allocated;
3995out2: 4126out2:
3996 if (path) {
3997 ext4_ext_drop_refs(path);
3998 kfree(path);
3999 }
4000 return err ? err : allocated; 4127 return err ? err : allocated;
4001} 4128}
4002 4129
@@ -4128,7 +4255,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4128 struct ext4_extent newex, *ex, *ex2; 4255 struct ext4_extent newex, *ex, *ex2;
4129 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4256 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4130 ext4_fsblk_t newblock = 0; 4257 ext4_fsblk_t newblock = 0;
4131 int free_on_err = 0, err = 0, depth; 4258 int free_on_err = 0, err = 0, depth, ret;
4132 unsigned int allocated = 0, offset = 0; 4259 unsigned int allocated = 0, offset = 0;
4133 unsigned int allocated_clusters = 0; 4260 unsigned int allocated_clusters = 0;
4134 struct ext4_allocation_request ar; 4261 struct ext4_allocation_request ar;
@@ -4170,6 +4297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4170 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4297 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4171 unsigned short ee_len; 4298 unsigned short ee_len;
4172 4299
4300
4173 /* 4301 /*
4174 * Uninitialized extents are treated as holes, except that 4302 * Uninitialized extents are treated as holes, except that
4175 * we split out initialized portions during a write. 4303 * we split out initialized portions during a write.
@@ -4186,13 +4314,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4186 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 4314 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4187 ee_block, ee_len, newblock); 4315 ee_block, ee_len, newblock);
4188 4316
4189 if (!ext4_ext_is_uninitialized(ex)) 4317 /*
4318 * If the extent is initialized, check whether the
4319 * caller wants to convert it to unwritten.
4320 */
4321 if ((!ext4_ext_is_uninitialized(ex)) &&
4322 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4323 allocated = ext4_ext_convert_initialized_extent(
4324 handle, inode, map, path, flags,
4325 allocated, newblock);
4326 goto out2;
4327 } else if (!ext4_ext_is_uninitialized(ex))
4190 goto out; 4328 goto out;
4191 4329
4192 allocated = ext4_ext_handle_uninitialized_extents( 4330 ret = ext4_ext_handle_uninitialized_extents(
4193 handle, inode, map, path, flags, 4331 handle, inode, map, path, flags,
4194 allocated, newblock); 4332 allocated, newblock);
4195 goto out3; 4333 if (ret < 0)
4334 err = ret;
4335 else
4336 allocated = ret;
4337 goto out2;
4196 } 4338 }
4197 } 4339 }
4198 4340
@@ -4473,7 +4615,6 @@ out2:
4473 kfree(path); 4615 kfree(path);
4474 } 4616 }
4475 4617
4476out3:
4477 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4618 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4478 err ? err : allocated); 4619 err ? err : allocated);
4479 ext4_es_lru_add(inode); 4620 ext4_es_lru_add(inode);
@@ -4514,34 +4655,200 @@ retry:
4514 ext4_std_error(inode->i_sb, err); 4655 ext4_std_error(inode->i_sb, err);
4515} 4656}
4516 4657
4517static void ext4_falloc_update_inode(struct inode *inode, 4658static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4518 int mode, loff_t new_size, int update_ctime) 4659 ext4_lblk_t len, int flags, int mode)
4519{ 4660{
4520 struct timespec now; 4661 struct inode *inode = file_inode(file);
4662 handle_t *handle;
4663 int ret = 0;
4664 int ret2 = 0;
4665 int retries = 0;
4666 struct ext4_map_blocks map;
4667 unsigned int credits;
4521 4668
4522 if (update_ctime) { 4669 map.m_lblk = offset;
4523 now = current_fs_time(inode->i_sb); 4670 /*
4524 if (!timespec_equal(&inode->i_ctime, &now)) 4671 * Don't normalize the request if it can fit in one extent so
4525 inode->i_ctime = now; 4672 * that it doesn't get unnecessarily split into multiple
4673 * extents.
4674 */
4675 if (len <= EXT_UNINIT_MAX_LEN)
4676 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4677
4678 /*
4679 * credits to insert 1 extent into extent tree
4680 */
4681 credits = ext4_chunk_trans_blocks(inode, len);
4682
4683retry:
4684 while (ret >= 0 && ret < len) {
4685 map.m_lblk = map.m_lblk + ret;
4686 map.m_len = len = len - ret;
4687 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4688 credits);
4689 if (IS_ERR(handle)) {
4690 ret = PTR_ERR(handle);
4691 break;
4692 }
4693 ret = ext4_map_blocks(handle, inode, &map, flags);
4694 if (ret <= 0) {
4695 ext4_debug("inode #%lu: block %u: len %u: "
4696 "ext4_ext_map_blocks returned %d",
4697 inode->i_ino, map.m_lblk,
4698 map.m_len, ret);
4699 ext4_mark_inode_dirty(handle, inode);
4700 ret2 = ext4_journal_stop(handle);
4701 break;
4702 }
4703 ret2 = ext4_journal_stop(handle);
4704 if (ret2)
4705 break;
4706 }
4707 if (ret == -ENOSPC &&
4708 ext4_should_retry_alloc(inode->i_sb, &retries)) {
4709 ret = 0;
4710 goto retry;
4526 } 4711 }
4712
4713 return ret > 0 ? ret2 : ret;
4714}
4715
4716static long ext4_zero_range(struct file *file, loff_t offset,
4717 loff_t len, int mode)
4718{
4719 struct inode *inode = file_inode(file);
4720 handle_t *handle = NULL;
4721 unsigned int max_blocks;
4722 loff_t new_size = 0;
4723 int ret = 0;
4724 int flags;
4725 int partial;
4726 loff_t start, end;
4727 ext4_lblk_t lblk;
4728 struct address_space *mapping = inode->i_mapping;
4729 unsigned int blkbits = inode->i_blkbits;
4730
4731 trace_ext4_zero_range(inode, offset, len, mode);
4732
4733 /*
4734 * Write out all dirty pages to avoid race conditions
4735 * Then release them.
4736 */
4737 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4738 ret = filemap_write_and_wait_range(mapping, offset,
4739 offset + len - 1);
4740 if (ret)
4741 return ret;
4742 }
4743
4527 /* 4744 /*
4528 * Update only when preallocation was requested beyond 4745 * Round up offset. This is not fallocate, we need to zero out
4529 * the file size. 4746 * blocks, so convert interior block aligned part of the range to
4747 * unwritten and possibly manually zero out unaligned parts of the
4748 * range.
4530 */ 4749 */
4531 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 4750 start = round_up(offset, 1 << blkbits);
4751 end = round_down((offset + len), 1 << blkbits);
4752
4753 if (start < offset || end > offset + len)
4754 return -EINVAL;
4755 partial = (offset + len) & ((1 << blkbits) - 1);
4756
4757 lblk = start >> blkbits;
4758 max_blocks = (end >> blkbits);
4759 if (max_blocks < lblk)
4760 max_blocks = 0;
4761 else
4762 max_blocks -= lblk;
4763
4764 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
4765 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
4766 if (mode & FALLOC_FL_KEEP_SIZE)
4767 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4768
4769 mutex_lock(&inode->i_mutex);
4770
4771 /*
4772 * Indirect files do not support unwritten extents
4773 */
4774 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4775 ret = -EOPNOTSUPP;
4776 goto out_mutex;
4777 }
4778
4779 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4780 offset + len > i_size_read(inode)) {
4781 new_size = offset + len;
4782 ret = inode_newsize_ok(inode, new_size);
4783 if (ret)
4784 goto out_mutex;
4785 /*
4786 * If we have a partial block after EOF we have to allocate
4787 * the entire block.
4788 */
4789 if (partial)
4790 max_blocks += 1;
4791 }
4792
4793 if (max_blocks > 0) {
4794
4795 /* Now release the pages and zero block aligned part of pages*/
4796 truncate_pagecache_range(inode, start, end - 1);
4797
4798 /* Wait for all existing dio workers; newcomers will block on i_mutex */
4799 ext4_inode_block_unlocked_dio(inode);
4800 inode_dio_wait(inode);
4801
4802 /*
4803 * Remove entire range from the extent status tree.
4804 */
4805 ret = ext4_es_remove_extent(inode, lblk, max_blocks);
4806 if (ret)
4807 goto out_dio;
4808
4809 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
4810 mode);
4811 if (ret)
4812 goto out_dio;
4813 }
4814
4815 handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
4816 if (IS_ERR(handle)) {
4817 ret = PTR_ERR(handle);
4818 ext4_std_error(inode->i_sb, ret);
4819 goto out_dio;
4820 }
4821
4822 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4823
4824 if (new_size) {
4532 if (new_size > i_size_read(inode)) 4825 if (new_size > i_size_read(inode))
4533 i_size_write(inode, new_size); 4826 i_size_write(inode, new_size);
4534 if (new_size > EXT4_I(inode)->i_disksize) 4827 if (new_size > EXT4_I(inode)->i_disksize)
4535 ext4_update_i_disksize(inode, new_size); 4828 ext4_update_i_disksize(inode, new_size);
4536 } else { 4829 } else {
4537 /* 4830 /*
4538 * Mark that we allocate beyond EOF so the subsequent truncate 4831 * Mark that we allocate beyond EOF so the subsequent truncate
4539 * can proceed even if the new size is the same as i_size. 4832 * can proceed even if the new size is the same as i_size.
4540 */ 4833 */
4541 if (new_size > i_size_read(inode)) 4834 if ((offset + len) > i_size_read(inode))
4542 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4835 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4543 } 4836 }
4544 4837
4838 ext4_mark_inode_dirty(handle, inode);
4839
4840 /* Zero out partial block at the edges of the range */
4841 ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4842
4843 if (file->f_flags & O_SYNC)
4844 ext4_handle_sync(handle);
4845
4846 ext4_journal_stop(handle);
4847out_dio:
4848 ext4_inode_resume_unlocked_dio(inode);
4849out_mutex:
4850 mutex_unlock(&inode->i_mutex);
4851 return ret;
4545} 4852}
4546 4853
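A worked example of ext4_zero_range()'s alignment split, using illustrative numbers: only whole blocks inside [offset, offset+len) are converted to unwritten extents; the ragged edges are zeroed through the page cache by ext4_zero_partial_blocks().

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4 KiB blocks */
	long long offset = 3000, len = 10000;	/* bytes */

	long long start = (offset + (1 << blkbits) - 1) & ~((1LL << blkbits) - 1);
	long long end = (offset + len) & ~((1LL << blkbits) - 1);
	unsigned int lblk = start >> blkbits;
	unsigned int max_blocks = end > start ? (end - start) >> blkbits : 0;

	/* start=4096, end=12288: blocks 1..2 become unwritten, while
	 * bytes 3000..4095 and 12288..12999 are zeroed in place. */
	printf("lblk=%u max_blocks=%u\n", lblk, max_blocks);
	return 0;
}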
4547/* 4854/*
@@ -4555,22 +4862,25 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4555{ 4862{
4556 struct inode *inode = file_inode(file); 4863 struct inode *inode = file_inode(file);
4557 handle_t *handle; 4864 handle_t *handle;
4558 loff_t new_size; 4865 loff_t new_size = 0;
4559 unsigned int max_blocks; 4866 unsigned int max_blocks;
4560 int ret = 0; 4867 int ret = 0;
4561 int ret2 = 0;
4562 int retries = 0;
4563 int flags; 4868 int flags;
4564 struct ext4_map_blocks map; 4869 ext4_lblk_t lblk;
4565 unsigned int credits, blkbits = inode->i_blkbits; 4870 struct timespec tv;
4871 unsigned int blkbits = inode->i_blkbits;
4566 4872
4567 /* Return error if mode is not supported */ 4873 /* Return error if mode is not supported */
4568 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 4874 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4875 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
4569 return -EOPNOTSUPP; 4876 return -EOPNOTSUPP;
4570 4877
4571 if (mode & FALLOC_FL_PUNCH_HOLE) 4878 if (mode & FALLOC_FL_PUNCH_HOLE)
4572 return ext4_punch_hole(inode, offset, len); 4879 return ext4_punch_hole(inode, offset, len);
4573 4880
4881 if (mode & FALLOC_FL_COLLAPSE_RANGE)
4882 return ext4_collapse_range(inode, offset, len);
4883
4574 ret = ext4_convert_inline_data(inode); 4884 ret = ext4_convert_inline_data(inode);
4575 if (ret) 4885 if (ret)
4576 return ret; 4886 return ret;
@@ -4582,83 +4892,66 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4582 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4892 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4583 return -EOPNOTSUPP; 4893 return -EOPNOTSUPP;
4584 4894
4895 if (mode & FALLOC_FL_ZERO_RANGE)
4896 return ext4_zero_range(file, offset, len, mode);
4897
4585 trace_ext4_fallocate_enter(inode, offset, len, mode); 4898 trace_ext4_fallocate_enter(inode, offset, len, mode);
4586 map.m_lblk = offset >> blkbits; 4899 lblk = offset >> blkbits;
4587 /* 4900 /*
4588 * We can't just convert len to max_blocks because 4901 * We can't just convert len to max_blocks because
4589 * If blocksize = 4096 offset = 3072 and len = 2048 4902 * If blocksize = 4096 offset = 3072 and len = 2048
4590 */ 4903 */
4591 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 4904 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4592 - map.m_lblk; 4905 - lblk;
4593 /* 4906
4594 * credits to insert 1 extent into extent tree
4595 */
4596 credits = ext4_chunk_trans_blocks(inode, max_blocks);
4597 mutex_lock(&inode->i_mutex);
4598 ret = inode_newsize_ok(inode, (len + offset));
4599 if (ret) {
4600 mutex_unlock(&inode->i_mutex);
4601 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4602 return ret;
4603 }
4604 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; 4907 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4605 if (mode & FALLOC_FL_KEEP_SIZE) 4908 if (mode & FALLOC_FL_KEEP_SIZE)
4606 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4909 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4607 /*
4608 * Don't normalize the request if it can fit in one extent so
4609 * that it doesn't get unnecessarily split into multiple
4610 * extents.
4611 */
4612 if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4613 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4614 4910
4615retry: 4911 mutex_lock(&inode->i_mutex);
4616 while (ret >= 0 && ret < max_blocks) {
4617 map.m_lblk = map.m_lblk + ret;
4618 map.m_len = max_blocks = max_blocks - ret;
4619 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4620 credits);
4621 if (IS_ERR(handle)) {
4622 ret = PTR_ERR(handle);
4623 break;
4624 }
4625 ret = ext4_map_blocks(handle, inode, &map, flags);
4626 if (ret <= 0) {
4627#ifdef EXT4FS_DEBUG
4628 ext4_warning(inode->i_sb,
4629 "inode #%lu: block %u: len %u: "
4630 "ext4_ext_map_blocks returned %d",
4631 inode->i_ino, map.m_lblk,
4632 map.m_len, ret);
4633#endif
4634 ext4_mark_inode_dirty(handle, inode);
4635 ret2 = ext4_journal_stop(handle);
4636 break;
4637 }
4638 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4639 blkbits) >> blkbits))
4640 new_size = offset + len;
4641 else
4642 new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4643 4912
4644 ext4_falloc_update_inode(inode, mode, new_size, 4913 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4645 (map.m_flags & EXT4_MAP_NEW)); 4914 offset + len > i_size_read(inode)) {
4646 ext4_mark_inode_dirty(handle, inode); 4915 new_size = offset + len;
4647 if ((file->f_flags & O_SYNC) && ret >= max_blocks) 4916 ret = inode_newsize_ok(inode, new_size);
4648 ext4_handle_sync(handle); 4917 if (ret)
4649 ret2 = ext4_journal_stop(handle); 4918 goto out;
4650 if (ret2)
4651 break;
4652 } 4919 }
4653 if (ret == -ENOSPC && 4920
4654 ext4_should_retry_alloc(inode->i_sb, &retries)) { 4921 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
4655 ret = 0; 4922 if (ret)
4656 goto retry; 4923 goto out;
4924
4925 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4926 if (IS_ERR(handle))
4927 goto out;
4928
4929 tv = inode->i_ctime = ext4_current_time(inode);
4930
4931 if (new_size) {
4932 if (new_size > i_size_read(inode)) {
4933 i_size_write(inode, new_size);
4934 inode->i_mtime = tv;
4935 }
4936 if (new_size > EXT4_I(inode)->i_disksize)
4937 ext4_update_i_disksize(inode, new_size);
4938 } else {
4939 /*
4940 * Mark that we allocate beyond EOF so the subsequent truncate
4941 * can proceed even if the new size is the same as i_size.
4942 */
4943 if ((offset + len) > i_size_read(inode))
4944 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4657 } 4945 }
4946 ext4_mark_inode_dirty(handle, inode);
4947 if (file->f_flags & O_SYNC)
4948 ext4_handle_sync(handle);
4949
4950 ext4_journal_stop(handle);
4951out:
4658 mutex_unlock(&inode->i_mutex); 4952 mutex_unlock(&inode->i_mutex);
4659 trace_ext4_fallocate_exit(inode, offset, max_blocks, 4953 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4660 ret > 0 ? ret2 : ret); 4954 return ret;
4661 return ret > 0 ? ret2 : ret;
4662} 4955}
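A usage sketch from userspace for the two modes this series wires up. The path and sizes are illustrative only; FALLOC_FL_COLLAPSE_RANGE additionally requires block-aligned offset and length, and both flags return EOPNOTSUPP on kernels or filesystems that lack them (flag availability also depends on your libc headers).

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/testfile", O_RDWR);

	if (fd < 0)
		return 1;
	/* Turn bytes 4096..8191 into zeros (unwritten extents on ext4) */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 4096, 4096))
		perror("zero range");
	/* Remove bytes 4096..8191 entirely, shifting later data down */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 4096))
		perror("collapse range");
	close(fd);
	return 0;
}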
4663 4956
4664/* 4957/*
@@ -4869,3 +5162,304 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4869 ext4_es_lru_add(inode); 5162 ext4_es_lru_add(inode);
4870 return error; 5163 return error;
4871} 5164}
5165
5166/*
5167 * ext4_access_path:
5168 * Function to access the path buffer for marking it dirty.
5169 * It also checks if there are sufficient credits left in the journal handle
5170 * to update path.
5171 */
5172static int
5173ext4_access_path(handle_t *handle, struct inode *inode,
5174 struct ext4_ext_path *path)
5175{
5176 int credits, err;
5177
5178 if (!ext4_handle_valid(handle))
5179 return 0;
5180
5181 /*
5182 * Check if we need to extend journal credits:
5183 * 3 for leaf, sb, and inode plus 2 (bmap and group
5184 * descriptor) for each block group; assume two block
5185 * groups
5186 */
5187 if (handle->h_buffer_credits < 7) {
5188 credits = ext4_writepage_trans_blocks(inode);
5189 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5190 /* EAGAIN is success */
5191 if (err && err != -EAGAIN)
5192 return err;
5193 }
5194
5195 err = ext4_ext_get_access(handle, inode, path);
5196 return err;
5197}
5198
5199/*
5200 * ext4_ext_shift_path_extents:
5201 * Shift the extents of a path structure lying between path[depth].p_ext
5202 * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
5203 * from starting block for each extent.
5204 */
5205static int
5206ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5207 struct inode *inode, handle_t *handle,
5208 ext4_lblk_t *start)
5209{
5210 int depth, err = 0;
5211 struct ext4_extent *ex_start, *ex_last;
5212 bool update = 0;
5213 depth = path->p_depth;
5214
5215 while (depth >= 0) {
5216 if (depth == path->p_depth) {
5217 ex_start = path[depth].p_ext;
5218 if (!ex_start)
5219 return -EIO;
5220
5221 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5222 if (!ex_last)
5223 return -EIO;
5224
5225 err = ext4_access_path(handle, inode, path + depth);
5226 if (err)
5227 goto out;
5228
5229 if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5230 update = 1;
5231
5232 *start = ex_last->ee_block +
5233 ext4_ext_get_actual_len(ex_last);
5234
5235 while (ex_start <= ex_last) {
5236 ex_start->ee_block -= shift;
5237 if (ex_start >
5238 EXT_FIRST_EXTENT(path[depth].p_hdr)) {
5239 if (ext4_ext_try_to_merge_right(inode,
5240 path, ex_start - 1))
5241 ex_last--;
5242 }
5243 ex_start++;
5244 }
5245 err = ext4_ext_dirty(handle, inode, path + depth);
5246 if (err)
5247 goto out;
5248
5249 if (--depth < 0 || !update)
5250 break;
5251 }
5252
5253 /* Update index too */
5254 err = ext4_access_path(handle, inode, path + depth);
5255 if (err)
5256 goto out;
5257
5258 path[depth].p_idx->ei_block -= shift;
5259 err = ext4_ext_dirty(handle, inode, path + depth);
5260 if (err)
5261 goto out;
5262
5263 /* we are done if current index is not a starting index */
5264 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5265 break;
5266
5267 depth--;
5268 }
5269
5270out:
5271 return err;
5272}
5273
5274/*
5275 * ext4_ext_shift_extents:
5276 * All the extents which lie in the range from start to the last allocated
5277 * block for the file are shifted downwards by shift blocks.
5278 * On success, 0 is returned, error otherwise.
5279 */
5280static int
5281ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5282 ext4_lblk_t start, ext4_lblk_t shift)
5283{
5284 struct ext4_ext_path *path;
5285 int ret = 0, depth;
5286 struct ext4_extent *extent;
5287 ext4_lblk_t stop_block, current_block;
5288 ext4_lblk_t ex_start, ex_end;
5289
5290 /* Let path point to the last extent */
5291 path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5292 if (IS_ERR(path))
5293 return PTR_ERR(path);
5294
5295 depth = path->p_depth;
5296 extent = path[depth].p_ext;
5297 if (!extent) {
5298 ext4_ext_drop_refs(path);
5299 kfree(path);
5300 return ret;
5301 }
5302
5303 stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
5304 ext4_ext_drop_refs(path);
5305 kfree(path);
5306
5307 /* Nothing to shift if the hole is at the end of the file */
5308 if (start >= stop_block)
5309 return ret;
5310
5311 /*
5312 * Don't start shifting extents until we make sure the hole is big
5313 * enough to accommodate the shift.
5314 */
5315 path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
5316 depth = path->p_depth;
5317 extent = path[depth].p_ext;
5318 ex_start = extent->ee_block;
5319 ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
5320 ext4_ext_drop_refs(path);
5321 kfree(path);
5322
5323 if ((start == ex_start && shift > ex_start) ||
5324 (shift > start - ex_end))
5325 return -EINVAL;
5326
5327 /* It's safe to start updating extents */
5328 while (start < stop_block) {
5329 path = ext4_ext_find_extent(inode, start, NULL, 0);
5330 if (IS_ERR(path))
5331 return PTR_ERR(path);
5332 depth = path->p_depth;
5333 extent = path[depth].p_ext;
5334 current_block = extent->ee_block;
5335 if (start > current_block) {
5336 /* Hole, move to the next extent */
5337 ret = mext_next_extent(inode, path, &extent);
5338 if (ret != 0) {
5339 ext4_ext_drop_refs(path);
5340 kfree(path);
5341 if (ret == 1)
5342 ret = 0;
5343 break;
5344 }
5345 }
5346 ret = ext4_ext_shift_path_extents(path, shift, inode,
5347 handle, &start);
5348 ext4_ext_drop_refs(path);
5349 kfree(path);
5350 if (ret)
5351 break;
5352 }
5353
5354 return ret;
5355}
5356
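Note: the -EINVAL test above is the "hole big enough" check. With the preceding extent ending at ex_end, the hole in front of `start` spans [ex_end, start) and holds start - ex_end blocks, so (hypothetical numbers) ex_end = 90 and start = 100 allow a shift of at most 10 before shifted extents would overlap their predecessor; the first clause likewise rejects a shift that would wrap a starting block below zero when the found extent begins at `start` itself.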
5357/*
5358 * ext4_collapse_range:
5359 * This implements fallocate's collapse-range functionality for ext4.
5360 * Returns 0 on success, non-zero on error.
5361 */
5362int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5363{
5364 struct super_block *sb = inode->i_sb;
5365 ext4_lblk_t punch_start, punch_stop;
5366 handle_t *handle;
5367 unsigned int credits;
5368 loff_t new_size;
5369 int ret;
5370
5371 BUG_ON(offset + len > i_size_read(inode));
5372
5373 /* Collapse range works only on fs block size aligned offsets. */
5374 if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
5375 len & (EXT4_BLOCK_SIZE(sb) - 1))
5376 return -EINVAL;
5377
5378 if (!S_ISREG(inode->i_mode))
5379 return -EOPNOTSUPP;
5380
5381 trace_ext4_collapse_range(inode, offset, len);
5382
5383 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5384 punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5385
5386 /* Write out all dirty pages */
5387 ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1);
5388 if (ret)
5389 return ret;
5390
5391 /* Take mutex lock */
5392 mutex_lock(&inode->i_mutex);
5393
5394 /* It's not possible to punch a hole in an append-only file */
5395 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
5396 ret = -EPERM;
5397 goto out_mutex;
5398 }
5399
5400 if (IS_SWAPFILE(inode)) {
5401 ret = -ETXTBSY;
5402 goto out_mutex;
5403 }
5404
5405 /* Currently just for extent based files */
5406 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5407 ret = -EOPNOTSUPP;
5408 goto out_mutex;
5409 }
5410
5411 truncate_pagecache_range(inode, offset, -1);
5412
5413 /* Wait for existing dio to complete */
5414 ext4_inode_block_unlocked_dio(inode);
5415 inode_dio_wait(inode);
5416
5417 credits = ext4_writepage_trans_blocks(inode);
5418 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5419 if (IS_ERR(handle)) {
5420 ret = PTR_ERR(handle);
5421 goto out_dio;
5422 }
5423
5424 down_write(&EXT4_I(inode)->i_data_sem);
5425 ext4_discard_preallocations(inode);
5426
5427 ret = ext4_es_remove_extent(inode, punch_start,
5428 EXT_MAX_BLOCKS - punch_start - 1);
5429 if (ret) {
5430 up_write(&EXT4_I(inode)->i_data_sem);
5431 goto out_stop;
5432 }
5433
5434 ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5435 if (ret) {
5436 up_write(&EXT4_I(inode)->i_data_sem);
5437 goto out_stop;
5438 }
5439
5440 ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5441 punch_stop - punch_start);
5442 if (ret) {
5443 up_write(&EXT4_I(inode)->i_data_sem);
5444 goto out_stop;
5445 }
5446
5447 new_size = i_size_read(inode) - len;
5448 truncate_setsize(inode, new_size);
5449 EXT4_I(inode)->i_disksize = new_size;
5450
5451 ext4_discard_preallocations(inode);
5452 up_write(&EXT4_I(inode)->i_data_sem);
5453 if (IS_SYNC(inode))
5454 ext4_handle_sync(handle);
5455 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5456 ext4_mark_inode_dirty(handle, inode);
5457
5458out_stop:
5459 ext4_journal_stop(handle);
5460out_dio:
5461 ext4_inode_resume_unlocked_dio(inode);
5462out_mutex:
5463 mutex_unlock(&inode->i_mutex);
5464 return ret;
5465}
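The new operation is driven from userspace through fallocate(2). A minimal sketch, assuming a kernel and headers that already expose FALLOC_FL_COLLAPSE_RANGE; the file name and sizes are made up for illustration:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("testfile", O_RDWR);	/* hypothetical file */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Remove 1 MiB starting at offset 4 MiB. Both offset and len
		 * must be multiples of the filesystem block size and the
		 * range must lie within i_size, per the checks in
		 * ext4_collapse_range() above. */
		if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE,
			      4 * 1024 * 1024, 1024 * 1024) < 0)
			perror("fallocate");
		close(fd);
		return 0;
	}

On success the data formerly at 5 MiB onward now starts at 4 MiB and the file is 1 MiB shorter, matching the truncate_setsize()/i_disksize update at the end of the function.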
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3981ff783950..0a014a7194b2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode)
184 while (node) { 184 while (node) {
185 struct extent_status *es; 185 struct extent_status *es;
186 es = rb_entry(node, struct extent_status, rb_node); 186 es = rb_entry(node, struct extent_status, rb_node);
187 printk(KERN_DEBUG " [%u/%u) %llu %llx", 187 printk(KERN_DEBUG " [%u/%u) %llu %x",
188 es->es_lblk, es->es_len, 188 es->es_lblk, es->es_len,
189 ext4_es_pblock(es), ext4_es_status(es)); 189 ext4_es_pblock(es), ext4_es_status(es));
190 node = rb_next(node); 190 node = rb_next(node);
@@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
445 pr_warn("ES insert assertion failed for " 445 pr_warn("ES insert assertion failed for "
446 "inode: %lu we can find an extent " 446 "inode: %lu we can find an extent "
447 "at block [%d/%d/%llu/%c], but we " 447 "at block [%d/%d/%llu/%c], but we "
448 "want to add an delayed/hole extent " 448 "want to add a delayed/hole extent "
449 "[%d/%d/%llu/%llx]\n", 449 "[%d/%d/%llu/%x]\n",
450 inode->i_ino, ee_block, ee_len, 450 inode->i_ino, ee_block, ee_len,
451 ee_start, ee_status ? 'u' : 'w', 451 ee_start, ee_status ? 'u' : 'w',
452 es->es_lblk, es->es_len, 452 es->es_lblk, es->es_len,
@@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
486 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { 486 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
487 pr_warn("ES insert assertion failed for inode: %lu " 487 pr_warn("ES insert assertion failed for inode: %lu "
488 "can't find an extent at block %d but we want " 488 "can't find an extent at block %d but we want "
489 "to add an written/unwritten extent " 489 "to add a written/unwritten extent "
490 "[%d/%d/%llu/%llx]\n", inode->i_ino, 490 "[%d/%d/%llu/%x]\n", inode->i_ino,
491 es->es_lblk, es->es_lblk, es->es_len, 491 es->es_lblk, es->es_lblk, es->es_len,
492 ext4_es_pblock(es), ext4_es_status(es)); 492 ext4_es_pblock(es), ext4_es_status(es));
493 } 493 }
@@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
524 */ 524 */
525 pr_warn("ES insert assertion failed for inode: %lu " 525 pr_warn("ES insert assertion failed for inode: %lu "
526 "We can find blocks but we want to add a " 526 "We can find blocks but we want to add a "
527 "delayed/hole extent [%d/%d/%llu/%llx]\n", 527 "delayed/hole extent [%d/%d/%llu/%x]\n",
528 inode->i_ino, es->es_lblk, es->es_len, 528 inode->i_ino, es->es_lblk, es->es_len,
529 ext4_es_pblock(es), ext4_es_status(es)); 529 ext4_es_pblock(es), ext4_es_status(es));
530 return; 530 return;
@@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
554 if (ext4_es_is_written(es)) { 554 if (ext4_es_is_written(es)) {
555 pr_warn("ES insert assertion failed for inode: %lu " 555 pr_warn("ES insert assertion failed for inode: %lu "
556 "We can't find the block but we want to add " 556 "We can't find the block but we want to add "
557 "an written extent [%d/%d/%llu/%llx]\n", 557 "a written extent [%d/%d/%llu/%x]\n",
558 inode->i_ino, es->es_lblk, es->es_len, 558 inode->i_ino, es->es_lblk, es->es_len,
559 ext4_es_pblock(es), ext4_es_status(es)); 559 ext4_es_pblock(es), ext4_es_status(es));
560 return; 560 return;
@@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
658 658
659 newes.es_lblk = lblk; 659 newes.es_lblk = lblk;
660 newes.es_len = len; 660 newes.es_len = len;
661 ext4_es_store_pblock(&newes, pblk); 661 ext4_es_store_pblock_status(&newes, pblk, status);
662 ext4_es_store_status(&newes, status);
663 trace_ext4_es_insert_extent(inode, &newes); 662 trace_ext4_es_insert_extent(inode, &newes);
664 663
665 ext4_es_insert_extent_check(inode, &newes); 664 ext4_es_insert_extent_check(inode, &newes);
@@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
699 698
700 newes.es_lblk = lblk; 699 newes.es_lblk = lblk;
701 newes.es_len = len; 700 newes.es_len = len;
702 ext4_es_store_pblock(&newes, pblk); 701 ext4_es_store_pblock_status(&newes, pblk, status);
703 ext4_es_store_status(&newes, status);
704 trace_ext4_es_cache_extent(inode, &newes); 702 trace_ext4_es_cache_extent(inode, &newes);
705 703
706 if (!len) 704 if (!len)
@@ -812,13 +810,13 @@ retry:
812 810
813 newes.es_lblk = end + 1; 811 newes.es_lblk = end + 1;
814 newes.es_len = len2; 812 newes.es_len = len2;
813 block = 0x7FDEADBEEF;
815 if (ext4_es_is_written(&orig_es) || 814 if (ext4_es_is_written(&orig_es) ||
816 ext4_es_is_unwritten(&orig_es)) { 815 ext4_es_is_unwritten(&orig_es))
817 block = ext4_es_pblock(&orig_es) + 816 block = ext4_es_pblock(&orig_es) +
818 orig_es.es_len - len2; 817 orig_es.es_len - len2;
819 ext4_es_store_pblock(&newes, block); 818 ext4_es_store_pblock_status(&newes, block,
820 } 819 ext4_es_status(&orig_es));
821 ext4_es_store_status(&newes, ext4_es_status(&orig_es));
822 err = __es_insert_extent(inode, &newes); 820 err = __es_insert_extent(inode, &newes);
823 if (err) { 821 if (err) {
824 es->es_lblk = orig_es.es_lblk; 822 es->es_lblk = orig_es.es_lblk;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 167f4ab8ecc3..f1b62a419920 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es,
129 (es->es_pblk & ~ES_MASK)); 129 (es->es_pblk & ~ES_MASK));
130} 130}
131 131
132static inline void ext4_es_store_pblock_status(struct extent_status *es,
133 ext4_fsblk_t pb,
134 unsigned int status)
135{
136 es->es_pblk = (((ext4_fsblk_t)
137 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
138 (pb & ~ES_MASK));
139}
140
132extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); 141extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
133extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 142extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
134extern void ext4_es_lru_add(struct inode *inode); 143extern void ext4_es_lru_add(struct inode *inode);
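Note: the new helper stores both fields with a single assignment. The status flags live in the bits at and above ES_SHIFT and the physical block in the bits cleared by ~ES_MASK, exactly as the separate ext4_es_store_pblock()/ext4_es_store_status() pair wrote them in two read-modify-write steps; folding them into one store means es_pblk is never observable with a new block but a stale status, and each converted caller in extents_status.c above drops a line.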
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1a5073959f32..4e508fc83dcf 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -153,7 +153,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
153 ssize_t err; 153 ssize_t err;
154 154
155 err = generic_write_sync(file, iocb->ki_pos - ret, ret); 155 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
156 if (err < 0 && ret > 0) 156 if (err < 0)
157 ret = err; 157 ret = err;
158 } 158 }
159 blk_finish_plug(&plug); 159 blk_finish_plug(&plug);
@@ -200,6 +200,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
200 200
201static const struct vm_operations_struct ext4_file_vm_ops = { 201static const struct vm_operations_struct ext4_file_vm_ops = {
202 .fault = filemap_fault, 202 .fault = filemap_fault,
203 .map_pages = filemap_map_pages,
203 .page_mkwrite = ext4_page_mkwrite, 204 .page_mkwrite = ext4_page_mkwrite,
204 .remap_pages = generic_file_remap_pages, 205 .remap_pages = generic_file_remap_pages,
205}; 206};
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 24bfd7ff3049..5b0d2c7d5408 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -215,7 +215,7 @@ void ext4_evict_inode(struct inode *inode)
215 jbd2_complete_transaction(journal, commit_tid); 215 jbd2_complete_transaction(journal, commit_tid);
216 filemap_write_and_wait(&inode->i_data); 216 filemap_write_and_wait(&inode->i_data);
217 } 217 }
218 truncate_inode_pages(&inode->i_data, 0); 218 truncate_inode_pages_final(&inode->i_data);
219 219
220 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); 220 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
221 goto no_delete; 221 goto no_delete;
@@ -226,7 +226,7 @@ void ext4_evict_inode(struct inode *inode)
226 226
227 if (ext4_should_order_data(inode)) 227 if (ext4_should_order_data(inode))
228 ext4_begin_ordered_truncate(inode, 0); 228 ext4_begin_ordered_truncate(inode, 0);
229 truncate_inode_pages(&inode->i_data, 0); 229 truncate_inode_pages_final(&inode->i_data);
230 230
231 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); 231 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
232 if (is_bad_inode(inode)) 232 if (is_bad_inode(inode))
@@ -504,6 +504,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
504{ 504{
505 struct extent_status es; 505 struct extent_status es;
506 int retval; 506 int retval;
507 int ret = 0;
507#ifdef ES_AGGRESSIVE_TEST 508#ifdef ES_AGGRESSIVE_TEST
508 struct ext4_map_blocks orig_map; 509 struct ext4_map_blocks orig_map;
509 510
@@ -515,6 +516,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
515 "logical block %lu\n", inode->i_ino, flags, map->m_len, 516 "logical block %lu\n", inode->i_ino, flags, map->m_len,
516 (unsigned long) map->m_lblk); 517 (unsigned long) map->m_lblk);
517 518
519 /*
520 * ext4_map_blocks returns an int, and m_len is an unsigned int
521 */
522 if (unlikely(map->m_len > INT_MAX))
523 map->m_len = INT_MAX;
524
518 /* Lookup extent status tree firstly */ 525 /* Lookup extent status tree firstly */
519 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 526 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
520 ext4_es_lru_add(inode); 527 ext4_es_lru_add(inode);
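Note: the INT_MAX clamp exists because map->m_len is an unsigned int while ext4_map_blocks() reports the mapped length through its signed int return value; a request longer than INT_MAX blocks would otherwise come back negative and be mistaken for an error code, so the length is truncated and callers simply retry for the remainder.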
@@ -553,7 +560,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
553 EXT4_GET_BLOCKS_KEEP_SIZE); 560 EXT4_GET_BLOCKS_KEEP_SIZE);
554 } 561 }
555 if (retval > 0) { 562 if (retval > 0) {
556 int ret;
557 unsigned int status; 563 unsigned int status;
558 564
559 if (unlikely(retval != map->m_len)) { 565 if (unlikely(retval != map->m_len)) {
@@ -580,7 +586,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
580 586
581found: 587found:
582 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 588 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
583 int ret = check_block_validity(inode, map); 589 ret = check_block_validity(inode, map);
584 if (ret != 0) 590 if (ret != 0)
585 return ret; 591 return ret;
586 } 592 }
@@ -597,7 +603,13 @@ found:
597 * with buffer head unmapped. 603 * with buffer head unmapped.
598 */ 604 */
599 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 605 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
600 return retval; 606 /*
607 * If we need to convert extent to unwritten
608 * we continue and do the actual work in
609 * ext4_ext_map_blocks()
610 */
611 if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
612 return retval;
601 613
602 /* 614 /*
603 * Here we clear m_flags because after allocating an new extent, 615 * Here we clear m_flags because after allocating an new extent,
@@ -653,7 +665,6 @@ found:
653 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 665 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
654 666
655 if (retval > 0) { 667 if (retval > 0) {
656 int ret;
657 unsigned int status; 668 unsigned int status;
658 669
659 if (unlikely(retval != map->m_len)) { 670 if (unlikely(retval != map->m_len)) {
@@ -688,7 +699,7 @@ found:
688has_zeroout: 699has_zeroout:
689 up_write((&EXT4_I(inode)->i_data_sem)); 700 up_write((&EXT4_I(inode)->i_data_sem));
690 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 701 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
691 int ret = check_block_validity(inode, map); 702 ret = check_block_validity(inode, map);
692 if (ret != 0) 703 if (ret != 0)
693 return ret; 704 return ret;
694 } 705 }
@@ -3313,33 +3324,13 @@ void ext4_set_aops(struct inode *inode)
3313} 3324}
3314 3325
3315/* 3326/*
3316 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3317 * up to the end of the block which corresponds to `from'.
3318 * This is required during truncate. We need to physically zero the tail end
3319 * of that block so it doesn't yield old data if the file is later grown.
3320 */
3321int ext4_block_truncate_page(handle_t *handle,
3322 struct address_space *mapping, loff_t from)
3323{
3324 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3325 unsigned length;
3326 unsigned blocksize;
3327 struct inode *inode = mapping->host;
3328
3329 blocksize = inode->i_sb->s_blocksize;
3330 length = blocksize - (offset & (blocksize - 1));
3331
3332 return ext4_block_zero_page_range(handle, mapping, from, length);
3333}
3334
3335/*
3336 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3327 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3337 * starting from file offset 'from'. The range to be zero'd must 3328 * starting from file offset 'from'. The range to be zero'd must
3338 * be contained within one block. If the specified range exceeds 3329 * be contained within one block. If the specified range exceeds
3339 * the end of the block it will be shortened to the end of the block 3330 * the end of the block it will be shortened to the end of the block
3340 * that corresponds to 'from' 3331 * that corresponds to 'from'
3341 */ 3332 */
3342int ext4_block_zero_page_range(handle_t *handle, 3333static int ext4_block_zero_page_range(handle_t *handle,
3343 struct address_space *mapping, loff_t from, loff_t length) 3334 struct address_space *mapping, loff_t from, loff_t length)
3344{ 3335{
3345 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3336 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -3429,6 +3420,26 @@ unlock:
3429 return err; 3420 return err;
3430} 3421}
3431 3422
3423/*
3424 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3425 * up to the end of the block which corresponds to `from'.
3426 * This is required during truncate. We need to physically zero the tail end
3427 * of that block so it doesn't yield old data if the file is later grown.
3428 */
3429int ext4_block_truncate_page(handle_t *handle,
3430 struct address_space *mapping, loff_t from)
3431{
3432 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3433 unsigned length;
3434 unsigned blocksize;
3435 struct inode *inode = mapping->host;
3436
3437 blocksize = inode->i_sb->s_blocksize;
3438 length = blocksize - (offset & (blocksize - 1));
3439
3440 return ext4_block_zero_page_range(handle, mapping, from, length);
3441}
3442
3432int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 3443int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3433 loff_t lstart, loff_t length) 3444 loff_t lstart, loff_t length)
3434{ 3445{
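Note: a worked example of the length computation in the relocated ext4_block_truncate_page(), with hypothetical numbers: for 4 KiB pages, 1 KiB blocks and from = 10000, offset = 10000 & 4095 = 1808 and length = 1024 - (1808 & 1023) = 1024 - 784 = 240, so bytes 10000 through 10239 are zeroed, exactly up to the end of the block containing `from`.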
@@ -3502,7 +3513,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3502 if (!S_ISREG(inode->i_mode)) 3513 if (!S_ISREG(inode->i_mode))
3503 return -EOPNOTSUPP; 3514 return -EOPNOTSUPP;
3504 3515
3505 trace_ext4_punch_hole(inode, offset, length); 3516 trace_ext4_punch_hole(inode, offset, length, 0);
3506 3517
3507 /* 3518 /*
3508 * Write out all dirty pages to avoid race conditions 3519 * Write out all dirty pages to avoid race conditions
@@ -3609,6 +3620,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3609 up_write(&EXT4_I(inode)->i_data_sem); 3620 up_write(&EXT4_I(inode)->i_data_sem);
3610 if (IS_SYNC(inode)) 3621 if (IS_SYNC(inode))
3611 ext4_handle_sync(handle); 3622 ext4_handle_sync(handle);
3623
3624 /* Now release the pages again to reduce race window */
3625 if (last_block_offset > first_block_offset)
3626 truncate_pagecache_range(inode, first_block_offset,
3627 last_block_offset);
3628
3612 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3629 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3613 ext4_mark_inode_dirty(handle, inode); 3630 ext4_mark_inode_dirty(handle, inode);
3614out_stop: 3631out_stop:
@@ -3682,7 +3699,7 @@ void ext4_truncate(struct inode *inode)
3682 3699
3683 /* 3700 /*
3684 * There is a possibility that we're either freeing the inode 3701 * There is a possibility that we're either freeing the inode
3685 * or it completely new indode. In those cases we might not 3702 * or it's a completely new inode. In those cases we might not
3686 * have i_mutex locked because it's not necessary. 3703 * have i_mutex locked because it's not necessary.
3687 */ 3704 */
3688 if (!(inode->i_state & (I_NEW|I_FREEING))) 3705 if (!(inode->i_state & (I_NEW|I_FREEING)))
@@ -3934,8 +3951,8 @@ void ext4_set_inode_flags(struct inode *inode)
3934 new_fl |= S_NOATIME; 3951 new_fl |= S_NOATIME;
3935 if (flags & EXT4_DIRSYNC_FL) 3952 if (flags & EXT4_DIRSYNC_FL)
3936 new_fl |= S_DIRSYNC; 3953 new_fl |= S_DIRSYNC;
3937 set_mask_bits(&inode->i_flags, 3954 inode_set_flags(inode, new_fl,
3938 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); 3955 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
3939} 3956}
3940 3957
3941/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 3958/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4154,11 +4171,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4154 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4171 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4155 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4172 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4156 4173
4157 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4174 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4158 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4175 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4159 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4176 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4160 inode->i_version |= 4177 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4161 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4178 inode->i_version |=
4179 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4180 }
4162 } 4181 }
4163 4182
4164 ret = 0; 4183 ret = 0;
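Note: the HURD_COMPAT guard is needed because on Hurd-created filesystems the on-disk word that ext4 reads as i_disk_version is the Hurd's translator field (the osd1 union of the inode), so touching i_version there would clobber unrelated metadata; the matching hunk in ext4_do_update_inode() below applies the same rule on the write side.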
@@ -4328,8 +4347,7 @@ static int ext4_do_update_inode(handle_t *handle,
4328 goto out_brelse; 4347 goto out_brelse;
4329 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4348 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4330 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 4349 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
4331 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4350 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
4332 cpu_to_le32(EXT4_OS_HURD))
4333 raw_inode->i_file_acl_high = 4351 raw_inode->i_file_acl_high =
4334 cpu_to_le16(ei->i_file_acl >> 32); 4352 cpu_to_le16(ei->i_file_acl >> 32);
4335 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4353 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
@@ -4374,12 +4392,15 @@ static int ext4_do_update_inode(handle_t *handle,
4374 raw_inode->i_block[block] = ei->i_data[block]; 4392 raw_inode->i_block[block] = ei->i_data[block];
4375 } 4393 }
4376 4394
4377 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4395 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
4378 if (ei->i_extra_isize) { 4396 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4379 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4397 if (ei->i_extra_isize) {
4380 raw_inode->i_version_hi = 4398 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4381 cpu_to_le32(inode->i_version >> 32); 4399 raw_inode->i_version_hi =
4382 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4400 cpu_to_le32(inode->i_version >> 32);
4401 raw_inode->i_extra_isize =
4402 cpu_to_le16(ei->i_extra_isize);
4403 }
4383 } 4404 }
4384 4405
4385 ext4_inode_csum_set(inode, raw_inode, ei); 4406 ext4_inode_csum_set(inode, raw_inode, ei);
@@ -4446,7 +4467,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
4446 return -EIO; 4467 return -EIO;
4447 } 4468 }
4448 4469
4449 if (wbc->sync_mode != WB_SYNC_ALL) 4470 /*
4471 * No need to force transaction in WB_SYNC_NONE mode. Also
4472 * ext4_sync_fs() will force the commit after everything is
4473 * written.
4474 */
4475 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
4450 return 0; 4476 return 0;
4451 4477
4452 err = ext4_force_commit(inode->i_sb); 4478 err = ext4_force_commit(inode->i_sb);
@@ -4456,7 +4482,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
4456 err = __ext4_get_inode_loc(inode, &iloc, 0); 4482 err = __ext4_get_inode_loc(inode, &iloc, 0);
4457 if (err) 4483 if (err)
4458 return err; 4484 return err;
4459 if (wbc->sync_mode == WB_SYNC_ALL) 4485 /*
4486 * sync(2) will flush the whole buffer cache. No need to do
4487 * it here separately for each inode.
4488 */
4489 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4460 sync_dirty_buffer(iloc.bh); 4490 sync_dirty_buffer(iloc.bh);
4461 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 4491 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
4462 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, 4492 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a2a837f00407..0f2252ec274d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb,
104 struct ext4_inode_info *ei_bl; 104 struct ext4_inode_info *ei_bl;
105 struct ext4_sb_info *sbi = EXT4_SB(sb); 105 struct ext4_sb_info *sbi = EXT4_SB(sb);
106 106
107 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { 107 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
108 err = -EINVAL; 108 return -EINVAL;
109 goto swap_boot_out;
110 }
111 109
112 if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { 110 if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
113 err = -EPERM; 111 return -EPERM;
114 goto swap_boot_out;
115 }
116 112
117 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); 113 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
118 if (IS_ERR(inode_bl)) { 114 if (IS_ERR(inode_bl))
119 err = PTR_ERR(inode_bl); 115 return PTR_ERR(inode_bl);
120 goto swap_boot_out;
121 }
122 ei_bl = EXT4_I(inode_bl); 116 ei_bl = EXT4_I(inode_bl);
123 117
124 filemap_flush(inode->i_mapping); 118 filemap_flush(inode->i_mapping);
@@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
193 ext4_mark_inode_dirty(handle, inode); 187 ext4_mark_inode_dirty(handle, inode);
194 } 188 }
195 } 189 }
196
197 ext4_journal_stop(handle); 190 ext4_journal_stop(handle);
198
199 ext4_double_up_write_data_sem(inode, inode_bl); 191 ext4_double_up_write_data_sem(inode, inode_bl);
200 192
201journal_err_out: 193journal_err_out:
202 ext4_inode_resume_unlocked_dio(inode); 194 ext4_inode_resume_unlocked_dio(inode);
203 ext4_inode_resume_unlocked_dio(inode_bl); 195 ext4_inode_resume_unlocked_dio(inode_bl);
204
205 unlock_two_nondirectories(inode, inode_bl); 196 unlock_two_nondirectories(inode, inode_bl);
206
207 iput(inode_bl); 197 iput(inode_bl);
208
209swap_boot_out:
210 return err; 198 return err;
211} 199}
212 200
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 04a5c7504be9..a888cac76e9c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1808,6 +1808,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1808 ext4_lock_group(ac->ac_sb, group); 1808 ext4_lock_group(ac->ac_sb, group);
1809 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 1809 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1810 ac->ac_g_ex.fe_len, &ex); 1810 ac->ac_g_ex.fe_len, &ex);
1811 ex.fe_logical = 0xDEADFA11; /* debug value */
1811 1812
1812 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1813 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1813 ext4_fsblk_t start; 1814 ext4_fsblk_t start;
@@ -1936,7 +1937,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1936 */ 1937 */
1937 break; 1938 break;
1938 } 1939 }
1939 1940 ex.fe_logical = 0xDEADC0DE; /* debug value */
1940 ext4_mb_measure_extent(ac, &ex, e4b); 1941 ext4_mb_measure_extent(ac, &ex, e4b);
1941 1942
1942 i += ex.fe_len; 1943 i += ex.fe_len;
@@ -1977,6 +1978,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1977 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); 1978 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
1978 if (max >= sbi->s_stripe) { 1979 if (max >= sbi->s_stripe) {
1979 ac->ac_found++; 1980 ac->ac_found++;
1981 ex.fe_logical = 0xDEADF00D; /* debug value */
1980 ac->ac_b_ex = ex; 1982 ac->ac_b_ex = ex;
1981 ext4_mb_use_best_found(ac, e4b); 1983 ext4_mb_use_best_found(ac, e4b);
1982 break; 1984 break;
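Note: the 0xDEAD... constants written to ex.fe_logical are recognizable poison values, presumably because mb_find_extent() fills in fe_start and fe_len but not fe_logical; seeding the field with an obvious marker makes any consumer of the uninitialized logical block stand out in debugging output instead of silently using stale stack contents.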
@@ -4006,8 +4008,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4006 (unsigned long)ac->ac_b_ex.fe_len, 4008 (unsigned long)ac->ac_b_ex.fe_len,
4007 (unsigned long)ac->ac_b_ex.fe_logical, 4009 (unsigned long)ac->ac_b_ex.fe_logical,
4008 (int)ac->ac_criteria); 4010 (int)ac->ac_criteria);
4009 ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", 4011 ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4010 ac->ac_ex_scanned, ac->ac_found);
4011 ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); 4012 ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4012 ngroups = ext4_get_groups_count(sb); 4013 ngroups = ext4_get_groups_count(sb);
4013 for (i = 0; i < ngroups; i++) { 4014 for (i = 0; i < ngroups; i++) {
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 08481ee84cd5..d634e183b4d4 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug;
48 } \ 48 } \
49 } while (0) 49 } while (0)
50#else 50#else
51#define mb_debug(n, fmt, a...) 51#define mb_debug(n, fmt, a...) no_printk(fmt, ## a)
52#endif 52#endif
53 53
54#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 54#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
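Note: switching the stub from an empty macro to no_printk() keeps the format string and arguments visible to the compiler even when mballoc debugging is compiled out, so type mismatches are diagnosed in every configuration. A hypothetical call that now draws a warning in both builds:

    mb_debug(1, "group %u\n", (char *)name);    /* %u vs. char *: format warning */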
@@ -175,8 +175,6 @@ struct ext4_allocation_context {
175 /* copy of the best found extent taken before preallocation efforts */ 175 /* copy of the best found extent taken before preallocation efforts */
176 struct ext4_free_extent ac_f_ex; 176 struct ext4_free_extent ac_f_ex;
177 177
178 /* number of iterations done. we have to track to limit searching */
179 unsigned long ac_ex_scanned;
180 __u16 ac_groups_scanned; 178 __u16 ac_groups_scanned;
181 __u16 ac_found; 179 __u16 ac_found;
182 __u16 ac_tail; 180 __u16 ac_tail;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 773b503bd18c..58ee7dc87669 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
76 * ext4_ext_path structure refers to the last extent, or a negative error 76 * ext4_ext_path structure refers to the last extent, or a negative error
77 * value on failure. 77 * value on failure.
78 */ 78 */
79static int 79int
80mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 80mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
81 struct ext4_extent **extent) 81 struct ext4_extent **extent)
82{ 82{
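Note: mext_next_extent() loses its static qualifier because ext4_ext_shift_extents() in extents.c now uses it to step over holes while walking the extent tree (see the mext_next_extent() call in the collapse-range code above).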
@@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
861 } 861 }
862 if (!buffer_mapped(bh)) { 862 if (!buffer_mapped(bh)) {
863 zero_user(page, block_start, blocksize); 863 zero_user(page, block_start, blocksize);
864 if (!err) 864 set_buffer_uptodate(bh);
865 set_buffer_uptodate(bh);
866 continue; 865 continue;
867 } 866 }
868 } 867 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d050e043e884..1cb84f78909e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3000,6 +3000,154 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
3000 return ext4_get_first_inline_block(inode, parent_de, retval); 3000 return ext4_get_first_inline_block(inode, parent_de, retval);
3001} 3001}
3002 3002
3003struct ext4_renament {
3004 struct inode *dir;
3005 struct dentry *dentry;
3006 struct inode *inode;
3007 bool is_dir;
3008 int dir_nlink_delta;
3009
3010 /* entry for "dentry" */
3011 struct buffer_head *bh;
3012 struct ext4_dir_entry_2 *de;
3013 int inlined;
3014
3015 /* entry for ".." in inode if it's a directory */
3016 struct buffer_head *dir_bh;
3017 struct ext4_dir_entry_2 *parent_de;
3018 int dir_inlined;
3019};
3020
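Note: struct ext4_renament gathers the per-side rename state (directory, dentry, inode, the directory entry's buffer and pointer, and the ".." entry when the renamed inode is a directory) that ext4_rename() previously tracked in roughly a dozen local variables, so the helpers that follow can serve both the classic rename path and the new cross-rename path.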
3021static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
3022{
3023 int retval;
3024
3025 ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
3026 &retval, &ent->parent_de,
3027 &ent->dir_inlined);
3028 if (!ent->dir_bh)
3029 return retval;
3030 if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
3031 return -EIO;
3032 BUFFER_TRACE(ent->dir_bh, "get_write_access");
3033 return ext4_journal_get_write_access(handle, ent->dir_bh);
3034}
3035
3036static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
3037 unsigned dir_ino)
3038{
3039 int retval;
3040
3041 ent->parent_de->inode = cpu_to_le32(dir_ino);
3042 BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
3043 if (!ent->dir_inlined) {
3044 if (is_dx(ent->inode)) {
3045 retval = ext4_handle_dirty_dx_node(handle,
3046 ent->inode,
3047 ent->dir_bh);
3048 } else {
3049 retval = ext4_handle_dirty_dirent_node(handle,
3050 ent->inode,
3051 ent->dir_bh);
3052 }
3053 } else {
3054 retval = ext4_mark_inode_dirty(handle, ent->inode);
3055 }
3056 if (retval) {
3057 ext4_std_error(ent->dir->i_sb, retval);
3058 return retval;
3059 }
3060 return 0;
3061}
3062
3063static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
3064 unsigned ino, unsigned file_type)
3065{
3066 int retval;
3067
3068 BUFFER_TRACE(ent->bh, "get write access");
3069 retval = ext4_journal_get_write_access(handle, ent->bh);
3070 if (retval)
3071 return retval;
3072 ent->de->inode = cpu_to_le32(ino);
3073 if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
3074 EXT4_FEATURE_INCOMPAT_FILETYPE))
3075 ent->de->file_type = file_type;
3076 ent->dir->i_version++;
3077 ent->dir->i_ctime = ent->dir->i_mtime =
3078 ext4_current_time(ent->dir);
3079 ext4_mark_inode_dirty(handle, ent->dir);
3080 BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
3081 if (!ent->inlined) {
3082 retval = ext4_handle_dirty_dirent_node(handle,
3083 ent->dir, ent->bh);
3084 if (unlikely(retval)) {
3085 ext4_std_error(ent->dir->i_sb, retval);
3086 return retval;
3087 }
3088 }
3089 brelse(ent->bh);
3090 ent->bh = NULL;
3091
3092 return 0;
3093}
3094
3095static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
3096 const struct qstr *d_name)
3097{
3098 int retval = -ENOENT;
3099 struct buffer_head *bh;
3100 struct ext4_dir_entry_2 *de;
3101
3102 bh = ext4_find_entry(dir, d_name, &de, NULL);
3103 if (bh) {
3104 retval = ext4_delete_entry(handle, dir, de, bh);
3105 brelse(bh);
3106 }
3107 return retval;
3108}
3109
3110static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
3111{
3112 int retval;
3113 /*
3114 * ent->de could have moved from under us during htree split, so make
3115 * sure that we are deleting the right entry. We might also be pointing
3116 * to a stale entry in the unused part of ent->bh so just checking inum
3117 * and the name isn't enough.
3118 */
3119 if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
3120 ent->de->name_len != ent->dentry->d_name.len ||
3121 strncmp(ent->de->name, ent->dentry->d_name.name,
3122 ent->de->name_len)) {
3123 retval = ext4_find_delete_entry(handle, ent->dir,
3124 &ent->dentry->d_name);
3125 } else {
3126 retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
3127 if (retval == -ENOENT) {
3128 retval = ext4_find_delete_entry(handle, ent->dir,
3129 &ent->dentry->d_name);
3130 }
3131 }
3132
3133 if (retval) {
3134 ext4_warning(ent->dir->i_sb,
3135 "Deleting old file (%lu), %d, error=%d",
3136 ent->dir->i_ino, ent->dir->i_nlink, retval);
3137 }
3138}
3139
3140static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
3141{
3142 if (ent->dir_nlink_delta) {
3143 if (ent->dir_nlink_delta == -1)
3144 ext4_dec_count(handle, ent->dir);
3145 else
3146 ext4_inc_count(handle, ent->dir);
3147 ext4_mark_inode_dirty(handle, ent->dir);
3148 }
3149}
3150
3003/* 3151/*
3004 * Anybody can rename anything with this: the permission checks are left to the 3152 * Anybody can rename anything with this: the permission checks are left to the
3005 * higher-level routines. 3153 * higher-level routines.
@@ -3012,198 +3160,267 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3012 struct inode *new_dir, struct dentry *new_dentry) 3160 struct inode *new_dir, struct dentry *new_dentry)
3013{ 3161{
3014 handle_t *handle = NULL; 3162 handle_t *handle = NULL;
3015 struct inode *old_inode, *new_inode; 3163 struct ext4_renament old = {
3016 struct buffer_head *old_bh, *new_bh, *dir_bh; 3164 .dir = old_dir,
3017 struct ext4_dir_entry_2 *old_de, *new_de; 3165 .dentry = old_dentry,
3166 .inode = old_dentry->d_inode,
3167 };
3168 struct ext4_renament new = {
3169 .dir = new_dir,
3170 .dentry = new_dentry,
3171 .inode = new_dentry->d_inode,
3172 };
3018 int retval; 3173 int retval;
3019 int inlined = 0, new_inlined = 0;
3020 struct ext4_dir_entry_2 *parent_de;
3021 3174
3022 dquot_initialize(old_dir); 3175 dquot_initialize(old.dir);
3023 dquot_initialize(new_dir); 3176 dquot_initialize(new.dir);
3024
3025 old_bh = new_bh = dir_bh = NULL;
3026 3177
3027 /* Initialize quotas before so that eventual writes go 3178 /* Initialize quotas before so that eventual writes go
3028 * in separate transaction */ 3179 * in separate transaction */
3029 if (new_dentry->d_inode) 3180 if (new.inode)
3030 dquot_initialize(new_dentry->d_inode); 3181 dquot_initialize(new.inode);
3031 3182
3032 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); 3183 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
3033 /* 3184 /*
3034 * Check for inode number is _not_ due to possible IO errors. 3185 * Check for inode number is _not_ due to possible IO errors.
3035 * We might rmdir the source, keep it as pwd of some process 3186 * We might rmdir the source, keep it as pwd of some process
3036 * and merrily kill the link to whatever was created under the 3187 * and merrily kill the link to whatever was created under the
3037 * same name. Goodbye sticky bit ;-< 3188 * same name. Goodbye sticky bit ;-<
3038 */ 3189 */
3039 old_inode = old_dentry->d_inode;
3040 retval = -ENOENT; 3190 retval = -ENOENT;
3041 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) 3191 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
3042 goto end_rename; 3192 goto end_rename;
3043 3193
3044 new_inode = new_dentry->d_inode; 3194 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3045 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, 3195 &new.de, &new.inlined);
3046 &new_de, &new_inlined); 3196 if (new.bh) {
3047 if (new_bh) { 3197 if (!new.inode) {
3048 if (!new_inode) { 3198 brelse(new.bh);
3049 brelse(new_bh); 3199 new.bh = NULL;
3050 new_bh = NULL;
3051 } 3200 }
3052 } 3201 }
3053 if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) 3202 if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
3054 ext4_alloc_da_blocks(old_inode); 3203 ext4_alloc_da_blocks(old.inode);
3055 3204
3056 handle = ext4_journal_start(old_dir, EXT4_HT_DIR, 3205 handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
3057 (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + 3206 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3058 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); 3207 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3059 if (IS_ERR(handle)) 3208 if (IS_ERR(handle))
3060 return PTR_ERR(handle); 3209 return PTR_ERR(handle);
3061 3210
3062 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 3211 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3063 ext4_handle_sync(handle); 3212 ext4_handle_sync(handle);
3064 3213
3065 if (S_ISDIR(old_inode->i_mode)) { 3214 if (S_ISDIR(old.inode->i_mode)) {
3066 if (new_inode) { 3215 if (new.inode) {
3067 retval = -ENOTEMPTY; 3216 retval = -ENOTEMPTY;
3068 if (!empty_dir(new_inode)) 3217 if (!empty_dir(new.inode))
3218 goto end_rename;
3219 } else {
3220 retval = -EMLINK;
3221 if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
3069 goto end_rename; 3222 goto end_rename;
3070 } 3223 }
3071 retval = -EIO; 3224 retval = ext4_rename_dir_prepare(handle, &old);
3072 dir_bh = ext4_get_first_dir_block(handle, old_inode,
3073 &retval, &parent_de,
3074 &inlined);
3075 if (!dir_bh)
3076 goto end_rename;
3077 if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
3078 goto end_rename;
3079 retval = -EMLINK;
3080 if (!new_inode && new_dir != old_dir &&
3081 EXT4_DIR_LINK_MAX(new_dir))
3082 goto end_rename;
3083 BUFFER_TRACE(dir_bh, "get_write_access");
3084 retval = ext4_journal_get_write_access(handle, dir_bh);
3085 if (retval) 3225 if (retval)
3086 goto end_rename; 3226 goto end_rename;
3087 } 3227 }
3088 if (!new_bh) { 3228 if (!new.bh) {
3089 retval = ext4_add_entry(handle, new_dentry, old_inode); 3229 retval = ext4_add_entry(handle, new.dentry, old.inode);
3090 if (retval) 3230 if (retval)
3091 goto end_rename; 3231 goto end_rename;
3092 } else { 3232 } else {
3093 BUFFER_TRACE(new_bh, "get write access"); 3233 retval = ext4_setent(handle, &new,
3094 retval = ext4_journal_get_write_access(handle, new_bh); 3234 old.inode->i_ino, old.de->file_type);
3095 if (retval) 3235 if (retval)
3096 goto end_rename; 3236 goto end_rename;
3097 new_de->inode = cpu_to_le32(old_inode->i_ino);
3098 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
3099 EXT4_FEATURE_INCOMPAT_FILETYPE))
3100 new_de->file_type = old_de->file_type;
3101 new_dir->i_version++;
3102 new_dir->i_ctime = new_dir->i_mtime =
3103 ext4_current_time(new_dir);
3104 ext4_mark_inode_dirty(handle, new_dir);
3105 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
3106 if (!new_inlined) {
3107 retval = ext4_handle_dirty_dirent_node(handle,
3108 new_dir, new_bh);
3109 if (unlikely(retval)) {
3110 ext4_std_error(new_dir->i_sb, retval);
3111 goto end_rename;
3112 }
3113 }
3114 brelse(new_bh);
3115 new_bh = NULL;
3116 } 3237 }
3117 3238
3118 /* 3239 /*
3119 * Like most other Unix systems, set the ctime for inodes on a 3240 * Like most other Unix systems, set the ctime for inodes on a
3120 * rename. 3241 * rename.
3121 */ 3242 */
3122 old_inode->i_ctime = ext4_current_time(old_inode); 3243 old.inode->i_ctime = ext4_current_time(old.inode);
3123 ext4_mark_inode_dirty(handle, old_inode); 3244 ext4_mark_inode_dirty(handle, old.inode);
3124 3245
3125 /* 3246 /*
3126 * ok, that's it 3247 * ok, that's it
3127 */ 3248 */
3128 if (le32_to_cpu(old_de->inode) != old_inode->i_ino || 3249 ext4_rename_delete(handle, &old);
3129 old_de->name_len != old_dentry->d_name.len || 3250
3130 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || 3251 if (new.inode) {
3131 (retval = ext4_delete_entry(handle, old_dir, 3252 ext4_dec_count(handle, new.inode);
3132 old_de, old_bh)) == -ENOENT) { 3253 new.inode->i_ctime = ext4_current_time(new.inode);
3133 /* old_de could have moved from under us during htree split, so
3134 * make sure that we are deleting the right entry. We might
3135 * also be pointing to a stale entry in the unused part of
3136 * old_bh so just checking inum and the name isn't enough. */
3137 struct buffer_head *old_bh2;
3138 struct ext4_dir_entry_2 *old_de2;
3139
3140 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
3141 &old_de2, NULL);
3142 if (old_bh2) {
3143 retval = ext4_delete_entry(handle, old_dir,
3144 old_de2, old_bh2);
3145 brelse(old_bh2);
3146 }
3147 } 3254 }
3148 if (retval) { 3255 old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
3149 ext4_warning(old_dir->i_sb, 3256 ext4_update_dx_flag(old.dir);
3150 "Deleting old file (%lu), %d, error=%d", 3257 if (old.dir_bh) {
3151 old_dir->i_ino, old_dir->i_nlink, retval); 3258 retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
3152 } 3259 if (retval)
3153
3154 if (new_inode) {
3155 ext4_dec_count(handle, new_inode);
3156 new_inode->i_ctime = ext4_current_time(new_inode);
3157 }
3158 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
3159 ext4_update_dx_flag(old_dir);
3160 if (dir_bh) {
3161 parent_de->inode = cpu_to_le32(new_dir->i_ino);
3162 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
3163 if (!inlined) {
3164 if (is_dx(old_inode)) {
3165 retval = ext4_handle_dirty_dx_node(handle,
3166 old_inode,
3167 dir_bh);
3168 } else {
3169 retval = ext4_handle_dirty_dirent_node(handle,
3170 old_inode, dir_bh);
3171 }
3172 } else {
3173 retval = ext4_mark_inode_dirty(handle, old_inode);
3174 }
3175 if (retval) {
3176 ext4_std_error(old_dir->i_sb, retval);
3177 goto end_rename; 3260 goto end_rename;
3178 } 3261
3179 ext4_dec_count(handle, old_dir); 3262 ext4_dec_count(handle, old.dir);
3180 if (new_inode) { 3263 if (new.inode) {
3181 /* checked empty_dir above, can't have another parent, 3264 /* checked empty_dir above, can't have another parent,
3182 * ext4_dec_count() won't work for many-linked dirs */ 3265 * ext4_dec_count() won't work for many-linked dirs */
3183 clear_nlink(new_inode); 3266 clear_nlink(new.inode);
3184 } else { 3267 } else {
3185 ext4_inc_count(handle, new_dir); 3268 ext4_inc_count(handle, new.dir);
3186 ext4_update_dx_flag(new_dir); 3269 ext4_update_dx_flag(new.dir);
3187 ext4_mark_inode_dirty(handle, new_dir); 3270 ext4_mark_inode_dirty(handle, new.dir);
3188 } 3271 }
3189 } 3272 }
3190 ext4_mark_inode_dirty(handle, old_dir); 3273 ext4_mark_inode_dirty(handle, old.dir);
3191 if (new_inode) { 3274 if (new.inode) {
3192 ext4_mark_inode_dirty(handle, new_inode); 3275 ext4_mark_inode_dirty(handle, new.inode);
3193 if (!new_inode->i_nlink) 3276 if (!new.inode->i_nlink)
3194 ext4_orphan_add(handle, new_inode); 3277 ext4_orphan_add(handle, new.inode);
3195 } 3278 }
3196 retval = 0; 3279 retval = 0;
3197 3280
3198end_rename: 3281end_rename:
3199 brelse(dir_bh); 3282 brelse(old.dir_bh);
3200 brelse(old_bh); 3283 brelse(old.bh);
3201 brelse(new_bh); 3284 brelse(new.bh);
3202 if (handle) 3285 if (handle)
3203 ext4_journal_stop(handle); 3286 ext4_journal_stop(handle);
3204 return retval; 3287 return retval;
3205} 3288}
3206 3289
3290static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3291 struct inode *new_dir, struct dentry *new_dentry)
3292{
3293 handle_t *handle = NULL;
3294 struct ext4_renament old = {
3295 .dir = old_dir,
3296 .dentry = old_dentry,
3297 .inode = old_dentry->d_inode,
3298 };
3299 struct ext4_renament new = {
3300 .dir = new_dir,
3301 .dentry = new_dentry,
3302 .inode = new_dentry->d_inode,
3303 };
3304 u8 new_file_type;
3305 int retval;
3306
3307 dquot_initialize(old.dir);
3308 dquot_initialize(new.dir);
3309
3310 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
3311 &old.de, &old.inlined);
3312 /*
3313 * Check for inode number is _not_ due to possible IO errors.
3314 * We might rmdir the source, keep it as pwd of some process
3315 * and merrily kill the link to whatever was created under the
3316 * same name. Goodbye sticky bit ;-<
3317 */
3318 retval = -ENOENT;
3319 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
3320 goto end_rename;
3321
3322 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3323 &new.de, &new.inlined);
3324
3325 /* RENAME_EXCHANGE case: old *and* new must both exist */
3326 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
3327 goto end_rename;
3328
3329 handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
3330 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3331 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3332 if (IS_ERR(handle))
3333 return PTR_ERR(handle);
3334
3335 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3336 ext4_handle_sync(handle);
3337
3338 if (S_ISDIR(old.inode->i_mode)) {
3339 old.is_dir = true;
3340 retval = ext4_rename_dir_prepare(handle, &old);
3341 if (retval)
3342 goto end_rename;
3343 }
3344 if (S_ISDIR(new.inode->i_mode)) {
3345 new.is_dir = true;
3346 retval = ext4_rename_dir_prepare(handle, &new);
3347 if (retval)
3348 goto end_rename;
3349 }
3350
3351 /*
3352 * Other than the special case of overwriting a directory, parents'
3353 * nlink only needs to be modified if this is a cross directory rename.
3354 */
3355 if (old.dir != new.dir && old.is_dir != new.is_dir) {
3356 old.dir_nlink_delta = old.is_dir ? -1 : 1;
3357 new.dir_nlink_delta = -old.dir_nlink_delta;
3358 retval = -EMLINK;
3359 if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
3360 (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
3361 goto end_rename;
3362 }
3363
3364 new_file_type = new.de->file_type;
3365 retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
3366 if (retval)
3367 goto end_rename;
3368
3369 retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
3370 if (retval)
3371 goto end_rename;
3372
3373 /*
3374 * Like most other Unix systems, set the ctime for inodes on a
3375 * rename.
3376 */
3377 old.inode->i_ctime = ext4_current_time(old.inode);
3378 new.inode->i_ctime = ext4_current_time(new.inode);
3379 ext4_mark_inode_dirty(handle, old.inode);
3380 ext4_mark_inode_dirty(handle, new.inode);
3381
3382 if (old.dir_bh) {
3383 retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
3384 if (retval)
3385 goto end_rename;
3386 }
3387 if (new.dir_bh) {
3388 retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
3389 if (retval)
3390 goto end_rename;
3391 }
3392 ext4_update_dir_count(handle, &old);
3393 ext4_update_dir_count(handle, &new);
3394 retval = 0;
3395
3396end_rename:
3397 brelse(old.dir_bh);
3398 brelse(new.dir_bh);
3399 brelse(old.bh);
3400 brelse(new.bh);
3401 if (handle)
3402 ext4_journal_stop(handle);
3403 return retval;
3404}
3405
3406static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
3407 struct inode *new_dir, struct dentry *new_dentry,
3408 unsigned int flags)
3409{
3410 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
3411 return -EINVAL;
3412
3413 if (flags & RENAME_EXCHANGE) {
3414 return ext4_cross_rename(old_dir, old_dentry,
3415 new_dir, new_dentry);
3416 }
3417 /*
3418 * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE"
3419 * is equivalent to regular rename.
3420 */
3421 return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
3422}
3423
3207/* 3424/*
3208 * directories can handle most operations... 3425 * directories can handle most operations...
3209 */ 3426 */
@@ -3218,6 +3435,7 @@ const struct inode_operations ext4_dir_inode_operations = {
3218 .mknod = ext4_mknod, 3435 .mknod = ext4_mknod,
3219 .tmpfile = ext4_tmpfile, 3436 .tmpfile = ext4_tmpfile,
3220 .rename = ext4_rename, 3437 .rename = ext4_rename,
3438 .rename2 = ext4_rename2,
3221 .setattr = ext4_setattr, 3439 .setattr = ext4_setattr,
3222 .setxattr = generic_setxattr, 3440 .setxattr = generic_setxattr,
3223 .getxattr = generic_getxattr, 3441 .getxattr = generic_getxattr,
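From userspace the exchange path is reached through renameat2(2). A minimal sketch, assuming kernel headers new enough to define SYS_renameat2 (glibc had no wrapper at the time) and hypothetical file names:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef RENAME_EXCHANGE
	#define RENAME_EXCHANGE (1 << 1)	/* from <linux/fs.h> */
	#endif

	int main(void)
	{
		/* Atomically swap the names "a" and "b"; in the kernel this
		 * reaches ext4_rename2() -> ext4_cross_rename(), which
		 * updates both directory entries inside one transaction. */
		if (syscall(SYS_renameat2, AT_FDCWD, "a", AT_FDCWD, "b",
			    RENAME_EXCHANGE) < 0)
			perror("renameat2");
		return 0;
	}

With RENAME_NOREPLACE the VFS has already verified that the target does not exist, which is why ext4_rename2() can fall through to plain ext4_rename() for that flag.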
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 710fed2377d4..f3c667091618 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -59,6 +59,7 @@ static struct kset *ext4_kset;
59static struct ext4_lazy_init *ext4_li_info; 59static struct ext4_lazy_init *ext4_li_info;
60static struct mutex ext4_li_mtx; 60static struct mutex ext4_li_mtx;
61static struct ext4_features *ext4_feat; 61static struct ext4_features *ext4_feat;
62static int ext4_mballoc_ready;
62 63
63static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 64static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
64 unsigned long journal_devnum); 65 unsigned long journal_devnum);
@@ -845,6 +846,10 @@ static void ext4_put_super(struct super_block *sb)
845 invalidate_bdev(sbi->journal_bdev); 846 invalidate_bdev(sbi->journal_bdev);
846 ext4_blkdev_remove(sbi); 847 ext4_blkdev_remove(sbi);
847 } 848 }
849 if (sbi->s_mb_cache) {
850 ext4_xattr_destroy_cache(sbi->s_mb_cache);
851 sbi->s_mb_cache = NULL;
852 }
848 if (sbi->s_mmp_tsk) 853 if (sbi->s_mmp_tsk)
849 kthread_stop(sbi->s_mmp_tsk); 854 kthread_stop(sbi->s_mmp_tsk);
850 sb->s_fs_info = NULL; 855 sb->s_fs_info = NULL;
@@ -940,7 +945,7 @@ static void init_once(void *foo)
940 inode_init_once(&ei->vfs_inode); 945 inode_init_once(&ei->vfs_inode);
941} 946}
942 947
943static int init_inodecache(void) 948static int __init init_inodecache(void)
944{ 949{
945 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", 950 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
946 sizeof(struct ext4_inode_info), 951 sizeof(struct ext4_inode_info),
@@ -3575,6 +3580,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3575 "feature flags set on rev 0 fs, " 3580 "feature flags set on rev 0 fs, "
3576 "running e2fsck is recommended"); 3581 "running e2fsck is recommended");
3577 3582
3583 if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
3584 set_opt2(sb, HURD_COMPAT);
3585 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
3586 EXT4_FEATURE_INCOMPAT_64BIT)) {
3587 ext4_msg(sb, KERN_ERR,
3588 "The Hurd can't support 64-bit file systems");
3589 goto failed_mount;
3590 }
3591 }
3592
3578 if (IS_EXT2_SB(sb)) { 3593 if (IS_EXT2_SB(sb)) {
3579 if (ext2_feature_set_ok(sb)) 3594 if (ext2_feature_set_ok(sb))
3580 ext4_msg(sb, KERN_INFO, "mounting ext2 file system " 3595 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
@@ -4010,6 +4025,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4010 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); 4025 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
4011 4026
4012no_journal: 4027no_journal:
4028 if (ext4_mballoc_ready) {
4029 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
4030 if (!sbi->s_mb_cache) {
4031 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
4032 goto failed_mount_wq;
4033 }
4034 }
4035
4013 /* 4036 /*
4014 * Get the # of file system overhead blocks from the 4037 * Get the # of file system overhead blocks from the
4015 * superblock if present. 4038 * superblock if present.
@@ -4835,6 +4858,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4835 } 4858 }
4836 4859
4837 if (*flags & MS_RDONLY) { 4860 if (*flags & MS_RDONLY) {
4861 err = sync_filesystem(sb);
4862 if (err < 0)
4863 goto restore_opts;
4838 err = dquot_suspend(sb, -1); 4864 err = dquot_suspend(sb, -1);
4839 if (err < 0) 4865 if (err < 0)
4840 goto restore_opts; 4866 goto restore_opts;
@@ -5516,11 +5542,9 @@ static int __init ext4_init_fs(void)
5516 5542
5517 err = ext4_init_mballoc(); 5543 err = ext4_init_mballoc();
5518 if (err) 5544 if (err)
5519 goto out3;
5520
5521 err = ext4_init_xattr();
5522 if (err)
5523 goto out2; 5545 goto out2;
5546 else
5547 ext4_mballoc_ready = 1;
5524 err = init_inodecache(); 5548 err = init_inodecache();
5525 if (err) 5549 if (err)
5526 goto out1; 5550 goto out1;
@@ -5536,10 +5560,9 @@ out:
5536 unregister_as_ext3(); 5560 unregister_as_ext3();
5537 destroy_inodecache(); 5561 destroy_inodecache();
5538out1: 5562out1:
5539 ext4_exit_xattr(); 5563 ext4_mballoc_ready = 0;
5540out2:
5541 ext4_exit_mballoc(); 5564 ext4_exit_mballoc();
5542out3: 5565out2:
5543 ext4_exit_feat_adverts(); 5566 ext4_exit_feat_adverts();
5544out4: 5567out4:
5545 if (ext4_proc_root) 5568 if (ext4_proc_root)
@@ -5562,7 +5585,6 @@ static void __exit ext4_exit_fs(void)
5562 unregister_as_ext3(); 5585 unregister_as_ext3();
5563 unregister_filesystem(&ext4_fs_type); 5586 unregister_filesystem(&ext4_fs_type);
5564 destroy_inodecache(); 5587 destroy_inodecache();
5565 ext4_exit_xattr();
5566 ext4_exit_mballoc(); 5588 ext4_exit_mballoc();
5567 ext4_exit_feat_adverts(); 5589 ext4_exit_feat_adverts();
5568 remove_proc_entry("fs/ext4", NULL); 5590 remove_proc_entry("fs/ext4", NULL);
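Note: despite the name, s_mb_cache is the xattr meta-block cache from fs/mbcache.c, not the multi-block allocator; each filesystem now creates its own cache at mount time (the no_journal hunk above) instead of sharing the old global ext4_xattr_cache, which is why ext4_init_xattr()/ext4_exit_xattr() disappear from module init. The ext4_mballoc_ready flag appears to serve as a proxy that module initialization progressed far enough before ext4_fill_super() runs.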
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e175e94116ac..1f5cf5880718 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,7 +81,7 @@
81# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) 81# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
82#endif 82#endif
83 83
84static void ext4_xattr_cache_insert(struct buffer_head *); 84static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
85static struct buffer_head *ext4_xattr_cache_find(struct inode *, 85static struct buffer_head *ext4_xattr_cache_find(struct inode *,
86 struct ext4_xattr_header *, 86 struct ext4_xattr_header *,
87 struct mb_cache_entry **); 87 struct mb_cache_entry **);
@@ -90,8 +90,6 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
 static int ext4_xattr_list(struct dentry *dentry, char *buffer,
 			   size_t buffer_size);
 
-static struct mb_cache *ext4_xattr_cache;
-
 static const struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 	NULL
 };
 
+#define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
+				inode->i_sb->s_fs_info)->s_mb_cache)
+
 static __le32 ext4_xattr_block_csum(struct inode *inode,
 				    sector_t block_nr,
 				    struct ext4_xattr_header *hdr)
@@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	struct ext4_xattr_entry *entry;
 	size_t size;
 	int error;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
 		  name_index, name, buffer, (long)buffer_size);
@@ -286,7 +288,7 @@ bad_block:
 		error = -EIO;
 		goto cleanup;
 	}
-	ext4_xattr_cache_insert(bh);
+	ext4_xattr_cache_insert(ext4_mb_cache, bh);
 	entry = BFIRST(bh);
 	error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
 	if (error == -EIO)
@@ -409,6 +411,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	struct inode *inode = dentry->d_inode;
 	struct buffer_head *bh = NULL;
 	int error;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
 		  buffer, (long)buffer_size);
@@ -430,7 +433,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 		error = -EIO;
 		goto cleanup;
 	}
-	ext4_xattr_cache_insert(bh);
+	ext4_xattr_cache_insert(ext4_mb_cache, bh);
 	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 
 cleanup:
@@ -526,8 +529,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 {
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
-	ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+	ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
 	error = ext4_journal_get_write_access(handle, bh);
 	if (error)
 		goto out;
@@ -567,12 +571,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
 				    size_t *min_offs, void *base, int *total)
 {
 	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-		*total += EXT4_XATTR_LEN(last->e_name_len);
 		if (!last->e_value_block && last->e_value_size) {
 			size_t offs = le16_to_cpu(last->e_value_offs);
 			if (offs < *min_offs)
 				*min_offs = offs;
 		}
+		if (total)
+			*total += EXT4_XATTR_LEN(last->e_name_len);
 	}
 	return (*min_offs - ((void *)last - base) - sizeof(__u32));
 }
@@ -745,13 +750,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	struct ext4_xattr_search *s = &bs->s;
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
 	if (i->value && i->value_len > sb->s_blocksize)
 		return -ENOSPC;
 	if (s->base) {
-		ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+		ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
 					bs->bh->b_blocknr);
 		error = ext4_journal_get_write_access(handle, bs->bh);
 		if (error)
@@ -769,7 +775,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			if (!IS_LAST_ENTRY(s->first))
 				ext4_xattr_rehash(header(s->base),
 						  s->here);
-			ext4_xattr_cache_insert(bs->bh);
+			ext4_xattr_cache_insert(ext4_mb_cache,
+				bs->bh);
 		}
 		unlock_buffer(bs->bh);
 		if (error == -EIO)
@@ -905,7 +912,7 @@ getblk_failed:
 			memcpy(new_bh->b_data, s->base, new_bh->b_size);
 			set_buffer_uptodate(new_bh);
 			unlock_buffer(new_bh);
-			ext4_xattr_cache_insert(new_bh);
+			ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
 			error = ext4_handle_dirty_xattr_block(handle,
 							      inode, new_bh);
 			if (error)
@@ -1228,7 +1235,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	struct ext4_xattr_block_find *bs = NULL;
 	char *buffer = NULL, *b_entry_name = NULL;
 	size_t min_offs, free;
-	int total_ino, total_blk;
+	int total_ino;
 	void *base, *start, *end;
 	int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
 	int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1286,8 +1293,7 @@ retry:
 		first = BFIRST(bh);
 		end = bh->b_data + bh->b_size;
 		min_offs = end - base;
-		free = ext4_xattr_free_space(first, &min_offs, base,
-					     &total_blk);
+		free = ext4_xattr_free_space(first, &min_offs, base, NULL);
 		if (free < new_extra_isize) {
 			if (!tried_min_extra_isize && s_min_extra_isize) {
 				tried_min_extra_isize++;
@@ -1495,13 +1501,13 @@ ext4_xattr_put_super(struct super_block *sb)
  * Returns 0, or a negative error number on failure.
  */
 static void
-ext4_xattr_cache_insert(struct buffer_head *bh)
+ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
 {
 	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
 	struct mb_cache_entry *ce;
 	int error;
 
-	ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS);
+	ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
 	if (!ce) {
 		ea_bdebug(bh, "out of memory");
 		return;
@@ -1573,12 +1579,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
 {
 	__u32 hash = le32_to_cpu(header->h_hash);
 	struct mb_cache_entry *ce;
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 	if (!header->h_hash)
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
+	ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
 				       hash);
 	while (ce) {
 		struct buffer_head *bh;
@@ -1676,19 +1683,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 
 #undef BLOCK_HASH_SHIFT
 
-int __init
-ext4_init_xattr(void)
+#define	HASH_BUCKET_BITS	10
+
+struct mb_cache *
+ext4_xattr_create_cache(char *name)
 {
-	ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
-	if (!ext4_xattr_cache)
-		return -ENOMEM;
-	return 0;
+	return mb_cache_create(name, HASH_BUCKET_BITS);
 }
 
-void
-ext4_exit_xattr(void)
+void ext4_xattr_destroy_cache(struct mb_cache *cache)
 {
-	if (ext4_xattr_cache)
-		mb_cache_destroy(ext4_xattr_cache);
-	ext4_xattr_cache = NULL;
+	if (cache)
+		mb_cache_destroy(cache);
 }
+
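With the cache now hanging off the superblock, every converted call site in xattr.c resolves it through the inode; the per-mount cache is also created with HASH_BUCKET_BITS = 10, i.e. 1 << 10 = 1024 hash buckets, where the old shared cache used 1 << 6 = 64. A minimal sketch of the call-site pattern, using only the macro and helper added above (the example_* name is hypothetical):

	static void example_cache_block(struct inode *inode, struct buffer_head *bh)
	{
		/* per-mount cache, fetched from the inode's superblock info */
		struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);

		ext4_xattr_cache_insert(ext4_mb_cache, bh);
	}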
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 819d6398833f..29bedf5589f6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -110,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *);
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
 
-extern int __init ext4_init_xattr(void);
-extern void ext4_exit_xattr(void);
-
 extern const struct xattr_handler *ext4_xattr_handlers[];
 
 extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
@@ -124,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 				struct ext4_xattr_info *i,
 				struct ext4_xattr_ibody_find *is);
 
+extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern void ext4_xattr_destroy_cache(struct mb_cache *);
+
 #ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
 				struct inode *dir, const struct qstr *qstr);
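The header now exports a create/destroy pair instead of module-wide init/exit hooks; the destroy side is expected to run at unmount with the pointer stored in the superblock. A hedged sketch of that pairing, implied by the new API (example_put_super() is hypothetical):

	static void example_put_super(struct super_block *sb)
	{
		struct ext4_sb_info *sbi = EXT4_SB(sb);

		/* ext4_xattr_destroy_cache() tolerates a NULL cache */
		ext4_xattr_destroy_cache(sbi->s_mb_cache);
		sbi->s_mb_cache = NULL;
	}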
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index fa8da4cb8c4b..e93e4ec7d165 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -174,7 +174,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
 
 	retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
-		value = kmalloc(retval, GFP_KERNEL);
+		value = kmalloc(retval, GFP_F2FS_ZERO);
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		retval = f2fs_getxattr(inode, name_index, "", value, retval);
@@ -203,6 +203,12 @@ static int __f2fs_set_acl(struct inode *inode, int type,
 	size_t size = 0;
 	int error;
 
+	if (acl) {
+		error = posix_acl_valid(acl);
+		if (error < 0)
+			return error;
+	}
+
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
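The acl.c hunk validates a caller-supplied ACL before any xattr state is touched, so a malformed ACL can no longer be half-applied. A minimal sketch of the guard, assuming the one-argument posix_acl_valid() of this kernel generation (example_set_acl() is hypothetical):

	static int example_set_acl(struct inode *inode, struct posix_acl *acl)
	{
		if (acl) {
			int error = posix_acl_valid(acl);

			if (error < 0)
				return error;	/* reject before serializing */
		}
		/* ... map the ACL type to an xattr index and store it ... */
		return 0;
	}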
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 293d0486a40f..4aa521aa9bc3 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -33,14 +33,12 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
 	struct address_space *mapping = META_MAPPING(sbi);
 	struct page *page = NULL;
 repeat:
-	page = grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 	if (!page) {
 		cond_resched();
 		goto repeat;
 	}
 
-	/* We wait writeback only inside grab_meta_page() */
-	wait_on_page_writeback(page);
 	SetPageUptodate(page);
 	return page;
 }
@@ -75,23 +73,102 @@ out:
 	return page;
 }
 
+inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+{
+	switch (type) {
+	case META_NAT:
+		return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK;
+	case META_SIT:
+		return SIT_BLK_CNT(sbi);
+	case META_SSA:
+	case META_CP:
+		return 0;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Readahead CP/NAT/SIT/SSA pages
+ */
+int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
+{
+	block_t prev_blk_addr = 0;
+	struct page *page;
+	int blkno = start;
+	int max_blks = get_max_meta_blks(sbi, type);
+
+	struct f2fs_io_info fio = {
+		.type = META,
+		.rw = READ_SYNC | REQ_META | REQ_PRIO
+	};
+
+	for (; nrpages-- > 0; blkno++) {
+		block_t blk_addr;
+
+		switch (type) {
+		case META_NAT:
+			/* get nat block addr */
+			if (unlikely(blkno >= max_blks))
+				blkno = 0;
+			blk_addr = current_nat_addr(sbi,
+					blkno * NAT_ENTRY_PER_BLOCK);
+			break;
+		case META_SIT:
+			/* get sit block addr */
+			if (unlikely(blkno >= max_blks))
+				goto out;
+			blk_addr = current_sit_addr(sbi,
+					blkno * SIT_ENTRY_PER_BLOCK);
+			if (blkno != start && prev_blk_addr + 1 != blk_addr)
+				goto out;
+			prev_blk_addr = blk_addr;
+			break;
+		case META_SSA:
+		case META_CP:
+			/* get ssa/cp block addr */
+			blk_addr = blkno;
+			break;
+		default:
+			BUG();
+		}
+
+		page = grab_cache_page(META_MAPPING(sbi), blk_addr);
+		if (!page)
+			continue;
+		if (PageUptodate(page)) {
+			mark_page_accessed(page);
+			f2fs_put_page(page, 1);
+			continue;
+		}
+
+		f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
+		mark_page_accessed(page);
+		f2fs_put_page(page, 0);
+	}
+out:
+	f2fs_submit_merged_bio(sbi, META, READ);
+	return blkno - start;
+}
+
 static int f2fs_write_meta_page(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
-	/* Should not write any meta pages, if any IO error was occurred */
-	if (unlikely(sbi->por_doing ||
-			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+	if (unlikely(sbi->por_doing))
 		goto redirty_out;
-
 	if (wbc->for_reclaim)
 		goto redirty_out;
 
-	wait_on_page_writeback(page);
+	/* Should not write any meta pages, if any IO error was occurred */
+	if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+		goto no_write;
 
+	f2fs_wait_on_page_writeback(page, META);
 	write_meta_page(sbi, page);
+no_write:
 	dec_page_count(sbi, F2FS_DIRTY_META);
 	unlock_page(page);
 	return 0;
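ra_meta_pages() above issues one merged, high-priority read for a run of meta blocks, and for SIT it stops as soon as blocks stop being physically contiguous (prev_blk_addr + 1 != blk_addr), so the readahead stays a single sequential sweep. A hedged usage sketch matching how recover_orphan_inodes() uses it later in this file (example_scan() is hypothetical):

	static void example_scan(struct f2fs_sb_info *sbi, int start, int nblocks)
	{
		int i;

		/* warm the meta page cache with one merged bio */
		ra_meta_pages(sbi, start, nblocks, META_CP);

		for (i = 0; i < nblocks; i++) {
			struct page *page = get_meta_page(sbi, start + i);

			/* ... parse the now-cached block ... */
			f2fs_put_page(page, 1);
		}
	}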
@@ -99,6 +176,7 @@ static int f2fs_write_meta_page(struct page *page,
 redirty_out:
 	dec_page_count(sbi, F2FS_DIRTY_META);
 	wbc->pages_skipped++;
+	account_page_redirty(page);
 	set_page_dirty(page);
 	return AOP_WRITEPAGE_ACTIVATE;
 }
@@ -107,21 +185,23 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
 				struct writeback_control *wbc)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-	int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
-	long written;
-
-	if (wbc->for_kupdate)
-		return 0;
+	long diff, written;
 
 	/* collect a number of dirty meta pages and write together */
-	if (get_pages(sbi, F2FS_DIRTY_META) < nrpages)
-		return 0;
+	if (wbc->for_kupdate ||
+		get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
+		goto skip_write;
 
 	/* if mounting is failed, skip writing node pages */
 	mutex_lock(&sbi->cp_mutex);
-	written = sync_meta_pages(sbi, META, nrpages);
+	diff = nr_pages_to_write(sbi, META, wbc);
+	written = sync_meta_pages(sbi, META, wbc->nr_to_write);
 	mutex_unlock(&sbi->cp_mutex);
-	wbc->nr_to_write -= written;
+	wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
+	return 0;
+
+skip_write:
+	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
 	return 0;
 }
 
@@ -148,10 +228,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
+
 			lock_page(page);
-			f2fs_bug_on(page->mapping != mapping);
-			f2fs_bug_on(!PageDirty(page));
-			clear_page_dirty_for_io(page);
+
+			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+				unlock_page(page);
+				continue;
+			}
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
 			if (f2fs_write_meta_page(page, &wbc)) {
 				unlock_page(page);
 				break;
@@ -216,16 +308,15 @@ void release_orphan_inode(struct f2fs_sb_info *sbi)
 
 void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
-	struct list_head *head, *this;
-	struct orphan_inode_entry *new = NULL, *orphan = NULL;
+	struct list_head *head;
+	struct orphan_inode_entry *new, *orphan;
 
 	new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
 	new->ino = ino;
 
 	spin_lock(&sbi->orphan_inode_lock);
 	head = &sbi->orphan_inode_list;
-	list_for_each(this, head) {
-		orphan = list_entry(this, struct orphan_inode_entry, list);
+	list_for_each_entry(orphan, head, list) {
 		if (orphan->ino == ino) {
 			spin_unlock(&sbi->orphan_inode_lock);
 			kmem_cache_free(orphan_entry_slab, new);
@@ -234,14 +325,10 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 
 		if (orphan->ino > ino)
 			break;
-		orphan = NULL;
 	}
 
-	/* add new_oentry into list which is sorted by inode number */
-	if (orphan)
-		list_add(&new->list, this->prev);
-	else
-		list_add_tail(&new->list, head);
+	/* add new orphan entry into list which is sorted by inode number */
+	list_add_tail(&new->list, &orphan->list);
 	spin_unlock(&sbi->orphan_inode_lock);
 }
@@ -255,10 +342,11 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 	list_for_each_entry(orphan, head, list) {
 		if (orphan->ino == ino) {
 			list_del(&orphan->list);
-			kmem_cache_free(orphan_entry_slab, orphan);
 			f2fs_bug_on(sbi->n_orphans == 0);
 			sbi->n_orphans--;
-			break;
+			spin_unlock(&sbi->orphan_inode_lock);
+			kmem_cache_free(orphan_entry_slab, orphan);
+			return;
 		}
 	}
 	spin_unlock(&sbi->orphan_inode_lock);
@@ -285,6 +373,8 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	start_blk = __start_cp_addr(sbi) + 1;
 	orphan_blkaddr = __start_sum_addr(sbi) - 1;
 
+	ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
+
 	for (i = 0; i < orphan_blkaddr; i++) {
 		struct page *page = get_meta_page(sbi, start_blk + i);
 		struct f2fs_orphan_block *orphan_blk;
@@ -466,14 +556,12 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct list_head *head = &sbi->dir_inode_list;
-	struct list_head *this;
+	struct dir_inode_entry *entry;
 
-	list_for_each(this, head) {
-		struct dir_inode_entry *entry;
-		entry = list_entry(this, struct dir_inode_entry, list);
+	list_for_each_entry(entry, head, list)
 		if (unlikely(entry->inode == inode))
 			return -EEXIST;
-	}
+
 	list_add_tail(&new->list, head);
 	stat_inc_dirty_dir(sbi);
 	return 0;
@@ -483,6 +571,7 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct dir_inode_entry *new;
+	int ret = 0;
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
@@ -492,13 +581,13 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
 	INIT_LIST_HEAD(&new->list);
 
 	spin_lock(&sbi->dir_inode_lock);
-	if (__add_dirty_inode(inode, new))
-		kmem_cache_free(inode_entry_slab, new);
-
-	inc_page_count(sbi, F2FS_DIRTY_DENTS);
+	ret = __add_dirty_inode(inode, new);
 	inode_inc_dirty_dents(inode);
 	SetPagePrivate(page);
 	spin_unlock(&sbi->dir_inode_lock);
+
+	if (ret)
+		kmem_cache_free(inode_entry_slab, new);
 }
 
 void add_dirty_dir_inode(struct inode *inode)
@@ -506,44 +595,47 @@ void add_dirty_dir_inode(struct inode *inode)
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct dir_inode_entry *new =
 			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+	int ret = 0;
 
 	new->inode = inode;
 	INIT_LIST_HEAD(&new->list);
 
 	spin_lock(&sbi->dir_inode_lock);
-	if (__add_dirty_inode(inode, new))
-		kmem_cache_free(inode_entry_slab, new);
+	ret = __add_dirty_inode(inode, new);
 	spin_unlock(&sbi->dir_inode_lock);
+
+	if (ret)
+		kmem_cache_free(inode_entry_slab, new);
 }
 
 void remove_dirty_dir_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
-	struct list_head *this, *head;
+	struct list_head *head;
+	struct dir_inode_entry *entry;
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
 	spin_lock(&sbi->dir_inode_lock);
-	if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+	if (get_dirty_dents(inode)) {
 		spin_unlock(&sbi->dir_inode_lock);
 		return;
 	}
 
 	head = &sbi->dir_inode_list;
-	list_for_each(this, head) {
-		struct dir_inode_entry *entry;
-		entry = list_entry(this, struct dir_inode_entry, list);
+	list_for_each_entry(entry, head, list) {
 		if (entry->inode == inode) {
 			list_del(&entry->list);
-			kmem_cache_free(inode_entry_slab, entry);
 			stat_dec_dirty_dir(sbi);
-			break;
+			spin_unlock(&sbi->dir_inode_lock);
+			kmem_cache_free(inode_entry_slab, entry);
+			goto done;
 		}
 	}
 	spin_unlock(&sbi->dir_inode_lock);
 
+done:
 	/* Only from the recovery routine */
 	if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
 		clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
@@ -554,15 +646,14 @@ void remove_dirty_dir_inode(struct inode *inode)
 struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
 
-	struct list_head *this, *head;
+	struct list_head *head;
 	struct inode *inode = NULL;
+	struct dir_inode_entry *entry;
 
 	spin_lock(&sbi->dir_inode_lock);
 
 	head = &sbi->dir_inode_list;
-	list_for_each(this, head) {
-		struct dir_inode_entry *entry;
-		entry = list_entry(this, struct dir_inode_entry, list);
+	list_for_each_entry(entry, head, list) {
 		if (entry->inode->i_ino == ino) {
 			inode = entry->inode;
 			break;
@@ -589,7 +680,7 @@ retry:
 	inode = igrab(entry->inode);
 	spin_unlock(&sbi->dir_inode_lock);
 	if (inode) {
-		filemap_flush(inode->i_mapping);
+		filemap_fdatawrite(inode->i_mapping);
 		iput(inode);
 	} else {
 		/*
@@ -824,6 +915,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	unblock_operations(sbi);
 	mutex_unlock(&sbi->cp_mutex);
 
+	stat_inc_cp_count(sbi->stat_info);
 	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
 }
 
@@ -845,11 +937,11 @@ void init_orphan_info(struct f2fs_sb_info *sbi)
 int __init create_checkpoint_caches(void)
 {
 	orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
-			sizeof(struct orphan_inode_entry), NULL);
+			sizeof(struct orphan_inode_entry));
 	if (!orphan_entry_slab)
 		return -ENOMEM;
 	inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
-			sizeof(struct dir_inode_entry), NULL);
+			sizeof(struct dir_inode_entry));
 	if (!inode_entry_slab) {
 		kmem_cache_destroy(orphan_entry_slab);
 		return -ENOMEM;
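A recurring cleanup throughout checkpoint.c is the conversion from open-coded list_for_each() plus list_entry() to list_for_each_entry(), which walks typed entries directly and drops the scratch list_head pointer. A minimal self-contained sketch of the resulting shape (the example_* names are hypothetical):

	struct example_entry {
		struct list_head list;
		nid_t ino;
	};

	static struct example_entry *example_find(struct list_head *head, nid_t ino)
	{
		struct example_entry *e;

		/* was: list_for_each(this, head) + list_entry(this, ...) */
		list_for_each_entry(e, head, list)
			if (e->ino == ino)
				return e;
		return NULL;
	}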
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2261ccdd0b5f..45abd60e2bff 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err)
 
 static void f2fs_write_end_io(struct bio *bio, int err)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb);
+	struct f2fs_sb_info *sbi = bio->bi_private;
 	struct bio_vec *bvec;
 	int i;
 
@@ -55,15 +55,16 @@ static void f2fs_write_end_io(struct bio *bio, int err)
 		if (unlikely(err)) {
 			SetPageError(page);
 			set_bit(AS_EIO, &page->mapping->flags);
-			set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
-			sbi->sb->s_flags |= MS_RDONLY;
+			f2fs_stop_checkpoint(sbi);
 		}
 		end_page_writeback(page);
 		dec_page_count(sbi, F2FS_WRITEBACK);
 	}
 
-	if (bio->bi_private)
-		complete(bio->bi_private);
+	if (sbi->wait_io) {
+		complete(sbi->wait_io);
+		sbi->wait_io = NULL;
+	}
 
 	if (!get_pages(sbi, F2FS_WRITEBACK) &&
 			!list_empty(&sbi->cp_wait.task_list))
@@ -86,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 	bio->bi_bdev = sbi->sb->s_bdev;
 	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
+	bio->bi_private = sbi;
 
 	return bio;
 }
@@ -113,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
 	 */
 	if (fio->type == META_FLUSH) {
 		DECLARE_COMPLETION_ONSTACK(wait);
-		io->bio->bi_private = &wait;
+		io->sbi->wait_io = &wait;
 		submit_bio(rw, io->bio);
 		wait_for_completion(&wait);
 	} else {
@@ -132,7 +134,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
 
 	io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
 
-	mutex_lock(&io->io_mutex);
+	down_write(&io->io_rwsem);
 
 	/* change META to META_FLUSH in the checkpoint procedure */
 	if (type >= META_FLUSH) {
@@ -140,7 +142,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
 		io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
 	}
 	__submit_merged_bio(io);
-	mutex_unlock(&io->io_mutex);
+	up_write(&io->io_rwsem);
 }
 
 /*
@@ -178,7 +180,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
 
 	verify_block_addr(sbi, blk_addr);
 
-	mutex_lock(&io->io_mutex);
+	down_write(&io->io_rwsem);
 
 	if (!is_read)
 		inc_page_count(sbi, F2FS_WRITEBACK);
@@ -202,7 +204,7 @@ alloc_new:
 
 	io->last_block_in_bio = blk_addr;
 
-	mutex_unlock(&io->io_mutex);
+	up_write(&io->io_rwsem);
 	trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
 }
 
@@ -797,48 +799,36 @@ static int f2fs_write_data_page(struct page *page,
 	 */
 	offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if ((page->index >= end_index + 1) || !offset) {
-		if (S_ISDIR(inode->i_mode)) {
-			dec_page_count(sbi, F2FS_DIRTY_DENTS);
-			inode_dec_dirty_dents(inode);
-		}
+		inode_dec_dirty_dents(inode);
 		goto out;
 	}
 
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
 write:
-	if (unlikely(sbi->por_doing)) {
-		err = AOP_WRITEPAGE_ACTIVATE;
+	if (unlikely(sbi->por_doing))
 		goto redirty_out;
-	}
 
 	/* Dentry blocks are controlled by checkpoint */
 	if (S_ISDIR(inode->i_mode)) {
-		dec_page_count(sbi, F2FS_DIRTY_DENTS);
 		inode_dec_dirty_dents(inode);
 		err = do_write_data_page(page, &fio);
-	} else {
-		f2fs_lock_op(sbi);
-
-		if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
-			err = f2fs_write_inline_data(inode, page, offset);
-			f2fs_unlock_op(sbi);
-			goto out;
-		} else {
-			err = do_write_data_page(page, &fio);
-		}
+		goto done;
+	}
 
-		f2fs_unlock_op(sbi);
+	if (!wbc->for_reclaim)
 		need_balance_fs = true;
-	}
-	if (err == -ENOENT)
-		goto out;
-	else if (err)
+	else if (has_not_enough_free_secs(sbi, 0))
 		goto redirty_out;
 
-	if (wbc->for_reclaim) {
-		f2fs_submit_merged_bio(sbi, DATA, WRITE);
-		need_balance_fs = false;
-	}
+	f2fs_lock_op(sbi);
+	if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode))
+		err = f2fs_write_inline_data(inode, page, offset);
+	else
+		err = do_write_data_page(page, &fio);
+	f2fs_unlock_op(sbi);
+done:
+	if (err && err != -ENOENT)
+		goto redirty_out;
 
 	clear_cold_data(page);
 out:
@@ -849,12 +839,11 @@ out:
 
 redirty_out:
 	wbc->pages_skipped++;
+	account_page_redirty(page);
 	set_page_dirty(page);
-	return err;
+	return AOP_WRITEPAGE_ACTIVATE;
 }
 
-#define MAX_DESIRED_PAGES_WP	4096
-
 static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
 			void *data)
 {
@@ -871,17 +860,17 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	bool locked = false;
 	int ret;
-	long excess_nrtw = 0, desired_nrtw;
+	long diff;
 
 	/* deal with chardevs and other special file */
 	if (!mapping->a_ops->writepage)
 		return 0;
 
-	if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
-		desired_nrtw = MAX_DESIRED_PAGES_WP;
-		excess_nrtw = desired_nrtw - wbc->nr_to_write;
-		wbc->nr_to_write = desired_nrtw;
-	}
+	if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
+			get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA))
+		goto skip_write;
+
+	diff = nr_pages_to_write(sbi, DATA, wbc);
 
 	if (!S_ISDIR(inode->i_mode)) {
 		mutex_lock(&sbi->writepages);
@@ -895,8 +884,12 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 
 	remove_dirty_dir_inode(inode);
 
-	wbc->nr_to_write -= excess_nrtw;
+	wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
 	return ret;
+
+skip_write:
+	wbc->pages_skipped += get_dirty_dents(inode);
+	return 0;
 }
 
 static int f2fs_write_begin(struct file *file, struct address_space *mapping,
@@ -949,13 +942,19 @@ inline_data:
 	if (dn.data_blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 	} else {
-		if (f2fs_has_inline_data(inode))
+		if (f2fs_has_inline_data(inode)) {
 			err = f2fs_read_inline_data(inode, page);
-		else
+			if (err) {
+				page_cache_release(page);
+				return err;
+			}
+		} else {
 			err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
 							READ_SYNC);
-		if (err)
-			return err;
+			if (err)
+				return err;
+		}
+
 		lock_page(page);
 		if (unlikely(!PageUptodate(page))) {
 			f2fs_put_page(page, 1);
@@ -1031,11 +1030,8 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
 				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	if (S_ISDIR(inode->i_mode) && PageDirty(page)) {
-		dec_page_count(sbi, F2FS_DIRTY_DENTS);
+	if (PageDirty(page))
 		inode_dec_dirty_dents(inode);
-	}
 	ClearPagePrivate(page);
 }
 
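data.c converts the per-type bio lock from a mutex to an rw_semaphore: the submission paths above take the write side while mutating the in-flight bio, which leaves room for lookup paths elsewhere to share the read side. A hedged sketch of a read-side user, assuming only the io->bio field of struct f2fs_bio_info (example_is_merged() is hypothetical):

	static bool example_is_merged(struct f2fs_bio_info *io)
	{
		bool pending;

		down_read(&io->io_rwsem);	/* concurrent readers allowed */
		pending = io->bio != NULL;
		up_read(&io->io_rwsem);
		return pending;
	}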
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 3de9d20d0c14..b52c12cf5873 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -86,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_stat_info *si = F2FS_STAT(sbi);
 	unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
-	struct sit_info *sit_i = SIT_I(sbi);
 	unsigned int segno, vblocks;
 	int ndirty = 0;
 
@@ -94,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 	total_vblocks = 0;
 	blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
 	hblks_per_sec = blks_per_sec / 2;
-	mutex_lock(&sit_i->sentry_lock);
 	for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
 		vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
 		dist = abs(vblocks - hblks_per_sec);
@@ -105,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 			ndirty++;
 		}
 	}
-	mutex_unlock(&sit_i->sentry_lock);
 	dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
 	si->bimodal = bimodal / dist;
 	if (si->dirty_count)
@@ -236,6 +233,7 @@ static int stat_show(struct seq_file *s, void *v)
 			   si->dirty_count);
 		seq_printf(s, "  - Prefree: %d\n  - Free: %d (%d)\n\n",
 			   si->prefree_count, si->free_segs, si->free_secs);
+		seq_printf(s, "CP calls: %d\n", si->cp_count);
 		seq_printf(s, "GC calls: %d (BG: %d)\n",
 			   si->call_count, si->bg_gc);
 		seq_printf(s, "  - data segments : %d\n", si->data_segs);
@@ -252,10 +250,10 @@ static int stat_show(struct seq_file *s, void *v)
 			   si->ndirty_dent, si->ndirty_dirs);
 		seq_printf(s, "  - meta: %4d in %4d\n",
 			   si->ndirty_meta, si->meta_pages);
-		seq_printf(s, "  - NATs: %5d > %lu\n",
-			   si->nats, NM_WOUT_THRESHOLD);
-		seq_printf(s, "  - SITs: %5d\n  - free_nids: %5d\n",
-			   si->sits, si->fnids);
+		seq_printf(s, "  - NATs: %9d\n  - SITs: %9d\n",
+			   si->nats, si->sits);
+		seq_printf(s, "  - free_nids: %9d\n",
+			   si->fnids);
 		seq_puts(s, "\nDistribution of User Blocks:");
 		seq_puts(s, "  [ valid | invalid | free ]\n");
 		seq_puts(s, "  [");
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 2b7c255bcbdf..972fd0ef230f 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode)
 							>> PAGE_CACHE_SHIFT;
 }
 
-static unsigned int dir_buckets(unsigned int level)
+static unsigned int dir_buckets(unsigned int level, int dir_level)
 {
 	if (level < MAX_DIR_HASH_DEPTH / 2)
-		return 1 << level;
+		return 1 << (level + dir_level);
 	else
-		return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
+		return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1);
 }
 
 static unsigned int bucket_blocks(unsigned int level)
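The new dir_level parameter widens the hash-table fan-out for large directories. As a worked example, assuming f2fs's MAX_DIR_HASH_DEPTH of 64: with dir_level = 2, level 0 gets 1 << (0 + 2) = 4 buckets instead of 1, level 1 gets 1 << 3 = 8, and the deep-level cap likewise shifts up by dir_level. A direct restatement of the shallow-level arithmetic (the constant 64 is an assumption written out for clarity):

	/* Sketch: same math as the shallow branch of dir_buckets() above */
	static unsigned int example_buckets(unsigned int level, int dir_level)
	{
		if (level < 64 / 2)			/* MAX_DIR_HASH_DEPTH / 2 */
			return 1 << (level + dir_level);	/* level 0, dir_level 2 -> 4 */
		return 0;	/* deep levels capped; see dir_buckets() */
	}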
@@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
 	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 }
 
-static unsigned long dir_block_index(unsigned int level, unsigned int idx)
+static unsigned long dir_block_index(unsigned int level,
+				int dir_level, unsigned int idx)
 {
 	unsigned long i;
 	unsigned long bidx = 0;
 
 	for (i = 0; i < level; i++)
-		bidx += dir_buckets(i) * bucket_blocks(i);
+		bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
 	bidx += idx * bucket_blocks(level);
 	return bidx;
 }
@@ -93,16 +94,21 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 			f2fs_hash_t namehash, struct page **res_page)
 {
 	struct f2fs_dir_entry *de;
-	unsigned long bit_pos, end_pos, next_pos;
+	unsigned long bit_pos = 0;
 	struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
-	int slots;
+	const void *dentry_bits = &dentry_blk->dentry_bitmap;
+	int max_len = 0;
 
-	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
-					NR_DENTRY_IN_BLOCK, 0);
 	while (bit_pos < NR_DENTRY_IN_BLOCK) {
+		if (!test_bit_le(bit_pos, dentry_bits)) {
+			if (bit_pos == 0)
+				max_len = 1;
+			else if (!test_bit_le(bit_pos - 1, dentry_bits))
+				max_len++;
+			bit_pos++;
+			continue;
+		}
 		de = &dentry_blk->dentry[bit_pos];
-		slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
-
 		if (early_match_name(name, namelen, namehash, de)) {
 			if (!memcmp(dentry_blk->filename[bit_pos],
 							name, namelen)) {
@@ -110,20 +116,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 				*res_page = dentry_page;
 				goto found;
 			}
 		}
-		next_pos = bit_pos + slots;
-		bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
-				NR_DENTRY_IN_BLOCK, next_pos);
-		if (bit_pos >= NR_DENTRY_IN_BLOCK)
-			end_pos = NR_DENTRY_IN_BLOCK;
-		else
-			end_pos = bit_pos;
-		if (*max_slots < end_pos - next_pos)
-			*max_slots = end_pos - next_pos;
+		if (max_len > *max_slots) {
+			*max_slots = max_len;
+			max_len = 0;
+		}
+		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
 	}
 
 	de = NULL;
 	kunmap(dentry_page);
 found:
+	if (max_len > *max_slots)
+		*max_slots = max_len;
 	return de;
 }
 
129 133
@@ -141,10 +145,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
141 145
142 f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); 146 f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
143 147
144 nbucket = dir_buckets(level); 148 nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
145 nblock = bucket_blocks(level); 149 nblock = bucket_blocks(level);
146 150
147 bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); 151 bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
152 le32_to_cpu(namehash) % nbucket);
148 end_block = bidx + nblock; 153 end_block = bidx + nblock;
149 154
150 for (; bidx < end_block; bidx++) { 155 for (; bidx < end_block; bidx++) {
@@ -248,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 		struct page *page, struct inode *inode)
 {
 	lock_page(page);
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 	de->ino = cpu_to_le32(inode->i_ino);
 	set_de_type(de, inode);
 	kunmap(page);
@@ -347,14 +352,11 @@ static struct page *init_inode_metadata(struct inode *inode,
 		err = f2fs_init_security(inode, dir, name, page);
 		if (err)
 			goto put_error;
-
-		wait_on_page_writeback(page);
 	} else {
 		page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
 		if (IS_ERR(page))
 			return page;
 
-		wait_on_page_writeback(page);
 		set_cold_node(inode, page);
 	}
 
@@ -372,6 +374,10 @@ static struct page *init_inode_metadata(struct inode *inode,
 
 put_error:
 	f2fs_put_page(page, 1);
+	/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
+	truncate_inode_pages(&inode->i_data, 0);
+	truncate_blocks(inode, 0);
+	remove_dirty_dir_inode(inode);
 error:
 	remove_inode_page(inode);
 	return ERR_PTR(err);
@@ -395,9 +401,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
 		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	}
 
-	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
-		update_inode_page(dir);
-
 	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
 		clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 }
@@ -464,10 +467,11 @@ start:
 	if (level == current_depth)
 		++current_depth;
 
-	nbucket = dir_buckets(level);
+	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
 	nblock = bucket_blocks(level);
 
-	bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
+	bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+				(le32_to_cpu(dentry_hash) % nbucket));
 
 	for (block = bidx; block <= (bidx + nblock - 1); block++) {
 		dentry_page = get_new_data_page(dir, NULL, block, true);
@@ -487,8 +491,9 @@ start:
 	++level;
 	goto start;
add_dentry:
-	wait_on_page_writeback(dentry_page);
+	f2fs_wait_on_page_writeback(dentry_page, DATA);
 
+	down_write(&F2FS_I(inode)->i_sem);
 	page = init_inode_metadata(inode, dir, name);
 	if (IS_ERR(page)) {
 		err = PTR_ERR(page);
@@ -511,7 +516,12 @@ add_dentry:
 
 	update_parent_metadata(dir, inode, current_depth);
 fail:
-	clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	up_write(&F2FS_I(inode)->i_sem);
+
+	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
+		update_inode_page(dir);
+		clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	}
 	kunmap(dentry_page);
 	f2fs_put_page(dentry_page, 1);
 	return err;
@@ -528,13 +538,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	unsigned int bit_pos;
 	struct address_space *mapping = page->mapping;
 	struct inode *dir = mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
 	void *kaddr = page_address(page);
 	int i;
 
 	lock_page(page);
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 
 	dentry_blk = (struct f2fs_dentry_block *)kaddr;
 	bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
@@ -551,6 +560,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 
 	if (inode) {
+		struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+
+		down_write(&F2FS_I(inode)->i_sem);
+
 		if (S_ISDIR(inode->i_mode)) {
 			drop_nlink(dir);
 			update_inode_page(dir);
@@ -561,6 +574,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 			drop_nlink(inode);
 			i_size_write(inode, 0);
 		}
+		up_write(&F2FS_I(inode)->i_sem);
 		update_inode_page(inode);
 
 		if (inode->i_nlink == 0)
@@ -573,7 +587,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 		truncate_hole(dir, page->index, page->index + 1);
 		clear_page_dirty_for_io(page);
 		ClearPageUptodate(page);
-		dec_page_count(sbi, F2FS_DIRTY_DENTS);
 		inode_dec_dirty_dents(dir);
 	}
 	f2fs_put_page(page, 1);
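dir.c also starts bracketing link-count and size updates with the inode's new i_sem (declared in the f2fs.h hunks below), so entry insertion and deletion serialize per-inode metadata changes against each other. A minimal sketch of the discipline (example_update() is hypothetical):

	static void example_update(struct f2fs_inode_info *fi)
	{
		down_write(&fi->i_sem);
		/* ... drop_nlink()/i_size_write() style updates ... */
		up_write(&fi->i_sem);
	}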
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index fc3c558cb4f3..2ecac8312359 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -40,6 +40,7 @@
 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY	0x00000040
 #define F2FS_MOUNT_INLINE_XATTR		0x00000080
 #define F2FS_MOUNT_INLINE_DATA		0x00000100
+#define F2FS_MOUNT_FLUSH_MERGE		0x00000200
 
 #define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -88,6 +89,16 @@ enum {
 	SIT_BITMAP
 };
 
+/*
+ * For CP/NAT/SIT/SSA readahead
+ */
+enum {
+	META_CP,
+	META_NAT,
+	META_SIT,
+	META_SSA
+};
+
 /* for the list of orphan inodes */
 struct orphan_inode_entry {
 	struct list_head list;	/* list head */
@@ -187,16 +198,20 @@ struct extent_info {
 #define FADVISE_COLD_BIT	0x01
 #define FADVISE_LOST_PINO_BIT	0x02
 
+#define DEF_DIR_LEVEL		0
+
 struct f2fs_inode_info {
 	struct inode vfs_inode;		/* serve a vfs inode */
 	unsigned long i_flags;		/* keep an inode flags for ioctl */
 	unsigned char i_advise;		/* use to give file attribute hints */
+	unsigned char i_dir_level;	/* use for dentry level for large dir */
 	unsigned int i_current_depth;	/* use only in directory structure */
 	unsigned int i_pino;		/* parent inode number */
 	umode_t i_acl_mode;		/* keep file acl mode temporarily */
 
 	/* Use below internally in f2fs*/
 	unsigned long flags;		/* use to pass per-file flags */
+	struct rw_semaphore i_sem;	/* protect fi info */
 	atomic_t dirty_dents;		/* # of dirty dentry pages */
 	f2fs_hash_t chash;		/* hash value of given file name */
 	unsigned int clevel;		/* maximum level of given file name */
@@ -229,6 +244,7 @@ struct f2fs_nm_info {
 	block_t nat_blkaddr;		/* base disk address of NAT */
 	nid_t max_nid;			/* maximum possible node ids */
 	nid_t next_scan_nid;		/* the next nid to be scanned */
+	unsigned int ram_thresh;	/* control the memory footprint */
 
 	/* NAT cache management */
 	struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -238,6 +254,7 @@ struct f2fs_nm_info {
238 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ 254 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
239 255
240 /* free node ids management */ 256 /* free node ids management */
257 struct radix_tree_root free_nid_root;/* root of the free_nid cache */
241 struct list_head free_nid_list; /* a list for free nids */ 258 struct list_head free_nid_list; /* a list for free nids */
242 spinlock_t free_nid_list_lock; /* protect free nid list */ 259 spinlock_t free_nid_list_lock; /* protect free nid list */
243 unsigned int fcnt; /* the number of free node id */ 260 unsigned int fcnt; /* the number of free node id */
@@ -300,6 +317,12 @@ enum {
300 NO_CHECK_TYPE 317 NO_CHECK_TYPE
301}; 318};
302 319
320struct flush_cmd {
321 struct flush_cmd *next;
322 struct completion wait;
323 int ret;
324};
325
303struct f2fs_sm_info { 326struct f2fs_sm_info {
304 struct sit_info *sit_info; /* whole segment information */ 327 struct sit_info *sit_info; /* whole segment information */
305 struct free_segmap_info *free_info; /* free segment information */ 328 struct free_segmap_info *free_info; /* free segment information */
@@ -328,6 +351,14 @@ struct f2fs_sm_info {
328 351
329 unsigned int ipu_policy; /* in-place-update policy */ 352 unsigned int ipu_policy; /* in-place-update policy */
330 unsigned int min_ipu_util; /* in-place-update threshold */ 353 unsigned int min_ipu_util; /* in-place-update threshold */
354
355 /* for flush command control */
356 struct task_struct *f2fs_issue_flush; /* flush thread */
357 wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
358 struct flush_cmd *issue_list; /* list for command issue */
359 struct flush_cmd *dispatch_list; /* list for command dispatch */
360 spinlock_t issue_lock; /* for issue list lock */
361 struct flush_cmd *issue_tail; /* list tail of issue list */
331}; 362};
332 363
333/* 364/*
@@ -378,7 +409,7 @@ struct f2fs_bio_info {
378 struct bio *bio; /* bios to merge */ 409 struct bio *bio; /* bios to merge */
379 sector_t last_block_in_bio; /* last block number */ 410 sector_t last_block_in_bio; /* last block number */
380 struct f2fs_io_info fio; /* store buffered io info. */ 411 struct f2fs_io_info fio; /* store buffered io info. */
381 struct mutex io_mutex; /* mutex for bio */ 412 struct rw_semaphore io_rwsem; /* blocking op for bio */
382}; 413};
383 414
384struct f2fs_sb_info { 415struct f2fs_sb_info {
@@ -398,6 +429,7 @@ struct f2fs_sb_info {
398 /* for bio operations */ 429 /* for bio operations */
399 struct f2fs_bio_info read_io; /* for read bios */ 430 struct f2fs_bio_info read_io; /* for read bios */
400 struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ 431 struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
432 struct completion *wait_io; /* for completion bios */
401 433
402 /* for checkpoint */ 434 /* for checkpoint */
403 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ 435 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
@@ -407,7 +439,6 @@ struct f2fs_sb_info {
407 struct mutex node_write; /* locking node writes */ 439 struct mutex node_write; /* locking node writes */
408 struct mutex writepages; /* mutex for writepages() */ 440 struct mutex writepages; /* mutex for writepages() */
409 bool por_doing; /* recovery is doing or not */ 441 bool por_doing; /* recovery is doing or not */
410 bool on_build_free_nids; /* build_free_nids is doing */
411 wait_queue_head_t cp_wait; 442 wait_queue_head_t cp_wait;
412 443
413 /* for orphan inode management */ 444 /* for orphan inode management */
@@ -436,6 +467,7 @@ struct f2fs_sb_info {
436 unsigned int total_valid_node_count; /* valid node block count */ 467 unsigned int total_valid_node_count; /* valid node block count */
437 unsigned int total_valid_inode_count; /* valid inode count */ 468 unsigned int total_valid_inode_count; /* valid inode count */
438 int active_logs; /* # of active logs */ 469 int active_logs; /* # of active logs */
470 int dir_level; /* directory level */
439 471
440 block_t user_block_count; /* # of user blocks */ 472 block_t user_block_count; /* # of user blocks */
441 block_t total_valid_block_count; /* # of valid blocks */ 473 block_t total_valid_block_count; /* # of valid blocks */
@@ -622,6 +654,11 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode)
622 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; 654 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
623} 655}
624 656
657static inline bool f2fs_has_xattr_block(unsigned int ofs)
658{
659 return ofs == XATTR_NODE_OFFSET;
660}
661
625static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, 662static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
626 struct inode *inode, blkcnt_t count) 663 struct inode *inode, blkcnt_t count)
627{ 664{
@@ -661,6 +698,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
661 698
662static inline void inode_inc_dirty_dents(struct inode *inode) 699static inline void inode_inc_dirty_dents(struct inode *inode)
663{ 700{
701 inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
664 atomic_inc(&F2FS_I(inode)->dirty_dents); 702 atomic_inc(&F2FS_I(inode)->dirty_dents);
665} 703}
666 704
@@ -671,6 +709,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
671 709
672static inline void inode_dec_dirty_dents(struct inode *inode) 710static inline void inode_dec_dirty_dents(struct inode *inode)
673{ 711{
712 if (!S_ISDIR(inode->i_mode))
713 return;
714
715 dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
674 atomic_dec(&F2FS_I(inode)->dirty_dents); 716 atomic_dec(&F2FS_I(inode)->dirty_dents);
675} 717}
676 718
@@ -679,6 +721,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
679 return atomic_read(&sbi->nr_pages[count_type]); 721 return atomic_read(&sbi->nr_pages[count_type]);
680} 722}
681 723
724static inline int get_dirty_dents(struct inode *inode)
725{
726 return atomic_read(&F2FS_I(inode)->dirty_dents);
727}
728
682static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) 729static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
683{ 730{
684 unsigned int pages_per_sec = sbi->segs_per_sec * 731 unsigned int pages_per_sec = sbi->segs_per_sec *
@@ -689,11 +736,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
689 736
690static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) 737static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
691{ 738{
692 block_t ret; 739 return sbi->total_valid_block_count;
693 spin_lock(&sbi->stat_lock);
694 ret = sbi->total_valid_block_count;
695 spin_unlock(&sbi->stat_lock);
696 return ret;
697} 740}
698 741
699static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) 742static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
@@ -789,11 +832,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
789 832
790static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) 833static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
791{ 834{
792 unsigned int ret; 835 return sbi->total_valid_node_count;
793 spin_lock(&sbi->stat_lock);
794 ret = sbi->total_valid_node_count;
795 spin_unlock(&sbi->stat_lock);
796 return ret;
797} 836}
798 837
799static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) 838static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
@@ -814,11 +853,7 @@ static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
814 853
815static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) 854static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
816{ 855{
817 unsigned int ret; 856 return sbi->total_valid_inode_count;
818 spin_lock(&sbi->stat_lock);
819 ret = sbi->total_valid_inode_count;
820 spin_unlock(&sbi->stat_lock);
821 return ret;
822} 857}
823 858
824static inline void f2fs_put_page(struct page *page, int unlock) 859static inline void f2fs_put_page(struct page *page, int unlock)
@@ -844,9 +879,9 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn)
844} 879}
845 880
846static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, 881static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
847 size_t size, void (*ctor)(void *)) 882 size_t size)
848{ 883{
849 return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); 884 return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL);
850} 885}
851 886
852static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, 887static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
@@ -983,24 +1018,28 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
983 ri->i_inline |= F2FS_INLINE_DATA; 1018 ri->i_inline |= F2FS_INLINE_DATA;
984} 1019}
985 1020
1021static inline int f2fs_has_inline_xattr(struct inode *inode)
1022{
1023 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
1024}
1025
986static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) 1026static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
987{ 1027{
988 if (is_inode_flag_set(fi, FI_INLINE_XATTR)) 1028 if (f2fs_has_inline_xattr(&fi->vfs_inode))
989 return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; 1029 return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
990 return DEF_ADDRS_PER_INODE; 1030 return DEF_ADDRS_PER_INODE;
991} 1031}
992 1032
993static inline void *inline_xattr_addr(struct page *page) 1033static inline void *inline_xattr_addr(struct page *page)
994{ 1034{
995 struct f2fs_inode *ri; 1035 struct f2fs_inode *ri = F2FS_INODE(page);
996 ri = (struct f2fs_inode *)page_address(page);
997 return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - 1036 return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
998 F2FS_INLINE_XATTR_ADDRS]); 1037 F2FS_INLINE_XATTR_ADDRS]);
999} 1038}
1000 1039
1001static inline int inline_xattr_size(struct inode *inode) 1040static inline int inline_xattr_size(struct inode *inode)
1002{ 1041{
1003 if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) 1042 if (f2fs_has_inline_xattr(inode))
1004 return F2FS_INLINE_XATTR_ADDRS << 2; 1043 return F2FS_INLINE_XATTR_ADDRS << 2;
1005 else 1044 else
1006 return 0; 1045 return 0;
@@ -1013,8 +1052,7 @@ static inline int f2fs_has_inline_data(struct inode *inode)
1013 1052
1014static inline void *inline_data_addr(struct page *page) 1053static inline void *inline_data_addr(struct page *page)
1015{ 1054{
1016 struct f2fs_inode *ri; 1055 struct f2fs_inode *ri = F2FS_INODE(page);
1017 ri = (struct f2fs_inode *)page_address(page);
1018 return (void *)&(ri->i_addr[1]); 1056 return (void *)&(ri->i_addr[1]);
1019} 1057}
1020 1058
@@ -1023,6 +1061,12 @@ static inline int f2fs_readonly(struct super_block *sb)
1023 return sb->s_flags & MS_RDONLY; 1061 return sb->s_flags & MS_RDONLY;
1024} 1062}
1025 1063
1064static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
1065{
1066 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
1067 sbi->sb->s_flags |= MS_RDONLY;
1068}
1069
1026#define get_inode_mode(i) \ 1070#define get_inode_mode(i) \
1027 ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ 1071 ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
1028 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) 1072 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -1048,7 +1092,7 @@ void f2fs_set_inode_flags(struct inode *);
1048struct inode *f2fs_iget(struct super_block *, unsigned long); 1092struct inode *f2fs_iget(struct super_block *, unsigned long);
1049int try_to_free_nats(struct f2fs_sb_info *, int); 1093int try_to_free_nats(struct f2fs_sb_info *, int);
1050void update_inode(struct inode *, struct page *); 1094void update_inode(struct inode *, struct page *);
1051int update_inode_page(struct inode *); 1095void update_inode_page(struct inode *);
1052int f2fs_write_inode(struct inode *, struct writeback_control *); 1096int f2fs_write_inode(struct inode *, struct writeback_control *);
1053void f2fs_evict_inode(struct inode *); 1097void f2fs_evict_inode(struct inode *);
1054 1098
@@ -1097,6 +1141,7 @@ struct dnode_of_data;
1097struct node_info; 1141struct node_info;
1098 1142
1099int is_checkpointed_node(struct f2fs_sb_info *, nid_t); 1143int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
1144bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
1100void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); 1145void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
1101int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 1146int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1102int truncate_inode_blocks(struct inode *, pgoff_t); 1147int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1115,6 +1160,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t);
1115void alloc_nid_failed(struct f2fs_sb_info *, nid_t); 1160void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
1116void recover_node_page(struct f2fs_sb_info *, struct page *, 1161void recover_node_page(struct f2fs_sb_info *, struct page *,
1117 struct f2fs_summary *, struct node_info *, block_t); 1162 struct f2fs_summary *, struct node_info *, block_t);
1163bool recover_xattr_data(struct inode *, struct page *, block_t);
1118int recover_inode_page(struct f2fs_sb_info *, struct page *); 1164int recover_inode_page(struct f2fs_sb_info *, struct page *);
1119int restore_node_summary(struct f2fs_sb_info *, unsigned int, 1165int restore_node_summary(struct f2fs_sb_info *, unsigned int,
1120 struct f2fs_summary_block *); 1166 struct f2fs_summary_block *);
@@ -1129,7 +1175,9 @@ void destroy_node_manager_caches(void);
1129 */ 1175 */
1130void f2fs_balance_fs(struct f2fs_sb_info *); 1176void f2fs_balance_fs(struct f2fs_sb_info *);
1131void f2fs_balance_fs_bg(struct f2fs_sb_info *); 1177void f2fs_balance_fs_bg(struct f2fs_sb_info *);
1178int f2fs_issue_flush(struct f2fs_sb_info *);
1132void invalidate_blocks(struct f2fs_sb_info *, block_t); 1179void invalidate_blocks(struct f2fs_sb_info *, block_t);
1180void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
1133void clear_prefree_segments(struct f2fs_sb_info *); 1181void clear_prefree_segments(struct f2fs_sb_info *);
1134int npages_for_summary_flush(struct f2fs_sb_info *); 1182int npages_for_summary_flush(struct f2fs_sb_info *);
1135void allocate_new_segments(struct f2fs_sb_info *); 1183void allocate_new_segments(struct f2fs_sb_info *);
@@ -1162,6 +1210,7 @@ void destroy_segment_manager_caches(void);
1162 */ 1210 */
1163struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); 1211struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
1164struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); 1212struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
1213int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
1165long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); 1214long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
1166int acquire_orphan_inode(struct f2fs_sb_info *); 1215int acquire_orphan_inode(struct f2fs_sb_info *);
1167void release_orphan_inode(struct f2fs_sb_info *); 1216void release_orphan_inode(struct f2fs_sb_info *);
@@ -1231,7 +1280,7 @@ struct f2fs_stat_info {
1231 int util_free, util_valid, util_invalid; 1280 int util_free, util_valid, util_invalid;
1232 int rsvd_segs, overp_segs; 1281 int rsvd_segs, overp_segs;
1233 int dirty_count, node_pages, meta_pages; 1282 int dirty_count, node_pages, meta_pages;
1234 int prefree_count, call_count; 1283 int prefree_count, call_count, cp_count;
1235 int tot_segs, node_segs, data_segs, free_segs, free_secs; 1284 int tot_segs, node_segs, data_segs, free_segs, free_secs;
1236 int tot_blks, data_blks, node_blks; 1285 int tot_blks, data_blks, node_blks;
1237 int curseg[NR_CURSEG_TYPE]; 1286 int curseg[NR_CURSEG_TYPE];
@@ -1248,6 +1297,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1248 return (struct f2fs_stat_info *)sbi->stat_info; 1297 return (struct f2fs_stat_info *)sbi->stat_info;
1249} 1298}
1250 1299
1300#define stat_inc_cp_count(si) ((si)->cp_count++)
1251#define stat_inc_call_count(si) ((si)->call_count++) 1301#define stat_inc_call_count(si) ((si)->call_count++)
1252#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) 1302#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
1253#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) 1303#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++)
@@ -1302,6 +1352,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *);
1302void __init f2fs_create_root_stats(void); 1352void __init f2fs_create_root_stats(void);
1303void f2fs_destroy_root_stats(void); 1353void f2fs_destroy_root_stats(void);
1304#else 1354#else
1355#define stat_inc_cp_count(si)
1305#define stat_inc_call_count(si) 1356#define stat_inc_call_count(si)
1306#define stat_inc_bggc_count(si) 1357#define stat_inc_bggc_count(si)
1307#define stat_inc_dirty_dir(sbi) 1358#define stat_inc_dirty_dir(sbi)
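The flush-merge fields added to struct f2fs_sm_info above form a small producer/consumer pair: each caller queues a flush_cmd on issue_list under issue_lock, a dedicated kthread detaches the whole chain, issues a single cache flush, and completes every waiter with the shared result. A rough sketch of the submit side follows; the real body belongs to fs/f2fs/segment.c, which is not part of this excerpt, f2fs_issue_flush_sketch is a placeholder name, and SM_I()/test_opt() are the usual f2fs.h helpers:

int f2fs_issue_flush_sketch(struct f2fs_sb_info *sbi)
{
	struct f2fs_sm_info *sm_i = SM_I(sbi);
	struct flush_cmd cmd;

	/* without flush merge, fall back to a plain device flush */
	if (!test_opt(sbi, FLUSH_MERGE))
		return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);

	init_completion(&cmd.wait);
	cmd.next = NULL;

	/* append this command to the shared issue list */
	spin_lock(&sm_i->issue_lock);
	if (sm_i->issue_list)
		sm_i->issue_tail->next = &cmd;
	else
		sm_i->issue_list = &cmd;
	sm_i->issue_tail = &cmd;
	spin_unlock(&sm_i->issue_lock);

	/* kick the flush thread unless it is already dispatching */
	if (!sm_i->dispatch_list)
		wake_up(&sm_i->flush_wait_queue);

	/* one device flush completes many queued waiters */
	wait_for_completion(&cmd.wait);
	return cmd.ret;
}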
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0dfcef53a6ed..60e7d5448a1d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -76,7 +76,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	trace_f2fs_vm_page_mkwrite(page, DATA);
 mapped:
 	/* fill the page */
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(err);
@@ -84,6 +84,7 @@ out:
 
 static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= f2fs_vm_page_mkwrite,
 	.remap_pages	= generic_file_remap_pages,
 };
@@ -111,11 +112,12 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	int ret = 0;
 	bool need_cp = false;
 	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_NONE,
+		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
 		.for_reclaim = 0,
 	};
@@ -133,7 +135,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	/* guarantee free sections for fsync */
 	f2fs_balance_fs(sbi);
 
-	mutex_lock(&inode->i_mutex);
+	down_read(&fi->i_sem);
 
 	/*
 	 * Both of fdatasync() and fsync() are able to be recovered from
@@ -150,25 +152,33 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
 		need_cp = true;
 
+	up_read(&fi->i_sem);
+
 	if (need_cp) {
 		nid_t pino;
 
-		F2FS_I(inode)->xattr_ver = 0;
-
 		/* all the dirty node pages should be flushed for POR */
 		ret = f2fs_sync_fs(inode->i_sb, 1);
+
+		down_write(&fi->i_sem);
+		F2FS_I(inode)->xattr_ver = 0;
 		if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
 					get_parent_ino(inode, &pino)) {
 			F2FS_I(inode)->i_pino = pino;
 			file_got_pino(inode);
+			up_write(&fi->i_sem);
 			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
 				goto out;
+		} else {
+			up_write(&fi->i_sem);
 		}
 	} else {
 		/* if there is no written node page, write its inode page */
 		while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+			if (fsync_mark_done(sbi, inode->i_ino))
+				goto out;
 			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
@@ -177,10 +187,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
 		if (ret)
 			goto out;
-		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+		ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
 	}
 out:
-	mutex_unlock(&inode->i_mutex);
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
 	return ret;
 }
@@ -245,7 +254,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
 		f2fs_put_page(page, 1);
 		return;
 	}
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
 	set_page_dirty(page);
 	f2fs_put_page(page, 1);
@@ -422,7 +431,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
 	f2fs_unlock_op(sbi);
 
 	if (!IS_ERR(page)) {
-		wait_on_page_writeback(page);
+		f2fs_wait_on_page_writeback(page, DATA);
 		zero_user(page, start, len);
 		set_page_dirty(page);
 		f2fs_put_page(page, 1);
@@ -560,6 +569,8 @@ static long f2fs_fallocate(struct file *file, int mode,
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
+	mutex_lock(&inode->i_mutex);
+
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		ret = punch_hole(inode, offset, len);
 	else
@@ -569,6 +580,9 @@ static long f2fs_fallocate(struct file *file, int mode,
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
 	}
+
+	mutex_unlock(&inode->i_mutex);
+
 	trace_f2fs_fallocate(inode, mode, offset, len, ret);
 	return ret;
 }
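With the change above, f2fs_sync_file() ends in f2fs_issue_flush() instead of calling blkdev_issue_flush() directly, so concurrent fsync() callers can share one cache flush. The consumer is the kthread stored in the f2fs_issue_flush field of struct f2fs_sm_info; its loop looks roughly like the sketch below, under the same assumptions as the submit-side sketch after the f2fs.h diff (issue_flush_thread_sketch is a placeholder name):

static int issue_flush_thread_sketch(void *data)
{
	struct f2fs_sb_info *sbi = data;
	struct f2fs_sm_info *sm_i = SM_I(sbi);
	wait_queue_head_t *q = &sm_i->flush_wait_queue;
repeat:
	if (kthread_should_stop())
		return 0;

	/* detach the whole pending chain in one shot */
	spin_lock(&sm_i->issue_lock);
	if (sm_i->issue_list) {
		sm_i->dispatch_list = sm_i->issue_list;
		sm_i->issue_list = sm_i->issue_tail = NULL;
	}
	spin_unlock(&sm_i->issue_lock);

	if (sm_i->dispatch_list) {
		struct bio *bio = bio_alloc(GFP_NOIO, 0);
		struct flush_cmd *cmd, *next;
		int ret;

		/* one empty WRITE_FLUSH bio flushes the device cache */
		bio->bi_bdev = sbi->sb->s_bdev;
		ret = submit_bio_wait(WRITE_FLUSH, bio);

		/* hand the shared result to every queued fsync() */
		for (cmd = sm_i->dispatch_list; cmd; cmd = next) {
			next = cmd->next;
			cmd->ret = ret;
			complete(&cmd->wait);
		}
		bio_put(bio);
		sm_i->dispatch_list = NULL;
	}

	wait_event_interruptible(*q,
			kthread_should_stop() || sm_i->issue_list);
	goto repeat;
}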
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ea0371e854b4..b90dbe55403a 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -531,15 +531,10 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
 		set_page_dirty(page);
 		set_cold_data(page);
 	} else {
-		struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
 		f2fs_wait_on_page_writeback(page, DATA);
 
-		if (clear_page_dirty_for_io(page) &&
-			S_ISDIR(inode->i_mode)) {
-			dec_page_count(sbi, F2FS_DIRTY_DENTS);
+		if (clear_page_dirty_for_io(page))
 			inode_dec_dirty_dents(inode);
-		}
 		set_cold_data(page);
 		do_write_data_page(page, &fio);
 		clear_cold_data(page);
@@ -701,6 +696,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
 gc_more:
 	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
 		goto stop;
+	if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+		goto stop;
 
 	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
 		gc_type = FG_GC;
@@ -711,6 +708,11 @@ gc_more:
 		goto stop;
 	ret = 0;
 
+	/* readahead multi ssa blocks those have contiguous address */
+	if (sbi->segs_per_sec > 1)
+		ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
+								META_SSA);
+
 	for (i = 0; i < sbi->segs_per_sec; i++)
 		do_garbage_collect(sbi, segno + i, &ilist, gc_type);
 
@@ -740,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
 int __init create_gc_caches(void)
 {
 	winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
-			sizeof(struct inode_entry), NULL);
+			sizeof(struct inode_entry));
 	if (!winode_slab)
 		return -ENOMEM;
 	return 0;
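The new META_SSA readahead in f2fs_gc() pulls the summary blocks of a whole section in with one merged bio before do_garbage_collect() touches them one by one. ra_meta_pages() itself is added in fs/f2fs/checkpoint.c, which is outside this excerpt; assuming only the helpers visible in this diff, its shape is roughly as follows (ra_meta_pages_sketch is a placeholder, and the per-type CP/NAT/SIT/SSA block-address translation is elided):

int ra_meta_pages_sketch(struct f2fs_sb_info *sbi, int start, int nrpages,
								int type)
{
	struct page *page;
	int blkno;
	struct f2fs_io_info fio = {
		.type = META,
		.rw = READ_SYNC | REQ_META | REQ_PRIO
	};

	for (blkno = start; blkno < start + nrpages; blkno++) {
		/* META_CP/META_NAT/META_SIT/META_SSA pick the address here */
		page = grab_cache_page(META_MAPPING(sbi), blkno);
		if (!page)
			continue;
		if (PageUptodate(page)) {
			f2fs_put_page(page, 1);
			continue;
		}
		f2fs_submit_page_mbio(sbi, page, blkno, &fio);
		f2fs_put_page(page, 0);
	}
	f2fs_submit_merged_bio(sbi, META, READ);	/* flush the merged bio */
	return blkno - start;
}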
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 31ee5b164ff9..383db1fabcf4 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -45,8 +45,10 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
 	}
 
 	ipage = get_node_page(sbi, inode->i_ino);
-	if (IS_ERR(ipage))
+	if (IS_ERR(ipage)) {
+		unlock_page(page);
 		return PTR_ERR(ipage);
+	}
 
 	zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
 
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 4d67ed736dca..ee829d360468 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -107,6 +107,7 @@ static int do_read_inode(struct inode *inode)
 	fi->flags = 0;
 	fi->i_advise = ri->i_advise;
 	fi->i_pino = le32_to_cpu(ri->i_pino);
+	fi->i_dir_level = ri->i_dir_level;
 
 	get_extent_info(&fi->ext, ri->i_ext);
 	get_inline_info(fi, ri);
@@ -204,6 +205,7 @@ void update_inode(struct inode *inode, struct page *node_page)
 	ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
 	ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
 	ri->i_generation = cpu_to_le32(inode->i_generation);
+	ri->i_dir_level = F2FS_I(inode)->i_dir_level;
 
 	__set_inode_rdev(inode, ri);
 	set_cold_node(inode, node_page);
@@ -212,24 +214,29 @@ void update_inode(struct inode *inode, struct page *node_page)
 	clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
 }
 
-int update_inode_page(struct inode *inode)
+void update_inode_page(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *node_page;
-
+retry:
 	node_page = get_node_page(sbi, inode->i_ino);
-	if (IS_ERR(node_page))
-		return PTR_ERR(node_page);
-
+	if (IS_ERR(node_page)) {
+		int err = PTR_ERR(node_page);
+		if (err == -ENOMEM) {
+			cond_resched();
+			goto retry;
+		} else if (err != -ENOENT) {
+			f2fs_stop_checkpoint(sbi);
+		}
+		return;
+	}
 	update_inode(inode, node_page);
 	f2fs_put_page(node_page, 1);
-	return 0;
 }
 
 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	int ret;
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
 			inode->i_ino == F2FS_META_INO(sbi))
@@ -243,13 +250,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	 * during the urgent cleaning time when runing out of free sections.
 	 */
 	f2fs_lock_op(sbi);
-	ret = update_inode_page(inode);
+	update_inode_page(inode);
 	f2fs_unlock_op(sbi);
 
 	if (wbc)
 		f2fs_balance_fs(sbi);
 
-	return ret;
+	return 0;
 }
 
 /*
@@ -260,13 +267,13 @@ void f2fs_evict_inode(struct inode *inode)
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	trace_f2fs_evict_inode(inode);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
 			inode->i_ino == F2FS_META_INO(sbi))
 		goto no_delete;
 
-	f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents));
+	f2fs_bug_on(get_dirty_dents(inode));
 	remove_dirty_dir_inode(inode);
 
 	if (inode->i_nlink || is_bad_inode(inode))
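update_inode_page() above splits get_node_page() failures into a transient case and fatal ones: -ENOMEM just yields the CPU and retries, -ENOENT (the inode is already gone) is ignored, and anything else stops checkpointing via f2fs_stop_checkpoint(). The retry idiom in isolation, as a hypothetical helper:

/*
 * Hypothetical helper: retry only the transient allocation failure,
 * hand every other result (page or error) straight back to the caller.
 */
static struct page *get_node_page_nofail(struct f2fs_sb_info *sbi, nid_t ino)
{
	struct page *page;

	for (;;) {
		page = get_node_page(sbi, ino);
		if (!IS_ERR(page) || PTR_ERR(page) != -ENOMEM)
			return page;
		cond_resched();	/* give reclaim a chance, then retry */
	}
}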
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 397d459e97bf..a9409d19dfd4 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -207,6 +207,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = f2fs_iget(dir->i_sb, ino);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
+
+		stat_inc_inline_inode(inode);
 	}
 
 	return d_splice_alias(inode, dentry);
@@ -424,12 +426,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 
 	f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+	down_write(&F2FS_I(old_inode)->i_sem);
 	F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+	up_write(&F2FS_I(old_inode)->i_sem);
 
 	new_inode->i_ctime = CURRENT_TIME;
+	down_write(&F2FS_I(new_inode)->i_sem);
 	if (old_dir_entry)
 		drop_nlink(new_inode);
 	drop_nlink(new_inode);
+	up_write(&F2FS_I(new_inode)->i_sem);
+
 	mark_inode_dirty(new_inode);
 
 	if (!new_inode->i_nlink)
@@ -459,7 +466,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_dir != new_dir) {
 		f2fs_set_link(old_inode, old_dir_entry,
 					old_dir_page, new_dir);
+		down_write(&F2FS_I(old_inode)->i_sem);
 		F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+		up_write(&F2FS_I(old_inode)->i_sem);
 		update_inode_page(old_inode);
 	} else {
 		kunmap(old_dir_page);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b0649b76eb4f..a161e955c4c8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -21,9 +21,27 @@
 #include "segment.h"
 #include <trace/events/f2fs.h>
 
+#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
+
 static struct kmem_cache *nat_entry_slab;
 static struct kmem_cache *free_nid_slab;
 
+static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type)
+{
+	struct sysinfo val;
+	unsigned long mem_size = 0;
+
+	si_meminfo(&val);
+	if (type == FREE_NIDS)
+		mem_size = nm_i->fcnt * sizeof(struct free_nid);
+	else if (type == NAT_ENTRIES)
+		mem_size += nm_i->nat_cnt * sizeof(struct nat_entry);
+	mem_size >>= 12;
+
+	/* give 50:50 memory for free nids and nat caches respectively */
+	return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11));
+}
+
 static void clear_node_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
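To put numbers on available_free_memory(): mem_size is the cache footprint converted to 4 KiB pages (bytes >> 12), and the bound is (val.totalram * nm_i->ram_thresh) >> 11 pages. On a machine with 4 GiB of RAM, val.totalram is 1,048,576 pages; taking ram_thresh = 10 as an example value, the bound works out to (1,048,576 * 10) >> 11 = 5,120 pages, i.e. 20 MiB for the free-nid cache and another 20 MiB for the NAT cache, which is the 50:50 split the comment refers to.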
@@ -82,42 +100,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
 	return dst_page;
 }
 
-/*
- * Readahead NAT pages
- */
-static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
-{
-	struct address_space *mapping = META_MAPPING(sbi);
-	struct f2fs_nm_info *nm_i = NM_I(sbi);
-	struct page *page;
-	pgoff_t index;
-	int i;
-	struct f2fs_io_info fio = {
-		.type = META,
-		.rw = READ_SYNC | REQ_META | REQ_PRIO
-	};
-
-
-	for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
-		if (unlikely(nid >= nm_i->max_nid))
-			nid = 0;
-		index = current_nat_addr(sbi, nid);
-
-		page = grab_cache_page(mapping, index);
-		if (!page)
-			continue;
-		if (PageUptodate(page)) {
-			mark_page_accessed(page);
-			f2fs_put_page(page, 1);
-			continue;
-		}
-		f2fs_submit_page_mbio(sbi, page, index, &fio);
-		mark_page_accessed(page);
-		f2fs_put_page(page, 0);
-	}
-	f2fs_submit_merged_bio(sbi, META, READ);
-}
-
 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
 {
 	return radix_tree_lookup(&nm_i->nat_root, n);
@@ -151,6 +133,20 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
 	return is_cp;
 }
 
+bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct nat_entry *e;
+	bool fsync_done = false;
+
+	read_lock(&nm_i->nat_tree_lock);
+	e = __lookup_nat_cache(nm_i, nid);
+	if (e)
+		fsync_done = e->fsync_done;
+	read_unlock(&nm_i->nat_tree_lock);
+	return fsync_done;
+}
+
 static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
 {
 	struct nat_entry *new;
@@ -164,6 +160,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
 	}
 	memset(new, 0, sizeof(struct nat_entry));
 	nat_set_nid(new, nid);
+	new->checkpointed = true;
 	list_add_tail(&new->list, &nm_i->nat_entries);
 	nm_i->nat_cnt++;
 	return new;
@@ -185,13 +182,12 @@ retry:
 		nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
 		nat_set_ino(e, le32_to_cpu(ne->ino));
 		nat_set_version(e, ne->version);
-		e->checkpointed = true;
 	}
 	write_unlock(&nm_i->nat_tree_lock);
 }
 
 static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
-			block_t new_blkaddr)
+			block_t new_blkaddr, bool fsync_done)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct nat_entry *e;
@@ -205,7 +201,6 @@ retry:
 			goto retry;
 		}
 		e->ni = *ni;
-		e->checkpointed = true;
 		f2fs_bug_on(ni->blk_addr == NEW_ADDR);
 	} else if (new_blkaddr == NEW_ADDR) {
 		/*
@@ -217,9 +212,6 @@ retry:
 		f2fs_bug_on(ni->blk_addr != NULL_ADDR);
 	}
 
-	if (new_blkaddr == NEW_ADDR)
-		e->checkpointed = false;
-
 	/* sanity check */
 	f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
 	f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
@@ -239,6 +231,11 @@ retry:
 	/* change address */
 	nat_set_blkaddr(e, new_blkaddr);
 	__set_nat_cache_dirty(nm_i, e);
+
+	/* update fsync_mark if its inode nat entry is still alive */
+	e = __lookup_nat_cache(nm_i, ni->ino);
+	if (e)
+		e->fsync_done = fsync_done;
 	write_unlock(&nm_i->nat_tree_lock);
 }
 
@@ -246,7 +243,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 
-	if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD)
+	if (available_free_memory(nm_i, NAT_ENTRIES))
 		return 0;
 
 	write_lock(&nm_i->nat_tree_lock);
@@ -505,7 +502,7 @@ static void truncate_node(struct dnode_of_data *dn)
 	/* Deallocate node address */
 	invalidate_blocks(sbi, ni.blk_addr);
 	dec_valid_node_count(sbi, dn->inode);
-	set_node_addr(sbi, &ni, NULL_ADDR);
+	set_node_addr(sbi, &ni, NULL_ADDR, false);
 
 	if (dn->nid == dn->inode->i_ino) {
 		remove_orphan_inode(sbi, dn->nid);
@@ -763,7 +760,7 @@ skip_partial:
 		f2fs_put_page(page, 1);
 		goto restart;
 	}
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, NODE);
 	ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
 	set_page_dirty(page);
 	unlock_page(page);
@@ -852,7 +849,8 @@ struct page *new_node_page(struct dnode_of_data *dn,
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 		return ERR_PTR(-EPERM);
 
-	page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
+	page = grab_cache_page_write_begin(NODE_MAPPING(sbi),
+					dn->nid, AOP_FLAG_NOFS);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
@@ -867,14 +865,14 @@ struct page *new_node_page(struct dnode_of_data *dn,
 	f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
 	new_ni = old_ni;
 	new_ni.ino = dn->inode->i_ino;
-	set_node_addr(sbi, &new_ni, NEW_ADDR);
+	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
 
 	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
 	set_cold_node(dn->inode, page);
 	SetPageUptodate(page);
 	set_page_dirty(page);
 
-	if (ofs == XATTR_NODE_OFFSET)
+	if (f2fs_has_xattr_block(ofs))
 		F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
 
 	dn->node_page = page;
@@ -948,7 +946,8 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
 	struct page *page;
 	int err;
 repeat:
-	page = grab_cache_page(NODE_MAPPING(sbi), nid);
+	page = grab_cache_page_write_begin(NODE_MAPPING(sbi),
+					nid, AOP_FLAG_NOFS);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
@@ -959,7 +958,7 @@ repeat:
 		goto got_it;
 
 	lock_page(page);
-	if (unlikely(!PageUptodate(page))) {
+	if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
 		f2fs_put_page(page, 1);
 		return ERR_PTR(-EIO);
 	}
@@ -968,7 +967,6 @@ repeat:
 		goto repeat;
 	}
 got_it:
-	f2fs_bug_on(nid != nid_of_node(page));
 	mark_page_accessed(page);
 	return page;
 }
@@ -1168,7 +1166,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
 			continue;
 
 		if (ino && ino_of_node(page) == ino) {
-			wait_on_page_writeback(page);
+			f2fs_wait_on_page_writeback(page, NODE);
 			if (TestClearPageError(page))
 				ret = -EIO;
 		}
@@ -1201,7 +1199,7 @@ static int f2fs_write_node_page(struct page *page,
 	if (unlikely(sbi->por_doing))
 		goto redirty_out;
 
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, NODE);
 
 	/* get old block addr of this node page */
 	nid = nid_of_node(page);
@@ -1222,7 +1220,7 @@ static int f2fs_write_node_page(struct page *page,
 	mutex_lock(&sbi->node_write);
 	set_page_writeback(page);
 	write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
-	set_node_addr(sbi, &ni, new_addr);
+	set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
 	dec_page_count(sbi, F2FS_DIRTY_NODES);
 	mutex_unlock(&sbi->node_write);
 	unlock_page(page);
@@ -1231,35 +1229,32 @@ static int f2fs_write_node_page(struct page *page,
 redirty_out:
 	dec_page_count(sbi, F2FS_DIRTY_NODES);
 	wbc->pages_skipped++;
+	account_page_redirty(page);
 	set_page_dirty(page);
 	return AOP_WRITEPAGE_ACTIVATE;
 }
 
-/*
- * It is very important to gather dirty pages and write at once, so that we can
- * submit a big bio without interfering other data writes.
- * Be default, 512 pages (2MB) * 3 node types, is more reasonable.
- */
-#define COLLECT_DIRTY_NODES	1536
 static int f2fs_write_node_pages(struct address_space *mapping,
 			struct writeback_control *wbc)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-	long nr_to_write = wbc->nr_to_write;
+	long diff;
 
 	/* balancing f2fs's metadata in background */
 	f2fs_balance_fs_bg(sbi);
 
 	/* collect a number of dirty node pages and write together */
-	if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES)
-		return 0;
+	if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
+		goto skip_write;
 
-	/* if mounting is failed, skip writing node pages */
-	wbc->nr_to_write = 3 * max_hw_blocks(sbi);
+	diff = nr_pages_to_write(sbi, NODE, wbc);
 	wbc->sync_mode = WB_SYNC_NONE;
 	sync_node_pages(sbi, 0, wbc);
-	wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
-						wbc->nr_to_write);
+	wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
+	return 0;
+
+skip_write:
+	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
 	return 0;
 }
 
@@ -1307,22 +1302,17 @@ const struct address_space_operations f2fs_node_aops = {
 	.releasepage	= f2fs_release_node_page,
 };
 
-static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
+static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
+						nid_t n)
 {
-	struct list_head *this;
-	struct free_nid *i;
-	list_for_each(this, head) {
-		i = list_entry(this, struct free_nid, list);
-		if (i->nid == n)
-			return i;
-	}
-	return NULL;
+	return radix_tree_lookup(&nm_i->free_nid_root, n);
 }
 
-static void __del_from_free_nid_list(struct free_nid *i)
+static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
+						struct free_nid *i)
 {
 	list_del(&i->list);
-	kmem_cache_free(free_nid_slab, i);
+	radix_tree_delete(&nm_i->free_nid_root, i->nid);
 }
 
 static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
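The rewrite above keeps the list for ordered scanning but adds a radix tree keyed by nid, so membership tests and deletions no longer walk the whole free list. Note that add_free_nid() in the next hunk calls radix_tree_insert() without a radix_tree_preload(), treating any nonzero return (-EEXIST for a duplicate, or -ENOMEM without preload) as "already present". A self-contained sketch of the paired insert, with hypothetical naming:

/* Hypothetical helper: insert into both index structures under the lock. */
static int free_nid_insert_sketch(struct f2fs_nm_info *nm_i,
						struct free_nid *i)
{
	spin_lock(&nm_i->free_nid_list_lock);
	if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
		/* -EEXIST (duplicate) or -ENOMEM (no preload): skip it */
		spin_unlock(&nm_i->free_nid_list_lock);
		return 0;
	}
	list_add_tail(&i->list, &nm_i->free_nid_list);
	nm_i->fcnt++;
	spin_unlock(&nm_i->free_nid_list_lock);
	return 1;
}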
@@ -1331,7 +1321,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
 	struct nat_entry *ne;
 	bool allocated = false;
 
-	if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
+	if (!available_free_memory(nm_i, FREE_NIDS))
 		return -1;
 
 	/* 0 nid should not be used */
@@ -1342,7 +1332,8 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
 	/* do not add allocated nids */
 	read_lock(&nm_i->nat_tree_lock);
 	ne = __lookup_nat_cache(nm_i, nid);
-	if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
+	if (ne &&
+		(!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR))
 		allocated = true;
 	read_unlock(&nm_i->nat_tree_lock);
 	if (allocated)
@@ -1354,7 +1345,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
 	i->state = NID_NEW;
 
 	spin_lock(&nm_i->free_nid_list_lock);
-	if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) {
+	if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
 		spin_unlock(&nm_i->free_nid_list_lock);
 		kmem_cache_free(free_nid_slab, i);
 		return 0;
@@ -1368,13 +1359,19 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
 static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
 {
 	struct free_nid *i;
+	bool need_free = false;
+
 	spin_lock(&nm_i->free_nid_list_lock);
-	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+	i = __lookup_free_nid_list(nm_i, nid);
 	if (i && i->state == NID_NEW) {
-		__del_from_free_nid_list(i);
+		__del_from_free_nid_list(nm_i, i);
 		nm_i->fcnt--;
+		need_free = true;
 	}
 	spin_unlock(&nm_i->free_nid_list_lock);
+
+	if (need_free)
+		kmem_cache_free(free_nid_slab, i);
 }
 
 static void scan_nat_page(struct f2fs_nm_info *nm_i,
@@ -1413,7 +1410,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
 		return;
 
 	/* readahead nat pages to be scanned */
-	ra_nat_pages(sbi, nid);
+	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
 
 	while (1) {
 		struct page *page = get_current_nat_page(sbi, nid);
@@ -1454,7 +1451,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct free_nid *i = NULL;
-	struct list_head *this;
 retry:
 	if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid))
 		return false;
@@ -1462,13 +1458,11 @@ retry:
 	spin_lock(&nm_i->free_nid_list_lock);
 
 	/* We should not use stale free nids created by build_free_nids */
-	if (nm_i->fcnt && !sbi->on_build_free_nids) {
+	if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
 		f2fs_bug_on(list_empty(&nm_i->free_nid_list));
-		list_for_each(this, &nm_i->free_nid_list) {
-			i = list_entry(this, struct free_nid, list);
+		list_for_each_entry(i, &nm_i->free_nid_list, list)
 			if (i->state == NID_NEW)
 				break;
-		}
 
 		f2fs_bug_on(i->state != NID_NEW);
 		*nid = i->nid;
@@ -1481,9 +1475,7 @@ retry:
 
 	/* Let's scan nat pages and its caches to get free nids */
 	mutex_lock(&nm_i->build_lock);
-	sbi->on_build_free_nids = true;
 	build_free_nids(sbi);
-	sbi->on_build_free_nids = false;
 	mutex_unlock(&nm_i->build_lock);
 	goto retry;
 }
@@ -1497,10 +1489,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
 	struct free_nid *i;
 
 	spin_lock(&nm_i->free_nid_list_lock);
-	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+	i = __lookup_free_nid_list(nm_i, nid);
 	f2fs_bug_on(!i || i->state != NID_ALLOC);
-	__del_from_free_nid_list(i);
+	__del_from_free_nid_list(nm_i, i);
 	spin_unlock(&nm_i->free_nid_list_lock);
+
+	kmem_cache_free(free_nid_slab, i);
 }
 
 /*
@@ -1510,20 +1504,25 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct free_nid *i;
+	bool need_free = false;
 
 	if (!nid)
 		return;
 
 	spin_lock(&nm_i->free_nid_list_lock);
-	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+	i = __lookup_free_nid_list(nm_i, nid);
 	f2fs_bug_on(!i || i->state != NID_ALLOC);
-	if (nm_i->fcnt > 2 * MAX_FREE_NIDS) {
-		__del_from_free_nid_list(i);
+	if (!available_free_memory(nm_i, FREE_NIDS)) {
+		__del_from_free_nid_list(nm_i, i);
+		need_free = true;
 	} else {
 		i->state = NID_NEW;
 		nm_i->fcnt++;
 	}
 	spin_unlock(&nm_i->free_nid_list_lock);
+
+	if (need_free)
+		kmem_cache_free(free_nid_slab, i);
 }
 
 void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -1531,10 +1530,83 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
 		block_t new_blkaddr)
 {
 	rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
-	set_node_addr(sbi, ni, new_blkaddr);
+	set_node_addr(sbi, ni, new_blkaddr, false);
 	clear_node_page_dirty(page);
 }
 
+void recover_inline_xattr(struct inode *inode, struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	void *src_addr, *dst_addr;
+	size_t inline_size;
+	struct page *ipage;
+	struct f2fs_inode *ri;
+
+	if (!f2fs_has_inline_xattr(inode))
+		return;
+
+	if (!IS_INODE(page))
+		return;
+
+	ri = F2FS_INODE(page);
+	if (!(ri->i_inline & F2FS_INLINE_XATTR))
+		return;
+
+	ipage = get_node_page(sbi, inode->i_ino);
+	f2fs_bug_on(IS_ERR(ipage));
+
+	dst_addr = inline_xattr_addr(ipage);
+	src_addr = inline_xattr_addr(page);
+	inline_size = inline_xattr_size(inode);
+
+	memcpy(dst_addr, src_addr, inline_size);
+
+	update_inode(inode, ipage);
+	f2fs_put_page(ipage, 1);
+}
+
+bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
+	nid_t new_xnid = nid_of_node(page);
+	struct node_info ni;
+
+	recover_inline_xattr(inode, page);
+
+	if (!f2fs_has_xattr_block(ofs_of_node(page)))
+		return false;
+
+	/* 1: invalidate the previous xattr nid */
+	if (!prev_xnid)
+		goto recover_xnid;
+
+	/* Deallocate node address */
+	get_node_info(sbi, prev_xnid, &ni);
+	f2fs_bug_on(ni.blk_addr == NULL_ADDR);
+	invalidate_blocks(sbi, ni.blk_addr);
+	dec_valid_node_count(sbi, inode);
+	set_node_addr(sbi, &ni, NULL_ADDR, false);
+
+recover_xnid:
+	/* 2: allocate new xattr nid */
+	if (unlikely(!inc_valid_node_count(sbi, inode)))
+		f2fs_bug_on(1);
+
+	remove_free_nid(NM_I(sbi), new_xnid);
+	get_node_info(sbi, new_xnid, &ni);
+	ni.ino = inode->i_ino;
+	set_node_addr(sbi, &ni, NEW_ADDR, false);
+	F2FS_I(inode)->i_xattr_nid = new_xnid;
+
+	/* 3: update xattr blkaddr */
+	refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
+	set_node_addr(sbi, &ni, blkaddr, false);
+
+	update_inode_page(inode);
+	return true;
+}
+
 int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
 {
 	struct f2fs_inode *src, *dst;
@@ -1567,7 +1639,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1567 1639
1568 if (unlikely(!inc_valid_node_count(sbi, NULL))) 1640 if (unlikely(!inc_valid_node_count(sbi, NULL)))
1569 WARN_ON(1); 1641 WARN_ON(1);
1570 set_node_addr(sbi, &new_ni, NEW_ADDR); 1642 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1571 inc_valid_inode_count(sbi); 1643 inc_valid_inode_count(sbi);
1572 f2fs_put_page(ipage, 1); 1644 f2fs_put_page(ipage, 1);
1573 return 0; 1645 return 0;
@@ -1590,15 +1662,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1590 for (; page_idx < start + nrpages; page_idx++) { 1662 for (; page_idx < start + nrpages; page_idx++) {
1591 /* alloc temporary page to read node summary info */ 1663 /* alloc temporary page to read node summary info */
1592 page = alloc_page(GFP_F2FS_ZERO); 1664 page = alloc_page(GFP_F2FS_ZERO);
1593 if (!page) { 1665 if (!page)
1594 struct page *tmp; 1666 break;
1595 list_for_each_entry_safe(page, tmp, pages, lru) {
1596 list_del(&page->lru);
1597 unlock_page(page);
1598 __free_pages(page, 0);
1599 }
1600 return -ENOMEM;
1601 }
1602 1667
1603 lock_page(page); 1668 lock_page(page);
1604 page->index = page_idx; 1669 page->index = page_idx;
@@ -1609,7 +1674,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1609 f2fs_submit_page_mbio(sbi, page, page->index, &fio); 1674 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
1610 1675
1611 f2fs_submit_merged_bio(sbi, META, READ); 1676 f2fs_submit_merged_bio(sbi, META, READ);
1612 return 0; 1677
1678 return page_idx - start;
1613} 1679}
1614 1680
1615int restore_node_summary(struct f2fs_sb_info *sbi, 1681int restore_node_summary(struct f2fs_sb_info *sbi,
@@ -1628,15 +1694,17 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1628 addr = START_BLOCK(sbi, segno); 1694 addr = START_BLOCK(sbi, segno);
1629 sum_entry = &sum->entries[0]; 1695 sum_entry = &sum->entries[0];
1630 1696
1631 for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { 1697 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
1632 nrpages = min(last_offset - i, bio_blocks); 1698 nrpages = min(last_offset - i, bio_blocks);
1633 1699
1634 /* read ahead node pages */ 1700 /* read ahead node pages */
1635 err = ra_sum_pages(sbi, &page_list, addr, nrpages); 1701 nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages);
1636 if (err) 1702 if (!nrpages)
1637 return err; 1703 return -ENOMEM;
1638 1704
1639 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1705 list_for_each_entry_safe(page, tmp, &page_list, lru) {
1706 if (err)
1707 goto skip;
1640 1708
1641 lock_page(page); 1709 lock_page(page);
1642 if (unlikely(!PageUptodate(page))) { 1710 if (unlikely(!PageUptodate(page))) {
@@ -1648,9 +1716,9 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1648 sum_entry->ofs_in_node = 0; 1716 sum_entry->ofs_in_node = 0;
1649 sum_entry++; 1717 sum_entry++;
1650 } 1718 }
1651
1652 list_del(&page->lru);
1653 unlock_page(page); 1719 unlock_page(page);
1720skip:
1721 list_del(&page->lru);
1654 __free_pages(page, 0); 1722 __free_pages(page, 0);
1655 } 1723 }
1656 } 1724 }
@@ -1709,7 +1777,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1709 struct f2fs_nm_info *nm_i = NM_I(sbi); 1777 struct f2fs_nm_info *nm_i = NM_I(sbi);
1710 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1778 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1711 struct f2fs_summary_block *sum = curseg->sum_blk; 1779 struct f2fs_summary_block *sum = curseg->sum_blk;
1712 struct list_head *cur, *n; 1780 struct nat_entry *ne, *cur;
1713 struct page *page = NULL; 1781 struct page *page = NULL;
1714 struct f2fs_nat_block *nat_blk = NULL; 1782 struct f2fs_nat_block *nat_blk = NULL;
1715 nid_t start_nid = 0, end_nid = 0; 1783 nid_t start_nid = 0, end_nid = 0;
@@ -1721,18 +1789,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1721 mutex_lock(&curseg->curseg_mutex); 1789 mutex_lock(&curseg->curseg_mutex);
1722 1790
1723 /* 1) flush dirty nat caches */ 1791 /* 1) flush dirty nat caches */
1724 list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { 1792 list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) {
1725 struct nat_entry *ne;
1726 nid_t nid; 1793 nid_t nid;
1727 struct f2fs_nat_entry raw_ne; 1794 struct f2fs_nat_entry raw_ne;
1728 int offset = -1; 1795 int offset = -1;
1729 block_t new_blkaddr; 1796 block_t new_blkaddr;
1730 1797
1731 ne = list_entry(cur, struct nat_entry, list);
1732 nid = nat_get_nid(ne);
1733
1734 if (nat_get_blkaddr(ne) == NEW_ADDR) 1798 if (nat_get_blkaddr(ne) == NEW_ADDR)
1735 continue; 1799 continue;
1800
1801 nid = nat_get_nid(ne);
1802
1736 if (flushed) 1803 if (flushed)
1737 goto to_nat_page; 1804 goto to_nat_page;
1738 1805
@@ -1783,16 +1850,12 @@ flush_now:
1783 } else { 1850 } else {
1784 write_lock(&nm_i->nat_tree_lock); 1851 write_lock(&nm_i->nat_tree_lock);
1785 __clear_nat_cache_dirty(nm_i, ne); 1852 __clear_nat_cache_dirty(nm_i, ne);
1786 ne->checkpointed = true;
1787 write_unlock(&nm_i->nat_tree_lock); 1853 write_unlock(&nm_i->nat_tree_lock);
1788 } 1854 }
1789 } 1855 }
1790 if (!flushed) 1856 if (!flushed)
1791 mutex_unlock(&curseg->curseg_mutex); 1857 mutex_unlock(&curseg->curseg_mutex);
1792 f2fs_put_page(page, 1); 1858 f2fs_put_page(page, 1);
1793
1794 /* 2) shrink nat caches if necessary */
1795 try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
1796} 1859}
1797 1860
1798static int init_node_manager(struct f2fs_sb_info *sbi) 1861static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1807,10 +1870,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1807 /* segment_count_nat includes pair segment so divide to 2. */ 1870 /* segment_count_nat includes pair segment so divide to 2. */
1808 nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; 1871 nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
1809 nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); 1872 nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
1810 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; 1873
1874 /* not used nids: 0, node, meta, (and root counted as valid node) */
1875 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3;
1811 nm_i->fcnt = 0; 1876 nm_i->fcnt = 0;
1812 nm_i->nat_cnt = 0; 1877 nm_i->nat_cnt = 0;
1878 nm_i->ram_thresh = DEF_RAM_THRESHOLD;
1813 1879
1880 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
1814 INIT_LIST_HEAD(&nm_i->free_nid_list); 1881 INIT_LIST_HEAD(&nm_i->free_nid_list);
1815 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); 1882 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1816 INIT_LIST_HEAD(&nm_i->nat_entries); 1883 INIT_LIST_HEAD(&nm_i->nat_entries);
@@ -1864,8 +1931,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1864 spin_lock(&nm_i->free_nid_list_lock); 1931 spin_lock(&nm_i->free_nid_list_lock);
1865 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { 1932 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
1866 f2fs_bug_on(i->state == NID_ALLOC); 1933 f2fs_bug_on(i->state == NID_ALLOC);
1867 __del_from_free_nid_list(i); 1934 __del_from_free_nid_list(nm_i, i);
1868 nm_i->fcnt--; 1935 nm_i->fcnt--;
1936 spin_unlock(&nm_i->free_nid_list_lock);
1937 kmem_cache_free(free_nid_slab, i);
1938 spin_lock(&nm_i->free_nid_list_lock);
1869 } 1939 }
1870 f2fs_bug_on(nm_i->fcnt); 1940 f2fs_bug_on(nm_i->fcnt);
1871 spin_unlock(&nm_i->free_nid_list_lock); 1941 spin_unlock(&nm_i->free_nid_list_lock);
@@ -1875,11 +1945,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1875 while ((found = __gang_lookup_nat_cache(nm_i, 1945 while ((found = __gang_lookup_nat_cache(nm_i,
1876 nid, NATVEC_SIZE, natvec))) { 1946 nid, NATVEC_SIZE, natvec))) {
1877 unsigned idx; 1947 unsigned idx;
1878 for (idx = 0; idx < found; idx++) { 1948 nid = nat_get_nid(natvec[found - 1]) + 1;
1879 struct nat_entry *e = natvec[idx]; 1949 for (idx = 0; idx < found; idx++)
1880 nid = nat_get_nid(e) + 1; 1950 __del_from_nat_cache(nm_i, natvec[idx]);
1881 __del_from_nat_cache(nm_i, e);
1882 }
1883 } 1951 }
1884 f2fs_bug_on(nm_i->nat_cnt); 1952 f2fs_bug_on(nm_i->nat_cnt);
1885 write_unlock(&nm_i->nat_tree_lock); 1953 write_unlock(&nm_i->nat_tree_lock);
@@ -1892,12 +1960,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1892int __init create_node_manager_caches(void) 1960int __init create_node_manager_caches(void)
1893{ 1961{
1894 nat_entry_slab = f2fs_kmem_cache_create("nat_entry", 1962 nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
1895 sizeof(struct nat_entry), NULL); 1963 sizeof(struct nat_entry));
1896 if (!nat_entry_slab) 1964 if (!nat_entry_slab)
1897 return -ENOMEM; 1965 return -ENOMEM;
1898 1966
1899 free_nid_slab = f2fs_kmem_cache_create("free_nid", 1967 free_nid_slab = f2fs_kmem_cache_create("free_nid",
1900 sizeof(struct free_nid), NULL); 1968 sizeof(struct free_nid));
1901 if (!free_nid_slab) { 1969 if (!free_nid_slab) {
1902 kmem_cache_destroy(nat_entry_slab); 1970 kmem_cache_destroy(nat_entry_slab);
1903 return -ENOMEM; 1971 return -ENOMEM;
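
The remove_free_nid() and destroy_node_manager() hunks above share one discipline: kmem_cache_free() is never called while free_nid_list_lock is held. The entry is unlinked under the spinlock, remembered via need_free, and released only after the unlock; destroy_node_manager() even drops and re-takes the lock around each free. A minimal userspace sketch of that defer-free pattern, assuming illustrative names and a pthread mutex in place of the spinlock:

#include <pthread.h>
#include <stdlib.h>

struct free_nid { struct free_nid *next; unsigned int nid; };

static pthread_mutex_t free_nid_lock = PTHREAD_MUTEX_INITIALIZER;
static struct free_nid *free_nid_list;

static void remove_free_nid(unsigned int nid)
{
	struct free_nid **pp, *victim = NULL;

	pthread_mutex_lock(&free_nid_lock);
	for (pp = &free_nid_list; *pp; pp = &(*pp)->next) {
		if ((*pp)->nid == nid) {
			victim = *pp;
			*pp = victim->next;	/* unlink under the lock */
			break;
		}
	}
	pthread_mutex_unlock(&free_nid_lock);

	/* allocator work happens outside the lock; free(NULL) is a no-op */
	free(victim);
}

int main(void)
{
	struct free_nid *n = malloc(sizeof(*n));

	n->nid = 42;
	n->next = NULL;
	free_nid_list = n;
	remove_free_nid(42);
	return 0;
}
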
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index c4c79885c993..5decc1a375f0 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -17,14 +17,11 @@
17/* # of pages to perform readahead before building free nids */ 17/* # of pages to perform readahead before building free nids */
18#define FREE_NID_PAGES 4 18#define FREE_NID_PAGES 4
19 19
20/* maximum # of free node ids to produce during build_free_nids */
21#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
22
23/* maximum readahead size for node during getting data blocks */ 20/* maximum readahead size for node during getting data blocks */
24#define MAX_RA_NODE 128 21#define MAX_RA_NODE 128
25 22
26/* maximum cached nat entries to manage memory footprint */ 23/* control the memory footprint threshold (10MB per 1GB ram) */
27#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) 24#define DEF_RAM_THRESHOLD 10
28 25
29/* vector size for gang look-up from nat cache that consists of radix tree */ 26/* vector size for gang look-up from nat cache that consists of radix tree */
30#define NATVEC_SIZE 64 27#define NATVEC_SIZE 64
@@ -45,6 +42,7 @@ struct node_info {
45struct nat_entry { 42struct nat_entry {
46 struct list_head list; /* for clean or dirty nat list */ 43 struct list_head list; /* for clean or dirty nat list */
47 bool checkpointed; /* whether it is checkpointed or not */ 44 bool checkpointed; /* whether it is checkpointed or not */
45 bool fsync_done; /* whether the latest node has fsync mark */
48 struct node_info ni; /* in-memory node information */ 46 struct node_info ni; /* in-memory node information */
49}; 47};
50 48
@@ -58,9 +56,15 @@ struct nat_entry {
58#define nat_set_version(nat, v) (nat->ni.version = v) 56#define nat_set_version(nat, v) (nat->ni.version = v)
59 57
60#define __set_nat_cache_dirty(nm_i, ne) \ 58#define __set_nat_cache_dirty(nm_i, ne) \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); 59 do { \
60 ne->checkpointed = false; \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
62 } while (0);
62#define __clear_nat_cache_dirty(nm_i, ne) \ 63#define __clear_nat_cache_dirty(nm_i, ne) \
63 list_move_tail(&ne->list, &nm_i->nat_entries); 64 do { \
65 ne->checkpointed = true; \
66 list_move_tail(&ne->list, &nm_i->nat_entries); \
67 } while (0);
64#define inc_node_version(version) (++version) 68#define inc_node_version(version) (++version)
65 69
66static inline void node_info_from_raw_nat(struct node_info *ni, 70static inline void node_info_from_raw_nat(struct node_info *ni,
@@ -71,6 +75,11 @@ static inline void node_info_from_raw_nat(struct node_info *ni,
71 ni->version = raw_ne->version; 75 ni->version = raw_ne->version;
72} 76}
73 77
78enum nid_type {
79 FREE_NIDS, /* indicates the free nid list */
80 NAT_ENTRIES /* indicates the cached nat entry */
81};
82
74/* 83/*
75 * For free nid management 84
76 */ 85 */
@@ -236,7 +245,7 @@ static inline bool IS_DNODE(struct page *node_page)
236{ 245{
237 unsigned int ofs = ofs_of_node(node_page); 246 unsigned int ofs = ofs_of_node(node_page);
238 247
239 if (ofs == XATTR_NODE_OFFSET) 248 if (f2fs_has_xattr_block(ofs))
240 return false; 249 return false;
241 250
242 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || 251 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
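
The dirty/clean NAT macros above gain a second statement each, hence the do { ... } while (0) wrapper. One nit worth noting: the conventional form omits the trailing semicolon after while (0) and lets the caller supply it, otherwise the expansion can break an unbraced if/else. A small compilable sketch of the idiom, with made-up field names:

#include <stdio.h>

struct entry { int checkpointed; int dirty; };

/* Multi-statement macro: do { } while (0) makes it act as one statement. */
#define SET_DIRTY(e)			\
	do {				\
		(e)->checkpointed = 0;	\
		(e)->dirty = 1;		\
	} while (0)

int main(void)
{
	struct entry e = { .checkpointed = 1, .dirty = 0 };

	if (e.checkpointed)
		SET_DIRTY(&e);	/* a single statement: safe without braces */
	else
		e.dirty = 0;

	printf("checkpointed=%d dirty=%d\n", e.checkpointed, e.dirty);
	return 0;
}
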
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 976a7a934db5..b1ae89f0f44e 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -27,14 +27,12 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi)
27static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, 27static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
28 nid_t ino) 28 nid_t ino)
29{ 29{
30 struct list_head *this;
31 struct fsync_inode_entry *entry; 30 struct fsync_inode_entry *entry;
32 31
33 list_for_each(this, head) { 32 list_for_each_entry(entry, head, list)
34 entry = list_entry(this, struct fsync_inode_entry, list);
35 if (entry->inode->i_ino == ino) 33 if (entry->inode->i_ino == ino)
36 return entry; 34 return entry;
37 } 35
38 return NULL; 36 return NULL;
39} 37}
40 38
@@ -136,7 +134,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
136 134
137 /* get node pages in the current segment */ 135 /* get node pages in the current segment */
138 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 136 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
139 blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; 137 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
140 138
141 /* read node page */ 139 /* read node page */
142 page = alloc_page(GFP_F2FS_ZERO); 140 page = alloc_page(GFP_F2FS_ZERO);
@@ -218,13 +216,12 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
218{ 216{
219 struct seg_entry *sentry; 217 struct seg_entry *sentry;
220 unsigned int segno = GET_SEGNO(sbi, blkaddr); 218 unsigned int segno = GET_SEGNO(sbi, blkaddr);
221 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & 219 unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
222 (sbi->blocks_per_seg - 1); 220 struct f2fs_summary_block *sum_node;
223 struct f2fs_summary sum; 221 struct f2fs_summary sum;
222 struct page *sum_page, *node_page;
224 nid_t ino, nid; 223 nid_t ino, nid;
225 void *kaddr;
226 struct inode *inode; 224 struct inode *inode;
227 struct page *node_page;
228 unsigned int offset; 225 unsigned int offset;
229 block_t bidx; 226 block_t bidx;
230 int i; 227 int i;
@@ -238,18 +235,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
238 struct curseg_info *curseg = CURSEG_I(sbi, i); 235 struct curseg_info *curseg = CURSEG_I(sbi, i);
239 if (curseg->segno == segno) { 236 if (curseg->segno == segno) {
240 sum = curseg->sum_blk->entries[blkoff]; 237 sum = curseg->sum_blk->entries[blkoff];
241 break; 238 goto got_it;
242 } 239 }
243 } 240 }
244 if (i > CURSEG_COLD_DATA) {
245 struct page *sum_page = get_sum_page(sbi, segno);
246 struct f2fs_summary_block *sum_node;
247 kaddr = page_address(sum_page);
248 sum_node = (struct f2fs_summary_block *)kaddr;
249 sum = sum_node->entries[blkoff];
250 f2fs_put_page(sum_page, 1);
251 }
252 241
242 sum_page = get_sum_page(sbi, segno);
243 sum_node = (struct f2fs_summary_block *)page_address(sum_page);
244 sum = sum_node->entries[blkoff];
245 f2fs_put_page(sum_page, 1);
246got_it:
253 /* Use the locked dnode page and inode */ 247 /* Use the locked dnode page and inode */
254 nid = le32_to_cpu(sum.nid); 248 nid = le32_to_cpu(sum.nid);
255 if (dn->inode->i_ino == nid) { 249 if (dn->inode->i_ino == nid) {
@@ -301,6 +295,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
301 if (recover_inline_data(inode, page)) 295 if (recover_inline_data(inode, page))
302 goto out; 296 goto out;
303 297
298 if (recover_xattr_data(inode, page, blkaddr))
299 goto out;
300
304 start = start_bidx_of_node(ofs_of_node(page), fi); 301 start = start_bidx_of_node(ofs_of_node(page), fi);
305 if (IS_INODE(page)) 302 if (IS_INODE(page))
306 end = start + ADDRS_PER_INODE(fi); 303 end = start + ADDRS_PER_INODE(fi);
@@ -317,7 +314,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
317 goto out; 314 goto out;
318 } 315 }
319 316
320 wait_on_page_writeback(dn.node_page); 317 f2fs_wait_on_page_writeback(dn.node_page, NODE);
321 318
322 get_node_info(sbi, dn.nid, &ni); 319 get_node_info(sbi, dn.nid, &ni);
323 f2fs_bug_on(ni.ino != ino_of_node(page)); 320 f2fs_bug_on(ni.ino != ino_of_node(page));
@@ -437,7 +434,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
437 bool need_writecp = false; 434 bool need_writecp = false;
438 435
439 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", 436 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
440 sizeof(struct fsync_inode_entry), NULL); 437 sizeof(struct fsync_inode_entry));
441 if (!fsync_entry_slab) 438 if (!fsync_entry_slab)
442 return -ENOMEM; 439 return -ENOMEM;
443 440
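
get_fsync_inode() is the same list_for_each() to list_for_each_entry() conversion applied in node.c and segment.c above: the iterator hands back the containing structure directly, so the bare struct list_head cursor and the manual list_entry() call disappear. A simplified userspace rendition; unlike the kernel macro, which infers the type with typeof(), this one takes the type explicitly:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define list_for_each_entry(pos, head, type, member)			\
	for ((pos) = container_of((head)->next, type, member);		\
	     &(pos)->member != (head);					\
	     (pos) = container_of((pos)->member.next, type, member))

struct fsync_inode_entry { struct list_head list; unsigned int ino; };

int main(void)
{
	struct list_head head;
	struct fsync_inode_entry a = { .ino = 3 }, b = { .ino = 7 };
	struct fsync_inode_entry *entry;

	/* tiny circular list: head -> a -> b -> head */
	head.next = &a.list;
	a.list.next = &b.list;
	b.list.next = &head;

	list_for_each_entry(entry, &head, struct fsync_inode_entry, list)
		if (entry->ino == 7)
			printf("found ino %u\n", entry->ino);
	return 0;
}
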
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7caac5f2ca9e..085f548be7a3 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -13,6 +13,7 @@
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/prefetch.h> 15#include <linux/prefetch.h>
16#include <linux/kthread.h>
16#include <linux/vmalloc.h> 17#include <linux/vmalloc.h>
17#include <linux/swap.h> 18#include <linux/swap.h>
18 19
@@ -24,6 +25,7 @@
24#define __reverse_ffz(x) __reverse_ffs(~(x)) 25#define __reverse_ffz(x) __reverse_ffs(~(x))
25 26
26static struct kmem_cache *discard_entry_slab; 27static struct kmem_cache *discard_entry_slab;
28static struct kmem_cache *flush_cmd_slab;
27 29
28/* 30/*
29 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since 31 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -195,6 +197,73 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
195 f2fs_sync_fs(sbi->sb, true); 197 f2fs_sync_fs(sbi->sb, true);
196} 198}
197 199
200static int issue_flush_thread(void *data)
201{
202 struct f2fs_sb_info *sbi = data;
203 struct f2fs_sm_info *sm_i = SM_I(sbi);
204 wait_queue_head_t *q = &sm_i->flush_wait_queue;
205repeat:
206 if (kthread_should_stop())
207 return 0;
208
209 spin_lock(&sm_i->issue_lock);
210 if (sm_i->issue_list) {
211 sm_i->dispatch_list = sm_i->issue_list;
212 sm_i->issue_list = sm_i->issue_tail = NULL;
213 }
214 spin_unlock(&sm_i->issue_lock);
215
216 if (sm_i->dispatch_list) {
217 struct bio *bio = bio_alloc(GFP_NOIO, 0);
218 struct flush_cmd *cmd, *next;
219 int ret;
220
221 bio->bi_bdev = sbi->sb->s_bdev;
222 ret = submit_bio_wait(WRITE_FLUSH, bio);
223
224 for (cmd = sm_i->dispatch_list; cmd; cmd = next) {
225 cmd->ret = ret;
226 next = cmd->next;
227 complete(&cmd->wait);
228 }
229 sm_i->dispatch_list = NULL;
230 }
231
232 wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list);
233 goto repeat;
234}
235
236int f2fs_issue_flush(struct f2fs_sb_info *sbi)
237{
238 struct f2fs_sm_info *sm_i = SM_I(sbi);
239 struct flush_cmd *cmd;
240 int ret;
241
242 if (!test_opt(sbi, FLUSH_MERGE))
243 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
244
245 cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC);
246 cmd->next = NULL;
247 cmd->ret = 0;
248 init_completion(&cmd->wait);
249
250 spin_lock(&sm_i->issue_lock);
251 if (sm_i->issue_list)
252 sm_i->issue_tail->next = cmd;
253 else
254 sm_i->issue_list = cmd;
255 sm_i->issue_tail = cmd;
256 spin_unlock(&sm_i->issue_lock);
257
258 if (!sm_i->dispatch_list)
259 wake_up(&sm_i->flush_wait_queue);
260
261 wait_for_completion(&cmd->wait);
262 ret = cmd->ret;
263 kmem_cache_free(flush_cmd_slab, cmd);
264 return ret;
265}
266
198static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, 267static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
199 enum dirty_type dirty_type) 268 enum dirty_type dirty_type)
200{ 269{
@@ -340,8 +409,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
340void clear_prefree_segments(struct f2fs_sb_info *sbi) 409void clear_prefree_segments(struct f2fs_sb_info *sbi)
341{ 410{
342 struct list_head *head = &(SM_I(sbi)->discard_list); 411 struct list_head *head = &(SM_I(sbi)->discard_list);
343 struct list_head *this, *next; 412 struct discard_entry *entry, *this;
344 struct discard_entry *entry;
345 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 413 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
346 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 414 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
347 unsigned int total_segs = TOTAL_SEGS(sbi); 415 unsigned int total_segs = TOTAL_SEGS(sbi);
@@ -370,8 +438,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
370 mutex_unlock(&dirty_i->seglist_lock); 438 mutex_unlock(&dirty_i->seglist_lock);
371 439
372 /* send small discards */ 440 /* send small discards */
373 list_for_each_safe(this, next, head) { 441 list_for_each_entry_safe(entry, this, head, list) {
374 entry = list_entry(this, struct discard_entry, list);
375 f2fs_issue_discard(sbi, entry->blkaddr, entry->len); 442 f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
376 list_del(&entry->list); 443 list_del(&entry->list);
377 SM_I(sbi)->nr_discards -= entry->len; 444 SM_I(sbi)->nr_discards -= entry->len;
@@ -405,7 +472,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
405 472
406 se = get_seg_entry(sbi, segno); 473 se = get_seg_entry(sbi, segno);
407 new_vblocks = se->valid_blocks + del; 474 new_vblocks = se->valid_blocks + del;
408 offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); 475 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
409 476
410 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || 477 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
411 (new_vblocks > sbi->blocks_per_seg))); 478 (new_vblocks > sbi->blocks_per_seg)));
@@ -434,12 +501,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
434 get_sec_entry(sbi, segno)->valid_blocks += del; 501 get_sec_entry(sbi, segno)->valid_blocks += del;
435} 502}
436 503
437static void refresh_sit_entry(struct f2fs_sb_info *sbi, 504void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new)
438 block_t old_blkaddr, block_t new_blkaddr)
439{ 505{
440 update_sit_entry(sbi, new_blkaddr, 1); 506 update_sit_entry(sbi, new, 1);
441 if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) 507 if (GET_SEGNO(sbi, old) != NULL_SEGNO)
442 update_sit_entry(sbi, old_blkaddr, -1); 508 update_sit_entry(sbi, old, -1);
509
510 locate_dirty_segment(sbi, GET_SEGNO(sbi, old));
511 locate_dirty_segment(sbi, GET_SEGNO(sbi, new));
443} 512}
444 513
445void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) 514void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
@@ -881,17 +950,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
881 950
882 stat_inc_block_count(sbi, curseg); 951 stat_inc_block_count(sbi, curseg);
883 952
953 if (!__has_curseg_space(sbi, type))
954 sit_i->s_ops->allocate_segment(sbi, type, false);
884 /* 955 /*
885 * SIT information should be updated before segment allocation, 956 * SIT information should be updated before segment allocation,
886 * since SSR needs latest valid block information. 957 * since SSR needs latest valid block information.
887 */ 958 */
888 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); 959 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
889
890 if (!__has_curseg_space(sbi, type))
891 sit_i->s_ops->allocate_segment(sbi, type, false);
892
893 locate_dirty_segment(sbi, old_cursegno); 960 locate_dirty_segment(sbi, old_cursegno);
894 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 961
895 mutex_unlock(&sit_i->sentry_lock); 962 mutex_unlock(&sit_i->sentry_lock);
896 963
897 if (page && IS_NODESEG(type)) 964 if (page && IS_NODESEG(type))
@@ -987,14 +1054,11 @@ void recover_data_page(struct f2fs_sb_info *sbi,
987 change_curseg(sbi, type, true); 1054 change_curseg(sbi, type, true);
988 } 1055 }
989 1056
990 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 1057 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
991 (sbi->blocks_per_seg - 1);
992 __add_sum_entry(sbi, type, sum); 1058 __add_sum_entry(sbi, type, sum);
993 1059
994 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1060 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
995
996 locate_dirty_segment(sbi, old_cursegno); 1061 locate_dirty_segment(sbi, old_cursegno);
997 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
998 1062
999 mutex_unlock(&sit_i->sentry_lock); 1063 mutex_unlock(&sit_i->sentry_lock);
1000 mutex_unlock(&curseg->curseg_mutex); 1064 mutex_unlock(&curseg->curseg_mutex);
@@ -1028,8 +1092,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
1028 curseg->next_segno = segno; 1092 curseg->next_segno = segno;
1029 change_curseg(sbi, type, true); 1093 change_curseg(sbi, type, true);
1030 } 1094 }
1031 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 1095 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
1032 (sbi->blocks_per_seg - 1);
1033 __add_sum_entry(sbi, type, sum); 1096 __add_sum_entry(sbi, type, sum);
1034 1097
1035 /* change the current log to the next block addr in advance */ 1098 /* change the current log to the next block addr in advance */
@@ -1037,28 +1100,50 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
1037 curseg->next_segno = next_segno; 1100 curseg->next_segno = next_segno;
1038 change_curseg(sbi, type, true); 1101 change_curseg(sbi, type, true);
1039 } 1102 }
1040 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & 1103 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
1041 (sbi->blocks_per_seg - 1);
1042 1104
1043 /* rewrite node page */ 1105 /* rewrite node page */
1044 set_page_writeback(page); 1106 set_page_writeback(page);
1045 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); 1107 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
1046 f2fs_submit_merged_bio(sbi, NODE, WRITE); 1108 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1047 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1109 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
1048
1049 locate_dirty_segment(sbi, old_cursegno); 1110 locate_dirty_segment(sbi, old_cursegno);
1050 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
1051 1111
1052 mutex_unlock(&sit_i->sentry_lock); 1112 mutex_unlock(&sit_i->sentry_lock);
1053 mutex_unlock(&curseg->curseg_mutex); 1113 mutex_unlock(&curseg->curseg_mutex);
1054} 1114}
1055 1115
1116static inline bool is_merged_page(struct f2fs_sb_info *sbi,
1117 struct page *page, enum page_type type)
1118{
1119 enum page_type btype = PAGE_TYPE_OF_BIO(type);
1120 struct f2fs_bio_info *io = &sbi->write_io[btype];
1121 struct bio_vec *bvec;
1122 int i;
1123
1124 down_read(&io->io_rwsem);
1125 if (!io->bio)
1126 goto out;
1127
1128 bio_for_each_segment_all(bvec, io->bio, i) {
1129 if (page == bvec->bv_page) {
1130 up_read(&io->io_rwsem);
1131 return true;
1132 }
1133 }
1134
1135out:
1136 up_read(&io->io_rwsem);
1137 return false;
1138}
1139
1056void f2fs_wait_on_page_writeback(struct page *page, 1140void f2fs_wait_on_page_writeback(struct page *page,
1057 enum page_type type) 1141 enum page_type type)
1058{ 1142{
1059 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1143 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1060 if (PageWriteback(page)) { 1144 if (PageWriteback(page)) {
1061 f2fs_submit_merged_bio(sbi, type, WRITE); 1145 if (is_merged_page(sbi, page, type))
1146 f2fs_submit_merged_bio(sbi, type, WRITE);
1062 wait_on_page_writeback(page); 1147 wait_on_page_writeback(page);
1063 } 1148 }
1064} 1149}
@@ -1167,9 +1252,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1167 ns->ofs_in_node = 0; 1252 ns->ofs_in_node = 0;
1168 } 1253 }
1169 } else { 1254 } else {
1170 if (restore_node_summary(sbi, segno, sum)) { 1255 int err;
1256
1257 err = restore_node_summary(sbi, segno, sum);
1258 if (err) {
1171 f2fs_put_page(new, 1); 1259 f2fs_put_page(new, 1);
1172 return -EINVAL; 1260 return err;
1173 } 1261 }
1174 } 1262 }
1175 } 1263 }
@@ -1190,6 +1278,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1190static int restore_curseg_summaries(struct f2fs_sb_info *sbi) 1278static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1191{ 1279{
1192 int type = CURSEG_HOT_DATA; 1280 int type = CURSEG_HOT_DATA;
1281 int err;
1193 1282
1194 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { 1283 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
1195 /* restore for compacted data summary */ 1284 /* restore for compacted data summary */
@@ -1198,9 +1287,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1198 type = CURSEG_HOT_NODE; 1287 type = CURSEG_HOT_NODE;
1199 } 1288 }
1200 1289
1201 for (; type <= CURSEG_COLD_NODE; type++) 1290 for (; type <= CURSEG_COLD_NODE; type++) {
1202 if (read_normal_summaries(sbi, type)) 1291 err = read_normal_summaries(sbi, type);
1203 return -EINVAL; 1292 if (err)
1293 return err;
1294 }
1295
1204 return 0; 1296 return 0;
1205} 1297}
1206 1298
@@ -1583,47 +1675,6 @@ static int build_curseg(struct f2fs_sb_info *sbi)
1583 return restore_curseg_summaries(sbi); 1675 return restore_curseg_summaries(sbi);
1584} 1676}
1585 1677
1586static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages)
1587{
1588 struct address_space *mapping = META_MAPPING(sbi);
1589 struct page *page;
1590 block_t blk_addr, prev_blk_addr = 0;
1591 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1592 int blkno = start;
1593 struct f2fs_io_info fio = {
1594 .type = META,
1595 .rw = READ_SYNC | REQ_META | REQ_PRIO
1596 };
1597
1598 for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) {
1599
1600 blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK);
1601
1602 if (blkno != start && prev_blk_addr + 1 != blk_addr)
1603 break;
1604 prev_blk_addr = blk_addr;
1605repeat:
1606 page = grab_cache_page(mapping, blk_addr);
1607 if (!page) {
1608 cond_resched();
1609 goto repeat;
1610 }
1611 if (PageUptodate(page)) {
1612 mark_page_accessed(page);
1613 f2fs_put_page(page, 1);
1614 continue;
1615 }
1616
1617 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
1618
1619 mark_page_accessed(page);
1620 f2fs_put_page(page, 0);
1621 }
1622
1623 f2fs_submit_merged_bio(sbi, META, READ);
1624 return blkno - start;
1625}
1626
1627static void build_sit_entries(struct f2fs_sb_info *sbi) 1678static void build_sit_entries(struct f2fs_sb_info *sbi)
1628{ 1679{
1629 struct sit_info *sit_i = SIT_I(sbi); 1680 struct sit_info *sit_i = SIT_I(sbi);
@@ -1635,7 +1686,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1635 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1686 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1636 1687
1637 do { 1688 do {
1638 readed = ra_sit_pages(sbi, start_blk, nrpages); 1689 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
1639 1690
1640 start = start_blk * sit_i->sents_per_block; 1691 start = start_blk * sit_i->sents_per_block;
1641 end = (start_blk + readed) * sit_i->sents_per_block; 1692 end = (start_blk + readed) * sit_i->sents_per_block;
@@ -1781,6 +1832,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1781{ 1832{
1782 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); 1833 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
1783 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 1834 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1835 dev_t dev = sbi->sb->s_bdev->bd_dev;
1784 struct f2fs_sm_info *sm_info; 1836 struct f2fs_sm_info *sm_info;
1785 int err; 1837 int err;
1786 1838
@@ -1799,7 +1851,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1799 sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); 1851 sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
1800 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); 1852 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
1801 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 1853 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1802 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; 1854 sm_info->rec_prefree_segments = sm_info->main_segments *
1855 DEF_RECLAIM_PREFREE_SEGMENTS / 100;
1803 sm_info->ipu_policy = F2FS_IPU_DISABLE; 1856 sm_info->ipu_policy = F2FS_IPU_DISABLE;
1804 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; 1857 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
1805 1858
@@ -1807,6 +1860,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1807 sm_info->nr_discards = 0; 1860 sm_info->nr_discards = 0;
1808 sm_info->max_discards = 0; 1861 sm_info->max_discards = 0;
1809 1862
1863 if (test_opt(sbi, FLUSH_MERGE)) {
1864 spin_lock_init(&sm_info->issue_lock);
1865 init_waitqueue_head(&sm_info->flush_wait_queue);
1866
1867 sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
1868 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
1869 if (IS_ERR(sm_info->f2fs_issue_flush))
1870 return PTR_ERR(sm_info->f2fs_issue_flush);
1871 }
1872
1810 err = build_sit_info(sbi); 1873 err = build_sit_info(sbi);
1811 if (err) 1874 if (err)
1812 return err; 1875 return err;
@@ -1915,6 +1978,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1915 struct f2fs_sm_info *sm_info = SM_I(sbi); 1978 struct f2fs_sm_info *sm_info = SM_I(sbi);
1916 if (!sm_info) 1979 if (!sm_info)
1917 return; 1980 return;
1981 if (sm_info->f2fs_issue_flush)
1982 kthread_stop(sm_info->f2fs_issue_flush);
1918 destroy_dirty_segmap(sbi); 1983 destroy_dirty_segmap(sbi);
1919 destroy_curseg(sbi); 1984 destroy_curseg(sbi);
1920 destroy_free_segmap(sbi); 1985 destroy_free_segmap(sbi);
@@ -1926,13 +1991,20 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1926int __init create_segment_manager_caches(void) 1991int __init create_segment_manager_caches(void)
1927{ 1992{
1928 discard_entry_slab = f2fs_kmem_cache_create("discard_entry", 1993 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
1929 sizeof(struct discard_entry), NULL); 1994 sizeof(struct discard_entry));
1930 if (!discard_entry_slab) 1995 if (!discard_entry_slab)
1931 return -ENOMEM; 1996 return -ENOMEM;
1997 flush_cmd_slab = f2fs_kmem_cache_create("flush_command",
1998 sizeof(struct flush_cmd));
1999 if (!flush_cmd_slab) {
2000 kmem_cache_destroy(discard_entry_slab);
2001 return -ENOMEM;
2002 }
1932 return 0; 2003 return 0;
1933} 2004}
1934 2005
1935void destroy_segment_manager_caches(void) 2006void destroy_segment_manager_caches(void)
1936{ 2007{
1937 kmem_cache_destroy(discard_entry_slab); 2008 kmem_cache_destroy(discard_entry_slab);
2009 kmem_cache_destroy(flush_cmd_slab);
1938} 2010}
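
The flush_merge machinery is the biggest functional change in segment.c: with the new mount option, each fsync no longer issues its own WRITE_FLUSH bio. Callers queue a flush_cmd on issue_list, wake the single f2fs_issue_flush kthread, and sleep on a per-command completion; the thread takes the whole batch, submits one flush, and copies the shared return code to every waiter. A rough userspace sketch of that shape, assuming pthreads in place of the kthread and completion, and a LIFO queue for brevity (the real code keeps separate issue and dispatch lists):

#include <pthread.h>
#include <stdio.h>

struct flush_cmd {
	struct flush_cmd *next;
	int ret;
	int done;
	pthread_cond_t wait;
};

static pthread_mutex_t issue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t issue_wake = PTHREAD_COND_INITIALIZER;
static struct flush_cmd *issue_list;

static int do_device_flush(void) { return 0; }	/* stand-in for the flush bio */

static void *issue_flush_thread(void *unused)
{
	(void)unused;
	for (;;) {
		struct flush_cmd *batch, *cmd;
		int ret;

		pthread_mutex_lock(&issue_lock);
		while (!issue_list)
			pthread_cond_wait(&issue_wake, &issue_lock);
		batch = issue_list;	/* grab every queued command at once */
		issue_list = NULL;
		pthread_mutex_unlock(&issue_lock);

		ret = do_device_flush();	/* one flush serves the whole batch */

		pthread_mutex_lock(&issue_lock);
		for (cmd = batch; cmd; cmd = cmd->next) {
			cmd->ret = ret;
			cmd->done = 1;
			pthread_cond_signal(&cmd->wait);
		}
		pthread_mutex_unlock(&issue_lock);
	}
	return NULL;
}

static int issue_flush(void)
{
	struct flush_cmd cmd = { .done = 0 };
	int ret;

	pthread_cond_init(&cmd.wait, NULL);
	pthread_mutex_lock(&issue_lock);
	cmd.next = issue_list;
	issue_list = &cmd;
	pthread_cond_signal(&issue_wake);
	while (!cmd.done)
		pthread_cond_wait(&cmd.wait, &issue_lock);
	pthread_mutex_unlock(&issue_lock);

	ret = cmd.ret;
	pthread_cond_destroy(&cmd.wait);
	return ret;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, issue_flush_thread, NULL);
	printf("flush ret = %d\n", issue_flush());
	return 0;
}
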
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5731682d7516..7091204680f4 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -14,7 +14,7 @@
14#define NULL_SEGNO ((unsigned int)(~0)) 14#define NULL_SEGNO ((unsigned int)(~0))
15#define NULL_SECNO ((unsigned int)(~0)) 15#define NULL_SECNO ((unsigned int)(~0))
16 16
17#define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */ 17#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
18 18
19/* L: Logical segment # in volume, R: Relative segment # in main area */ 19/* L: Logical segment # in volume, R: Relative segment # in main area */
20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) 20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
@@ -57,6 +57,9 @@
57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr) 57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ 58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) 59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
60#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
61 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
62
60#define GET_SEGNO(sbi, blk_addr) \ 63#define GET_SEGNO(sbi, blk_addr) \
61 (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ 64 (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
62 NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ 65 NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
@@ -377,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
377 380
378static inline block_t written_block_count(struct f2fs_sb_info *sbi) 381static inline block_t written_block_count(struct f2fs_sb_info *sbi)
379{ 382{
380 struct sit_info *sit_i = SIT_I(sbi); 383 return SIT_I(sbi)->written_valid_blocks;
381 block_t vblocks;
382
383 mutex_lock(&sit_i->sentry_lock);
384 vblocks = sit_i->written_valid_blocks;
385 mutex_unlock(&sit_i->sentry_lock);
386
387 return vblocks;
388} 384}
389 385
390static inline unsigned int free_segments(struct f2fs_sb_info *sbi) 386static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
391{ 387{
392 struct free_segmap_info *free_i = FREE_I(sbi); 388 return FREE_I(sbi)->free_segments;
393 unsigned int free_segs;
394
395 read_lock(&free_i->segmap_lock);
396 free_segs = free_i->free_segments;
397 read_unlock(&free_i->segmap_lock);
398
399 return free_segs;
400} 389}
401 390
402static inline int reserved_segments(struct f2fs_sb_info *sbi) 391static inline int reserved_segments(struct f2fs_sb_info *sbi)
@@ -406,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi)
406 395
407static inline unsigned int free_sections(struct f2fs_sb_info *sbi) 396static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
408{ 397{
409 struct free_segmap_info *free_i = FREE_I(sbi); 398 return FREE_I(sbi)->free_sections;
410 unsigned int free_secs;
411
412 read_lock(&free_i->segmap_lock);
413 free_secs = free_i->free_sections;
414 read_unlock(&free_i->segmap_lock);
415
416 return free_secs;
417} 399}
418 400
419static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) 401static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
@@ -682,3 +664,46 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
682 struct request_queue *q = bdev_get_queue(bdev); 664 struct request_queue *q = bdev_get_queue(bdev);
683 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); 665 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
684} 666}
667
668/*
669 * It is very important to gather dirty pages and write at once, so that we can
670 * submit a big bio without interfering with other data writes.
671 * By default, 512 pages for directory data,
672 * 512 pages (2MB) * 3 for three types of nodes, and
673 * max_bio_blocks for meta are set.
674 */
675static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
676{
677 if (type == DATA)
678 return sbi->blocks_per_seg;
679 else if (type == NODE)
680 return 3 * sbi->blocks_per_seg;
681 else if (type == META)
682 return MAX_BIO_BLOCKS(max_hw_blocks(sbi));
683 else
684 return 0;
685}
686
687/*
688 * When writing pages, it is better to align nr_to_write to the segment size.
689 */
690static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
691 struct writeback_control *wbc)
692{
693 long nr_to_write, desired;
694
695 if (wbc->sync_mode != WB_SYNC_NONE)
696 return 0;
697
698 nr_to_write = wbc->nr_to_write;
699
700 if (type == DATA)
701 desired = 4096;
702 else if (type == NODE)
703 desired = 3 * max_hw_blocks(sbi);
704 else
705 desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
706
707 wbc->nr_to_write = desired;
708 return desired - nr_to_write;
709}
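
GET_BLKOFF_FROM_SEG0() simply names the '& (sbi->blocks_per_seg - 1)' expression that recovery.c and segment.c were each open-coding. The mask is only a valid substitute for the modulo because blocks_per_seg is a power of two; a quick demonstration:

#include <assert.h>
#include <stdio.h>

#define BLKOFF(addr, blocks_per_seg) ((addr) & ((blocks_per_seg) - 1u))

int main(void)
{
	unsigned int blocks_per_seg = 512;		/* 2MB segment / 4KB blocks */
	unsigned int addr = 3 * blocks_per_seg + 17;	/* block 17 of segment 3 */

	assert((blocks_per_seg & (blocks_per_seg - 1)) == 0);	/* power of two */
	assert(BLKOFF(addr, blocks_per_seg) == addr % blocks_per_seg);
	printf("offset in segment = %u\n", BLKOFF(addr, blocks_per_seg));
	return 0;
}
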
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1a85f83abd53..c756923a7302 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -51,6 +51,7 @@ enum {
51 Opt_disable_ext_identify, 51 Opt_disable_ext_identify,
52 Opt_inline_xattr, 52 Opt_inline_xattr,
53 Opt_inline_data, 53 Opt_inline_data,
54 Opt_flush_merge,
54 Opt_err, 55 Opt_err,
55}; 56};
56 57
@@ -67,6 +68,7 @@ static match_table_t f2fs_tokens = {
67 {Opt_disable_ext_identify, "disable_ext_identify"}, 68 {Opt_disable_ext_identify, "disable_ext_identify"},
68 {Opt_inline_xattr, "inline_xattr"}, 69 {Opt_inline_xattr, "inline_xattr"},
69 {Opt_inline_data, "inline_data"}, 70 {Opt_inline_data, "inline_data"},
71 {Opt_flush_merge, "flush_merge"},
70 {Opt_err, NULL}, 72 {Opt_err, NULL},
71}; 73};
72 74
@@ -74,6 +76,7 @@ static match_table_t f2fs_tokens = {
74enum { 76enum {
75 GC_THREAD, /* struct f2fs_gc_thread */ 77 GC_THREAD, /* struct f2fs_gc_thread */
76 SM_INFO, /* struct f2fs_sm_info */ 78 SM_INFO, /* struct f2fs_sm_info */
79 NM_INFO, /* struct f2fs_nm_info */
77 F2FS_SBI, /* struct f2fs_sb_info */ 80 F2FS_SBI, /* struct f2fs_sb_info */
78}; 81};
79 82
@@ -92,6 +95,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
92 return (unsigned char *)sbi->gc_thread; 95 return (unsigned char *)sbi->gc_thread;
93 else if (struct_type == SM_INFO) 96 else if (struct_type == SM_INFO)
94 return (unsigned char *)SM_I(sbi); 97 return (unsigned char *)SM_I(sbi);
98 else if (struct_type == NM_INFO)
99 return (unsigned char *)NM_I(sbi);
95 else if (struct_type == F2FS_SBI) 100 else if (struct_type == F2FS_SBI)
96 return (unsigned char *)sbi; 101 return (unsigned char *)sbi;
97 return NULL; 102 return NULL;
@@ -183,7 +188,9 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
183F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 188F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
184F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 189F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
185F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
191F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
186F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 192F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
193F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
187 194
188#define ATTR_LIST(name) (&f2fs_attr_##name.attr) 195#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
189static struct attribute *f2fs_attrs[] = { 196static struct attribute *f2fs_attrs[] = {
@@ -196,6 +203,8 @@ static struct attribute *f2fs_attrs[] = {
196 ATTR_LIST(ipu_policy), 203 ATTR_LIST(ipu_policy),
197 ATTR_LIST(min_ipu_util), 204 ATTR_LIST(min_ipu_util),
198 ATTR_LIST(max_victim_search), 205 ATTR_LIST(max_victim_search),
206 ATTR_LIST(dir_level),
207 ATTR_LIST(ram_thresh),
199 NULL, 208 NULL,
200}; 209};
201 210
@@ -256,9 +265,9 @@ static int parse_options(struct super_block *sb, char *options)
256 265
257 if (!name) 266 if (!name)
258 return -ENOMEM; 267 return -ENOMEM;
259 if (!strncmp(name, "on", 2)) 268 if (strlen(name) == 2 && !strncmp(name, "on", 2))
260 set_opt(sbi, BG_GC); 269 set_opt(sbi, BG_GC);
261 else if (!strncmp(name, "off", 3)) 270 else if (strlen(name) == 3 && !strncmp(name, "off", 3))
262 clear_opt(sbi, BG_GC); 271 clear_opt(sbi, BG_GC);
263 else { 272 else {
264 kfree(name); 273 kfree(name);
@@ -327,6 +336,9 @@ static int parse_options(struct super_block *sb, char *options)
327 case Opt_inline_data: 336 case Opt_inline_data:
328 set_opt(sbi, INLINE_DATA); 337 set_opt(sbi, INLINE_DATA);
329 break; 338 break;
339 case Opt_flush_merge:
340 set_opt(sbi, FLUSH_MERGE);
341 break;
330 default: 342 default:
331 f2fs_msg(sb, KERN_ERR, 343 f2fs_msg(sb, KERN_ERR,
332 "Unrecognized mount option \"%s\" or missing value", 344 "Unrecognized mount option \"%s\" or missing value",
@@ -353,12 +365,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
353 fi->i_current_depth = 1; 365 fi->i_current_depth = 1;
354 fi->i_advise = 0; 366 fi->i_advise = 0;
355 rwlock_init(&fi->ext.ext_lock); 367 rwlock_init(&fi->ext.ext_lock);
368 init_rwsem(&fi->i_sem);
356 369
357 set_inode_flag(fi, FI_NEW_INODE); 370 set_inode_flag(fi, FI_NEW_INODE);
358 371
359 if (test_opt(F2FS_SB(sb), INLINE_XATTR)) 372 if (test_opt(F2FS_SB(sb), INLINE_XATTR))
360 set_inode_flag(fi, FI_INLINE_XATTR); 373 set_inode_flag(fi, FI_INLINE_XATTR);
361 374
375 /* Will be used by directory only */
376 fi->i_dir_level = F2FS_SB(sb)->dir_level;
377
362 return &fi->vfs_inode; 378 return &fi->vfs_inode;
363} 379}
364 380
@@ -526,6 +542,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
526 seq_puts(seq, ",disable_ext_identify"); 542 seq_puts(seq, ",disable_ext_identify");
527 if (test_opt(sbi, INLINE_DATA)) 543 if (test_opt(sbi, INLINE_DATA))
528 seq_puts(seq, ",inline_data"); 544 seq_puts(seq, ",inline_data");
545 if (test_opt(sbi, FLUSH_MERGE))
546 seq_puts(seq, ",flush_merge");
529 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 547 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
530 548
531 return 0; 549 return 0;
@@ -539,13 +557,22 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
539 le32_to_cpu(sbi->raw_super->segment_count_main); 557 le32_to_cpu(sbi->raw_super->segment_count_main);
540 int i; 558 int i;
541 559
560 seq_puts(seq, "format: segment_type|valid_blocks\n"
561 "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
562
542 for (i = 0; i < total_segs; i++) { 563 for (i = 0; i < total_segs; i++) {
543 seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); 564 struct seg_entry *se = get_seg_entry(sbi, i);
544 if (i != 0 && (i % 10) == 0) 565
545 seq_puts(seq, "\n"); 566 if ((i % 10) == 0)
567 seq_printf(seq, "%-5d", i);
568 seq_printf(seq, "%d|%-3u", se->type,
569 get_valid_blocks(sbi, i, 1));
570 if ((i % 10) == 9 || i == (total_segs - 1))
571 seq_putc(seq, '\n');
546 else 572 else
547 seq_puts(seq, " "); 573 seq_putc(seq, ' ');
548 } 574 }
575
549 return 0; 576 return 0;
550} 577}
551 578
@@ -568,6 +595,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
568 struct f2fs_mount_info org_mount_opt; 595 struct f2fs_mount_info org_mount_opt;
569 int err, active_logs; 596 int err, active_logs;
570 597
598 sync_filesystem(sb);
599
571 /* 600 /*
572 * Save the old mount options in case we 601 * Save the old mount options in case we
573 * need to restore them. 602 * need to restore them.
@@ -638,6 +667,8 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
638 667
639 if (unlikely(ino < F2FS_ROOT_INO(sbi))) 668 if (unlikely(ino < F2FS_ROOT_INO(sbi)))
640 return ERR_PTR(-ESTALE); 669 return ERR_PTR(-ESTALE);
670 if (unlikely(ino >= NM_I(sbi)->max_nid))
671 return ERR_PTR(-ESTALE);
641 672
642 /* 673 /*
643 * f2fs_iget isn't quite right if the inode is currently unallocated! 674 * f2fs_iget isn't quite right if the inode is currently unallocated!
@@ -785,6 +816,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
785 816
786 for (i = 0; i < NR_COUNT_TYPE; i++) 817 for (i = 0; i < NR_COUNT_TYPE; i++)
787 atomic_set(&sbi->nr_pages[i], 0); 818 atomic_set(&sbi->nr_pages[i], 0);
819
820 sbi->dir_level = DEF_DIR_LEVEL;
788} 821}
789 822
790/* 823/*
@@ -896,11 +929,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
896 sbi->por_doing = false; 929 sbi->por_doing = false;
897 spin_lock_init(&sbi->stat_lock); 930 spin_lock_init(&sbi->stat_lock);
898 931
899 mutex_init(&sbi->read_io.io_mutex); 932 init_rwsem(&sbi->read_io.io_rwsem);
900 sbi->read_io.sbi = sbi; 933 sbi->read_io.sbi = sbi;
901 sbi->read_io.bio = NULL; 934 sbi->read_io.bio = NULL;
902 for (i = 0; i < NR_PAGE_TYPE; i++) { 935 for (i = 0; i < NR_PAGE_TYPE; i++) {
903 mutex_init(&sbi->write_io[i].io_mutex); 936 init_rwsem(&sbi->write_io[i].io_rwsem);
904 sbi->write_io[i].sbi = sbi; 937 sbi->write_io[i].sbi = sbi;
905 sbi->write_io[i].bio = NULL; 938 sbi->write_io[i].bio = NULL;
906 } 939 }
@@ -989,28 +1022,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
989 goto free_root_inode; 1022 goto free_root_inode;
990 } 1023 }
991 1024
992 /* recover fsynced data */
993 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
994 err = recover_fsync_data(sbi);
995 if (err)
996 f2fs_msg(sb, KERN_ERR,
997 "Cannot recover all fsync data errno=%ld", err);
998 }
999
1000 /*
1001 * If filesystem is not mounted as read-only then
1002 * do start the gc_thread.
1003 */
1004 if (!(sb->s_flags & MS_RDONLY)) {
1005 /* After POR, we can run background GC thread.*/
1006 err = start_gc_thread(sbi);
1007 if (err)
1008 goto free_gc;
1009 }
1010
1011 err = f2fs_build_stats(sbi); 1025 err = f2fs_build_stats(sbi);
1012 if (err) 1026 if (err)
1013 goto free_gc; 1027 goto free_root_inode;
1014 1028
1015 if (f2fs_proc_root) 1029 if (f2fs_proc_root)
1016 sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); 1030 sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -1032,17 +1046,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1032 err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, 1046 err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
1033 "%s", sb->s_id); 1047 "%s", sb->s_id);
1034 if (err) 1048 if (err)
1035 goto fail; 1049 goto free_proc;
1050
1051 /* recover fsynced data */
1052 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1053 err = recover_fsync_data(sbi);
1054 if (err)
1055 f2fs_msg(sb, KERN_ERR,
1056 "Cannot recover all fsync data errno=%ld", err);
1057 }
1036 1058
1059 /*
1060 * If filesystem is not mounted as read-only then
1061 * do start the gc_thread.
1062 */
1063 if (!(sb->s_flags & MS_RDONLY)) {
1064 /* After POR, we can run background GC thread.*/
1065 err = start_gc_thread(sbi);
1066 if (err)
1067 goto free_kobj;
1068 }
1037 return 0; 1069 return 0;
1038fail: 1070
1071free_kobj:
1072 kobject_del(&sbi->s_kobj);
1073free_proc:
1039 if (sbi->s_proc) { 1074 if (sbi->s_proc) {
1040 remove_proc_entry("segment_info", sbi->s_proc); 1075 remove_proc_entry("segment_info", sbi->s_proc);
1041 remove_proc_entry(sb->s_id, f2fs_proc_root); 1076 remove_proc_entry(sb->s_id, f2fs_proc_root);
1042 } 1077 }
1043 f2fs_destroy_stats(sbi); 1078 f2fs_destroy_stats(sbi);
1044free_gc:
1045 stop_gc_thread(sbi);
1046free_root_inode: 1079free_root_inode:
1047 dput(sb->s_root); 1080 dput(sb->s_root);
1048 sb->s_root = NULL; 1081 sb->s_root = NULL;
@@ -1082,7 +1115,7 @@ MODULE_ALIAS_FS("f2fs");
1082static int __init init_inodecache(void) 1115static int __init init_inodecache(void)
1083{ 1116{
1084 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", 1117 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
1085 sizeof(struct f2fs_inode_info), NULL); 1118 sizeof(struct f2fs_inode_info));
1086 if (!f2fs_inode_cachep) 1119 if (!f2fs_inode_cachep)
1087 return -ENOMEM; 1120 return -ENOMEM;
1088 return 0; 1121 return 0;
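
The background_gc change in parse_options() fixes a prefix-match bug: strncmp(name, "on", 2) alone also accepts strings like "onward", so the patch compares strlen(name) first. A quick sketch of the difference (plain strcmp() gives the same exact-match behaviour in userspace code):

#include <stdio.h>
#include <string.h>

static int match_exact(const char *name, const char *token)
{
	size_t len = strlen(token);

	/* length check first, then prefix compare: only exact matches pass */
	return strlen(name) == len && !strncmp(name, token, len);
}

int main(void)
{
	printf("on     -> %d\n", match_exact("on", "on"));	/* 1 */
	printf("onward -> %d\n", match_exact("onward", "on"));	/* 0: now rejected */
	return 0;
}
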
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 89d0422a91a8..503c2451131e 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -275,7 +275,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
275 275
276 inline_size = inline_xattr_size(inode); 276 inline_size = inline_xattr_size(inode);
277 277
278 txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); 278 txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
279 if (!txattr_addr) 279 if (!txattr_addr)
280 return NULL; 280 return NULL;
281 281
@@ -407,6 +407,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
407 if (name == NULL) 407 if (name == NULL)
408 return -EINVAL; 408 return -EINVAL;
409 name_len = strlen(name); 409 name_len = strlen(name);
410 if (name_len > F2FS_NAME_LEN)
411 return -ERANGE;
410 412
411 base_addr = read_all_xattrs(inode, NULL); 413 base_addr = read_all_xattrs(inode, NULL);
412 if (!base_addr) 414 if (!base_addr)
@@ -590,7 +592,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
590 f2fs_balance_fs(sbi); 592 f2fs_balance_fs(sbi);
591 593
592 f2fs_lock_op(sbi); 594 f2fs_lock_op(sbi);
595 /* protect xattr_ver */
596 down_write(&F2FS_I(inode)->i_sem);
593 err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); 597 err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage);
598 up_write(&F2FS_I(inode)->i_sem);
594 f2fs_unlock_op(sbi); 599 f2fs_unlock_op(sbi);
595 600
596 return err; 601 return err;
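
f2fs_getxattr() now fails fast with -ERANGE when the requested name is longer than F2FS_NAME_LEN, instead of allocating and scanning xattr blocks for a name that can never match. A minimal sketch of the guard; the 255-byte limit is an assumption about F2FS_NAME_LEN, not taken from this diff:

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define F2FS_NAME_LEN 255	/* assumed value of the on-disk name limit */

static int getxattr_name_check(const char *name)
{
	if (name == NULL)
		return -EINVAL;
	if (strlen(name) > F2FS_NAME_LEN)
		return -ERANGE;	/* cannot possibly match an on-disk entry */
	return 0;
}

int main(void)
{
	char big[300];

	memset(big, 'a', sizeof(big) - 1);
	big[sizeof(big) - 1] = '\0';
	printf("%d %d\n", getxattr_name_check("user.tag"),
			  getxattr_name_check(big));	/* 0 and -ERANGE */
	return 0;
}
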
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 854b578f6695..b3361fe2bcb5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(fat_build_inode);
490 490
491static void fat_evict_inode(struct inode *inode) 491static void fat_evict_inode(struct inode *inode)
492{ 492{
493 truncate_inode_pages(&inode->i_data, 0); 493 truncate_inode_pages_final(&inode->i_data);
494 if (!inode->i_nlink) { 494 if (!inode->i_nlink) {
495 inode->i_size = 0; 495 inode->i_size = 0;
496 fat_truncate_blocks(inode, 0); 496 fat_truncate_blocks(inode, 0);
@@ -635,6 +635,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
635 struct msdos_sb_info *sbi = MSDOS_SB(sb); 635 struct msdos_sb_info *sbi = MSDOS_SB(sb);
636 *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); 636 *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
637 637
638 sync_filesystem(sb);
639
638 /* make sure we update state on remount. */ 640 /* make sure we update state on remount. */
639 new_rdonly = *flags & MS_RDONLY; 641 new_rdonly = *flags & MS_RDONLY;
640 if (new_rdonly != (sb->s_flags & MS_RDONLY)) { 642 if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ef6866592a0f..9ead1596399a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -272,9 +272,19 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
272 case F_SETFL: 272 case F_SETFL:
273 err = setfl(fd, filp, arg); 273 err = setfl(fd, filp, arg);
274 break; 274 break;
275#if BITS_PER_LONG != 32
276 /* 32-bit arches must use fcntl64() */
277 case F_GETLKP:
278#endif
275 case F_GETLK: 279 case F_GETLK:
276 err = fcntl_getlk(filp, (struct flock __user *) arg); 280 err = fcntl_getlk(filp, cmd, (struct flock __user *) arg);
277 break; 281 break;
282#if BITS_PER_LONG != 32
283 /* 32-bit arches must use fcntl64() */
284 case F_SETLKP:
285 case F_SETLKPW:
286#endif
287 /* Fallthrough */
278 case F_SETLK: 288 case F_SETLK:
279 case F_SETLKW: 289 case F_SETLKW:
280 err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); 290 err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
@@ -388,17 +398,20 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
388 goto out1; 398 goto out1;
389 399
390 switch (cmd) { 400 switch (cmd) {
391 case F_GETLK64: 401 case F_GETLK64:
392 err = fcntl_getlk64(f.file, (struct flock64 __user *) arg); 402 case F_GETLKP:
393 break; 403 err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg);
394 case F_SETLK64: 404 break;
395 case F_SETLKW64: 405 case F_SETLK64:
396 err = fcntl_setlk64(fd, f.file, cmd, 406 case F_SETLKW64:
397 (struct flock64 __user *) arg); 407 case F_SETLKP:
398 break; 408 case F_SETLKPW:
399 default: 409 err = fcntl_setlk64(fd, f.file, cmd,
400 err = do_fcntl(fd, cmd, arg, f.file); 410 (struct flock64 __user *) arg);
401 break; 411 break;
412 default:
413 err = do_fcntl(fd, cmd, arg, f.file);
414 break;
402 } 415 }
403out1: 416out1:
404 fdput(f); 417 fdput(f);
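
The fcntl() hunks wire up the new file-private lock commands: F_GETLKP/F_SETLKP/F_SETLKPW take a struct flock but attach the lock to the open file description rather than the owning process, and 32-bit arches reach them only through fcntl64(), hence the BITS_PER_LONG guards in do_fcntl(). These constants were later renamed to F_OFD_GETLK/F_OFD_SETLK/F_OFD_SETLKW, which is worth double-checking against the final headers. A hedged usage sketch assuming the renamed constant; note l_pid must be zero for this lock type:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef F_OFD_SETLK
#define F_OFD_SETLK 37	/* assumed Linux value; named F_SETLKP in this patch */
#endif

int main(void)
{
	struct flock fl;
	int fd = open("/tmp/ofd-lock-demo", O_RDWR | O_CREAT, 0600);

	if (fd < 0)
		return 1;

	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;	/* whole file */
	fl.l_pid = 0;	/* required: the lock has no owning pid */

	/* the lock lives with this open file description, not the process */
	if (fcntl(fd, F_OFD_SETLK, &fl) == -1)
		perror("fcntl(F_OFD_SETLK)");
	close(fd);
	return 0;
}
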
diff --git a/fs/file.c b/fs/file.c
index eb56a13dab3e..b61293badfb1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -497,7 +497,7 @@ repeat:
497 error = fd; 497 error = fd;
498#if 1 498#if 1
499 /* Sanity check */ 499 /* Sanity check */
500 if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { 500 if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
501 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); 501 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
502 rcu_assign_pointer(fdt->fd[fd], NULL); 502 rcu_assign_pointer(fdt->fd[fd], NULL);
503 } 503 }
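
The alloc_fd() sanity check switches from rcu_dereference_raw() to rcu_access_pointer() because the fetched pointer is only compared against NULL and never dereferenced, so no RCU read-side protection or dependency ordering is required. As a loose userspace analogue (an assumption about the intent, not the kernel implementation), a relaxed atomic load suffices for a pure NULL test:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic(int *) slot;	/* stands in for fdt->fd[fd] */

int main(void)
{
	/* no acquire ordering needed: the value itself is never dereferenced */
	if (atomic_load_explicit(&slot, memory_order_relaxed) != NULL)
		printf("slot 0 not NULL!\n");
	else
		printf("slot free\n");
	return 0;
}
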
diff --git a/fs/file_table.c b/fs/file_table.c
index 5b24008ea4f6..01071c4d752e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -235,7 +235,7 @@ static void __fput(struct file *file)
235 * in the file cleanup chain. 235 * in the file cleanup chain.
236 */ 236 */
237 eventpoll_release(file); 237 eventpoll_release(file);
238 locks_remove_flock(file); 238 locks_remove_file(file);
239 239
240 if (unlikely(file->f_flags & FASYNC)) { 240 if (unlikely(file->f_flags & FASYNC)) {
241 if (file->f_op->fasync) 241 if (file->f_op->fasync)
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 92567d95ba6a..5797d45a78cb 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -121,6 +121,7 @@ int unregister_filesystem(struct file_system_type * fs)
121 121
122EXPORT_SYMBOL(unregister_filesystem); 122EXPORT_SYMBOL(unregister_filesystem);
123 123
124#ifdef CONFIG_SYSFS_SYSCALL
124static int fs_index(const char __user * __name) 125static int fs_index(const char __user * __name)
125{ 126{
126 struct file_system_type * tmp; 127 struct file_system_type * tmp;
@@ -199,6 +200,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
199 } 200 }
200 return retval; 201 return retval;
201} 202}
203#endif
202 204
203int __init get_filesystem_list(char *buf) 205int __init get_filesystem_list(char *buf)
204{ 206{
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f47df72cef17..363e3ae25f6b 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -354,7 +354,7 @@ static void vxfs_i_callback(struct rcu_head *head)
354void 354void
355vxfs_evict_inode(struct inode *ip) 355vxfs_evict_inode(struct inode *ip)
356{ 356{
357 truncate_inode_pages(&ip->i_data, 0); 357 truncate_inode_pages_final(&ip->i_data);
358 clear_inode(ip); 358 clear_inode(ip);
359 call_rcu(&ip->i_rcu, vxfs_i_callback); 359 call_rcu(&ip->i_rcu, vxfs_i_callback);
360} 360}
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 25d4099a4aea..99c7f0a37af4 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
192 * vxfs_lookup - lookup pathname component 192 * vxfs_lookup - lookup pathname component
193 * @dip: dir in which we lookup 193 * @dip: dir in which we lookup
194 * @dp: dentry we lookup 194 * @dp: dentry we lookup
195 * @nd: lookup nameidata 195 * @flags: lookup flags
196 * 196 *
197 * Description: 197 * Description:
198 * vxfs_lookup tries to lookup the pathname component described 198 * vxfs_lookup tries to lookup the pathname component described
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index e37eb274e492..7ca8c75d50d3 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
124 124
125static int vxfs_remount(struct super_block *sb, int *flags, char *data) 125static int vxfs_remount(struct super_block *sb, int *flags, char *data)
126{ 126{
127 sync_filesystem(sb);
127 *flags |= MS_RDONLY; 128 *flags |= MS_RDONLY;
128 return 0; 129 return 0;
129} 130}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d754e3cf99a8..be568b7311d6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -89,16 +89,31 @@ static inline struct inode *wb_inode(struct list_head *head)
89#define CREATE_TRACE_POINTS 89#define CREATE_TRACE_POINTS
90#include <trace/events/writeback.h> 90#include <trace/events/writeback.h>
91 91
92EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
93
94static void bdi_wakeup_thread(struct backing_dev_info *bdi)
95{
96 spin_lock_bh(&bdi->wb_lock);
97 if (test_bit(BDI_registered, &bdi->state))
98 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
99 spin_unlock_bh(&bdi->wb_lock);
100}
101
92static void bdi_queue_work(struct backing_dev_info *bdi, 102static void bdi_queue_work(struct backing_dev_info *bdi,
93 struct wb_writeback_work *work) 103 struct wb_writeback_work *work)
94{ 104{
95 trace_writeback_queue(bdi, work); 105 trace_writeback_queue(bdi, work);
96 106
97 spin_lock_bh(&bdi->wb_lock); 107 spin_lock_bh(&bdi->wb_lock);
108 if (!test_bit(BDI_registered, &bdi->state)) {
109 if (work->done)
110 complete(work->done);
111 goto out_unlock;
112 }
98 list_add_tail(&work->list, &bdi->work_list); 113 list_add_tail(&work->list, &bdi->work_list);
99 spin_unlock_bh(&bdi->wb_lock);
100
101 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 114 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
115out_unlock:
116 spin_unlock_bh(&bdi->wb_lock);
102} 117}
103 118
104static void 119static void
@@ -114,7 +129,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
114 work = kzalloc(sizeof(*work), GFP_ATOMIC); 129 work = kzalloc(sizeof(*work), GFP_ATOMIC);
115 if (!work) { 130 if (!work) {
116 trace_writeback_nowork(bdi); 131 trace_writeback_nowork(bdi);
117 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 132 bdi_wakeup_thread(bdi);
118 return; 133 return;
119 } 134 }
120 135
@@ -161,7 +176,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
161 * writeback as soon as there is no other work to do. 176 * writeback as soon as there is no other work to do.
162 */ 177 */
163 trace_writeback_wake_background(bdi); 178 trace_writeback_wake_background(bdi);
164 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 179 bdi_wakeup_thread(bdi);
165} 180}
166 181
167/* 182/*
@@ -1017,7 +1032,7 @@ void bdi_writeback_workfn(struct work_struct *work)
1017 current->flags |= PF_SWAPWRITE; 1032 current->flags |= PF_SWAPWRITE;
1018 1033
1019 if (likely(!current_is_workqueue_rescuer() || 1034 if (likely(!current_is_workqueue_rescuer() ||
1020 list_empty(&bdi->bdi_list))) { 1035 !test_bit(BDI_registered, &bdi->state))) {
1021 /* 1036 /*
1022 * The normal path. Keep writing back @bdi until its 1037 * The normal path. Keep writing back @bdi until its
1023 * work_list is empty. Note that this path is also taken 1038 * work_list is empty. Note that this path is also taken
@@ -1039,10 +1054,10 @@ void bdi_writeback_workfn(struct work_struct *work)
1039 trace_writeback_pages_written(pages_written); 1054 trace_writeback_pages_written(pages_written);
1040 } 1055 }
1041 1056
1042 if (!list_empty(&bdi->work_list) || 1057 if (!list_empty(&bdi->work_list))
1043 (wb_has_dirty_io(wb) && dirty_writeback_interval)) 1058 mod_delayed_work(bdi_wq, &wb->dwork, 0);
1044 queue_delayed_work(bdi_wq, &wb->dwork, 1059 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
1045 msecs_to_jiffies(dirty_writeback_interval * 10)); 1060 bdi_wakeup_thread_delayed(bdi);
1046 1061
1047 current->flags &= ~PF_SWAPWRITE; 1062 current->flags &= ~PF_SWAPWRITE;
1048} 1063}
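
Every hunk above follows one shutdown-safe pattern: work may be queued against a bdi only while BDI_registered is set, the flag is tested under the same wb_lock the unregister path takes, and a rejected work item completes its waiter instead of being silently dropped. The same pattern as a generic, self-contained sketch (names hypothetical, not from this patch):

#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct dev_ctx {
	spinlock_t lock;
	bool registered;
	struct delayed_work dwork;
};

/* Returns false if the device is shutting down; the caller must then
 * complete any waiters itself, as bdi_queue_work() does above. */
static bool dev_queue_work(struct dev_ctx *ctx)
{
	bool queued = false;

	spin_lock_bh(&ctx->lock);
	if (ctx->registered) {
		mod_delayed_work(system_wq, &ctx->dwork, 0);
		queued = true;
	}
	spin_unlock_bh(&ctx->lock);
	return queued;
}

static void dev_unregister(struct dev_ctx *ctx)
{
	spin_lock_bh(&ctx->lock);
	ctx->registered = false;	/* no new work after this point */
	spin_unlock_bh(&ctx->lock);
	cancel_delayed_work_sync(&ctx->dwork);
}
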
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b96a49b37d66..13b691a8a7d2 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
95 struct iovec iov = { .iov_base = buf, .iov_len = count }; 95 struct iovec iov = { .iov_base = buf, .iov_len = count };
96 struct fuse_io_priv io = { .async = 0, .file = file }; 96 struct fuse_io_priv io = { .async = 0, .file = file };
97 97
98 return fuse_direct_io(&io, &iov, 1, count, &pos, 0); 98 return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
99} 99}
100 100
101static ssize_t cuse_write(struct file *file, const char __user *buf, 101static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
109 * No locking or generic_write_checks(), the server is 109 * No locking or generic_write_checks(), the server is
110 * responsible for locking and sanity checks. 110 * responsible for locking and sanity checks.
111 */ 111 */
112 return fuse_direct_io(&io, &iov, 1, count, &pos, 1); 112 return fuse_direct_io(&io, &iov, 1, count, &pos,
113 FUSE_DIO_WRITE | FUSE_DIO_CUSE);
113} 114}
114 115
115static int cuse_open(struct inode *inode, struct file *file) 116static int cuse_open(struct inode *inode, struct file *file)
@@ -568,7 +569,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
568 569
569 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); 570 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
570} 571}
571static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL); 572static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL);
572 573
573static ssize_t cuse_class_abort_store(struct device *dev, 574static ssize_t cuse_class_abort_store(struct device *dev,
574 struct device_attribute *attr, 575 struct device_attribute *attr,
@@ -579,7 +580,7 @@ static ssize_t cuse_class_abort_store(struct device *dev,
579 fuse_abort_conn(&cc->fc); 580 fuse_abort_conn(&cc->fc);
580 return count; 581 return count;
581} 582}
582static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store); 583static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
583 584
584static struct attribute *cuse_class_dev_attrs[] = { 585static struct attribute *cuse_class_dev_attrs[] = {
585 &dev_attr_waiting.attr, 586 &dev_attr_waiting.attr,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 1d1292c581c3..5b4e035b364c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -839,6 +839,14 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
839 struct kstat *stat) 839 struct kstat *stat)
840{ 840{
841 unsigned int blkbits; 841 unsigned int blkbits;
842 struct fuse_conn *fc = get_fuse_conn(inode);
843
844 /* see the comment in fuse_change_attributes() */
845 if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
846 attr->size = i_size_read(inode);
847 attr->mtime = inode->i_mtime.tv_sec;
848 attr->mtimensec = inode->i_mtime.tv_nsec;
849 }
842 850
843 stat->dev = inode->i_sb->s_dev; 851 stat->dev = inode->i_sb->s_dev;
844 stat->ino = attr->ino; 852 stat->ino = attr->ino;
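
A worked example of why fuse_fillattr() now prefers the local values: with writeback_cache enabled, a buffered write can grow a 100-byte file to 150 bytes entirely in the page cache. Until writeback reaches the server, the server still answers GETATTR with size 100 and a stale mtime, so stat() has to be filled from i_size_read() and inode->i_mtime or the file would appear to shrink back between write() and fsync().
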
@@ -1477,12 +1485,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
1477 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); 1485 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
1478} 1486}
1479 1487
1480static bool update_mtime(unsigned ivalid) 1488static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
1481{ 1489{
1482 /* Always update if mtime is explicitly set */ 1490 /* Always update if mtime is explicitly set */
1483 if (ivalid & ATTR_MTIME_SET) 1491 if (ivalid & ATTR_MTIME_SET)
1484 return true; 1492 return true;
1485 1493
1494 /* Or if kernel i_mtime is the official one */
1495 if (trust_local_mtime)
1496 return true;
1497
1486 /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ 1498 /* If it's an open(O_TRUNC) or an ftruncate(), don't update */
1487 if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) 1499 if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
1488 return false; 1500 return false;
@@ -1491,7 +1503,8 @@ static bool update_mtime(unsigned ivalid)
1491 return true; 1503 return true;
1492} 1504}
1493 1505
1494static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) 1506static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
1507 bool trust_local_mtime)
1495{ 1508{
1496 unsigned ivalid = iattr->ia_valid; 1509 unsigned ivalid = iattr->ia_valid;
1497 1510
@@ -1510,11 +1523,11 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
1510 if (!(ivalid & ATTR_ATIME_SET)) 1523 if (!(ivalid & ATTR_ATIME_SET))
1511 arg->valid |= FATTR_ATIME_NOW; 1524 arg->valid |= FATTR_ATIME_NOW;
1512 } 1525 }
1513 if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { 1526 if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) {
1514 arg->valid |= FATTR_MTIME; 1527 arg->valid |= FATTR_MTIME;
1515 arg->mtime = iattr->ia_mtime.tv_sec; 1528 arg->mtime = iattr->ia_mtime.tv_sec;
1516 arg->mtimensec = iattr->ia_mtime.tv_nsec; 1529 arg->mtimensec = iattr->ia_mtime.tv_nsec;
1517 if (!(ivalid & ATTR_MTIME_SET)) 1530 if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime)
1518 arg->valid |= FATTR_MTIME_NOW; 1531 arg->valid |= FATTR_MTIME_NOW;
1519 } 1532 }
1520} 1533}
@@ -1563,6 +1576,63 @@ void fuse_release_nowrite(struct inode *inode)
1563 spin_unlock(&fc->lock); 1576 spin_unlock(&fc->lock);
1564} 1577}
1565 1578
1579static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
1580 struct inode *inode,
1581 struct fuse_setattr_in *inarg_p,
1582 struct fuse_attr_out *outarg_p)
1583{
1584 req->in.h.opcode = FUSE_SETATTR;
1585 req->in.h.nodeid = get_node_id(inode);
1586 req->in.numargs = 1;
1587 req->in.args[0].size = sizeof(*inarg_p);
1588 req->in.args[0].value = inarg_p;
1589 req->out.numargs = 1;
1590 if (fc->minor < 9)
1591 req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
1592 else
1593 req->out.args[0].size = sizeof(*outarg_p);
1594 req->out.args[0].value = outarg_p;
1595}
1596
1597/*
1598 * Flush inode->i_mtime to the server
1599 */
1600int fuse_flush_mtime(struct file *file, bool nofail)
1601{
1602 struct inode *inode = file->f_mapping->host;
1603 struct fuse_inode *fi = get_fuse_inode(inode);
1604 struct fuse_conn *fc = get_fuse_conn(inode);
1605 struct fuse_req *req = NULL;
1606 struct fuse_setattr_in inarg;
1607 struct fuse_attr_out outarg;
1608 int err;
1609
1610 if (nofail) {
1611 req = fuse_get_req_nofail_nopages(fc, file);
1612 } else {
1613 req = fuse_get_req_nopages(fc);
1614 if (IS_ERR(req))
1615 return PTR_ERR(req);
1616 }
1617
1618 memset(&inarg, 0, sizeof(inarg));
1619 memset(&outarg, 0, sizeof(outarg));
1620
1621 inarg.valid |= FATTR_MTIME;
1622 inarg.mtime = inode->i_mtime.tv_sec;
1623 inarg.mtimensec = inode->i_mtime.tv_nsec;
1624
1625 fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
1626 fuse_request_send(fc, req);
1627 err = req->out.h.error;
1628 fuse_put_request(fc, req);
1629
1630 if (!err)
1631 clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
1632
1633 return err;
1634}
1635
1566/* 1636/*
1567 * Set attributes, and at the same time refresh them. 1637 * Set attributes, and at the same time refresh them.
1568 * 1638 *
@@ -1580,8 +1650,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1580 struct fuse_setattr_in inarg; 1650 struct fuse_setattr_in inarg;
1581 struct fuse_attr_out outarg; 1651 struct fuse_attr_out outarg;
1582 bool is_truncate = false; 1652 bool is_truncate = false;
1653 bool is_wb = fc->writeback_cache;
1583 loff_t oldsize; 1654 loff_t oldsize;
1584 int err; 1655 int err;
1656 bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode);
1585 1657
1586 if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) 1658 if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
1587 attr->ia_valid |= ATTR_FORCE; 1659 attr->ia_valid |= ATTR_FORCE;
@@ -1610,7 +1682,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1610 1682
1611 memset(&inarg, 0, sizeof(inarg)); 1683 memset(&inarg, 0, sizeof(inarg));
1612 memset(&outarg, 0, sizeof(outarg)); 1684 memset(&outarg, 0, sizeof(outarg));
1613 iattr_to_fattr(attr, &inarg); 1685 iattr_to_fattr(attr, &inarg, trust_local_mtime);
1614 if (file) { 1686 if (file) {
1615 struct fuse_file *ff = file->private_data; 1687 struct fuse_file *ff = file->private_data;
1616 inarg.valid |= FATTR_FH; 1688 inarg.valid |= FATTR_FH;
@@ -1621,17 +1693,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1621 inarg.valid |= FATTR_LOCKOWNER; 1693 inarg.valid |= FATTR_LOCKOWNER;
1622 inarg.lock_owner = fuse_lock_owner_id(fc, current->files); 1694 inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
1623 } 1695 }
1624 req->in.h.opcode = FUSE_SETATTR; 1696 fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
1625 req->in.h.nodeid = get_node_id(inode);
1626 req->in.numargs = 1;
1627 req->in.args[0].size = sizeof(inarg);
1628 req->in.args[0].value = &inarg;
1629 req->out.numargs = 1;
1630 if (fc->minor < 9)
1631 req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
1632 else
1633 req->out.args[0].size = sizeof(outarg);
1634 req->out.args[0].value = &outarg;
1635 fuse_request_send(fc, req); 1697 fuse_request_send(fc, req);
1636 err = req->out.h.error; 1698 err = req->out.h.error;
1637 fuse_put_request(fc, req); 1699 fuse_put_request(fc, req);
@@ -1648,10 +1710,18 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1648 } 1710 }
1649 1711
1650 spin_lock(&fc->lock); 1712 spin_lock(&fc->lock);
1713 /* the kernel maintains i_mtime locally */
1714 if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) {
1715 inode->i_mtime = attr->ia_mtime;
1716 clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
1717 }
1718
1651 fuse_change_attributes_common(inode, &outarg.attr, 1719 fuse_change_attributes_common(inode, &outarg.attr,
1652 attr_timeout(&outarg)); 1720 attr_timeout(&outarg));
1653 oldsize = inode->i_size; 1721 oldsize = inode->i_size;
1654 i_size_write(inode, outarg.attr.size); 1722 /* see the comment in fuse_change_attributes() */
1723 if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
1724 i_size_write(inode, outarg.attr.size);
1655 1725
1656 if (is_truncate) { 1726 if (is_truncate) {
1657 /* NOTE: this may release/reacquire fc->lock */ 1727 /* NOTE: this may release/reacquire fc->lock */
@@ -1663,7 +1733,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1663 * Only call invalidate_inode_pages2() after removing 1733 * Only call invalidate_inode_pages2() after removing
1664 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 1734 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
1665 */ 1735 */
1666 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { 1736 if ((is_truncate || !is_wb) &&
1737 S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
1667 truncate_pagecache(inode, outarg.attr.size); 1738 truncate_pagecache(inode, outarg.attr.size);
1668 invalidate_inode_pages2(inode->i_mapping); 1739 invalidate_inode_pages2(inode->i_mapping);
1669 } 1740 }
@@ -1875,6 +1946,17 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1875 return err; 1946 return err;
1876} 1947}
1877 1948
1949static int fuse_update_time(struct inode *inode, struct timespec *now,
1950 int flags)
1951{
1952 if (flags & S_MTIME) {
1953 inode->i_mtime = *now;
1954 set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state);
1955 BUG_ON(!S_ISREG(inode->i_mode));
1956 }
1957 return 0;
1958}
1959
1878static const struct inode_operations fuse_dir_inode_operations = { 1960static const struct inode_operations fuse_dir_inode_operations = {
1879 .lookup = fuse_lookup, 1961 .lookup = fuse_lookup,
1880 .mkdir = fuse_mkdir, 1962 .mkdir = fuse_mkdir,
@@ -1914,6 +1996,7 @@ static const struct inode_operations fuse_common_inode_operations = {
1914 .getxattr = fuse_getxattr, 1996 .getxattr = fuse_getxattr,
1915 .listxattr = fuse_listxattr, 1997 .listxattr = fuse_listxattr,
1916 .removexattr = fuse_removexattr, 1998 .removexattr = fuse_removexattr,
1999 .update_time = fuse_update_time,
1917}; 2000};
1918 2001
1919static const struct inode_operations fuse_symlink_inode_operations = { 2002static const struct inode_operations fuse_symlink_inode_operations = {
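
Taken together, the dir.c changes give i_mtime a clear lifecycle under writeback_cache: a cached write or mmap store reaches fuse_update_time(), which bumps inode->i_mtime and sets FUSE_I_MTIME_DIRTY; fsync() and the final release flush the value with fuse_flush_mtime(), a FUSE_SETATTR carrying only FATTR_MTIME that clears the bit on success; and an explicit utimes() lands in fuse_do_setattr(), where trust_local_mtime makes the kernel's copy authoritative instead of the server's.
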
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 77bcc303c3ae..48992cac714b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -188,6 +188,22 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
188} 188}
189EXPORT_SYMBOL_GPL(fuse_do_open); 189EXPORT_SYMBOL_GPL(fuse_do_open);
190 190
191static void fuse_link_write_file(struct file *file)
192{
193 struct inode *inode = file_inode(file);
194 struct fuse_conn *fc = get_fuse_conn(inode);
195 struct fuse_inode *fi = get_fuse_inode(inode);
196 struct fuse_file *ff = file->private_data;
197 /*
198 * file may be written through mmap, so chain it onto the
 199 * inode's write_files list
200 */
201 spin_lock(&fc->lock);
202 if (list_empty(&ff->write_entry))
203 list_add(&ff->write_entry, &fi->write_files);
204 spin_unlock(&fc->lock);
205}
206
191void fuse_finish_open(struct inode *inode, struct file *file) 207void fuse_finish_open(struct inode *inode, struct file *file)
192{ 208{
193 struct fuse_file *ff = file->private_data; 209 struct fuse_file *ff = file->private_data;
@@ -208,6 +224,8 @@ void fuse_finish_open(struct inode *inode, struct file *file)
208 spin_unlock(&fc->lock); 224 spin_unlock(&fc->lock);
209 fuse_invalidate_attr(inode); 225 fuse_invalidate_attr(inode);
210 } 226 }
227 if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
228 fuse_link_write_file(file);
211} 229}
212 230
213int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 231int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -292,6 +310,15 @@ static int fuse_open(struct inode *inode, struct file *file)
292 310
293static int fuse_release(struct inode *inode, struct file *file) 311static int fuse_release(struct inode *inode, struct file *file)
294{ 312{
313 struct fuse_conn *fc = get_fuse_conn(inode);
314
315 /* see fuse_vma_close() for !writeback_cache case */
316 if (fc->writeback_cache)
317 filemap_write_and_wait(file->f_mapping);
318
319 if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state))
320 fuse_flush_mtime(file, true);
321
295 fuse_release_common(file, FUSE_RELEASE); 322 fuse_release_common(file, FUSE_RELEASE);
296 323
297 /* return value is ignored by VFS */ 324 /* return value is ignored by VFS */
@@ -333,12 +360,13 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
333} 360}
334 361
335/* 362/*
336 * Check if page is under writeback 363 * Check if any page in a range is under writeback
337 * 364 *
338 * This is currently done by walking the list of writepage requests 365 * This is currently done by walking the list of writepage requests
339 * for the inode, which can be pretty inefficient. 366 * for the inode, which can be pretty inefficient.
340 */ 367 */
341static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) 368static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
369 pgoff_t idx_to)
342{ 370{
343 struct fuse_conn *fc = get_fuse_conn(inode); 371 struct fuse_conn *fc = get_fuse_conn(inode);
344 struct fuse_inode *fi = get_fuse_inode(inode); 372 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -351,8 +379,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
351 379
352 BUG_ON(req->inode != inode); 380 BUG_ON(req->inode != inode);
353 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; 381 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
354 if (curr_index <= index && 382 if (idx_from < curr_index + req->num_pages &&
355 index < curr_index + req->num_pages) { 383 curr_index <= idx_to) {
356 found = true; 384 found = true;
357 break; 385 break;
358 } 386 }
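
The rewritten condition is the standard interval-overlap test: a request covering pages [curr_index, curr_index + num_pages - 1] intersects the queried range exactly when each interval begins before the other one ends. Reduced to a standalone sketch:

#include <stdbool.h>

/* True iff [curr, curr + num_pages - 1] and [from, to] intersect. */
static bool ranges_overlap(unsigned long curr, unsigned long num_pages,
			   unsigned long from, unsigned long to)
{
	return from < curr + num_pages && curr <= to;
}

With from == to == index this degenerates to the old single-page check (curr_index <= index && index < curr_index + num_pages), which is why fuse_page_is_writeback() can become a one-line wrapper.
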
@@ -362,6 +390,11 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
362 return found; 390 return found;
363} 391}
364 392
393static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
394{
395 return fuse_range_is_writeback(inode, index, index);
396}
397
365/* 398/*
366 * Wait for page writeback to be completed. 399 * Wait for page writeback to be completed.
367 * 400 *
@@ -376,6 +409,21 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
376 return 0; 409 return 0;
377} 410}
378 411
412/*
413 * Wait for all pending writepages on the inode to finish.
414 *
415 * This is currently done by blocking further writes with FUSE_NOWRITE
416 * and waiting for all sent writes to complete.
417 *
418 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
419 * could conflict with truncation.
420 */
421static void fuse_sync_writes(struct inode *inode)
422{
423 fuse_set_nowrite(inode);
424 fuse_release_nowrite(inode);
425}
426
379static int fuse_flush(struct file *file, fl_owner_t id) 427static int fuse_flush(struct file *file, fl_owner_t id)
380{ 428{
381 struct inode *inode = file_inode(file); 429 struct inode *inode = file_inode(file);
@@ -391,6 +439,14 @@ static int fuse_flush(struct file *file, fl_owner_t id)
391 if (fc->no_flush) 439 if (fc->no_flush)
392 return 0; 440 return 0;
393 441
442 err = filemap_write_and_wait(file->f_mapping);
443 if (err)
444 return err;
445
446 mutex_lock(&inode->i_mutex);
447 fuse_sync_writes(inode);
448 mutex_unlock(&inode->i_mutex);
449
394 req = fuse_get_req_nofail_nopages(fc, file); 450 req = fuse_get_req_nofail_nopages(fc, file);
395 memset(&inarg, 0, sizeof(inarg)); 451 memset(&inarg, 0, sizeof(inarg));
396 inarg.fh = ff->fh; 452 inarg.fh = ff->fh;
@@ -411,21 +467,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
411 return err; 467 return err;
412} 468}
413 469
414/*
415 * Wait for all pending writepages on the inode to finish.
416 *
417 * This is currently done by blocking further writes with FUSE_NOWRITE
418 * and waiting for all sent writes to complete.
419 *
420 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
421 * could conflict with truncation.
422 */
423static void fuse_sync_writes(struct inode *inode)
424{
425 fuse_set_nowrite(inode);
426 fuse_release_nowrite(inode);
427}
428
429int fuse_fsync_common(struct file *file, loff_t start, loff_t end, 470int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
430 int datasync, int isdir) 471 int datasync, int isdir)
431{ 472{
@@ -459,6 +500,12 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
459 500
460 fuse_sync_writes(inode); 501 fuse_sync_writes(inode);
461 502
503 if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) {
504 int err = fuse_flush_mtime(file, false);
505 if (err)
506 goto out;
507 }
508
462 req = fuse_get_req_nopages(fc); 509 req = fuse_get_req_nopages(fc);
463 if (IS_ERR(req)) { 510 if (IS_ERR(req)) {
464 err = PTR_ERR(req); 511 err = PTR_ERR(req);
@@ -655,7 +702,33 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
655 spin_unlock(&fc->lock); 702 spin_unlock(&fc->lock);
656} 703}
657 704
658static int fuse_readpage(struct file *file, struct page *page) 705static void fuse_short_read(struct fuse_req *req, struct inode *inode,
706 u64 attr_ver)
707{
708 size_t num_read = req->out.args[0].size;
709 struct fuse_conn *fc = get_fuse_conn(inode);
710
711 if (fc->writeback_cache) {
712 /*
 713 * A hole in the file. Some data after the hole is already in the
 714 * page cache but has not reached the client fs yet, so the hole
 715 * is not present there.
716 */
717 int i;
718 int start_idx = num_read >> PAGE_CACHE_SHIFT;
719 size_t off = num_read & (PAGE_CACHE_SIZE - 1);
720
721 for (i = start_idx; i < req->num_pages; i++) {
722 zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
723 off = 0;
724 }
725 } else {
726 loff_t pos = page_offset(req->pages[0]) + num_read;
727 fuse_read_update_size(inode, pos, attr_ver);
728 }
729}
730
731static int fuse_do_readpage(struct file *file, struct page *page)
659{ 732{
660 struct fuse_io_priv io = { .async = 0, .file = file }; 733 struct fuse_io_priv io = { .async = 0, .file = file };
661 struct inode *inode = page->mapping->host; 734 struct inode *inode = page->mapping->host;
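
A worked example of the zeroing arithmetic in fuse_short_read(), assuming 4 KiB pages: if a three-page read request comes back with num_read = 5120, then start_idx = 5120 >> 12 = 1 and off = 5120 & 4095 = 1024, so page 0 is left as read, page 1 is zeroed from byte 1024 onward, and page 2 is zeroed completely. Without writeback_cache a short read still simply means EOF, so only the cached file size is clamped via fuse_read_update_size().
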
@@ -667,10 +740,6 @@ static int fuse_readpage(struct file *file, struct page *page)
667 u64 attr_ver; 740 u64 attr_ver;
668 int err; 741 int err;
669 742
670 err = -EIO;
671 if (is_bad_inode(inode))
672 goto out;
673
674 /* 743 /*
675 * Page writeback can extend beyond the lifetime of the 744 * Page writeback can extend beyond the lifetime of the
676 * page-cache page, so make sure we read a properly synced 745 * page-cache page, so make sure we read a properly synced
@@ -679,9 +748,8 @@ static int fuse_readpage(struct file *file, struct page *page)
679 fuse_wait_on_page_writeback(inode, page->index); 748 fuse_wait_on_page_writeback(inode, page->index);
680 749
681 req = fuse_get_req(fc, 1); 750 req = fuse_get_req(fc, 1);
682 err = PTR_ERR(req);
683 if (IS_ERR(req)) 751 if (IS_ERR(req))
684 goto out; 752 return PTR_ERR(req);
685 753
686 attr_ver = fuse_get_attr_version(fc); 754 attr_ver = fuse_get_attr_version(fc);
687 755
@@ -692,18 +760,32 @@ static int fuse_readpage(struct file *file, struct page *page)
692 req->page_descs[0].length = count; 760 req->page_descs[0].length = count;
693 num_read = fuse_send_read(req, &io, pos, count, NULL); 761 num_read = fuse_send_read(req, &io, pos, count, NULL);
694 err = req->out.h.error; 762 err = req->out.h.error;
695 fuse_put_request(fc, req);
696 763
697 if (!err) { 764 if (!err) {
698 /* 765 /*
699 * Short read means EOF. If file size is larger, truncate it 766 * Short read means EOF. If file size is larger, truncate it
700 */ 767 */
701 if (num_read < count) 768 if (num_read < count)
702 fuse_read_update_size(inode, pos + num_read, attr_ver); 769 fuse_short_read(req, inode, attr_ver);
703 770
704 SetPageUptodate(page); 771 SetPageUptodate(page);
705 } 772 }
706 773
774 fuse_put_request(fc, req);
775
776 return err;
777}
778
779static int fuse_readpage(struct file *file, struct page *page)
780{
781 struct inode *inode = page->mapping->host;
782 int err;
783
784 err = -EIO;
785 if (is_bad_inode(inode))
786 goto out;
787
788 err = fuse_do_readpage(file, page);
707 fuse_invalidate_atime(inode); 789 fuse_invalidate_atime(inode);
708 out: 790 out:
709 unlock_page(page); 791 unlock_page(page);
@@ -726,13 +808,9 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
726 /* 808 /*
727 * Short read means EOF. If file size is larger, truncate it 809 * Short read means EOF. If file size is larger, truncate it
728 */ 810 */
729 if (!req->out.h.error && num_read < count) { 811 if (!req->out.h.error && num_read < count)
730 loff_t pos; 812 fuse_short_read(req, inode, req->misc.read.attr_ver);
731 813
732 pos = page_offset(req->pages[0]) + num_read;
733 fuse_read_update_size(inode, pos,
734 req->misc.read.attr_ver);
735 }
736 fuse_invalidate_atime(inode); 814 fuse_invalidate_atime(inode);
737 } 815 }
738 816
@@ -922,16 +1000,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
922 return req->misc.write.out.size; 1000 return req->misc.write.out.size;
923} 1001}
924 1002
925void fuse_write_update_size(struct inode *inode, loff_t pos) 1003bool fuse_write_update_size(struct inode *inode, loff_t pos)
926{ 1004{
927 struct fuse_conn *fc = get_fuse_conn(inode); 1005 struct fuse_conn *fc = get_fuse_conn(inode);
928 struct fuse_inode *fi = get_fuse_inode(inode); 1006 struct fuse_inode *fi = get_fuse_inode(inode);
1007 bool ret = false;
929 1008
930 spin_lock(&fc->lock); 1009 spin_lock(&fc->lock);
931 fi->attr_version = ++fc->attr_version; 1010 fi->attr_version = ++fc->attr_version;
932 if (pos > inode->i_size) 1011 if (pos > inode->i_size) {
933 i_size_write(inode, pos); 1012 i_size_write(inode, pos);
1013 ret = true;
1014 }
934 spin_unlock(&fc->lock); 1015 spin_unlock(&fc->lock);
1016
1017 return ret;
935} 1018}
936 1019
937static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, 1020static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
@@ -1116,6 +1199,15 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1116 struct iov_iter i; 1199 struct iov_iter i;
1117 loff_t endbyte = 0; 1200 loff_t endbyte = 0;
1118 1201
1202 if (get_fuse_conn(inode)->writeback_cache) {
1203 /* Update size (EOF optimization) and mode (SUID clearing) */
1204 err = fuse_update_attributes(mapping->host, NULL, file, NULL);
1205 if (err)
1206 return err;
1207
1208 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1209 }
1210
1119 WARN_ON(iocb->ki_pos != pos); 1211 WARN_ON(iocb->ki_pos != pos);
1120 1212
1121 ocount = 0; 1213 ocount = 0;
@@ -1289,13 +1381,18 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
1289 1381
1290ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, 1382ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1291 unsigned long nr_segs, size_t count, loff_t *ppos, 1383 unsigned long nr_segs, size_t count, loff_t *ppos,
1292 int write) 1384 int flags)
1293{ 1385{
1386 int write = flags & FUSE_DIO_WRITE;
1387 int cuse = flags & FUSE_DIO_CUSE;
1294 struct file *file = io->file; 1388 struct file *file = io->file;
1389 struct inode *inode = file->f_mapping->host;
1295 struct fuse_file *ff = file->private_data; 1390 struct fuse_file *ff = file->private_data;
1296 struct fuse_conn *fc = ff->fc; 1391 struct fuse_conn *fc = ff->fc;
1297 size_t nmax = write ? fc->max_write : fc->max_read; 1392 size_t nmax = write ? fc->max_write : fc->max_read;
1298 loff_t pos = *ppos; 1393 loff_t pos = *ppos;
1394 pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
1395 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
1299 ssize_t res = 0; 1396 ssize_t res = 0;
1300 struct fuse_req *req; 1397 struct fuse_req *req;
1301 struct iov_iter ii; 1398 struct iov_iter ii;
@@ -1309,6 +1406,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1309 if (IS_ERR(req)) 1406 if (IS_ERR(req))
1310 return PTR_ERR(req); 1407 return PTR_ERR(req);
1311 1408
1409 if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
1410 if (!write)
1411 mutex_lock(&inode->i_mutex);
1412 fuse_sync_writes(inode);
1413 if (!write)
1414 mutex_unlock(&inode->i_mutex);
1415 }
1416
1312 while (count) { 1417 while (count) {
1313 size_t nres; 1418 size_t nres;
1314 fl_owner_t owner = current->files; 1419 fl_owner_t owner = current->files;
@@ -1397,7 +1502,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
1397 1502
1398 res = generic_write_checks(file, ppos, &count, 0); 1503 res = generic_write_checks(file, ppos, &count, 0);
1399 if (!res) 1504 if (!res)
1400 res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1); 1505 res = fuse_direct_io(io, iov, nr_segs, count, ppos,
1506 FUSE_DIO_WRITE);
1401 1507
1402 fuse_invalidate_attr(inode); 1508 fuse_invalidate_attr(inode);
1403 1509
@@ -1885,6 +1991,77 @@ out:
1885 return err; 1991 return err;
1886} 1992}
1887 1993
1994/*
 1995 * It would be worthwhile to make sure that space is reserved on disk for the
 1996 * write, but implementing that without killing performance needs more thought.
1997 */
1998static int fuse_write_begin(struct file *file, struct address_space *mapping,
1999 loff_t pos, unsigned len, unsigned flags,
2000 struct page **pagep, void **fsdata)
2001{
2002 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2003 struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode);
2004 struct page *page;
2005 loff_t fsize;
2006 int err = -ENOMEM;
2007
2008 WARN_ON(!fc->writeback_cache);
2009
2010 page = grab_cache_page_write_begin(mapping, index, flags);
2011 if (!page)
2012 goto error;
2013
2014 fuse_wait_on_page_writeback(mapping->host, page->index);
2015
2016 if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
2017 goto success;
2018 /*
 2019 * Check if the start of this page comes after the end of file, in which
2020 * case the readpage can be optimized away.
2021 */
2022 fsize = i_size_read(mapping->host);
2023 if (fsize <= (pos & PAGE_CACHE_MASK)) {
2024 size_t off = pos & ~PAGE_CACHE_MASK;
2025 if (off)
2026 zero_user_segment(page, 0, off);
2027 goto success;
2028 }
2029 err = fuse_do_readpage(file, page);
2030 if (err)
2031 goto cleanup;
2032success:
2033 *pagep = page;
2034 return 0;
2035
2036cleanup:
2037 unlock_page(page);
2038 page_cache_release(page);
2039error:
2040 return err;
2041}
2042
2043static int fuse_write_end(struct file *file, struct address_space *mapping,
2044 loff_t pos, unsigned len, unsigned copied,
2045 struct page *page, void *fsdata)
2046{
2047 struct inode *inode = page->mapping->host;
2048
2049 if (!PageUptodate(page)) {
2050 /* Zero any unwritten bytes at the end of the page */
2051 size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
2052 if (endoff)
2053 zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
2054 SetPageUptodate(page);
2055 }
2056
2057 fuse_write_update_size(inode, pos + copied);
2058 set_page_dirty(page);
2059 unlock_page(page);
2060 page_cache_release(page);
2061
2062 return copied;
2063}
2064
 1888static int fuse_launder_page(struct page *page) 2065static int fuse_launder_page(struct page *page)
 1889{ 2066{
 1890 int err = 0; 2067 int err = 0;
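
A worked example of the read-skipping test in fuse_write_begin(), again with 4 KiB pages: for a write of a few bytes at pos = 4100 into a file whose size is 1000, the page start pos & PAGE_CACHE_MASK = 4096 is already past EOF, so the read is skipped; off = pos & ~PAGE_CACHE_MASK = 4 means bytes [0, 4) of the new page are zeroed, and fuse_write_end() later zeroes everything from the end of the copied data to the end of the page before marking it uptodate.
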
@@ -1940,26 +2117,16 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1940static const struct vm_operations_struct fuse_file_vm_ops = { 2117static const struct vm_operations_struct fuse_file_vm_ops = {
1941 .close = fuse_vma_close, 2118 .close = fuse_vma_close,
1942 .fault = filemap_fault, 2119 .fault = filemap_fault,
2120 .map_pages = filemap_map_pages,
1943 .page_mkwrite = fuse_page_mkwrite, 2121 .page_mkwrite = fuse_page_mkwrite,
1944 .remap_pages = generic_file_remap_pages, 2122 .remap_pages = generic_file_remap_pages,
1945}; 2123};
1946 2124
1947static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2125static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
1948{ 2126{
1949 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { 2127 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1950 struct inode *inode = file_inode(file); 2128 fuse_link_write_file(file);
1951 struct fuse_conn *fc = get_fuse_conn(inode); 2129
1952 struct fuse_inode *fi = get_fuse_inode(inode);
1953 struct fuse_file *ff = file->private_data;
1954 /*
1955 * file may be written through mmap, so chain it onto the
1956 * inodes's write_file list
1957 */
1958 spin_lock(&fc->lock);
1959 if (list_empty(&ff->write_entry))
1960 list_add(&ff->write_entry, &fi->write_files);
1961 spin_unlock(&fc->lock);
1962 }
1963 file_accessed(file); 2130 file_accessed(file);
1964 vma->vm_ops = &fuse_file_vm_ops; 2131 vma->vm_ops = &fuse_file_vm_ops;
1965 return 0; 2132 return 0;
@@ -2606,7 +2773,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
2606{ 2773{
2607 spin_lock(&fc->lock); 2774 spin_lock(&fc->lock);
2608 if (RB_EMPTY_NODE(&ff->polled_node)) { 2775 if (RB_EMPTY_NODE(&ff->polled_node)) {
2609 struct rb_node **link, *parent; 2776 struct rb_node **link, *uninitialized_var(parent);
2610 2777
2611 link = fuse_find_polled_node(fc, ff->kh, &parent); 2778 link = fuse_find_polled_node(fc, ff->kh, &parent);
2612 BUG_ON(*link); 2779 BUG_ON(*link);
@@ -2850,8 +3017,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2850 goto out; 3017 goto out;
2851 3018
2852 /* we could have extended the file */ 3019 /* we could have extended the file */
2853 if (!(mode & FALLOC_FL_KEEP_SIZE)) 3020 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2854 fuse_write_update_size(inode, offset + length); 3021 bool changed = fuse_write_update_size(inode, offset + length);
3022
3023 if (changed && fc->writeback_cache) {
3024 struct fuse_inode *fi = get_fuse_inode(inode);
3025
3026 inode->i_mtime = current_fs_time(inode->i_sb);
3027 set_bit(FUSE_I_MTIME_DIRTY, &fi->state);
3028 }
3029 }
2855 3030
2856 if (mode & FALLOC_FL_PUNCH_HOLE) 3031 if (mode & FALLOC_FL_PUNCH_HOLE)
2857 truncate_pagecache_range(inode, offset, offset + length - 1); 3032 truncate_pagecache_range(inode, offset, offset + length - 1);
@@ -2915,6 +3090,8 @@ static const struct address_space_operations fuse_file_aops = {
2915 .set_page_dirty = __set_page_dirty_nobuffers, 3090 .set_page_dirty = __set_page_dirty_nobuffers,
2916 .bmap = fuse_bmap, 3091 .bmap = fuse_bmap,
2917 .direct_IO = fuse_direct_IO, 3092 .direct_IO = fuse_direct_IO,
3093 .write_begin = fuse_write_begin,
3094 .write_end = fuse_write_end,
2918}; 3095};
2919 3096
2920void fuse_init_file_inode(struct inode *inode) 3097void fuse_init_file_inode(struct inode *inode)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2da5db2c8bdb..a257ed8ebee6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -119,6 +119,8 @@ enum {
119 FUSE_I_INIT_RDPLUS, 119 FUSE_I_INIT_RDPLUS,
120 /** An operation changing file size is in progress */ 120 /** An operation changing file size is in progress */
121 FUSE_I_SIZE_UNSTABLE, 121 FUSE_I_SIZE_UNSTABLE,
 122 /** i_mtime has been updated locally; a flush to userspace is needed */
123 FUSE_I_MTIME_DIRTY,
122}; 124};
123 125
124struct fuse_conn; 126struct fuse_conn;
@@ -480,6 +482,9 @@ struct fuse_conn {
480 /** Set if bdi is valid */ 482 /** Set if bdi is valid */
481 unsigned bdi_initialized:1; 483 unsigned bdi_initialized:1;
482 484
485 /** write-back cache policy (default is write-through) */
486 unsigned writeback_cache:1;
487
483 /* 488 /*
484 * The following bitfields are only for optimization purposes 489 * The following bitfields are only for optimization purposes
485 * and hence races in setting them will not cause malfunction 490 * and hence races in setting them will not cause malfunction
@@ -863,9 +868,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
863 868
864int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 869int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
865 bool isdir); 870 bool isdir);
871
872/**
873 * fuse_direct_io() flags
874 */
875
 876/** If set, it is WRITE; otherwise it is READ */
877#define FUSE_DIO_WRITE (1 << 0)
878
 879/** CUSE passes fuse_direct_io() a file whose f_mapping->host is not from FUSE */
880#define FUSE_DIO_CUSE (1 << 1)
881
866ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, 882ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
867 unsigned long nr_segs, size_t count, loff_t *ppos, 883 unsigned long nr_segs, size_t count, loff_t *ppos,
868 int write); 884 int flags);
869long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, 885long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
870 unsigned int flags); 886 unsigned int flags);
871long fuse_ioctl_common(struct file *file, unsigned int cmd, 887long fuse_ioctl_common(struct file *file, unsigned int cmd,
@@ -873,7 +889,9 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
873unsigned fuse_file_poll(struct file *file, poll_table *wait); 889unsigned fuse_file_poll(struct file *file, poll_table *wait);
874int fuse_dev_release(struct inode *inode, struct file *file); 890int fuse_dev_release(struct inode *inode, struct file *file);
875 891
876void fuse_write_update_size(struct inode *inode, loff_t pos); 892bool fuse_write_update_size(struct inode *inode, loff_t pos);
893
894int fuse_flush_mtime(struct file *file, bool nofail);
877 895
878int fuse_do_setattr(struct inode *inode, struct iattr *attr, 896int fuse_do_setattr(struct inode *inode, struct iattr *attr,
879 struct file *file); 897 struct file *file);
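
With the old int write parameter replaced by a flag word, the call sites shown earlier take exactly three shapes (lines quoted from the hunks above):

	fuse_direct_io(&io, &iov, 1, count, &pos, 0);			/* read */
	fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_WRITE);	/* write */
	fuse_direct_io(&io, &iov, 1, count, &pos,
		       FUSE_DIO_WRITE | FUSE_DIO_CUSE);			/* CUSE */

FUSE_DIO_CUSE tells fuse_direct_io() that f_mapping->host is a character-device inode rather than a fuse inode, so the new writeback-synchronization step must be skipped.
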
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d468643a68b2..8d611696fcad 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -123,7 +123,7 @@ static void fuse_destroy_inode(struct inode *inode)
123 123
124static void fuse_evict_inode(struct inode *inode) 124static void fuse_evict_inode(struct inode *inode)
125{ 125{
126 truncate_inode_pages(&inode->i_data, 0); 126 truncate_inode_pages_final(&inode->i_data);
127 clear_inode(inode); 127 clear_inode(inode);
128 if (inode->i_sb->s_flags & MS_ACTIVE) { 128 if (inode->i_sb->s_flags & MS_ACTIVE) {
129 struct fuse_conn *fc = get_fuse_conn(inode); 129 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode)
135 135
136static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) 136static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
137{ 137{
138 sync_filesystem(sb);
138 if (*flags & MS_MANDLOCK) 139 if (*flags & MS_MANDLOCK)
139 return -EINVAL; 140 return -EINVAL;
140 141
@@ -170,8 +171,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
170 inode->i_blocks = attr->blocks; 171 inode->i_blocks = attr->blocks;
171 inode->i_atime.tv_sec = attr->atime; 172 inode->i_atime.tv_sec = attr->atime;
172 inode->i_atime.tv_nsec = attr->atimensec; 173 inode->i_atime.tv_nsec = attr->atimensec;
173 inode->i_mtime.tv_sec = attr->mtime; 174 /* mtime from server may be stale due to local buffered write */
174 inode->i_mtime.tv_nsec = attr->mtimensec; 175 if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
176 inode->i_mtime.tv_sec = attr->mtime;
177 inode->i_mtime.tv_nsec = attr->mtimensec;
178 }
175 inode->i_ctime.tv_sec = attr->ctime; 179 inode->i_ctime.tv_sec = attr->ctime;
176 inode->i_ctime.tv_nsec = attr->ctimensec; 180 inode->i_ctime.tv_nsec = attr->ctimensec;
177 181
@@ -197,6 +201,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
197{ 201{
198 struct fuse_conn *fc = get_fuse_conn(inode); 202 struct fuse_conn *fc = get_fuse_conn(inode);
199 struct fuse_inode *fi = get_fuse_inode(inode); 203 struct fuse_inode *fi = get_fuse_inode(inode);
204 bool is_wb = fc->writeback_cache;
200 loff_t oldsize; 205 loff_t oldsize;
201 struct timespec old_mtime; 206 struct timespec old_mtime;
202 207
@@ -211,10 +216,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
211 fuse_change_attributes_common(inode, attr, attr_valid); 216 fuse_change_attributes_common(inode, attr, attr_valid);
212 217
213 oldsize = inode->i_size; 218 oldsize = inode->i_size;
214 i_size_write(inode, attr->size); 219 /*
 220 * When writeback_cache is enabled, cached writes beyond EOF extend
 221 * the local i_size without keeping the userspace server in sync, so
 222 * attr->size coming from the server can be stale. We cannot trust it.
223 */
224 if (!is_wb || !S_ISREG(inode->i_mode))
225 i_size_write(inode, attr->size);
215 spin_unlock(&fc->lock); 226 spin_unlock(&fc->lock);
216 227
217 if (S_ISREG(inode->i_mode)) { 228 if (!is_wb && S_ISREG(inode->i_mode)) {
218 bool inval = false; 229 bool inval = false;
219 230
220 if (oldsize != attr->size) { 231 if (oldsize != attr->size) {
@@ -243,6 +254,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
243{ 254{
244 inode->i_mode = attr->mode & S_IFMT; 255 inode->i_mode = attr->mode & S_IFMT;
245 inode->i_size = attr->size; 256 inode->i_size = attr->size;
257 inode->i_mtime.tv_sec = attr->mtime;
258 inode->i_mtime.tv_nsec = attr->mtimensec;
246 if (S_ISREG(inode->i_mode)) { 259 if (S_ISREG(inode->i_mode)) {
247 fuse_init_common(inode); 260 fuse_init_common(inode);
248 fuse_init_file_inode(inode); 261 fuse_init_file_inode(inode);
@@ -289,7 +302,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
289 return NULL; 302 return NULL;
290 303
291 if ((inode->i_state & I_NEW)) { 304 if ((inode->i_state & I_NEW)) {
292 inode->i_flags |= S_NOATIME|S_NOCMTIME; 305 inode->i_flags |= S_NOATIME;
306 if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
307 inode->i_flags |= S_NOCMTIME;
293 inode->i_generation = generation; 308 inode->i_generation = generation;
294 inode->i_data.backing_dev_info = &fc->bdi; 309 inode->i_data.backing_dev_info = &fc->bdi;
295 fuse_init_inode(inode, attr); 310 fuse_init_inode(inode, attr);
@@ -873,6 +888,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
873 } 888 }
874 if (arg->flags & FUSE_ASYNC_DIO) 889 if (arg->flags & FUSE_ASYNC_DIO)
875 fc->async_dio = 1; 890 fc->async_dio = 1;
891 if (arg->flags & FUSE_WRITEBACK_CACHE)
892 fc->writeback_cache = 1;
876 } else { 893 } else {
877 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 894 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
878 fc->no_lock = 1; 895 fc->no_lock = 1;
@@ -900,7 +917,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
900 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | 917 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
901 FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | 918 FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
902 FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | 919 FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
903 FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO; 920 FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
921 FUSE_WRITEBACK_CACHE;
904 req->in.h.opcode = FUSE_INIT; 922 req->in.h.opcode = FUSE_INIT;
905 req->in.numargs = 1; 923 req->in.numargs = 1;
906 req->in.args[0].size = sizeof(*arg); 924 req->in.args[0].size = sizeof(*arg);
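
Since FUSE_WRITEBACK_CACHE is negotiated during INIT, userspace has to opt in explicitly. A minimal sketch of a daemon doing so, assuming a libfuse version that exposes the capability as FUSE_CAP_WRITEBACK_CACHE (that spelling is a libfuse assumption, not part of this patch):

#include <fuse_lowlevel.h>

/* Hypothetical lowlevel init callback: request writeback caching
 * if the kernel advertised it in its INIT reply. */
static void my_init(void *userdata, struct fuse_conn_info *conn)
{
	if (conn->capable & FUSE_CAP_WRITEBACK_CACHE)
		conn->want |= FUSE_CAP_WRITEBACK_CACHE;
}
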
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index ba9456685f47..3088e2a38e30 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -64,18 +64,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
64 return acl; 64 return acl;
65} 65}
66 66
67static int gfs2_set_mode(struct inode *inode, umode_t mode)
68{
69 int error = 0;
70
71 if (mode != inode->i_mode) {
72 inode->i_mode = mode;
73 mark_inode_dirty(inode);
74 }
75
76 return error;
77}
78
79int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) 67int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
80{ 68{
81 int error; 69 int error;
@@ -85,8 +73,8 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
85 73
86 BUG_ON(name == NULL); 74 BUG_ON(name == NULL);
87 75
88 if (acl->a_count > GFS2_ACL_MAX_ENTRIES) 76 if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
89 return -EINVAL; 77 return -E2BIG;
90 78
91 if (type == ACL_TYPE_ACCESS) { 79 if (type == ACL_TYPE_ACCESS) {
92 umode_t mode = inode->i_mode; 80 umode_t mode = inode->i_mode;
@@ -98,9 +86,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
98 if (error == 0) 86 if (error == 0)
99 acl = NULL; 87 acl = NULL;
100 88
101 error = gfs2_set_mode(inode, mode); 89 if (mode != inode->i_mode) {
102 if (error) 90 inode->i_mode = mode;
103 return error; 91 mark_inode_dirty(inode);
92 }
104 } 93 }
105 94
106 if (acl) { 95 if (acl) {
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 301260c999ba..2d65ec4cd4be 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -14,7 +14,7 @@
14 14
15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" 15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" 16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
17#define GFS2_ACL_MAX_ENTRIES 25 17#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
18 18
19extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); 19extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
20extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); 20extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
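
The new macro scales the ACL limit with the filesystem block size instead of hard-coding 25 entries: (300 << sb_bsize_shift) >> 12 evaluates to 300 entries on 4 KiB blocks (shift 12), 75 on 1 KiB blocks (shift 10), and 37 on 512-byte blocks (shift 9), roughly tracking how many entries a block's worth of xattr space can hold. An over-limit ACL now also fails with -E2BIG rather than -EINVAL, which is the errno setxattr() callers expect for "too big to store".
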
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 49436fa7cd4f..ce62dcac90b6 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -21,6 +21,7 @@
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/aio.h> 23#include <linux/aio.h>
24#include <trace/events/writeback.h>
24 25
25#include "gfs2.h" 26#include "gfs2.h"
26#include "incore.h" 27#include "incore.h"
@@ -230,13 +231,11 @@ static int gfs2_writepages(struct address_space *mapping,
230static int gfs2_write_jdata_pagevec(struct address_space *mapping, 231static int gfs2_write_jdata_pagevec(struct address_space *mapping,
231 struct writeback_control *wbc, 232 struct writeback_control *wbc,
232 struct pagevec *pvec, 233 struct pagevec *pvec,
233 int nr_pages, pgoff_t end) 234 int nr_pages, pgoff_t end,
235 pgoff_t *done_index)
234{ 236{
235 struct inode *inode = mapping->host; 237 struct inode *inode = mapping->host;
236 struct gfs2_sbd *sdp = GFS2_SB(inode); 238 struct gfs2_sbd *sdp = GFS2_SB(inode);
237 loff_t i_size = i_size_read(inode);
238 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
239 unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
240 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); 239 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
241 int i; 240 int i;
242 int ret; 241 int ret;
@@ -248,40 +247,83 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
248 for(i = 0; i < nr_pages; i++) { 247 for(i = 0; i < nr_pages; i++) {
249 struct page *page = pvec->pages[i]; 248 struct page *page = pvec->pages[i];
250 249
250 /*
251 * At this point, the page may be truncated or
252 * invalidated (changing page->mapping to NULL), or
253 * even swizzled back from swapper_space to tmpfs file
254 * mapping. However, page->index will not change
255 * because we have a reference on the page.
256 */
257 if (page->index > end) {
258 /*
259 * can't be range_cyclic (1st pass) because
260 * end == -1 in that case.
261 */
262 ret = 1;
263 break;
264 }
265
266 *done_index = page->index;
267
251 lock_page(page); 268 lock_page(page);
252 269
253 if (unlikely(page->mapping != mapping)) { 270 if (unlikely(page->mapping != mapping)) {
271continue_unlock:
254 unlock_page(page); 272 unlock_page(page);
255 continue; 273 continue;
256 } 274 }
257 275
258 if (!wbc->range_cyclic && page->index > end) { 276 if (!PageDirty(page)) {
259 ret = 1; 277 /* someone wrote it for us */
260 unlock_page(page); 278 goto continue_unlock;
261 continue;
262 } 279 }
263 280
264 if (wbc->sync_mode != WB_SYNC_NONE) 281 if (PageWriteback(page)) {
265 wait_on_page_writeback(page); 282 if (wbc->sync_mode != WB_SYNC_NONE)
266 283 wait_on_page_writeback(page);
267 if (PageWriteback(page) || 284 else
268 !clear_page_dirty_for_io(page)) { 285 goto continue_unlock;
269 unlock_page(page);
270 continue;
271 } 286 }
272 287
273 /* Is the page fully outside i_size? (truncate in progress) */ 288 BUG_ON(PageWriteback(page));
274 if (page->index > end_index || (page->index == end_index && !offset)) { 289 if (!clear_page_dirty_for_io(page))
275 page->mapping->a_ops->invalidatepage(page, 0, 290 goto continue_unlock;
276 PAGE_CACHE_SIZE); 291
277 unlock_page(page); 292 trace_wbc_writepage(wbc, mapping->backing_dev_info);
278 continue;
279 }
280 293
281 ret = __gfs2_jdata_writepage(page, wbc); 294 ret = __gfs2_jdata_writepage(page, wbc);
295 if (unlikely(ret)) {
296 if (ret == AOP_WRITEPAGE_ACTIVATE) {
297 unlock_page(page);
298 ret = 0;
299 } else {
300
301 /*
302 * done_index is set past this page,
303 * so media errors will not choke
304 * background writeout for the entire
305 * file. This has consequences for
306 * range_cyclic semantics (ie. it may
307 * not be suitable for data integrity
308 * writeout).
309 */
310 *done_index = page->index + 1;
311 ret = 1;
312 break;
313 }
314 }
282 315
283 if (ret || (--(wbc->nr_to_write) <= 0)) 316 /*
317 * We stop writing back only if we are not doing
318 * integrity sync. In case of integrity sync we have to
319 * keep going until we have written all the pages
320 * we tagged for writeback prior to entering this loop.
321 */
322 if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) {
284 ret = 1; 323 ret = 1;
324 break;
325 }
326
285 } 327 }
286 gfs2_trans_end(sdp); 328 gfs2_trans_end(sdp);
287 return ret; 329 return ret;
@@ -306,51 +348,69 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
306 int done = 0; 348 int done = 0;
307 struct pagevec pvec; 349 struct pagevec pvec;
308 int nr_pages; 350 int nr_pages;
351 pgoff_t uninitialized_var(writeback_index);
309 pgoff_t index; 352 pgoff_t index;
310 pgoff_t end; 353 pgoff_t end;
311 int scanned = 0; 354 pgoff_t done_index;
355 int cycled;
312 int range_whole = 0; 356 int range_whole = 0;
357 int tag;
313 358
314 pagevec_init(&pvec, 0); 359 pagevec_init(&pvec, 0);
315 if (wbc->range_cyclic) { 360 if (wbc->range_cyclic) {
316 index = mapping->writeback_index; /* Start from prev offset */ 361 writeback_index = mapping->writeback_index; /* prev offset */
362 index = writeback_index;
363 if (index == 0)
364 cycled = 1;
365 else
366 cycled = 0;
317 end = -1; 367 end = -1;
318 } else { 368 } else {
319 index = wbc->range_start >> PAGE_CACHE_SHIFT; 369 index = wbc->range_start >> PAGE_CACHE_SHIFT;
320 end = wbc->range_end >> PAGE_CACHE_SHIFT; 370 end = wbc->range_end >> PAGE_CACHE_SHIFT;
321 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 371 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
322 range_whole = 1; 372 range_whole = 1;
323 scanned = 1; 373 cycled = 1; /* ignore range_cyclic tests */
324 } 374 }
375 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
376 tag = PAGECACHE_TAG_TOWRITE;
377 else
378 tag = PAGECACHE_TAG_DIRTY;
325 379
326retry: 380retry:
327 while (!done && (index <= end) && 381 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
328 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 382 tag_pages_for_writeback(mapping, index, end);
329 PAGECACHE_TAG_DIRTY, 383 done_index = index;
330 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 384 while (!done && (index <= end)) {
331 scanned = 1; 385 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
332 ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); 386 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
387 if (nr_pages == 0)
388 break;
389
390 ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index);
333 if (ret) 391 if (ret)
334 done = 1; 392 done = 1;
335 if (ret > 0) 393 if (ret > 0)
336 ret = 0; 394 ret = 0;
337
338 pagevec_release(&pvec); 395 pagevec_release(&pvec);
339 cond_resched(); 396 cond_resched();
340 } 397 }
341 398
342 if (!scanned && !done) { 399 if (!cycled && !done) {
343 /* 400 /*
401 * range_cyclic:
344 * We hit the last page and there is more work to be done: wrap 402 * We hit the last page and there is more work to be done: wrap
345 * back to the start of the file 403 * back to the start of the file
346 */ 404 */
347 scanned = 1; 405 cycled = 1;
348 index = 0; 406 index = 0;
407 end = writeback_index - 1;
349 goto retry; 408 goto retry;
350 } 409 }
351 410
352 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 411 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
353 mapping->writeback_index = index; 412 mapping->writeback_index = done_index;
413
354 return ret; 414 return ret;
355} 415}
356 416
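
The retry logic now mirrors write_cache_pages(): a range_cyclic call first sweeps [writeback_index, EOF]; if that pass did not start at zero (cycled == 0) and was not aborted, a second pass covers [0, writeback_index - 1]; and mapping->writeback_index is left at done_index so the next writeback resumes where this one stopped. Tagging pages as PAGECACHE_TAG_TOWRITE before an integrity sweep keeps WB_SYNC_ALL from livelocking against pages that are re-dirtied while the sweep runs.
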
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index fe0500c0af7a..c62d4b9f51dc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1328,6 +1328,121 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
1328} 1328}
1329 1329
1330/** 1330/**
1331 * gfs2_free_journal_extents - Free cached journal bmap info
1332 * @jd: The journal
1333 *
1334 */
1335
1336void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1337{
1338 struct gfs2_journal_extent *jext;
1339
1340 while(!list_empty(&jd->extent_list)) {
1341 jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1342 list_del(&jext->list);
1343 kfree(jext);
1344 }
1345}
1346
1347/**
1348 * gfs2_add_jextent - Add or merge a new extent to extent cache
1349 * @jd: The journal descriptor
1350 * @lblock: The logical block at start of new extent
 1351 * @dblock: The physical block at start of new extent
1352 * @blocks: Size of extent in fs blocks
1353 *
1354 * Returns: 0 on success or -ENOMEM
1355 */
1356
1357static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1358{
1359 struct gfs2_journal_extent *jext;
1360
1361 if (!list_empty(&jd->extent_list)) {
1362 jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1363 if ((jext->dblock + jext->blocks) == dblock) {
1364 jext->blocks += blocks;
1365 return 0;
1366 }
1367 }
1368
1369 jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1370 if (jext == NULL)
1371 return -ENOMEM;
1372 jext->dblock = dblock;
1373 jext->lblock = lblock;
1374 jext->blocks = blocks;
1375 list_add_tail(&jext->list, &jd->extent_list);
1376 jd->nr_extents++;
1377 return 0;
1378}
1379
1380/**
1381 * gfs2_map_journal_extents - Cache journal bmap info
1382 * @sdp: The super block
1383 * @jd: The journal to map
1384 *
1385 * Create a reusable "extent" mapping from all logical
1386 * blocks to all physical blocks for the given journal. This will save
1387 * us time when writing journal blocks. Most journals will have only one
 1388 * extent that maps all their logical blocks. That's because mkfs.gfs2
1389 * arranges the journal blocks sequentially to maximize performance.
1390 * So the extent would map the first block for the entire file length.
1391 * However, gfs2_jadd can happen while file activity is happening, so
1392 * those journals may not be sequential. Less likely is the case where
1393 * the users created their own journals by mounting the metafs and
1394 * laying it out. But it's still possible. These journals might have
1395 * several extents.
1396 *
1397 * Returns: 0 on success, or error on failure
1398 */
1399
1400int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1401{
1402 u64 lblock = 0;
1403 u64 lblock_stop;
1404 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1405 struct buffer_head bh;
1406 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1407 u64 size;
1408 int rc;
1409
1410 lblock_stop = i_size_read(jd->jd_inode) >> shift;
1411 size = (lblock_stop - lblock) << shift;
1412 jd->nr_extents = 0;
1413 WARN_ON(!list_empty(&jd->extent_list));
1414
1415 do {
1416 bh.b_state = 0;
1417 bh.b_blocknr = 0;
1418 bh.b_size = size;
1419 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
1420 if (rc || !buffer_mapped(&bh))
1421 goto fail;
1422 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1423 if (rc)
1424 goto fail;
1425 size -= bh.b_size;
1426 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1427 } while(size > 0);
1428
1429 fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
1430 jd->nr_extents);
1431 return 0;
1432
1433fail:
1434 fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1435 rc, jd->jd_jid,
1436 (unsigned long long)(i_size_read(jd->jd_inode) - size),
1437 jd->nr_extents);
1438 fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1439 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1440 bh.b_state, (unsigned long long)bh.b_size);
1441 gfs2_free_journal_extents(jd);
1442 return rc;
1443}
1444
1445/**
1331 * gfs2_write_alloc_required - figure out if a write will require an allocation 1446 * gfs2_write_alloc_required - figure out if a write will require an allocation
1332 * @ip: the file being written to 1447 * @ip: the file being written to
1333 * @offset: the offset to write to 1448 * @offset: the offset to write to
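
The extent cache built here is an ordered list of (lblock, dblock, blocks) runs: gfs2_add_jextent() grows the tail extent whenever the new run is physically contiguous with it, and gfs2_log_bmap() in fs/gfs2/lops.c (updated below) resolves a logical journal block by scanning those runs. A self-contained sketch of both operations, using a flat array in place of the kernel's list_head:

#include <stdio.h>

#define MAX_EXTENTS 8

struct extent {
        unsigned long long lblock;      /* first logical block */
        unsigned long long dblock;      /* first disk block */
        unsigned long long blocks;      /* run length in fs blocks */
};

static struct extent map[MAX_EXTENTS];
static int nr_extents;

/* Same merge rule as gfs2_add_jextent(). */
static int add_extent(unsigned long long lblock, unsigned long long dblock,
                      unsigned long long blocks)
{
        if (nr_extents) {
                struct extent *last = &map[nr_extents - 1];

                if (last->dblock + last->blocks == dblock) {
                        last->blocks += blocks;         /* contiguous: merge */
                        return 0;
                }
        }
        if (nr_extents == MAX_EXTENTS)
                return -1;                              /* toy limit, not -ENOMEM */
        map[nr_extents++] = (struct extent){ lblock, dblock, blocks };
        return 0;
}

/* Same lookup as gfs2_log_bmap(): lbn in [lblock, lblock + blocks)
 * maps to dblock + (lbn - lblock). */
static long long lookup(unsigned long long lbn)
{
        for (int i = 0; i < nr_extents; i++)
                if (lbn >= map[i].lblock &&
                    lbn < map[i].lblock + map[i].blocks)
                        return map[i].dblock + (lbn - map[i].lblock);
        return -1;
}

int main(void)
{
        add_extent(0, 1000, 32);        /* journal laid out by mkfs */
        add_extent(32, 1032, 32);       /* contiguous: merges into one extent */
        add_extent(64, 5000, 16);       /* later gfs2_jadd fragment */

        printf("%d extents; lbn 40 -> dbn %lld\n", nr_extents, lookup(40));
        return 0;
}

With the first two runs merged, most filesystems end up with the single extent the comment above describes, and each journal write becomes one short list walk instead of a bmap call.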
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 42fea03e2bd9..81ded5e2aaa2 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -55,5 +55,7 @@ extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
55extern int gfs2_file_dealloc(struct gfs2_inode *ip); 55extern int gfs2_file_dealloc(struct gfs2_inode *ip);
56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
57 unsigned int len); 57 unsigned int len);
58extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
59extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
58 60
59#endif /* __BMAP_DOT_H__ */ 61#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index fa32655449c8..1a349f9a9685 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -53,6 +53,8 @@
53 * but never before the maximum hash table size has been reached. 53 * but never before the maximum hash table size has been reached.
54 */ 54 */
55 55
56#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
57
56#include <linux/slab.h> 58#include <linux/slab.h>
57#include <linux/spinlock.h> 59#include <linux/spinlock.h>
58#include <linux/buffer_head.h> 60#include <linux/buffer_head.h>
@@ -507,8 +509,8 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
507 goto error; 509 goto error;
508 return 0; 510 return 0;
509error: 511error:
510 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg, 512 pr_warn("%s: %s (%s)\n",
511 first ? "first in block" : "not first in block"); 513 __func__, msg, first ? "first in block" : "not first in block");
512 return -EIO; 514 return -EIO;
513} 515}
514 516
@@ -531,8 +533,7 @@ static int gfs2_dirent_offset(const void *buf)
531 } 533 }
532 return offset; 534 return offset;
533wrong_type: 535wrong_type:
534 printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n", 536 pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type));
535 be32_to_cpu(h->mh_type));
536 return -1; 537 return -1;
537} 538}
538 539
@@ -728,7 +729,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
728 729
729 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); 730 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
730 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { 731 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
731 /* printk(KERN_INFO "block num=%llu\n", leaf_no); */ 732 /* pr_info("block num=%llu\n", leaf_no); */
732 error = -EIO; 733 error = -EIO;
733 } 734 }
734 735
@@ -1006,7 +1007,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1006 len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); 1007 len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
1007 half_len = len >> 1; 1008 half_len = len >> 1;
1008 if (!half_len) { 1009 if (!half_len) {
1009 printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); 1010 pr_warn("i_depth %u lf_depth %u index %u\n",
1011 dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
1010 gfs2_consist_inode(dip); 1012 gfs2_consist_inode(dip);
1011 error = -EIO; 1013 error = -EIO;
1012 goto fail_brelse; 1014 goto fail_brelse;
@@ -1684,6 +1686,14 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1684 return 0; 1686 return 0;
1685} 1687}
1686 1688
1689static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip)
1690{
1691 u64 where = ip->i_no_addr + 1;
1692 if (ip->i_eattr == where)
1693 return 1;
1694 return 0;
1695}
1696
1687/** 1697/**
1688 * gfs2_dir_add - Add new filename into directory 1698 * gfs2_dir_add - Add new filename into directory
1689 * @inode: The directory inode 1699 * @inode: The directory inode
@@ -1721,6 +1731,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1721 dent = gfs2_init_dirent(inode, dent, name, bh); 1731 dent = gfs2_init_dirent(inode, dent, name, bh);
1722 gfs2_inum_out(nip, dent); 1732 gfs2_inum_out(nip, dent);
1723 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); 1733 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
1734 dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip));
1724 tv = CURRENT_TIME; 1735 tv = CURRENT_TIME;
1725 if (ip->i_diskflags & GFS2_DIF_EXHASH) { 1736 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1726 leaf = (struct gfs2_leaf *)bh->b_data; 1737 leaf = (struct gfs2_leaf *)bh->b_data;
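
The new de_rahead field records, per directory entry, how many extra blocks are worth reading ahead along with the dinode. With the single-block xattr layout created at inode creation time (see the fs/gfs2/inode.c changes below), that is 1 exactly when the xattr block sits at no_addr + 1. A standalone restatement of the hint:

#include <stdint.h>
#include <stdio.h>

/* Mirrors gfs2_inode_ra_len(): advertise one block of readahead only
 * when the inode's xattr block is physically adjacent to the dinode. */
static uint16_t ra_len(uint64_t no_addr, uint64_t eattr)
{
        return (eattr == no_addr + 1) ? 1 : 0;
}

int main(void)
{
        printf("%u\n", (unsigned)ra_len(1000, 1001)); /* 1: adjacent xattr */
        printf("%u\n", (unsigned)ra_len(1000, 0));    /* 0: no xattr block */
        return 0;
}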
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index efc078f0ee4e..80d67253623c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -494,6 +494,7 @@ out:
494 494
495static const struct vm_operations_struct gfs2_vm_ops = { 495static const struct vm_operations_struct gfs2_vm_ops = {
496 .fault = filemap_fault, 496 .fault = filemap_fault,
497 .map_pages = filemap_map_pages,
497 .page_mkwrite = gfs2_page_mkwrite, 498 .page_mkwrite = gfs2_page_mkwrite,
498 .remap_pages = generic_file_remap_pages, 499 .remap_pages = generic_file_remap_pages,
499}; 500};
@@ -811,6 +812,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
811 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); 812 loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
812 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; 813 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
813 loff_t max_chunk_size = UINT_MAX & bsize_mask; 814 loff_t max_chunk_size = UINT_MAX & bsize_mask;
815 struct gfs2_holder gh;
816
814 next = (next + 1) << sdp->sd_sb.sb_bsize_shift; 817 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
815 818
816 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 819 /* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -831,8 +834,10 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
831 if (error) 834 if (error)
832 return error; 835 return error;
833 836
834 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); 837 mutex_lock(&inode->i_mutex);
835 error = gfs2_glock_nq(&ip->i_gh); 838
839 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
840 error = gfs2_glock_nq(&gh);
836 if (unlikely(error)) 841 if (unlikely(error))
837 goto out_uninit; 842 goto out_uninit;
838 843
@@ -900,9 +905,10 @@ out_trans_fail:
900out_qunlock: 905out_qunlock:
901 gfs2_quota_unlock(ip); 906 gfs2_quota_unlock(ip);
902out_unlock: 907out_unlock:
903 gfs2_glock_dq(&ip->i_gh); 908 gfs2_glock_dq(&gh);
904out_uninit: 909out_uninit:
905 gfs2_holder_uninit(&ip->i_gh); 910 gfs2_holder_uninit(&gh);
911 mutex_unlock(&inode->i_mutex);
906 return error; 912 return error;
907} 913}
908 914
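
Two things change in the fallocate path: it now serialises against other inode operations by taking i_mutex, and the glock holder moves from the shared ip->i_gh field to the stack, presumably so concurrent users of that shared holder (the read/write paths, for instance) cannot collide with a long-running fallocate. The error handling keeps the usual goto ladder, releasing in reverse order of acquisition; a standalone skeleton of that idiom (stand-in lock functions, not the gfs2 API):

#include <stdio.h>

static int  take_cluster_lock(void)  { puts("glock_nq");       return 0; }
static void drop_cluster_lock(void)  { puts("glock_dq");       }
static void holder_uninit(void)      { puts("holder_uninit");  }
static void inode_lock(void)         { puts("i_mutex lock");   }
static void inode_unlock(void)       { puts("i_mutex unlock"); }

static int fallocate_like(int fail_body)
{
        int error;

        inode_lock();                   /* VFS-level lock first... */
        error = take_cluster_lock();    /* ...then the cluster lock */
        if (error)
                goto out_uninit;

        if (fail_body) {
                error = -1;             /* body failed: unwind in reverse */
                goto out_unlock;
        }
        puts("allocate blocks");
        error = 0;

out_unlock:
        drop_cluster_lock();
out_uninit:
        holder_uninit();
        inode_unlock();
        return error;
}

int main(void)
{
        return fallocate_like(0);
}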
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ca0be6c69a26..aec7f73832f0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -468,7 +470,7 @@ retry:
468 do_xmote(gl, gh, LM_ST_UNLOCKED); 470 do_xmote(gl, gh, LM_ST_UNLOCKED);
469 break; 471 break;
470 default: /* Everything else */ 472 default: /* Everything else */
471 printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state); 473 pr_err("wanted %u got %u\n", gl->gl_target, state);
472 GLOCK_BUG_ON(gl, 1); 474 GLOCK_BUG_ON(gl, 1);
473 } 475 }
474 spin_unlock(&gl->gl_spin); 476 spin_unlock(&gl->gl_spin);
@@ -542,7 +544,7 @@ __acquires(&gl->gl_spin)
542 /* lock_dlm */ 544 /* lock_dlm */
543 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); 545 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
544 if (ret) { 546 if (ret) {
545 printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret); 547 pr_err("lm_lock ret %d\n", ret);
546 GLOCK_BUG_ON(gl, 1); 548 GLOCK_BUG_ON(gl, 1);
547 } 549 }
548 } else { /* lock_nolock */ 550 } else { /* lock_nolock */
@@ -935,7 +937,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
935 vaf.fmt = fmt; 937 vaf.fmt = fmt;
936 vaf.va = &args; 938 vaf.va = &args;
937 939
938 printk(KERN_ERR " %pV", &vaf); 940 pr_err("%pV", &vaf);
939 } 941 }
940 942
941 va_end(args); 943 va_end(args);
@@ -1010,13 +1012,13 @@ do_cancel:
1010 return; 1012 return;
1011 1013
1012trap_recursive: 1014trap_recursive:
1013 printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip); 1015 pr_err("original: %pSR\n", (void *)gh2->gh_ip);
1014 printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); 1016 pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid));
1015 printk(KERN_ERR "lock type: %d req lock state : %d\n", 1017 pr_err("lock type: %d req lock state : %d\n",
1016 gh2->gh_gl->gl_name.ln_type, gh2->gh_state); 1018 gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
1017 printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip); 1019 pr_err("new: %pSR\n", (void *)gh->gh_ip);
1018 printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); 1020 pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid));
1019 printk(KERN_ERR "lock type: %d req lock state : %d\n", 1021 pr_err("lock type: %d req lock state : %d\n",
1020 gh->gh_gl->gl_name.ln_type, gh->gh_state); 1022 gh->gh_gl->gl_name.ln_type, gh->gh_state);
1021 gfs2_dump_glock(NULL, gl); 1023 gfs2_dump_glock(NULL, gl);
1022 BUG(); 1024 BUG();
@@ -1045,9 +1047,13 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
1045 1047
1046 spin_lock(&gl->gl_spin); 1048 spin_lock(&gl->gl_spin);
1047 add_to_queue(gh); 1049 add_to_queue(gh);
1048 if ((LM_FLAG_NOEXP & gh->gh_flags) && 1050 if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
1049 test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) 1051 test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
1050 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1052 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1053 gl->gl_lockref.count++;
1054 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1055 gl->gl_lockref.count--;
1056 }
1051 run_queue(gl, 1); 1057 run_queue(gl, 1);
1052 spin_unlock(&gl->gl_spin); 1058 spin_unlock(&gl->gl_spin);
1053 1059
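
The NOEXP/GLF_FROZEN branch now queues the glock work item as well as flagging the pending reply, and it follows the standard rule for handing an object to a work item: take a reference before queueing, and give it back if queue_delayed_work() reports the work was already pending, since in that case no new reference changed hands. A toy model of that rule (plain counters, not the kernel workqueue API):

#include <stdbool.h>
#include <stdio.h>

struct obj {
        int refcount;
        bool work_pending;
};

/* Stand-in for queue_delayed_work(): false if already queued. */
static bool queue_work_once(struct obj *o)
{
        if (o->work_pending)
                return false;
        o->work_pending = true;
        return true;
}

static void kick_work(struct obj *o)
{
        o->refcount++;                  /* reference owned by the work item */
        if (!queue_work_once(o))
                o->refcount--;          /* already queued: hand it back */
}

int main(void)
{
        struct obj o = { .refcount = 1 };

        kick_work(&o);                  /* queues: refcount becomes 2 */
        kick_work(&o);                  /* already pending: stays 2 */
        printf("refcount = %d\n", o.refcount);
        return 0;
}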
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 3bf0631b5d56..54b66809e818 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -82,6 +82,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
82 struct gfs2_trans tr; 82 struct gfs2_trans tr;
83 83
84 memset(&tr, 0, sizeof(tr)); 84 memset(&tr, 0, sizeof(tr));
85 INIT_LIST_HEAD(&tr.tr_buf);
86 INIT_LIST_HEAD(&tr.tr_databuf);
85 tr.tr_revokes = atomic_read(&gl->gl_ail_count); 87 tr.tr_revokes = atomic_read(&gl->gl_ail_count);
86 88
87 if (!tr.tr_revokes) 89 if (!tr.tr_revokes)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index cf0e34400f71..bdf70c18610c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -52,7 +52,7 @@ struct gfs2_log_header_host {
52 */ 52 */
53 53
54struct gfs2_log_operations { 54struct gfs2_log_operations {
55 void (*lo_before_commit) (struct gfs2_sbd *sdp); 55 void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
56 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); 56 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
57 void (*lo_before_scan) (struct gfs2_jdesc *jd, 57 void (*lo_before_scan) (struct gfs2_jdesc *jd,
58 struct gfs2_log_header_host *head, int pass); 58 struct gfs2_log_header_host *head, int pass);
@@ -371,6 +371,7 @@ enum {
371 GIF_ALLOC_FAILED = 2, 371 GIF_ALLOC_FAILED = 2,
372 GIF_SW_PAGED = 3, 372 GIF_SW_PAGED = 3,
373 GIF_ORDERED = 4, 373 GIF_ORDERED = 4,
374 GIF_FREE_VFS_INODE = 5,
374}; 375};
375 376
376struct gfs2_inode { 377struct gfs2_inode {
@@ -462,11 +463,11 @@ struct gfs2_trans {
462 unsigned int tr_blocks; 463 unsigned int tr_blocks;
463 unsigned int tr_revokes; 464 unsigned int tr_revokes;
464 unsigned int tr_reserved; 465 unsigned int tr_reserved;
466 unsigned int tr_touched:1;
467 unsigned int tr_attached:1;
465 468
466 struct gfs2_holder tr_t_gh; 469 struct gfs2_holder tr_t_gh;
467 470
468 int tr_touched;
469 int tr_attached;
470 471
471 unsigned int tr_num_buf_new; 472 unsigned int tr_num_buf_new;
472 unsigned int tr_num_databuf_new; 473 unsigned int tr_num_databuf_new;
@@ -476,6 +477,8 @@ struct gfs2_trans {
476 unsigned int tr_num_revoke_rm; 477 unsigned int tr_num_revoke_rm;
477 478
478 struct list_head tr_list; 479 struct list_head tr_list;
480 struct list_head tr_databuf;
481 struct list_head tr_buf;
479 482
480 unsigned int tr_first; 483 unsigned int tr_first;
481 struct list_head tr_ail1_list; 484 struct list_head tr_ail1_list;
@@ -483,7 +486,7 @@ struct gfs2_trans {
483}; 486};
484 487
485struct gfs2_journal_extent { 488struct gfs2_journal_extent {
486 struct list_head extent_list; 489 struct list_head list;
487 490
488 unsigned int lblock; /* First logical block */ 491 unsigned int lblock; /* First logical block */
489 u64 dblock; /* First disk block */ 492 u64 dblock; /* First disk block */
@@ -493,6 +496,7 @@ struct gfs2_journal_extent {
493struct gfs2_jdesc { 496struct gfs2_jdesc {
494 struct list_head jd_list; 497 struct list_head jd_list;
495 struct list_head extent_list; 498 struct list_head extent_list;
499 unsigned int nr_extents;
496 struct work_struct jd_work; 500 struct work_struct jd_work;
497 struct inode *jd_inode; 501 struct inode *jd_inode;
498 unsigned long jd_flags; 502 unsigned long jd_flags;
@@ -500,6 +504,15 @@ struct gfs2_jdesc {
500 unsigned int jd_jid; 504 unsigned int jd_jid;
501 unsigned int jd_blocks; 505 unsigned int jd_blocks;
502 int jd_recover_error; 506 int jd_recover_error;
507 /* Replay stuff */
508
509 unsigned int jd_found_blocks;
510 unsigned int jd_found_revokes;
511 unsigned int jd_replayed_blocks;
512
513 struct list_head jd_revoke_list;
514 unsigned int jd_replay_tail;
515
503}; 516};
504 517
505struct gfs2_statfs_change_host { 518struct gfs2_statfs_change_host {
@@ -746,19 +759,12 @@ struct gfs2_sbd {
746 759
747 struct gfs2_trans *sd_log_tr; 760 struct gfs2_trans *sd_log_tr;
748 unsigned int sd_log_blks_reserved; 761 unsigned int sd_log_blks_reserved;
749 unsigned int sd_log_commited_buf;
750 unsigned int sd_log_commited_databuf;
751 int sd_log_commited_revoke; 762 int sd_log_commited_revoke;
752 763
753 atomic_t sd_log_pinned; 764 atomic_t sd_log_pinned;
754 unsigned int sd_log_num_buf;
755 unsigned int sd_log_num_revoke; 765 unsigned int sd_log_num_revoke;
756 unsigned int sd_log_num_rg;
757 unsigned int sd_log_num_databuf;
758 766
759 struct list_head sd_log_le_buf;
760 struct list_head sd_log_le_revoke; 767 struct list_head sd_log_le_revoke;
761 struct list_head sd_log_le_databuf;
762 struct list_head sd_log_le_ordered; 768 struct list_head sd_log_le_ordered;
763 spinlock_t sd_ordered_lock; 769 spinlock_t sd_ordered_lock;
764 770
@@ -786,15 +792,6 @@ struct gfs2_sbd {
786 struct list_head sd_ail1_list; 792 struct list_head sd_ail1_list;
787 struct list_head sd_ail2_list; 793 struct list_head sd_ail2_list;
788 794
789 /* Replay stuff */
790
791 struct list_head sd_revoke_list;
792 unsigned int sd_replay_tail;
793
794 unsigned int sd_found_blocks;
795 unsigned int sd_found_revokes;
796 unsigned int sd_replayed_blocks;
797
798 /* For quiescing the filesystem */ 795 /* For quiescing the filesystem */
799 struct gfs2_holder sd_freeze_gh; 796 struct gfs2_holder sd_freeze_gh;
800 797
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5c524180c98e..28cc7bf6575a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -376,12 +376,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
376 inode->i_gid = current_fsgid(); 376 inode->i_gid = current_fsgid();
377} 377}
378 378
379static int alloc_dinode(struct gfs2_inode *ip, u32 flags) 379static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
380{ 380{
381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
382 struct gfs2_alloc_parms ap = { .target = RES_DINODE, .aflags = flags, }; 382 struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
383 int error; 383 int error;
384 int dblocks = 1;
385 384
386 error = gfs2_quota_lock_check(ip); 385 error = gfs2_quota_lock_check(ip);
387 if (error) 386 if (error)
@@ -391,11 +390,11 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
391 if (error) 390 if (error)
392 goto out_quota; 391 goto out_quota;
393 392
394 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0); 393 error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0);
395 if (error) 394 if (error)
396 goto out_ipreserv; 395 goto out_ipreserv;
397 396
398 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation); 397 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
399 ip->i_no_formal_ino = ip->i_generation; 398 ip->i_no_formal_ino = ip->i_generation;
400 ip->i_inode.i_ino = ip->i_no_addr; 399 ip->i_inode.i_ino = ip->i_no_addr;
401 ip->i_goal = ip->i_no_addr; 400 ip->i_goal = ip->i_no_addr;
@@ -428,6 +427,33 @@ static void gfs2_init_dir(struct buffer_head *dibh,
428} 427}
429 428
430/** 429/**
430 * gfs2_init_xattr - Initialise an xattr block for a new inode
431 * @ip: The inode in question
432 *
433 * This sets up an empty xattr block for a new inode, ready to
434 * take any ACLs, LSM xattrs, etc.
435 */
436
437static void gfs2_init_xattr(struct gfs2_inode *ip)
438{
439 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
440 struct buffer_head *bh;
441 struct gfs2_ea_header *ea;
442
443 bh = gfs2_meta_new(ip->i_gl, ip->i_eattr);
444 gfs2_trans_add_meta(ip->i_gl, bh);
445 gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
446 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
447
448 ea = GFS2_EA_BH2FIRST(bh);
449 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
450 ea->ea_type = GFS2_EATYPE_UNUSED;
451 ea->ea_flags = GFS2_EAFLAG_LAST;
452
453 brelse(bh);
454}
455
456/**
431 * init_dinode - Fill in a new dinode structure 457 * init_dinode - Fill in a new dinode structure
432 * @dip: The directory this inode is being created in 458 * @dip: The directory this inode is being created in
433 * @ip: The inode 459 * @ip: The inode
@@ -545,13 +571,6 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
545 return err; 571 return err;
546} 572}
547 573
548static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
549 const struct qstr *qstr)
550{
551 return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
552 &gfs2_initxattrs, NULL);
553}
554
555/** 574/**
556 * gfs2_create_inode - Create a new inode 575 * gfs2_create_inode - Create a new inode
557 * @dir: The parent directory 576 * @dir: The parent directory
@@ -578,8 +597,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
578 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 597 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
579 struct gfs2_glock *io_gl; 598 struct gfs2_glock *io_gl;
580 struct dentry *d; 599 struct dentry *d;
581 int error; 600 int error, free_vfs_inode = 0;
582 u32 aflags = 0; 601 u32 aflags = 0;
602 unsigned blocks = 1;
583 struct gfs2_diradd da = { .bh = NULL, }; 603 struct gfs2_diradd da = { .bh = NULL, };
584 604
585 if (!name->len || name->len > GFS2_FNAMESIZE) 605 if (!name->len || name->len > GFS2_FNAMESIZE)
@@ -676,10 +696,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
676 (dip->i_diskflags & GFS2_DIF_TOPDIR)) 696 (dip->i_diskflags & GFS2_DIF_TOPDIR))
677 aflags |= GFS2_AF_ORLOV; 697 aflags |= GFS2_AF_ORLOV;
678 698
679 error = alloc_dinode(ip, aflags); 699 if (default_acl || acl)
700 blocks++;
701
702 error = alloc_dinode(ip, aflags, &blocks);
680 if (error) 703 if (error)
681 goto fail_free_inode; 704 goto fail_free_inode;
682 705
706 gfs2_set_inode_blocks(inode, blocks);
707
683 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 708 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
684 if (error) 709 if (error)
685 goto fail_free_inode; 710 goto fail_free_inode;
@@ -689,10 +714,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
689 if (error) 714 if (error)
690 goto fail_free_inode; 715 goto fail_free_inode;
691 716
692 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 717 error = gfs2_trans_begin(sdp, blocks, 0);
693 if (error) 718 if (error)
694 goto fail_gunlock2; 719 goto fail_gunlock2;
695 720
721 if (blocks > 1) {
722 ip->i_eattr = ip->i_no_addr + 1;
723 gfs2_init_xattr(ip);
724 }
696 init_dinode(dip, ip, symname); 725 init_dinode(dip, ip, symname);
697 gfs2_trans_end(sdp); 726 gfs2_trans_end(sdp);
698 727
@@ -722,7 +751,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
722 if (error) 751 if (error)
723 goto fail_gunlock3; 752 goto fail_gunlock3;
724 753
725 error = gfs2_security_init(dip, ip, name); 754 error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
755 &gfs2_initxattrs, NULL);
726 if (error) 756 if (error)
727 goto fail_gunlock3; 757 goto fail_gunlock3;
728 758
@@ -758,15 +788,16 @@ fail_free_acls:
758 if (acl) 788 if (acl)
759 posix_acl_release(acl); 789 posix_acl_release(acl);
760fail_free_vfs_inode: 790fail_free_vfs_inode:
761 free_inode_nonrcu(inode); 791 free_vfs_inode = 1;
762 inode = NULL;
763fail_gunlock: 792fail_gunlock:
764 gfs2_dir_no_add(&da); 793 gfs2_dir_no_add(&da);
765 gfs2_glock_dq_uninit(ghs); 794 gfs2_glock_dq_uninit(ghs);
766 if (inode && !IS_ERR(inode)) { 795 if (inode && !IS_ERR(inode)) {
767 clear_nlink(inode); 796 clear_nlink(inode);
768 mark_inode_dirty(inode); 797 if (!free_vfs_inode)
769 set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); 798 mark_inode_dirty(inode);
799 set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
800 &GFS2_I(inode)->i_flags);
770 iput(inode); 801 iput(inode);
771 } 802 }
772fail: 803fail:
@@ -1263,6 +1294,10 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1263 } 1294 }
1264 1295
1265 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); 1296 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
1297 if (!tmp) {
1298 error = -ENOENT;
1299 break;
1300 }
1266 if (IS_ERR(tmp)) { 1301 if (IS_ERR(tmp)) {
1267 error = PTR_ERR(tmp); 1302 error = PTR_ERR(tmp);
1268 break; 1303 break;
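
Inode creation now sizes its allocation up front: one block for the dinode, plus a second, adjacent block whenever a default or access ACL will need an xattr block, with ip->i_eattr set to no_addr + 1 before gfs2_init_xattr() formats it, and the transaction reservation scaled by the same count. A standalone restatement of that sizing (toy struct and hypothetical names, not the kernel types):

#include <stdio.h>

struct new_inode_plan {
        unsigned blocks;                /* contiguous blocks to allocate */
        unsigned long long no_addr;     /* dinode block */
        unsigned long long eattr;       /* xattr block, 0 if none */
};

static struct new_inode_plan plan_create(int has_acl,
                                         unsigned long long first_free)
{
        struct new_inode_plan p = { .blocks = 1, .no_addr = first_free };

        if (has_acl) {
                p.blocks = 2;
                p.eattr = p.no_addr + 1;   /* adjacent, enabling de_rahead */
        }
        return p;
}

int main(void)
{
        struct new_inode_plan p = plan_create(1, 1000);

        printf("blocks=%u dinode=%llu xattr=%llu\n",
               p.blocks, p.no_addr, p.eattr);
        return 0;
}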
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 2a6ba06bee6f..c1eb555dc588 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/fs.h> 12#include <linux/fs.h>
11#include <linux/dlm.h> 13#include <linux/dlm.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
@@ -176,7 +178,7 @@ static void gdlm_bast(void *arg, int mode)
176 gfs2_glock_cb(gl, LM_ST_SHARED); 178 gfs2_glock_cb(gl, LM_ST_SHARED);
177 break; 179 break;
178 default: 180 default:
179 printk(KERN_ERR "unknown bast mode %d", mode); 181 pr_err("unknown bast mode %d\n", mode);
180 BUG(); 182 BUG();
181 } 183 }
182} 184}
@@ -195,7 +197,7 @@ static int make_mode(const unsigned int lmstate)
195 case LM_ST_SHARED: 197 case LM_ST_SHARED:
196 return DLM_LOCK_PR; 198 return DLM_LOCK_PR;
197 } 199 }
198 printk(KERN_ERR "unknown LM state %d", lmstate); 200 pr_err("unknown LM state %d\n", lmstate);
199 BUG(); 201 BUG();
200 return -1; 202 return -1;
201} 203}
@@ -308,7 +310,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
308 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, 310 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
309 NULL, gl); 311 NULL, gl);
310 if (error) { 312 if (error) {
311 printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", 313 pr_err("gdlm_unlock %x,%llx err=%d\n",
312 gl->gl_name.ln_type, 314 gl->gl_name.ln_type,
313 (unsigned long long)gl->gl_name.ln_number, error); 315 (unsigned long long)gl->gl_name.ln_number, error);
314 return; 316 return;
@@ -1102,7 +1104,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
1102 } 1104 }
1103 1105
1104 if (ls->ls_recover_submit[jid]) { 1106 if (ls->ls_recover_submit[jid]) {
1105 fs_info(sdp, "recover_slot jid %d gen %u prev %u", 1107 fs_info(sdp, "recover_slot jid %d gen %u prev %u\n",
1106 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); 1108 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
1107 } 1109 }
1108 ls->ls_recover_submit[jid] = ls->ls_recover_block; 1110 ls->ls_recover_submit[jid] = ls->ls_recover_block;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9dcb9777a5f8..4a14d504ef83 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/freezer.h> 19#include <linux/freezer.h>
20#include <linux/bio.h> 20#include <linux/bio.h>
21#include <linux/blkdev.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/list_sort.h> 23#include <linux/list_sort.h>
23 24
@@ -145,8 +146,10 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
145{ 146{
146 struct list_head *head = &sdp->sd_ail1_list; 147 struct list_head *head = &sdp->sd_ail1_list;
147 struct gfs2_trans *tr; 148 struct gfs2_trans *tr;
149 struct blk_plug plug;
148 150
149 trace_gfs2_ail_flush(sdp, wbc, 1); 151 trace_gfs2_ail_flush(sdp, wbc, 1);
152 blk_start_plug(&plug);
150 spin_lock(&sdp->sd_ail_lock); 153 spin_lock(&sdp->sd_ail_lock);
151restart: 154restart:
152 list_for_each_entry_reverse(tr, head, tr_list) { 155 list_for_each_entry_reverse(tr, head, tr_list) {
@@ -156,6 +159,7 @@ restart:
156 goto restart; 159 goto restart;
157 } 160 }
158 spin_unlock(&sdp->sd_ail_lock); 161 spin_unlock(&sdp->sd_ail_lock);
162 blk_finish_plug(&plug);
159 trace_gfs2_ail_flush(sdp, wbc, 0); 163 trace_gfs2_ail_flush(sdp, wbc, 0);
160} 164}
161 165
@@ -410,24 +414,22 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer
410static unsigned int calc_reserved(struct gfs2_sbd *sdp) 414static unsigned int calc_reserved(struct gfs2_sbd *sdp)
411{ 415{
412 unsigned int reserved = 0; 416 unsigned int reserved = 0;
413 unsigned int mbuf_limit, metabufhdrs_needed; 417 unsigned int mbuf;
414 unsigned int dbuf_limit, databufhdrs_needed; 418 unsigned int dbuf;
415 unsigned int revokes = 0; 419 struct gfs2_trans *tr = sdp->sd_log_tr;
416 420
417 mbuf_limit = buf_limit(sdp); 421 if (tr) {
418 metabufhdrs_needed = (sdp->sd_log_commited_buf + 422 mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
419 (mbuf_limit - 1)) / mbuf_limit; 423 dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
420 dbuf_limit = databuf_limit(sdp); 424 reserved = mbuf + dbuf;
421 databufhdrs_needed = (sdp->sd_log_commited_databuf + 425 /* Account for header blocks */
422 (dbuf_limit - 1)) / dbuf_limit; 426 reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp));
427 reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp));
428 }
423 429
424 if (sdp->sd_log_commited_revoke > 0) 430 if (sdp->sd_log_commited_revoke > 0)
425 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 431 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
426 sizeof(u64)); 432 sizeof(u64));
427
428 reserved = sdp->sd_log_commited_buf + metabufhdrs_needed +
429 sdp->sd_log_commited_databuf + databufhdrs_needed +
430 revokes;
431 /* One for the overall header */ 433 /* One for the overall header */
432 if (reserved) 434 if (reserved)
433 reserved++; 435 reserved++;
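
calc_reserved() now reads its counts straight from the attached transaction instead of the old sd_log_commited_buf/databuf fields, and the header overhead becomes two DIV_ROUND_UP terms: one log descriptor block per buf_limit metadata buffers and per databuf_limit data buffers. The arithmetic in isolation (503 is the 4k-block metadata limit quoted in fs/gfs2/lops.c; 251 for data buffers is an illustrative assumption):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned calc_reserved(unsigned mbuf, unsigned dbuf,
                              unsigned revoke_blocks,
                              unsigned buf_limit, unsigned databuf_limit)
{
        unsigned reserved = mbuf + dbuf;

        /* one log descriptor header per chunk of each buffer type */
        reserved += DIV_ROUND_UP(mbuf, buf_limit);
        reserved += DIV_ROUND_UP(dbuf, databuf_limit);
        reserved += revoke_blocks;
        if (reserved)
                reserved++;             /* one for the overall log header */
        return reserved;
}

int main(void)
{
        /* 600 metadata buffers, 10 data buffers, no revokes, 4k blocks:
         * 610 payload + 2 + 1 descriptor headers + 1 log header = 614 */
        printf("%u blocks\n", calc_reserved(600, 10, 0, 503, 251));
        return 0;
}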
@@ -682,36 +684,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
682 } 684 }
683 trace_gfs2_log_flush(sdp, 1); 685 trace_gfs2_log_flush(sdp, 1);
684 686
687 sdp->sd_log_flush_head = sdp->sd_log_head;
688 sdp->sd_log_flush_wrapped = 0;
685 tr = sdp->sd_log_tr; 689 tr = sdp->sd_log_tr;
686 if (tr) { 690 if (tr) {
687 sdp->sd_log_tr = NULL; 691 sdp->sd_log_tr = NULL;
688 INIT_LIST_HEAD(&tr->tr_ail1_list); 692 INIT_LIST_HEAD(&tr->tr_ail1_list);
689 INIT_LIST_HEAD(&tr->tr_ail2_list); 693 INIT_LIST_HEAD(&tr->tr_ail2_list);
694 tr->tr_first = sdp->sd_log_flush_head;
690 } 695 }
691 696
692 if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
693 printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
694 sdp->sd_log_commited_buf);
695 gfs2_assert_withdraw(sdp, 0);
696 }
697 if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) {
698 printk(KERN_INFO "GFS2: log databuf %u %u\n",
699 sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf);
700 gfs2_assert_withdraw(sdp, 0);
701 }
702 gfs2_assert_withdraw(sdp, 697 gfs2_assert_withdraw(sdp,
703 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); 698 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
704 699
705 sdp->sd_log_flush_head = sdp->sd_log_head;
706 sdp->sd_log_flush_wrapped = 0;
707 if (tr)
708 tr->tr_first = sdp->sd_log_flush_head;
709
710 gfs2_ordered_write(sdp); 700 gfs2_ordered_write(sdp);
711 lops_before_commit(sdp); 701 lops_before_commit(sdp, tr);
712 gfs2_log_flush_bio(sdp, WRITE); 702 gfs2_log_flush_bio(sdp, WRITE);
713 703
714 if (sdp->sd_log_head != sdp->sd_log_flush_head) { 704 if (sdp->sd_log_head != sdp->sd_log_flush_head) {
705 log_flush_wait(sdp);
715 log_write_header(sdp, 0); 706 log_write_header(sdp, 0);
716 } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 707 } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
717 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ 708 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
@@ -723,8 +714,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
723 gfs2_log_lock(sdp); 714 gfs2_log_lock(sdp);
724 sdp->sd_log_head = sdp->sd_log_flush_head; 715 sdp->sd_log_head = sdp->sd_log_flush_head;
725 sdp->sd_log_blks_reserved = 0; 716 sdp->sd_log_blks_reserved = 0;
726 sdp->sd_log_commited_buf = 0;
727 sdp->sd_log_commited_databuf = 0;
728 sdp->sd_log_commited_revoke = 0; 717 sdp->sd_log_commited_revoke = 0;
729 718
730 spin_lock(&sdp->sd_ail_lock); 719 spin_lock(&sdp->sd_ail_lock);
@@ -740,34 +729,54 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
740 kfree(tr); 729 kfree(tr);
741} 730}
742 731
732/**
733 * gfs2_merge_trans - Merge a new transaction into a cached transaction
734 * @old: Original transaction to be expanded
735 * @new: New transaction to be merged
736 */
737
738static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
739{
740 WARN_ON_ONCE(old->tr_attached != 1);
741
742 old->tr_num_buf_new += new->tr_num_buf_new;
743 old->tr_num_databuf_new += new->tr_num_databuf_new;
744 old->tr_num_buf_rm += new->tr_num_buf_rm;
745 old->tr_num_databuf_rm += new->tr_num_databuf_rm;
746 old->tr_num_revoke += new->tr_num_revoke;
747 old->tr_num_revoke_rm += new->tr_num_revoke_rm;
748
749 list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
750 list_splice_tail_init(&new->tr_buf, &old->tr_buf);
751}
752
743static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 753static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
744{ 754{
745 unsigned int reserved; 755 unsigned int reserved;
746 unsigned int unused; 756 unsigned int unused;
757 unsigned int maxres;
747 758
748 gfs2_log_lock(sdp); 759 gfs2_log_lock(sdp);
749 760
750 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; 761 if (sdp->sd_log_tr) {
751 sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - 762 gfs2_merge_trans(sdp->sd_log_tr, tr);
752 tr->tr_num_databuf_rm; 763 } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
753 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 764 gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
754 (((int)sdp->sd_log_commited_databuf) >= 0)); 765 sdp->sd_log_tr = tr;
766 tr->tr_attached = 1;
767 }
768
755 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 769 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
756 reserved = calc_reserved(sdp); 770 reserved = calc_reserved(sdp);
757 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 771 maxres = sdp->sd_log_blks_reserved + tr->tr_reserved;
758 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 772 gfs2_assert_withdraw(sdp, maxres >= reserved);
773 unused = maxres - reserved;
759 atomic_add(unused, &sdp->sd_log_blks_free); 774 atomic_add(unused, &sdp->sd_log_blks_free);
760 trace_gfs2_log_blocks(sdp, unused); 775 trace_gfs2_log_blocks(sdp, unused);
761 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= 776 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
762 sdp->sd_jdesc->jd_blocks); 777 sdp->sd_jdesc->jd_blocks);
763 sdp->sd_log_blks_reserved = reserved; 778 sdp->sd_log_blks_reserved = reserved;
764 779
765 if (sdp->sd_log_tr == NULL &&
766 (tr->tr_num_buf_new || tr->tr_num_databuf_new)) {
767 gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
768 sdp->sd_log_tr = tr;
769 tr->tr_attached = 1;
770 }
771 gfs2_log_unlock(sdp); 780 gfs2_log_unlock(sdp);
772} 781}
773 782
@@ -807,10 +816,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
807 down_write(&sdp->sd_log_flush_lock); 816 down_write(&sdp->sd_log_flush_lock);
808 817
809 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); 818 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
810 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
811 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 819 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
812 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
813 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
814 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); 820 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
815 821
816 sdp->sd_log_flush_head = sdp->sd_log_head; 822 sdp->sd_log_flush_head = sdp->sd_log_head;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 76693793cedd..a294d8d8bcd4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -146,8 +146,8 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
146 struct gfs2_journal_extent *je; 146 struct gfs2_journal_extent *je;
147 u64 block; 147 u64 block;
148 148
149 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { 149 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) {
150 if (lbn >= je->lblock && lbn < je->lblock + je->blocks) { 150 if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) {
151 block = je->dblock + lbn - je->lblock; 151 block = je->dblock + lbn - je->lblock;
152 gfs2_log_incr_head(sdp); 152 gfs2_log_incr_head(sdp);
153 return block; 153 return block;
@@ -491,44 +491,40 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
491 gfs2_log_unlock(sdp); 491 gfs2_log_unlock(sdp);
492} 492}
493 493
494static void buf_lo_before_commit(struct gfs2_sbd *sdp) 494static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
495{ 495{
496 unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ 496 unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */
497 497 unsigned int nbuf;
498 gfs2_before_commit(sdp, limit, sdp->sd_log_num_buf, 498 if (tr == NULL)
499 &sdp->sd_log_le_buf, 0); 499 return;
500 nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
501 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0);
500} 502}
501 503
502static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 504static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
503{ 505{
504 struct list_head *head = &sdp->sd_log_le_buf; 506 struct list_head *head;
505 struct gfs2_bufdata *bd; 507 struct gfs2_bufdata *bd;
506 508
507 if (tr == NULL) { 509 if (tr == NULL)
508 gfs2_assert(sdp, list_empty(head));
509 return; 510 return;
510 }
511 511
512 head = &tr->tr_buf;
512 while (!list_empty(head)) { 513 while (!list_empty(head)) {
513 bd = list_entry(head->next, struct gfs2_bufdata, bd_list); 514 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
514 list_del_init(&bd->bd_list); 515 list_del_init(&bd->bd_list);
515 sdp->sd_log_num_buf--;
516
517 gfs2_unpin(sdp, bd->bd_bh, tr); 516 gfs2_unpin(sdp, bd->bd_bh, tr);
518 } 517 }
519 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
520} 518}
521 519
522static void buf_lo_before_scan(struct gfs2_jdesc *jd, 520static void buf_lo_before_scan(struct gfs2_jdesc *jd,
523 struct gfs2_log_header_host *head, int pass) 521 struct gfs2_log_header_host *head, int pass)
524{ 522{
525 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
526
527 if (pass != 0) 523 if (pass != 0)
528 return; 524 return;
529 525
530 sdp->sd_found_blocks = 0; 526 jd->jd_found_blocks = 0;
531 sdp->sd_replayed_blocks = 0; 527 jd->jd_replayed_blocks = 0;
532} 528}
533 529
534static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 530static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -551,9 +547,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
551 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { 547 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
552 blkno = be64_to_cpu(*ptr++); 548 blkno = be64_to_cpu(*ptr++);
553 549
554 sdp->sd_found_blocks++; 550 jd->jd_found_blocks++;
555 551
556 if (gfs2_revoke_check(sdp, blkno, start)) 552 if (gfs2_revoke_check(jd, blkno, start))
557 continue; 553 continue;
558 554
559 error = gfs2_replay_read_block(jd, start, &bh_log); 555 error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -574,7 +570,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
574 if (error) 570 if (error)
575 break; 571 break;
576 572
577 sdp->sd_replayed_blocks++; 573 jd->jd_replayed_blocks++;
578 } 574 }
579 575
580 return error; 576 return error;
@@ -617,10 +613,10 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
617 gfs2_meta_sync(ip->i_gl); 613 gfs2_meta_sync(ip->i_gl);
618 614
619 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", 615 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
620 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); 616 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
621} 617}
622 618
623static void revoke_lo_before_commit(struct gfs2_sbd *sdp) 619static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
624{ 620{
625 struct gfs2_meta_header *mh; 621 struct gfs2_meta_header *mh;
626 unsigned int offset; 622 unsigned int offset;
@@ -679,13 +675,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
679static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 675static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
680 struct gfs2_log_header_host *head, int pass) 676 struct gfs2_log_header_host *head, int pass)
681{ 677{
682 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
683
684 if (pass != 0) 678 if (pass != 0)
685 return; 679 return;
686 680
687 sdp->sd_found_revokes = 0; 681 jd->jd_found_revokes = 0;
688 sdp->sd_replay_tail = head->lh_tail; 682 jd->jd_replay_tail = head->lh_tail;
689} 683}
690 684
691static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 685static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -717,13 +711,13 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
717 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { 711 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
718 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); 712 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
719 713
720 error = gfs2_revoke_add(sdp, blkno, start); 714 error = gfs2_revoke_add(jd, blkno, start);
721 if (error < 0) { 715 if (error < 0) {
722 brelse(bh); 716 brelse(bh);
723 return error; 717 return error;
724 } 718 }
725 else if (error) 719 else if (error)
726 sdp->sd_found_revokes++; 720 jd->jd_found_revokes++;
727 721
728 if (!--revokes) 722 if (!--revokes)
729 break; 723 break;
@@ -743,16 +737,16 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
743 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 737 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
744 738
745 if (error) { 739 if (error) {
746 gfs2_revoke_clean(sdp); 740 gfs2_revoke_clean(jd);
747 return; 741 return;
748 } 742 }
749 if (pass != 1) 743 if (pass != 1)
750 return; 744 return;
751 745
752 fs_info(sdp, "jid=%u: Found %u revoke tags\n", 746 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
753 jd->jd_jid, sdp->sd_found_revokes); 747 jd->jd_jid, jd->jd_found_revokes);
754 748
755 gfs2_revoke_clean(sdp); 749 gfs2_revoke_clean(jd);
756} 750}
757 751
758/** 752/**
@@ -760,12 +754,14 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
760 * 754 *
761 */ 755 */
762 756
763static void databuf_lo_before_commit(struct gfs2_sbd *sdp) 757static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
764{ 758{
765 unsigned int limit = buf_limit(sdp) / 2; 759 unsigned int limit = databuf_limit(sdp);
766 760 unsigned int nbuf;
767 gfs2_before_commit(sdp, limit, sdp->sd_log_num_databuf, 761 if (tr == NULL)
768 &sdp->sd_log_le_databuf, 1); 762 return;
763 nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
764 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1);
769} 765}
770 766
771static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 767static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -789,9 +785,9 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
789 blkno = be64_to_cpu(*ptr++); 785 blkno = be64_to_cpu(*ptr++);
790 esc = be64_to_cpu(*ptr++); 786 esc = be64_to_cpu(*ptr++);
791 787
792 sdp->sd_found_blocks++; 788 jd->jd_found_blocks++;
793 789
794 if (gfs2_revoke_check(sdp, blkno, start)) 790 if (gfs2_revoke_check(jd, blkno, start))
795 continue; 791 continue;
796 792
797 error = gfs2_replay_read_block(jd, start, &bh_log); 793 error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -811,7 +807,7 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
811 brelse(bh_log); 807 brelse(bh_log);
812 brelse(bh_ip); 808 brelse(bh_ip);
813 809
814 sdp->sd_replayed_blocks++; 810 jd->jd_replayed_blocks++;
815 } 811 }
816 812
817 return error; 813 return error;
@@ -835,26 +831,23 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
835 gfs2_meta_sync(ip->i_gl); 831 gfs2_meta_sync(ip->i_gl);
836 832
837 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", 833 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
838 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); 834 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
839} 835}
840 836
841static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 837static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
842{ 838{
843 struct list_head *head = &sdp->sd_log_le_databuf; 839 struct list_head *head;
844 struct gfs2_bufdata *bd; 840 struct gfs2_bufdata *bd;
845 841
846 if (tr == NULL) { 842 if (tr == NULL)
847 gfs2_assert(sdp, list_empty(head));
848 return; 843 return;
849 }
850 844
845 head = &tr->tr_databuf;
851 while (!list_empty(head)) { 846 while (!list_empty(head)) {
852 bd = list_entry(head->next, struct gfs2_bufdata, bd_list); 847 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
853 list_del_init(&bd->bd_list); 848 list_del_init(&bd->bd_list);
854 sdp->sd_log_num_databuf--;
855 gfs2_unpin(sdp, bd->bd_bh, tr); 849 gfs2_unpin(sdp, bd->bd_bh, tr);
856 } 850 }
857 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
858} 851}
859 852
860 853
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 9ca2e6438419..a65a7ba32ffd 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -46,12 +46,13 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
46 return limit; 46 return limit;
47} 47}
48 48
49static inline void lops_before_commit(struct gfs2_sbd *sdp) 49static inline void lops_before_commit(struct gfs2_sbd *sdp,
50 struct gfs2_trans *tr)
50{ 51{
51 int x; 52 int x;
52 for (x = 0; gfs2_log_ops[x]; x++) 53 for (x = 0; gfs2_log_ops[x]; x++)
53 if (gfs2_log_ops[x]->lo_before_commit) 54 if (gfs2_log_ops[x]->lo_before_commit)
54 gfs2_log_ops[x]->lo_before_commit(sdp); 55 gfs2_log_ops[x]->lo_before_commit(sdp, tr);
55} 56}
56 57
57static inline void lops_after_commit(struct gfs2_sbd *sdp, 58static inline void lops_after_commit(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c272e73063de..82b6ac829656 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/slab.h> 12#include <linux/slab.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -165,7 +167,7 @@ static int __init init_gfs2_fs(void)
165 167
166 gfs2_register_debugfs(); 168 gfs2_register_debugfs();
167 169
168 printk("GFS2 installed\n"); 170 pr_info("GFS2 installed\n");
169 171
170 return 0; 172 return 0;
171 173
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index c7f24690ed05..2cf09b63a6b4 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -97,6 +97,11 @@ const struct address_space_operations gfs2_meta_aops = {
97 .releasepage = gfs2_releasepage, 97 .releasepage = gfs2_releasepage,
98}; 98};
99 99
100const struct address_space_operations gfs2_rgrp_aops = {
101 .writepage = gfs2_aspace_writepage,
102 .releasepage = gfs2_releasepage,
103};
104
100/** 105/**
101 * gfs2_getbuf - Get a buffer with a given address space 106 * gfs2_getbuf - Get a buffer with a given address space
102 * @gl: the glock 107 * @gl: the glock
@@ -267,15 +272,10 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
267 trace_gfs2_pin(bd, 0); 272 trace_gfs2_pin(bd, 0);
268 atomic_dec(&sdp->sd_log_pinned); 273 atomic_dec(&sdp->sd_log_pinned);
269 list_del_init(&bd->bd_list); 274 list_del_init(&bd->bd_list);
270 if (meta) { 275 if (meta)
271 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
272 sdp->sd_log_num_buf--;
273 tr->tr_num_buf_rm++; 276 tr->tr_num_buf_rm++;
274 } else { 277 else
275 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf);
276 sdp->sd_log_num_databuf--;
277 tr->tr_num_databuf_rm++; 278 tr->tr_num_databuf_rm++;
278 }
279 tr->tr_touched = 1; 279 tr->tr_touched = 1;
280 was_pinned = 1; 280 was_pinned = 1;
281 brelse(bh); 281 brelse(bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 4823b934208a..ac5d8027d335 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -38,12 +38,15 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
38} 38}
39 39
40extern const struct address_space_operations gfs2_meta_aops; 40extern const struct address_space_operations gfs2_meta_aops;
41extern const struct address_space_operations gfs2_rgrp_aops;
41 42
42static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) 43static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
43{ 44{
44 struct inode *inode = mapping->host; 45 struct inode *inode = mapping->host;
45 if (mapping->a_ops == &gfs2_meta_aops) 46 if (mapping->a_ops == &gfs2_meta_aops)
46 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; 47 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
48 else if (mapping->a_ops == &gfs2_rgrp_aops)
49 return container_of(mapping, struct gfs2_sbd, sd_aspace);
47 else 50 else
48 return inode->i_sb->s_fs_info; 51 return inode->i_sb->s_fs_info;
49} 52}
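
gfs2_mapping2sbd() can now see three kinds of mapping, and the new rgrp case recovers the superblock by exploiting the fact that sd_aspace is embedded directly in struct gfs2_sbd: container_of() subtracts the member's offset from the member pointer. The trick in isolation (toy structs in place of the kernel ones):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct mapping { int flags; };

struct sbd {
        int sd_jid;
        struct mapping sd_aspace;       /* embedded, as in struct gfs2_sbd */
};

int main(void)
{
        struct sbd sb = { .sd_jid = 7 };
        struct mapping *m = &sb.sd_aspace;

        /* the rgrp aops case: map back from the mapping to its sbd */
        printf("jid = %d\n", container_of(m, struct sbd, sd_aspace)->sd_jid);
        return 0;
}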
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c6872d09561a..22f954051bb8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -104,7 +106,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
104 mapping = &sdp->sd_aspace; 106 mapping = &sdp->sd_aspace;
105 107
106 address_space_init_once(mapping); 108 address_space_init_once(mapping);
107 mapping->a_ops = &gfs2_meta_aops; 109 mapping->a_ops = &gfs2_rgrp_aops;
108 mapping->host = sb->s_bdev->bd_inode; 110 mapping->host = sb->s_bdev->bd_inode;
109 mapping->flags = 0; 111 mapping->flags = 0;
110 mapping_set_gfp_mask(mapping, GFP_NOFS); 112 mapping_set_gfp_mask(mapping, GFP_NOFS);
@@ -114,9 +116,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
114 116
115 spin_lock_init(&sdp->sd_log_lock); 117 spin_lock_init(&sdp->sd_log_lock);
116 atomic_set(&sdp->sd_log_pinned, 0); 118 atomic_set(&sdp->sd_log_pinned, 0);
117 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
118 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 119 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
119 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
120 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 120 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
121 spin_lock_init(&sdp->sd_ordered_lock); 121 spin_lock_init(&sdp->sd_ordered_lock);
122 122
@@ -130,8 +130,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
130 atomic_set(&sdp->sd_log_in_flight, 0); 130 atomic_set(&sdp->sd_log_in_flight, 0);
131 init_waitqueue_head(&sdp->sd_log_flush_wait); 131 init_waitqueue_head(&sdp->sd_log_flush_wait);
132 132
133 INIT_LIST_HEAD(&sdp->sd_revoke_list);
134
135 return sdp; 133 return sdp;
136} 134}
137 135
@@ -154,7 +152,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
154 if (sb->sb_magic != GFS2_MAGIC || 152 if (sb->sb_magic != GFS2_MAGIC ||
155 sb->sb_type != GFS2_METATYPE_SB) { 153 sb->sb_type != GFS2_METATYPE_SB) {
156 if (!silent) 154 if (!silent)
157 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); 155 pr_warn("not a GFS2 filesystem\n");
158 return -EINVAL; 156 return -EINVAL;
159 } 157 }
160 158
@@ -176,7 +174,7 @@ static void end_bio_io_page(struct bio *bio, int error)
176 if (!error) 174 if (!error)
177 SetPageUptodate(page); 175 SetPageUptodate(page);
178 else 176 else
179 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); 177 pr_warn("error %d reading superblock\n", error);
180 unlock_page(page); 178 unlock_page(page);
181} 179}
182 180
@@ -519,67 +517,6 @@ out:
519 return ret; 517 return ret;
520} 518}
521 519
522/**
523 * map_journal_extents - create a reusable "extent" mapping from all logical
524 * blocks to all physical blocks for the given journal. This will save
525 * us time when writing journal blocks. Most journals will have only one
526 * extent that maps all their logical blocks. That's because gfs2.mkfs
527 * arranges the journal blocks sequentially to maximize performance.
528 * So the extent would map the first block for the entire file length.
529 * However, gfs2_jadd can happen while file activity is happening, so
530 * those journals may not be sequential. Less likely is the case where
531 * the users created their own journals by mounting the metafs and
532 * laying it out. But it's still possible. These journals might have
533 * several extents.
534 *
535 * TODO: This should be done in bigger chunks rather than one block at a time,
536 * but since it's only done at mount time, I'm not worried about the
537 * time it takes.
538 */
539static int map_journal_extents(struct gfs2_sbd *sdp)
540{
541 struct gfs2_jdesc *jd = sdp->sd_jdesc;
542 unsigned int lb;
543 u64 db, prev_db; /* logical block, disk block, prev disk block */
544 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
545 struct gfs2_journal_extent *jext = NULL;
546 struct buffer_head bh;
547 int rc = 0;
548
549 prev_db = 0;
550
551 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
552 bh.b_state = 0;
553 bh.b_blocknr = 0;
554 bh.b_size = 1 << ip->i_inode.i_blkbits;
555 rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0);
556 db = bh.b_blocknr;
557 if (rc || !db) {
558 printk(KERN_INFO "GFS2 journal mapping error %d: lb="
559 "%u db=%llu\n", rc, lb, (unsigned long long)db);
560 break;
561 }
562 if (!prev_db || db != prev_db + 1) {
563 jext = kzalloc(sizeof(struct gfs2_journal_extent),
564 GFP_KERNEL);
565 if (!jext) {
566 printk(KERN_INFO "GFS2 error: out of memory "
567 "mapping journal extents.\n");
568 rc = -ENOMEM;
569 break;
570 }
571 jext->dblock = db;
572 jext->lblock = lb;
573 jext->blocks = 1;
574 list_add_tail(&jext->extent_list, &jd->extent_list);
575 } else {
576 jext->blocks++;
577 }
578 prev_db = db;
579 }
580 return rc;
581}
582
583static void gfs2_others_may_mount(struct gfs2_sbd *sdp) 520static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
584{ 521{
585 char *message = "FIRSTMOUNT=Done"; 522 char *message = "FIRSTMOUNT=Done";
@@ -638,6 +575,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
638 break; 575 break;
639 576
640 INIT_LIST_HEAD(&jd->extent_list); 577 INIT_LIST_HEAD(&jd->extent_list);
578 INIT_LIST_HEAD(&jd->jd_revoke_list);
579
641 INIT_WORK(&jd->jd_work, gfs2_recover_func); 580 INIT_WORK(&jd->jd_work, gfs2_recover_func);
642 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); 581 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
643 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 582 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
@@ -781,7 +720,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
781 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); 720 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
782 721
783 /* Map the extents for this journal's blocks */ 722 /* Map the extents for this journal's blocks */
784 map_journal_extents(sdp); 723 gfs2_map_journal_extents(sdp, sdp->sd_jdesc);
785 } 724 }
786 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); 725 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
787 726
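For reference, a standalone sketch of the extent-coalescing idea behind the removed map_journal_extents() and its replacement gfs2_map_journal_extents(): consecutive physical blocks are merged into one extent, and any discontinuity starts a new one. Plain C with arrays instead of the kernel list; all names here are illustrative, not part of this patch.

struct extent {
        unsigned long long dblock;      /* first physical block */
        unsigned int lblock;            /* first logical block */
        unsigned int blocks;            /* length of the run */
};

/* db[] holds the physical block for each logical journal block 0..n-1 */
static size_t coalesce_extents(const unsigned long long *db, unsigned int n,
                               struct extent *out)
{
        size_t k = 0;

        for (unsigned int lb = 0; lb < n; lb++) {
                if (k && db[lb] == out[k - 1].dblock + out[k - 1].blocks) {
                        out[k - 1].blocks++;    /* extend the current extent */
                } else {
                        out[k].dblock = db[lb]; /* discontinuity: new extent */
                        out[k].lblock = lb;
                        out[k].blocks = 1;
                        k++;
                }
        }
        return k;       /* number of extents produced */
}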
@@ -1008,7 +947,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1008 lm = &gfs2_dlm_ops; 947 lm = &gfs2_dlm_ops;
1009#endif 948#endif
1010 } else { 949 } else {
1011 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); 950 pr_info("can't find protocol %s\n", proto);
1012 return -ENOENT; 951 return -ENOENT;
1013 } 952 }
1014 953
@@ -1115,7 +1054,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1115 1054
1116 sdp = init_sbd(sb); 1055 sdp = init_sbd(sb);
1117 if (!sdp) { 1056 if (!sdp) {
1118 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); 1057 pr_warn("can't alloc struct gfs2_sbd\n");
1119 return -ENOMEM; 1058 return -ENOMEM;
1120 } 1059 }
1121 sdp->sd_args = *args; 1060 sdp->sd_args = *args;
@@ -1363,7 +1302,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1363 1302
1364 error = gfs2_mount_args(&args, data); 1303 error = gfs2_mount_args(&args, data);
1365 if (error) { 1304 if (error) {
1366 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1305 pr_warn("can't parse mount arguments\n");
1367 goto error_super; 1306 goto error_super;
1368 } 1307 }
1369 1308
@@ -1413,15 +1352,15 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
1413 1352
1414 error = kern_path(dev_name, LOOKUP_FOLLOW, &path); 1353 error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
1415 if (error) { 1354 if (error) {
1416 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1355 pr_warn("path_lookup on %s returned error %d\n",
1417 dev_name, error); 1356 dev_name, error);
1418 return ERR_PTR(error); 1357 return ERR_PTR(error);
1419 } 1358 }
1420 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, 1359 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
1421 path.dentry->d_inode->i_sb->s_bdev); 1360 path.dentry->d_inode->i_sb->s_bdev);
1422 path_put(&path); 1361 path_put(&path);
1423 if (IS_ERR(s)) { 1362 if (IS_ERR(s)) {
1424 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1363 pr_warn("gfs2 mount does not exist\n");
1425 return ERR_CAST(s); 1364 return ERR_CAST(s);
1426 } 1365 }
1427 if ((flags ^ s->s_flags) & MS_RDONLY) { 1366 if ((flags ^ s->s_flags) & MS_RDONLY) {
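A minimal sketch of the pr_fmt()/pr_warn() conversion applied throughout this file: defining pr_fmt before the printk headers makes every pr_*() call in the translation unit prefix its message with the module name, which is why the hand-written "GFS2: " prefixes above become redundant. Kernel C; example() is a hypothetical function.

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/printk.h>

static void example(void)
{
        /* with KBUILD_MODNAME "gfs2", prints "gfs2: not a GFS2 filesystem" */
        pr_warn("not a GFS2 filesystem\n");
}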
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8bec0e3192dd..c4effff7cf55 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -36,6 +36,8 @@
36 * the quota file, so it is not being constantly read. 36 * the quota file, so it is not being constantly read.
37 */ 37 */
38 38
39#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
40
39#include <linux/sched.h> 41#include <linux/sched.h>
40#include <linux/slab.h> 42#include <linux/slab.h>
41#include <linux/mm.h> 43#include <linux/mm.h>
@@ -330,6 +332,7 @@ static int slot_get(struct gfs2_quota_data *qd)
330 if (bit < sdp->sd_quota_slots) { 332 if (bit < sdp->sd_quota_slots) {
331 set_bit(bit, sdp->sd_quota_bitmap); 333 set_bit(bit, sdp->sd_quota_bitmap);
332 qd->qd_slot = bit; 334 qd->qd_slot = bit;
335 error = 0;
333out: 336out:
334 qd->qd_slot_count++; 337 qd->qd_slot_count++;
335 } 338 }
@@ -1081,10 +1084,10 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
1081{ 1084{
1082 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 1085 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
1083 1086
1084 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n", 1087 fs_info(sdp, "quota %s for %s %u\n",
1085 sdp->sd_fsname, type, 1088 type,
1086 (qd->qd_id.type == USRQUOTA) ? "user" : "group", 1089 (qd->qd_id.type == USRQUOTA) ? "user" : "group",
1087 from_kqid(&init_user_ns, qd->qd_id)); 1090 from_kqid(&init_user_ns, qd->qd_id));
1088 1091
1089 return 0; 1092 return 0;
1090} 1093}
@@ -1242,14 +1245,13 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1242 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); 1245 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
1243 bm_size *= sizeof(unsigned long); 1246 bm_size *= sizeof(unsigned long);
1244 error = -ENOMEM; 1247 error = -ENOMEM;
1245 sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN); 1248 sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);
1246 if (sdp->sd_quota_bitmap == NULL) 1249 if (sdp->sd_quota_bitmap == NULL)
1247 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); 1250 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS |
1251 __GFP_ZERO, PAGE_KERNEL);
1248 if (!sdp->sd_quota_bitmap) 1252 if (!sdp->sd_quota_bitmap)
1249 return error; 1253 return error;
1250 1254
1251 memset(sdp->sd_quota_bitmap, 0, bm_size);
1252
1253 for (x = 0; x < blocks; x++) { 1255 for (x = 0; x < blocks; x++) {
1254 struct buffer_head *bh; 1256 struct buffer_head *bh;
1255 const struct gfs2_quota_change *qc; 1257 const struct gfs2_quota_change *qc;
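The quota bitmap hunk above folds the separate memset() into the allocators themselves. A sketch of the resulting pattern for possibly large, zero-filled allocations, assuming kernel C; alloc_zeroed() is a hypothetical helper, not this patch's code:

#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *alloc_zeroed(size_t bm_size)
{
        /* try the slab allocator first, quietly */
        void *p = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);

        if (!p)         /* too large or memory too fragmented: fall back */
                p = __vmalloc(bm_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
        return p;       /* zero-filled in either case, or NULL */
}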
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 963b2d75200c..7ad4094d68c0 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -52,9 +52,9 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
52 return error; 52 return error;
53} 53}
54 54
55int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) 55int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
56{ 56{
57 struct list_head *head = &sdp->sd_revoke_list; 57 struct list_head *head = &jd->jd_revoke_list;
58 struct gfs2_revoke_replay *rr; 58 struct gfs2_revoke_replay *rr;
59 int found = 0; 59 int found = 0;
60 60
@@ -81,13 +81,13 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
81 return 1; 81 return 1;
82} 82}
83 83
84int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) 84int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
85{ 85{
86 struct gfs2_revoke_replay *rr; 86 struct gfs2_revoke_replay *rr;
87 int wrap, a, b, revoke; 87 int wrap, a, b, revoke;
88 int found = 0; 88 int found = 0;
89 89
90 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { 90 list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) {
91 if (rr->rr_blkno == blkno) { 91 if (rr->rr_blkno == blkno) {
92 found = 1; 92 found = 1;
93 break; 93 break;
@@ -97,17 +97,17 @@ int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
97 if (!found) 97 if (!found)
98 return 0; 98 return 0;
99 99
100 wrap = (rr->rr_where < sdp->sd_replay_tail); 100 wrap = (rr->rr_where < jd->jd_replay_tail);
101 a = (sdp->sd_replay_tail < where); 101 a = (jd->jd_replay_tail < where);
102 b = (where < rr->rr_where); 102 b = (where < rr->rr_where);
103 revoke = (wrap) ? (a || b) : (a && b); 103 revoke = (wrap) ? (a || b) : (a && b);
104 104
105 return revoke; 105 return revoke;
106} 106}
107 107
108void gfs2_revoke_clean(struct gfs2_sbd *sdp) 108void gfs2_revoke_clean(struct gfs2_jdesc *jd)
109{ 109{
110 struct list_head *head = &sdp->sd_revoke_list; 110 struct list_head *head = &jd->jd_revoke_list;
111 struct gfs2_revoke_replay *rr; 111 struct gfs2_revoke_replay *rr;
112 112
113 while (!list_empty(head)) { 113 while (!list_empty(head)) {
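The wrap/a/b test in gfs2_revoke_check() above decides whether a log position falls inside a possibly wrapping circular range. A standalone restatement of that logic, in plain C with illustrative names:

/*
 * Is 'where' strictly inside the circular range (tail, limit) of a log
 * that may wrap past its end? Mirrors the wrap/a/b logic above.
 */
static int in_wrapping_range(unsigned int tail, unsigned int limit,
                             unsigned int where)
{
        int wrap = (limit < tail);      /* range wraps around the log end */
        int a = (tail < where);
        int b = (where < limit);

        return wrap ? (a || b) : (a && b);
}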
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 2226136c7647..6142836cce96 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -23,9 +23,9 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
23extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 23extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
24 struct buffer_head **bh); 24 struct buffer_head **bh);
25 25
26extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 26extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
27extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 27extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
28extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); 28extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
29 29
30extern int gfs2_find_jhead(struct gfs2_jdesc *jd, 30extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
31 struct gfs2_log_header_host *head); 31 struct gfs2_log_header_host *head);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index a1da21349235..281a7716e3f3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/slab.h> 12#include <linux/slab.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -99,12 +101,12 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
99 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; 101 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
100 102
101 if (unlikely(!valid_change[new_state * 4 + cur_state])) { 103 if (unlikely(!valid_change[new_state * 4 + cur_state])) {
102 printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, " 104 pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
103 "new_state=%d\n", rbm->offset, cur_state, new_state); 105 rbm->offset, cur_state, new_state);
104 printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n", 106 pr_warn("rgrp=0x%llx bi_start=0x%x\n",
105 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); 107 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
106 printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n", 108 pr_warn("bi_offset=0x%x bi_len=0x%x\n",
107 bi->bi_offset, bi->bi_len); 109 bi->bi_offset, bi->bi_len);
108 dump_stack(); 110 dump_stack();
109 gfs2_consist_rgrpd(rbm->rgd); 111 gfs2_consist_rgrpd(rbm->rgd);
110 return; 112 return;
@@ -736,11 +738,11 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
736 738
737static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) 739static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
738{ 740{
739 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); 741 pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
740 printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); 742 pr_info("ri_length = %u\n", rgd->rd_length);
741 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); 743 pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
742 printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); 744 pr_info("ri_data = %u\n", rgd->rd_data);
743 printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); 745 pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
744} 746}
745 747
746/** 748/**
@@ -1102,7 +1104,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1102 * Returns: errno 1104 * Returns: errno
1103 */ 1105 */
1104 1106
1105int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) 1107static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1106{ 1108{
1107 struct gfs2_sbd *sdp = rgd->rd_sbd; 1109 struct gfs2_sbd *sdp = rgd->rd_sbd;
1108 struct gfs2_glock *gl = rgd->rd_gl; 1110 struct gfs2_glock *gl = rgd->rd_gl;
@@ -1169,7 +1171,7 @@ fail:
1169 return error; 1171 return error;
1170} 1172}
1171 1173
1172int update_rgrp_lvb(struct gfs2_rgrpd *rgd) 1174static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1173{ 1175{
1174 u32 rl_flags; 1176 u32 rl_flags;
1175 1177
@@ -2278,7 +2280,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2278 } 2280 }
2279 } 2281 }
2280 if (rbm.rgd->rd_free < *nblocks) { 2282 if (rbm.rgd->rd_free < *nblocks) {
2281 printk(KERN_WARNING "nblocks=%u\n", *nblocks); 2283 pr_warn("nblocks=%u\n", *nblocks);
2282 goto rgrp_error; 2284 goto rgrp_error;
2283 } 2285 }
2284 2286
@@ -2296,7 +2298,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2296 2298
2297 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); 2299 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2298 if (dinode) 2300 if (dinode)
2299 gfs2_trans_add_unrevoke(sdp, block, 1); 2301 gfs2_trans_add_unrevoke(sdp, block, *nblocks);
2300 2302
2301 gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); 2303 gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2302 2304
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 60f60f6181f3..de8afad89e51 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/bio.h> 12#include <linux/bio.h>
11#include <linux/sched.h> 13#include <linux/sched.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
@@ -175,8 +177,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
175 break; 177 break;
176 case Opt_debug: 178 case Opt_debug:
177 if (args->ar_errors == GFS2_ERRORS_PANIC) { 179 if (args->ar_errors == GFS2_ERRORS_PANIC) {
178 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " 180 pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
179 "are mutually exclusive.\n");
180 return -EINVAL; 181 return -EINVAL;
181 } 182 }
182 args->ar_debug = 1; 183 args->ar_debug = 1;
@@ -228,21 +229,21 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
228 case Opt_commit: 229 case Opt_commit:
229 rv = match_int(&tmp[0], &args->ar_commit); 230 rv = match_int(&tmp[0], &args->ar_commit);
230 if (rv || args->ar_commit <= 0) { 231 if (rv || args->ar_commit <= 0) {
231 printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n"); 232 pr_warn("commit mount option requires a positive numeric argument\n");
232 return rv ? rv : -EINVAL; 233 return rv ? rv : -EINVAL;
233 } 234 }
234 break; 235 break;
235 case Opt_statfs_quantum: 236 case Opt_statfs_quantum:
236 rv = match_int(&tmp[0], &args->ar_statfs_quantum); 237 rv = match_int(&tmp[0], &args->ar_statfs_quantum);
237 if (rv || args->ar_statfs_quantum < 0) { 238 if (rv || args->ar_statfs_quantum < 0) {
238 printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n"); 239 pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n");
239 return rv ? rv : -EINVAL; 240 return rv ? rv : -EINVAL;
240 } 241 }
241 break; 242 break;
242 case Opt_quota_quantum: 243 case Opt_quota_quantum:
243 rv = match_int(&tmp[0], &args->ar_quota_quantum); 244 rv = match_int(&tmp[0], &args->ar_quota_quantum);
244 if (rv || args->ar_quota_quantum <= 0) { 245 if (rv || args->ar_quota_quantum <= 0) {
245 printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n"); 246 pr_warn("quota_quantum mount option requires a positive numeric argument\n");
246 return rv ? rv : -EINVAL; 247 return rv ? rv : -EINVAL;
247 } 248 }
248 break; 249 break;
@@ -250,7 +251,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
250 rv = match_int(&tmp[0], &args->ar_statfs_percent); 251 rv = match_int(&tmp[0], &args->ar_statfs_percent);
251 if (rv || args->ar_statfs_percent < 0 || 252 if (rv || args->ar_statfs_percent < 0 ||
252 args->ar_statfs_percent > 100) { 253 args->ar_statfs_percent > 100) {
253 printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n"); 254 pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n");
254 return rv ? rv : -EINVAL; 255 return rv ? rv : -EINVAL;
255 } 256 }
256 break; 257 break;
@@ -259,8 +260,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
259 break; 260 break;
260 case Opt_err_panic: 261 case Opt_err_panic:
261 if (args->ar_debug) { 262 if (args->ar_debug) {
262 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " 263 pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
263 "are mutually exclusive.\n");
264 return -EINVAL; 264 return -EINVAL;
265 } 265 }
266 args->ar_errors = GFS2_ERRORS_PANIC; 266 args->ar_errors = GFS2_ERRORS_PANIC;
@@ -279,7 +279,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
279 break; 279 break;
280 case Opt_error: 280 case Opt_error:
281 default: 281 default:
282 printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); 282 pr_warn("invalid mount option: %s\n", o);
283 return -EINVAL; 283 return -EINVAL;
284 } 284 }
285 } 285 }
@@ -295,9 +295,8 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
295 295
296void gfs2_jindex_free(struct gfs2_sbd *sdp) 296void gfs2_jindex_free(struct gfs2_sbd *sdp)
297{ 297{
298 struct list_head list, *head; 298 struct list_head list;
299 struct gfs2_jdesc *jd; 299 struct gfs2_jdesc *jd;
300 struct gfs2_journal_extent *jext;
301 300
302 spin_lock(&sdp->sd_jindex_spin); 301 spin_lock(&sdp->sd_jindex_spin);
303 list_add(&list, &sdp->sd_jindex_list); 302 list_add(&list, &sdp->sd_jindex_list);
@@ -307,14 +306,7 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
307 306
308 while (!list_empty(&list)) { 307 while (!list_empty(&list)) {
309 jd = list_entry(list.next, struct gfs2_jdesc, jd_list); 308 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
310 head = &jd->extent_list; 309 gfs2_free_journal_extents(jd);
311 while (!list_empty(head)) {
312 jext = list_entry(head->next,
313 struct gfs2_journal_extent,
314 extent_list);
315 list_del(&jext->extent_list);
316 kfree(jext);
317 }
318 list_del(&jd->jd_list); 310 list_del(&jd->jd_list);
319 iput(jd->jd_inode); 311 iput(jd->jd_inode);
320 kfree(jd); 312 kfree(jd);
@@ -1175,6 +1167,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1175 struct gfs2_tune *gt = &sdp->sd_tune; 1167 struct gfs2_tune *gt = &sdp->sd_tune;
1176 int error; 1168 int error;
1177 1169
1170 sync_filesystem(sb);
1171
1178 spin_lock(&gt->gt_spin); 1172 spin_lock(&gt->gt_spin);
1179 args.ar_commit = gt->gt_logd_secs; 1173 args.ar_commit = gt->gt_logd_secs;
1180 args.ar_quota_quantum = gt->gt_quota_quantum; 1174 args.ar_quota_quantum = gt->gt_quota_quantum;
@@ -1256,7 +1250,7 @@ static int gfs2_drop_inode(struct inode *inode)
1256{ 1250{
1257 struct gfs2_inode *ip = GFS2_I(inode); 1251 struct gfs2_inode *ip = GFS2_I(inode);
1258 1252
1259 if (inode->i_nlink) { 1253 if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) {
1260 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; 1254 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1261 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1255 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1262 clear_nlink(inode); 1256 clear_nlink(inode);
@@ -1471,6 +1465,11 @@ static void gfs2_evict_inode(struct inode *inode)
1471 struct gfs2_holder gh; 1465 struct gfs2_holder gh;
1472 int error; 1466 int error;
1473 1467
1468 if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
1469 clear_inode(inode);
1470 return;
1471 }
1472
1474 if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) 1473 if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
1475 goto out; 1474 goto out;
1476 1475
@@ -1558,7 +1557,7 @@ out_unlock:
1558 fs_warn(sdp, "gfs2_evict_inode: %d\n", error); 1557 fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
1559out: 1558out:
1560 /* Case 3 starts here */ 1559 /* Case 3 starts here */
1561 truncate_inode_pages(&inode->i_data, 0); 1560 truncate_inode_pages_final(&inode->i_data);
1562 gfs2_rs_delete(ip, NULL); 1561 gfs2_rs_delete(ip, NULL);
1563 gfs2_ordered_del_inode(ip); 1562 gfs2_ordered_del_inode(ip);
1564 clear_inode(inode); 1563 clear_inode(inode);
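Several hunks in this patch (gfs2 here, and hfs, hfsplus, hostfs, hpfs and the fs/inode.c default path below) make the same conversion in ->evict_inode(). A sketch of the resulting shape, assuming kernel C; example_evict_inode() is hypothetical:

static void example_evict_inode(struct inode *inode)
{
        /* the final truncate: unlike truncate_inode_pages(..., 0), this
         * also reclaims the shadow entries tracked in i_data.nrshadows */
        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);
}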
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d09f6edda0ff..de25d5577e5d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -138,9 +140,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
138 if (simple_strtol(buf, NULL, 0) != 1) 140 if (simple_strtol(buf, NULL, 0) != 1)
139 return -EINVAL; 141 return -EINVAL;
140 142
141 gfs2_lm_withdraw(sdp, 143 gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
142 "GFS2: fsid=%s: withdrawing from cluster at user's request\n", 144
143 sdp->sd_fsname);
144 return len; 145 return len;
145} 146}
146 147
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 2b20d7046bf3..bead90d27bad 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -51,6 +53,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
51 if (revokes) 53 if (revokes)
52 tr->tr_reserved += gfs2_struct2blk(sdp, revokes, 54 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
53 sizeof(u64)); 55 sizeof(u64));
56 INIT_LIST_HEAD(&tr->tr_databuf);
57 INIT_LIST_HEAD(&tr->tr_buf);
58
54 sb_start_intwrite(sdp->sd_vfs); 59 sb_start_intwrite(sdp->sd_vfs);
55 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); 60 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
56 61
@@ -96,14 +101,13 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
96 101
97static void gfs2_print_trans(const struct gfs2_trans *tr) 102static void gfs2_print_trans(const struct gfs2_trans *tr)
98{ 103{
99 printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n", 104 pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
100 (void *)tr->tr_ip); 105 pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n",
101 printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", 106 tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
102 tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); 107 pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
103 printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", 108 tr->tr_num_buf_new, tr->tr_num_buf_rm,
104 tr->tr_num_buf_new, tr->tr_num_buf_rm, 109 tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
105 tr->tr_num_databuf_new, tr->tr_num_databuf_rm, 110 tr->tr_num_revoke, tr->tr_num_revoke_rm);
106 tr->tr_num_revoke, tr->tr_num_revoke_rm);
107} 111}
108 112
109void gfs2_trans_end(struct gfs2_sbd *sdp) 113void gfs2_trans_end(struct gfs2_sbd *sdp)
@@ -210,8 +214,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
210 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 214 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
211 gfs2_pin(sdp, bd->bd_bh); 215 gfs2_pin(sdp, bd->bd_bh);
212 tr->tr_num_databuf_new++; 216 tr->tr_num_databuf_new++;
213 sdp->sd_log_num_databuf++; 217 list_add_tail(&bd->bd_list, &tr->tr_databuf);
214 list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
215 } 218 }
216 gfs2_log_unlock(sdp); 219 gfs2_log_unlock(sdp);
217 unlock_buffer(bh); 220 unlock_buffer(bh);
@@ -230,16 +233,14 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
230 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 233 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
231 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; 234 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
232 if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { 235 if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
233 printk(KERN_ERR 236 pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
234 "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
235 (unsigned long long)bd->bd_bh->b_blocknr); 237 (unsigned long long)bd->bd_bh->b_blocknr);
236 BUG(); 238 BUG();
237 } 239 }
238 gfs2_pin(sdp, bd->bd_bh); 240 gfs2_pin(sdp, bd->bd_bh);
239 mh->__pad0 = cpu_to_be64(0); 241 mh->__pad0 = cpu_to_be64(0);
240 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); 242 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
241 sdp->sd_log_num_buf++; 243 list_add(&bd->bd_list, &tr->tr_buf);
242 list_add(&bd->bd_list, &sdp->sd_log_le_buf);
243 tr->tr_num_buf_new++; 244 tr->tr_num_buf_new++;
244} 245}
245 246
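The two trans.c hunks above move buffer tracking off the global per-superblock lists (sd_log_le_buf, sd_log_le_databuf) and onto lists owned by the transaction itself. A minimal sketch of that ownership change; the struct and function names are illustrative only:

#include <linux/list.h>

struct example_trans {
        struct list_head tr_buf;        /* metadata buffers of this trans */
        struct list_head tr_databuf;    /* data buffers of this trans */
};

static void example_trans_init(struct example_trans *tr)
{
        /* each transaction now carries its own buffer lists */
        INIT_LIST_HEAD(&tr->tr_buf);
        INIT_LIST_HEAD(&tr->tr_databuf);
}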
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f7109f689e61..86d2035ac669 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/spinlock.h> 12#include <linux/spinlock.h>
11#include <linux/completion.h> 13#include <linux/completion.h>
12#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -30,22 +32,27 @@ mempool_t *gfs2_page_pool __read_mostly;
30 32
31void gfs2_assert_i(struct gfs2_sbd *sdp) 33void gfs2_assert_i(struct gfs2_sbd *sdp)
32{ 34{
33 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", 35 fs_emerg(sdp, "fatal assertion failed\n");
34 sdp->sd_fsname);
35} 36}
36 37
37int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) 38int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
38{ 39{
39 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 40 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
40 const struct lm_lockops *lm = ls->ls_ops; 41 const struct lm_lockops *lm = ls->ls_ops;
41 va_list args; 42 va_list args;
43 struct va_format vaf;
42 44
43 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && 45 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
44 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 46 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
45 return 0; 47 return 0;
46 48
47 va_start(args, fmt); 49 va_start(args, fmt);
48 vprintk(fmt, args); 50
51 vaf.fmt = fmt;
52 vaf.va = &args;
53
54 fs_err(sdp, "%pV", &vaf);
55
49 va_end(args); 56 va_end(args);
50 57
51 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { 58 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
@@ -66,7 +73,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
66 } 73 }
67 74
68 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) 75 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
69 panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); 76 panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname);
70 77
71 return -1; 78 return -1;
72} 79}
@@ -82,10 +89,9 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
82{ 89{
83 int me; 90 int me;
84 me = gfs2_lm_withdraw(sdp, 91 me = gfs2_lm_withdraw(sdp,
85 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" 92 "fatal: assertion \"%s\" failed\n"
86 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 93 " function = %s, file = %s, line = %u\n",
87 sdp->sd_fsname, assertion, 94 assertion, function, file, line);
88 sdp->sd_fsname, function, file, line);
89 dump_stack(); 95 dump_stack();
90 return (me) ? -1 : -2; 96 return (me) ? -1 : -2;
91} 97}
@@ -105,11 +111,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
105 return -2; 111 return -2;
106 112
107 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) 113 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
108 printk(KERN_WARNING 114 fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
109 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" 115 assertion, function, file, line);
110 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
111 sdp->sd_fsname, assertion,
112 sdp->sd_fsname, function, file, line);
113 116
114 if (sdp->sd_args.ar_debug) 117 if (sdp->sd_args.ar_debug)
115 BUG(); 118 BUG();
@@ -138,10 +141,8 @@ int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
138{ 141{
139 int rv; 142 int rv;
140 rv = gfs2_lm_withdraw(sdp, 143 rv = gfs2_lm_withdraw(sdp,
141 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 144 "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
142 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 145 function, file, line);
143 sdp->sd_fsname,
144 sdp->sd_fsname, function, file, line);
145 return rv; 146 return rv;
146} 147}
147 148
@@ -157,13 +158,12 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
157 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 158 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
158 int rv; 159 int rv;
159 rv = gfs2_lm_withdraw(sdp, 160 rv = gfs2_lm_withdraw(sdp,
160 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 161 "fatal: filesystem consistency error\n"
161 "GFS2: fsid=%s: inode = %llu %llu\n" 162 " inode = %llu %llu\n"
162 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 163 " function = %s, file = %s, line = %u\n",
163 sdp->sd_fsname, 164 (unsigned long long)ip->i_no_formal_ino,
164 sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, 165 (unsigned long long)ip->i_no_addr,
165 (unsigned long long)ip->i_no_addr, 166 function, file, line);
166 sdp->sd_fsname, function, file, line);
167 return rv; 167 return rv;
168} 168}
169 169
@@ -179,12 +179,11 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
179 struct gfs2_sbd *sdp = rgd->rd_sbd; 179 struct gfs2_sbd *sdp = rgd->rd_sbd;
180 int rv; 180 int rv;
181 rv = gfs2_lm_withdraw(sdp, 181 rv = gfs2_lm_withdraw(sdp,
182 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 182 "fatal: filesystem consistency error\n"
183 "GFS2: fsid=%s: RG = %llu\n" 183 " RG = %llu\n"
184 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 184 " function = %s, file = %s, line = %u\n",
185 sdp->sd_fsname, 185 (unsigned long long)rgd->rd_addr,
186 sdp->sd_fsname, (unsigned long long)rgd->rd_addr, 186 function, file, line);
187 sdp->sd_fsname, function, file, line);
188 return rv; 187 return rv;
189} 188}
190 189
@@ -200,12 +199,11 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
200{ 199{
201 int me; 200 int me;
202 me = gfs2_lm_withdraw(sdp, 201 me = gfs2_lm_withdraw(sdp,
203 "GFS2: fsid=%s: fatal: invalid metadata block\n" 202 "fatal: invalid metadata block\n"
204 "GFS2: fsid=%s: bh = %llu (%s)\n" 203 " bh = %llu (%s)\n"
205 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 204 " function = %s, file = %s, line = %u\n",
206 sdp->sd_fsname, 205 (unsigned long long)bh->b_blocknr, type,
207 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, 206 function, file, line);
208 sdp->sd_fsname, function, file, line);
209 return (me) ? -1 : -2; 207 return (me) ? -1 : -2;
210} 208}
211 209
@@ -221,12 +219,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
221{ 219{
222 int me; 220 int me;
223 me = gfs2_lm_withdraw(sdp, 221 me = gfs2_lm_withdraw(sdp,
224 "GFS2: fsid=%s: fatal: invalid metadata block\n" 222 "fatal: invalid metadata block\n"
225 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" 223 " bh = %llu (type: exp=%u, found=%u)\n"
226 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 224 " function = %s, file = %s, line = %u\n",
227 sdp->sd_fsname, 225 (unsigned long long)bh->b_blocknr, type, t,
228 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, 226 function, file, line);
229 sdp->sd_fsname, function, file, line);
230 return (me) ? -1 : -2; 227 return (me) ? -1 : -2;
231} 228}
232 229
@@ -241,10 +238,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
241{ 238{
242 int rv; 239 int rv;
243 rv = gfs2_lm_withdraw(sdp, 240 rv = gfs2_lm_withdraw(sdp,
244 "GFS2: fsid=%s: fatal: I/O error\n" 241 "fatal: I/O error\n"
245 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 242 " function = %s, file = %s, line = %u\n",
246 sdp->sd_fsname, 243 function, file, line);
247 sdp->sd_fsname, function, file, line);
248 return rv; 244 return rv;
249} 245}
250 246
@@ -259,12 +255,11 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
259{ 255{
260 int rv; 256 int rv;
261 rv = gfs2_lm_withdraw(sdp, 257 rv = gfs2_lm_withdraw(sdp,
262 "GFS2: fsid=%s: fatal: I/O error\n" 258 "fatal: I/O error\n"
263 "GFS2: fsid=%s: block = %llu\n" 259 " block = %llu\n"
264 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 260 " function = %s, file = %s, line = %u\n",
265 sdp->sd_fsname, 261 (unsigned long long)bh->b_blocknr,
266 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, 262 function, file, line);
267 sdp->sd_fsname, function, file, line);
268 return rv; 263 return rv;
269} 264}
270 265
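The gfs2_lm_withdraw() hunk above uses the kernel's %pV extension to forward a caller-supplied format and argument list into fs_err() without formatting it twice. A standalone sketch of the pattern, assuming kernel C; relay_err() is a hypothetical helper:

#include <stdarg.h>
#include <linux/printk.h>

__printf(1, 2)
static void relay_err(const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        pr_err("%pV", &vaf);    /* expands fmt with args exactly once */
        va_end(args);
}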
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index b7ffb09b99ea..cbdcbdf39614 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -10,22 +10,23 @@
10#ifndef __UTIL_DOT_H__ 10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__ 11#define __UTIL_DOT_H__
12 12
13#ifdef pr_fmt
14#undef pr_fmt
15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16#endif
17
13#include <linux/mempool.h> 18#include <linux/mempool.h>
14 19
15#include "incore.h" 20#include "incore.h"
16 21
17#define fs_printk(level, fs, fmt, arg...) \ 22#define fs_emerg(fs, fmt, ...) \
18 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) 23 pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
19 24#define fs_warn(fs, fmt, ...) \
20#define fs_info(fs, fmt, arg...) \ 25 pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
21 fs_printk(KERN_INFO , fs , fmt , ## arg) 26#define fs_err(fs, fmt, ...) \
22 27 pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
23#define fs_warn(fs, fmt, arg...) \ 28#define fs_info(fs, fmt, ...) \
24 fs_printk(KERN_WARNING , fs , fmt , ## arg) 29 pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
25
26#define fs_err(fs, fmt, arg...) \
27 fs_printk(KERN_ERR, fs , fmt , ## arg)
28
29 30
30void gfs2_assert_i(struct gfs2_sbd *sdp); 31void gfs2_assert_i(struct gfs2_sbd *sdp);
31 32
@@ -85,7 +86,7 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
85 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; 86 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
86 u32 magic = be32_to_cpu(mh->mh_magic); 87 u32 magic = be32_to_cpu(mh->mh_magic);
87 if (unlikely(magic != GFS2_MAGIC)) { 88 if (unlikely(magic != GFS2_MAGIC)) {
88 printk(KERN_ERR "GFS2: Magic number missing at %llu\n", 89 pr_err("Magic number missing at %llu\n",
89 (unsigned long long)bh->b_blocknr); 90 (unsigned long long)bh->b_blocknr);
90 return -EIO; 91 return -EIO;
91 } 92 }
@@ -164,7 +165,7 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
164#define gfs2_tune_get(sdp, field) \ 165#define gfs2_tune_get(sdp, field) \
165gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) 166gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
166 167
167int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); 168__printf(2, 3)
169int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...);
168 170
169#endif /* __UTIL_DOT_H__ */ 171#endif /* __UTIL_DOT_H__ */
170
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 380ab31b5e0f..9e2fecd62f62 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -547,7 +547,7 @@ out:
547 547
548void hfs_evict_inode(struct inode *inode) 548void hfs_evict_inode(struct inode *inode)
549{ 549{
550 truncate_inode_pages(&inode->i_data, 0); 550 truncate_inode_pages_final(&inode->i_data);
551 clear_inode(inode); 551 clear_inode(inode);
552 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { 552 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
553 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 553 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 2d2039e754cd..eee7206c38d1 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
112 112
113static int hfs_remount(struct super_block *sb, int *flags, char *data) 113static int hfs_remount(struct super_block *sb, int *flags, char *data)
114{ 114{
115 sync_filesystem(sb);
115 *flags |= MS_NODIRATIME; 116 *flags |= MS_NODIRATIME;
116 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 117 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
117 return 0; 118 return 0;
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 0f47890299c4..caf89a7be0a1 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -11,7 +11,7 @@
11 11
12static struct kmem_cache *hfsplus_attr_tree_cachep; 12static struct kmem_cache *hfsplus_attr_tree_cachep;
13 13
14int hfsplus_create_attr_tree_cache(void) 14int __init hfsplus_create_attr_tree_cache(void)
15{ 15{
16 if (hfsplus_attr_tree_cachep) 16 if (hfsplus_attr_tree_cachep)
17 return -EEXIST; 17 return -EEXIST;
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index fbb212fbb1ef..a7aafb35b624 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -227,10 +227,8 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
227 u32 ablock, dblock, mask; 227 u32 ablock, dblock, mask;
228 sector_t sector; 228 sector_t sector;
229 int was_dirty = 0; 229 int was_dirty = 0;
230 int shift;
231 230
232 /* Convert inode block to disk allocation block */ 231 /* Convert inode block to disk allocation block */
233 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
234 ablock = iblock >> sbi->fs_shift; 232 ablock = iblock >> sbi->fs_shift;
235 233
236 if (iblock >= hip->fs_blocks) { 234 if (iblock >= hip->fs_blocks) {
@@ -498,11 +496,13 @@ int hfsplus_file_extend(struct inode *inode)
498 goto insert_extent; 496 goto insert_extent;
499 } 497 }
500out: 498out:
501 mutex_unlock(&hip->extents_lock);
502 if (!res) { 499 if (!res) {
503 hip->alloc_blocks += len; 500 hip->alloc_blocks += len;
501 mutex_unlock(&hip->extents_lock);
504 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); 502 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
503 return 0;
505 } 504 }
505 mutex_unlock(&hip->extents_lock);
506 return res; 506 return res;
507 507
508insert_extent: 508insert_extent:
@@ -556,11 +556,13 @@ void hfsplus_file_truncate(struct inode *inode)
556 556
557 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> 557 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
558 HFSPLUS_SB(sb)->alloc_blksz_shift; 558 HFSPLUS_SB(sb)->alloc_blksz_shift;
559
560 mutex_lock(&hip->extents_lock);
561
559 alloc_cnt = hip->alloc_blocks; 562 alloc_cnt = hip->alloc_blocks;
560 if (blk_cnt == alloc_cnt) 563 if (blk_cnt == alloc_cnt)
561 goto out; 564 goto out_unlock;
562 565
563 mutex_lock(&hip->extents_lock);
564 res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); 566 res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
565 if (res) { 567 if (res) {
566 mutex_unlock(&hip->extents_lock); 568 mutex_unlock(&hip->extents_lock);
@@ -592,10 +594,10 @@ void hfsplus_file_truncate(struct inode *inode)
592 hfs_brec_remove(&fd); 594 hfs_brec_remove(&fd);
593 } 595 }
594 hfs_find_exit(&fd); 596 hfs_find_exit(&fd);
595 mutex_unlock(&hip->extents_lock);
596 597
597 hip->alloc_blocks = blk_cnt; 598 hip->alloc_blocks = blk_cnt;
598out: 599out_unlock:
600 mutex_unlock(&hip->extents_lock);
599 hip->phys_size = inode->i_size; 601 hip->phys_size = inode->i_size;
600 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> 602 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
601 sb->s_blocksize_bits; 603 sb->s_blocksize_bits;
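Both hfsplus/extents.c hunks above widen the extents mutex so that the alloc_blocks update happens before the unlock rather than after it. The general rule, as a userspace pthreads sketch with illustrative names:

#include <pthread.h>

static pthread_mutex_t extents_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int alloc_blocks;       /* guarded by extents_lock */

static void extend(unsigned int len)
{
        pthread_mutex_lock(&extents_lock);
        /* publish the new count while still holding the lock, so no
         * reader can see the extent records and the count disagree */
        alloc_blocks += len;
        pthread_mutex_unlock(&extents_lock);
}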
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 62d571eb69ba..83dc29286b10 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -367,7 +367,7 @@ typedef int (*search_strategy_t)(struct hfs_bnode *,
367 */ 367 */
368 368
369/* attributes.c */ 369/* attributes.c */
370int hfsplus_create_attr_tree_cache(void); 370int __init hfsplus_create_attr_tree_cache(void);
371void hfsplus_destroy_attr_tree_cache(void); 371void hfsplus_destroy_attr_tree_cache(void);
372hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); 372hfsplus_attr_entry *hfsplus_alloc_attr_entry(void);
373void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); 373void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 80875aa640ef..a513d2d36be9 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -161,7 +161,7 @@ static int hfsplus_write_inode(struct inode *inode,
161static void hfsplus_evict_inode(struct inode *inode) 161static void hfsplus_evict_inode(struct inode *inode)
162{ 162{
163 hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); 163 hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
164 truncate_inode_pages(&inode->i_data, 0); 164 truncate_inode_pages_final(&inode->i_data);
165 clear_inode(inode); 165 clear_inode(inode);
166 if (HFSPLUS_IS_RSRC(inode)) { 166 if (HFSPLUS_IS_RSRC(inode)) {
167 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 167 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
@@ -323,6 +323,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
323 323
324static int hfsplus_remount(struct super_block *sb, int *flags, char *data) 324static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
325{ 325{
326 sync_filesystem(sb);
326 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 327 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
327 return 0; 328 return 0;
328 if (!(*flags & MS_RDONLY)) { 329 if (!(*flags & MS_RDONLY)) {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fe649d325b1f..9c470fde9878 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -230,7 +230,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
230 230
231static void hostfs_evict_inode(struct inode *inode) 231static void hostfs_evict_inode(struct inode *inode)
232{ 232{
233 truncate_inode_pages(&inode->i_data, 0); 233 truncate_inode_pages_final(&inode->i_data);
234 clear_inode(inode); 234 clear_inode(inode);
235 if (HOSTFS_I(inode)->fd != -1) { 235 if (HOSTFS_I(inode)->fd != -1) {
236 close_file(&HOSTFS_I(inode)->fd); 236 close_file(&HOSTFS_I(inode)->fd);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 9edeeb0ea97e..50a427313835 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -304,7 +304,7 @@ void hpfs_write_if_changed(struct inode *inode)
304 304
305void hpfs_evict_inode(struct inode *inode) 305void hpfs_evict_inode(struct inode *inode)
306{ 306{
307 truncate_inode_pages(&inode->i_data, 0); 307 truncate_inode_pages_final(&inode->i_data);
308 clear_inode(inode); 308 clear_inode(inode);
309 if (!inode->i_nlink) { 309 if (!inode->i_nlink) {
310 hpfs_lock(inode->i_sb); 310 hpfs_lock(inode->i_sb);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 4534ff688b76..fe3463a43236 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -421,6 +421,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
421 struct hpfs_sb_info *sbi = hpfs_sb(s); 421 struct hpfs_sb_info *sbi = hpfs_sb(s);
422 char *new_opts = kstrdup(data, GFP_KERNEL); 422 char *new_opts = kstrdup(data, GFP_KERNEL);
423 423
424 sync_filesystem(s);
425
424 *flags |= MS_NOATIME; 426 *flags |= MS_NOATIME;
425 427
426 hpfs_lock(s); 428 hpfs_lock(s);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d19b30ababf1..204027520937 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -366,7 +366,13 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
366 366
367static void hugetlbfs_evict_inode(struct inode *inode) 367static void hugetlbfs_evict_inode(struct inode *inode)
368{ 368{
369 struct resv_map *resv_map;
370
369 truncate_hugepages(inode, 0); 371 truncate_hugepages(inode, 0);
372 resv_map = (struct resv_map *)inode->i_mapping->private_data;
373 /* root inode doesn't have the resv_map, so we should check it */
374 if (resv_map)
375 resv_map_release(&resv_map->refs);
370 clear_inode(inode); 376 clear_inode(inode);
371} 377}
372 378
@@ -476,6 +482,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
476 umode_t mode, dev_t dev) 482 umode_t mode, dev_t dev)
477{ 483{
478 struct inode *inode; 484 struct inode *inode;
485 struct resv_map *resv_map;
486
487 resv_map = resv_map_alloc();
488 if (!resv_map)
489 return NULL;
479 490
480 inode = new_inode(sb); 491 inode = new_inode(sb);
481 if (inode) { 492 if (inode) {
@@ -487,7 +498,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
487 inode->i_mapping->a_ops = &hugetlbfs_aops; 498 inode->i_mapping->a_ops = &hugetlbfs_aops;
488 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 499 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
489 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 500 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
490 INIT_LIST_HEAD(&inode->i_mapping->private_list); 501 inode->i_mapping->private_data = resv_map;
491 info = HUGETLBFS_I(inode); 502 info = HUGETLBFS_I(inode);
492 /* 503 /*
493 * The policy is initialized here even if we are creating a 504 * The policy is initialized here even if we are creating a
@@ -517,7 +528,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
517 break; 528 break;
518 } 529 }
519 lockdep_annotate_inode_mutex_key(inode); 530 lockdep_annotate_inode_mutex_key(inode);
520 } 531 } else
532 kref_put(&resv_map->refs, resv_map_release);
533
521 return inode; 534 return inode;
522} 535}
523 536
diff --git a/fs/inode.c b/fs/inode.c
index 4bcdad3c9361..f96d2a6f88cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -503,6 +503,7 @@ void clear_inode(struct inode *inode)
503 */ 503 */
504 spin_lock_irq(&inode->i_data.tree_lock); 504 spin_lock_irq(&inode->i_data.tree_lock);
505 BUG_ON(inode->i_data.nrpages); 505 BUG_ON(inode->i_data.nrpages);
506 BUG_ON(inode->i_data.nrshadows);
506 spin_unlock_irq(&inode->i_data.tree_lock); 507 spin_unlock_irq(&inode->i_data.tree_lock);
507 BUG_ON(!list_empty(&inode->i_data.private_list)); 508 BUG_ON(!list_empty(&inode->i_data.private_list));
508 BUG_ON(!(inode->i_state & I_FREEING)); 509 BUG_ON(!(inode->i_state & I_FREEING));
@@ -548,8 +549,7 @@ static void evict(struct inode *inode)
548 if (op->evict_inode) { 549 if (op->evict_inode) {
549 op->evict_inode(inode); 550 op->evict_inode(inode);
550 } else { 551 } else {
551 if (inode->i_data.nrpages) 552 truncate_inode_pages_final(&inode->i_data);
552 truncate_inode_pages(&inode->i_data, 0);
553 clear_inode(inode); 553 clear_inode(inode);
554 } 554 }
555 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 555 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -944,24 +944,22 @@ EXPORT_SYMBOL(unlock_new_inode);
944 944
945/** 945/**
946 * lock_two_nondirectories - take two i_mutexes on non-directory objects 946 * lock_two_nondirectories - take two i_mutexes on non-directory objects
947 *
948 * Lock any non-NULL argument that is not a directory.
949 * Zero, one or two objects may be locked by this function.
950 *
947 * @inode1: first inode to lock 951 * @inode1: first inode to lock
948 * @inode2: second inode to lock 952 * @inode2: second inode to lock
949 */ 953 */
950void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) 954void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
951{ 955{
952 WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); 956 if (inode1 > inode2)
953 if (inode1 == inode2 || !inode2) { 957 swap(inode1, inode2);
954 mutex_lock(&inode1->i_mutex); 958
955 return; 959 if (inode1 && !S_ISDIR(inode1->i_mode))
956 }
957 WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
958 if (inode1 < inode2) {
959 mutex_lock(&inode1->i_mutex); 960 mutex_lock(&inode1->i_mutex);
961 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
960 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); 962 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
961 } else {
962 mutex_lock(&inode2->i_mutex);
963 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_NONDIR2);
964 }
965} 963}
966EXPORT_SYMBOL(lock_two_nondirectories); 964EXPORT_SYMBOL(lock_two_nondirectories);
967 965
@@ -972,8 +970,9 @@ EXPORT_SYMBOL(lock_two_nondirectories);
972 */ 970 */
973void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) 971void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
974{ 972{
975 mutex_unlock(&inode1->i_mutex); 973 if (inode1 && !S_ISDIR(inode1->i_mode))
976 if (inode2 && inode2 != inode1) 974 mutex_unlock(&inode1->i_mutex);
975 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
977 mutex_unlock(&inode2->i_mutex); 976 mutex_unlock(&inode2->i_mutex);
978} 977}
979EXPORT_SYMBOL(unlock_two_nondirectories); 978EXPORT_SYMBOL(unlock_two_nondirectories);
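The reworked lock_two_nondirectories() above reduces the old case analysis to one rule: order the two locks by object address, and skip NULLs, directories and duplicates. The deadlock-avoidance core, as a standalone userspace sketch (pthreads; lock_pair() is hypothetical):

#include <pthread.h>

/* acquire two locks in a stable (address) order to avoid ABBA deadlock */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_t *tmp;

        if (a > b) {            /* lower address always locked first */
                tmp = a;
                a = b;
                b = tmp;
        }
        if (a)
                pthread_mutex_lock(a);
        if (b && b != a)
                pthread_mutex_lock(b);
}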
@@ -1899,3 +1898,34 @@ void inode_dio_done(struct inode *inode)
1899 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); 1898 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
1900} 1899}
1901EXPORT_SYMBOL(inode_dio_done); 1900EXPORT_SYMBOL(inode_dio_done);
1901
1902/*
1903 * inode_set_flags - atomically set some inode flags
1904 *
1905 * Note: the caller should be holding i_mutex, or else be sure that
1906 * they have exclusive access to the inode structure (i.e., while the
1907 * inode is being instantiated). The reason for the cmpxchg() loop
1908 * --- which wouldn't be necessary if all code paths which modify
1909 * i_flags actually followed this rule, is that there is at least one
1910 * code path which doesn't today --- for example,
1911 * __generic_file_aio_write() calls file_remove_suid() without holding
1912 * i_mutex --- so we use cmpxchg() out of an abundance of caution.
1913 *
1914 * In the long run, i_mutex is overkill, and we should probably look
1915 * at using the i_lock spinlock to protect i_flags, and then make sure
1916 * it is so documented in include/linux/fs.h and that all code follows
1917 * the locking convention!!
1918 */
1919void inode_set_flags(struct inode *inode, unsigned int flags,
1920 unsigned int mask)
1921{
1922 unsigned int old_flags, new_flags;
1923
1924 WARN_ON_ONCE(flags & ~mask);
1925 do {
1926 old_flags = ACCESS_ONCE(inode->i_flags);
1927 new_flags = (old_flags & ~mask) | flags;
1928 } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
1929 new_flags) != old_flags));
1930}
1931EXPORT_SYMBOL(inode_set_flags);
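inode_set_flags() above is a lock-free read-modify-write on i_flags. For comparison, the same loop in portable C11 atomics; set_flags() is a hypothetical standalone version, not a kernel API:

#include <stdatomic.h>

static void set_flags(_Atomic unsigned int *word,
                      unsigned int flags, unsigned int mask)
{
        unsigned int old = atomic_load(word);
        unsigned int new;

        do {
                new = (old & ~mask) | flags;    /* clear mask, set flags */
                /* on failure, 'old' is reloaded with the current value */
        } while (!atomic_compare_exchange_weak(word, &old, new));
}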
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4a9e10ea13f2..4556ce1af5b0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -93,7 +93,7 @@ static void init_once(void *foo)
93 inode_init_once(&ei->vfs_inode); 93 inode_init_once(&ei->vfs_inode);
94} 94}
95 95
96static int init_inodecache(void) 96static int __init init_inodecache(void)
97{ 97{
98 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", 98 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
99 sizeof(struct iso_inode_info), 99 sizeof(struct iso_inode_info),
@@ -117,6 +117,7 @@ static void destroy_inodecache(void)
117 117
118static int isofs_remount(struct super_block *sb, int *flags, char *data) 118static int isofs_remount(struct super_block *sb, int *flags, char *data)
119{ 119{
120 sync_filesystem(sb);
120 if (!(*flags & MS_RDONLY)) 121 if (!(*flags & MS_RDONLY))
121 return -EROFS; 122 return -EROFS;
122 return 0; 123 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index cf2fc0594063..5f26139a165a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
555 blk_start_plug(&plug); 555 blk_start_plug(&plug);
556 jbd2_journal_write_revoke_records(journal, commit_transaction, 556 jbd2_journal_write_revoke_records(journal, commit_transaction,
557 &log_bufs, WRITE_SYNC); 557 &log_bufs, WRITE_SYNC);
558 blk_finish_plug(&plug);
559 558
560 jbd_debug(3, "JBD2: commit phase 2b\n"); 559 jbd_debug(3, "JBD2: commit phase 2b\n");
561 560
@@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
582 err = 0; 581 err = 0;
583 bufs = 0; 582 bufs = 0;
584 descriptor = NULL; 583 descriptor = NULL;
585 blk_start_plug(&plug);
586 while (commit_transaction->t_buffers) { 584 while (commit_transaction->t_buffers) {
587 585
588 /* Find the next buffer to be journaled... */ 586 /* Find the next buffer to be journaled... */
@@ -1067,6 +1065,25 @@ restart_loop:
1067 goto restart_loop; 1065 goto restart_loop;
1068 } 1066 }
1069 1067
1068 /* Add the transaction to the checkpoint list
1069 * __journal_remove_checkpoint() can not destroy transaction
1070 * under us because it is not marked as T_FINISHED yet */
1071 if (journal->j_checkpoint_transactions == NULL) {
1072 journal->j_checkpoint_transactions = commit_transaction;
1073 commit_transaction->t_cpnext = commit_transaction;
1074 commit_transaction->t_cpprev = commit_transaction;
1075 } else {
1076 commit_transaction->t_cpnext =
1077 journal->j_checkpoint_transactions;
1078 commit_transaction->t_cpprev =
1079 commit_transaction->t_cpnext->t_cpprev;
1080 commit_transaction->t_cpnext->t_cpprev =
1081 commit_transaction;
1082 commit_transaction->t_cpprev->t_cpnext =
1083 commit_transaction;
1084 }
1085 spin_unlock(&journal->j_list_lock);
1086
1070 /* Done with this transaction! */ 1087 /* Done with this transaction! */
1071 1088
1072 jbd_debug(3, "JBD2: commit phase 7\n"); 1089 jbd_debug(3, "JBD2: commit phase 7\n");
@@ -1085,24 +1102,7 @@ restart_loop:
1085 atomic_read(&commit_transaction->t_handle_count); 1102 atomic_read(&commit_transaction->t_handle_count);
1086 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1103 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1087 commit_transaction->t_tid, &stats.run); 1104 commit_transaction->t_tid, &stats.run);
1088 1105 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1089 /*
1090 * Calculate overall stats
1091 */
1092 spin_lock(&journal->j_history_lock);
1093 journal->j_stats.ts_tid++;
1094 if (commit_transaction->t_requested)
1095 journal->j_stats.ts_requested++;
1096 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1097 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1098 journal->j_stats.run.rs_running += stats.run.rs_running;
1099 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1100 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1101 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1102 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1103 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1104 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1105 spin_unlock(&journal->j_history_lock);
1106 1106
1107 commit_transaction->t_state = T_COMMIT_CALLBACK; 1107 commit_transaction->t_state = T_COMMIT_CALLBACK;
1108 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1108 J_ASSERT(commit_transaction == journal->j_committing_transaction);
@@ -1122,24 +1122,6 @@ restart_loop:
1122 1122
1123 write_unlock(&journal->j_state_lock); 1123 write_unlock(&journal->j_state_lock);
1124 1124
1125 if (journal->j_checkpoint_transactions == NULL) {
1126 journal->j_checkpoint_transactions = commit_transaction;
1127 commit_transaction->t_cpnext = commit_transaction;
1128 commit_transaction->t_cpprev = commit_transaction;
1129 } else {
1130 commit_transaction->t_cpnext =
1131 journal->j_checkpoint_transactions;
1132 commit_transaction->t_cpprev =
1133 commit_transaction->t_cpnext->t_cpprev;
1134 commit_transaction->t_cpnext->t_cpprev =
1135 commit_transaction;
1136 commit_transaction->t_cpprev->t_cpnext =
1137 commit_transaction;
1138 }
1139 spin_unlock(&journal->j_list_lock);
1140 /* Drop all spin_locks because commit_callback may be block.
1141 * __journal_remove_checkpoint() can not destroy transaction
1142 * under us because it is not marked as T_FINISHED yet */
1143 if (journal->j_commit_callback) 1125 if (journal->j_commit_callback)
1144 journal->j_commit_callback(journal, commit_transaction); 1126 journal->j_commit_callback(journal, commit_transaction);
1145 1127
@@ -1150,7 +1132,7 @@ restart_loop:
1150 write_lock(&journal->j_state_lock); 1132 write_lock(&journal->j_state_lock);
1151 spin_lock(&journal->j_list_lock); 1133 spin_lock(&journal->j_list_lock);
1152 commit_transaction->t_state = T_FINISHED; 1134 commit_transaction->t_state = T_FINISHED;
1153 /* Recheck checkpoint lists after j_list_lock was dropped */ 1135 /* Check if the transaction can be dropped now that we are finished */
1154 if (commit_transaction->t_checkpoint_list == NULL && 1136 if (commit_transaction->t_checkpoint_list == NULL &&
1155 commit_transaction->t_checkpoint_io_list == NULL) { 1137 commit_transaction->t_checkpoint_io_list == NULL) {
1156 __jbd2_journal_drop_transaction(journal, commit_transaction); 1138 __jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -1159,4 +1141,21 @@ restart_loop:
1159 spin_unlock(&journal->j_list_lock); 1141 spin_unlock(&journal->j_list_lock);
1160 write_unlock(&journal->j_state_lock); 1142 write_unlock(&journal->j_state_lock);
1161 wake_up(&journal->j_wait_done_commit); 1143 wake_up(&journal->j_wait_done_commit);
1144
1145 /*
1146 * Calculate overall stats
1147 */
1148 spin_lock(&journal->j_history_lock);
1149 journal->j_stats.ts_tid++;
1150 journal->j_stats.ts_requested += stats.ts_requested;
1151 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1152 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1153 journal->j_stats.run.rs_running += stats.run.rs_running;
1154 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1155 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1156 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1157 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1158 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1159 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1160 spin_unlock(&journal->j_history_lock);
1162} 1161}
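
Taken together, the commit.c hunks also move the fold of per-commit statistics into journal->j_stats from before the T_FINISHED transition to after wake_up(&journal->j_wait_done_commit), so waiters are released before the bookkeeping runs. A hedged sketch of the ordering; "work" and its fields are illustrative, not jbd2's:

struct work {
	spinlock_t         state_lock;
	spinlock_t         stats_lock;
	int                state;
	wait_queue_head_t  done_waitq;
	struct { unsigned long runs, blocks; } totals, this_run;
};

#define WORK_FINISHED 1

static void finish_then_account(struct work *w)
{
	spin_lock(&w->state_lock);
	w->state = WORK_FINISHED;
	spin_unlock(&w->state_lock);
	wake_up(&w->done_waitq);           /* waiters can proceed at once */

	spin_lock(&w->stats_lock);         /* bookkeeping off the hot path */
	w->totals.runs++;
	w->totals.blocks += w->this_run.blocks;
	spin_unlock(&w->stats_lock);
}
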
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5fa344afb49a..67b8e303946c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug);
122#endif 122#endif
123 123
124/* Checksumming functions */ 124/* Checksumming functions */
125int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
126{ 126{
127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
128 return 1; 128 return 1;
@@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
143 return cpu_to_be32(csum); 143 return cpu_to_be32(csum);
144} 144}
145 145
146int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
147{ 147{
148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
149 return 1; 149 return 1;
@@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
151 return sb->s_checksum == jbd2_superblock_csum(j, sb); 151 return sb->s_checksum == jbd2_superblock_csum(j, sb);
152} 152}
153 153
154void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
155{ 155{
156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
157 return; 157 return;
@@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal)
302 journal->j_flags |= JBD2_UNMOUNT; 302 journal->j_flags |= JBD2_UNMOUNT;
303 303
304 while (journal->j_task) { 304 while (journal->j_task) {
305 wake_up(&journal->j_wait_commit);
306 write_unlock(&journal->j_state_lock); 305 write_unlock(&journal->j_state_lock);
306 wake_up(&journal->j_wait_commit);
307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
308 write_lock(&journal->j_state_lock); 308 write_lock(&journal->j_state_lock);
309 } 309 }
@@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
710 while (tid_gt(tid, journal->j_commit_sequence)) { 710 while (tid_gt(tid, journal->j_commit_sequence)) {
711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", 711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
712 tid, journal->j_commit_sequence); 712 tid, journal->j_commit_sequence);
713 wake_up(&journal->j_wait_commit);
714 read_unlock(&journal->j_state_lock); 713 read_unlock(&journal->j_state_lock);
714 wake_up(&journal->j_wait_commit);
715 wait_event(journal->j_wait_done_commit, 715 wait_event(journal->j_wait_done_commit,
716 !tid_gt(tid, journal->j_commit_sequence)); 716 !tid_gt(tid, journal->j_commit_sequence));
717 read_lock(&journal->j_state_lock); 717 read_lock(&journal->j_state_lock);
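
Both journal.c hunks reorder the same pair of operations: the lock guarding the waited-on state is dropped before wake_up() is issued, so the woken task does not immediately block on a lock the waker still holds. A minimal sketch of the pattern; struct and field names are illustrative:

struct ctx {
	spinlock_t        lock;
	bool              done;
	wait_queue_head_t waitq;
};

static void signal_done(struct ctx *c)
{
	spin_lock(&c->lock);
	c->done = true;
	spin_unlock(&c->lock);   /* release first ... */
	wake_up(&c->waitq);      /* ... then wake; the waiter re-checks c->done */
}
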
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 60bb365f54a5..38cfcf5f6fce 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1073 * reused here. 1073 * reused here.
1074 */ 1074 */
1075 jbd_lock_bh_state(bh); 1075 jbd_lock_bh_state(bh);
1076 spin_lock(&journal->j_list_lock);
1077 J_ASSERT_JH(jh, (jh->b_transaction == transaction || 1076 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
1078 jh->b_transaction == NULL || 1077 jh->b_transaction == NULL ||
1079 (jh->b_transaction == journal->j_committing_transaction && 1078 (jh->b_transaction == journal->j_committing_transaction &&
@@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1096 jh->b_modified = 0; 1095 jh->b_modified = 0;
1097 1096
1098 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 1097 JBUFFER_TRACE(jh, "file as BJ_Reserved");
1098 spin_lock(&journal->j_list_lock);
1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); 1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1100 } else if (jh->b_transaction == journal->j_committing_transaction) { 1100 } else if (jh->b_transaction == journal->j_committing_transaction) {
1101 /* first access by this transaction */ 1101 /* first access by this transaction */
1102 jh->b_modified = 0; 1102 jh->b_modified = 0;
1103 1103
1104 JBUFFER_TRACE(jh, "set next transaction"); 1104 JBUFFER_TRACE(jh, "set next transaction");
1105 spin_lock(&journal->j_list_lock);
1105 jh->b_next_transaction = transaction; 1106 jh->b_next_transaction = transaction;
1106 } 1107 }
1107 spin_unlock(&journal->j_list_lock); 1108 spin_unlock(&journal->j_list_lock);
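
This hunk narrows j_list_lock to the two branches that actually modify buffer lists; the assertions in between only read state that is already stable under jbd_lock_bh_state(), which the caller holds. A distilled sketch of the narrowed critical section, reusing the hunk's identifiers but not the full function body:

static void file_reserved(journal_t *journal, struct journal_head *jh,
			  transaction_t *transaction)
{
	/* the checks need only the bh-state lock the caller holds */
	J_ASSERT_JH(jh, jh->b_transaction == NULL);
	jh->b_modified = 0;

	spin_lock(&journal->j_list_lock);        /* only for the list op */
	__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
	spin_unlock(&journal->j_list_lock);
}
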
@@ -1312,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1312 journal->j_running_transaction)) { 1313 journal->j_running_transaction)) {
1313 printk(KERN_ERR "JBD2: %s: " 1314 printk(KERN_ERR "JBD2: %s: "
1314 "jh->b_transaction (%llu, %p, %u) != " 1315 "jh->b_transaction (%llu, %p, %u) != "
1315 "journal->j_running_transaction (%p, %u)", 1316 "journal->j_running_transaction (%p, %u)\n",
1316 journal->j_devname, 1317 journal->j_devname,
1317 (unsigned long long) bh->b_blocknr, 1318 (unsigned long long) bh->b_blocknr,
1318 jh->b_transaction, 1319 jh->b_transaction,
@@ -1335,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1335 */ 1336 */
1336 if (jh->b_transaction != transaction) { 1337 if (jh->b_transaction != transaction) {
1337 JBUFFER_TRACE(jh, "already on other transaction"); 1338 JBUFFER_TRACE(jh, "already on other transaction");
1338 if (unlikely(jh->b_transaction != 1339 if (unlikely(((jh->b_transaction !=
1339 journal->j_committing_transaction)) { 1340 journal->j_committing_transaction)) ||
1340 printk(KERN_ERR "JBD2: %s: " 1341 (jh->b_next_transaction != transaction))) {
1341 "jh->b_transaction (%llu, %p, %u) != " 1342 printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
1342 "journal->j_committing_transaction (%p, %u)", 1343 "bad jh for block %llu: "
1344 "transaction (%p, %u), "
1345 "jh->b_transaction (%p, %u), "
1346 "jh->b_next_transaction (%p, %u), jlist %u\n",
1343 journal->j_devname, 1347 journal->j_devname,
1344 (unsigned long long) bh->b_blocknr, 1348 (unsigned long long) bh->b_blocknr,
1349 transaction, transaction->t_tid,
1345 jh->b_transaction, 1350 jh->b_transaction,
1346 jh->b_transaction ? jh->b_transaction->t_tid : 0, 1351 jh->b_transaction ?
1347 journal->j_committing_transaction, 1352 jh->b_transaction->t_tid : 0,
1348 journal->j_committing_transaction ?
1349 journal->j_committing_transaction->t_tid : 0);
1350 ret = -EINVAL;
1351 }
1352 if (unlikely(jh->b_next_transaction != transaction)) {
1353 printk(KERN_ERR "JBD2: %s: "
1354 "jh->b_next_transaction (%llu, %p, %u) != "
1355 "transaction (%p, %u)",
1356 journal->j_devname,
1357 (unsigned long long) bh->b_blocknr,
1358 jh->b_next_transaction, 1353 jh->b_next_transaction,
1359 jh->b_next_transaction ? 1354 jh->b_next_transaction ?
1360 jh->b_next_transaction->t_tid : 0, 1355 jh->b_next_transaction->t_tid : 0,
1361 transaction, transaction->t_tid); 1356 jh->b_jlist);
1357 WARN_ON(1);
1362 ret = -EINVAL; 1358 ret = -EINVAL;
1363 } 1359 }
1364 /* And this case is illegal: we can't reuse another 1360 /* And this case is illegal: we can't reuse another
@@ -1415,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1415 BUFFER_TRACE(bh, "entry"); 1411 BUFFER_TRACE(bh, "entry");
1416 1412
1417 jbd_lock_bh_state(bh); 1413 jbd_lock_bh_state(bh);
1418 spin_lock(&journal->j_list_lock);
1419 1414
1420 if (!buffer_jbd(bh)) 1415 if (!buffer_jbd(bh))
1421 goto not_jbd; 1416 goto not_jbd;
@@ -1468,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1468 * we know to remove the checkpoint after we commit. 1463 * we know to remove the checkpoint after we commit.
1469 */ 1464 */
1470 1465
1466 spin_lock(&journal->j_list_lock);
1471 if (jh->b_cp_transaction) { 1467 if (jh->b_cp_transaction) {
1472 __jbd2_journal_temp_unlink_buffer(jh); 1468 __jbd2_journal_temp_unlink_buffer(jh);
1473 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1469 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
@@ -1480,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1480 goto drop; 1476 goto drop;
1481 } 1477 }
1482 } 1478 }
1479 spin_unlock(&journal->j_list_lock);
1483 } else if (jh->b_transaction) { 1480 } else if (jh->b_transaction) {
1484 J_ASSERT_JH(jh, (jh->b_transaction == 1481 J_ASSERT_JH(jh, (jh->b_transaction ==
1485 journal->j_committing_transaction)); 1482 journal->j_committing_transaction));
@@ -1491,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1491 1488
1492 if (jh->b_next_transaction) { 1489 if (jh->b_next_transaction) {
1493 J_ASSERT(jh->b_next_transaction == transaction); 1490 J_ASSERT(jh->b_next_transaction == transaction);
1491 spin_lock(&journal->j_list_lock);
1494 jh->b_next_transaction = NULL; 1492 jh->b_next_transaction = NULL;
1493 spin_unlock(&journal->j_list_lock);
1495 1494
1496 /* 1495 /*
1497 * only drop a reference if this transaction modified 1496 * only drop a reference if this transaction modified
@@ -1503,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1503 } 1502 }
1504 1503
1505not_jbd: 1504not_jbd:
1506 spin_unlock(&journal->j_list_lock);
1507 jbd_unlock_bh_state(bh); 1505 jbd_unlock_bh_state(bh);
1508 __brelse(bh); 1506 __brelse(bh);
1509drop: 1507drop:
@@ -1821,11 +1819,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1821 if (buffer_locked(bh) || buffer_dirty(bh)) 1819 if (buffer_locked(bh) || buffer_dirty(bh))
1822 goto out; 1820 goto out;
1823 1821
1824 if (jh->b_next_transaction != NULL) 1822 if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
1825 goto out; 1823 goto out;
1826 1824
1827 spin_lock(&journal->j_list_lock); 1825 spin_lock(&journal->j_list_lock);
1828 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1826 if (jh->b_cp_transaction != NULL) {
1829 /* written-back checkpointed metadata buffer */ 1827 /* written-back checkpointed metadata buffer */
1830 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1828 JBUFFER_TRACE(jh, "remove from checkpoint list");
1831 __jbd2_journal_remove_checkpoint(jh); 1829 __jbd2_journal_remove_checkpoint(jh);
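
The last hunk hoists the jh->b_transaction test out of the j_list_lock region and pairs it with the existing b_next_transaction test, rejecting clearly-busy buffers before the lock is taken; this relies on those fields being stable under the bh-state lock the caller already holds. The general fast-path shape, with hypothetical names:

struct obj {
	void      *owner, *next_owner;
	bool       on_checkpoint_list;
	spinlock_t list_lock;
};

static void try_free(struct obj *o)
{
	/* cheap unlocked rejection of clearly-busy objects */
	if (o->owner || o->next_owner)
		return;

	spin_lock(&o->list_lock);
	if (o->on_checkpoint_list)       /* act only under the lock */
		checkpoint_list_del(o);  /* hypothetical helper */
	spin_unlock(&o->list_lock);
}
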
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 16a5047903a6..406d9cc84ba8 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -33,7 +33,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
33 unsigned char *cpage_out, 33 unsigned char *cpage_out,
34 uint32_t *sourcelen, uint32_t *dstlen) 34 uint32_t *sourcelen, uint32_t *dstlen)
35{ 35{
36 short positions[256]; 36 unsigned short positions[256];
37 int outpos = 0; 37 int outpos = 0;
38 int pos=0; 38 int pos=0;
39 39
@@ -74,7 +74,7 @@ static int jffs2_rtime_decompress(unsigned char *data_in,
74 unsigned char *cpage_out, 74 unsigned char *cpage_out,
75 uint32_t srclen, uint32_t destlen) 75 uint32_t srclen, uint32_t destlen)
76{ 76{
77 short positions[256]; 77 unsigned short positions[256];
78 int outpos = 0; 78 int outpos = 0;
79 int pos=0; 79 int pos=0;
80 80
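
The only change in compr_rtime.c is the element type of positions[], which caches, for each byte value, the last offset at which it was seen. With a plain (signed) short, a cached offset above SHRT_MAX wraps negative and corrupts the back-reference arithmetic; unsigned short keeps the full 0..65535 range. In miniature:

short          s = (short)40000;   /* implementation-defined; typically -25536 */
unsigned short u = 40000;          /* exactly 40000, as the offset cache needs */
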
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index a69e426435dd..601afd1afddf 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -242,7 +242,7 @@ void jffs2_evict_inode (struct inode *inode)
242 242
243 jffs2_dbg(1, "%s(): ino #%lu mode %o\n", 243 jffs2_dbg(1, "%s(): ino #%lu mode %o\n",
244 __func__, inode->i_ino, inode->i_mode); 244 __func__, inode->i_ino, inode->i_mode);
245 truncate_inode_pages(&inode->i_data, 0); 245 truncate_inode_pages_final(&inode->i_data);
246 clear_inode(inode); 246 clear_inode(inode);
247 jffs2_do_clear_inode(c, f); 247 jffs2_do_clear_inode(c, f);
248} 248}
@@ -457,12 +457,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
457 The umask is only applied if there's no default ACL */ 457 The umask is only applied if there's no default ACL */
458 ret = jffs2_init_acl_pre(dir_i, inode, &mode); 458 ret = jffs2_init_acl_pre(dir_i, inode, &mode);
459 if (ret) { 459 if (ret) {
460 make_bad_inode(inode); 460 mutex_unlock(&f->sem);
461 iput(inode); 461 make_bad_inode(inode);
462 return ERR_PTR(ret); 462 iput(inode);
463 return ERR_PTR(ret);
463 } 464 }
464 ret = jffs2_do_new_inode (c, f, mode, ri); 465 ret = jffs2_do_new_inode (c, f, mode, ri);
465 if (ret) { 466 if (ret) {
467 mutex_unlock(&f->sem);
466 make_bad_inode(inode); 468 make_bad_inode(inode);
467 iput(inode); 469 iput(inode);
468 return ERR_PTR(ret); 470 return ERR_PTR(ret);
@@ -479,6 +481,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
479 inode->i_size = 0; 481 inode->i_size = 0;
480 482
481 if (insert_inode_locked(inode) < 0) { 483 if (insert_inode_locked(inode) < 0) {
484 mutex_unlock(&f->sem);
482 make_bad_inode(inode); 485 make_bad_inode(inode);
483 iput(inode); 486 iput(inode);
484 return ERR_PTR(-EINVAL); 487 return ERR_PTR(-EINVAL);
@@ -687,7 +690,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
687 struct inode *inode = OFNI_EDONI_2SFFJ(f); 690 struct inode *inode = OFNI_EDONI_2SFFJ(f);
688 struct page *pg; 691 struct page *pg;
689 692
690 pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, 693 pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
691 (void *)jffs2_do_readpage_unlock, inode); 694 (void *)jffs2_do_readpage_unlock, inode);
692 if (IS_ERR(pg)) 695 if (IS_ERR(pg))
693 return (void *)pg; 696 return (void *)pg;
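
Three error paths in jffs2_new_inode() gain a mutex_unlock(&f->sem) before tearing the inode down; previously each returned with f->sem still held, leaking the lock. The rule the fix enforces, sketched with the hunk's own identifiers and a hypothetical failing step:

mutex_lock(&f->sem);
ret = some_init_step(c, f);         /* stands in for any failing step */
if (ret) {
	mutex_unlock(&f->sem);      /* must precede the early return */
	make_bad_inode(inode);
	iput(inode);
	return ERR_PTR(ret);
}
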
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index e4619b00f7c5..fa35ff79ab35 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -231,7 +231,7 @@ struct jffs2_tmp_dnode_info
231 uint32_t version; 231 uint32_t version;
232 uint32_t data_crc; 232 uint32_t data_crc;
233 uint32_t partial_crc; 233 uint32_t partial_crc;
234 uint16_t csize; 234 uint32_t csize;
235 uint16_t overlapped; 235 uint16_t overlapped;
236}; 236};
237 237
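
csize in struct jffs2_tmp_dnode_info caches a compressed size that is 32 bits wide on the medium; holding it in a uint16_t silently truncates any value of 64 KiB or more. A minimal illustration of the truncation:

uint32_t on_media = 0x12345;             /* 74565-byte compressed size */
uint16_t narrow   = (uint16_t)on_media;  /* 0x2345 == 9029: high bits lost */
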
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 03310721712f..b6bd4affd9ad 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -179,6 +179,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
179 spin_unlock(&c->erase_completion_lock); 179 spin_unlock(&c->erase_completion_lock);
180 180
181 schedule(); 181 schedule();
182 remove_wait_queue(&c->erase_wait, &wait);
182 } else 183 } else
183 spin_unlock(&c->erase_completion_lock); 184 spin_unlock(&c->erase_completion_lock);
184 } else if (ret) 185 } else if (ret)
@@ -211,20 +212,25 @@ out:
211int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, 212int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
212 uint32_t *len, uint32_t sumsize) 213 uint32_t *len, uint32_t sumsize)
213{ 214{
214 int ret = -EAGAIN; 215 int ret;
215 minsize = PAD(minsize); 216 minsize = PAD(minsize);
216 217
217 jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); 218 jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize);
218 219
219 spin_lock(&c->erase_completion_lock); 220 while (true) {
220 while(ret == -EAGAIN) { 221 spin_lock(&c->erase_completion_lock);
221 ret = jffs2_do_reserve_space(c, minsize, len, sumsize); 222 ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
222 if (ret) { 223 if (ret) {
223 jffs2_dbg(1, "%s(): looping, ret is %d\n", 224 jffs2_dbg(1, "%s(): looping, ret is %d\n",
224 __func__, ret); 225 __func__, ret);
225 } 226 }
227 spin_unlock(&c->erase_completion_lock);
228
229 if (ret == -EAGAIN)
230 cond_resched();
231 else
232 break;
226 } 233 }
227 spin_unlock(&c->erase_completion_lock);
228 if (!ret) 234 if (!ret)
229 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); 235 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
230 236
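
The rewritten loop in jffs2_reserve_space_gc() releases erase_completion_lock after every attempt and calls cond_resched() between -EAGAIN retries, instead of retrying with the spinlock held; an earlier hunk in the same file also balances the wait-queue setup in jffs2_reserve_space() with remove_wait_queue() after schedule(). The distilled loop shape:

int ret;

while (true) {
	spin_lock(&c->erase_completion_lock);
	ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
	spin_unlock(&c->erase_completion_lock);

	if (ret != -EAGAIN)
		break;
	cond_resched();     /* yield between retries; avoids a soft lockup */
}
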
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0defb1cc2a35..0918f0e2e266 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
244 int err; 244 int err;
245 245
246 sync_filesystem(sb);
246 err = jffs2_parse_options(c, data); 247 err = jffs2_parse_options(c, data);
247 if (err) 248 if (err)
248 return -EINVAL; 249 return -EINVAL;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f4aab719add5..6f8fe72c2a7a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -154,7 +154,7 @@ void jfs_evict_inode(struct inode *inode)
154 dquot_initialize(inode); 154 dquot_initialize(inode);
155 155
156 if (JFS_IP(inode)->fileset == FILESYSTEM_I) { 156 if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
157 truncate_inode_pages(&inode->i_data, 0); 157 truncate_inode_pages_final(&inode->i_data);
158 158
159 if (test_cflag(COMMIT_Freewmap, inode)) 159 if (test_cflag(COMMIT_Freewmap, inode))
160 jfs_free_zero_link(inode); 160 jfs_free_zero_link(inode);
@@ -168,7 +168,7 @@ void jfs_evict_inode(struct inode *inode)
168 dquot_free_inode(inode); 168 dquot_free_inode(inode);
169 } 169 }
170 } else { 170 } else {
171 truncate_inode_pages(&inode->i_data, 0); 171 truncate_inode_pages_final(&inode->i_data);
172 } 172 }
173 clear_inode(inode); 173 clear_inode(inode);
174 dquot_drop(inode); 174 dquot_drop(inode);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index e2b7483444fd..97f7fda51890 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -418,6 +418,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
418 int flag = JFS_SBI(sb)->flag; 418 int flag = JFS_SBI(sb)->flag;
419 int ret; 419 int ret;
420 420
421 sync_filesystem(sb);
421 if (!parse_options(data, sb, &newLVSize, &flag)) { 422 if (!parse_options(data, sb, &newLVSize, &flag)) {
422 return -EINVAL; 423 return -EINVAL;
423 } 424 }
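
Both jffs2_remount_fs() and jfs_remount() gain the same first statement: sync_filesystem() runs before any new options are parsed, so pending dirty state is written out under the semantics the filesystem was originally mounted with. The shape of the rule; parse_and_apply() is a hypothetical stand-in for the per-filesystem work:

static int example_remount(struct super_block *sb, int *flags, char *data)
{
	sync_filesystem(sb);                 /* flush first, under old options */
	return parse_and_apply(sb, data);    /* hypothetical per-fs remount work */
}
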
diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig
new file mode 100644
index 000000000000..397b5f7a7a16
--- /dev/null
+++ b/fs/kernfs/Kconfig
@@ -0,0 +1,7 @@
1#
2# KERNFS should be selected by its users
3#
4
5config KERNFS
6 bool
7 default n
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index bd6e18be6e1a..78f3403300af 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -8,6 +8,7 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/sched.h>
11#include <linux/fs.h> 12#include <linux/fs.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/idr.h> 14#include <linux/idr.h>
@@ -18,9 +19,162 @@
18#include "kernfs-internal.h" 19#include "kernfs-internal.h"
19 20
20DEFINE_MUTEX(kernfs_mutex); 21DEFINE_MUTEX(kernfs_mutex);
22static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
23static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
21 24
22#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) 25#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
23 26
27static bool kernfs_active(struct kernfs_node *kn)
28{
29 lockdep_assert_held(&kernfs_mutex);
30 return atomic_read(&kn->active) >= 0;
31}
32
33static bool kernfs_lockdep(struct kernfs_node *kn)
34{
35#ifdef CONFIG_DEBUG_LOCK_ALLOC
36 return kn->flags & KERNFS_LOCKDEP;
37#else
38 return false;
39#endif
40}
41
42static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
43{
44 return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
45}
46
47static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
48 size_t buflen)
49{
50 char *p = buf + buflen;
51 int len;
52
53 *--p = '\0';
54
55 do {
56 len = strlen(kn->name);
57 if (p - buf < len + 1) {
58 buf[0] = '\0';
59 p = NULL;
60 break;
61 }
62 p -= len;
63 memcpy(p, kn->name, len);
64 *--p = '/';
65 kn = kn->parent;
66 } while (kn && kn->parent);
67
68 return p;
69}
70
71/**
72 * kernfs_name - obtain the name of a given node
73 * @kn: kernfs_node of interest
74 * @buf: buffer to copy @kn's name into
75 * @buflen: size of @buf
76 *
77 * Copies the name of @kn into @buf of @buflen bytes. The behavior is
78 * similar to strlcpy(). It returns the length of @kn's name and if @buf
79 * isn't long enough, it's filled upto @buflen-1 and nul terminated.
80 *
81 * This function can be called from any context.
82 */
83int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
84{
85 unsigned long flags;
86 int ret;
87
88 spin_lock_irqsave(&kernfs_rename_lock, flags);
89 ret = kernfs_name_locked(kn, buf, buflen);
90 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
91 return ret;
92}
93
94/**
95 * kernfs_path - build full path of a given node
96 * @kn: kernfs_node of interest
97 * @buf: buffer to copy @kn's name into
98 * @buflen: size of @buf
99 *
100 * Builds and returns the full path of @kn in @buf of @buflen bytes. The
101 * path is built from the end of @buf so the returned pointer usually
102 * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
103 * and %NULL is returned.
104 */
105char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
106{
107 unsigned long flags;
108 char *p;
109
110 spin_lock_irqsave(&kernfs_rename_lock, flags);
111 p = kernfs_path_locked(kn, buf, buflen);
112 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
113 return p;
114}
115EXPORT_SYMBOL_GPL(kernfs_path);
116
117/**
118 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
119 * @kn: kernfs_node of interest
120 *
121 * This function can be called from any context.
122 */
123void pr_cont_kernfs_name(struct kernfs_node *kn)
124{
125 unsigned long flags;
126
127 spin_lock_irqsave(&kernfs_rename_lock, flags);
128
129 kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
130 pr_cont("%s", kernfs_pr_cont_buf);
131
132 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
133}
134
135/**
136 * pr_cont_kernfs_path - pr_cont path of a kernfs_node
137 * @kn: kernfs_node of interest
138 *
139 * This function can be called from any context.
140 */
141void pr_cont_kernfs_path(struct kernfs_node *kn)
142{
143 unsigned long flags;
144 char *p;
145
146 spin_lock_irqsave(&kernfs_rename_lock, flags);
147
148 p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
149 sizeof(kernfs_pr_cont_buf));
150 if (p)
151 pr_cont("%s", p);
152 else
153 pr_cont("<name too long>");
154
155 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
156}
157
158/**
159 * kernfs_get_parent - determine the parent node and pin it
160 * @kn: kernfs_node of interest
161 *
162 * Determines @kn's parent, pins and returns it. This function can be
163 * called from any context.
164 */
165struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
166{
167 struct kernfs_node *parent;
168 unsigned long flags;
169
170 spin_lock_irqsave(&kernfs_rename_lock, flags);
171 parent = kn->parent;
172 kernfs_get(parent);
173 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
174
175 return parent;
176}
177
24/** 178/**
25 * kernfs_name_hash 179 * kernfs_name_hash
26 * @name: Null terminated string to hash 180 * @name: Null terminated string to hash
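
kernfs_path_locked() above assembles the path right-to-left: it starts at the end of the caller's buffer, writes the terminating NUL, and prepends "/name" segments while walking parent pointers, returning a pointer into the buffer rather than its start. A user-space sketch of the same algorithm, simplified to plain structs with no locking and no buf[0] reset on failure:

#include <stdio.h>
#include <string.h>

struct node { const char *name; struct node *parent; };

/* Build the path from the end of buf backwards; returns a pointer
 * into buf, or NULL if buf is too small. */
static char *build_path(const struct node *n, char *buf, size_t buflen)
{
	char *p = buf + buflen;

	*--p = '\0';
	do {
		size_t len = strlen(n->name);

		if ((size_t)(p - buf) < len + 1)
			return NULL;            /* no room for "/name" */
		p -= len;
		memcpy(p, n->name, len);
		*--p = '/';
		n = n->parent;
	} while (n && n->parent);               /* the root contributes only "/" */

	return p;
}

int main(void)
{
	struct node root = { "/", NULL };
	struct node dir  = { "kernel", &root };
	struct node file = { "osrelease", &dir };
	char buf[64];

	printf("%s\n", build_path(&file, buf, sizeof(buf))); /* /kernel/osrelease */
	return 0;
}
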
@@ -37,7 +191,7 @@ static unsigned int kernfs_name_hash(const char *name, const void *ns)
37 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); 191 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
38 hash &= 0x7fffffffU; 192 hash &= 0x7fffffffU;
39 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ 193 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
40 if (hash < 1) 194 if (hash < 2)
41 hash += 2; 195 hash += 2;
42 if (hash >= INT_MAX) 196 if (hash >= INT_MAX)
43 hash = INT_MAX - 1; 197 hash = INT_MAX - 1;
@@ -105,18 +259,24 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
105 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree 259 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
106 * @kn: kernfs_node of interest 260 * @kn: kernfs_node of interest
107 * 261 *
108 * Unlink @kn from its sibling rbtree which starts from 262 * Try to unlink @kn from its sibling rbtree which starts from
109 * kn->parent->dir.children. 263 * kn->parent->dir.children. Returns %true if @kn was actually
264 * removed, %false if @kn wasn't on the rbtree.
110 * 265 *
111 * Locking: 266 * Locking:
112 * mutex_lock(kernfs_mutex) 267 * mutex_lock(kernfs_mutex)
113 */ 268 */
114static void kernfs_unlink_sibling(struct kernfs_node *kn) 269static bool kernfs_unlink_sibling(struct kernfs_node *kn)
115{ 270{
271 if (RB_EMPTY_NODE(&kn->rb))
272 return false;
273
116 if (kernfs_type(kn) == KERNFS_DIR) 274 if (kernfs_type(kn) == KERNFS_DIR)
117 kn->parent->dir.subdirs--; 275 kn->parent->dir.subdirs--;
118 276
119 rb_erase(&kn->rb, &kn->parent->dir.children); 277 rb_erase(&kn->rb, &kn->parent->dir.children);
278 RB_CLEAR_NODE(&kn->rb);
279 return true;
120} 280}
121 281
122/** 282/**
@@ -137,7 +297,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
137 if (!atomic_inc_unless_negative(&kn->active)) 297 if (!atomic_inc_unless_negative(&kn->active))
138 return NULL; 298 return NULL;
139 299
140 if (kn->flags & KERNFS_LOCKDEP) 300 if (kernfs_lockdep(kn))
141 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); 301 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
142 return kn; 302 return kn;
143} 303}
@@ -151,59 +311,57 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
151 */ 311 */
152void kernfs_put_active(struct kernfs_node *kn) 312void kernfs_put_active(struct kernfs_node *kn)
153{ 313{
314 struct kernfs_root *root = kernfs_root(kn);
154 int v; 315 int v;
155 316
156 if (unlikely(!kn)) 317 if (unlikely(!kn))
157 return; 318 return;
158 319
159 if (kn->flags & KERNFS_LOCKDEP) 320 if (kernfs_lockdep(kn))
160 rwsem_release(&kn->dep_map, 1, _RET_IP_); 321 rwsem_release(&kn->dep_map, 1, _RET_IP_);
161 v = atomic_dec_return(&kn->active); 322 v = atomic_dec_return(&kn->active);
162 if (likely(v != KN_DEACTIVATED_BIAS)) 323 if (likely(v != KN_DEACTIVATED_BIAS))
163 return; 324 return;
164 325
165 /* 326 wake_up_all(&root->deactivate_waitq);
166 * atomic_dec_return() is a mb(), we'll always see the updated
167 * kn->u.completion.
168 */
169 complete(kn->u.completion);
170} 327}
171 328
172/** 329/**
173 * kernfs_deactivate - deactivate kernfs_node 330 * kernfs_drain - drain kernfs_node
174 * @kn: kernfs_node to deactivate 331 * @kn: kernfs_node to drain
175 * 332 *
176 * Deny new active references and drain existing ones. 333 * Drain existing usages and nuke all existing mmaps of @kn. Multiple
334 * removers may invoke this function concurrently on @kn and all will
335 * return after draining is complete.
177 */ 336 */
178static void kernfs_deactivate(struct kernfs_node *kn) 337static void kernfs_drain(struct kernfs_node *kn)
338 __releases(&kernfs_mutex) __acquires(&kernfs_mutex)
179{ 339{
180 DECLARE_COMPLETION_ONSTACK(wait); 340 struct kernfs_root *root = kernfs_root(kn);
181 int v;
182 341
183 BUG_ON(!(kn->flags & KERNFS_REMOVED)); 342 lockdep_assert_held(&kernfs_mutex);
184 343 WARN_ON_ONCE(kernfs_active(kn));
185 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF))
186 return;
187 344
188 kn->u.completion = (void *)&wait; 345 mutex_unlock(&kernfs_mutex);
189 346
190 if (kn->flags & KERNFS_LOCKDEP) 347 if (kernfs_lockdep(kn)) {
191 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); 348 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
192 /* atomic_add_return() is a mb(), put_active() will always see 349 if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
193 * the updated kn->u.completion.
194 */
195 v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
196
197 if (v != KN_DEACTIVATED_BIAS) {
198 if (kn->flags & KERNFS_LOCKDEP)
199 lock_contended(&kn->dep_map, _RET_IP_); 350 lock_contended(&kn->dep_map, _RET_IP_);
200 wait_for_completion(&wait);
201 } 351 }
202 352
203 if (kn->flags & KERNFS_LOCKDEP) { 353 /* but everyone should wait for draining */
354 wait_event(root->deactivate_waitq,
355 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
356
357 if (kernfs_lockdep(kn)) {
204 lock_acquired(&kn->dep_map, _RET_IP_); 358 lock_acquired(&kn->dep_map, _RET_IP_);
205 rwsem_release(&kn->dep_map, 1, _RET_IP_); 359 rwsem_release(&kn->dep_map, 1, _RET_IP_);
206 } 360 }
361
362 kernfs_unmap_bin_file(kn);
363
364 mutex_lock(&kernfs_mutex);
207} 365}
208 366
209/** 367/**
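
These hunks replace the one-shot completion-based deactivation with a reusable scheme: kn->active is a biased counter where any value >= 0 means the node is live, and adding KN_DEACTIVATED_BIAS (a large negative constant) both rejects new users and lets removers sleep on a per-root waitqueue until in-flight users drain. A distilled sketch using the diff's own identifiers, with lockdep annotations omitted:

struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
	/* fails once the bias has been applied: the node is deactivated */
	if (!atomic_inc_unless_negative(&kn->active))
		return NULL;
	return kn;
}

void kernfs_put_active(struct kernfs_node *kn)
{
	/* the last in-flight user releases any waiting remover */
	if (atomic_dec_return(&kn->active) == KN_DEACTIVATED_BIAS)
		wake_up_all(&kernfs_root(kn)->deactivate_waitq);
}

static void deactivate_and_drain(struct kernfs_node *kn)
{
	atomic_add(KN_DEACTIVATED_BIAS, &kn->active);   /* no new users */
	wait_event(kernfs_root(kn)->deactivate_waitq,
		   atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
}

In the diff itself the bias is applied in __kernfs_remove() and kernfs_drain() drops kernfs_mutex around the wait; the sketch collapses both into one helper.
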
@@ -234,13 +392,15 @@ void kernfs_put(struct kernfs_node *kn)
234 return; 392 return;
235 root = kernfs_root(kn); 393 root = kernfs_root(kn);
236 repeat: 394 repeat:
237 /* Moving/renaming is always done while holding reference. 395 /*
396 * Moving/renaming is always done while holding reference.
238 * kn->parent won't change beneath us. 397 * kn->parent won't change beneath us.
239 */ 398 */
240 parent = kn->parent; 399 parent = kn->parent;
241 400
242 WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", 401 WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
243 parent ? parent->name : "", kn->name); 402 "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
403 parent ? parent->name : "", kn->name, atomic_read(&kn->active));
244 404
245 if (kernfs_type(kn) == KERNFS_LINK) 405 if (kernfs_type(kn) == KERNFS_LINK)
246 kernfs_put(kn->symlink.target_kn); 406 kernfs_put(kn->symlink.target_kn);
@@ -282,8 +442,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
282 kn = dentry->d_fsdata; 442 kn = dentry->d_fsdata;
283 mutex_lock(&kernfs_mutex); 443 mutex_lock(&kernfs_mutex);
284 444
285 /* The kernfs node has been deleted */ 445 /* The kernfs node has been deactivated */
286 if (kn->flags & KERNFS_REMOVED) 446 if (!kernfs_active(kn))
287 goto out_bad; 447 goto out_bad;
288 448
289 /* The kernfs node has been moved? */ 449 /* The kernfs node has been moved? */
@@ -328,6 +488,24 @@ const struct dentry_operations kernfs_dops = {
328 .d_release = kernfs_dop_release, 488 .d_release = kernfs_dop_release,
329}; 489};
330 490
491/**
492 * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
493 * @dentry: the dentry in question
494 *
495 * Return the kernfs_node associated with @dentry. If @dentry is not a
496 * kernfs one, %NULL is returned.
497 *
498 * While the returned kernfs_node will stay accessible as long as @dentry
499 * is accessible, the returned node can be in any state and the caller is
500 * fully responsible for determining what's accessible.
501 */
502struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
503{
504 if (dentry->d_sb->s_op == &kernfs_sops)
505 return dentry->d_fsdata;
506 return NULL;
507}
508
331static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, 509static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
332 const char *name, umode_t mode, 510 const char *name, umode_t mode,
333 unsigned flags) 511 unsigned flags)
@@ -352,11 +530,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
352 kn->ino = ret; 530 kn->ino = ret;
353 531
354 atomic_set(&kn->count, 1); 532 atomic_set(&kn->count, 1);
355 atomic_set(&kn->active, 0); 533 atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
534 RB_CLEAR_NODE(&kn->rb);
356 535
357 kn->name = name; 536 kn->name = name;
358 kn->mode = mode; 537 kn->mode = mode;
359 kn->flags = flags | KERNFS_REMOVED; 538 kn->flags = flags;
360 539
361 return kn; 540 return kn;
362 541
@@ -382,69 +561,44 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
382} 561}
383 562
384/** 563/**
385 * kernfs_addrm_start - prepare for kernfs_node add/remove
386 * @acxt: pointer to kernfs_addrm_cxt to be used
387 *
388 * This function is called when the caller is about to add or remove
389 * kernfs_node. This function acquires kernfs_mutex. @acxt is used
390 * to keep and pass context to other addrm functions.
391 *
392 * LOCKING:
393 * Kernel thread context (may sleep). kernfs_mutex is locked on
394 * return.
395 */
396void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
397 __acquires(kernfs_mutex)
398{
399 memset(acxt, 0, sizeof(*acxt));
400
401 mutex_lock(&kernfs_mutex);
402}
403
404/**
405 * kernfs_add_one - add kernfs_node to parent without warning 564 * kernfs_add_one - add kernfs_node to parent without warning
406 * @acxt: addrm context to use
407 * @kn: kernfs_node to be added 565 * @kn: kernfs_node to be added
408 * 566 *
409 * The caller must already have initialized @kn->parent. This 567 * The caller must already have initialized @kn->parent. This
410 * function increments nlink of the parent's inode if @kn is a 568 * function increments nlink of the parent's inode if @kn is a
411 * directory and link into the children list of the parent. 569 * directory and link into the children list of the parent.
412 * 570 *
413 * This function should be called between calls to
414 * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
415 * the same @acxt as passed to kernfs_addrm_start().
416 *
417 * LOCKING:
418 * Determined by kernfs_addrm_start().
419 *
420 * RETURNS: 571 * RETURNS:
421 * 0 on success, -EEXIST if entry with the given name already 572 * 0 on success, -EEXIST if entry with the given name already
422 * exists. 573 * exists.
423 */ 574 */
424int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) 575int kernfs_add_one(struct kernfs_node *kn)
425{ 576{
426 struct kernfs_node *parent = kn->parent; 577 struct kernfs_node *parent = kn->parent;
427 bool has_ns = kernfs_ns_enabled(parent);
428 struct kernfs_iattrs *ps_iattr; 578 struct kernfs_iattrs *ps_iattr;
579 bool has_ns;
429 int ret; 580 int ret;
430 581
431 if (has_ns != (bool)kn->ns) { 582 mutex_lock(&kernfs_mutex);
432 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", 583
433 has_ns ? "required" : "invalid", parent->name, kn->name); 584 ret = -EINVAL;
434 return -EINVAL; 585 has_ns = kernfs_ns_enabled(parent);
435 } 586 if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
587 has_ns ? "required" : "invalid", parent->name, kn->name))
588 goto out_unlock;
436 589
437 if (kernfs_type(parent) != KERNFS_DIR) 590 if (kernfs_type(parent) != KERNFS_DIR)
438 return -EINVAL; 591 goto out_unlock;
439 592
440 if (parent->flags & KERNFS_REMOVED) 593 ret = -ENOENT;
441 return -ENOENT; 594 if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
595 goto out_unlock;
442 596
443 kn->hash = kernfs_name_hash(kn->name, kn->ns); 597 kn->hash = kernfs_name_hash(kn->name, kn->ns);
444 598
445 ret = kernfs_link_sibling(kn); 599 ret = kernfs_link_sibling(kn);
446 if (ret) 600 if (ret)
447 return ret; 601 goto out_unlock;
448 602
449 /* Update timestamps on the parent */ 603 /* Update timestamps on the parent */
450 ps_iattr = parent->iattr; 604 ps_iattr = parent->iattr;
@@ -453,82 +607,22 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
453 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; 607 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
454 } 608 }
455 609
456 /* Mark the entry added into directory tree */ 610 mutex_unlock(&kernfs_mutex);
457 kn->flags &= ~KERNFS_REMOVED;
458
459 return 0;
460}
461
462/**
463 * kernfs_remove_one - remove kernfs_node from parent
464 * @acxt: addrm context to use
465 * @kn: kernfs_node to be removed
466 *
467 * Mark @kn removed and drop nlink of parent inode if @kn is a
468 * directory. @kn is unlinked from the children list.
469 *
470 * This function should be called between calls to
471 * kernfs_addrm_start() and kernfs_addrm_finish() and should be
472 * passed the same @acxt as passed to kernfs_addrm_start().
473 *
474 * LOCKING:
475 * Determined by kernfs_addrm_start().
476 */
477static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
478 struct kernfs_node *kn)
479{
480 struct kernfs_iattrs *ps_iattr;
481 611
482 /* 612 /*
483 * Removal can be called multiple times on the same node. Only the 613 * Activate the new node unless CREATE_DEACTIVATED is requested.
484 * first invocation is effective and puts the base ref. 614 * If not activated here, the kernfs user is responsible for
615 * activating the node with kernfs_activate(). A node which hasn't
616 * been activated is not visible to userland and its removal won't
617 * trigger deactivation.
485 */ 618 */
486 if (kn->flags & KERNFS_REMOVED) 619 if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
487 return; 620 kernfs_activate(kn);
488 621 return 0;
489 if (kn->parent) {
490 kernfs_unlink_sibling(kn);
491
492 /* Update timestamps on the parent */
493 ps_iattr = kn->parent->iattr;
494 if (ps_iattr) {
495 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
496 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
497 }
498 }
499
500 kn->flags |= KERNFS_REMOVED;
501 kn->u.removed_list = acxt->removed;
502 acxt->removed = kn;
503}
504 622
505/** 623out_unlock:
506 * kernfs_addrm_finish - finish up kernfs_node add/remove
507 * @acxt: addrm context to finish up
508 *
509 * Finish up kernfs_node add/remove. Resources acquired by
510 * kernfs_addrm_start() are released and removed kernfs_nodes are
511 * cleaned up.
512 *
513 * LOCKING:
514 * kernfs_mutex is released.
515 */
516void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
517 __releases(kernfs_mutex)
518{
519 /* release resources acquired by kernfs_addrm_start() */
520 mutex_unlock(&kernfs_mutex); 624 mutex_unlock(&kernfs_mutex);
521 625 return ret;
522 /* kill removed kernfs_nodes */
523 while (acxt->removed) {
524 struct kernfs_node *kn = acxt->removed;
525
526 acxt->removed = kn->u.removed_list;
527
528 kernfs_deactivate(kn);
529 kernfs_unmap_bin_file(kn);
530 kernfs_put(kn);
531 }
532} 626}
533 627
534/** 628/**
@@ -599,13 +693,15 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
599 693
600/** 694/**
601 * kernfs_create_root - create a new kernfs hierarchy 695 * kernfs_create_root - create a new kernfs hierarchy
602 * @kdops: optional directory syscall operations for the hierarchy 696 * @scops: optional syscall operations for the hierarchy
697 * @flags: KERNFS_ROOT_* flags
603 * @priv: opaque data associated with the new directory 698 * @priv: opaque data associated with the new directory
604 * 699 *
605 * Returns the root of the new hierarchy on success, ERR_PTR() value on 700 * Returns the root of the new hierarchy on success, ERR_PTR() value on
606 * failure. 701 * failure.
607 */ 702 */
608struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) 703struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
704 unsigned int flags, void *priv)
609{ 705{
610 struct kernfs_root *root; 706 struct kernfs_root *root;
611 struct kernfs_node *kn; 707 struct kernfs_node *kn;
@@ -624,12 +720,16 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv)
624 return ERR_PTR(-ENOMEM); 720 return ERR_PTR(-ENOMEM);
625 } 721 }
626 722
627 kn->flags &= ~KERNFS_REMOVED;
628 kn->priv = priv; 723 kn->priv = priv;
629 kn->dir.root = root; 724 kn->dir.root = root;
630 725
631 root->dir_ops = kdops; 726 root->syscall_ops = scops;
727 root->flags = flags;
632 root->kn = kn; 728 root->kn = kn;
729 init_waitqueue_head(&root->deactivate_waitq);
730
731 if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
732 kernfs_activate(kn);
633 733
634 return root; 734 return root;
635} 735}
@@ -660,7 +760,6 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
660 const char *name, umode_t mode, 760 const char *name, umode_t mode,
661 void *priv, const void *ns) 761 void *priv, const void *ns)
662{ 762{
663 struct kernfs_addrm_cxt acxt;
664 struct kernfs_node *kn; 763 struct kernfs_node *kn;
665 int rc; 764 int rc;
666 765
@@ -674,10 +773,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
674 kn->priv = priv; 773 kn->priv = priv;
675 774
676 /* link in */ 775 /* link in */
677 kernfs_addrm_start(&acxt); 776 rc = kernfs_add_one(kn);
678 rc = kernfs_add_one(&acxt, kn);
679 kernfs_addrm_finish(&acxt);
680
681 if (!rc) 777 if (!rc)
682 return kn; 778 return kn;
683 779
@@ -703,7 +799,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
703 kn = kernfs_find_ns(parent, dentry->d_name.name, ns); 799 kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
704 800
705 /* no such entry */ 801 /* no such entry */
706 if (!kn) { 802 if (!kn || !kernfs_active(kn)) {
707 ret = NULL; 803 ret = NULL;
708 goto out_unlock; 804 goto out_unlock;
709 } 805 }
@@ -728,23 +824,37 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
728 umode_t mode) 824 umode_t mode)
729{ 825{
730 struct kernfs_node *parent = dir->i_private; 826 struct kernfs_node *parent = dir->i_private;
731 struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; 827 struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
828 int ret;
732 829
733 if (!kdops || !kdops->mkdir) 830 if (!scops || !scops->mkdir)
734 return -EPERM; 831 return -EPERM;
735 832
736 return kdops->mkdir(parent, dentry->d_name.name, mode); 833 if (!kernfs_get_active(parent))
834 return -ENODEV;
835
836 ret = scops->mkdir(parent, dentry->d_name.name, mode);
837
838 kernfs_put_active(parent);
839 return ret;
737} 840}
738 841
739static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) 842static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
740{ 843{
741 struct kernfs_node *kn = dentry->d_fsdata; 844 struct kernfs_node *kn = dentry->d_fsdata;
742 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; 845 struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
846 int ret;
743 847
744 if (!kdops || !kdops->rmdir) 848 if (!scops || !scops->rmdir)
745 return -EPERM; 849 return -EPERM;
746 850
747 return kdops->rmdir(kn); 851 if (!kernfs_get_active(kn))
852 return -ENODEV;
853
854 ret = scops->rmdir(kn);
855
856 kernfs_put_active(kn);
857 return ret;
748} 858}
749 859
750static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, 860static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -752,12 +862,25 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
752{ 862{
753 struct kernfs_node *kn = old_dentry->d_fsdata; 863 struct kernfs_node *kn = old_dentry->d_fsdata;
754 struct kernfs_node *new_parent = new_dir->i_private; 864 struct kernfs_node *new_parent = new_dir->i_private;
755 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; 865 struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
866 int ret;
756 867
757 if (!kdops || !kdops->rename) 868 if (!scops || !scops->rename)
758 return -EPERM; 869 return -EPERM;
759 870
760 return kdops->rename(kn, new_parent, new_dentry->d_name.name); 871 if (!kernfs_get_active(kn))
872 return -ENODEV;
873
874 if (!kernfs_get_active(new_parent)) {
875 kernfs_put_active(kn);
876 return -ENODEV;
877 }
878
879 ret = scops->rename(kn, new_parent, new_dentry->d_name.name);
880
881 kernfs_put_active(new_parent);
882 kernfs_put_active(kn);
883 return ret;
761} 884}
762 885
763const struct inode_operations kernfs_dir_iops = { 886const struct inode_operations kernfs_dir_iops = {
@@ -830,23 +953,104 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
830 return pos->parent; 953 return pos->parent;
831} 954}
832 955
833static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, 956/**
834 struct kernfs_node *kn) 957 * kernfs_activate - activate a node which started deactivated
958 * @kn: kernfs_node whose subtree is to be activated
959 *
960 * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
961 * needs to be explicitly activated. A node which hasn't been activated
962 * isn't visible to userland and deactivation is skipped during its
963 * removal. This is useful to construct atomic init sequences where
964 * creation of multiple nodes should either succeed or fail atomically.
965 *
966 * The caller is responsible for ensuring that this function is not called
967 * after kernfs_remove*() is invoked on @kn.
968 */
969void kernfs_activate(struct kernfs_node *kn)
835{ 970{
836 struct kernfs_node *pos, *next; 971 struct kernfs_node *pos;
837 972
838 if (!kn) 973 mutex_lock(&kernfs_mutex);
974
975 pos = NULL;
976 while ((pos = kernfs_next_descendant_post(pos, kn))) {
977 if (!pos || (pos->flags & KERNFS_ACTIVATED))
978 continue;
979
980 WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
981 WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
982
983 atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
984 pos->flags |= KERNFS_ACTIVATED;
985 }
986
987 mutex_unlock(&kernfs_mutex);
988}
989
990static void __kernfs_remove(struct kernfs_node *kn)
991{
992 struct kernfs_node *pos;
993
994 lockdep_assert_held(&kernfs_mutex);
995
996 /*
997 * Short-circuit if non-root @kn has already finished removal.
998 * This is for kernfs_remove_self() which plays with active ref
999 * after removal.
1000 */
1001 if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
839 return; 1002 return;
840 1003
841 pr_debug("kernfs %s: removing\n", kn->name); 1004 pr_debug("kernfs %s: removing\n", kn->name);
842 1005
843 next = NULL; 1006 /* prevent any new usage under @kn by deactivating all nodes */
1007 pos = NULL;
1008 while ((pos = kernfs_next_descendant_post(pos, kn)))
1009 if (kernfs_active(pos))
1010 atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
1011
1012 /* deactivate and unlink the subtree node-by-node */
844 do { 1013 do {
845 pos = next; 1014 pos = kernfs_leftmost_descendant(kn);
846 next = kernfs_next_descendant_post(pos, kn); 1015
847 if (pos) 1016 /*
848 kernfs_remove_one(acxt, pos); 1017 * kernfs_drain() drops kernfs_mutex temporarily and @pos's
849 } while (next); 1018 * base ref could have been put by someone else by the time
1019 * the function returns. Make sure it doesn't go away
1020 * underneath us.
1021 */
1022 kernfs_get(pos);
1023
1024 /*
1025 * Drain iff @kn was activated. This avoids draining and
1026 * its lockdep annotations for nodes which have never been
1027 * activated and allows embedding kernfs_remove() in create
1028 * error paths without worrying about draining.
1029 */
1030 if (kn->flags & KERNFS_ACTIVATED)
1031 kernfs_drain(pos);
1032 else
1033 WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
1034
1035 /*
1036 * kernfs_unlink_sibling() succeeds once per node. Use it
1037 * to decide who's responsible for cleanups.
1038 */
1039 if (!pos->parent || kernfs_unlink_sibling(pos)) {
1040 struct kernfs_iattrs *ps_iattr =
1041 pos->parent ? pos->parent->iattr : NULL;
1042
1043 /* update timestamps on the parent */
1044 if (ps_iattr) {
1045 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
1046 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
1047 }
1048
1049 kernfs_put(pos);
1050 }
1051
1052 kernfs_put(pos);
1053 } while (pos != kn);
850} 1054}
851 1055
852/** 1056/**
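
The rewritten __kernfs_remove() is two-phase: first every descendant is deactivated so no new active references can start anywhere in the subtree, then nodes are drained and unlinked bottom-up, always taking the current leftmost (deepest) descendant. kernfs_unlink_sibling() succeeding exactly once per node elects which caller drops the base reference. A distilled shape of the two phases; the real hunk also skips draining for never-activated nodes and updates the parent's timestamps, which are elided here:

pos = NULL;                                    /* phase 1: deactivate all */
while ((pos = kernfs_next_descendant_post(pos, kn)))
	if (kernfs_active(pos))
		atomic_add(KN_DEACTIVATED_BIAS, &pos->active);

do {                                           /* phase 2: bottom-up teardown */
	pos = kernfs_leftmost_descendant(kn);  /* always a leaf of what's left */
	kernfs_get(pos);                       /* pin across the drain */
	kernfs_drain(pos);                     /* may drop/retake kernfs_mutex */

	if (!pos->parent || kernfs_unlink_sibling(pos))
		kernfs_put(pos);               /* the unlinker puts the base ref */

	kernfs_put(pos);                       /* drop our pin */
} while (pos != kn);
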
@@ -857,11 +1061,140 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
857 */ 1061 */
858void kernfs_remove(struct kernfs_node *kn) 1062void kernfs_remove(struct kernfs_node *kn)
859{ 1063{
860 struct kernfs_addrm_cxt acxt; 1064 mutex_lock(&kernfs_mutex);
1065 __kernfs_remove(kn);
1066 mutex_unlock(&kernfs_mutex);
1067}
861 1068
862 kernfs_addrm_start(&acxt); 1069/**
863 __kernfs_remove(&acxt, kn); 1070 * kernfs_break_active_protection - break out of active protection
864 kernfs_addrm_finish(&acxt); 1071 * @kn: the self kernfs_node
1072 *
1073 * The caller must be running off of a kernfs operation which is invoked
1074 * with an active reference - e.g. one of kernfs_ops. Each invocation of
1075 * this function must also be matched with an invocation of
1076 * kernfs_unbreak_active_protection().
1077 *
1078 * This function releases the active reference of @kn the caller is
1079 * holding. Once this function is called, @kn may be removed at any point
1080 * and the caller is solely responsible for ensuring that the objects it
1081 * dereferences are accessible.
1082 */
1083void kernfs_break_active_protection(struct kernfs_node *kn)
1084{
1085 /*
1086 * Take ourself out of the active ref dependency chain. If
1087 * we're called without an active ref, lockdep will complain.
1088 */
1089 kernfs_put_active(kn);
1090}
1091
1092/**
1093 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
1094 * @kn: the self kernfs_node
1095 *
1096 * If kernfs_break_active_protection() was called, this function must be
1097 * invoked before finishing the kernfs operation. Note that while this
1098 * function restores the active reference, it doesn't and can't actually
1099 * restore the active protection - @kn may already have been removed or be in the process of
1100 * being removed. Once kernfs_break_active_protection() is invoked, that
1101 * protection is irreversibly gone for the kernfs operation instance.
1102 *
1103 * While this function may be called at any point after
1104 * kernfs_break_active_protection() is invoked, its most useful location
1105 * would be right before the enclosing kernfs operation returns.
1106 */
1107void kernfs_unbreak_active_protection(struct kernfs_node *kn)
1108{
1109 /*
1110 * @kn->active could be in any state; however, the increment we do
1111 * here will be undone as soon as the enclosing kernfs operation
1112 * finishes and this temporary bump can't break anything. If @kn
1113 * is alive, nothing changes. If @kn is being deactivated, the
1114 * soon-to-follow put will either finish deactivation or restore
1115 * deactivated state. If @kn is already removed, the temporary
1116 * bump is guaranteed to be gone before @kn is released.
1117 */
1118 atomic_inc(&kn->active);
1119 if (kernfs_lockdep(kn))
1120 rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
1121}
1122
1123/**
1124 * kernfs_remove_self - remove a kernfs_node from its own method
1125 * @kn: the self kernfs_node to remove
1126 *
1127 * The caller must be running off of a kernfs operation which is invoked
1128 * with an active reference - e.g. one of kernfs_ops. This can be used to
1129 * implement a file operation which deletes itself.
1130 *
1131 * For example, the "delete" file for a sysfs device directory can be
1132 * implemented by invoking kernfs_remove_self() on the "delete" file
1133 * itself. This function breaks the circular dependency of trying to
1134 * deactivate self while holding an active ref itself. It isn't necessary
1135 * to modify the usual removal path to use kernfs_remove_self(). The
1136 * "delete" implementation can simply invoke kernfs_remove_self() on self
1137 * before proceeding with the usual removal path. kernfs will ignore later
1138 * kernfs_remove() on self.
1139 *
1140 * kernfs_remove_self() can be called multiple times concurrently on the
1141 * same kernfs_node. Only the first one actually performs removal and
1142 * returns %true. All others will wait until the kernfs operation which
1143 * won self-removal finishes and return %false. Note that the losers wait
1144 * for the completion of not only the winning kernfs_remove_self() but also
1145 * the whole kernfs_ops which won the arbitration. This can be used to
1146 * guarantee, for example, that all concurrent writes to a "delete" file
1147 * finish only after the whole operation is complete.
1148 */
1149bool kernfs_remove_self(struct kernfs_node *kn)
1150{
1151 bool ret;
1152
1153 mutex_lock(&kernfs_mutex);
1154 kernfs_break_active_protection(kn);
1155
1156 /*
1157 * SUICIDAL is used to arbitrate among competing invocations. Only
1158 * the first one will actually perform removal. When the removal
1159 * is complete, SUICIDED is set and the active ref is restored
1160 * while holding kernfs_mutex. The ones which lost arbitration
1161 * wait for SUICIDED && drained which can happen only after the
1162 * enclosing kernfs operation which executed the winning instance
1163 * of kernfs_remove_self() finished.
1164 */
1165 if (!(kn->flags & KERNFS_SUICIDAL)) {
1166 kn->flags |= KERNFS_SUICIDAL;
1167 __kernfs_remove(kn);
1168 kn->flags |= KERNFS_SUICIDED;
1169 ret = true;
1170 } else {
1171 wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
1172 DEFINE_WAIT(wait);
1173
1174 while (true) {
1175 prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
1176
1177 if ((kn->flags & KERNFS_SUICIDED) &&
1178 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
1179 break;
1180
1181 mutex_unlock(&kernfs_mutex);
1182 schedule();
1183 mutex_lock(&kernfs_mutex);
1184 }
1185 finish_wait(waitq, &wait);
1186 WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
1187 ret = false;
1188 }
1189
1190 /*
1191 * This must be done while holding kernfs_mutex; otherwise, waiting
1192 * for SUICIDED && deactivated could finish prematurely.
1193 */
1194 kernfs_unbreak_active_protection(kn);
1195
1196 mutex_unlock(&kernfs_mutex);
1197 return ret;
865} 1198}
866 1199
867/** 1200/**
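
A typical consumer is a write handler that deletes its own file: it calls kernfs_remove_self() on its node and only the arbitration winner proceeds with the heavyweight teardown, while losers return after the winner's whole operation has drained. A hedged sketch of such a handler; kernfs_remove_self() is from the hunk above, but the device structure, teardown helper, and priv usage are hypothetical:

static ssize_t delete_store(struct kernfs_open_file *of, char *buf,
			    size_t count, loff_t off)
{
	struct my_device *dev = of->kn->priv;   /* hypothetical payload */

	/* only the first concurrent writer wins the removal */
	if (kernfs_remove_self(of->kn))
		my_device_teardown(dev);        /* hypothetical teardown */

	return count;
}
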
@@ -876,7 +1209,6 @@ void kernfs_remove(struct kernfs_node *kn)
876int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, 1209int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
877 const void *ns) 1210 const void *ns)
878{ 1211{
879 struct kernfs_addrm_cxt acxt;
880 struct kernfs_node *kn; 1212 struct kernfs_node *kn;
881 1213
882 if (!parent) { 1214 if (!parent) {
@@ -885,13 +1217,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
885 return -ENOENT; 1217 return -ENOENT;
886 } 1218 }
887 1219
888 kernfs_addrm_start(&acxt); 1220 mutex_lock(&kernfs_mutex);
889 1221
890 kn = kernfs_find_ns(parent, name, ns); 1222 kn = kernfs_find_ns(parent, name, ns);
891 if (kn) 1223 if (kn)
892 __kernfs_remove(&acxt, kn); 1224 __kernfs_remove(kn);
893 1225
894 kernfs_addrm_finish(&acxt); 1226 mutex_unlock(&kernfs_mutex);
895 1227
896 if (kn) 1228 if (kn)
897 return 0; 1229 return 0;
@@ -909,12 +1241,18 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
909int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, 1241int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
910 const char *new_name, const void *new_ns) 1242 const char *new_name, const void *new_ns)
911{ 1243{
1244 struct kernfs_node *old_parent;
1245 const char *old_name = NULL;
912 int error; 1246 int error;
913 1247
1248 /* can't move or rename root */
1249 if (!kn->parent)
1250 return -EINVAL;
1251
914 mutex_lock(&kernfs_mutex); 1252 mutex_lock(&kernfs_mutex);
915 1253
916 error = -ENOENT; 1254 error = -ENOENT;
917 if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) 1255 if (!kernfs_active(kn) || !kernfs_active(new_parent))
918 goto out; 1256 goto out;
919 1257
920 error = 0; 1258 error = 0;
@@ -932,13 +1270,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
932 new_name = kstrdup(new_name, GFP_KERNEL); 1270 new_name = kstrdup(new_name, GFP_KERNEL);
933 if (!new_name) 1271 if (!new_name)
934 goto out; 1272 goto out;
935 1273 } else {
936 if (kn->flags & KERNFS_STATIC_NAME) 1274 new_name = NULL;
937 kn->flags &= ~KERNFS_STATIC_NAME;
938 else
939 kfree(kn->name);
940
941 kn->name = new_name;
942 } 1275 }
943 1276
944 /* 1277 /*
@@ -946,12 +1279,29 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
946 */ 1279 */
947 kernfs_unlink_sibling(kn); 1280 kernfs_unlink_sibling(kn);
948 kernfs_get(new_parent); 1281 kernfs_get(new_parent);
949 kernfs_put(kn->parent); 1282
1283 /* rename_lock protects ->parent and ->name accessors */
1284 spin_lock_irq(&kernfs_rename_lock);
1285
1286 old_parent = kn->parent;
1287 kn->parent = new_parent;
1288
950 kn->ns = new_ns; 1289 kn->ns = new_ns;
1290 if (new_name) {
1291 if (!(kn->flags & KERNFS_STATIC_NAME))
1292 old_name = kn->name;
1293 kn->flags &= ~KERNFS_STATIC_NAME;
1294 kn->name = new_name;
1295 }
1296
1297 spin_unlock_irq(&kernfs_rename_lock);
1298
951 kn->hash = kernfs_name_hash(kn->name, kn->ns); 1299 kn->hash = kernfs_name_hash(kn->name, kn->ns);
952 kn->parent = new_parent;
953 kernfs_link_sibling(kn); 1300 kernfs_link_sibling(kn);
954 1301
1302 kernfs_put(old_parent);
1303 kfree(old_name);
1304
955 error = 0; 1305 error = 0;
956 out: 1306 out:
957 mutex_unlock(&kernfs_mutex); 1307 mutex_unlock(&kernfs_mutex);
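The new kernfs_rename_lock is what allows name and parent accessors to run without kernfs_mutex; a reader sketch (my_node_name() is hypothetical, but the locking mirrors what such an accessor must do):

	/* Sketch: snapshot ->name consistently against kernfs_rename_ns(). */
	static size_t my_node_name(struct kernfs_node *kn, char *buf, size_t buflen)
	{
		unsigned long flags;
		size_t len;

		spin_lock_irqsave(&kernfs_rename_lock, flags);
		len = strlcpy(buf, kn->name, buflen);	/* ->parent is stable here too */
		spin_unlock_irqrestore(&kernfs_rename_lock, flags);
		return len;
	}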
@@ -974,7 +1324,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
974 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) 1324 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
975{ 1325{
976 if (pos) { 1326 if (pos) {
977 int valid = !(pos->flags & KERNFS_REMOVED) && 1327 int valid = kernfs_active(pos) &&
978 pos->parent == parent && hash == pos->hash; 1328 pos->parent == parent && hash == pos->hash;
979 kernfs_put(pos); 1329 kernfs_put(pos);
980 if (!valid) 1330 if (!valid)
@@ -993,8 +1343,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
993 break; 1343 break;
994 } 1344 }
995 } 1345 }
996 /* Skip over entries in the wrong namespace */ 1346 /* Skip over entries which are dying/dead or in the wrong namespace */
997 while (pos && pos->ns != ns) { 1347 while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
998 struct rb_node *node = rb_next(&pos->rb); 1348 struct rb_node *node = rb_next(&pos->rb);
999 if (!node) 1349 if (!node)
1000 pos = NULL; 1350 pos = NULL;
@@ -1008,14 +1358,15 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
1008 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) 1358 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
1009{ 1359{
1010 pos = kernfs_dir_pos(ns, parent, ino, pos); 1360 pos = kernfs_dir_pos(ns, parent, ino, pos);
1011 if (pos) 1361 if (pos) {
1012 do { 1362 do {
1013 struct rb_node *node = rb_next(&pos->rb); 1363 struct rb_node *node = rb_next(&pos->rb);
1014 if (!node) 1364 if (!node)
1015 pos = NULL; 1365 pos = NULL;
1016 else 1366 else
1017 pos = rb_to_kn(node); 1367 pos = rb_to_kn(node);
1018 } while (pos && pos->ns != ns); 1368 } while (pos && (!kernfs_active(pos) || pos->ns != ns));
1369 }
1019 return pos; 1370 return pos;
1020} 1371}
1021 1372
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index dbf397bfdff2..8034706a7af8 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -252,10 +252,18 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
252 size_t count, loff_t *ppos) 252 size_t count, loff_t *ppos)
253{ 253{
254 struct kernfs_open_file *of = kernfs_of(file); 254 struct kernfs_open_file *of = kernfs_of(file);
255 ssize_t len = min_t(size_t, count, PAGE_SIZE);
256 const struct kernfs_ops *ops; 255 const struct kernfs_ops *ops;
256 size_t len;
257 char *buf; 257 char *buf;
258 258
259 if (of->atomic_write_len) {
260 len = count;
261 if (len > of->atomic_write_len)
262 return -E2BIG;
263 } else {
264 len = min_t(size_t, count, PAGE_SIZE);
265 }
266
259 buf = kmalloc(len + 1, GFP_KERNEL); 267 buf = kmalloc(len + 1, GFP_KERNEL);
260 if (!buf) 268 if (!buf)
261 return -ENOMEM; 269 return -ENOMEM;
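Usage sketch (names hypothetical): an owner that wants each write delivered whole, or rejected with -E2BIG rather than silently truncated at PAGE_SIZE, sets .atomic_write_len on its ops:

	static ssize_t my_write(struct kernfs_open_file *of, char *buf, size_t bytes,
				loff_t off)
	{
		/* buf holds the entire write; anything larger failed with -E2BIG */
		return bytes;
	}

	static const struct kernfs_ops my_ops = {
		.atomic_write_len	= PAGE_SIZE,
		.write			= my_write,
	};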
@@ -653,6 +661,12 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
653 of->file = file; 661 of->file = file;
654 662
655 /* 663 /*
 664 * Write path needs to access atomic_write_len outside active reference.
665 * Cache it in open_file. See kernfs_fop_write() for details.
666 */
667 of->atomic_write_len = ops->atomic_write_len;
668
669 /*
656 * Always instantiate seq_file even if read access doesn't use 670 * Always instantiate seq_file even if read access doesn't use
657 * seq_file or is not requested. This unifies private data access 671 * seq_file or is not requested. This unifies private data access
658 * and readable regular files are the vast majority anyway. 672 * and readable regular files are the vast majority anyway.
@@ -820,7 +834,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
820 bool name_is_static, 834 bool name_is_static,
821 struct lock_class_key *key) 835 struct lock_class_key *key)
822{ 836{
823 struct kernfs_addrm_cxt acxt;
824 struct kernfs_node *kn; 837 struct kernfs_node *kn;
825 unsigned flags; 838 unsigned flags;
826 int rc; 839 int rc;
@@ -855,10 +868,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
855 if (ops->mmap) 868 if (ops->mmap)
856 kn->flags |= KERNFS_HAS_MMAP; 869 kn->flags |= KERNFS_HAS_MMAP;
857 870
858 kernfs_addrm_start(&acxt); 871 rc = kernfs_add_one(kn);
859 rc = kernfs_add_one(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861
862 if (rc) { 872 if (rc) {
863 kernfs_put(kn); 873 kernfs_put(kn);
864 return ERR_PTR(rc); 874 return ERR_PTR(rc);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index e55126f85bd2..abb0f1f53d93 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -355,7 +355,7 @@ void kernfs_evict_inode(struct inode *inode)
355{ 355{
356 struct kernfs_node *kn = inode->i_private; 356 struct kernfs_node *kn = inode->i_private;
357 357
358 truncate_inode_pages(&inode->i_data, 0); 358 truncate_inode_pages_final(&inode->i_data);
359 clear_inode(inode); 359 clear_inode(inode);
360 kernfs_put(kn); 360 kernfs_put(kn);
361} 361}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index eb536b76374a..8be13b2a079b 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -26,7 +26,8 @@ struct kernfs_iattrs {
26 struct simple_xattrs xattrs; 26 struct simple_xattrs xattrs;
27}; 27};
28 28
29#define KN_DEACTIVATED_BIAS INT_MIN 29/* +1 to avoid triggering overflow warning when negating it */
30#define KN_DEACTIVATED_BIAS (INT_MIN + 1)
30 31
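The arithmetic behind that comment, spelled out (my reading): deactivation biases ->active, and reactivation has to subtract the bias again, i.e. compute -KN_DEACTIVATED_BIAS; -(INT_MIN) overflows a signed int, while -(INT_MIN + 1) == INT_MAX is well defined:

	atomic_add(KN_DEACTIVATED_BIAS, &kn->active);	/* deactivate */
	atomic_sub(KN_DEACTIVATED_BIAS, &kn->active);	/* reactivate: effectively negates the bias */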
31/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ 32/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
32 33
@@ -45,13 +46,6 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
45} 46}
46 47
47/* 48/*
48 * Context structure to be used while adding/removing nodes.
49 */
50struct kernfs_addrm_cxt {
51 struct kernfs_node *removed;
52};
53
54/*
55 * mount.c 49 * mount.c
56 */ 50 */
57struct kernfs_super_info { 51struct kernfs_super_info {
@@ -71,6 +65,7 @@ struct kernfs_super_info {
71}; 65};
72#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) 66#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
73 67
68extern const struct super_operations kernfs_sops;
74extern struct kmem_cache *kernfs_node_cache; 69extern struct kmem_cache *kernfs_node_cache;
75 70
76/* 71/*
@@ -100,9 +95,7 @@ extern const struct inode_operations kernfs_dir_iops;
100 95
101struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); 96struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
102void kernfs_put_active(struct kernfs_node *kn); 97void kernfs_put_active(struct kernfs_node *kn);
103void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); 98int kernfs_add_one(struct kernfs_node *kn);
104int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn);
105void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
106struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, 99struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
107 const char *name, umode_t mode, 100 const char *name, umode_t mode,
108 unsigned flags); 101 unsigned flags);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 0f4152defe7b..6a5f04ac8704 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -19,12 +19,49 @@
19 19
20struct kmem_cache *kernfs_node_cache; 20struct kmem_cache *kernfs_node_cache;
21 21
22static const struct super_operations kernfs_sops = { 22static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
23{
24 struct kernfs_root *root = kernfs_info(sb)->root;
25 struct kernfs_syscall_ops *scops = root->syscall_ops;
26
27 if (scops && scops->remount_fs)
28 return scops->remount_fs(root, flags, data);
29 return 0;
30}
31
32static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
33{
34 struct kernfs_root *root = kernfs_root(dentry->d_fsdata);
35 struct kernfs_syscall_ops *scops = root->syscall_ops;
36
37 if (scops && scops->show_options)
38 return scops->show_options(sf, root);
39 return 0;
40}
41
42const struct super_operations kernfs_sops = {
23 .statfs = simple_statfs, 43 .statfs = simple_statfs,
24 .drop_inode = generic_delete_inode, 44 .drop_inode = generic_delete_inode,
25 .evict_inode = kernfs_evict_inode, 45 .evict_inode = kernfs_evict_inode,
46
47 .remount_fs = kernfs_sop_remount_fs,
48 .show_options = kernfs_sop_show_options,
26}; 49};
27 50
51/**
52 * kernfs_root_from_sb - determine kernfs_root associated with a super_block
53 * @sb: the super_block in question
54 *
55 * Return the kernfs_root associated with @sb. If @sb is not a kernfs one,
56 * %NULL is returned.
57 */
58struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
59{
60 if (sb->s_op == &kernfs_sops)
61 return kernfs_info(sb)->root;
62 return NULL;
63}
64
28static int kernfs_fill_super(struct super_block *sb) 65static int kernfs_fill_super(struct super_block *sb)
29{ 66{
30 struct kernfs_super_info *info = kernfs_info(sb); 67 struct kernfs_super_info *info = kernfs_info(sb);
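The two new super_operations simply forward to the root's syscall_ops, so a filesystem built on kernfs opts in roughly like this (hypothetical names; the callback signatures follow the calls above):

	static int my_remount(struct kernfs_root *root, int *flags, char *data)
	{
		return 0;	/* accept the remount unchanged */
	}

	static int my_show_options(struct seq_file *sf, struct kernfs_root *root)
	{
		seq_puts(sf, ",myopt");	/* hypothetical mount option */
		return 0;
	}

	static struct kernfs_syscall_ops my_scops = {
		.remount_fs	= my_remount,
		.show_options	= my_show_options,
	};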
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 4d457055acb9..8a198898e39a 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -27,7 +27,6 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
27 struct kernfs_node *target) 27 struct kernfs_node *target)
28{ 28{
29 struct kernfs_node *kn; 29 struct kernfs_node *kn;
30 struct kernfs_addrm_cxt acxt;
31 int error; 30 int error;
32 31
33 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); 32 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
@@ -39,10 +38,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
39 kn->symlink.target_kn = target; 38 kn->symlink.target_kn = target;
40 kernfs_get(target); /* ref owned by symlink */ 39 kernfs_get(target); /* ref owned by symlink */
41 40
42 kernfs_addrm_start(&acxt); 41 error = kernfs_add_one(kn);
43 error = kernfs_add_one(&acxt, kn);
44 kernfs_addrm_finish(&acxt);
45
46 if (!error) 42 if (!error)
47 return kn; 43 return kn;
48 44
diff --git a/fs/locks.c b/fs/locks.c
index 92a0f0a52b06..13fc7a6d380a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -135,6 +135,7 @@
135#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 135#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
136#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 136#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
137#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 137#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG))
138#define IS_FILE_PVT(fl) (fl->fl_flags & FL_FILE_PVT)
138 139
139static bool lease_breaking(struct file_lock *fl) 140static bool lease_breaking(struct file_lock *fl)
140{ 141{
@@ -344,48 +345,43 @@ static int assign_type(struct file_lock *fl, long type)
344 return 0; 345 return 0;
345} 346}
346 347
347/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX 348static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
348 * style lock. 349 struct flock64 *l)
349 */
350static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
351 struct flock *l)
352{ 350{
353 off_t start, end;
354
355 switch (l->l_whence) { 351 switch (l->l_whence) {
356 case SEEK_SET: 352 case SEEK_SET:
357 start = 0; 353 fl->fl_start = 0;
358 break; 354 break;
359 case SEEK_CUR: 355 case SEEK_CUR:
360 start = filp->f_pos; 356 fl->fl_start = filp->f_pos;
361 break; 357 break;
362 case SEEK_END: 358 case SEEK_END:
363 start = i_size_read(file_inode(filp)); 359 fl->fl_start = i_size_read(file_inode(filp));
364 break; 360 break;
365 default: 361 default:
366 return -EINVAL; 362 return -EINVAL;
367 } 363 }
364 if (l->l_start > OFFSET_MAX - fl->fl_start)
365 return -EOVERFLOW;
366 fl->fl_start += l->l_start;
367 if (fl->fl_start < 0)
368 return -EINVAL;
368 369
369 /* POSIX-1996 leaves the case l->l_len < 0 undefined; 370 /* POSIX-1996 leaves the case l->l_len < 0 undefined;
370 POSIX-2001 defines it. */ 371 POSIX-2001 defines it. */
371 start += l->l_start;
372 if (start < 0)
373 return -EINVAL;
374 fl->fl_end = OFFSET_MAX;
375 if (l->l_len > 0) { 372 if (l->l_len > 0) {
376 end = start + l->l_len - 1; 373 if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
377 fl->fl_end = end; 374 return -EOVERFLOW;
375 fl->fl_end = fl->fl_start + l->l_len - 1;
376
378 } else if (l->l_len < 0) { 377 } else if (l->l_len < 0) {
379 end = start - 1; 378 if (fl->fl_start + l->l_len < 0)
380 fl->fl_end = end;
381 start += l->l_len;
382 if (start < 0)
383 return -EINVAL; 379 return -EINVAL;
384 } 380 fl->fl_end = fl->fl_start - 1;
385 fl->fl_start = start; /* we record the absolute position */ 381 fl->fl_start += l->l_len;
386 if (fl->fl_end < fl->fl_start) 382 } else
387 return -EOVERFLOW; 383 fl->fl_end = OFFSET_MAX;
388 384
389 fl->fl_owner = current->files; 385 fl->fl_owner = current->files;
390 fl->fl_pid = current->tgid; 386 fl->fl_pid = current->tgid;
391 fl->fl_file = filp; 387 fl->fl_file = filp;
@@ -393,55 +389,36 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
393 fl->fl_ops = NULL; 389 fl->fl_ops = NULL;
394 fl->fl_lmops = NULL; 390 fl->fl_lmops = NULL;
395 391
396 return assign_type(fl, l->l_type); 392 /* Ensure that fl->fl_file has compatible f_mode */
397} 393 switch (l->l_type) {
398 394 case F_RDLCK:
399#if BITS_PER_LONG == 32 395 if (!(filp->f_mode & FMODE_READ))
400static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, 396 return -EBADF;
401 struct flock64 *l)
402{
403 loff_t start;
404
405 switch (l->l_whence) {
406 case SEEK_SET:
407 start = 0;
408 break;
409 case SEEK_CUR:
410 start = filp->f_pos;
411 break; 397 break;
412 case SEEK_END: 398 case F_WRLCK:
413 start = i_size_read(file_inode(filp)); 399 if (!(filp->f_mode & FMODE_WRITE))
400 return -EBADF;
414 break; 401 break;
415 default:
416 return -EINVAL;
417 } 402 }
418 403
419 start += l->l_start;
420 if (start < 0)
421 return -EINVAL;
422 fl->fl_end = OFFSET_MAX;
423 if (l->l_len > 0) {
424 fl->fl_end = start + l->l_len - 1;
425 } else if (l->l_len < 0) {
426 fl->fl_end = start - 1;
427 start += l->l_len;
428 if (start < 0)
429 return -EINVAL;
430 }
431 fl->fl_start = start; /* we record the absolute position */
432 if (fl->fl_end < fl->fl_start)
433 return -EOVERFLOW;
434
435 fl->fl_owner = current->files;
436 fl->fl_pid = current->tgid;
437 fl->fl_file = filp;
438 fl->fl_flags = FL_POSIX;
439 fl->fl_ops = NULL;
440 fl->fl_lmops = NULL;
441
442 return assign_type(fl, l->l_type); 404 return assign_type(fl, l->l_type);
443} 405}
444#endif 406
407/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
408 * style lock.
409 */
410static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
411 struct flock *l)
412{
413 struct flock64 ll = {
414 .l_type = l->l_type,
415 .l_whence = l->l_whence,
416 .l_start = l->l_start,
417 .l_len = l->l_len,
418 };
419
420 return flock64_to_posix_lock(filp, fl, &ll);
421}
445 422
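A worked example of the range math now centralized in flock64_to_posix_lock(), seen from userspace: with l_start = 100 and l_len = -10, POSIX-2001 locks bytes 90..99 (fl_start = 90, fl_end = 99); l_len = 0 still means to end-of-file (fl_end = OFFSET_MAX); and a start/length pair reaching past OFFSET_MAX now fails with EOVERFLOW instead of wrapping:

	struct flock fl = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start  = 100,
		.l_len    = -10,	/* locks bytes 90..99 per POSIX-2001 */
	};

	if (fcntl(fd, F_SETLK, &fl) == -1)
		perror("F_SETLK");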
446/* default lease lock manager operations */ 423/* default lease lock manager operations */
447static void lease_break_callback(struct file_lock *fl) 424static void lease_break_callback(struct file_lock *fl)
@@ -511,8 +488,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
511} 488}
512 489
513/* Must be called with the i_lock held! */ 490/* Must be called with the i_lock held! */
514static inline void 491static void locks_insert_global_locks(struct file_lock *fl)
515locks_insert_global_locks(struct file_lock *fl)
516{ 492{
517 lg_local_lock(&file_lock_lglock); 493 lg_local_lock(&file_lock_lglock);
518 fl->fl_link_cpu = smp_processor_id(); 494 fl->fl_link_cpu = smp_processor_id();
@@ -521,8 +497,7 @@ locks_insert_global_locks(struct file_lock *fl)
521} 497}
522 498
523/* Must be called with the i_lock held! */ 499/* Must be called with the i_lock held! */
524static inline void 500static void locks_delete_global_locks(struct file_lock *fl)
525locks_delete_global_locks(struct file_lock *fl)
526{ 501{
527 /* 502 /*
528 * Avoid taking lock if already unhashed. This is safe since this check 503 * Avoid taking lock if already unhashed. This is safe since this check
@@ -544,14 +519,12 @@ posix_owner_key(struct file_lock *fl)
544 return (unsigned long)fl->fl_owner; 519 return (unsigned long)fl->fl_owner;
545} 520}
546 521
547static inline void 522static void locks_insert_global_blocked(struct file_lock *waiter)
548locks_insert_global_blocked(struct file_lock *waiter)
549{ 523{
550 hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); 524 hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
551} 525}
552 526
553static inline void 527static void locks_delete_global_blocked(struct file_lock *waiter)
554locks_delete_global_blocked(struct file_lock *waiter)
555{ 528{
556 hash_del(&waiter->fl_link); 529 hash_del(&waiter->fl_link);
557} 530}
@@ -581,7 +554,7 @@ static void locks_delete_block(struct file_lock *waiter)
581 * it seems like the reasonable thing to do. 554 * it seems like the reasonable thing to do.
582 * 555 *
583 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block 556 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
584 * list itself is protected by the file_lock_list, but by ensuring that the 557 * list itself is protected by the blocked_lock_lock, but by ensuring that the
585 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock 558 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
586 * in some cases when we see that the fl_block list is empty. 559 * in some cases when we see that the fl_block list is empty.
587 */ 560 */
@@ -591,7 +564,7 @@ static void __locks_insert_block(struct file_lock *blocker,
591 BUG_ON(!list_empty(&waiter->fl_block)); 564 BUG_ON(!list_empty(&waiter->fl_block));
592 waiter->fl_next = blocker; 565 waiter->fl_next = blocker;
593 list_add_tail(&waiter->fl_block, &blocker->fl_block); 566 list_add_tail(&waiter->fl_block, &blocker->fl_block);
594 if (IS_POSIX(blocker)) 567 if (IS_POSIX(blocker) && !IS_FILE_PVT(blocker))
595 locks_insert_global_blocked(waiter); 568 locks_insert_global_blocked(waiter);
596} 569}
597 570
@@ -652,15 +625,18 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
652 locks_insert_global_locks(fl); 625 locks_insert_global_locks(fl);
653} 626}
654 627
655/* 628/**
656 * Delete a lock and then free it. 629 * locks_delete_lock - Delete a lock and then free it.
657 * Wake up processes that are blocked waiting for this lock, 630 * @thisfl_p: pointer that points to the fl_next field of the previous
658 * notify the FS that the lock has been cleared and 631 * inode->i_flock list entry
659 * finally free the lock. 632 *
633 * Unlink a lock from all lists and free the namespace reference, but don't
634 * free it yet. Wake up processes that are blocked waiting for this lock and
635 * notify the FS that the lock has been cleared.
660 * 636 *
661 * Must be called with the i_lock held! 637 * Must be called with the i_lock held!
662 */ 638 */
663static void locks_delete_lock(struct file_lock **thisfl_p) 639static void locks_unlink_lock(struct file_lock **thisfl_p)
664{ 640{
665 struct file_lock *fl = *thisfl_p; 641 struct file_lock *fl = *thisfl_p;
666 642
@@ -675,6 +651,18 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
675 } 651 }
676 652
677 locks_wake_up_blocks(fl); 653 locks_wake_up_blocks(fl);
654}
655
656/*
657 * Unlink a lock from all lists and free it.
658 *
659 * Must be called with i_lock held!
660 */
661static void locks_delete_lock(struct file_lock **thisfl_p)
662{
663 struct file_lock *fl = *thisfl_p;
664
665 locks_unlink_lock(thisfl_p);
678 locks_free_lock(fl); 666 locks_free_lock(fl);
679} 667}
680 668
@@ -769,8 +757,16 @@ EXPORT_SYMBOL(posix_test_lock);
769 * Note: the above assumption may not be true when handling lock 757 * Note: the above assumption may not be true when handling lock
770 * requests from a broken NFS client. It may also fail in the presence 758 * requests from a broken NFS client. It may also fail in the presence
771 * of tasks (such as posix threads) sharing the same open file table. 759 * of tasks (such as posix threads) sharing the same open file table.
772 *
773 * To handle those cases, we just bail out after a few iterations. 760 * To handle those cases, we just bail out after a few iterations.
761 *
762 * For FL_FILE_PVT locks, the owner is the filp, not the files_struct.
763 * Because the owner is not even nominally tied to a thread of
764 * execution, the deadlock detection below can't reasonably work well. Just
765 * skip it for those.
766 *
767 * In principle, we could do a more limited deadlock detection on FL_FILE_PVT
768 * locks that just checks for the case where two tasks are attempting to
769 * upgrade from read to write locks on the same inode.
774 */ 770 */
775 771
776#define MAX_DEADLK_ITERATIONS 10 772#define MAX_DEADLK_ITERATIONS 10
@@ -793,6 +789,13 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
793{ 789{
794 int i = 0; 790 int i = 0;
795 791
792 /*
793 * This deadlock detector can't reasonably detect deadlocks with
794 * FL_FILE_PVT locks, since they aren't owned by a process, per-se.
795 */
796 if (IS_FILE_PVT(caller_fl))
797 return 0;
798
796 while ((block_fl = what_owner_is_waiting_for(block_fl))) { 799 while ((block_fl = what_owner_is_waiting_for(block_fl))) {
797 if (i++ > MAX_DEADLK_ITERATIONS) 800 if (i++ > MAX_DEADLK_ITERATIONS)
798 return 0; 801 return 0;
@@ -1152,13 +1155,14 @@ EXPORT_SYMBOL(posix_lock_file_wait);
1152 1155
1153/** 1156/**
1154 * locks_mandatory_locked - Check for an active lock 1157 * locks_mandatory_locked - Check for an active lock
1155 * @inode: the file to check 1158 * @file: the file to check
1156 * 1159 *
1157 * Searches the inode's list of locks to find any POSIX locks which conflict. 1160 * Searches the inode's list of locks to find any POSIX locks which conflict.
1158 * This function is called from locks_verify_locked() only. 1161 * This function is called from locks_verify_locked() only.
1159 */ 1162 */
1160int locks_mandatory_locked(struct inode *inode) 1163int locks_mandatory_locked(struct file *file)
1161{ 1164{
1165 struct inode *inode = file_inode(file);
1162 fl_owner_t owner = current->files; 1166 fl_owner_t owner = current->files;
1163 struct file_lock *fl; 1167 struct file_lock *fl;
1164 1168
@@ -1169,7 +1173,7 @@ int locks_mandatory_locked(struct inode *inode)
1169 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1173 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1170 if (!IS_POSIX(fl)) 1174 if (!IS_POSIX(fl))
1171 continue; 1175 continue;
1172 if (fl->fl_owner != owner) 1176 if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file)
1173 break; 1177 break;
1174 } 1178 }
1175 spin_unlock(&inode->i_lock); 1179 spin_unlock(&inode->i_lock);
@@ -1195,19 +1199,30 @@ int locks_mandatory_area(int read_write, struct inode *inode,
1195{ 1199{
1196 struct file_lock fl; 1200 struct file_lock fl;
1197 int error; 1201 int error;
1202 bool sleep = false;
1198 1203
1199 locks_init_lock(&fl); 1204 locks_init_lock(&fl);
1200 fl.fl_owner = current->files;
1201 fl.fl_pid = current->tgid; 1205 fl.fl_pid = current->tgid;
1202 fl.fl_file = filp; 1206 fl.fl_file = filp;
1203 fl.fl_flags = FL_POSIX | FL_ACCESS; 1207 fl.fl_flags = FL_POSIX | FL_ACCESS;
1204 if (filp && !(filp->f_flags & O_NONBLOCK)) 1208 if (filp && !(filp->f_flags & O_NONBLOCK))
1205 fl.fl_flags |= FL_SLEEP; 1209 sleep = true;
1206 fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; 1210 fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
1207 fl.fl_start = offset; 1211 fl.fl_start = offset;
1208 fl.fl_end = offset + count - 1; 1212 fl.fl_end = offset + count - 1;
1209 1213
1210 for (;;) { 1214 for (;;) {
1215 if (filp) {
1216 fl.fl_owner = (fl_owner_t)filp;
1217 fl.fl_flags &= ~FL_SLEEP;
1218 error = __posix_lock_file(inode, &fl, NULL);
1219 if (!error)
1220 break;
1221 }
1222
1223 if (sleep)
1224 fl.fl_flags |= FL_SLEEP;
1225 fl.fl_owner = current->files;
1211 error = __posix_lock_file(inode, &fl, NULL); 1226 error = __posix_lock_file(inode, &fl, NULL);
1212 if (error != FILE_LOCK_DEFERRED) 1227 if (error != FILE_LOCK_DEFERRED)
1213 break; 1228 break;
@@ -1472,6 +1487,32 @@ int fcntl_getlease(struct file *filp)
1472 return type; 1487 return type;
1473} 1488}
1474 1489
1490/**
1491 * check_conflicting_open - see if the given dentry points to a file that has
1492 * an existing open that would conflict with the
1493 * desired lease.
1494 * @dentry: dentry to check
1495 * @arg: type of lease that we're trying to acquire
1496 *
1497 * Check to see if there's an existing open fd on this file that would
1498 * conflict with the lease we're trying to set.
1499 */
1500static int
1501check_conflicting_open(const struct dentry *dentry, const long arg)
1502{
1503 int ret = 0;
1504 struct inode *inode = dentry->d_inode;
1505
1506 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1507 return -EAGAIN;
1508
1509 if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
1510 (atomic_read(&inode->i_count) > 1)))
1511 ret = -EAGAIN;
1512
1513 return ret;
1514}
1515
1475static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) 1516static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
1476{ 1517{
1477 struct file_lock *fl, **before, **my_before = NULL, *lease; 1518 struct file_lock *fl, **before, **my_before = NULL, *lease;
@@ -1499,12 +1540,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
1499 return -EINVAL; 1540 return -EINVAL;
1500 } 1541 }
1501 1542
1502 error = -EAGAIN; 1543 error = check_conflicting_open(dentry, arg);
1503 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1544 if (error)
1504 goto out;
1505 if ((arg == F_WRLCK)
1506 && ((d_count(dentry) > 1)
1507 || (atomic_read(&inode->i_count) > 1)))
1508 goto out; 1545 goto out;
1509 1546
1510 /* 1547 /*
@@ -1549,7 +1586,19 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
1549 goto out; 1586 goto out;
1550 1587
1551 locks_insert_lock(before, lease); 1588 locks_insert_lock(before, lease);
1552 error = 0; 1589 /*
1590 * The check in break_lease() is lockless. It's possible for another
1591 * open to race in after we did the earlier check for a conflicting
1592 * open but before the lease was inserted. Check again for a
1593 * conflicting open and cancel the lease if there is one.
1594 *
1595 * We also add a barrier here to ensure that the insertion of the lock
1596 * precedes these checks.
1597 */
1598 smp_mb();
1599 error = check_conflicting_open(dentry, arg);
1600 if (error)
1601 locks_unlink_lock(flp);
1553out: 1602out:
1554 if (is_deleg) 1603 if (is_deleg)
1555 mutex_unlock(&inode->i_mutex); 1604 mutex_unlock(&inode->i_mutex);
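The ordering argument, paraphrased: an opener makes itself visible (i_writecount, dentry/inode counts) before break_lease() reads i_flock, while the setter above publishes the lease before re-reading those counters, so at least one side must observe the other; the smp_mb() enforces the setter's half. Simplified opener side, along the lines of do_dentry_open():

	atomic_inc(&inode->i_writecount);	/* write open becomes visible ... */
	error = break_lease(inode, O_WRONLY);	/* ... before the lockless lease check */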
@@ -1842,7 +1891,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
1842 1891
1843static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) 1892static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
1844{ 1893{
1845 flock->l_pid = fl->fl_pid; 1894 flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid;
1846#if BITS_PER_LONG == 32 1895#if BITS_PER_LONG == 32
1847 /* 1896 /*
1848 * Make sure we can represent the posix lock via 1897 * Make sure we can represent the posix lock via
@@ -1864,7 +1913,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
1864#if BITS_PER_LONG == 32 1913#if BITS_PER_LONG == 32
1865static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) 1914static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
1866{ 1915{
1867 flock->l_pid = fl->fl_pid; 1916 flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid;
1868 flock->l_start = fl->fl_start; 1917 flock->l_start = fl->fl_start;
1869 flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : 1918 flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
1870 fl->fl_end - fl->fl_start + 1; 1919 fl->fl_end - fl->fl_start + 1;
@@ -1876,7 +1925,7 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
1876/* Report the first existing lock that would conflict with l. 1925/* Report the first existing lock that would conflict with l.
1877 * This implements the F_GETLK command of fcntl(). 1926 * This implements the F_GETLK command of fcntl().
1878 */ 1927 */
1879int fcntl_getlk(struct file *filp, struct flock __user *l) 1928int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
1880{ 1929{
1881 struct file_lock file_lock; 1930 struct file_lock file_lock;
1882 struct flock flock; 1931 struct flock flock;
@@ -1893,6 +1942,16 @@ int fcntl_getlk(struct file *filp, struct flock __user *l)
1893 if (error) 1942 if (error)
1894 goto out; 1943 goto out;
1895 1944
1945 if (cmd == F_GETLKP) {
1946 error = -EINVAL;
1947 if (flock.l_pid != 0)
1948 goto out;
1949
1950 cmd = F_GETLK;
1951 file_lock.fl_flags |= FL_FILE_PVT;
1952 file_lock.fl_owner = (fl_owner_t)filp;
1953 }
1954
1896 error = vfs_test_lock(filp, &file_lock); 1955 error = vfs_test_lock(filp, &file_lock);
1897 if (error) 1956 if (error)
1898 goto out; 1957 goto out;
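Userspace view of the new query command (sketch; F_GETLKP is from the patched uapi headers): l_pid must be zeroed on input, and per posix_lock_to_flock() below a conflicting file-private lock is reported back with l_pid == -1:

	struct flock fl = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,		/* test the whole file */
		.l_pid    = 0,		/* anything else gets -EINVAL */
	};

	if (fcntl(fd, F_GETLKP, &fl) == 0 && fl.l_type != F_UNLCK)
		printf("conflict: pid %d\n", (int)fl.l_pid);	/* -1 if file-private */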
@@ -2012,25 +2071,32 @@ again:
2012 error = flock_to_posix_lock(filp, file_lock, &flock); 2071 error = flock_to_posix_lock(filp, file_lock, &flock);
2013 if (error) 2072 if (error)
2014 goto out; 2073 goto out;
2015 if (cmd == F_SETLKW) { 2074
2016 file_lock->fl_flags |= FL_SLEEP; 2075 /*
2017 } 2076 * If the cmd is requesting file-private locks, then set the
2018 2077 * FL_FILE_PVT flag and override the owner.
2019 error = -EBADF; 2078 */
2020 switch (flock.l_type) { 2079 switch (cmd) {
2021 case F_RDLCK: 2080 case F_SETLKP:
2022 if (!(filp->f_mode & FMODE_READ)) 2081 error = -EINVAL;
2023 goto out; 2082 if (flock.l_pid != 0)
2024 break;
2025 case F_WRLCK:
2026 if (!(filp->f_mode & FMODE_WRITE))
2027 goto out; 2083 goto out;
2084
2085 cmd = F_SETLK;
2086 file_lock->fl_flags |= FL_FILE_PVT;
2087 file_lock->fl_owner = (fl_owner_t)filp;
2028 break; 2088 break;
2029 case F_UNLCK: 2089 case F_SETLKPW:
2030 break;
2031 default:
2032 error = -EINVAL; 2090 error = -EINVAL;
2033 goto out; 2091 if (flock.l_pid != 0)
2092 goto out;
2093
2094 cmd = F_SETLKW;
2095 file_lock->fl_flags |= FL_FILE_PVT;
2096 file_lock->fl_owner = (fl_owner_t)filp;
2097 /* Fallthrough */
2098 case F_SETLKW:
2099 file_lock->fl_flags |= FL_SLEEP;
2034 } 2100 }
2035 2101
2036 error = do_lock_file_wait(filp, cmd, file_lock); 2102 error = do_lock_file_wait(filp, cmd, file_lock);
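The matching acquisition from userspace (sketch): F_SETLKP/F_SETLKPW behave like F_SETLK/F_SETLKW except ownership follows the open file description (fl_owner = filp above), so locks taken on separate opens don't merge, and a lock is dropped only when its open file is finally closed, not on any close by the process:

	struct flock fl = {
		.l_type   = F_RDLCK,
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,
		.l_pid    = 0,		/* must be 0, or the kernel returns -EINVAL */
	};

	if (fcntl(fd, F_SETLKPW, &fl) == -1)	/* blocking variant, like F_SETLKW */
		perror("F_SETLKPW");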
@@ -2061,7 +2127,7 @@ out:
2061/* Report the first existing lock that would conflict with l. 2127/* Report the first existing lock that would conflict with l.
2062 * This implements the F_GETLK command of fcntl(). 2128 * This implements the F_GETLK command of fcntl().
2063 */ 2129 */
2064int fcntl_getlk64(struct file *filp, struct flock64 __user *l) 2130int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
2065{ 2131{
2066 struct file_lock file_lock; 2132 struct file_lock file_lock;
2067 struct flock64 flock; 2133 struct flock64 flock;
@@ -2078,6 +2144,16 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
2078 if (error) 2144 if (error)
2079 goto out; 2145 goto out;
2080 2146
2147 if (cmd == F_GETLKP) {
2148 error = -EINVAL;
2149 if (flock.l_pid != 0)
2150 goto out;
2151
2152 cmd = F_GETLK64;
2153 file_lock.fl_flags |= FL_FILE_PVT;
2154 file_lock.fl_owner = (fl_owner_t)filp;
2155 }
2156
2081 error = vfs_test_lock(filp, &file_lock); 2157 error = vfs_test_lock(filp, &file_lock);
2082 if (error) 2158 if (error)
2083 goto out; 2159 goto out;
@@ -2130,25 +2206,32 @@ again:
2130 error = flock64_to_posix_lock(filp, file_lock, &flock); 2206 error = flock64_to_posix_lock(filp, file_lock, &flock);
2131 if (error) 2207 if (error)
2132 goto out; 2208 goto out;
2133 if (cmd == F_SETLKW64) { 2209
2134 file_lock->fl_flags |= FL_SLEEP; 2210 /*
2135 } 2211 * If the cmd is requesting file-private locks, then set the
2136 2212 * FL_FILE_PVT flag and override the owner.
2137 error = -EBADF; 2213 */
2138 switch (flock.l_type) { 2214 switch (cmd) {
2139 case F_RDLCK: 2215 case F_SETLKP:
2140 if (!(filp->f_mode & FMODE_READ)) 2216 error = -EINVAL;
2141 goto out; 2217 if (flock.l_pid != 0)
2142 break;
2143 case F_WRLCK:
2144 if (!(filp->f_mode & FMODE_WRITE))
2145 goto out; 2218 goto out;
2219
2220 cmd = F_SETLK64;
2221 file_lock->fl_flags |= FL_FILE_PVT;
2222 file_lock->fl_owner = (fl_owner_t)filp;
2146 break; 2223 break;
2147 case F_UNLCK: 2224 case F_SETLKPW:
2148 break;
2149 default:
2150 error = -EINVAL; 2225 error = -EINVAL;
2151 goto out; 2226 if (flock.l_pid != 0)
2227 goto out;
2228
2229 cmd = F_SETLKW64;
2230 file_lock->fl_flags |= FL_FILE_PVT;
2231 file_lock->fl_owner = (fl_owner_t)filp;
2232 /* Fallthrough */
2233 case F_SETLKW64:
2234 file_lock->fl_flags |= FL_SLEEP;
2152 } 2235 }
2153 2236
2154 error = do_lock_file_wait(filp, cmd, file_lock); 2237 error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2209,7 +2292,7 @@ EXPORT_SYMBOL(locks_remove_posix);
2209/* 2292/*
2210 * This function is called on the last close of an open file. 2293 * This function is called on the last close of an open file.
2211 */ 2294 */
2212void locks_remove_flock(struct file *filp) 2295void locks_remove_file(struct file *filp)
2213{ 2296{
2214 struct inode * inode = file_inode(filp); 2297 struct inode * inode = file_inode(filp);
2215 struct file_lock *fl; 2298 struct file_lock *fl;
@@ -2218,6 +2301,8 @@ void locks_remove_flock(struct file *filp)
2218 if (!inode->i_flock) 2301 if (!inode->i_flock)
2219 return; 2302 return;
2220 2303
2304 locks_remove_posix(filp, (fl_owner_t)filp);
2305
2221 if (filp->f_op->flock) { 2306 if (filp->f_op->flock) {
2222 struct file_lock fl = { 2307 struct file_lock fl = {
2223 .fl_pid = current->tgid, 2308 .fl_pid = current->tgid,
@@ -2236,16 +2321,28 @@ void locks_remove_flock(struct file *filp)
2236 2321
2237 while ((fl = *before) != NULL) { 2322 while ((fl = *before) != NULL) {
2238 if (fl->fl_file == filp) { 2323 if (fl->fl_file == filp) {
2239 if (IS_FLOCK(fl)) {
2240 locks_delete_lock(before);
2241 continue;
2242 }
2243 if (IS_LEASE(fl)) { 2324 if (IS_LEASE(fl)) {
2244 lease_modify(before, F_UNLCK); 2325 lease_modify(before, F_UNLCK);
2245 continue; 2326 continue;
2246 } 2327 }
2247 /* What? */ 2328
2248 BUG(); 2329 /*
2330 * There's a leftover lock on the list of a type that
2331 * we didn't expect to see. Most likely a classic
2332 * POSIX lock that ended up not getting released
2333 * properly, or that raced onto the list somehow. Log
2334 * some info about it and then just remove it from
2335 * the list.
2336 */
2337 WARN(!IS_FLOCK(fl),
2338 "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
2339 MAJOR(inode->i_sb->s_dev),
2340 MINOR(inode->i_sb->s_dev), inode->i_ino,
2341 fl->fl_type, fl->fl_flags,
2342 fl->fl_start, fl->fl_end);
2343
2344 locks_delete_lock(before);
2345 continue;
2249 } 2346 }
2250 before = &fl->fl_next; 2347 before = &fl->fl_next;
2251 } 2348 }
@@ -2314,8 +2411,14 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2314 2411
2315 seq_printf(f, "%lld:%s ", id, pfx); 2412 seq_printf(f, "%lld:%s ", id, pfx);
2316 if (IS_POSIX(fl)) { 2413 if (IS_POSIX(fl)) {
2317 seq_printf(f, "%6s %s ", 2414 if (fl->fl_flags & FL_ACCESS)
2318 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", 2415 seq_printf(f, "ACCESS");
2416 else if (IS_FILE_PVT(fl))
2417 seq_printf(f, "FLPVT ");
2418 else
2419 seq_printf(f, "POSIX ");
2420
2421 seq_printf(f, " %s ",
2319 (inode == NULL) ? "*NOINODE*" : 2422 (inode == NULL) ? "*NOINODE*" :
2320 mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); 2423 mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
2321 } else if (IS_FLOCK(fl)) { 2424 } else if (IS_FLOCK(fl)) {
@@ -2385,6 +2488,7 @@ static int locks_show(struct seq_file *f, void *v)
2385} 2488}
2386 2489
2387static void *locks_start(struct seq_file *f, loff_t *pos) 2490static void *locks_start(struct seq_file *f, loff_t *pos)
2491 __acquires(&blocked_lock_lock)
2388{ 2492{
2389 struct locks_iterator *iter = f->private; 2493 struct locks_iterator *iter = f->private;
2390 2494
@@ -2403,6 +2507,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
2403} 2507}
2404 2508
2405static void locks_stop(struct seq_file *f, void *v) 2509static void locks_stop(struct seq_file *f, void *v)
2510 __releases(&blocked_lock_lock)
2406{ 2511{
2407 spin_unlock(&blocked_lock_lock); 2512 spin_unlock(&blocked_lock_lock);
2408 lg_global_unlock(&file_lock_lglock); 2513 lg_global_unlock(&file_lock_lglock);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 9a59cbade2fb..48140315f627 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2180,7 +2180,7 @@ void logfs_evict_inode(struct inode *inode)
2180 do_delete_inode(inode); 2180 do_delete_inode(inode);
2181 } 2181 }
2182 } 2182 }
2183 truncate_inode_pages(&inode->i_data, 0); 2183 truncate_inode_pages_final(&inode->i_data);
2184 clear_inode(inode); 2184 clear_inode(inode);
2185 2185
2186 /* Cheaper version of write_inode. All changes are concealed in 2186 /* Cheaper version of write_inode. All changes are concealed in
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e519e45bf673..bf166e388f0d 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -26,6 +26,41 @@
26 * back on the lru list. 26 * back on the lru list.
27 */ 27 */
28 28
29/*
30 * Lock descriptions and usage:
31 *
32 * Each hash chain of both the block and index hash tables now contains
33 * a built-in lock used to serialize accesses to the hash chain.
34 *
35 * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36 * are serialized via the global spinlock mb_cache_spinlock.
37 *
38 * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
39 * accesses to its local data, such as e_used and e_queued.
40 *
41 * Lock ordering:
42 *
43 * Each block hash chain's lock has the highest lock order, followed by an
44 * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
 45 * lock), and mb_cache_spinlock, with the lowest order. While holding
 46 * either a block or index hash chain lock, a thread can acquire an
 47 * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48 *
49 * Synchronization:
50 *
51 * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
 52 * index hash chains, they need to lock the corresponding hash chain. For each
 53 * mb_cache_entry within the chain, they need to lock the mb_cache_entry to
54 * prevent either any simultaneous release or free on the entry and also
55 * to serialize accesses to either the e_used or e_queued member of the entry.
56 *
57 * To avoid having a dangling reference to an already freed
58 * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
59 * block hash chain and also no longer being referenced, both e_used,
60 * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is
61 * first removed from a block hash chain.
62 */
63
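In code, the chain locks are hlist_bl bit spinlocks embedded in the hash heads, so the documented order reads as follows (sketch; no single path necessarily takes all four at once):

	hlist_bl_lock(ce->e_block_hash_p);	/* highest: block hash chain */
	hlist_bl_lock(ce->e_index_hash_p);	/* then the index hash chain */
	__spin_lock_mb_cache_entry(ce);		/* then the entry lock (mb_cache_bg_lock) */
	spin_lock(&mb_cache_spinlock);		/* lowest: global list lock */

	spin_unlock(&mb_cache_spinlock);
	__spin_unlock_mb_cache_entry(ce);
	hlist_bl_unlock(ce->e_index_hash_p);
	hlist_bl_unlock(ce->e_block_hash_p);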
29#include <linux/kernel.h> 64#include <linux/kernel.h>
30#include <linux/module.h> 65#include <linux/module.h>
31 66
@@ -34,9 +69,10 @@
34#include <linux/mm.h> 69#include <linux/mm.h>
35#include <linux/slab.h> 70#include <linux/slab.h>
36#include <linux/sched.h> 71#include <linux/sched.h>
37#include <linux/init.h> 72#include <linux/list_bl.h>
38#include <linux/mbcache.h> 73#include <linux/mbcache.h>
39 74#include <linux/init.h>
75#include <linux/blockgroup_lock.h>
40 76
41#ifdef MB_CACHE_DEBUG 77#ifdef MB_CACHE_DEBUG
42# define mb_debug(f...) do { \ 78# define mb_debug(f...) do { \
@@ -57,8 +93,14 @@
57 93
58#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) 94#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
59 95
96#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS)
97#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
98 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
99
60static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); 100static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
61 101static struct blockgroup_lock *mb_cache_bg_lock;
102static struct kmem_cache *mb_cache_kmem_cache;
103
62MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); 104MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
63MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); 105MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
64MODULE_LICENSE("GPL"); 106MODULE_LICENSE("GPL");
@@ -86,58 +128,110 @@ static LIST_HEAD(mb_cache_list);
86static LIST_HEAD(mb_cache_lru_list); 128static LIST_HEAD(mb_cache_lru_list);
87static DEFINE_SPINLOCK(mb_cache_spinlock); 129static DEFINE_SPINLOCK(mb_cache_spinlock);
88 130
131static inline void
132__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
133{
134 spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
135 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
136}
137
138static inline void
139__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
140{
141 spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
142 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
143}
144
89static inline int 145static inline int
90__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) 146__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
91{ 147{
92 return !list_empty(&ce->e_block_list); 148 return !hlist_bl_unhashed(&ce->e_block_list);
93} 149}
94 150
95 151
96static void 152static inline void
97__mb_cache_entry_unhash(struct mb_cache_entry *ce) 153__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
98{ 154{
99 if (__mb_cache_entry_is_hashed(ce)) { 155 if (__mb_cache_entry_is_block_hashed(ce))
100 list_del_init(&ce->e_block_list); 156 hlist_bl_del_init(&ce->e_block_list);
101 list_del(&ce->e_index.o_list);
102 }
103} 157}
104 158
159static inline int
160__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
161{
162 return !hlist_bl_unhashed(&ce->e_index.o_list);
163}
164
165static inline void
166__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
167{
168 if (__mb_cache_entry_is_index_hashed(ce))
169 hlist_bl_del_init(&ce->e_index.o_list);
170}
171
172/*
173 * __mb_cache_entry_unhash_unlock()
174 *
175 * This function is called to unhash the entry from both the block and
176 * index hash chains.
177 * It assumes both hash chains are locked upon entry,
178 * and unlocks both upon exit.
179 */
180static inline void
181__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
182{
183 __mb_cache_entry_unhash_index(ce);
184 hlist_bl_unlock(ce->e_index_hash_p);
185 __mb_cache_entry_unhash_block(ce);
186 hlist_bl_unlock(ce->e_block_hash_p);
187}
105 188
106static void 189static void
107__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) 190__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
108{ 191{
109 struct mb_cache *cache = ce->e_cache; 192 struct mb_cache *cache = ce->e_cache;
110 193
111 mb_assert(!(ce->e_used || ce->e_queued)); 194 mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
112 kmem_cache_free(cache->c_entry_cache, ce); 195 kmem_cache_free(cache->c_entry_cache, ce);
113 atomic_dec(&cache->c_entry_count); 196 atomic_dec(&cache->c_entry_count);
114} 197}
115 198
116
117static void 199static void
118__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) 200__mb_cache_entry_release(struct mb_cache_entry *ce)
119 __releases(mb_cache_spinlock)
120{ 201{
202 /* First lock the entry to serialize access to its local data. */
203 __spin_lock_mb_cache_entry(ce);
121 /* Wake up all processes queuing for this cache entry. */ 204 /* Wake up all processes queuing for this cache entry. */
122 if (ce->e_queued) 205 if (ce->e_queued)
123 wake_up_all(&mb_cache_queue); 206 wake_up_all(&mb_cache_queue);
124 if (ce->e_used >= MB_CACHE_WRITER) 207 if (ce->e_used >= MB_CACHE_WRITER)
125 ce->e_used -= MB_CACHE_WRITER; 208 ce->e_used -= MB_CACHE_WRITER;
209 /*
210 * Make sure that all cache entries on lru_list have
211 * both e_used and e_qued of 0s.
212 */
126 ce->e_used--; 213 ce->e_used--;
127 if (!(ce->e_used || ce->e_queued)) { 214 if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
128 if (!__mb_cache_entry_is_hashed(ce)) 215 if (!__mb_cache_entry_is_block_hashed(ce)) {
216 __spin_unlock_mb_cache_entry(ce);
129 goto forget; 217 goto forget;
130 mb_assert(list_empty(&ce->e_lru_list)); 218 }
131 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); 219 /*
 220 * Need access to the lru list; per the lock order above,
 221 * mb_cache_spinlock may be taken while holding the entry lock.
222 */
223 spin_lock(&mb_cache_spinlock);
224 if (list_empty(&ce->e_lru_list))
225 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
226 spin_unlock(&mb_cache_spinlock);
132 } 227 }
133 spin_unlock(&mb_cache_spinlock); 228 __spin_unlock_mb_cache_entry(ce);
134 return; 229 return;
135forget: 230forget:
136 spin_unlock(&mb_cache_spinlock); 231 mb_assert(list_empty(&ce->e_lru_list));
137 __mb_cache_entry_forget(ce, GFP_KERNEL); 232 __mb_cache_entry_forget(ce, GFP_KERNEL);
138} 233}
139 234
140
141/* 235/*
142 * mb_cache_shrink_scan() memory pressure callback 236 * mb_cache_shrink_scan() memory pressure callback
143 * 237 *
@@ -160,17 +254,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
160 254
161 mb_debug("trying to free %d entries", nr_to_scan); 255 mb_debug("trying to free %d entries", nr_to_scan);
162 spin_lock(&mb_cache_spinlock); 256 spin_lock(&mb_cache_spinlock);
163 while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { 257 while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
164 struct mb_cache_entry *ce = 258 struct mb_cache_entry *ce =
165 list_entry(mb_cache_lru_list.next, 259 list_entry(mb_cache_lru_list.next,
166 struct mb_cache_entry, e_lru_list); 260 struct mb_cache_entry, e_lru_list);
167 list_move_tail(&ce->e_lru_list, &free_list); 261 list_del_init(&ce->e_lru_list);
168 __mb_cache_entry_unhash(ce); 262 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
169 freed++; 263 continue;
264 spin_unlock(&mb_cache_spinlock);
265 /* Prevent any find or get operation on the entry */
266 hlist_bl_lock(ce->e_block_hash_p);
267 hlist_bl_lock(ce->e_index_hash_p);
268 /* Ignore if it is touched by a find/get */
269 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
270 !list_empty(&ce->e_lru_list)) {
271 hlist_bl_unlock(ce->e_index_hash_p);
272 hlist_bl_unlock(ce->e_block_hash_p);
273 spin_lock(&mb_cache_spinlock);
274 continue;
275 }
276 __mb_cache_entry_unhash_unlock(ce);
277 list_add_tail(&ce->e_lru_list, &free_list);
278 spin_lock(&mb_cache_spinlock);
170 } 279 }
171 spin_unlock(&mb_cache_spinlock); 280 spin_unlock(&mb_cache_spinlock);
281
172 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { 282 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
173 __mb_cache_entry_forget(entry, gfp_mask); 283 __mb_cache_entry_forget(entry, gfp_mask);
284 freed++;
174 } 285 }
175 return freed; 286 return freed;
176} 287}
@@ -215,29 +326,40 @@ mb_cache_create(const char *name, int bucket_bits)
215 int n, bucket_count = 1 << bucket_bits; 326 int n, bucket_count = 1 << bucket_bits;
216 struct mb_cache *cache = NULL; 327 struct mb_cache *cache = NULL;
217 328
329 if (!mb_cache_bg_lock) {
330 mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
331 GFP_KERNEL);
332 if (!mb_cache_bg_lock)
333 return NULL;
334 bgl_lock_init(mb_cache_bg_lock);
335 }
336
218 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); 337 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
219 if (!cache) 338 if (!cache)
220 return NULL; 339 return NULL;
221 cache->c_name = name; 340 cache->c_name = name;
222 atomic_set(&cache->c_entry_count, 0); 341 atomic_set(&cache->c_entry_count, 0);
223 cache->c_bucket_bits = bucket_bits; 342 cache->c_bucket_bits = bucket_bits;
224 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), 343 cache->c_block_hash = kmalloc(bucket_count *
225 GFP_KERNEL); 344 sizeof(struct hlist_bl_head), GFP_KERNEL);
226 if (!cache->c_block_hash) 345 if (!cache->c_block_hash)
227 goto fail; 346 goto fail;
228 for (n=0; n<bucket_count; n++) 347 for (n=0; n<bucket_count; n++)
229 INIT_LIST_HEAD(&cache->c_block_hash[n]); 348 INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
230 cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), 349 cache->c_index_hash = kmalloc(bucket_count *
231 GFP_KERNEL); 350 sizeof(struct hlist_bl_head), GFP_KERNEL);
232 if (!cache->c_index_hash) 351 if (!cache->c_index_hash)
233 goto fail; 352 goto fail;
234 for (n=0; n<bucket_count; n++) 353 for (n=0; n<bucket_count; n++)
235 INIT_LIST_HEAD(&cache->c_index_hash[n]); 354 INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
236 cache->c_entry_cache = kmem_cache_create(name, 355 if (!mb_cache_kmem_cache) {
237 sizeof(struct mb_cache_entry), 0, 356 mb_cache_kmem_cache = kmem_cache_create(name,
238 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 357 sizeof(struct mb_cache_entry), 0,
239 if (!cache->c_entry_cache) 358 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
240 goto fail2; 359 if (!mb_cache_kmem_cache)
360 goto fail2;
361 }
362 cache->c_entry_cache = mb_cache_kmem_cache;
241 363
242 /* 364 /*
243 * Set an upper limit on the number of cache entries so that the hash 365 * Set an upper limit on the number of cache entries so that the hash
@@ -273,21 +395,47 @@ void
273mb_cache_shrink(struct block_device *bdev) 395mb_cache_shrink(struct block_device *bdev)
274{ 396{
275 LIST_HEAD(free_list); 397 LIST_HEAD(free_list);
276 struct list_head *l, *ltmp; 398 struct list_head *l;
399 struct mb_cache_entry *ce, *tmp;
277 400
401 l = &mb_cache_lru_list;
278 spin_lock(&mb_cache_spinlock); 402 spin_lock(&mb_cache_spinlock);
279 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 403 while (!list_is_last(l, &mb_cache_lru_list)) {
280 struct mb_cache_entry *ce = 404 l = l->next;
281 list_entry(l, struct mb_cache_entry, e_lru_list); 405 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
282 if (ce->e_bdev == bdev) { 406 if (ce->e_bdev == bdev) {
283 list_move_tail(&ce->e_lru_list, &free_list); 407 list_del_init(&ce->e_lru_list);
284 __mb_cache_entry_unhash(ce); 408 if (ce->e_used || ce->e_queued ||
409 atomic_read(&ce->e_refcnt))
410 continue;
411 spin_unlock(&mb_cache_spinlock);
412 /*
413 * Prevent any find or get operation on the entry.
414 */
415 hlist_bl_lock(ce->e_block_hash_p);
416 hlist_bl_lock(ce->e_index_hash_p);
417 /* Ignore if it is touched by a find/get */
418 if (ce->e_used || ce->e_queued ||
419 atomic_read(&ce->e_refcnt) ||
420 !list_empty(&ce->e_lru_list)) {
421 hlist_bl_unlock(ce->e_index_hash_p);
422 hlist_bl_unlock(ce->e_block_hash_p);
423 l = &mb_cache_lru_list;
424 spin_lock(&mb_cache_spinlock);
425 continue;
426 }
427 __mb_cache_entry_unhash_unlock(ce);
428 mb_assert(!(ce->e_used || ce->e_queued ||
429 atomic_read(&ce->e_refcnt)));
430 list_add_tail(&ce->e_lru_list, &free_list);
431 l = &mb_cache_lru_list;
432 spin_lock(&mb_cache_spinlock);
285 } 433 }
286 } 434 }
287 spin_unlock(&mb_cache_spinlock); 435 spin_unlock(&mb_cache_spinlock);
288 list_for_each_safe(l, ltmp, &free_list) { 436
289 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 437 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
290 e_lru_list), GFP_KERNEL); 438 __mb_cache_entry_forget(ce, GFP_KERNEL);
291 } 439 }
292} 440}
293 441
@@ -303,23 +451,27 @@ void
303mb_cache_destroy(struct mb_cache *cache) 451mb_cache_destroy(struct mb_cache *cache)
304{ 452{
305 LIST_HEAD(free_list); 453 LIST_HEAD(free_list);
306 struct list_head *l, *ltmp; 454 struct mb_cache_entry *ce, *tmp;
307 455
308 spin_lock(&mb_cache_spinlock); 456 spin_lock(&mb_cache_spinlock);
309 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 457 list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
310 struct mb_cache_entry *ce = 458 if (ce->e_cache == cache)
311 list_entry(l, struct mb_cache_entry, e_lru_list);
312 if (ce->e_cache == cache) {
313 list_move_tail(&ce->e_lru_list, &free_list); 459 list_move_tail(&ce->e_lru_list, &free_list);
314 __mb_cache_entry_unhash(ce);
315 }
316 } 460 }
317 list_del(&cache->c_cache_list); 461 list_del(&cache->c_cache_list);
318 spin_unlock(&mb_cache_spinlock); 462 spin_unlock(&mb_cache_spinlock);
319 463
320 list_for_each_safe(l, ltmp, &free_list) { 464 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
321 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 465 list_del_init(&ce->e_lru_list);
322 e_lru_list), GFP_KERNEL); 466 /*
467 * Prevent any find or get operation on the entry.
468 */
469 hlist_bl_lock(ce->e_block_hash_p);
470 hlist_bl_lock(ce->e_index_hash_p);
471 mb_assert(!(ce->e_used || ce->e_queued ||
472 atomic_read(&ce->e_refcnt)));
473 __mb_cache_entry_unhash_unlock(ce);
474 __mb_cache_entry_forget(ce, GFP_KERNEL);
323 } 475 }
324 476
325 if (atomic_read(&cache->c_entry_count) > 0) { 477 if (atomic_read(&cache->c_entry_count) > 0) {
@@ -328,8 +480,10 @@ mb_cache_destroy(struct mb_cache *cache)
328 atomic_read(&cache->c_entry_count)); 480 atomic_read(&cache->c_entry_count));
329 } 481 }
330 482
331 kmem_cache_destroy(cache->c_entry_cache); 483 if (list_empty(&mb_cache_list)) {
332 484 kmem_cache_destroy(mb_cache_kmem_cache);
485 mb_cache_kmem_cache = NULL;
486 }
333 kfree(cache->c_index_hash); 487 kfree(cache->c_index_hash);
334 kfree(cache->c_block_hash); 488 kfree(cache->c_block_hash);
335 kfree(cache); 489 kfree(cache);
@@ -346,28 +500,61 @@ mb_cache_destroy(struct mb_cache *cache)
346struct mb_cache_entry * 500struct mb_cache_entry *
347mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 501mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
348{ 502{
349 struct mb_cache_entry *ce = NULL; 503 struct mb_cache_entry *ce;
350 504
351 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { 505 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
506 struct list_head *l;
507
508 l = &mb_cache_lru_list;
352 spin_lock(&mb_cache_spinlock); 509 spin_lock(&mb_cache_spinlock);
353 if (!list_empty(&mb_cache_lru_list)) { 510 while (!list_is_last(l, &mb_cache_lru_list)) {
354 ce = list_entry(mb_cache_lru_list.next, 511 l = l->next;
355 struct mb_cache_entry, e_lru_list); 512 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
356 list_del_init(&ce->e_lru_list); 513 if (ce->e_cache == cache) {
357 __mb_cache_entry_unhash(ce); 514 list_del_init(&ce->e_lru_list);
515 if (ce->e_used || ce->e_queued ||
516 atomic_read(&ce->e_refcnt))
517 continue;
518 spin_unlock(&mb_cache_spinlock);
519 /*
520 * Prevent any find or get operation on the
521 * entry.
522 */
523 hlist_bl_lock(ce->e_block_hash_p);
524 hlist_bl_lock(ce->e_index_hash_p);
525 /* Ignore if it is touched by a find/get */
526 if (ce->e_used || ce->e_queued ||
527 atomic_read(&ce->e_refcnt) ||
528 !list_empty(&ce->e_lru_list)) {
529 hlist_bl_unlock(ce->e_index_hash_p);
530 hlist_bl_unlock(ce->e_block_hash_p);
531 l = &mb_cache_lru_list;
532 spin_lock(&mb_cache_spinlock);
533 continue;
534 }
535 mb_assert(list_empty(&ce->e_lru_list));
536 mb_assert(!(ce->e_used || ce->e_queued ||
537 atomic_read(&ce->e_refcnt)));
538 __mb_cache_entry_unhash_unlock(ce);
539 goto found;
540 }
358 } 541 }
359 spin_unlock(&mb_cache_spinlock); 542 spin_unlock(&mb_cache_spinlock);
360 } 543 }
361 if (!ce) { 544
362 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 545 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
363 if (!ce) 546 if (!ce)
364 return NULL; 547 return NULL;
365 atomic_inc(&cache->c_entry_count); 548 atomic_inc(&cache->c_entry_count);
366 INIT_LIST_HEAD(&ce->e_lru_list); 549 INIT_LIST_HEAD(&ce->e_lru_list);
367 INIT_LIST_HEAD(&ce->e_block_list); 550 INIT_HLIST_BL_NODE(&ce->e_block_list);
368 ce->e_cache = cache; 551 INIT_HLIST_BL_NODE(&ce->e_index.o_list);
369 ce->e_queued = 0; 552 ce->e_cache = cache;
370 } 553 ce->e_queued = 0;
554 atomic_set(&ce->e_refcnt, 0);
555found:
556 ce->e_block_hash_p = &cache->c_block_hash[0];
557 ce->e_index_hash_p = &cache->c_index_hash[0];
371 ce->e_used = 1 + MB_CACHE_WRITER; 558 ce->e_used = 1 + MB_CACHE_WRITER;
372 return ce; 559 return ce;
373} 560}
@@ -393,29 +580,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
393{ 580{
394 struct mb_cache *cache = ce->e_cache; 581 struct mb_cache *cache = ce->e_cache;
395 unsigned int bucket; 582 unsigned int bucket;
396 struct list_head *l; 583 struct hlist_bl_node *l;
397 int error = -EBUSY; 584 struct hlist_bl_head *block_hash_p;
585 struct hlist_bl_head *index_hash_p;
586 struct mb_cache_entry *lce;
398 587
588 mb_assert(ce);
399 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 589 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
400 cache->c_bucket_bits); 590 cache->c_bucket_bits);
401 spin_lock(&mb_cache_spinlock); 591 block_hash_p = &cache->c_block_hash[bucket];
402 list_for_each_prev(l, &cache->c_block_hash[bucket]) { 592 hlist_bl_lock(block_hash_p);
403 struct mb_cache_entry *ce = 593 hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
404 list_entry(l, struct mb_cache_entry, e_block_list); 594 if (lce->e_bdev == bdev && lce->e_block == block) {
405 if (ce->e_bdev == bdev && ce->e_block == block) 595 hlist_bl_unlock(block_hash_p);
406 goto out; 596 return -EBUSY;
597 }
407 } 598 }
408 __mb_cache_entry_unhash(ce); 599 mb_assert(!__mb_cache_entry_is_block_hashed(ce));
600 __mb_cache_entry_unhash_block(ce);
601 __mb_cache_entry_unhash_index(ce);
409 ce->e_bdev = bdev; 602 ce->e_bdev = bdev;
410 ce->e_block = block; 603 ce->e_block = block;
411 list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); 604 ce->e_block_hash_p = block_hash_p;
412 ce->e_index.o_key = key; 605 ce->e_index.o_key = key;
606 hlist_bl_add_head(&ce->e_block_list, block_hash_p);
607 hlist_bl_unlock(block_hash_p);
413 bucket = hash_long(key, cache->c_bucket_bits); 608 bucket = hash_long(key, cache->c_bucket_bits);
414 list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); 609 index_hash_p = &cache->c_index_hash[bucket];
415 error = 0; 610 hlist_bl_lock(index_hash_p);
416out: 611 ce->e_index_hash_p = index_hash_p;
417 spin_unlock(&mb_cache_spinlock); 612 hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
418 return error; 613 hlist_bl_unlock(index_hash_p);
614 return 0;
419} 615}
420 616
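The hlist_bl_lock()/hlist_bl_unlock() calls above replace the single mb_cache_spinlock with a bit-spinlock folded into bit 0 of each bucket's head pointer, so every hash chain is serialized independently. Below is a minimal user-space sketch of that idea, assuming pointers are at least 2-byte aligned so bit 0 is free; the names are illustrative, not kernel API.

#include <stdatomic.h>
#include <stdint.h>

struct node { struct node *next; };

/* Bucket head: bit 0 doubles as the chain lock, as in hlist_bl. */
struct bl_head { _Atomic uintptr_t first; };

#define BL_LOCK ((uintptr_t)1)

static void bl_lock(struct bl_head *h)
{
        /* Spin until we are the thread that flips the lock bit on. */
        while (atomic_fetch_or_explicit(&h->first, BL_LOCK,
                                        memory_order_acquire) & BL_LOCK)
                ;
}

static void bl_unlock(struct bl_head *h)
{
        /* Clearing the bit releases every write made under the lock. */
        atomic_fetch_and_explicit(&h->first, ~BL_LOCK,
                                  memory_order_release);
}

static struct node *bl_first(struct bl_head *h)
{
        /* Mask the lock bit off to recover the real chain pointer. */
        return (struct node *)(atomic_load_explicit(&h->first,
                        memory_order_relaxed) & ~BL_LOCK);
}

int main(void)
{
        struct bl_head h = { 0 };

        bl_lock(&h);
        /* ... chain walks and edits would go here ... */
        bl_unlock(&h);
        return bl_first(&h) != 0;
}

Holders of such a lock spin rather than sleep, which is why the patched code always drops the bucket lock before anything that can block, such as the schedule() calls in the wait loops.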
421 617
@@ -429,24 +625,26 @@ out:
429void 625void
430mb_cache_entry_release(struct mb_cache_entry *ce) 626mb_cache_entry_release(struct mb_cache_entry *ce)
431{ 627{
432 spin_lock(&mb_cache_spinlock); 628 __mb_cache_entry_release(ce);
433 __mb_cache_entry_release_unlock(ce);
434} 629}
435 630
436 631
437/* 632/*
438 * mb_cache_entry_free() 633 * mb_cache_entry_free()
439 * 634 *
440 * This is equivalent to the sequence mb_cache_entry_takeout() --
441 * mb_cache_entry_release().
442 */ 635 */
443void 636void
444mb_cache_entry_free(struct mb_cache_entry *ce) 637mb_cache_entry_free(struct mb_cache_entry *ce)
445{ 638{
446 spin_lock(&mb_cache_spinlock); 639 mb_assert(ce);
447 mb_assert(list_empty(&ce->e_lru_list)); 640 mb_assert(list_empty(&ce->e_lru_list));
448 __mb_cache_entry_unhash(ce); 641 hlist_bl_lock(ce->e_index_hash_p);
449 __mb_cache_entry_release_unlock(ce); 642 __mb_cache_entry_unhash_index(ce);
643 hlist_bl_unlock(ce->e_index_hash_p);
644 hlist_bl_lock(ce->e_block_hash_p);
645 __mb_cache_entry_unhash_block(ce);
646 hlist_bl_unlock(ce->e_block_hash_p);
647 __mb_cache_entry_release(ce);
450} 648}
451 649
452 650
@@ -463,84 +661,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
463 sector_t block) 661 sector_t block)
464{ 662{
465 unsigned int bucket; 663 unsigned int bucket;
466 struct list_head *l; 664 struct hlist_bl_node *l;
467 struct mb_cache_entry *ce; 665 struct mb_cache_entry *ce;
666 struct hlist_bl_head *block_hash_p;
468 667
469 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 668 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
470 cache->c_bucket_bits); 669 cache->c_bucket_bits);
471 spin_lock(&mb_cache_spinlock); 670 block_hash_p = &cache->c_block_hash[bucket];
 472 list_for_each(l, &cache->c_block_hash[bucket]) { 671 /* First serialize access to the hash chain corresponding to the block. */
473 ce = list_entry(l, struct mb_cache_entry, e_block_list); 672 hlist_bl_lock(block_hash_p);
673 hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
674 mb_assert(ce->e_block_hash_p == block_hash_p);
474 if (ce->e_bdev == bdev && ce->e_block == block) { 675 if (ce->e_bdev == bdev && ce->e_block == block) {
475 DEFINE_WAIT(wait); 676 /*
677 * Prevent a free from removing the entry.
678 */
679 atomic_inc(&ce->e_refcnt);
680 hlist_bl_unlock(block_hash_p);
681 __spin_lock_mb_cache_entry(ce);
682 atomic_dec(&ce->e_refcnt);
683 if (ce->e_used > 0) {
684 DEFINE_WAIT(wait);
685 while (ce->e_used > 0) {
686 ce->e_queued++;
687 prepare_to_wait(&mb_cache_queue, &wait,
688 TASK_UNINTERRUPTIBLE);
689 __spin_unlock_mb_cache_entry(ce);
690 schedule();
691 __spin_lock_mb_cache_entry(ce);
692 ce->e_queued--;
693 }
694 finish_wait(&mb_cache_queue, &wait);
695 }
696 ce->e_used += 1 + MB_CACHE_WRITER;
697 __spin_unlock_mb_cache_entry(ce);
476 698
477 if (!list_empty(&ce->e_lru_list)) 699 if (!list_empty(&ce->e_lru_list)) {
700 spin_lock(&mb_cache_spinlock);
478 list_del_init(&ce->e_lru_list); 701 list_del_init(&ce->e_lru_list);
479
480 while (ce->e_used > 0) {
481 ce->e_queued++;
482 prepare_to_wait(&mb_cache_queue, &wait,
483 TASK_UNINTERRUPTIBLE);
484 spin_unlock(&mb_cache_spinlock); 702 spin_unlock(&mb_cache_spinlock);
485 schedule();
486 spin_lock(&mb_cache_spinlock);
487 ce->e_queued--;
488 } 703 }
489 finish_wait(&mb_cache_queue, &wait); 704 if (!__mb_cache_entry_is_block_hashed(ce)) {
490 ce->e_used += 1 + MB_CACHE_WRITER; 705 __mb_cache_entry_release(ce);
491
492 if (!__mb_cache_entry_is_hashed(ce)) {
493 __mb_cache_entry_release_unlock(ce);
494 return NULL; 706 return NULL;
495 } 707 }
496 goto cleanup; 708 return ce;
497 } 709 }
498 } 710 }
499 ce = NULL; 711 hlist_bl_unlock(block_hash_p);
500 712 return NULL;
501cleanup:
502 spin_unlock(&mb_cache_spinlock);
503 return ce;
504} 713}
505 714
506#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 715#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
507 716
508static struct mb_cache_entry * 717static struct mb_cache_entry *
509__mb_cache_entry_find(struct list_head *l, struct list_head *head, 718__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
510 struct block_device *bdev, unsigned int key) 719 struct block_device *bdev, unsigned int key)
511{ 720{
512 while (l != head) { 721
722 /* The index hash chain is alredy acquire by caller. */
723 while (l != NULL) {
513 struct mb_cache_entry *ce = 724 struct mb_cache_entry *ce =
514 list_entry(l, struct mb_cache_entry, e_index.o_list); 725 hlist_bl_entry(l, struct mb_cache_entry,
726 e_index.o_list);
727 mb_assert(ce->e_index_hash_p == head);
515 if (ce->e_bdev == bdev && ce->e_index.o_key == key) { 728 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
516 DEFINE_WAIT(wait); 729 /*
517 730 * Prevent a free from removing the entry.
518 if (!list_empty(&ce->e_lru_list)) 731 */
519 list_del_init(&ce->e_lru_list); 732 atomic_inc(&ce->e_refcnt);
520 733 hlist_bl_unlock(head);
734 __spin_lock_mb_cache_entry(ce);
735 atomic_dec(&ce->e_refcnt);
736 ce->e_used++;
521 /* Incrementing before holding the lock gives readers 737 /* Incrementing before holding the lock gives readers
522 priority over writers. */ 738 priority over writers. */
523 ce->e_used++; 739 if (ce->e_used >= MB_CACHE_WRITER) {
524 while (ce->e_used >= MB_CACHE_WRITER) { 740 DEFINE_WAIT(wait);
525 ce->e_queued++; 741
526 prepare_to_wait(&mb_cache_queue, &wait, 742 while (ce->e_used >= MB_CACHE_WRITER) {
527 TASK_UNINTERRUPTIBLE); 743 ce->e_queued++;
528 spin_unlock(&mb_cache_spinlock); 744 prepare_to_wait(&mb_cache_queue, &wait,
529 schedule(); 745 TASK_UNINTERRUPTIBLE);
530 spin_lock(&mb_cache_spinlock); 746 __spin_unlock_mb_cache_entry(ce);
531 ce->e_queued--; 747 schedule();
748 __spin_lock_mb_cache_entry(ce);
749 ce->e_queued--;
750 }
751 finish_wait(&mb_cache_queue, &wait);
532 } 752 }
533 finish_wait(&mb_cache_queue, &wait); 753 __spin_unlock_mb_cache_entry(ce);
534 754 if (!list_empty(&ce->e_lru_list)) {
535 if (!__mb_cache_entry_is_hashed(ce)) {
536 __mb_cache_entry_release_unlock(ce);
537 spin_lock(&mb_cache_spinlock); 755 spin_lock(&mb_cache_spinlock);
756 list_del_init(&ce->e_lru_list);
757 spin_unlock(&mb_cache_spinlock);
758 }
759 if (!__mb_cache_entry_is_block_hashed(ce)) {
760 __mb_cache_entry_release(ce);
538 return ERR_PTR(-EAGAIN); 761 return ERR_PTR(-EAGAIN);
539 } 762 }
540 return ce; 763 return ce;
541 } 764 }
542 l = l->next; 765 l = l->next;
543 } 766 }
767 hlist_bl_unlock(head);
544 return NULL; 768 return NULL;
545} 769}
546 770
@@ -562,13 +786,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
562 unsigned int key) 786 unsigned int key)
563{ 787{
564 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 788 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
565 struct list_head *l; 789 struct hlist_bl_node *l;
566 struct mb_cache_entry *ce; 790 struct mb_cache_entry *ce = NULL;
567 791 struct hlist_bl_head *index_hash_p;
568 spin_lock(&mb_cache_spinlock); 792
569 l = cache->c_index_hash[bucket].next; 793 index_hash_p = &cache->c_index_hash[bucket];
570 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 794 hlist_bl_lock(index_hash_p);
571 spin_unlock(&mb_cache_spinlock); 795 if (!hlist_bl_empty(index_hash_p)) {
796 l = hlist_bl_first(index_hash_p);
797 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
798 } else
799 hlist_bl_unlock(index_hash_p);
572 return ce; 800 return ce;
573} 801}
574 802
@@ -597,13 +825,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev,
597{ 825{
598 struct mb_cache *cache = prev->e_cache; 826 struct mb_cache *cache = prev->e_cache;
599 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 827 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
600 struct list_head *l; 828 struct hlist_bl_node *l;
601 struct mb_cache_entry *ce; 829 struct mb_cache_entry *ce;
830 struct hlist_bl_head *index_hash_p;
602 831
603 spin_lock(&mb_cache_spinlock); 832 index_hash_p = &cache->c_index_hash[bucket];
833 mb_assert(prev->e_index_hash_p == index_hash_p);
834 hlist_bl_lock(index_hash_p);
835 mb_assert(!hlist_bl_empty(index_hash_p));
604 l = prev->e_index.o_list.next; 836 l = prev->e_index.o_list.next;
605 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 837 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
606 __mb_cache_entry_release_unlock(prev); 838 __mb_cache_entry_release(prev);
607 return ce; 839 return ce;
608} 840}
609 841
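Both chains in the code above are picked with hash_long(), a multiplicative hash that keeps only the top c_bucket_bits bits of the product. The stand-alone sketch below mimics that bucket selection; the multiplier is the 64-bit golden-ratio constant current kernels use for hash_64() (the hash_long() of this era mixes with shifts and adds, so treat this as an approximation), and the inputs are fabricated.

#include <stdint.h>
#include <stdio.h>

/* Multiplicative bucket hash: multiply, then keep the top 'bits' bits. */
static unsigned int hash_bucket(uint64_t key, unsigned int bits)
{
        return (unsigned int)((key * 0x61c8864680b583ebULL) >> (64 - bits));
}

int main(void)
{
        uint64_t bdev  = 0xffff880012345678ULL;  /* fake pointer value */
        uint64_t block = 4711;                   /* fake block number  */

        /* Same mixing as the patch: pointer plus low 32 bits of the block. */
        printf("bucket = %u\n", hash_bucket(bdev + (block & 0xffffffff), 10));
        return 0;
}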
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 0332109162a5..f007a3355570 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -26,7 +26,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data);
26 26
27static void minix_evict_inode(struct inode *inode) 27static void minix_evict_inode(struct inode *inode)
28{ 28{
29 truncate_inode_pages(&inode->i_data, 0); 29 truncate_inode_pages_final(&inode->i_data);
30 if (!inode->i_nlink) { 30 if (!inode->i_nlink) {
31 inode->i_size = 0; 31 inode->i_size = 0;
32 minix_truncate(inode); 32 minix_truncate(inode);
@@ -86,7 +86,7 @@ static void init_once(void *foo)
86 inode_init_once(&ei->vfs_inode); 86 inode_init_once(&ei->vfs_inode);
87} 87}
88 88
89static int init_inodecache(void) 89static int __init init_inodecache(void)
90{ 90{
91 minix_inode_cachep = kmem_cache_create("minix_inode_cache", 91 minix_inode_cachep = kmem_cache_create("minix_inode_cache",
92 sizeof(struct minix_inode_info), 92 sizeof(struct minix_inode_info),
@@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
123 struct minix_sb_info * sbi = minix_sb(sb); 123 struct minix_sb_info * sbi = minix_sb(sb);
124 struct minix_super_block * ms; 124 struct minix_super_block * ms;
125 125
126 sync_filesystem(sb);
126 ms = sbi->s_ms; 127 ms = sbi->s_ms;
127 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 128 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
128 return 0; 129 return 0;
diff --git a/fs/namei.c b/fs/namei.c
index 4b491b431990..88339f59efb5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1796,7 +1796,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1796 if (err) 1796 if (err)
1797 return err; 1797 return err;
1798 } 1798 }
1799 if (!d_is_directory(nd->path.dentry)) { 1799 if (!d_can_lookup(nd->path.dentry)) {
1800 err = -ENOTDIR; 1800 err = -ENOTDIR;
1801 break; 1801 break;
1802 } 1802 }
@@ -1817,7 +1817,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1817 struct dentry *root = nd->root.dentry; 1817 struct dentry *root = nd->root.dentry;
1818 struct inode *inode = root->d_inode; 1818 struct inode *inode = root->d_inode;
1819 if (*name) { 1819 if (*name) {
1820 if (!d_is_directory(root)) 1820 if (!d_can_lookup(root))
1821 return -ENOTDIR; 1821 return -ENOTDIR;
1822 retval = inode_permission(inode, MAY_EXEC); 1822 retval = inode_permission(inode, MAY_EXEC);
1823 if (retval) 1823 if (retval)
@@ -1873,7 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1873 dentry = f.file->f_path.dentry; 1873 dentry = f.file->f_path.dentry;
1874 1874
1875 if (*name) { 1875 if (*name) {
1876 if (!d_is_directory(dentry)) { 1876 if (!d_can_lookup(dentry)) {
1877 fdput(f); 1877 fdput(f);
1878 return -ENOTDIR; 1878 return -ENOTDIR;
1879 } 1879 }
@@ -1955,7 +1955,7 @@ static int path_lookupat(int dfd, const char *name,
1955 err = complete_walk(nd); 1955 err = complete_walk(nd);
1956 1956
1957 if (!err && nd->flags & LOOKUP_DIRECTORY) { 1957 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1958 if (!d_is_directory(nd->path.dentry)) { 1958 if (!d_can_lookup(nd->path.dentry)) {
1959 path_put(&nd->path); 1959 path_put(&nd->path);
1960 err = -ENOTDIR; 1960 err = -ENOTDIR;
1961 } 1961 }
@@ -2414,11 +2414,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
2414 IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) 2414 IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
2415 return -EPERM; 2415 return -EPERM;
2416 if (isdir) { 2416 if (isdir) {
2417 if (!d_is_directory(victim) && !d_is_autodir(victim)) 2417 if (!d_is_dir(victim))
2418 return -ENOTDIR; 2418 return -ENOTDIR;
2419 if (IS_ROOT(victim)) 2419 if (IS_ROOT(victim))
2420 return -EBUSY; 2420 return -EBUSY;
2421 } else if (d_is_directory(victim) || d_is_autodir(victim)) 2421 } else if (d_is_dir(victim))
2422 return -EISDIR; 2422 return -EISDIR;
2423 if (IS_DEADDIR(dir)) 2423 if (IS_DEADDIR(dir))
2424 return -ENOENT; 2424 return -ENOENT;
@@ -2569,7 +2569,7 @@ static int handle_truncate(struct file *filp)
2569 /* 2569 /*
2570 * Refuse to truncate files with mandatory locks held on them. 2570 * Refuse to truncate files with mandatory locks held on them.
2571 */ 2571 */
2572 error = locks_verify_locked(inode); 2572 error = locks_verify_locked(filp);
2573 if (!error) 2573 if (!error)
2574 error = security_path_truncate(path); 2574 error = security_path_truncate(path);
2575 if (!error) { 2575 if (!error) {
@@ -3016,11 +3016,10 @@ finish_open:
3016 } 3016 }
3017 audit_inode(name, nd->path.dentry, 0); 3017 audit_inode(name, nd->path.dentry, 0);
3018 error = -EISDIR; 3018 error = -EISDIR;
3019 if ((open_flag & O_CREAT) && 3019 if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
3020 (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry)))
3021 goto out; 3020 goto out;
3022 error = -ENOTDIR; 3021 error = -ENOTDIR;
3023 if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry)) 3022 if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3024 goto out; 3023 goto out;
3025 if (!S_ISREG(nd->inode->i_mode)) 3024 if (!S_ISREG(nd->inode->i_mode))
3026 will_truncate = false; 3025 will_truncate = false;
@@ -3744,7 +3743,7 @@ exit1:
3744slashes: 3743slashes:
3745 if (d_is_negative(dentry)) 3744 if (d_is_negative(dentry))
3746 error = -ENOENT; 3745 error = -ENOENT;
3747 else if (d_is_directory(dentry) || d_is_autodir(dentry)) 3746 else if (d_is_dir(dentry))
3748 error = -EISDIR; 3747 error = -EISDIR;
3749 else 3748 else
3750 error = -ENOTDIR; 3749 error = -ENOTDIR;
@@ -3974,7 +3973,28 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
3974 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 3973 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
3975} 3974}
3976 3975
3977/* 3976/**
3977 * vfs_rename - rename a filesystem object
3978 * @old_dir: parent of source
3979 * @old_dentry: source
3980 * @new_dir: parent of destination
3981 * @new_dentry: destination
3982 * @delegated_inode: returns an inode needing a delegation break
3983 * @flags: rename flags
3984 *
 3985 * The caller must hold multiple mutexes--see lock_rename().
3986 *
3987 * If vfs_rename discovers a delegation in need of breaking at either
3988 * the source or destination, it will return -EWOULDBLOCK and return a
3989 * reference to the inode in delegated_inode. The caller should then
3990 * break the delegation and retry. Because breaking a delegation may
3991 * take a long time, the caller should drop all locks before doing
3992 * so.
3993 *
3994 * Alternatively, a caller may pass NULL for delegated_inode. This may
3995 * be appropriate for callers that expect the underlying filesystem not
3996 * to be NFS exported.
3997 *
3978 * The worst of all namespace operations - renaming directory. "Perverted" 3998 * The worst of all namespace operations - renaming directory. "Perverted"
3979 * doesn't even start to describe it. Somebody in UCB had a heck of a trip... 3999 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
3980 * Problems: 4000 * Problems:
@@ -4002,163 +4022,139 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
4002 * ->i_mutex on parents, which works but leads to some truly excessive 4022 * ->i_mutex on parents, which works but leads to some truly excessive
4003 * locking]. 4023 * locking].
4004 */ 4024 */
4005static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, 4025int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4006 struct inode *new_dir, struct dentry *new_dentry) 4026 struct inode *new_dir, struct dentry *new_dentry,
4027 struct inode **delegated_inode, unsigned int flags)
4007{ 4028{
4008 int error = 0; 4029 int error;
4030 bool is_dir = d_is_dir(old_dentry);
4031 const unsigned char *old_name;
4032 struct inode *source = old_dentry->d_inode;
4009 struct inode *target = new_dentry->d_inode; 4033 struct inode *target = new_dentry->d_inode;
4034 bool new_is_dir = false;
4010 unsigned max_links = new_dir->i_sb->s_max_links; 4035 unsigned max_links = new_dir->i_sb->s_max_links;
4011 4036
4037 if (source == target)
4038 return 0;
4039
4040 error = may_delete(old_dir, old_dentry, is_dir);
4041 if (error)
4042 return error;
4043
4044 if (!target) {
4045 error = may_create(new_dir, new_dentry);
4046 } else {
4047 new_is_dir = d_is_dir(new_dentry);
4048
4049 if (!(flags & RENAME_EXCHANGE))
4050 error = may_delete(new_dir, new_dentry, is_dir);
4051 else
4052 error = may_delete(new_dir, new_dentry, new_is_dir);
4053 }
4054 if (error)
4055 return error;
4056
4057 if (!old_dir->i_op->rename)
4058 return -EPERM;
4059
4060 if (flags && !old_dir->i_op->rename2)
4061 return -EINVAL;
4062
4012 /* 4063 /*
4013 * If we are going to change the parent - check write permissions, 4064 * If we are going to change the parent - check write permissions,
4014 * we'll need to flip '..'. 4065 * we'll need to flip '..'.
4015 */ 4066 */
4016 if (new_dir != old_dir) { 4067 if (new_dir != old_dir) {
4017 error = inode_permission(old_dentry->d_inode, MAY_WRITE); 4068 if (is_dir) {
4018 if (error) 4069 error = inode_permission(source, MAY_WRITE);
4019 return error; 4070 if (error)
4071 return error;
4072 }
4073 if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4074 error = inode_permission(target, MAY_WRITE);
4075 if (error)
4076 return error;
4077 }
4020 } 4078 }
4021 4079
4022 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 4080 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
4081 flags);
4023 if (error) 4082 if (error)
4024 return error; 4083 return error;
4025 4084
4085 old_name = fsnotify_oldname_init(old_dentry->d_name.name);
4026 dget(new_dentry); 4086 dget(new_dentry);
4027 if (target) 4087 if (!is_dir || (flags & RENAME_EXCHANGE))
4088 lock_two_nondirectories(source, target);
4089 else if (target)
4028 mutex_lock(&target->i_mutex); 4090 mutex_lock(&target->i_mutex);
4029 4091
4030 error = -EBUSY; 4092 error = -EBUSY;
4031 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) 4093 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
4032 goto out; 4094 goto out;
4033 4095
4034 error = -EMLINK; 4096 if (max_links && new_dir != old_dir) {
4035 if (max_links && !target && new_dir != old_dir && 4097 error = -EMLINK;
4036 new_dir->i_nlink >= max_links) 4098 if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4037 goto out; 4099 goto out;
4038 4100 if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
4039 if (target) 4101 old_dir->i_nlink >= max_links)
4102 goto out;
4103 }
4104 if (is_dir && !(flags & RENAME_EXCHANGE) && target)
4040 shrink_dcache_parent(new_dentry); 4105 shrink_dcache_parent(new_dentry);
4041 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 4106 if (!is_dir) {
4042 if (error) 4107 error = try_break_deleg(source, delegated_inode);
4043 goto out; 4108 if (error)
4044 4109 goto out;
4045 if (target) {
4046 target->i_flags |= S_DEAD;
4047 dont_mount(new_dentry);
4048 } 4110 }
4049out: 4111 if (target && !new_is_dir) {
4050 if (target)
4051 mutex_unlock(&target->i_mutex);
4052 dput(new_dentry);
4053 if (!error)
4054 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
4055 d_move(old_dentry,new_dentry);
4056 return error;
4057}
4058
4059static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
4060 struct inode *new_dir, struct dentry *new_dentry,
4061 struct inode **delegated_inode)
4062{
4063 struct inode *target = new_dentry->d_inode;
4064 struct inode *source = old_dentry->d_inode;
4065 int error;
4066
4067 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
4068 if (error)
4069 return error;
4070
4071 dget(new_dentry);
4072 lock_two_nondirectories(source, target);
4073
4074 error = -EBUSY;
4075 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
4076 goto out;
4077
4078 error = try_break_deleg(source, delegated_inode);
4079 if (error)
4080 goto out;
4081 if (target) {
4082 error = try_break_deleg(target, delegated_inode); 4112 error = try_break_deleg(target, delegated_inode);
4083 if (error) 4113 if (error)
4084 goto out; 4114 goto out;
4085 } 4115 }
4086 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 4116 if (!flags) {
4117 error = old_dir->i_op->rename(old_dir, old_dentry,
4118 new_dir, new_dentry);
4119 } else {
4120 error = old_dir->i_op->rename2(old_dir, old_dentry,
4121 new_dir, new_dentry, flags);
4122 }
4087 if (error) 4123 if (error)
4088 goto out; 4124 goto out;
4089 4125
4090 if (target) 4126 if (!(flags & RENAME_EXCHANGE) && target) {
4127 if (is_dir)
4128 target->i_flags |= S_DEAD;
4091 dont_mount(new_dentry); 4129 dont_mount(new_dentry);
4092 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 4130 }
4093 d_move(old_dentry, new_dentry); 4131 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4132 if (!(flags & RENAME_EXCHANGE))
4133 d_move(old_dentry, new_dentry);
4134 else
4135 d_exchange(old_dentry, new_dentry);
4136 }
4094out: 4137out:
4095 unlock_two_nondirectories(source, target); 4138 if (!is_dir || (flags & RENAME_EXCHANGE))
4139 unlock_two_nondirectories(source, target);
4140 else if (target)
4141 mutex_unlock(&target->i_mutex);
4096 dput(new_dentry); 4142 dput(new_dentry);
4097 return error; 4143 if (!error) {
4098}
4099
4100/**
4101 * vfs_rename - rename a filesystem object
4102 * @old_dir: parent of source
4103 * @old_dentry: source
4104 * @new_dir: parent of destination
4105 * @new_dentry: destination
4106 * @delegated_inode: returns an inode needing a delegation break
4107 *
4108 * The caller must hold multiple mutexes--see lock_rename().
4109 *
4110 * If vfs_rename discovers a delegation in need of breaking at either
4111 * the source or destination, it will return -EWOULDBLOCK and return a
4112 * reference to the inode in delegated_inode. The caller should then
4113 * break the delegation and retry. Because breaking a delegation may
4114 * take a long time, the caller should drop all locks before doing
4115 * so.
4116 *
4117 * Alternatively, a caller may pass NULL for delegated_inode. This may
4118 * be appropriate for callers that expect the underlying filesystem not
4119 * to be NFS exported.
4120 */
4121int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4122 struct inode *new_dir, struct dentry *new_dentry,
4123 struct inode **delegated_inode)
4124{
4125 int error;
4126 int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry);
4127 const unsigned char *old_name;
4128
4129 if (old_dentry->d_inode == new_dentry->d_inode)
4130 return 0;
4131
4132 error = may_delete(old_dir, old_dentry, is_dir);
4133 if (error)
4134 return error;
4135
4136 if (!new_dentry->d_inode)
4137 error = may_create(new_dir, new_dentry);
4138 else
4139 error = may_delete(new_dir, new_dentry, is_dir);
4140 if (error)
4141 return error;
4142
4143 if (!old_dir->i_op->rename)
4144 return -EPERM;
4145
4146 old_name = fsnotify_oldname_init(old_dentry->d_name.name);
4147
4148 if (is_dir)
4149 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
4150 else
4151 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode);
4152 if (!error)
4153 fsnotify_move(old_dir, new_dir, old_name, is_dir, 4144 fsnotify_move(old_dir, new_dir, old_name, is_dir,
4154 new_dentry->d_inode, old_dentry); 4145 !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
4146 if (flags & RENAME_EXCHANGE) {
4147 fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
4148 new_is_dir, NULL, new_dentry);
4149 }
4150 }
4155 fsnotify_oldname_free(old_name); 4151 fsnotify_oldname_free(old_name);
4156 4152
4157 return error; 4153 return error;
4158} 4154}
4159 4155
4160SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 4156SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
4161 int, newdfd, const char __user *, newname) 4157 int, newdfd, const char __user *, newname, unsigned int, flags)
4162{ 4158{
4163 struct dentry *old_dir, *new_dir; 4159 struct dentry *old_dir, *new_dir;
4164 struct dentry *old_dentry, *new_dentry; 4160 struct dentry *old_dentry, *new_dentry;
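The lock_two_nondirectories()/unlock_two_nondirectories() pairing that the unified vfs_rename() relies on above sidesteps ABBA deadlocks by taking the two inode mutexes in one fixed global order. Here is a generic pthread sketch of the same discipline, assuming address order as the tiebreak (which is what the kernel helper uses) and a NULL second argument when there is no target; all names are illustrative.

#include <pthread.h>
#include <stdint.h>

/* Acquire up to two locks in a stable global order so that two tasks
 * renaming the same pair of objects can never deadlock on each other. */
static void lock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_t *tmp;

        if (a == b)             /* vfs_rename() returns early for this case */
                b = NULL;
        if (a && b && (uintptr_t)a > (uintptr_t)b) {
                tmp = a;        /* order the pair by address */
                a = b;
                b = tmp;
        }
        if (a)
                pthread_mutex_lock(a);
        if (b)
                pthread_mutex_lock(b);
}

static void unlock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a)
                pthread_mutex_unlock(a);
        if (b && b != a)
                pthread_mutex_unlock(b);
}

Unlock order does not matter for deadlock avoidance; only the acquisition order has to be consistent across all callers.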
@@ -4170,6 +4166,13 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4170 unsigned int lookup_flags = 0; 4166 unsigned int lookup_flags = 0;
4171 bool should_retry = false; 4167 bool should_retry = false;
4172 int error; 4168 int error;
4169
4170 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
4171 return -EINVAL;
4172
4173 if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE))
4174 return -EINVAL;
4175
4173retry: 4176retry:
4174 from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); 4177 from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
4175 if (IS_ERR(from)) { 4178 if (IS_ERR(from)) {
@@ -4193,6 +4196,8 @@ retry:
4193 goto exit2; 4196 goto exit2;
4194 4197
4195 new_dir = newnd.path.dentry; 4198 new_dir = newnd.path.dentry;
4199 if (flags & RENAME_NOREPLACE)
4200 error = -EEXIST;
4196 if (newnd.last_type != LAST_NORM) 4201 if (newnd.last_type != LAST_NORM)
4197 goto exit2; 4202 goto exit2;
4198 4203
@@ -4202,7 +4207,8 @@ retry:
4202 4207
4203 oldnd.flags &= ~LOOKUP_PARENT; 4208 oldnd.flags &= ~LOOKUP_PARENT;
4204 newnd.flags &= ~LOOKUP_PARENT; 4209 newnd.flags &= ~LOOKUP_PARENT;
4205 newnd.flags |= LOOKUP_RENAME_TARGET; 4210 if (!(flags & RENAME_EXCHANGE))
4211 newnd.flags |= LOOKUP_RENAME_TARGET;
4206 4212
4207retry_deleg: 4213retry_deleg:
4208 trap = lock_rename(new_dir, old_dir); 4214 trap = lock_rename(new_dir, old_dir);
@@ -4215,34 +4221,49 @@ retry_deleg:
4215 error = -ENOENT; 4221 error = -ENOENT;
4216 if (d_is_negative(old_dentry)) 4222 if (d_is_negative(old_dentry))
4217 goto exit4; 4223 goto exit4;
4224 new_dentry = lookup_hash(&newnd);
4225 error = PTR_ERR(new_dentry);
4226 if (IS_ERR(new_dentry))
4227 goto exit4;
4228 error = -EEXIST;
4229 if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
4230 goto exit5;
4231 if (flags & RENAME_EXCHANGE) {
4232 error = -ENOENT;
4233 if (d_is_negative(new_dentry))
4234 goto exit5;
4235
4236 if (!d_is_dir(new_dentry)) {
4237 error = -ENOTDIR;
4238 if (newnd.last.name[newnd.last.len])
4239 goto exit5;
4240 }
4241 }
 4218 /* unless the source is a directory, trailing slashes give -ENOTDIR */ 4242 /* unless the source is a directory, trailing slashes give -ENOTDIR */
4219 if (!d_is_directory(old_dentry) && !d_is_autodir(old_dentry)) { 4243 if (!d_is_dir(old_dentry)) {
4220 error = -ENOTDIR; 4244 error = -ENOTDIR;
4221 if (oldnd.last.name[oldnd.last.len]) 4245 if (oldnd.last.name[oldnd.last.len])
4222 goto exit4; 4246 goto exit5;
4223 if (newnd.last.name[newnd.last.len]) 4247 if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
4224 goto exit4; 4248 goto exit5;
4225 } 4249 }
 4226 /* source should not be an ancestor of target */ 4250 /* source should not be an ancestor of target */
4227 error = -EINVAL; 4251 error = -EINVAL;
4228 if (old_dentry == trap) 4252 if (old_dentry == trap)
4229 goto exit4; 4253 goto exit5;
4230 new_dentry = lookup_hash(&newnd);
4231 error = PTR_ERR(new_dentry);
4232 if (IS_ERR(new_dentry))
4233 goto exit4;
4234 /* target should not be an ancestor of source */ 4254 /* target should not be an ancestor of source */
4235 error = -ENOTEMPTY; 4255 if (!(flags & RENAME_EXCHANGE))
4256 error = -ENOTEMPTY;
4236 if (new_dentry == trap) 4257 if (new_dentry == trap)
4237 goto exit5; 4258 goto exit5;
4238 4259
4239 error = security_path_rename(&oldnd.path, old_dentry, 4260 error = security_path_rename(&oldnd.path, old_dentry,
4240 &newnd.path, new_dentry); 4261 &newnd.path, new_dentry, flags);
4241 if (error) 4262 if (error)
4242 goto exit5; 4263 goto exit5;
4243 error = vfs_rename(old_dir->d_inode, old_dentry, 4264 error = vfs_rename(old_dir->d_inode, old_dentry,
4244 new_dir->d_inode, new_dentry, 4265 new_dir->d_inode, new_dentry,
4245 &delegated_inode); 4266 &delegated_inode, flags);
4246exit5: 4267exit5:
4247 dput(new_dentry); 4268 dput(new_dentry);
4248exit4: 4269exit4:
@@ -4272,9 +4293,15 @@ exit:
4272 return error; 4293 return error;
4273} 4294}
4274 4295
4296SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
4297 int, newdfd, const char __user *, newname)
4298{
4299 return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
4300}
4301
4275SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 4302SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
4276{ 4303{
4277 return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); 4304 return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
4278} 4305}
4279 4306
4280int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) 4307int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
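Wiring up sys_renameat2 as above exposes both new flags through a single entry point. A hedged user-space usage sketch follows, going through syscall(2) since glibc had no wrapper when this was merged; the flag values match the patch's uapi additions, and SYS_renameat2 is assumed to be present in your kernel headers.

#define _GNU_SOURCE
#include <fcntl.h>              /* AT_FDCWD */
#include <stdio.h>
#include <sys/syscall.h>        /* SYS_renameat2, assumed present */
#include <unistd.h>

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)       /* fail with EEXIST, never replace */
#endif
#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE  (1 << 1)       /* atomically swap the two paths */
#endif

static int my_renameat2(int olddfd, const char *oldpath,
                        int newdfd, const char *newpath, unsigned int flags)
{
        return syscall(SYS_renameat2, olddfd, oldpath, newdfd, newpath, flags);
}

int main(void)
{
        /* Refuses to clobber "b": fails with EEXIST if it already exists. */
        if (my_renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_NOREPLACE) < 0)
                perror("RENAME_NOREPLACE");

        /* Atomically swaps "b" and "c"; both must already exist. */
        if (my_renameat2(AT_FDCWD, "b", AT_FDCWD, "c", RENAME_EXCHANGE) < 0)
                perror("RENAME_EXCHANGE");

        /* Passing both flags together is rejected with EINVAL (see above). */
        return 0;
}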
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 2cf2ebecb55f..647d86d2db39 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -99,6 +99,7 @@ static void destroy_inodecache(void)
99 99
100static int ncp_remount(struct super_block *sb, int *flags, char* data) 100static int ncp_remount(struct super_block *sb, int *flags, char* data)
101{ 101{
102 sync_filesystem(sb);
102 *flags |= MS_NODIRATIME; 103 *flags |= MS_NODIRATIME;
103 return 0; 104 return 0;
104} 105}
@@ -296,7 +297,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
296static void 297static void
297ncp_evict_inode(struct inode *inode) 298ncp_evict_inode(struct inode *inode)
298{ 299{
299 truncate_inode_pages(&inode->i_data, 0); 300 truncate_inode_pages_final(&inode->i_data);
300 clear_inode(inode); 301 clear_inode(inode);
301 302
302 if (S_ISDIR(inode->i_mode)) { 303 if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 56ff823ca82e..65d849bdf77a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1213,7 +1213,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
1213 end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); 1213 end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
1214 if (end != NFS_I(inode)->npages) { 1214 if (end != NFS_I(inode)->npages) {
1215 rcu_read_lock(); 1215 rcu_read_lock();
1216 end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX); 1216 end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
1217 rcu_read_unlock(); 1217 rcu_read_unlock();
1218 } 1218 }
1219 1219
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index ae2e87b95453..41db5258e7a7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -112,7 +112,8 @@ out:
112 * TODO: keep track of all layouts (and delegations) in a hash table 112 * TODO: keep track of all layouts (and delegations) in a hash table
113 * hashed by filehandle. 113 * hashed by filehandle.
114 */ 114 */
115static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh) 115static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
116 struct nfs_fh *fh, nfs4_stateid *stateid)
116{ 117{
117 struct nfs_server *server; 118 struct nfs_server *server;
118 struct inode *ino; 119 struct inode *ino;
@@ -120,17 +121,19 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
120 121
121 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 122 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
122 list_for_each_entry(lo, &server->layouts, plh_layouts) { 123 list_for_each_entry(lo, &server->layouts, plh_layouts) {
124 if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
125 continue;
123 if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) 126 if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
124 continue; 127 continue;
125 ino = igrab(lo->plh_inode); 128 ino = igrab(lo->plh_inode);
126 if (!ino) 129 if (!ino)
127 continue; 130 break;
128 spin_lock(&ino->i_lock); 131 spin_lock(&ino->i_lock);
129 /* Is this layout in the process of being freed? */ 132 /* Is this layout in the process of being freed? */
130 if (NFS_I(ino)->layout != lo) { 133 if (NFS_I(ino)->layout != lo) {
131 spin_unlock(&ino->i_lock); 134 spin_unlock(&ino->i_lock);
132 iput(ino); 135 iput(ino);
133 continue; 136 break;
134 } 137 }
135 pnfs_get_layout_hdr(lo); 138 pnfs_get_layout_hdr(lo);
136 spin_unlock(&ino->i_lock); 139 spin_unlock(&ino->i_lock);
@@ -141,13 +144,14 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
141 return NULL; 144 return NULL;
142} 145}
143 146
144static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh) 147static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
148 struct nfs_fh *fh, nfs4_stateid *stateid)
145{ 149{
146 struct pnfs_layout_hdr *lo; 150 struct pnfs_layout_hdr *lo;
147 151
148 spin_lock(&clp->cl_lock); 152 spin_lock(&clp->cl_lock);
149 rcu_read_lock(); 153 rcu_read_lock();
150 lo = get_layout_by_fh_locked(clp, fh); 154 lo = get_layout_by_fh_locked(clp, fh, stateid);
151 rcu_read_unlock(); 155 rcu_read_unlock();
152 spin_unlock(&clp->cl_lock); 156 spin_unlock(&clp->cl_lock);
153 157
@@ -162,9 +166,9 @@ static u32 initiate_file_draining(struct nfs_client *clp,
162 u32 rv = NFS4ERR_NOMATCHING_LAYOUT; 166 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
163 LIST_HEAD(free_me_list); 167 LIST_HEAD(free_me_list);
164 168
165 lo = get_layout_by_fh(clp, &args->cbl_fh); 169 lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
166 if (!lo) 170 if (!lo)
167 return NFS4ERR_NOMATCHING_LAYOUT; 171 goto out;
168 172
169 ino = lo->plh_inode; 173 ino = lo->plh_inode;
170 spin_lock(&ino->i_lock); 174 spin_lock(&ino->i_lock);
@@ -179,6 +183,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
179 pnfs_free_lseg_list(&free_me_list); 183 pnfs_free_lseg_list(&free_me_list);
180 pnfs_put_layout_hdr(lo); 184 pnfs_put_layout_hdr(lo);
181 iput(ino); 185 iput(ino);
186out:
182 return rv; 187 return rv;
183} 188}
184 189
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a48fe4b84b6..d9f3d067cd15 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -69,21 +69,28 @@ const struct address_space_operations nfs_dir_aops = {
69 69
70static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) 70static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
71{ 71{
72 struct nfs_inode *nfsi = NFS_I(dir);
72 struct nfs_open_dir_context *ctx; 73 struct nfs_open_dir_context *ctx;
73 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 74 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
74 if (ctx != NULL) { 75 if (ctx != NULL) {
75 ctx->duped = 0; 76 ctx->duped = 0;
76 ctx->attr_gencount = NFS_I(dir)->attr_gencount; 77 ctx->attr_gencount = nfsi->attr_gencount;
77 ctx->dir_cookie = 0; 78 ctx->dir_cookie = 0;
78 ctx->dup_cookie = 0; 79 ctx->dup_cookie = 0;
79 ctx->cred = get_rpccred(cred); 80 ctx->cred = get_rpccred(cred);
81 spin_lock(&dir->i_lock);
82 list_add(&ctx->list, &nfsi->open_files);
83 spin_unlock(&dir->i_lock);
80 return ctx; 84 return ctx;
81 } 85 }
82 return ERR_PTR(-ENOMEM); 86 return ERR_PTR(-ENOMEM);
83} 87}
84 88
85static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) 89static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
86{ 90{
91 spin_lock(&dir->i_lock);
92 list_del(&ctx->list);
93 spin_unlock(&dir->i_lock);
87 put_rpccred(ctx->cred); 94 put_rpccred(ctx->cred);
88 kfree(ctx); 95 kfree(ctx);
89} 96}
@@ -126,7 +133,7 @@ out:
126static int 133static int
127nfs_closedir(struct inode *inode, struct file *filp) 134nfs_closedir(struct inode *inode, struct file *filp)
128{ 135{
129 put_nfs_open_dir_context(filp->private_data); 136 put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data);
130 return 0; 137 return 0;
131} 138}
132 139
@@ -306,10 +313,9 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
306 if (printk_ratelimit()) { 313 if (printk_ratelimit()) {
307 pr_notice("NFS: directory %pD2 contains a readdir loop." 314 pr_notice("NFS: directory %pD2 contains a readdir loop."
308 "Please contact your server vendor. " 315 "Please contact your server vendor. "
309 "The file: %s has duplicate cookie %llu\n", 316 "The file: %.*s has duplicate cookie %llu\n",
310 desc->file, 317 desc->file, array->array[i].string.len,
311 array->array[i].string.name, 318 array->array[i].string.name, *desc->dir_cookie);
312 *desc->dir_cookie);
313 } 319 }
314 status = -ELOOP; 320 status = -ELOOP;
315 goto out; 321 goto out;
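The duplicate-cookie message above moves from "%s" to "%.*s" because the entry name arrives from the server with an explicit length and is not guaranteed to be NUL-terminated; the precision argument caps how many bytes the format consumes. A minimal stand-alone illustration:

#include <stdio.h>

int main(void)
{
        const char buf[] = { 'f', 'i', 'l', 'e', 'X', 'Y' };  /* no '\0' */
        int len = 4;

        /* Prints "file"; printf reads exactly 4 bytes of buf. */
        printf("The file: %.*s has duplicate cookie %llu\n",
               len, buf, 42ULL);
        return 0;
}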
@@ -437,6 +443,22 @@ void nfs_advise_use_readdirplus(struct inode *dir)
437 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); 443 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
438} 444}
439 445
446/*
447 * This function is mainly for use by nfs_getattr().
448 *
449 * If this is an 'ls -l', we want to force use of readdirplus.
450 * Do this by checking if there is an active file descriptor
451 * and calling nfs_advise_use_readdirplus, then forcing a
452 * cache flush.
453 */
454void nfs_force_use_readdirplus(struct inode *dir)
455{
456 if (!list_empty(&NFS_I(dir)->open_files)) {
457 nfs_advise_use_readdirplus(dir);
458 nfs_zap_mapping(dir, dir->i_mapping);
459 }
460}
461
440static 462static
441void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) 463void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
442{ 464{
@@ -815,6 +837,17 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
815 goto out; 837 goto out;
816} 838}
817 839
840static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
841{
842 struct nfs_inode *nfsi = NFS_I(dir);
843
844 if (nfs_attribute_cache_expired(dir))
845 return true;
846 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
847 return true;
848 return false;
849}
850
818/* The file offset position represents the dirent entry number. A 851/* The file offset position represents the dirent entry number. A
819 last cookie cache takes care of the common case of reading the 852 last cookie cache takes care of the common case of reading the
820 whole directory. 853 whole directory.
@@ -847,7 +880,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
847 desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; 880 desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
848 881
849 nfs_block_sillyrename(dentry); 882 nfs_block_sillyrename(dentry);
850 if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) 883 if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
851 res = nfs_revalidate_mapping(inode, file->f_mapping); 884 res = nfs_revalidate_mapping(inode, file->f_mapping);
852 if (res < 0) 885 if (res < 0)
853 goto out; 886 goto out;
@@ -1911,6 +1944,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1911 struct inode *old_inode = old_dentry->d_inode; 1944 struct inode *old_inode = old_dentry->d_inode;
1912 struct inode *new_inode = new_dentry->d_inode; 1945 struct inode *new_inode = new_dentry->d_inode;
1913 struct dentry *dentry = NULL, *rehash = NULL; 1946 struct dentry *dentry = NULL, *rehash = NULL;
1947 struct rpc_task *task;
1914 int error = -EBUSY; 1948 int error = -EBUSY;
1915 1949
1916 dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", 1950 dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
@@ -1958,8 +1992,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1958 if (new_inode != NULL) 1992 if (new_inode != NULL)
1959 NFS_PROTO(new_inode)->return_delegation(new_inode); 1993 NFS_PROTO(new_inode)->return_delegation(new_inode);
1960 1994
1961 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, 1995 task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
1962 new_dir, &new_dentry->d_name); 1996 if (IS_ERR(task)) {
1997 error = PTR_ERR(task);
1998 goto out;
1999 }
2000
2001 error = rpc_wait_for_completion_task(task);
2002 if (error == 0)
2003 error = task->tk_status;
2004 rpc_put_task(task);
1963 nfs_mark_for_revalidate(old_inode); 2005 nfs_mark_for_revalidate(old_inode);
1964out: 2006out:
1965 if (rehash) 2007 if (rehash)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5bb790a69c71..284ca901fe16 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -617,6 +617,7 @@ out:
617 617
618static const struct vm_operations_struct nfs_file_vm_ops = { 618static const struct vm_operations_struct nfs_file_vm_ops = {
619 .fault = filemap_fault, 619 .fault = filemap_fault,
620 .map_pages = filemap_map_pages,
620 .page_mkwrite = nfs_vm_page_mkwrite, 621 .page_mkwrite = nfs_vm_page_mkwrite,
621 .remap_pages = generic_file_remap_pages, 622 .remap_pages = generic_file_remap_pages,
622}; 623};
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 360114ae8b82..0c438973f3c8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(nfs_clear_inode);
128 128
129void nfs_evict_inode(struct inode *inode) 129void nfs_evict_inode(struct inode *inode)
130{ 130{
131 truncate_inode_pages(&inode->i_data, 0); 131 truncate_inode_pages_final(&inode->i_data);
132 clear_inode(inode); 132 clear_inode(inode);
133 nfs_clear_inode(inode); 133 nfs_clear_inode(inode);
134} 134}
@@ -588,6 +588,25 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
588} 588}
589EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); 589EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
590 590
591static void nfs_request_parent_use_readdirplus(struct dentry *dentry)
592{
593 struct dentry *parent;
594
595 parent = dget_parent(dentry);
596 nfs_force_use_readdirplus(parent->d_inode);
597 dput(parent);
598}
599
600static bool nfs_need_revalidate_inode(struct inode *inode)
601{
602 if (NFS_I(inode)->cache_validity &
603 (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
604 return true;
605 if (nfs_attribute_cache_expired(inode))
606 return true;
607 return false;
608}
609
591int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 610int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
592{ 611{
593 struct inode *inode = dentry->d_inode; 612 struct inode *inode = dentry->d_inode;
@@ -616,10 +635,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
616 ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 635 ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
617 need_atime = 0; 636 need_atime = 0;
618 637
619 if (need_atime) 638 if (need_atime || nfs_need_revalidate_inode(inode)) {
620 err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 639 struct nfs_server *server = NFS_SERVER(inode);
621 else 640
622 err = nfs_revalidate_inode(NFS_SERVER(inode), inode); 641 if (server->caps & NFS_CAP_READDIRPLUS)
642 nfs_request_parent_use_readdirplus(dentry);
643 err = __nfs_revalidate_inode(server, inode);
644 }
623 if (!err) { 645 if (!err) {
624 generic_fillattr(inode, stat); 646 generic_fillattr(inode, stat);
625 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); 647 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
@@ -961,9 +983,7 @@ int nfs_attribute_cache_expired(struct inode *inode)
961 */ 983 */
962int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 984int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
963{ 985{
964 if (!(NFS_I(inode)->cache_validity & 986 if (!nfs_need_revalidate_inode(inode))
965 (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
966 && !nfs_attribute_cache_expired(inode))
967 return NFS_STALE(inode) ? -ESTALE : 0; 987 return NFS_STALE(inode) ? -ESTALE : 0;
968 return __nfs_revalidate_inode(server, inode); 988 return __nfs_revalidate_inode(server, inode);
969} 989}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b46cf5a67329..dd8bfc2e2464 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -301,6 +301,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
301 const char *ip_addr); 301 const char *ip_addr);
302 302
303/* dir.c */ 303/* dir.c */
304extern void nfs_force_use_readdirplus(struct inode *dir);
304extern unsigned long nfs_access_cache_count(struct shrinker *shrink, 305extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
305 struct shrink_control *sc); 306 struct shrink_control *sc);
306extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, 307extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
@@ -474,6 +475,13 @@ extern int nfs_migrate_page(struct address_space *,
474#define nfs_migrate_page NULL 475#define nfs_migrate_page NULL
475#endif 476#endif
476 477
478/* unlink.c */
479extern struct rpc_task *
480nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
481 struct dentry *old_dentry, struct dentry *new_dentry,
482 void (*complete)(struct rpc_task *, struct nfs_renamedata *));
483extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
484
477/* direct.c */ 485/* direct.c */
478void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, 486void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
479 struct nfs_direct_req *dreq); 487 struct nfs_direct_req *dreq);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a462ef0fb5d6..db60149c4579 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -479,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
479} 479}
480 480
481static int 481static int
482nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
483 struct inode *new_dir, struct qstr *new_name)
484{
485 struct nfs_renameargs arg = {
486 .old_dir = NFS_FH(old_dir),
487 .old_name = old_name,
488 .new_dir = NFS_FH(new_dir),
489 .new_name = new_name,
490 };
491 struct nfs_renameres res;
492 struct rpc_message msg = {
493 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
494 .rpc_argp = &arg,
495 .rpc_resp = &res,
496 };
497 int status = -ENOMEM;
498
499 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
500
501 res.old_fattr = nfs_alloc_fattr();
502 res.new_fattr = nfs_alloc_fattr();
503 if (res.old_fattr == NULL || res.new_fattr == NULL)
504 goto out;
505
506 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
507 nfs_post_op_update_inode(old_dir, res.old_fattr);
508 nfs_post_op_update_inode(new_dir, res.new_fattr);
509out:
510 nfs_free_fattr(res.old_fattr);
511 nfs_free_fattr(res.new_fattr);
512 dprintk("NFS reply rename: %d\n", status);
513 return status;
514}
515
516static int
517nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 482nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
518{ 483{
519 struct nfs3_linkargs arg = { 484 struct nfs3_linkargs arg = {
@@ -968,7 +933,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
968 .unlink_setup = nfs3_proc_unlink_setup, 933 .unlink_setup = nfs3_proc_unlink_setup,
969 .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare, 934 .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
970 .unlink_done = nfs3_proc_unlink_done, 935 .unlink_done = nfs3_proc_unlink_done,
971 .rename = nfs3_proc_rename,
972 .rename_setup = nfs3_proc_rename_setup, 936 .rename_setup = nfs3_proc_rename_setup,
973 .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare, 937 .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
974 .rename_done = nfs3_proc_rename_done, 938 .rename_done = nfs3_proc_rename_done,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a5b27c2d9689..e1d1badbe53c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -427,6 +427,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
427extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 427extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
428extern void nfs_inode_find_state_and_recover(struct inode *inode, 428extern void nfs_inode_find_state_and_recover(struct inode *inode,
429 const nfs4_stateid *stateid); 429 const nfs4_stateid *stateid);
430extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *);
430extern void nfs4_schedule_lease_recovery(struct nfs_client *); 431extern void nfs4_schedule_lease_recovery(struct nfs_client *);
431extern int nfs4_wait_clnt_recover(struct nfs_client *clp); 432extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
432extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); 433extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
@@ -500,6 +501,16 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei
500 return memcmp(dst, src, sizeof(*dst)) == 0; 501 return memcmp(dst, src, sizeof(*dst)) == 0;
501} 502}
502 503
504static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
505{
506 return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0;
507}
508
509static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2)
510{
511 return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
512}
513
503static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) 514static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
504{ 515{
505 return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; 516 return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
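The nfs4_stateid_is_newer() helper added above compares sequence ids with wrap-safe serial-number arithmetic: subtract, cast the difference to signed 32-bit, and test the sign (the kernel converts from big-endian first, omitted here). A self-contained check of the idiom:

#include <stdint.h>
#include <stdio.h>

/* True if s1 is "ahead of" s2, even across a 32-bit wraparound. */
static int is_newer(uint32_t s1, uint32_t s2)
{
        return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
        printf("%d\n", is_newer(2, 1));          /* 1: plainly newer      */
        printf("%d\n", is_newer(1, 2));          /* 0: older              */
        printf("%d\n", is_newer(0, 0xffffffff)); /* 1: newer across wrap  */
        return 0;
}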
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0e46d3d1b6cc..aa9ef4876046 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -531,6 +531,13 @@ int nfs40_walk_client_list(struct nfs_client *new,
531 *result = pos; 531 *result = pos;
532 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", 532 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
533 __func__, pos, atomic_read(&pos->cl_count)); 533 __func__, pos, atomic_read(&pos->cl_count));
534 goto out;
535 case -ERESTARTSYS:
536 case -ETIMEDOUT:
537 /* The callback path may have been inadvertently
538 * changed. Schedule recovery!
539 */
540 nfs4_schedule_path_down_recovery(pos);
534 default: 541 default:
535 goto out; 542 goto out;
536 } 543 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 450bfedbe2f4..397be39c6dc8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1068,6 +1068,7 @@ static void nfs4_opendata_free(struct kref *kref)
1068 dput(p->dentry); 1068 dput(p->dentry);
1069 nfs_sb_deactive(sb); 1069 nfs_sb_deactive(sb);
1070 nfs_fattr_free_names(&p->f_attr); 1070 nfs_fattr_free_names(&p->f_attr);
1071 kfree(p->f_attr.mdsthreshold);
1071 kfree(p); 1072 kfree(p);
1072} 1073}
1073 1074
@@ -1137,12 +1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
1137 nfs4_state_set_mode_locked(state, state->state | fmode); 1138 nfs4_state_set_mode_locked(state, state->state | fmode);
1138} 1139}
1139 1140
1140static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) 1141static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
1142{
1143 struct nfs_client *clp = state->owner->so_server->nfs_client;
1144 bool need_recover = false;
1145
1146 if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly)
1147 need_recover = true;
1148 if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly)
1149 need_recover = true;
1150 if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr)
1151 need_recover = true;
1152 if (need_recover)
1153 nfs4_state_mark_reclaim_nograce(clp, state);
1154}
1155
1156static bool nfs_need_update_open_stateid(struct nfs4_state *state,
1157 nfs4_stateid *stateid)
1158{
1159 if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
1160 return true;
1161 if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
1162 nfs_test_and_clear_all_open_stateid(state);
1163 return true;
1164 }
1165 if (nfs4_stateid_is_newer(stateid, &state->open_stateid))
1166 return true;
1167 return false;
1168}
1169
1170static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1171 nfs4_stateid *stateid, fmode_t fmode)
1141{ 1172{
1173 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1174 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
1175 case FMODE_WRITE:
1176 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1177 break;
1178 case FMODE_READ:
1179 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1180 break;
1181 case 0:
1182 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1183 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1184 clear_bit(NFS_OPEN_STATE, &state->flags);
1185 }
1186 if (stateid == NULL)
1187 return;
1188 if (!nfs_need_update_open_stateid(state, stateid))
1189 return;
1142 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 1190 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1143 nfs4_stateid_copy(&state->stateid, stateid); 1191 nfs4_stateid_copy(&state->stateid, stateid);
1144 nfs4_stateid_copy(&state->open_stateid, stateid); 1192 nfs4_stateid_copy(&state->open_stateid, stateid);
1145 set_bit(NFS_OPEN_STATE, &state->flags); 1193}
1194
1195static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1196{
1197 write_seqlock(&state->seqlock);
1198 nfs_clear_open_stateid_locked(state, stateid, fmode);
1199 write_sequnlock(&state->seqlock);
1200 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
1201 nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
1202}
1203
1204static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1205{
1146 switch (fmode) { 1206 switch (fmode) {
1147 case FMODE_READ: 1207 case FMODE_READ:
1148 set_bit(NFS_O_RDONLY_STATE, &state->flags); 1208 set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -1153,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
1153 case FMODE_READ|FMODE_WRITE: 1213 case FMODE_READ|FMODE_WRITE:
1154 set_bit(NFS_O_RDWR_STATE, &state->flags); 1214 set_bit(NFS_O_RDWR_STATE, &state->flags);
1155 } 1215 }
1156} 1216 if (!nfs_need_update_open_stateid(state, stateid))
1157 1217 return;
1158static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) 1218 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1159{ 1219 nfs4_stateid_copy(&state->stateid, stateid);
1160 write_seqlock(&state->seqlock); 1220 nfs4_stateid_copy(&state->open_stateid, stateid);
1161 nfs_set_open_stateid_locked(state, stateid, fmode);
1162 write_sequnlock(&state->seqlock);
1163} 1221}
1164 1222
1165static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) 1223static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
@@ -1217,6 +1275,8 @@ no_delegation:
1217 __update_open_stateid(state, open_stateid, NULL, fmode); 1275 __update_open_stateid(state, open_stateid, NULL, fmode);
1218 ret = 1; 1276 ret = 1;
1219 } 1277 }
1278 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
1279 nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
1220 1280
1221 return ret; 1281 return ret;
1222} 1282}
@@ -1450,12 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1450 struct nfs4_state *newstate; 1510 struct nfs4_state *newstate;
1451 int ret; 1511 int ret;
1452 1512
1513 /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
1514 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1515 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1516 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1453 /* memory barrier prior to reading state->n_* */ 1517 /* memory barrier prior to reading state->n_* */
1454 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1518 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1455 clear_bit(NFS_OPEN_STATE, &state->flags); 1519 clear_bit(NFS_OPEN_STATE, &state->flags);
1456 smp_rmb(); 1520 smp_rmb();
1457 if (state->n_rdwr != 0) { 1521 if (state->n_rdwr != 0) {
1458 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1459 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); 1522 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
1460 if (ret != 0) 1523 if (ret != 0)
1461 return ret; 1524 return ret;
@@ -1463,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1463 return -ESTALE; 1526 return -ESTALE;
1464 } 1527 }
1465 if (state->n_wronly != 0) { 1528 if (state->n_wronly != 0) {
1466 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1467 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); 1529 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
1468 if (ret != 0) 1530 if (ret != 0)
1469 return ret; 1531 return ret;
@@ -1471,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1471 return -ESTALE; 1533 return -ESTALE;
1472 } 1534 }
1473 if (state->n_rdonly != 0) { 1535 if (state->n_rdonly != 0) {
1474 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1475 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); 1536 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
1476 if (ret != 0) 1537 if (ret != 0)
1477 return ret; 1538 return ret;
@@ -2244,10 +2305,12 @@ static int _nfs4_do_open(struct inode *dir,
2244 } 2305 }
2245 } 2306 }
2246 2307
2247 if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { 2308 if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
2248 opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); 2309 if (!opendata->f_attr.mdsthreshold) {
2249 if (!opendata->f_attr.mdsthreshold) 2310 opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
2250 goto err_free_label; 2311 if (!opendata->f_attr.mdsthreshold)
2312 goto err_free_label;
2313 }
2251 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; 2314 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
2252 } 2315 }
2253 if (dentry->d_inode != NULL) 2316 if (dentry->d_inode != NULL)
@@ -2275,11 +2338,10 @@ static int _nfs4_do_open(struct inode *dir,
2275 if (opendata->file_created) 2338 if (opendata->file_created)
2276 *opened |= FILE_CREATED; 2339 *opened |= FILE_CREATED;
2277 2340
2278 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) 2341 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
2279 *ctx_th = opendata->f_attr.mdsthreshold; 2342 *ctx_th = opendata->f_attr.mdsthreshold;
2280 else 2343 opendata->f_attr.mdsthreshold = NULL;
2281 kfree(opendata->f_attr.mdsthreshold); 2344 }
2282 opendata->f_attr.mdsthreshold = NULL;
2283 2345
2284 nfs4_label_free(olabel); 2346 nfs4_label_free(olabel);
2285 2347
@@ -2289,7 +2351,6 @@ static int _nfs4_do_open(struct inode *dir,
2289err_free_label: 2351err_free_label:
2290 nfs4_label_free(olabel); 2352 nfs4_label_free(olabel);
2291err_opendata_put: 2353err_opendata_put:
2292 kfree(opendata->f_attr.mdsthreshold);
2293 nfs4_opendata_put(opendata); 2354 nfs4_opendata_put(opendata);
2294err_put_state_owner: 2355err_put_state_owner:
2295 nfs4_put_state_owner(sp); 2356 nfs4_put_state_owner(sp);
@@ -2479,26 +2540,6 @@ static void nfs4_free_closedata(void *data)
2479 kfree(calldata); 2540 kfree(calldata);
2480} 2541}
2481 2542
2482static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
2483 fmode_t fmode)
2484{
2485 spin_lock(&state->owner->so_lock);
2486 clear_bit(NFS_O_RDWR_STATE, &state->flags);
2487 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
2488 case FMODE_WRITE:
2489 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
2490 break;
2491 case FMODE_READ:
2492 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
2493 break;
2494 case 0:
2495 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
2496 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
2497 clear_bit(NFS_OPEN_STATE, &state->flags);
2498 }
2499 spin_unlock(&state->owner->so_lock);
2500}
2501
2502static void nfs4_close_done(struct rpc_task *task, void *data) 2543static void nfs4_close_done(struct rpc_task *task, void *data)
2503{ 2544{
2504 struct nfs4_closedata *calldata = data; 2545 struct nfs4_closedata *calldata = data;
@@ -2517,9 +2558,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2517 if (calldata->roc) 2558 if (calldata->roc)
2518 pnfs_roc_set_barrier(state->inode, 2559 pnfs_roc_set_barrier(state->inode,
2519 calldata->roc_barrier); 2560 calldata->roc_barrier);
2520 nfs_set_open_stateid(state, &calldata->res.stateid, 0); 2561 nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
2521 renew_lease(server, calldata->timestamp); 2562 renew_lease(server, calldata->timestamp);
2522 break; 2563 goto out_release;
2523 case -NFS4ERR_ADMIN_REVOKED: 2564 case -NFS4ERR_ADMIN_REVOKED:
2524 case -NFS4ERR_STALE_STATEID: 2565 case -NFS4ERR_STALE_STATEID:
2525 case -NFS4ERR_OLD_STATEID: 2566 case -NFS4ERR_OLD_STATEID:
@@ -2533,7 +2574,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2533 goto out_release; 2574 goto out_release;
2534 } 2575 }
2535 } 2576 }
2536 nfs4_close_clear_stateid_flags(state, calldata->arg.fmode); 2577 nfs_clear_open_stateid(state, NULL, calldata->arg.fmode);
2537out_release: 2578out_release:
2538 nfs_release_seqid(calldata->arg.seqid); 2579 nfs_release_seqid(calldata->arg.seqid);
2539 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 2580 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -3507,49 +3548,6 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
3507 return 1; 3548 return 1;
3508} 3549}
3509 3550
3510static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
3511 struct inode *new_dir, struct qstr *new_name)
3512{
3513 struct nfs_server *server = NFS_SERVER(old_dir);
3514 struct nfs_renameargs arg = {
3515 .old_dir = NFS_FH(old_dir),
3516 .new_dir = NFS_FH(new_dir),
3517 .old_name = old_name,
3518 .new_name = new_name,
3519 };
3520 struct nfs_renameres res = {
3521 .server = server,
3522 };
3523 struct rpc_message msg = {
3524 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
3525 .rpc_argp = &arg,
3526 .rpc_resp = &res,
3527 };
3528 int status = -ENOMEM;
3529
3530 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3531 if (!status) {
3532 update_changeattr(old_dir, &res.old_cinfo);
3533 update_changeattr(new_dir, &res.new_cinfo);
3534 }
3535 return status;
3536}
3537
3538static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
3539 struct inode *new_dir, struct qstr *new_name)
3540{
3541 struct nfs4_exception exception = { };
3542 int err;
3543 do {
3544 err = _nfs4_proc_rename(old_dir, old_name,
3545 new_dir, new_name);
3546 trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err);
3547 err = nfs4_handle_exception(NFS_SERVER(old_dir), err,
3548 &exception);
3549 } while (exception.retry);
3550 return err;
3551}
3552
3553static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 3551static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
3554{ 3552{
3555 struct nfs_server *server = NFS_SERVER(inode); 3553 struct nfs_server *server = NFS_SERVER(inode);
@@ -4884,6 +4882,20 @@ nfs4_init_uniform_client_string(const struct nfs_client *clp,
4884 nodename); 4882 nodename);
4885} 4883}
4886 4884
4885/*
4886 * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback
4887 * services. Advertise one based on the address family of the
4888 * clientaddr.
4889 */
4890static unsigned int
4891nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
4892{
4893 if (strchr(clp->cl_ipaddr, ':') != NULL)
4894 return scnprintf(buf, len, "tcp6");
4895 else
4896 return scnprintf(buf, len, "tcp");
4897}
4898
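As a worked example of what this helper and the sc_uaddr line below produce: an IPv4 client at 192.0.2.1 with callback port 1023 advertises netid "tcp" and universal address "192.0.2.1.3.255", where the last two dot-separated fields are the high and low bytes of the port. A standalone userspace sketch of the same string building (the variable names are stand-ins, not kernel code):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *ipaddr = "192.0.2.1";	/* stand-in for clp->cl_ipaddr */
	unsigned int port = 1023;		/* callback port */
	char netid[8], uaddr[64];

	/* IPv6 addresses contain ':', so they select "tcp6". */
	snprintf(netid, sizeof(netid), "%s",
		 strchr(ipaddr, ':') ? "tcp6" : "tcp");
	snprintf(uaddr, sizeof(uaddr), "%s.%u.%u",
		 ipaddr, port >> 8, port & 255);
	printf("netid=%s uaddr=%s\n", netid, uaddr);	/* tcp 192.0.2.1.3.255 */
	return 0;
}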
4887/** 4899/**
4888 * nfs4_proc_setclientid - Negotiate client ID 4900 * nfs4_proc_setclientid - Negotiate client ID
4889 * @clp: state data structure 4901 * @clp: state data structure
@@ -4925,12 +4937,10 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4925 setclientid.sc_name, 4937 setclientid.sc_name,
4926 sizeof(setclientid.sc_name)); 4938 sizeof(setclientid.sc_name));
4927 /* cb_client4 */ 4939 /* cb_client4 */
4928 rcu_read_lock(); 4940 setclientid.sc_netid_len =
4929 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, 4941 nfs4_init_callback_netid(clp,
4930 sizeof(setclientid.sc_netid), "%s", 4942 setclientid.sc_netid,
4931 rpc_peeraddr2str(clp->cl_rpcclient, 4943 sizeof(setclientid.sc_netid));
4932 RPC_DISPLAY_NETID));
4933 rcu_read_unlock();
4934 setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, 4944 setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
4935 sizeof(setclientid.sc_uaddr), "%s.%u.%u", 4945 sizeof(setclientid.sc_uaddr), "%s.%u.%u",
4936 clp->cl_ipaddr, port >> 8, port & 255); 4946 clp->cl_ipaddr, port >> 8, port & 255);
@@ -8408,7 +8418,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
8408 .unlink_setup = nfs4_proc_unlink_setup, 8418 .unlink_setup = nfs4_proc_unlink_setup,
8409 .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare, 8419 .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
8410 .unlink_done = nfs4_proc_unlink_done, 8420 .unlink_done = nfs4_proc_unlink_done,
8411 .rename = nfs4_proc_rename,
8412 .rename_setup = nfs4_proc_rename_setup, 8421 .rename_setup = nfs4_proc_rename_setup,
8413 .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare, 8422 .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
8414 .rename_done = nfs4_proc_rename_done, 8423 .rename_done = nfs4_proc_rename_done,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0deb32105ccf..2349518eef2c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1316,7 +1316,7 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st
1316 return 1; 1316 return 1;
1317} 1317}
1318 1318
1319static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) 1319int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1320{ 1320{
1321 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); 1321 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
1322 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1322 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -2075,8 +2075,10 @@ again:
2075 switch (status) { 2075 switch (status) {
2076 case 0: 2076 case 0:
2077 break; 2077 break;
2078 case -NFS4ERR_DELAY:
2079 case -ETIMEDOUT: 2078 case -ETIMEDOUT:
2079 if (clnt->cl_softrtry)
2080 break;
2081 case -NFS4ERR_DELAY:
2080 case -EAGAIN: 2082 case -EAGAIN:
2081 ssleep(1); 2083 ssleep(1);
2082 case -NFS4ERR_STALE_CLIENTID: 2084 case -NFS4ERR_STALE_CLIENTID:
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 808f29574412..6f340f02f2ba 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -90,7 +90,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
90 */ 90 */
91static void nfs4_evict_inode(struct inode *inode) 91static void nfs4_evict_inode(struct inode *inode)
92{ 92{
93 truncate_inode_pages(&inode->i_data, 0); 93 truncate_inode_pages_final(&inode->i_data);
94 clear_inode(inode); 94 clear_inode(inode);
95 pnfs_return_layout(inode); 95 pnfs_return_layout(inode);
96 pnfs_destroy_layout(NFS_I(inode)); 96 pnfs_destroy_layout(NFS_I(inode));
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 72f3bf1754ef..73ce8d4fe2c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -203,8 +203,7 @@ static int nfs4_stat_to_errno(int);
203 2 + encode_verifier_maxsz + 5 + \ 203 2 + encode_verifier_maxsz + 5 + \
204 nfs4_label_maxsz) 204 nfs4_label_maxsz)
205#define decode_readdir_maxsz (op_decode_hdr_maxsz + \ 205#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
206 decode_verifier_maxsz + \ 206 decode_verifier_maxsz)
207 nfs4_label_maxsz + nfs4_fattr_maxsz)
208#define encode_readlink_maxsz (op_encode_hdr_maxsz) 207#define encode_readlink_maxsz (op_encode_hdr_maxsz)
209#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) 208#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
210#define encode_write_maxsz (op_encode_hdr_maxsz + \ 209#define encode_write_maxsz (op_encode_hdr_maxsz + \
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4755858e37a0..cb53d450ae32 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -662,7 +662,18 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
662 */ 662 */
663static bool pnfs_seqid_is_newer(u32 s1, u32 s2) 663static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
664{ 664{
665 return (s32)s1 - (s32)s2 > 0; 665 return (s32)(s1 - s2) > 0;
666}
667
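The one-character change to pnfs_seqid_is_newer() is a serial-number-arithmetic fix: subtracting in u32 first and casting the result keeps the comparison well defined across sequence wraparound, while casting each operand to s32 before subtracting can overflow the signed subtraction. A standalone sketch of the fixed comparison (illustrative only):

#include <assert.h>
#include <stdint.h>

/* Is s1 newer than s2, modulo 2^32? Mirrors the fixed expression. */
static int seqid_is_newer(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
	assert(seqid_is_newer(5, 3));			/* plain case */
	assert(seqid_is_newer(1, 0xffffffffU));		/* across wraparound */
	/*
	 * The old form, (s32)s1 - (s32)s2 > 0, hits signed-integer
	 * overflow for pairs such as s1 = 0x7fffffff, s2 = 0xffffffff;
	 * the unsigned subtraction above never does.
	 */
	return 0;
}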
668static void
669pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
670 const nfs4_stateid *new,
671 struct list_head *free_me_list)
672{
673 if (nfs4_stateid_match_other(&lo->plh_stateid, new))
674 return;
675 /* Layout is new! Kill existing layout segments */
676 pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
666} 677}
667 678
668/* update lo->plh_stateid with new if is more recent */ 679/* update lo->plh_stateid with new if is more recent */
@@ -1315,6 +1326,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1315 struct nfs4_layoutget_res *res = &lgp->res; 1326 struct nfs4_layoutget_res *res = &lgp->res;
1316 struct pnfs_layout_segment *lseg; 1327 struct pnfs_layout_segment *lseg;
1317 struct inode *ino = lo->plh_inode; 1328 struct inode *ino = lo->plh_inode;
1329 LIST_HEAD(free_me);
1318 int status = 0; 1330 int status = 0;
1319 1331
1320 /* Inject layout blob into I/O device driver */ 1332 /* Inject layout blob into I/O device driver */
@@ -1341,6 +1353,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1341 goto out_forget_reply; 1353 goto out_forget_reply;
1342 } 1354 }
1343 1355
1356 /* Check that the new stateid matches the old stateid */
1357 pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
1344 /* Done processing layoutget. Set the layout stateid */ 1358 /* Done processing layoutget. Set the layout stateid */
1345 pnfs_set_layout_stateid(lo, &res->stateid, false); 1359 pnfs_set_layout_stateid(lo, &res->stateid, false);
1346 1360
@@ -1355,6 +1369,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1355 } 1369 }
1356 1370
1357 spin_unlock(&ino->i_lock); 1371 spin_unlock(&ino->i_lock);
1372 pnfs_free_lseg_list(&free_me);
1358 return lseg; 1373 return lseg;
1359out: 1374out:
1360 return ERR_PTR(status); 1375 return ERR_PTR(status);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fddbba2d9eff..e55ce9e8b034 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -357,30 +357,6 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
357} 357}
358 358
359static int 359static int
360nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
361 struct inode *new_dir, struct qstr *new_name)
362{
363 struct nfs_renameargs arg = {
364 .old_dir = NFS_FH(old_dir),
365 .old_name = old_name,
366 .new_dir = NFS_FH(new_dir),
367 .new_name = new_name,
368 };
369 struct rpc_message msg = {
370 .rpc_proc = &nfs_procedures[NFSPROC_RENAME],
371 .rpc_argp = &arg,
372 };
373 int status;
374
375 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
376 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
377 nfs_mark_for_revalidate(old_dir);
378 nfs_mark_for_revalidate(new_dir);
379 dprintk("NFS reply rename: %d\n", status);
380 return status;
381}
382
383static int
384nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 360nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
385{ 361{
386 struct nfs_linkargs arg = { 362 struct nfs_linkargs arg = {
@@ -745,7 +721,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
745 .unlink_setup = nfs_proc_unlink_setup, 721 .unlink_setup = nfs_proc_unlink_setup,
746 .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare, 722 .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
747 .unlink_done = nfs_proc_unlink_done, 723 .unlink_done = nfs_proc_unlink_done,
748 .rename = nfs_proc_rename,
749 .rename_setup = nfs_proc_rename_setup, 724 .rename_setup = nfs_proc_rename_setup,
750 .rename_rpc_prepare = nfs_proc_rename_rpc_prepare, 725 .rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
751 .rename_done = nfs_proc_rename_done, 726 .rename_done = nfs_proc_rename_done,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 910ed906eb82..2cb56943e232 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2215,6 +2215,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; 2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version; 2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version;
2217 2217
2218 sync_filesystem(sb);
2219
2218 /* 2220 /*
2219 * Userspace mount programs that send binary options generally send 2221 * Userspace mount programs that send binary options generally send
2220 * them populated with default values. We have no way to know which 2222 * them populated with default values. We have no way to know which
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 11d78944de79..de54129336c6 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -14,6 +14,7 @@
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/wait.h> 15#include <linux/wait.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/fsnotify.h>
17 18
18#include "internal.h" 19#include "internal.h"
19#include "nfs4_fs.h" 20#include "nfs4_fs.h"
@@ -353,8 +354,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
353 return; 354 return;
354 } 355 }
355 356
356 if (task->tk_status != 0) 357 if (data->complete)
357 nfs_cancel_async_unlink(old_dentry); 358 data->complete(task, data);
358} 359}
359 360
360/** 361/**
@@ -399,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = {
399 * 400 *
400 * It's expected that valid references to the dentries and inodes are held 401 * It's expected that valid references to the dentries and inodes are held
401 */ 402 */
402static struct rpc_task * 403struct rpc_task *
403nfs_async_rename(struct inode *old_dir, struct inode *new_dir, 404nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
404 struct dentry *old_dentry, struct dentry *new_dentry) 405 struct dentry *old_dentry, struct dentry *new_dentry,
406 void (*complete)(struct rpc_task *, struct nfs_renamedata *))
405{ 407{
406 struct nfs_renamedata *data; 408 struct nfs_renamedata *data;
407 struct rpc_message msg = { }; 409 struct rpc_message msg = { };
@@ -438,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
438 data->new_dentry = dget(new_dentry); 440 data->new_dentry = dget(new_dentry);
439 nfs_fattr_init(&data->old_fattr); 441 nfs_fattr_init(&data->old_fattr);
440 nfs_fattr_init(&data->new_fattr); 442 nfs_fattr_init(&data->new_fattr);
443 data->complete = complete;
441 444
442 /* set up nfs_renameargs */ 445 /* set up nfs_renameargs */
443 data->args.old_dir = NFS_FH(old_dir); 446 data->args.old_dir = NFS_FH(old_dir);
@@ -456,6 +459,27 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
456 return rpc_run_task(&task_setup_data); 459 return rpc_run_task(&task_setup_data);
457} 460}
458 461
462/*
 463 * Perform the tasks needed when a sillyrename is done, such as cancelling
 464 * the queued async unlink if it failed.
465 */
466static void
467nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data)
468{
469 struct dentry *dentry = data->old_dentry;
470
471 if (task->tk_status != 0) {
472 nfs_cancel_async_unlink(dentry);
473 return;
474 }
475
476 /*
477 * vfs_unlink and the like do not issue this when a file is
478 * sillyrenamed, so do it here.
479 */
480 fsnotify_nameremove(dentry, 0);
481}
482
459#define SILLYNAME_PREFIX ".nfs" 483#define SILLYNAME_PREFIX ".nfs"
460#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) 484#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
461#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) 485#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1)
@@ -548,7 +572,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
548 } 572 }
549 573
550 /* run the rename task, undo unlink if it fails */ 574 /* run the rename task, undo unlink if it fails */
551 task = nfs_async_rename(dir, dir, dentry, sdentry); 575 task = nfs_async_rename(dir, dir, dentry, sdentry,
576 nfs_complete_sillyrename);
552 if (IS_ERR(task)) { 577 if (IS_ERR(task)) {
553 error = -EBUSY; 578 error = -EBUSY;
554 nfs_cancel_async_unlink(dentry); 579 nfs_cancel_async_unlink(dentry);
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 06cddd572264..2645be435e75 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -71,10 +71,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
71 if (gid_eq(new->fsgid, INVALID_GID)) 71 if (gid_eq(new->fsgid, INVALID_GID))
72 new->fsgid = exp->ex_anon_gid; 72 new->fsgid = exp->ex_anon_gid;
73 73
74 ret = set_groups(new, gi); 74 set_groups(new, gi);
75 put_group_info(gi); 75 put_group_info(gi);
76 if (ret < 0)
77 goto error;
78 76
79 if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) 77 if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))
80 new->cap_effective = cap_drop_nfsd_set(new->cap_effective); 78 new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
@@ -89,7 +87,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
89 87
90oom: 88oom:
91 ret = -ENOMEM; 89 ret = -ENOMEM;
92error:
93 abort_creds(new); 90 abort_creds(new);
94 return ret; 91 return ret;
95} 92}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6d7be3f80356..915808b36df7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1694,7 +1694,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1694 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) 1694 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
1695 goto out_dput_new; 1695 goto out_dput_new;
1696 1696
1697 host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); 1697 host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
1698 if (!host_err) { 1698 if (!host_err) {
1699 host_err = commit_metadata(tfhp); 1699 host_err = commit_metadata(tfhp);
1700 if (!host_err) 1700 if (!host_err)
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index deaa3d33a0aa..0d58075f34e2 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -942,6 +942,18 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
942 struct inode *cpfile; 942 struct inode *cpfile;
943 int err; 943 int err;
944 944
945 if (cpsize > sb->s_blocksize) {
946 printk(KERN_ERR
947 "NILFS: too large checkpoint size: %zu bytes.\n",
948 cpsize);
949 return -EINVAL;
950 } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
951 printk(KERN_ERR
952 "NILFS: too small checkpoint size: %zu bytes.\n",
953 cpsize);
954 return -EINVAL;
955 }
956
945 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); 957 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
946 if (unlikely(!cpfile)) 958 if (unlikely(!cpfile))
947 return -ENOMEM; 959 return -ENOMEM;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index fa0f80308c2d..0d5fada91191 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -484,6 +484,18 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
484 struct nilfs_dat_info *di; 484 struct nilfs_dat_info *di;
485 int err; 485 int err;
486 486
487 if (entry_size > sb->s_blocksize) {
488 printk(KERN_ERR
489 "NILFS: too large DAT entry size: %zu bytes.\n",
490 entry_size);
491 return -EINVAL;
492 } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
493 printk(KERN_ERR
494 "NILFS: too small DAT entry size: %zu bytes.\n",
495 entry_size);
496 return -EINVAL;
497 }
498
487 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); 499 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
488 if (unlikely(!dat)) 500 if (unlikely(!dat))
489 return -ENOMEM; 501 return -ENOMEM;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 08fdb77852ac..f3a82fbcae02 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
134 134
135static const struct vm_operations_struct nilfs_file_vm_ops = { 135static const struct vm_operations_struct nilfs_file_vm_ops = {
136 .fault = filemap_fault, 136 .fault = filemap_fault,
137 .map_pages = filemap_map_pages,
137 .page_mkwrite = nilfs_page_mkwrite, 138 .page_mkwrite = nilfs_page_mkwrite,
138 .remap_pages = generic_file_remap_pages, 139 .remap_pages = generic_file_remap_pages,
139}; 140};
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7e350c562e0e..b9c5726120e3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -783,16 +783,14 @@ void nilfs_evict_inode(struct inode *inode)
783 int ret; 783 int ret;
784 784
785 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 785 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
786 if (inode->i_data.nrpages) 786 truncate_inode_pages_final(&inode->i_data);
787 truncate_inode_pages(&inode->i_data, 0);
788 clear_inode(inode); 787 clear_inode(inode);
789 nilfs_clear_inode(inode); 788 nilfs_clear_inode(inode);
790 return; 789 return;
791 } 790 }
792 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 791 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
793 792
794 if (inode->i_data.nrpages) 793 truncate_inode_pages_final(&inode->i_data);
795 truncate_inode_pages(&inode->i_data, 0);
796 794
797 /* TODO: some of the following operations may fail. */ 795 /* TODO: some of the following operations may fail. */
798 nilfs_truncate_bmap(ii, 0); 796 nilfs_truncate_bmap(ii, 0);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 2b34021948e4..422fb54b7377 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1072,6 +1072,48 @@ out:
1072} 1072}
1073 1073
1074/** 1074/**
 1075 * nilfs_ioctl_trim_fs() - trim ioctl handler function
 1076 * @inode: inode object
 1077 * @argp: pointer to argument from userspace
 1078 *
 1079 * Description: nilfs_ioctl_trim_fs is the FITRIM ioctl handler. It checks
 1080 * the arguments from userspace and calls nilfs_sufile_trim_fs, which
 1081 * performs the actual trim operation.
 1082 *
 1083 * Return Value: On success, 0 is returned; otherwise, a negative error code is returned.
1084 */
1085static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
1086{
1087 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
1088 struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
1089 struct fstrim_range range;
1090 int ret;
1091
1092 if (!capable(CAP_SYS_ADMIN))
1093 return -EPERM;
1094
1095 if (!blk_queue_discard(q))
1096 return -EOPNOTSUPP;
1097
1098 if (copy_from_user(&range, argp, sizeof(range)))
1099 return -EFAULT;
1100
1101 range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity);
1102
1103 down_read(&nilfs->ns_segctor_sem);
1104 ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
1105 up_read(&nilfs->ns_segctor_sem);
1106
1107 if (ret < 0)
1108 return ret;
1109
1110 if (copy_to_user(argp, &range, sizeof(range)))
1111 return -EFAULT;
1112
1113 return 0;
1114}
1115
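For context, this handler is reached from userspace through a plain ioctl(FITRIM) on any open descriptor inside the filesystem, which is what fstrim(8) issues. A minimal caller using only the long-standing <linux/fs.h> uapi (a sketch, not part of the patch):

#include <fcntl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fstrim_range range = {
		.start = 0,
		.len = UINT64_MAX,	/* whole filesystem */
		.minlen = 0,		/* raised to discard_granularity by the kernel */
	};
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	/* On return, range.len is updated to the number of bytes discarded. */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}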
1116/**
1075 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated 1117 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
1076 * @inode: inode object 1118 * @inode: inode object
1077 * @argp: pointer on argument from userspace 1119 * @argp: pointer on argument from userspace
@@ -1163,6 +1205,95 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
1163 return ret; 1205 return ret;
1164} 1206}
1165 1207
1208/**
1209 * nilfs_ioctl_set_suinfo - set segment usage info
1210 * @inode: inode object
1211 * @filp: file object
1212 * @cmd: ioctl's request code
 1213 * @argp: pointer to argument from userspace
1214 *
1215 * Description: Expects an array of nilfs_suinfo_update structures
1216 * encapsulated in nilfs_argv and updates the segment usage info
1217 * according to the flags in nilfs_suinfo_update.
1218 *
1219 * Return Value: On success, 0 is returned. On error, one of the
1220 * following negative error codes is returned.
1221 *
1222 * %-EPERM - Not enough permissions
1223 *
1224 * %-EFAULT - Error copying input data
1225 *
1226 * %-EIO - I/O error.
1227 *
1228 * %-ENOMEM - Insufficient amount of memory available.
1229 *
1230 * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
1231 */
1232static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp,
1233 unsigned int cmd, void __user *argp)
1234{
1235 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
1236 struct nilfs_transaction_info ti;
1237 struct nilfs_argv argv;
1238 size_t len;
1239 void __user *base;
1240 void *kbuf;
1241 int ret;
1242
1243 if (!capable(CAP_SYS_ADMIN))
1244 return -EPERM;
1245
1246 ret = mnt_want_write_file(filp);
1247 if (ret)
1248 return ret;
1249
1250 ret = -EFAULT;
1251 if (copy_from_user(&argv, argp, sizeof(argv)))
1252 goto out;
1253
1254 ret = -EINVAL;
1255 if (argv.v_size < sizeof(struct nilfs_suinfo_update))
1256 goto out;
1257
1258 if (argv.v_nmembs > nilfs->ns_nsegments)
1259 goto out;
1260
1261 if (argv.v_nmembs >= UINT_MAX / argv.v_size)
1262 goto out;
1263
1264 len = argv.v_size * argv.v_nmembs;
1265 if (!len) {
1266 ret = 0;
1267 goto out;
1268 }
1269
1270 base = (void __user *)(unsigned long)argv.v_base;
1271 kbuf = vmalloc(len);
1272 if (!kbuf) {
1273 ret = -ENOMEM;
1274 goto out;
1275 }
1276
1277 if (copy_from_user(kbuf, base, len)) {
1278 ret = -EFAULT;
1279 goto out_free;
1280 }
1281
1282 nilfs_transaction_begin(inode->i_sb, &ti, 0);
1283 ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size,
1284 argv.v_nmembs);
1285 if (unlikely(ret < 0))
1286 nilfs_transaction_abort(inode->i_sb);
1287 else
1288 nilfs_transaction_commit(inode->i_sb); /* never fails */
1289
1290out_free:
1291 vfree(kbuf);
1292out:
1293 mnt_drop_write_file(filp);
1294 return ret;
1295}
1296
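The matching userspace call packs an array of nilfs_suinfo_update records into a nilfs_argv descriptor, mirroring the checks above (v_size at least sizeof(struct nilfs_suinfo_update), v_nmembs bounded by the segment count). A single-entry sketch, assuming the structure and flag names this series adds to include/linux/nilfs2_fs.h; the mount point is hypothetical:

#include <fcntl.h>
#include <linux/nilfs2_fs.h>	/* nilfs_argv, nilfs_suinfo_update (per this series) */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct nilfs_suinfo_update sup;
	struct nilfs_argv argv;
	int fd = open("/mnt/nilfs", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&sup, 0, sizeof(sup));
	sup.sup_segnum = 42;				/* segment to update */
	sup.sup_flags = 1 << NILFS_SUINFO_UPDATE_LASTMOD;
	sup.sup_sui.sui_lastmod = 0;			/* new timestamp */

	memset(&argv, 0, sizeof(argv));
	argv.v_base = (unsigned long)&sup;
	argv.v_nmembs = 1;
	argv.v_size = sizeof(sup);

	if (ioctl(fd, NILFS_IOCTL_SET_SUINFO, &argv) < 0)
		perror("NILFS_IOCTL_SET_SUINFO");
	close(fd);
	return 0;
}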
1166long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 1297long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1167{ 1298{
1168 struct inode *inode = file_inode(filp); 1299 struct inode *inode = file_inode(filp);
@@ -1189,6 +1320,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1189 return nilfs_ioctl_get_info(inode, filp, cmd, argp, 1320 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
1190 sizeof(struct nilfs_suinfo), 1321 sizeof(struct nilfs_suinfo),
1191 nilfs_ioctl_do_get_suinfo); 1322 nilfs_ioctl_do_get_suinfo);
1323 case NILFS_IOCTL_SET_SUINFO:
1324 return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp);
1192 case NILFS_IOCTL_GET_SUSTAT: 1325 case NILFS_IOCTL_GET_SUSTAT:
1193 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); 1326 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
1194 case NILFS_IOCTL_GET_VINFO: 1327 case NILFS_IOCTL_GET_VINFO:
@@ -1205,6 +1338,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1205 return nilfs_ioctl_resize(inode, filp, argp); 1338 return nilfs_ioctl_resize(inode, filp, argp);
1206 case NILFS_IOCTL_SET_ALLOC_RANGE: 1339 case NILFS_IOCTL_SET_ALLOC_RANGE:
1207 return nilfs_ioctl_set_alloc_range(inode, argp); 1340 return nilfs_ioctl_set_alloc_range(inode, argp);
1341 case FITRIM:
1342 return nilfs_ioctl_trim_fs(inode, argp);
1208 default: 1343 default:
1209 return -ENOTTY; 1344 return -ENOTTY;
1210 } 1345 }
@@ -1228,6 +1363,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1228 case NILFS_IOCTL_GET_CPINFO: 1363 case NILFS_IOCTL_GET_CPINFO:
1229 case NILFS_IOCTL_GET_CPSTAT: 1364 case NILFS_IOCTL_GET_CPSTAT:
1230 case NILFS_IOCTL_GET_SUINFO: 1365 case NILFS_IOCTL_GET_SUINFO:
1366 case NILFS_IOCTL_SET_SUINFO:
1231 case NILFS_IOCTL_GET_SUSTAT: 1367 case NILFS_IOCTL_GET_SUSTAT:
1232 case NILFS_IOCTL_GET_VINFO: 1368 case NILFS_IOCTL_GET_VINFO:
1233 case NILFS_IOCTL_GET_BDESCS: 1369 case NILFS_IOCTL_GET_BDESCS:
@@ -1235,6 +1371,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1235 case NILFS_IOCTL_SYNC: 1371 case NILFS_IOCTL_SYNC:
1236 case NILFS_IOCTL_RESIZE: 1372 case NILFS_IOCTL_RESIZE:
1237 case NILFS_IOCTL_SET_ALLOC_RANGE: 1373 case NILFS_IOCTL_SET_ALLOC_RANGE:
1374 case FITRIM:
1238 break; 1375 break;
1239 default: 1376 default:
1240 return -ENOIOCTLCMD; 1377 return -ENOIOCTLCMD;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f438a7..2a869c35c362 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,289 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
870} 870}
871 871
872/** 872/**
873 * nilfs_sufile_set_suinfo - sets segment usage info
874 * @sufile: inode of segment usage file
875 * @buf: array of suinfo_update
876 * @supsz: byte size of suinfo_update
877 * @nsup: size of suinfo_update array
878 *
879 * Description: Takes an array of nilfs_suinfo_update structs and updates
880 * segment usage accordingly. Only the fields indicated by the sup_flags
881 * are updated.
882 *
883 * Return Value: On success, 0 is returned. On error, one of the
884 * following negative error codes is returned.
885 *
886 * %-EIO - I/O error.
887 *
888 * %-ENOMEM - Insufficient amount of memory available.
889 *
890 * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
891 */
892ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
893 unsigned supsz, size_t nsup)
894{
895 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
896 struct buffer_head *header_bh, *bh;
897 struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup;
898 struct nilfs_segment_usage *su;
899 void *kaddr;
900 unsigned long blkoff, prev_blkoff;
901 int cleansi, cleansu, dirtysi, dirtysu;
902 long ncleaned = 0, ndirtied = 0;
903 int ret = 0;
904
905 if (unlikely(nsup == 0))
906 return ret;
907
908 for (sup = buf; sup < supend; sup = (void *)sup + supsz) {
909 if (sup->sup_segnum >= nilfs->ns_nsegments
910 || (sup->sup_flags &
911 (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS))
912 || (nilfs_suinfo_update_nblocks(sup) &&
913 sup->sup_sui.sui_nblocks >
914 nilfs->ns_blocks_per_segment))
915 return -EINVAL;
916 }
917
918 down_write(&NILFS_MDT(sufile)->mi_sem);
919
920 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
921 if (ret < 0)
922 goto out_sem;
923
924 sup = buf;
925 blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
926 ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
927 if (ret < 0)
928 goto out_header;
929
930 for (;;) {
931 kaddr = kmap_atomic(bh->b_page);
932 su = nilfs_sufile_block_get_segment_usage(
933 sufile, sup->sup_segnum, bh, kaddr);
934
935 if (nilfs_suinfo_update_lastmod(sup))
936 su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod);
937
938 if (nilfs_suinfo_update_nblocks(sup))
939 su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks);
940
941 if (nilfs_suinfo_update_flags(sup)) {
942 /*
943 * Active flag is a virtual flag projected by running
 944 * nilfs kernel code - drop it so that it is not
 945 * written to disk.
946 */
947 sup->sup_sui.sui_flags &=
948 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
949
950 cleansi = nilfs_suinfo_clean(&sup->sup_sui);
951 cleansu = nilfs_segment_usage_clean(su);
952 dirtysi = nilfs_suinfo_dirty(&sup->sup_sui);
953 dirtysu = nilfs_segment_usage_dirty(su);
954
955 if (cleansi && !cleansu)
956 ++ncleaned;
957 else if (!cleansi && cleansu)
958 --ncleaned;
959
960 if (dirtysi && !dirtysu)
961 ++ndirtied;
962 else if (!dirtysi && dirtysu)
963 --ndirtied;
964
965 su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
966 }
967
968 kunmap_atomic(kaddr);
969
970 sup = (void *)sup + supsz;
971 if (sup >= supend)
972 break;
973
974 prev_blkoff = blkoff;
975 blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
976 if (blkoff == prev_blkoff)
977 continue;
978
979 /* get different block */
980 mark_buffer_dirty(bh);
981 put_bh(bh);
982 ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
983 if (unlikely(ret < 0))
984 goto out_mark;
985 }
986 mark_buffer_dirty(bh);
987 put_bh(bh);
988
989 out_mark:
990 if (ncleaned || ndirtied) {
991 nilfs_sufile_mod_counter(header_bh, (u64)ncleaned,
992 (u64)ndirtied);
993 NILFS_SUI(sufile)->ncleansegs += ncleaned;
994 }
995 nilfs_mdt_mark_dirty(sufile);
996 out_header:
997 put_bh(header_bh);
998 out_sem:
999 up_write(&NILFS_MDT(sufile)->mi_sem);
1000 return ret;
1001}
1002
1003/**
 1004 * nilfs_sufile_trim_fs() - trim ioctl handler function
 1005 * @sufile: inode of segment usage file
 1006 * @range: fstrim_range structure
 1007 *
 1008 * start: first byte to trim
 1009 * len: number of bytes to trim from start
 1010 * minlen: minimum extent length in bytes
 1011 *
 1012 * Description: nilfs_sufile_trim_fs goes through all segments containing
 1013 * bytes from start to start+len. start is rounded up to the next block
 1014 * boundary and start+len is rounded down. blkdev_issue_discard is invoked
 1015 * for each clean segment in the range.
 1016 *
 1017 * Return Value: On success, 0 is returned; otherwise, a negative error code is returned.
1018 */
1019int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
1020{
1021 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
1022 struct buffer_head *su_bh;
1023 struct nilfs_segment_usage *su;
1024 void *kaddr;
1025 size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
1026 sector_t seg_start, seg_end, start_block, end_block;
1027 sector_t start = 0, nblocks = 0;
1028 u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0;
1029 int ret = 0;
1030 unsigned int sects_per_block;
1031
1032 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
1033 bdev_logical_block_size(nilfs->ns_bdev);
1034 len = range->len >> nilfs->ns_blocksize_bits;
1035 minlen = range->minlen >> nilfs->ns_blocksize_bits;
1036 max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment);
1037
1038 if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits)
1039 return -EINVAL;
1040
1041 start_block = (range->start + nilfs->ns_blocksize - 1) >>
1042 nilfs->ns_blocksize_bits;
1043
1044 /*
1045 * range->len can be very large (actually, it is set to
1046 * ULLONG_MAX by default) - truncate upper end of the range
1047 * carefully so as not to overflow.
1048 */
1049 if (max_blocks - start_block < len)
1050 end_block = max_blocks - 1;
1051 else
1052 end_block = start_block + len - 1;
1053
1054 segnum = nilfs_get_segnum_of_block(nilfs, start_block);
1055 segnum_end = nilfs_get_segnum_of_block(nilfs, end_block);
1056
1057 down_read(&NILFS_MDT(sufile)->mi_sem);
1058
1059 while (segnum <= segnum_end) {
1060 n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
1061 segnum_end);
1062
1063 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
1064 &su_bh);
1065 if (ret < 0) {
1066 if (ret != -ENOENT)
1067 goto out_sem;
1068 /* hole */
1069 segnum += n;
1070 continue;
1071 }
1072
1073 kaddr = kmap_atomic(su_bh->b_page);
1074 su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
1075 su_bh, kaddr);
1076 for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
1077 if (!nilfs_segment_usage_clean(su))
1078 continue;
1079
1080 nilfs_get_segment_range(nilfs, segnum, &seg_start,
1081 &seg_end);
1082
1083 if (!nblocks) {
1084 /* start new extent */
1085 start = seg_start;
1086 nblocks = seg_end - seg_start + 1;
1087 continue;
1088 }
1089
1090 if (start + nblocks == seg_start) {
1091 /* add to previous extent */
1092 nblocks += seg_end - seg_start + 1;
1093 continue;
1094 }
1095
1096 /* discard previous extent */
1097 if (start < start_block) {
1098 nblocks -= start_block - start;
1099 start = start_block;
1100 }
1101
1102 if (nblocks >= minlen) {
1103 kunmap_atomic(kaddr);
1104
1105 ret = blkdev_issue_discard(nilfs->ns_bdev,
1106 start * sects_per_block,
1107 nblocks * sects_per_block,
1108 GFP_NOFS, 0);
1109 if (ret < 0) {
1110 put_bh(su_bh);
1111 goto out_sem;
1112 }
1113
1114 ndiscarded += nblocks;
1115 kaddr = kmap_atomic(su_bh->b_page);
1116 su = nilfs_sufile_block_get_segment_usage(
1117 sufile, segnum, su_bh, kaddr);
1118 }
1119
1120 /* start new extent */
1121 start = seg_start;
1122 nblocks = seg_end - seg_start + 1;
1123 }
1124 kunmap_atomic(kaddr);
1125 put_bh(su_bh);
1126 }
1127
1128
1129 if (nblocks) {
1130 /* discard last extent */
1131 if (start < start_block) {
1132 nblocks -= start_block - start;
1133 start = start_block;
1134 }
1135 if (start + nblocks > end_block + 1)
1136 nblocks = end_block - start + 1;
1137
1138 if (nblocks >= minlen) {
1139 ret = blkdev_issue_discard(nilfs->ns_bdev,
1140 start * sects_per_block,
1141 nblocks * sects_per_block,
1142 GFP_NOFS, 0);
1143 if (!ret)
1144 ndiscarded += nblocks;
1145 }
1146 }
1147
1148out_sem:
1149 up_read(&NILFS_MDT(sufile)->mi_sem);
1150
1151 range->len = ndiscarded << nilfs->ns_blocksize_bits;
1152 return ret;
1153}
1154
1155/**
873 * nilfs_sufile_read - read or get sufile inode 1156 * nilfs_sufile_read - read or get sufile inode
874 * @sb: super block instance 1157 * @sb: super block instance
875 * @susize: size of a segment usage entry 1158 * @susize: size of a segment usage entry
@@ -886,6 +1169,18 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
886 void *kaddr; 1169 void *kaddr;
887 int err; 1170 int err;
888 1171
1172 if (susize > sb->s_blocksize) {
1173 printk(KERN_ERR
1174 "NILFS: too large segment usage size: %zu bytes.\n",
1175 susize);
1176 return -EINVAL;
1177 } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
1178 printk(KERN_ERR
1179 "NILFS: too small segment usage size: %zu bytes.\n",
1180 susize);
1181 return -EINVAL;
1182 }
1183
889 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); 1184 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
890 if (unlikely(!sufile)) 1185 if (unlikely(!sufile))
891 return -ENOMEM; 1186 return -ENOMEM;
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b51fc1..b8afd72f2379 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -44,6 +44,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); 44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, 45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
46 size_t); 46 size_t);
47ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t);
47 48
48int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, 49int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
49 void (*dofunc)(struct inode *, __u64, 50 void (*dofunc)(struct inode *, __u64,
@@ -65,6 +66,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
65int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); 66int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
66int nilfs_sufile_read(struct super_block *sb, size_t susize, 67int nilfs_sufile_read(struct super_block *sb, size_t susize,
67 struct nilfs_inode *raw_inode, struct inode **inodep); 68 struct nilfs_inode *raw_inode, struct inode **inodep);
69int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
68 70
69/** 71/**
70 * nilfs_sufile_scrap - make a segment garbage 72 * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7ac2a122ca1d..8c532b2ca3ab 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1129 unsigned long old_mount_opt; 1129 unsigned long old_mount_opt;
1130 int err; 1130 int err;
1131 1131
1132 sync_filesystem(sb);
1132 old_sb_flags = sb->s_flags; 1133 old_sb_flags = sb->s_flags;
1133 old_mount_opt = nilfs->ns_mount_opt; 1134 old_mount_opt = nilfs->ns_mount_opt;
1134 1135
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 94c451ce6d24..8ba8229ba076 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -399,6 +399,16 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
399 return -EINVAL; 399 return -EINVAL;
400 400
401 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); 401 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
402 if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
403 printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n",
404 nilfs->ns_inode_size);
405 return -EINVAL;
406 } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
407 printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n",
408 nilfs->ns_inode_size);
409 return -EINVAL;
410 }
411
402 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); 412 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
403 413
404 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 414 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index dc638f786d5c..ee9cb3795c2b 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -60,8 +60,8 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
60} 60}
61 61
62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
63static int fanotify_get_response_from_access(struct fsnotify_group *group, 63static int fanotify_get_response(struct fsnotify_group *group,
64 struct fanotify_event_info *event) 64 struct fanotify_perm_event_info *event)
65{ 65{
66 int ret; 66 int ret;
67 67
@@ -142,6 +142,40 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
142 return false; 142 return false;
143} 143}
144 144
145struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
146 struct path *path)
147{
148 struct fanotify_event_info *event;
149
150#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
151 if (mask & FAN_ALL_PERM_EVENTS) {
152 struct fanotify_perm_event_info *pevent;
153
154 pevent = kmem_cache_alloc(fanotify_perm_event_cachep,
155 GFP_KERNEL);
156 if (!pevent)
157 return NULL;
158 event = &pevent->fae;
159 pevent->response = 0;
160 goto init;
161 }
162#endif
163 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
164 if (!event)
165 return NULL;
166init: __maybe_unused
167 fsnotify_init_event(&event->fse, inode, mask);
168 event->tgid = get_pid(task_tgid(current));
169 if (path) {
170 event->path = *path;
171 path_get(&event->path);
172 } else {
173 event->path.mnt = NULL;
174 event->path.dentry = NULL;
175 }
176 return event;
177}
178
145static int fanotify_handle_event(struct fsnotify_group *group, 179static int fanotify_handle_event(struct fsnotify_group *group,
146 struct inode *inode, 180 struct inode *inode,
147 struct fsnotify_mark *inode_mark, 181 struct fsnotify_mark *inode_mark,
@@ -171,25 +205,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
171 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, 205 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
172 mask); 206 mask);
173 207
174 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); 208 event = fanotify_alloc_event(inode, mask, data);
175 if (unlikely(!event)) 209 if (unlikely(!event))
176 return -ENOMEM; 210 return -ENOMEM;
177 211
178 fsn_event = &event->fse; 212 fsn_event = &event->fse;
179 fsnotify_init_event(fsn_event, inode, mask);
180 event->tgid = get_pid(task_tgid(current));
181 if (data_type == FSNOTIFY_EVENT_PATH) {
182 struct path *path = data;
183 event->path = *path;
184 path_get(&event->path);
185 } else {
186 event->path.mnt = NULL;
187 event->path.dentry = NULL;
188 }
189#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
190 event->response = 0;
191#endif
192
193 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); 213 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge);
194 if (ret) { 214 if (ret) {
195 /* Permission events shouldn't be merged */ 215 /* Permission events shouldn't be merged */
@@ -202,7 +222,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
202 222
203#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 223#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
204 if (mask & FAN_ALL_PERM_EVENTS) { 224 if (mask & FAN_ALL_PERM_EVENTS) {
205 ret = fanotify_get_response_from_access(group, event); 225 ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event));
206 fsnotify_destroy_event(group, fsn_event); 226 fsnotify_destroy_event(group, fsn_event);
207 } 227 }
208#endif 228#endif
@@ -225,6 +245,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
225 event = FANOTIFY_E(fsn_event); 245 event = FANOTIFY_E(fsn_event);
226 path_put(&event->path); 246 path_put(&event->path);
227 put_pid(event->tgid); 247 put_pid(event->tgid);
248#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
249 if (fsn_event->mask & FAN_ALL_PERM_EVENTS) {
250 kmem_cache_free(fanotify_perm_event_cachep,
251 FANOTIFY_PE(fsn_event));
252 return;
253 }
254#endif
228 kmem_cache_free(fanotify_event_cachep, event); 255 kmem_cache_free(fanotify_event_cachep, event);
229} 256}
230 257
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 32a2f034fb94..2a5fb14115df 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -3,13 +3,12 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4 4
5extern struct kmem_cache *fanotify_event_cachep; 5extern struct kmem_cache *fanotify_event_cachep;
6extern struct kmem_cache *fanotify_perm_event_cachep;
6 7
7/* 8/*
8 * Lifetime of the structure differs for normal and permission events. In both 9 * Structure for normal fanotify events. It gets allocated in
9 * cases the structure is allocated in fanotify_handle_event(). For normal 10 * fanotify_handle_event() and freed when the information is retrieved by
 10 * events the structure is freed immediately after reporting it to userspace. 11 * userspace.
11 * For permission events we free it only after we receive response from
12 * userspace.
13 */ 12 */
14struct fanotify_event_info { 13struct fanotify_event_info {
15 struct fsnotify_event fse; 14 struct fsnotify_event fse;
@@ -19,12 +18,33 @@ struct fanotify_event_info {
19 */ 18 */
20 struct path path; 19 struct path path;
21 struct pid *tgid; 20 struct pid *tgid;
21};
22
22#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 23#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
23 u32 response; /* userspace answer to question */ 24/*
24#endif 25 * Structure for permission fanotify events. It gets allocated and freed in
26 * fanotify_handle_event() since we wait there for user response. When the
27 * information is retrieved by userspace the structure is moved from
28 * group->notification_list to group->fanotify_data.access_list to wait for
29 * user response.
30 */
31struct fanotify_perm_event_info {
32 struct fanotify_event_info fae;
33 int response; /* userspace answer to question */
34 int fd; /* fd we passed to userspace for this event */
25}; 35};
26 36
37static inline struct fanotify_perm_event_info *
38FANOTIFY_PE(struct fsnotify_event *fse)
39{
40 return container_of(fse, struct fanotify_perm_event_info, fae.fse);
41}
42#endif
43
27static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) 44static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
28{ 45{
29 return container_of(fse, struct fanotify_event_info, fse); 46 return container_of(fse, struct fanotify_event_info, fse);
30} 47}
48
49struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
50 struct path *path);
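The userspace half of the permission handshake these structures back is unchanged: a listener reads fanotify_event_metadata records and, for permission events, writes back a fanotify_response carrying the fd it was handed, which is the same fd the kernel now stashes in fanotify_perm_event_info. A minimal sketch using the stock <sys/fanotify.h> API (illustrative; error handling trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = fanotify_init(FAN_CLASS_CONTENT, O_RDONLY);

	if (fd < 0 || fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
				    FAN_OPEN_PERM, AT_FDCWD, "/tmp") < 0) {
		perror("fanotify");
		return 1;
	}
	for (;;) {
		ssize_t len = read(fd, buf, sizeof(buf));
		struct fanotify_event_metadata *md = (void *)buf;

		if (len <= 0)
			break;
		while (FAN_EVENT_OK(md, len)) {
			if (md->mask & FAN_OPEN_PERM) {
				/* Looked up by fd in process_access_response(). */
				struct fanotify_response resp = {
					.fd = md->fd,
					.response = FAN_ALLOW,
				};
				write(fd, &resp, sizeof(resp));
			}
			if (md->fd >= 0)
				close(md->fd);
			md = FAN_EVENT_NEXT(md, len);
		}
	}
	return 0;
}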
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 287a22c04149..4e565c814309 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -28,14 +28,8 @@
28extern const struct fsnotify_ops fanotify_fsnotify_ops; 28extern const struct fsnotify_ops fanotify_fsnotify_ops;
29 29
30static struct kmem_cache *fanotify_mark_cache __read_mostly; 30static struct kmem_cache *fanotify_mark_cache __read_mostly;
31static struct kmem_cache *fanotify_response_event_cache __read_mostly;
32struct kmem_cache *fanotify_event_cachep __read_mostly; 31struct kmem_cache *fanotify_event_cachep __read_mostly;
33 32struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
34struct fanotify_response_event {
35 struct list_head list;
36 __s32 fd;
37 struct fanotify_event_info *event;
38};
39 33
40/* 34/*
41 * Get an fsnotify notification event if one exists and is small 35 * Get an fsnotify notification event if one exists and is small
@@ -135,33 +129,34 @@ static int fill_event_metadata(struct fsnotify_group *group,
135} 129}
136 130
137#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 131#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
138static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group, 132static struct fanotify_perm_event_info *dequeue_event(
139 __s32 fd) 133 struct fsnotify_group *group, int fd)
140{ 134{
141 struct fanotify_response_event *re, *return_re = NULL; 135 struct fanotify_perm_event_info *event, *return_e = NULL;
142 136
143 mutex_lock(&group->fanotify_data.access_mutex); 137 spin_lock(&group->fanotify_data.access_lock);
144 list_for_each_entry(re, &group->fanotify_data.access_list, list) { 138 list_for_each_entry(event, &group->fanotify_data.access_list,
145 if (re->fd != fd) 139 fae.fse.list) {
140 if (event->fd != fd)
146 continue; 141 continue;
147 142
148 list_del_init(&re->list); 143 list_del_init(&event->fae.fse.list);
149 return_re = re; 144 return_e = event;
150 break; 145 break;
151 } 146 }
152 mutex_unlock(&group->fanotify_data.access_mutex); 147 spin_unlock(&group->fanotify_data.access_lock);
153 148
 154 pr_debug("%s: found return_re=%p\n", __func__, return_re); 149 pr_debug("%s: found return_e=%p\n", __func__, return_e);
155 150
156 return return_re; 151 return return_e;
157} 152}
158 153
159static int process_access_response(struct fsnotify_group *group, 154static int process_access_response(struct fsnotify_group *group,
160 struct fanotify_response *response_struct) 155 struct fanotify_response *response_struct)
161{ 156{
162 struct fanotify_response_event *re; 157 struct fanotify_perm_event_info *event;
163 __s32 fd = response_struct->fd; 158 int fd = response_struct->fd;
164 __u32 response = response_struct->response; 159 int response = response_struct->response;
165 160
166 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, 161 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
167 fd, response); 162 fd, response);
@@ -181,58 +176,15 @@ static int process_access_response(struct fsnotify_group *group,
181 if (fd < 0) 176 if (fd < 0)
182 return -EINVAL; 177 return -EINVAL;
183 178
184 re = dequeue_re(group, fd); 179 event = dequeue_event(group, fd);
185 if (!re) 180 if (!event)
186 return -ENOENT; 181 return -ENOENT;
187 182
188 re->event->response = response; 183 event->response = response;
189
190 wake_up(&group->fanotify_data.access_waitq); 184 wake_up(&group->fanotify_data.access_waitq);
191 185
192 kmem_cache_free(fanotify_response_event_cache, re);
193
194 return 0;
195}
196
197static int prepare_for_access_response(struct fsnotify_group *group,
198 struct fsnotify_event *event,
199 __s32 fd)
200{
201 struct fanotify_response_event *re;
202
203 if (!(event->mask & FAN_ALL_PERM_EVENTS))
204 return 0;
205
206 re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
207 if (!re)
208 return -ENOMEM;
209
210 re->event = FANOTIFY_E(event);
211 re->fd = fd;
212
213 mutex_lock(&group->fanotify_data.access_mutex);
214
215 if (atomic_read(&group->fanotify_data.bypass_perm)) {
216 mutex_unlock(&group->fanotify_data.access_mutex);
217 kmem_cache_free(fanotify_response_event_cache, re);
218 FANOTIFY_E(event)->response = FAN_ALLOW;
219 return 0;
220 }
221
222 list_add_tail(&re->list, &group->fanotify_data.access_list);
223 mutex_unlock(&group->fanotify_data.access_mutex);
224
225 return 0;
226}
227
228#else
229static int prepare_for_access_response(struct fsnotify_group *group,
230 struct fsnotify_event *event,
231 __s32 fd)
232{
233 return 0; 186 return 0;
234} 187}
235
236#endif 188#endif
237 189
238static ssize_t copy_event_to_user(struct fsnotify_group *group, 190static ssize_t copy_event_to_user(struct fsnotify_group *group,
@@ -247,7 +199,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
247 199
248 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); 200 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
249 if (ret < 0) 201 if (ret < 0)
250 goto out; 202 return ret;
251 203
252 fd = fanotify_event_metadata.fd; 204 fd = fanotify_event_metadata.fd;
253 ret = -EFAULT; 205 ret = -EFAULT;
@@ -255,9 +207,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
255 fanotify_event_metadata.event_len)) 207 fanotify_event_metadata.event_len))
256 goto out_close_fd; 208 goto out_close_fd;
257 209
258 ret = prepare_for_access_response(group, event, fd); 210#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
259 if (ret) 211 if (event->mask & FAN_ALL_PERM_EVENTS)
260 goto out_close_fd; 212 FANOTIFY_PE(event)->fd = fd;
213#endif
261 214
262 if (fd != FAN_NOFD) 215 if (fd != FAN_NOFD)
263 fd_install(fd, f); 216 fd_install(fd, f);
@@ -268,13 +221,6 @@ out_close_fd:
268 put_unused_fd(fd); 221 put_unused_fd(fd);
269 fput(f); 222 fput(f);
270 } 223 }
271out:
272#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
273 if (event->mask & FAN_ALL_PERM_EVENTS) {
274 FANOTIFY_E(event)->response = FAN_DENY;
275 wake_up(&group->fanotify_data.access_waitq);
276 }
277#endif
278 return ret; 224 return ret;
279} 225}
280 226
@@ -314,35 +260,50 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
314 kevent = get_one_event(group, count); 260 kevent = get_one_event(group, count);
315 mutex_unlock(&group->notification_mutex); 261 mutex_unlock(&group->notification_mutex);
316 262
317 if (kevent) { 263 if (IS_ERR(kevent)) {
318 ret = PTR_ERR(kevent); 264 ret = PTR_ERR(kevent);
319 if (IS_ERR(kevent)) 265 break;
266 }
267
268 if (!kevent) {
269 ret = -EAGAIN;
270 if (file->f_flags & O_NONBLOCK)
320 break; 271 break;
321 ret = copy_event_to_user(group, kevent, buf); 272
322 /* 273 ret = -ERESTARTSYS;
323 * Permission events get destroyed after we 274 if (signal_pending(current))
324 * receive response 275 break;
325 */ 276
326 if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) 277 if (start != buf)
327 fsnotify_destroy_event(group, kevent);
328 if (ret < 0)
329 break; 278 break;
330 buf += ret; 279 schedule();
331 count -= ret;
332 continue; 280 continue;
333 } 281 }
334 282
335 ret = -EAGAIN; 283 ret = copy_event_to_user(group, kevent, buf);
336 if (file->f_flags & O_NONBLOCK) 284 /*
337 break; 285 * Permission events get queued to wait for response. Other
338 ret = -ERESTARTSYS; 286 * events can be destroyed now.
339 if (signal_pending(current)) 287 */
340 break; 288 if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) {
341 289 fsnotify_destroy_event(group, kevent);
342 if (start != buf) 290 if (ret < 0)
343 break; 291 break;
344 292 } else {
345 schedule(); 293#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
294 if (ret < 0) {
295 FANOTIFY_PE(kevent)->response = FAN_DENY;
296 wake_up(&group->fanotify_data.access_waitq);
297 break;
298 }
299 spin_lock(&group->fanotify_data.access_lock);
300 list_add_tail(&kevent->list,
301 &group->fanotify_data.access_list);
302 spin_unlock(&group->fanotify_data.access_lock);
303#endif
304 }
305 buf += ret;
306 count -= ret;
346 } 307 }
347 308
348 finish_wait(&group->notification_waitq, &wait); 309 finish_wait(&group->notification_waitq, &wait);
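
For context, the userspace half of this handshake is unchanged: a listener reads events and, for permission events, writes back a struct fanotify_response whose fd tells process_access_response() which queued event to dequeue. A sketch using only the documented fanotify API:

#include <string.h>
#include <unistd.h>
#include <sys/fanotify.h>

static void handle_events(int fanotify_fd)
{
        char buf[4096];
        ssize_t len = read(fanotify_fd, buf, sizeof(buf));
        struct fanotify_event_metadata *md =
                (struct fanotify_event_metadata *)buf;

        if (len <= 0)
                return;
        while (FAN_EVENT_OK(md, len)) {
                if (md->mask & FAN_OPEN_PERM) {
                        struct fanotify_response resp;

                        memset(&resp, 0, sizeof(resp));
                        resp.fd = md->fd;
                        resp.response = FAN_ALLOW;      /* or FAN_DENY */
                        write(fanotify_fd, &resp, sizeof(resp));
                }
                if (md->fd >= 0)
                        close(md->fd);
                md = FAN_EVENT_NEXT(md, len);
        }
}
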
@@ -383,22 +344,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
383 struct fsnotify_group *group = file->private_data; 344 struct fsnotify_group *group = file->private_data;
384 345
385#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 346#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
386 struct fanotify_response_event *re, *lre; 347 struct fanotify_perm_event_info *event, *next;
387 348
388 mutex_lock(&group->fanotify_data.access_mutex); 349 spin_lock(&group->fanotify_data.access_lock);
389 350
390 atomic_inc(&group->fanotify_data.bypass_perm); 351 atomic_inc(&group->fanotify_data.bypass_perm);
391 352
392 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { 353 list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
393 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, 354 fae.fse.list) {
394 re, re->event); 355 pr_debug("%s: found group=%p event=%p\n", __func__, group,
356 event);
395 357
396 list_del_init(&re->list); 358 list_del_init(&event->fae.fse.list);
397 re->event->response = FAN_ALLOW; 359 event->response = FAN_ALLOW;
398
399 kmem_cache_free(fanotify_response_event_cache, re);
400 } 360 }
401 mutex_unlock(&group->fanotify_data.access_mutex); 361 spin_unlock(&group->fanotify_data.access_lock);
402 362
403 wake_up(&group->fanotify_data.access_waitq); 363 wake_up(&group->fanotify_data.access_waitq);
404#endif 364#endif
@@ -731,21 +691,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
731 group->fanotify_data.user = user; 691 group->fanotify_data.user = user;
732 atomic_inc(&user->fanotify_listeners); 692 atomic_inc(&user->fanotify_listeners);
733 693
734 oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); 694 oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL);
735 if (unlikely(!oevent)) { 695 if (unlikely(!oevent)) {
736 fd = -ENOMEM; 696 fd = -ENOMEM;
737 goto out_destroy_group; 697 goto out_destroy_group;
738 } 698 }
739 group->overflow_event = &oevent->fse; 699 group->overflow_event = &oevent->fse;
740 fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
741 oevent->tgid = get_pid(task_tgid(current));
742 oevent->path.mnt = NULL;
743 oevent->path.dentry = NULL;
744 700
745 group->fanotify_data.f_flags = event_f_flags; 701 group->fanotify_data.f_flags = event_f_flags;
746#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 702#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
747 oevent->response = 0; 703 spin_lock_init(&group->fanotify_data.access_lock);
748 mutex_init(&group->fanotify_data.access_mutex);
749 init_waitqueue_head(&group->fanotify_data.access_waitq); 704 init_waitqueue_head(&group->fanotify_data.access_waitq);
750 INIT_LIST_HEAD(&group->fanotify_data.access_list); 705 INIT_LIST_HEAD(&group->fanotify_data.access_list);
751 atomic_set(&group->fanotify_data.bypass_perm, 0); 706 atomic_set(&group->fanotify_data.bypass_perm, 0);
@@ -920,9 +875,11 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
920static int __init fanotify_user_setup(void) 875static int __init fanotify_user_setup(void)
921{ 876{
922 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); 877 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
923 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
924 SLAB_PANIC);
925 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); 878 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
879#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
880 fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info,
881 SLAB_PANIC);
882#endif
926 883
927 return 0; 884 return 0;
928} 885}
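
The setup above relies on the KMEM_CACHE() helper, which derives the cache name and object size from the struct type; SLAB_PANIC makes boot fail loudly if creation fails, which is why no NULL check follows. A small sketch with a made-up struct:

#include <linux/slab.h>

struct foo_event {                      /* illustrative, not from the patch */
        int fd;
        int response;
};

static struct kmem_cache *foo_cachep;

static int __init foo_setup(void)
{
        foo_cachep = KMEM_CACHE(foo_event, SLAB_PANIC);
        return 0;
}

static void foo_use(void)
{
        struct foo_event *ev = kmem_cache_alloc(foo_cachep, GFP_KERNEL);

        if (!ev)
                return;
        ev->fd = -1;                    /* ... use the object ... */
        kmem_cache_free(foo_cachep, ev);
}
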
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index 807150e2c2b9..dd6103cc93c1 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -18,16 +18,9 @@
18 * distribution in the file COPYING); if not, write to the Free Software 18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22#include "debug.h" 22#include "debug.h"
23 23
24/*
25 * A static buffer to hold the error string being displayed and a spinlock
26 * to protect concurrent accesses to it.
27 */
28static char err_buf[1024];
29static DEFINE_SPINLOCK(err_buf_lock);
30
31/** 24/**
32 * __ntfs_warning - output a warning to the syslog 25 * __ntfs_warning - output a warning to the syslog
33 * @function: name of function outputting the warning 26 * @function: name of function outputting the warning
@@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock);
50void __ntfs_warning(const char *function, const struct super_block *sb, 43void __ntfs_warning(const char *function, const struct super_block *sb,
51 const char *fmt, ...) 44 const char *fmt, ...)
52{ 45{
46 struct va_format vaf;
53 va_list args; 47 va_list args;
54 int flen = 0; 48 int flen = 0;
55 49
@@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
59#endif 53#endif
60 if (function) 54 if (function)
61 flen = strlen(function); 55 flen = strlen(function);
62 spin_lock(&err_buf_lock);
63 va_start(args, fmt); 56 va_start(args, fmt);
64 vsnprintf(err_buf, sizeof(err_buf), fmt, args); 57 vaf.fmt = fmt;
65 va_end(args); 58 vaf.va = &args;
66 if (sb) 59 if (sb)
67 printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n", 60 pr_warn("(device %s): %s(): %pV\n",
68 sb->s_id, flen ? function : "", err_buf); 61 sb->s_id, flen ? function : "", &vaf);
69 else 62 else
70 printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", 63 pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
71 flen ? function : "", err_buf); 64 va_end(args);
72 spin_unlock(&err_buf_lock);
73} 65}
74 66
75/** 67/**
@@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
94void __ntfs_error(const char *function, const struct super_block *sb, 86void __ntfs_error(const char *function, const struct super_block *sb,
95 const char *fmt, ...) 87 const char *fmt, ...)
96{ 88{
89 struct va_format vaf;
97 va_list args; 90 va_list args;
98 int flen = 0; 91 int flen = 0;
99 92
@@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb,
103#endif 96#endif
104 if (function) 97 if (function)
105 flen = strlen(function); 98 flen = strlen(function);
106 spin_lock(&err_buf_lock);
107 va_start(args, fmt); 99 va_start(args, fmt);
108 vsnprintf(err_buf, sizeof(err_buf), fmt, args); 100 vaf.fmt = fmt;
109 va_end(args); 101 vaf.va = &args;
110 if (sb) 102 if (sb)
111 printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n", 103 pr_err("(device %s): %s(): %pV\n",
112 sb->s_id, flen ? function : "", err_buf); 104 sb->s_id, flen ? function : "", &vaf);
113 else 105 else
114 printk(KERN_ERR "NTFS-fs error: %s(): %s\n", 106 pr_err("%s(): %pV\n", flen ? function : "", &vaf);
115 flen ? function : "", err_buf); 107 va_end(args);
116 spin_unlock(&err_buf_lock);
117} 108}
118 109
119#ifdef DEBUG 110#ifdef DEBUG
@@ -124,6 +115,7 @@ int debug_msgs = 0;
124void __ntfs_debug (const char *file, int line, const char *function, 115void __ntfs_debug (const char *file, int line, const char *function,
125 const char *fmt, ...) 116 const char *fmt, ...)
126{ 117{
118 struct va_format vaf;
127 va_list args; 119 va_list args;
128 int flen = 0; 120 int flen = 0;
129 121
@@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function,
131 return; 123 return;
132 if (function) 124 if (function)
133 flen = strlen(function); 125 flen = strlen(function);
134 spin_lock(&err_buf_lock);
135 va_start(args, fmt); 126 va_start(args, fmt);
136 vsnprintf(err_buf, sizeof(err_buf), fmt, args); 127 vaf.fmt = fmt;
128 vaf.va = &args;
129 pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
137 va_end(args); 130 va_end(args);
138 printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line,
139 flen ? function : "", err_buf);
140 spin_unlock(&err_buf_lock);
141} 131}
142 132
143/* Dump a runlist. Caller has to provide synchronisation for @rl. */ 133/* Dump a runlist. Caller has to provide synchronisation for @rl. */
@@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
149 139
150 if (!debug_msgs) 140 if (!debug_msgs)
151 return; 141 return;
152 printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n"); 142 pr_debug("Dumping runlist (values in hex):\n");
153 if (!rl) { 143 if (!rl) {
154 printk(KERN_DEBUG "Run list not present.\n"); 144 pr_debug("Run list not present.\n");
155 return; 145 return;
156 } 146 }
157 printk(KERN_DEBUG "VCN LCN Run length\n"); 147 pr_debug("VCN LCN Run length\n");
158 for (i = 0; ; i++) { 148 for (i = 0; ; i++) {
159 LCN lcn = (rl + i)->lcn; 149 LCN lcn = (rl + i)->lcn;
160 150
@@ -163,13 +153,13 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
163 153
164 if (index > -LCN_ENOENT - 1) 154 if (index > -LCN_ENOENT - 1)
165 index = 3; 155 index = 3;
166 printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", 156 pr_debug("%-16Lx %s %-16Lx%s\n",
167 (long long)(rl + i)->vcn, lcn_str[index], 157 (long long)(rl + i)->vcn, lcn_str[index],
168 (long long)(rl + i)->length, 158 (long long)(rl + i)->length,
169 (rl + i)->length ? "" : 159 (rl + i)->length ? "" :
170 " (runlist end)"); 160 " (runlist end)");
171 } else 161 } else
172 printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", 162 pr_debug("%-16Lx %-16Lx %-16Lx%s\n",
173 (long long)(rl + i)->vcn, 163 (long long)(rl + i)->vcn,
174 (long long)(rl + i)->lcn, 164 (long long)(rl + i)->lcn,
175 (long long)(rl + i)->length, 165 (long long)(rl + i)->length,
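
The three logging helpers in this file move from vsnprintf() into a shared err_buf under err_buf_lock to printk's %pV extension, which expands a (format, va_list) pair atomically inside a single printk call, so the static buffer and its spinlock can go. A minimal sketch of the pattern:

#include <linux/kernel.h>
#include <linux/printk.h>

static __printf(1, 2) void my_warn(const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        /* %pV consumes the wrapped va_list; va_end() only after use. */
        pr_warn("%pV\n", &vaf);
        va_end(args);
}
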
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 53c27eaf2307..61bf091e32a8 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -48,7 +48,12 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl);
48 48
49#else /* !DEBUG */ 49#else /* !DEBUG */
50 50
51#define ntfs_debug(f, a...) do {} while (0) 51#define ntfs_debug(fmt, ...) \
52do { \
53 if (0) \
54 no_printk(fmt, ##__VA_ARGS__); \
55} while (0)
56
52#define ntfs_debug_dump_runlist(rl) do {} while (0) 57#define ntfs_debug_dump_runlist(rl) do {} while (0)
53 58
54#endif /* !DEBUG */ 59#endif /* !DEBUG */
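
The replacement macro looks odd but is deliberate: the if (0) arm compiles away entirely, yet no_printk() keeps printf-style type checking of the arguments, so a format bug in a debug call can no longer hide in !DEBUG builds. A sketch of what the old empty macro silently allowed:

#include <linux/printk.h>

#define my_debug(fmt, ...)                              \
do {                                                    \
        if (0)                                          \
                no_printk(fmt, ##__VA_ARGS__);          \
} while (0)

static void example(void)
{
        /* With "do {} while (0)" this mismatch compiled silently in
         * !DEBUG builds; with no_printk() it warns at compile time. */
        my_debug("value=%d\n", "not an int");
}
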
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index ffb9b3675736..9d8153ebacfb 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2259,7 +2259,7 @@ void ntfs_evict_big_inode(struct inode *vi)
2259{ 2259{
2260 ntfs_inode *ni = NTFS_I(vi); 2260 ntfs_inode *ni = NTFS_I(vi);
2261 2261
2262 truncate_inode_pages(&vi->i_data, 0); 2262 truncate_inode_pages_final(&vi->i_data);
2263 clear_inode(vi); 2263 clear_inode(vi);
2264 2264
2265#ifdef NTFS_RW 2265#ifdef NTFS_RW
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 82650d52d916..9de2491f2926 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -19,6 +19,7 @@
19 * distribution in the file COPYING); if not, write to the Free Software 19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */ 21 */
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22 23
23#include <linux/stddef.h> 24#include <linux/stddef.h>
24#include <linux/init.h> 25#include <linux/init.h>
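
pr_fmt() has to be defined before the first include that pulls in printk.h; every pr_*() call in the file is then prefixed automatically, which is why the hand-written "NTFS-fs"/"NTFS:" prefixes are dropped throughout the rest of this patch. An illustrative sketch, assuming KBUILD_MODNAME is "ntfs":

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/printk.h>

static void example(void)
{
        pr_info("volume version %i.%i.\n", 3, 1);
        /* emits "ntfs: volume version 3.1." */
}
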
@@ -468,6 +469,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
468 469
469 ntfs_debug("Entering with remount options string: %s", opt); 470 ntfs_debug("Entering with remount options string: %s", opt);
470 471
472 sync_filesystem(sb);
473
471#ifndef NTFS_RW 474#ifndef NTFS_RW
472 /* For read-only compiled driver, enforce read-only flag. */ 475 /* For read-only compiled driver, enforce read-only flag. */
473 *flags |= MS_RDONLY; 476 *flags |= MS_RDONLY;
@@ -1894,7 +1897,7 @@ get_ctx_vol_failed:
1894 vol->minor_ver = vi->minor_ver; 1897 vol->minor_ver = vi->minor_ver;
1895 ntfs_attr_put_search_ctx(ctx); 1898 ntfs_attr_put_search_ctx(ctx);
1896 unmap_mft_record(NTFS_I(vol->vol_ino)); 1899 unmap_mft_record(NTFS_I(vol->vol_ino));
1897 printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, 1900 pr_info("volume version %i.%i.\n", vol->major_ver,
1898 vol->minor_ver); 1901 vol->minor_ver);
1899 if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { 1902 if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
1900 ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " 1903 ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
@@ -3093,7 +3096,7 @@ static int __init init_ntfs_fs(void)
3093 int err = 0; 3096 int err = 0;
3094 3097
3095 /* This may be ugly but it results in pretty output so who cares. (-8 */ 3098 /* This may be ugly but it results in pretty output so who cares. (-8 */
3096 printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/" 3099 pr_info("driver " NTFS_VERSION " [Flags: R/"
3097#ifdef NTFS_RW 3100#ifdef NTFS_RW
3098 "W" 3101 "W"
3099#else 3102#else
@@ -3113,16 +3116,15 @@ static int __init init_ntfs_fs(void)
3113 sizeof(ntfs_index_context), 0 /* offset */, 3116 sizeof(ntfs_index_context), 0 /* offset */,
3114 SLAB_HWCACHE_ALIGN, NULL /* ctor */); 3117 SLAB_HWCACHE_ALIGN, NULL /* ctor */);
3115 if (!ntfs_index_ctx_cache) { 3118 if (!ntfs_index_ctx_cache) {
3116 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3119 pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
3117 ntfs_index_ctx_cache_name);
3118 goto ictx_err_out; 3120 goto ictx_err_out;
3119 } 3121 }
3120 ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, 3122 ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
3121 sizeof(ntfs_attr_search_ctx), 0 /* offset */, 3123 sizeof(ntfs_attr_search_ctx), 0 /* offset */,
3122 SLAB_HWCACHE_ALIGN, NULL /* ctor */); 3124 SLAB_HWCACHE_ALIGN, NULL /* ctor */);
3123 if (!ntfs_attr_ctx_cache) { 3125 if (!ntfs_attr_ctx_cache) {
3124 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3126 pr_crit("NTFS: Failed to create %s!\n",
3125 ntfs_attr_ctx_cache_name); 3127 ntfs_attr_ctx_cache_name);
3126 goto actx_err_out; 3128 goto actx_err_out;
3127 } 3129 }
3128 3130
@@ -3130,8 +3132,7 @@ static int __init init_ntfs_fs(void)
3130 (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, 3132 (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
3131 SLAB_HWCACHE_ALIGN, NULL); 3133 SLAB_HWCACHE_ALIGN, NULL);
3132 if (!ntfs_name_cache) { 3134 if (!ntfs_name_cache) {
3133 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3135 pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
3134 ntfs_name_cache_name);
3135 goto name_err_out; 3136 goto name_err_out;
3136 } 3137 }
3137 3138
@@ -3139,8 +3140,7 @@ static int __init init_ntfs_fs(void)
3139 sizeof(ntfs_inode), 0, 3140 sizeof(ntfs_inode), 0,
3140 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 3141 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
3141 if (!ntfs_inode_cache) { 3142 if (!ntfs_inode_cache) {
3142 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3143 pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
3143 ntfs_inode_cache_name);
3144 goto inode_err_out; 3144 goto inode_err_out;
3145 } 3145 }
3146 3146
@@ -3149,15 +3149,14 @@ static int __init init_ntfs_fs(void)
3149 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, 3149 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
3150 ntfs_big_inode_init_once); 3150 ntfs_big_inode_init_once);
3151 if (!ntfs_big_inode_cache) { 3151 if (!ntfs_big_inode_cache) {
3152 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3152 pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
3153 ntfs_big_inode_cache_name);
3154 goto big_inode_err_out; 3153 goto big_inode_err_out;
3155 } 3154 }
3156 3155
3157 /* Register the ntfs sysctls. */ 3156 /* Register the ntfs sysctls. */
3158 err = ntfs_sysctl(1); 3157 err = ntfs_sysctl(1);
3159 if (err) { 3158 if (err) {
3160 printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n"); 3159 pr_crit("Failed to register NTFS sysctls!\n");
3161 goto sysctl_err_out; 3160 goto sysctl_err_out;
3162 } 3161 }
3163 3162
@@ -3166,7 +3165,7 @@ static int __init init_ntfs_fs(void)
3166 ntfs_debug("NTFS driver registered successfully."); 3165 ntfs_debug("NTFS driver registered successfully.");
3167 return 0; /* Success! */ 3166 return 0; /* Success! */
3168 } 3167 }
3169 printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); 3168 pr_crit("Failed to register NTFS filesystem driver!\n");
3170 3169
3171 /* Unregister the ntfs sysctls. */ 3170 /* Unregister the ntfs sysctls. */
3172 ntfs_sysctl(0); 3171 ntfs_sysctl(0);
@@ -3182,8 +3181,7 @@ actx_err_out:
3182 kmem_cache_destroy(ntfs_index_ctx_cache); 3181 kmem_cache_destroy(ntfs_index_ctx_cache);
3183ictx_err_out: 3182ictx_err_out:
3184 if (!err) { 3183 if (!err) {
3185 printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " 3184 pr_crit("Aborting NTFS filesystem driver registration...\n");
3186 "registration...\n");
3187 err = -ENOMEM; 3185 err = -ENOMEM;
3188 } 3186 }
3189 return err; 3187 return err;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 555f4cddefe3..7e8282dcea2a 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -205,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
205 di->i_mode = cpu_to_le16(inode->i_mode); 205 di->i_mode = cpu_to_le16(inode->i_mode);
206 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 206 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
207 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 207 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
208 ocfs2_update_inode_fsync_trans(handle, inode, 0);
208 209
209 ocfs2_journal_dirty(handle, di_bh); 210 ocfs2_journal_dirty(handle, di_bh);
210 211
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e2edff38be52..b4deb5f750d9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5728,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5728 } 5728 }
5729 5729
5730 ocfs2_et_update_clusters(et, -len); 5730 ocfs2_et_update_clusters(et, -len);
5731 ocfs2_update_inode_fsync_trans(handle, inode, 1);
5731 5732
5732 ocfs2_journal_dirty(handle, et->et_root_bh); 5733 ocfs2_journal_dirty(handle, et->et_root_bh);
5733 5734
@@ -6932,6 +6933,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6932 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 6933 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6933 spin_unlock(&oi->ip_lock); 6934 spin_unlock(&oi->ip_lock);
6934 6935
6936 ocfs2_update_inode_fsync_trans(handle, inode, 1);
6935 ocfs2_dinode_new_extent_list(inode, di); 6937 ocfs2_dinode_new_extent_list(inode, di);
6936 6938
6937 ocfs2_journal_dirty(handle, di_bh); 6939 ocfs2_journal_dirty(handle, di_bh);
@@ -7208,6 +7210,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7208 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 7210 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7209 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 7211 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7210 7212
7213 ocfs2_update_inode_fsync_trans(handle, inode, 1);
7211 ocfs2_journal_dirty(handle, di_bh); 7214 ocfs2_journal_dirty(handle, di_bh);
7212 7215
7213out_commit: 7216out_commit:
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index aeb44e879c51..d310d12a9adc 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -571,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
571{ 571{
572 struct inode *inode = file_inode(iocb->ki_filp); 572 struct inode *inode = file_inode(iocb->ki_filp);
573 int level; 573 int level;
574 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
575 574
576 /* this io's submitter should not have unlocked this before we could */ 575 /* this io's submitter should not have unlocked this before we could */
577 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 576 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -582,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
582 if (ocfs2_iocb_is_unaligned_aio(iocb)) { 581 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
583 ocfs2_iocb_clear_unaligned_aio(iocb); 582 ocfs2_iocb_clear_unaligned_aio(iocb);
584 583
585 if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && 584 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
586 waitqueue_active(wq)) {
587 wake_up_all(wq);
588 }
589 } 585 }
590 586
591 ocfs2_iocb_clear_rw_locked(iocb); 587 ocfs2_iocb_clear_rw_locked(iocb);
@@ -2043,6 +2039,7 @@ out_write_size:
2043 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2039 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2044 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2040 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
2045 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2041 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
2042 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2046 ocfs2_journal_dirty(handle, wc->w_di_bh); 2043 ocfs2_journal_dirty(handle, wc->w_di_bh);
2047 2044
2048 ocfs2_commit_trans(osb, handle); 2045 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f671e49beb34..6cae155d54df 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits {
102#define ocfs2_iocb_is_unaligned_aio(iocb) \ 102#define ocfs2_iocb_is_unaligned_aio(iocb) \
103 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 103 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
104 104
105#define OCFS2_IOEND_WQ_HASH_SZ 37
106#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
107 OCFS2_IOEND_WQ_HASH_SZ])
108extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
109
110#endif /* OCFS2_FILE_H */ 105#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 5b704c63a103..1edcb141f639 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
90 * information for this bh as it's not marked locally 90 * information for this bh as it's not marked locally
91 * uptodate. */ 91 * uptodate. */
92 ret = -EIO; 92 ret = -EIO;
93 put_bh(bh);
94 mlog_errno(ret); 93 mlog_errno(ret);
95 } 94 }
96 95
@@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
420 419
421 if (!buffer_uptodate(bh)) { 420 if (!buffer_uptodate(bh)) {
422 ret = -EIO; 421 ret = -EIO;
423 put_bh(bh);
424 mlog_errno(ret); 422 mlog_errno(ret);
425 } 423 }
426 424
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index a4b07730b2e1..b7f57271d49c 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
41 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); 41 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
42} 42}
43static struct kobj_attribute attr_version = 43static struct kobj_attribute attr_version =
44 __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); 44 __ATTR(interface_revision, S_IRUGO, version_show, NULL);
45 45
46static struct attribute *o2cb_attrs[] = { 46static struct attribute *o2cb_attrs[] = {
47 &attr_version.attr, 47 &attr_version.attr,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2cd2406b4140..eb649d23a4de 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc)
262 262
263#endif /* CONFIG_OCFS2_FS_STATS */ 263#endif /* CONFIG_OCFS2_FS_STATS */
264 264
265static inline int o2net_reconnect_delay(void) 265static inline unsigned int o2net_reconnect_delay(void)
266{ 266{
267 return o2nm_single_cluster->cl_reconnect_delay_ms; 267 return o2nm_single_cluster->cl_reconnect_delay_ms;
268} 268}
269 269
270static inline int o2net_keepalive_delay(void) 270static inline unsigned int o2net_keepalive_delay(void)
271{ 271{
272 return o2nm_single_cluster->cl_keepalive_delay_ms; 272 return o2nm_single_cluster->cl_keepalive_delay_ms;
273} 273}
274 274
275static inline int o2net_idle_timeout(void) 275static inline unsigned int o2net_idle_timeout(void)
276{ 276{
277 return o2nm_single_cluster->cl_idle_timeout_ms; 277 return o2nm_single_cluster->cl_idle_timeout_ms;
278} 278}
@@ -1964,18 +1964,30 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
1964 goto out; 1964 goto out;
1965 } 1965 }
1966 1966
1967 /* ->sk_data_ready is also called for a newly established child socket 1967 /* This callback may be called twice when a new connection
1968 * before it has been accepted and the acceptor has set up their 1968 * is being established as a child socket inherits everything
1969 * data_ready.. we only want to queue listen work for our listening 1969 * from a parent LISTEN socket, including the data_ready cb of
1970 * socket */ 1970 * the parent. This leads to a hazard. In o2net_accept_one()
1971 * we are still initializing the child socket but have not
1972 * changed the inherited data_ready callback yet when
1973 * data starts arriving.
1974 * We avoid this hazard by checking the state.
1975 * For the listening socket, the state will be TCP_LISTEN; for the new
1976 * socket, will be TCP_ESTABLISHED. Also, in this case,
1977 * sk->sk_user_data is not a valid function pointer.
1978 */
1979
1971 if (sk->sk_state == TCP_LISTEN) { 1980 if (sk->sk_state == TCP_LISTEN) {
1972 mlog(ML_TCP, "bytes: %d\n", bytes); 1981 mlog(ML_TCP, "bytes: %d\n", bytes);
1973 queue_work(o2net_wq, &o2net_listen_work); 1982 queue_work(o2net_wq, &o2net_listen_work);
1983 } else {
1984 ready = NULL;
1974 } 1985 }
1975 1986
1976out: 1987out:
1977 read_unlock(&sk->sk_callback_lock); 1988 read_unlock(&sk->sk_callback_lock);
1978 ready(sk, bytes); 1989 if (ready != NULL)
1990 ready(sk, bytes);
1979} 1991}
1980 1992
1981static int o2net_open_listening_sock(__be32 addr, __be16 port) 1993static int o2net_open_listening_sock(__be32 addr, __be16 port)
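
The fix above works because a freshly accepted child socket inherits sk_data_ready from the listening socket before o2net_accept_one() can repoint it, so the callback must only queue listen work for sockets still in TCP_LISTEN, and must not chain to the saved handler for the half-initialised child. A standalone sketch of the guard (saved_ready and listen_work are illustrative statics; the real code keeps this state elsewhere):

#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/tcp_states.h>

static struct work_struct listen_work;  /* INIT_WORK()ed at setup time */
static void (*saved_ready)(struct sock *sk, int bytes);

static void my_listen_data_ready(struct sock *sk, int bytes)
{
        void (*ready)(struct sock *sk, int bytes);

        read_lock(&sk->sk_callback_lock);
        ready = saved_ready;
        if (sk->sk_state == TCP_LISTEN)
                schedule_work(&listen_work);
        else
                ready = NULL;           /* half-initialised child socket */
        read_unlock(&sk->sk_callback_lock);

        if (ready)
                ready(sk, bytes);
}
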
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 0d3a97d2d5f6..e2e05a106beb 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -37,7 +37,6 @@
37#include "dlmglue.h" 37#include "dlmglue.h"
38#include "file.h" 38#include "file.h"
39#include "inode.h" 39#include "inode.h"
40#include "super.h"
41#include "ocfs2_trace.h" 40#include "ocfs2_trace.h"
42 41
43void ocfs2_dentry_attach_gen(struct dentry *dentry) 42void ocfs2_dentry_attach_gen(struct dentry *dentry)
@@ -346,52 +345,6 @@ out_attach:
346 return ret; 345 return ret;
347} 346}
348 347
349DEFINE_SPINLOCK(dentry_list_lock);
350
351/* We limit the number of dentry locks to drop in one go. We have
352 * this limit so that we don't starve other users of ocfs2_wq. */
353#define DL_INODE_DROP_COUNT 64
354
355/* Drop inode references from dentry locks */
356static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
357{
358 struct ocfs2_dentry_lock *dl;
359
360 spin_lock(&dentry_list_lock);
361 while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
362 dl = osb->dentry_lock_list;
363 osb->dentry_lock_list = dl->dl_next;
364 spin_unlock(&dentry_list_lock);
365 iput(dl->dl_inode);
366 kfree(dl);
367 spin_lock(&dentry_list_lock);
368 }
369 spin_unlock(&dentry_list_lock);
370}
371
372void ocfs2_drop_dl_inodes(struct work_struct *work)
373{
374 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
375 dentry_lock_work);
376
377 __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
378 /*
379 * Don't queue dropping if umount is in progress. We flush the
380 * list in ocfs2_dismount_volume
381 */
382 spin_lock(&dentry_list_lock);
383 if (osb->dentry_lock_list &&
384 !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
385 queue_work(ocfs2_wq, &osb->dentry_lock_work);
386 spin_unlock(&dentry_list_lock);
387}
388
389/* Flush the whole work queue */
390void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
391{
392 __ocfs2_drop_dl_inodes(osb, -1);
393}
394
395/* 348/*
396 * ocfs2_dentry_iput() and friends. 349 * ocfs2_dentry_iput() and friends.
397 * 350 *
@@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
416static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, 369static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
417 struct ocfs2_dentry_lock *dl) 370 struct ocfs2_dentry_lock *dl)
418{ 371{
372 iput(dl->dl_inode);
419 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); 373 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
420 ocfs2_lock_res_free(&dl->dl_lockres); 374 ocfs2_lock_res_free(&dl->dl_lockres);
421 375 kfree(dl);
422 /* We leave dropping of inode reference to ocfs2_wq as that can
423 * possibly lead to inode deletion which gets tricky */
424 spin_lock(&dentry_list_lock);
425 if (!osb->dentry_lock_list &&
426 !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
427 queue_work(ocfs2_wq, &osb->dentry_lock_work);
428 dl->dl_next = osb->dentry_lock_list;
429 osb->dentry_lock_list = dl;
430 spin_unlock(&dentry_list_lock);
431} 376}
432 377
433void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 378void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
434 struct ocfs2_dentry_lock *dl) 379 struct ocfs2_dentry_lock *dl)
435{ 380{
436 int unlock; 381 int unlock = 0;
437 382
438 BUG_ON(dl->dl_count == 0); 383 BUG_ON(dl->dl_count == 0);
439 384
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index b79eff709958..55f58892b153 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,13 +29,8 @@
29extern const struct dentry_operations ocfs2_dentry_ops; 29extern const struct dentry_operations ocfs2_dentry_ops;
30 30
31struct ocfs2_dentry_lock { 31struct ocfs2_dentry_lock {
32 /* Use count of dentry lock */
33 unsigned int dl_count; 32 unsigned int dl_count;
34 union { 33 u64 dl_parent_blkno;
35 /* Linked list of dentry locks to release */
36 struct ocfs2_dentry_lock *dl_next;
37 u64 dl_parent_blkno;
38 };
39 34
40 /* 35 /*
41 * The ocfs2_dentry_lock keeps an inode reference until 36 * The ocfs2_dentry_lock keeps an inode reference until
@@ -49,14 +44,9 @@ struct ocfs2_dentry_lock {
49int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, 44int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
50 u64 parent_blkno); 45 u64 parent_blkno);
51 46
52extern spinlock_t dentry_list_lock;
53
54void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 47void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
55 struct ocfs2_dentry_lock *dl); 48 struct ocfs2_dentry_lock *dl);
56 49
57void ocfs2_drop_dl_inodes(struct work_struct *work);
58void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
59
60struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, 50struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
61 int skip_unhashed); 51 int skip_unhashed);
62 52
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 91a7e85ac8fd..0717662b4aef 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2957 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 2957 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
2958 } 2958 }
2959 2959
2960 ocfs2_update_inode_fsync_trans(handle, dir, 1);
2960 ocfs2_journal_dirty(handle, dirdata_bh); 2961 ocfs2_journal_dirty(handle, dirdata_bh);
2961 2962
2962 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2963 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3005,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3005 di->i_size = cpu_to_le64(sb->s_blocksize); 3006 di->i_size = cpu_to_le64(sb->s_blocksize);
3006 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); 3007 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
3007 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); 3008 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
3009 ocfs2_update_inode_fsync_trans(handle, dir, 1);
3008 3010
3009 /* 3011 /*
3010 * This should never fail as our extent list is empty and all 3012 * This should never fail as our extent list is empty and all
@@ -3338,6 +3340,7 @@ do_extend:
3338 } else { 3340 } else {
3339 de->rec_len = cpu_to_le16(sb->s_blocksize); 3341 de->rec_len = cpu_to_le16(sb->s_blocksize);
3340 } 3342 }
3343 ocfs2_update_inode_fsync_trans(handle, dir, 1);
3341 ocfs2_journal_dirty(handle, new_bh); 3344 ocfs2_journal_dirty(handle, new_bh);
3342 3345
3343 dir_i_size += dir->i_sb->s_blocksize; 3346 dir_i_size += dir->i_sb->s_blocksize;
@@ -3896,6 +3899,7 @@ out_commit:
3896 dquot_free_space_nodirty(dir, 3899 dquot_free_space_nodirty(dir,
3897 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3900 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3898 3901
3902 ocfs2_update_inode_fsync_trans(handle, dir, 1);
3899 ocfs2_commit_trans(osb, handle); 3903 ocfs2_commit_trans(osb, handle);
3900 3904
3901out: 3905out:
@@ -4134,6 +4138,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4134 mlog_errno(ret); 4138 mlog_errno(ret);
4135 did_quota = 0; 4139 did_quota = 0;
4136 4140
4141 ocfs2_update_inode_fsync_trans(handle, dir, 1);
4137 ocfs2_journal_dirty(handle, dx_root_bh); 4142 ocfs2_journal_dirty(handle, dx_root_bh);
4138 4143
4139out_commit: 4144out_commit:
@@ -4401,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4401 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 4406 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4402 spin_unlock(&OCFS2_I(dir)->ip_lock); 4407 spin_unlock(&OCFS2_I(dir)->ip_lock);
4403 di->i_dx_root = cpu_to_le64(0ULL); 4408 di->i_dx_root = cpu_to_le64(0ULL);
4409 ocfs2_update_inode_fsync_trans(handle, dir, 1);
4404 4410
4405 ocfs2_journal_dirty(handle, di_bh); 4411 ocfs2_journal_dirty(handle, di_bh);
4406 4412
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 33660a4a52fa..c973690dc0bc 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1123,7 +1123,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1123 struct dlm_ctxt *dlm = NULL; 1123 struct dlm_ctxt *dlm = NULL;
1124 char *local = NULL; 1124 char *local = NULL;
1125 int status = 0; 1125 int status = 0;
1126 int locked = 0;
1127 1126
1128 qr = (struct dlm_query_region *) msg->buf; 1127 qr = (struct dlm_query_region *) msg->buf;
1129 1128
@@ -1132,10 +1131,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1132 1131
1133 /* buffer used in dlm_mast_regions() */ 1132 /* buffer used in dlm_mast_regions() */
1134 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); 1133 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
1135 if (!local) { 1134 if (!local)
1136 status = -ENOMEM; 1135 return -ENOMEM;
1137 goto bail;
1138 }
1139 1136
1140 status = -EINVAL; 1137 status = -EINVAL;
1141 1138
@@ -1144,16 +1141,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1144 if (!dlm) { 1141 if (!dlm) {
1145 mlog(ML_ERROR, "Node %d queried hb regions on domain %s " 1142 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1146 "before join domain\n", qr->qr_node, qr->qr_domain); 1143 "before join domain\n", qr->qr_node, qr->qr_domain);
1147 goto bail; 1144 goto out_domain_lock;
1148 } 1145 }
1149 1146
1150 spin_lock(&dlm->spinlock); 1147 spin_lock(&dlm->spinlock);
1151 locked = 1;
1152 if (dlm->joining_node != qr->qr_node) { 1148 if (dlm->joining_node != qr->qr_node) {
1153 mlog(ML_ERROR, "Node %d queried hb regions on domain %s " 1149 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1154 "but joining node is %d\n", qr->qr_node, qr->qr_domain, 1150 "but joining node is %d\n", qr->qr_node, qr->qr_domain,
1155 dlm->joining_node); 1151 dlm->joining_node);
1156 goto bail; 1152 goto out_dlm_lock;
1157 } 1153 }
1158 1154
1159 /* Support for global heartbeat was added in 1.1 */ 1155 /* Support for global heartbeat was added in 1.1 */
@@ -1163,14 +1159,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1163 "but active dlm protocol is %d.%d\n", qr->qr_node, 1159 "but active dlm protocol is %d.%d\n", qr->qr_node,
1164 qr->qr_domain, dlm->dlm_locking_proto.pv_major, 1160 qr->qr_domain, dlm->dlm_locking_proto.pv_major,
1165 dlm->dlm_locking_proto.pv_minor); 1161 dlm->dlm_locking_proto.pv_minor);
1166 goto bail; 1162 goto out_dlm_lock;
1167 } 1163 }
1168 1164
1169 status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); 1165 status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
1170 1166
1171bail: 1167out_dlm_lock:
1172 if (locked) 1168 spin_unlock(&dlm->spinlock);
1173 spin_unlock(&dlm->spinlock); 1169
1170out_domain_lock:
1174 spin_unlock(&dlm_domain_lock); 1171 spin_unlock(&dlm_domain_lock);
1175 1172
1176 kfree(local); 1173 kfree(local);
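
The handler is reshaped into the usual kernel error-unwind style: one goto label per held resource, released in reverse order, which removes the "locked" bookkeeping variable. A generic sketch of the idiom (the locks and checks are illustrative):

#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(outer_lock);
static DEFINE_SPINLOCK(inner_lock);

static int handler(size_t len)
{
        int status = -EINVAL;
        char *local = kmalloc(len, GFP_KERNEL);

        if (!local)
                return -ENOMEM;

        spin_lock(&outer_lock);
        if (!len)                       /* first precondition */
                goto out_outer;

        spin_lock(&inner_lock);
        if (len > 4096)                 /* second precondition */
                goto out_inner;

        status = 0;                     /* the actual work goes here */
out_inner:
        spin_unlock(&inner_lock);
out_outer:
        spin_unlock(&outer_lock);
        kfree(local);
        return status;
}
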
@@ -1877,19 +1874,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1877 goto bail; 1874 goto bail;
1878 } 1875 }
1879 1876
1880 status = dlm_debug_init(dlm); 1877 status = dlm_launch_thread(dlm);
1881 if (status < 0) { 1878 if (status < 0) {
1882 mlog_errno(status); 1879 mlog_errno(status);
1883 goto bail; 1880 goto bail;
1884 } 1881 }
1885 1882
1886 status = dlm_launch_thread(dlm); 1883 status = dlm_launch_recovery_thread(dlm);
1887 if (status < 0) { 1884 if (status < 0) {
1888 mlog_errno(status); 1885 mlog_errno(status);
1889 goto bail; 1886 goto bail;
1890 } 1887 }
1891 1888
1892 status = dlm_launch_recovery_thread(dlm); 1889 status = dlm_debug_init(dlm);
1893 if (status < 0) { 1890 if (status < 0) {
1894 mlog_errno(status); 1891 mlog_errno(status);
1895 goto bail; 1892 goto bail;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7035af09cc03..fe29f7978f81 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -537,7 +537,10 @@ master_here:
537 /* success! see if any other nodes need recovery */ 537 /* success! see if any other nodes need recovery */
538 mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", 538 mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
539 dlm->name, dlm->reco.dead_node, dlm->node_num); 539 dlm->name, dlm->reco.dead_node, dlm->node_num);
540 dlm_reset_recovery(dlm); 540 spin_lock(&dlm->spinlock);
541 __dlm_reset_recovery(dlm);
542 dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
543 spin_unlock(&dlm->spinlock);
541 } 544 }
542 dlm_end_recovery(dlm); 545 dlm_end_recovery(dlm);
543 546
@@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
695 if (all_nodes_done) { 698 if (all_nodes_done) {
696 int ret; 699 int ret;
697 700
701 /* Set this flag on the recovery master to avoid
702 * a new recovery for another dead node starting
703 * before this recovery is done. That may
704 * cause recovery to hang.*/
705 spin_lock(&dlm->spinlock);
706 dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
707 spin_unlock(&dlm->spinlock);
708
698 /* all nodes are now in DLM_RECO_NODE_DATA_DONE state 709 /* all nodes are now in DLM_RECO_NODE_DATA_DONE state
699 * just send a finalize message to everyone and 710 * just send a finalize message to everyone and
700 * clean up */ 711 * clean up */
@@ -1750,13 +1761,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1750 struct dlm_migratable_lockres *mres) 1761 struct dlm_migratable_lockres *mres)
1751{ 1762{
1752 struct dlm_migratable_lock *ml; 1763 struct dlm_migratable_lock *ml;
1753 struct list_head *queue; 1764 struct list_head *queue, *iter;
1754 struct list_head *tmpq = NULL; 1765 struct list_head *tmpq = NULL;
1755 struct dlm_lock *newlock = NULL; 1766 struct dlm_lock *newlock = NULL;
1756 struct dlm_lockstatus *lksb = NULL; 1767 struct dlm_lockstatus *lksb = NULL;
1757 int ret = 0; 1768 int ret = 0;
1758 int i, j, bad; 1769 int i, j, bad;
1759 struct dlm_lock *lock = NULL; 1770 struct dlm_lock *lock;
1760 u8 from = O2NM_MAX_NODES; 1771 u8 from = O2NM_MAX_NODES;
1761 unsigned int added = 0; 1772 unsigned int added = 0;
1762 __be64 c; 1773 __be64 c;
@@ -1791,14 +1802,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1791 /* MIGRATION ONLY! */ 1802 /* MIGRATION ONLY! */
1792 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); 1803 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
1793 1804
1805 lock = NULL;
1794 spin_lock(&res->spinlock); 1806 spin_lock(&res->spinlock);
1795 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { 1807 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1796 tmpq = dlm_list_idx_to_ptr(res, j); 1808 tmpq = dlm_list_idx_to_ptr(res, j);
1797 list_for_each_entry(lock, tmpq, list) { 1809 list_for_each(iter, tmpq) {
1798 if (lock->ml.cookie != ml->cookie) 1810 lock = list_entry(iter,
1799 lock = NULL; 1811 struct dlm_lock, list);
1800 else 1812 if (lock->ml.cookie == ml->cookie)
1801 break; 1813 break;
1814 lock = NULL;
1802 } 1815 }
1803 if (lock) 1816 if (lock)
1804 break; 1817 break;
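
The loop is converted from list_for_each_entry() to list_for_each() plus list_entry() because the code tests the cursor after the loop: when list_for_each_entry() runs to completion, its cursor aliases the list head itself and must not be dereferenced. A sketch of the safe find-or-NULL idiom:

#include <linux/list.h>
#include <linux/types.h>

struct item {
        struct list_head list;
        u64 cookie;
};

static struct item *find_item(struct list_head *head, u64 cookie)
{
        struct list_head *iter;
        struct item *it;

        list_for_each(iter, head) {
                it = list_entry(iter, struct item, list);
                if (it->cookie == cookie)
                        return it;      /* found a real element */
        }
        return NULL;                    /* never a bogus head alias */
}
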
@@ -2882,8 +2895,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2882 BUG(); 2895 BUG();
2883 } 2896 }
2884 dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; 2897 dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
2898 __dlm_reset_recovery(dlm);
2885 spin_unlock(&dlm->spinlock); 2899 spin_unlock(&dlm->spinlock);
2886 dlm_reset_recovery(dlm);
2887 dlm_kick_recovery_thread(dlm); 2900 dlm_kick_recovery_thread(dlm);
2888 break; 2901 break;
2889 default: 2902 default:
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 19986959d149..6bd690b5a061 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3144,22 +3144,60 @@ out:
3144 return 0; 3144 return 0;
3145} 3145}
3146 3146
3147static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3148 struct ocfs2_lock_res *lockres);
3149
3147/* Mark the lockres as being dropped. It will no longer be 3150/* Mark the lockres as being dropped. It will no longer be
3148 * queued if blocking, but we still may have to wait on it 3151 * queued if blocking, but we still may have to wait on it
3149 * being dequeued from the downconvert thread before we can consider 3152 * being dequeued from the downconvert thread before we can consider
3150 * it safe to drop. 3153 * it safe to drop.
3151 * 3154 *
3152 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 3155 * You can *not* attempt to call cluster_lock on this lockres anymore. */
3153void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) 3156void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
3157 struct ocfs2_lock_res *lockres)
3154{ 3158{
3155 int status; 3159 int status;
3156 struct ocfs2_mask_waiter mw; 3160 struct ocfs2_mask_waiter mw;
3157 unsigned long flags; 3161 unsigned long flags, flags2;
3158 3162
3159 ocfs2_init_mask_waiter(&mw); 3163 ocfs2_init_mask_waiter(&mw);
3160 3164
3161 spin_lock_irqsave(&lockres->l_lock, flags); 3165 spin_lock_irqsave(&lockres->l_lock, flags);
3162 lockres->l_flags |= OCFS2_LOCK_FREEING; 3166 lockres->l_flags |= OCFS2_LOCK_FREEING;
3167 if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
3168 /*
3169 * We know the downconvert is queued but not in progress
3170 * because we are the downconvert thread and processing
3171 * different lock. So we can just remove the lock from the
3172 * queue. This is not only an optimization but also a way
3173 * to avoid the following deadlock:
3174 * ocfs2_dentry_post_unlock()
3175 * ocfs2_dentry_lock_put()
3176 * ocfs2_drop_dentry_lock()
3177 * iput()
3178 * ocfs2_evict_inode()
3179 * ocfs2_clear_inode()
3180 * ocfs2_mark_lockres_freeing()
3181 * ... blocks waiting for OCFS2_LOCK_QUEUED
3182 * since we are the downconvert thread which
3183 * should clear the flag.
3184 */
3185 spin_unlock_irqrestore(&lockres->l_lock, flags);
3186 spin_lock_irqsave(&osb->dc_task_lock, flags2);
3187 list_del_init(&lockres->l_blocked_list);
3188 osb->blocked_lock_count--;
3189 spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
3190 /*
3191 * Warn if we recurse into another post_unlock call. Strictly
3192 * speaking it isn't a problem but we need to be careful if
3193 * that happens (stack overflow, deadlocks, ...) so warn if
3194 * ocfs2 grows a path for which this can happen.
3195 */
3196 WARN_ON_ONCE(lockres->l_ops->post_unlock);
3197 /* Since the lock is freeing we don't do much in the fn below */
3198 ocfs2_process_blocked_lock(osb, lockres);
3199 return;
3200 }
3163 while (lockres->l_flags & OCFS2_LOCK_QUEUED) { 3201 while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
3164 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); 3202 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
3165 spin_unlock_irqrestore(&lockres->l_lock, flags); 3203 spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3180,7 +3218,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
3180{ 3218{
3181 int ret; 3219 int ret;
3182 3220
3183 ocfs2_mark_lockres_freeing(lockres); 3221 ocfs2_mark_lockres_freeing(osb, lockres);
3184 ret = ocfs2_drop_lock(osb, lockres); 3222 ret = ocfs2_drop_lock(osb, lockres);
3185 if (ret) 3223 if (ret)
3186 mlog_errno(ret); 3224 mlog_errno(ret);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 1d596d8c4a4a..d293a22c32c5 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
157void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); 157void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
158 158
159 159
160void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 160void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
161 struct ocfs2_lock_res *lockres);
161void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 162void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
162 struct ocfs2_lock_res *lockres); 163 struct ocfs2_lock_res *lockres);
163 164
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 51632c40e896..ff33c5ef87f2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
175 int datasync) 175 int datasync)
176{ 176{
177 int err = 0; 177 int err = 0;
178 journal_t *journal;
179 struct inode *inode = file->f_mapping->host; 178 struct inode *inode = file->f_mapping->host;
180 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 179 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
180 struct ocfs2_inode_info *oi = OCFS2_I(inode);
181 journal_t *journal = osb->journal->j_journal;
182 int ret;
183 tid_t commit_tid;
184 bool needs_barrier = false;
181 185
182 trace_ocfs2_sync_file(inode, file, file->f_path.dentry, 186 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
183 OCFS2_I(inode)->ip_blkno, 187 OCFS2_I(inode)->ip_blkno,
@@ -192,29 +196,19 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
192 if (err) 196 if (err)
193 return err; 197 return err;
194 198
195 /* 199 commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
196 * Probably don't need the i_mutex at all in here, just putting it here 200 if (journal->j_flags & JBD2_BARRIER &&
197 * to be consistent with how fsync used to be called, someone more 201 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
198 * familiar with the fs could possibly remove it. 202 needs_barrier = true;
199 */ 203 err = jbd2_complete_transaction(journal, commit_tid);
200 mutex_lock(&inode->i_mutex); 204 if (needs_barrier) {
201 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 205 ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
202 /* 206 if (!err)
203 * We still have to flush drive's caches to get data to the 207 err = ret;
204 * platter
205 */
206 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
207 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
208 goto bail;
209 } 208 }
210 209
211 journal = osb->journal->j_journal;
212 err = jbd2_journal_force_commit(journal);
213
214bail:
215 if (err) 210 if (err)
216 mlog_errno(err); 211 mlog_errno(err);
217 mutex_unlock(&inode->i_mutex);
218 212
219 return (err < 0) ? -EIO : 0; 213 return (err < 0) ? -EIO : 0;
220} 214}
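
ocfs2_sync_file() now follows the ext4 approach: rather than forcing a full journal commit under i_mutex, it waits for the specific transaction recorded in i_sync_tid/i_datasync_tid and issues its own cache flush only when that commit will not already send a barrier. A condensed sketch of the decision:

#include <linux/blkdev.h>
#include <linux/jbd2.h>

static int sync_to_tid(journal_t *journal, struct block_device *bdev,
                       tid_t commit_tid)
{
        bool needs_barrier = false;
        int err, ret;

        if (journal->j_flags & JBD2_BARRIER &&
            !jbd2_trans_will_send_data_barrier(journal, commit_tid))
                needs_barrier = true;

        /* Waits for (or triggers) the commit of exactly this tid. */
        err = jbd2_complete_transaction(journal, commit_tid);
        if (needs_barrier) {
                ret = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
                if (!err)
                        err = ret;
        }
        return err;
}
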
@@ -292,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
292 inode->i_atime = CURRENT_TIME; 286 inode->i_atime = CURRENT_TIME;
293 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 287 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
294 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 288 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
289 ocfs2_update_inode_fsync_trans(handle, inode, 0);
295 ocfs2_journal_dirty(handle, bh); 290 ocfs2_journal_dirty(handle, bh);
296 291
297out_commit: 292out_commit:
@@ -341,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode,
341 if (ret < 0) 336 if (ret < 0)
342 mlog_errno(ret); 337 mlog_errno(ret);
343 338
339 ocfs2_update_inode_fsync_trans(handle, inode, 0);
344 ocfs2_commit_trans(osb, handle); 340 ocfs2_commit_trans(osb, handle);
345out: 341out:
346 return ret; 342 return ret;
@@ -435,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
435 di->i_size = cpu_to_le64(new_i_size); 431 di->i_size = cpu_to_le64(new_i_size);
436 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 432 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
437 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 433 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
434 ocfs2_update_inode_fsync_trans(handle, inode, 0);
438 435
439 ocfs2_journal_dirty(handle, fe_bh); 436 ocfs2_journal_dirty(handle, fe_bh);
440 437
@@ -650,7 +647,7 @@ restarted_transaction:
650 mlog_errno(status); 647 mlog_errno(status);
651 goto leave; 648 goto leave;
652 } 649 }
653 650 ocfs2_update_inode_fsync_trans(handle, inode, 1);
654 ocfs2_journal_dirty(handle, bh); 651 ocfs2_journal_dirty(handle, bh);
655 652
656 spin_lock(&OCFS2_I(inode)->ip_lock); 653 spin_lock(&OCFS2_I(inode)->ip_lock);
@@ -743,6 +740,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
743 OCFS2_JOURNAL_ACCESS_WRITE); 740 OCFS2_JOURNAL_ACCESS_WRITE);
744 if (ret) 741 if (ret)
745 mlog_errno(ret); 742 mlog_errno(ret);
743 ocfs2_update_inode_fsync_trans(handle, inode, 1);
746 744
747out: 745out:
748 if (ret) { 746 if (ret) {
@@ -840,6 +838,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
840 di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 838 di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
841 di->i_mtime_nsec = di->i_ctime_nsec; 839 di->i_mtime_nsec = di->i_ctime_nsec;
842 ocfs2_journal_dirty(handle, di_bh); 840 ocfs2_journal_dirty(handle, di_bh);
841 ocfs2_update_inode_fsync_trans(handle, inode, 1);
843 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 842 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
844 } 843 }
845 844
@@ -1344,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1344 1343
1345 di = (struct ocfs2_dinode *) bh->b_data; 1344 di = (struct ocfs2_dinode *) bh->b_data;
1346 di->i_mode = cpu_to_le16(inode->i_mode); 1345 di->i_mode = cpu_to_le16(inode->i_mode);
1346 ocfs2_update_inode_fsync_trans(handle, inode, 0);
1347 1347
1348 ocfs2_journal_dirty(handle, bh); 1348 ocfs2_journal_dirty(handle, bh);
1349 1349
@@ -1576,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1576 if (ret) 1576 if (ret)
1577 mlog_errno(ret); 1577 mlog_errno(ret);
1578 } 1578 }
1579 ocfs2_update_inode_fsync_trans(handle, inode, 1);
1579 1580
1580 ocfs2_commit_trans(osb, handle); 1581 ocfs2_commit_trans(osb, handle);
1581out: 1582out:
@@ -2061,13 +2062,6 @@ out:
2061 return ret; 2062 return ret;
2062} 2063}
2063 2064
2064static void ocfs2_aiodio_wait(struct inode *inode)
2065{
2066 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2067
2068 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2069}
2070
2071static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) 2065static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2072{ 2066{
2073 int blockmask = inode->i_sb->s_blocksize - 1; 2067 int blockmask = inode->i_sb->s_blocksize - 1;
@@ -2345,10 +2339,8 @@ relock:
2345 * Wait on previous unaligned aio to complete before 2339 * Wait on previous unaligned aio to complete before
2346 * proceeding. 2340 * proceeding.
2347 */ 2341 */
2348 ocfs2_aiodio_wait(inode); 2342 mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
2349 2343 /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
2350 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2351 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2352 ocfs2_iocb_set_unaligned_aio(iocb); 2344 ocfs2_iocb_set_unaligned_aio(iocb);
2353 } 2345 }
2354 2346
@@ -2428,7 +2420,7 @@ out_dio:
2428 2420
2429 if (unaligned_dio) { 2421 if (unaligned_dio) {
2430 ocfs2_iocb_clear_unaligned_aio(iocb); 2422 ocfs2_iocb_clear_unaligned_aio(iocb);
2431 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); 2423 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
2432 } 2424 }
2433 2425
2434out: 2426out:
@@ -2645,7 +2637,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2645 case SEEK_SET: 2637 case SEEK_SET:
2646 break; 2638 break;
2647 case SEEK_END: 2639 case SEEK_END:
2648 offset += inode->i_size; 2640 /* SEEK_END requires the OCFS2 inode lock for the file
2641 * because it references the file's size.
2642 */
2643 ret = ocfs2_inode_lock(inode, NULL, 0);
2644 if (ret < 0) {
2645 mlog_errno(ret);
2646 goto out;
2647 }
2648 offset += i_size_read(inode);
2649 ocfs2_inode_unlock(inode, 0);
2649 break; 2650 break;
2650 case SEEK_CUR: 2651 case SEEK_CUR:
2651 if (offset == 0) { 2652 if (offset == 0) {
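
The file.c changes above replace the atomic counter plus wait queue with a per-inode mutex, so unaligned direct writes are serialized against each other instead of merely draining before a new one starts. A minimal sketch of the resulting flow, using the names from the patch (illustrative, error handling elided):

	if (unaligned_dio) {
		/* Only one unaligned direct write in flight per inode. */
		mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
		ocfs2_iocb_set_unaligned_aio(iocb);
	}

	/* ... issue the direct write ... */

	if (unaligned_dio) {
		/* Synchronous path: drop the mutex here. For AIO that
		 * completes later, ocfs2_dio_end_io unlocks instead. */
		ocfs2_iocb_clear_unaligned_aio(iocb);
		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
	}
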
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f29a90fde619..437de7f768c6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
130 struct inode *inode = NULL; 130 struct inode *inode = NULL;
131 struct super_block *sb = osb->sb; 131 struct super_block *sb = osb->sb;
132 struct ocfs2_find_inode_args args; 132 struct ocfs2_find_inode_args args;
133 journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
133 134
134 trace_ocfs2_iget_begin((unsigned long long)blkno, flags, 135 trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
135 sysfile_type); 136 sysfile_type);
@@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
169 goto bail; 170 goto bail;
170 } 171 }
171 172
173 /*
 174 * Set transaction ids of the transactions that have to be committed
 175 * to finish f[data]sync. We set them to the currently running
 176 * transaction as we cannot be sure that the inode or some of its
 177 * metadata isn't part of the transaction - the inode could have been
 178 * reclaimed and is now reread from disk.
179 */
180 if (journal) {
181 transaction_t *transaction;
182 tid_t tid;
183 struct ocfs2_inode_info *oi = OCFS2_I(inode);
184
185 read_lock(&journal->j_state_lock);
186 if (journal->j_running_transaction)
187 transaction = journal->j_running_transaction;
188 else
189 transaction = journal->j_committing_transaction;
190 if (transaction)
191 tid = transaction->t_tid;
192 else
193 tid = journal->j_commit_sequence;
194 read_unlock(&journal->j_state_lock);
195 oi->i_sync_tid = tid;
196 oi->i_datasync_tid = tid;
197 }
198
172bail: 199bail:
173 if (!IS_ERR(inode)) { 200 if (!IS_ERR(inode)) {
174 trace_ocfs2_iget_end(inode, 201 trace_ocfs2_iget_end(inode,
@@ -804,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
804 goto bail; 831 goto bail;
805 } 832 }
806 833
807 /* If we're coming from downconvert_thread we can't go into our own 834 /*
808 * voting [hello, deadlock city!], so unforuntately we just 835 * If we're coming from downconvert_thread we can't go into our own
809 * have to skip deleting this guy. That's OK though because 836 * voting [hello, deadlock city!] so we cannot delete the inode. But
810 * the node who's doing the actual deleting should handle it 837 * since we dropped last inode ref when downconverting dentry lock,
811 * anyway. */ 838 * we cannot have the file open and thus the node doing unlink will
839 * take care of deleting the inode.
840 */
812 if (current == osb->dc_task) 841 if (current == osb->dc_task)
813 goto bail; 842 goto bail;
814 843
@@ -822,12 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
822 goto bail_unlock; 851 goto bail_unlock;
823 } 852 }
824 853
825 /* If we have allowd wipe of this inode for another node, it
826 * will be marked here so we can safely skip it. Recovery will
827 * cleanup any inodes we might inadvertently skip here. */
828 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
829 goto bail_unlock;
830
831 ret = 1; 854 ret = 1;
832bail_unlock: 855bail_unlock:
833 spin_unlock(&oi->ip_lock); 856 spin_unlock(&oi->ip_lock);
@@ -941,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
941 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 964 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
942 if (sync_data) 965 if (sync_data)
943 filemap_write_and_wait(inode->i_mapping); 966 filemap_write_and_wait(inode->i_mapping);
944 truncate_inode_pages(&inode->i_data, 0); 967 truncate_inode_pages_final(&inode->i_data);
945} 968}
946 969
947static void ocfs2_delete_inode(struct inode *inode) 970static void ocfs2_delete_inode(struct inode *inode)
@@ -960,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode)
960 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) 983 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
961 goto bail; 984 goto bail;
962 985
963 dquot_initialize(inode);
964
965 if (!ocfs2_inode_is_valid_to_delete(inode)) { 986 if (!ocfs2_inode_is_valid_to_delete(inode)) {
966 /* It's probably not necessary to truncate_inode_pages 987 /* It's probably not necessary to truncate_inode_pages
967 * here but we do it for safety anyway (it will most 988 * here but we do it for safety anyway (it will most
@@ -970,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode)
970 goto bail; 991 goto bail;
971 } 992 }
972 993
994 dquot_initialize(inode);
995
973 /* We want to block signals in delete_inode as the lock and 996 /* We want to block signals in delete_inode as the lock and
974 * messaging paths may return us -ERESTARTSYS. Which would 997 * messaging paths may return us -ERESTARTSYS. Which would
975 * cause us to exit early, resulting in inodes being orphaned 998 * cause us to exit early, resulting in inodes being orphaned
@@ -1057,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode)
1057{ 1080{
1058 int status; 1081 int status;
1059 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1082 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1083 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1060 1084
1061 clear_inode(inode); 1085 clear_inode(inode);
1062 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, 1086 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
@@ -1073,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode)
1073 1097
1074 /* Do these before all the other work so that we don't bounce 1098 /* Do these before all the other work so that we don't bounce
1075 * the downconvert thread while waiting to destroy the locks. */ 1099 * the downconvert thread while waiting to destroy the locks. */
1076 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 1100 ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
1077 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); 1101 ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
1078 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1102 ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
1079 1103
1080 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, 1104 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
1081 &oi->ip_la_data_resv); 1105 &oi->ip_la_data_resv);
@@ -1157,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode)
1157 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { 1181 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
1158 ocfs2_delete_inode(inode); 1182 ocfs2_delete_inode(inode);
1159 } else { 1183 } else {
1160 truncate_inode_pages(&inode->i_data, 0); 1184 truncate_inode_pages_final(&inode->i_data);
1161 } 1185 }
1162 ocfs2_clear_inode(inode); 1186 ocfs2_clear_inode(inode);
1163} 1187}
@@ -1260,6 +1284,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1260 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1284 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1261 1285
1262 ocfs2_journal_dirty(handle, bh); 1286 ocfs2_journal_dirty(handle, bh);
1287 ocfs2_update_inode_fsync_trans(handle, inode, 1);
1263leave: 1288leave:
1264 return status; 1289 return status;
1265} 1290}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 621fc73bf23d..a6c991c0fc98 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -44,7 +44,7 @@ struct ocfs2_inode_info
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */ 46 /* Number of outstanding AIO's which are not page aligned */
47 atomic_t ip_unaligned_aio; 47 struct mutex ip_unaligned_aio;
48 48
49 /* These fields are protected by ip_lock */ 49 /* These fields are protected by ip_lock */
50 spinlock_t ip_lock; 50 spinlock_t ip_lock;
@@ -73,6 +73,13 @@ struct ocfs2_inode_info
73 u32 ip_dir_lock_gen; 73 u32 ip_dir_lock_gen;
74 74
75 struct ocfs2_alloc_reservation ip_la_data_resv; 75 struct ocfs2_alloc_reservation ip_la_data_resv;
76
77 /*
78 * Transactions that contain inode's metadata needed to complete
79 * fsync and fdatasync, respectively.
80 */
81 tid_t i_sync_tid;
82 tid_t i_datasync_tid;
76}; 83};
77 84
78/* 85/*
@@ -84,8 +91,6 @@ struct ocfs2_inode_info
84#define OCFS2_INODE_BITMAP 0x00000004 91#define OCFS2_INODE_BITMAP 0x00000004
85/* This inode has been wiped from disk */ 92/* This inode has been wiped from disk */
86#define OCFS2_INODE_DELETED 0x00000008 93#define OCFS2_INODE_DELETED 0x00000008
87/* Another node is deleting, so our delete is a nop */
88#define OCFS2_INODE_SKIP_DELETE 0x00000010
89/* Has the inode been orphaned on another node? 94/* Has the inode been orphaned on another node?
90 * 95 *
91 * This hints to ocfs2_drop_inode that it should clear i_nlink before 96 * This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -100,11 +105,11 @@ struct ocfs2_inode_info
100 * rely on ocfs2_delete_inode to sort things out under the proper 105 * rely on ocfs2_delete_inode to sort things out under the proper
101 * cluster locks. 106 * cluster locks.
102 */ 107 */
103#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 108#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010
104/* Does someone have the file open O_DIRECT */ 109/* Does someone have the file open O_DIRECT */
105#define OCFS2_INODE_OPEN_DIRECT 0x00000040 110#define OCFS2_INODE_OPEN_DIRECT 0x00000020
106/* Tell the inode wipe code it's not in orphan dir */ 111/* Tell the inode wipe code it's not in orphan dir */
107#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 112#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
108 113
109static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) 114static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
110{ 115{
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8ca3c29accbf..490229f43731 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -413,11 +413,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
413 } 413 }
414 414
415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); 415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
416 if (status < 0)
417 goto bail;
418 416
419 iput(inode_alloc); 417 iput(inode_alloc);
420 inode_alloc = NULL; 418 inode_alloc = NULL;
419
420 if (status < 0)
421 goto bail;
421 } 422 }
422 423
423 o2info_set_request_filled(&oifi->ifi_req); 424 o2info_set_request_filled(&oifi->ifi_req);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 44fc3e530c3d..03ea9314fecd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2132 iter = oi->ip_next_orphan; 2132 iter = oi->ip_next_orphan;
2133 2133
2134 spin_lock(&oi->ip_lock); 2134 spin_lock(&oi->ip_lock);
2135 /* The remote delete code may have set these on the
2136 * assumption that the other node would wipe them
2137 * successfully. If they are still in the node's
2138 * orphan dir, we need to reset that state. */
2139 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
2140
2141 /* Set the proper information to get us going into 2135 /* Set the proper information to get us going into
2142 * ocfs2_delete_inode. */ 2136 * ocfs2_delete_inode. */
2143 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 2137 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 9ff4e8cf9d97..7f8cde94abfe 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -626,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
626 new_size); 626 new_size);
627} 627}
628 628
629static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
630 struct inode *inode,
631 int datasync)
632{
633 struct ocfs2_inode_info *oi = OCFS2_I(inode);
634
635 oi->i_sync_tid = handle->h_transaction->t_tid;
636 if (datasync)
637 oi->i_datasync_tid = handle->h_transaction->t_tid;
638}
639
629#endif /* OCFS2_JOURNAL_H */ 640#endif /* OCFS2_JOURNAL_H */
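
The helper records, per inode, the last transaction that must reach the journal for fsync (and, when datasync is set, fdatasync) to be complete. The consumer is not part of this hunk; a hedged sketch of how a sync path could use the two tids with the standard jbd2 API (jbd2_complete_transaction() starts the commit if needed and waits for it):

	/* Illustrative consumer: wait only for the recorded transaction
	 * instead of forcing a flush of the whole journal. */
	static int ocfs2_fsync_sketch(struct inode *inode, int datasync)
	{
		journal_t *journal = OCFS2_SB(inode->i_sb)->journal->j_journal;
		struct ocfs2_inode_info *oi = OCFS2_I(inode);
		tid_t tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;

		return jbd2_complete_transaction(journal, tid);
	}
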
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index e57c804069ea..6b6d092b0998 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
82 } 82 }
83 83
84 ret = flock_lock_file_wait(file, fl); 84 ret = flock_lock_file_wait(file, fl);
85 if (ret)
86 ocfs2_file_unlock(file);
85 87
86out: 88out:
87 mutex_unlock(&fp->fp_mutex); 89 mutex_unlock(&fp->fp_mutex);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 64c304d668f0..599eb4c4c8be 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle,
151 old_blkno, len); 151 old_blkno, len);
152 } 152 }
153 153
154 ocfs2_update_inode_fsync_trans(handle, inode, 0);
154out: 155out:
155 ocfs2_free_path(path); 156 ocfs2_free_path(path);
156 return ret; 157 return ret;
@@ -690,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
690 691
691 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, 692 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
692 goal_bit, len); 693 goal_bit, len);
693 if (ret) 694 if (ret) {
695 ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
696 le16_to_cpu(gd->bg_chain));
694 mlog_errno(ret); 697 mlog_errno(ret);
698 }
695 699
696 /* 700 /*
697 * Here we should write the new page out first if we are 701 * Here we should write the new page out first if we are
@@ -957,6 +961,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
957 inode->i_ctime = CURRENT_TIME; 961 inode->i_ctime = CURRENT_TIME;
958 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 962 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
959 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 963 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
964 ocfs2_update_inode_fsync_trans(handle, inode, 0);
960 965
961 ocfs2_journal_dirty(handle, di_bh); 966 ocfs2_journal_dirty(handle, di_bh);
962 967
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3683643f3f0e..2060fc398445 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -450,7 +450,6 @@ leave:
450 450
451 brelse(new_fe_bh); 451 brelse(new_fe_bh);
452 brelse(parent_fe_bh); 452 brelse(parent_fe_bh);
453 kfree(si.name);
454 kfree(si.value); 453 kfree(si.value);
455 454
456 ocfs2_free_dir_lookup_result(&lookup); 455 ocfs2_free_dir_lookup_result(&lookup);
@@ -495,6 +494,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
495 struct ocfs2_dinode *fe = NULL; 494 struct ocfs2_dinode *fe = NULL;
496 struct ocfs2_extent_list *fel; 495 struct ocfs2_extent_list *fel;
497 u16 feat; 496 u16 feat;
497 struct ocfs2_inode_info *oi = OCFS2_I(inode);
498 498
499 *new_fe_bh = NULL; 499 *new_fe_bh = NULL;
500 500
@@ -576,8 +576,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
576 mlog_errno(status); 576 mlog_errno(status);
577 } 577 }
578 578
579 status = 0; /* error in ocfs2_create_new_inode_locks is not 579 oi->i_sync_tid = handle->h_transaction->t_tid;
580 * critical */ 580 oi->i_datasync_tid = handle->h_transaction->t_tid;
581 581
582leave: 582leave:
583 if (status < 0) { 583 if (status < 0) {
@@ -1855,7 +1855,6 @@ bail:
1855 1855
1856 brelse(new_fe_bh); 1856 brelse(new_fe_bh);
1857 brelse(parent_fe_bh); 1857 brelse(parent_fe_bh);
1858 kfree(si.name);
1859 kfree(si.value); 1858 kfree(si.value);
1860 ocfs2_free_dir_lookup_result(&lookup); 1859 ocfs2_free_dir_lookup_result(&lookup);
1861 if (inode_ac) 1860 if (inode_ac)
@@ -2481,6 +2480,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2481 di->i_orphaned_slot = 0; 2480 di->i_orphaned_slot = 0;
2482 set_nlink(inode, 1); 2481 set_nlink(inode, 1);
2483 ocfs2_set_links_count(di, inode->i_nlink); 2482 ocfs2_set_links_count(di, inode->i_nlink);
2483 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2484 ocfs2_journal_dirty(handle, di_bh); 2484 ocfs2_journal_dirty(handle, di_bh);
2485 2485
2486 status = ocfs2_add_entry(handle, dentry, inode, 2486 status = ocfs2_add_entry(handle, dentry, inode,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 553f53cc73ae..8d64a97a9d5e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -30,6 +30,7 @@
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/wait.h> 31#include <linux/wait.h>
32#include <linux/list.h> 32#include <linux/list.h>
33#include <linux/llist.h>
33#include <linux/rbtree.h> 34#include <linux/rbtree.h>
34#include <linux/workqueue.h> 35#include <linux/workqueue.h>
35#include <linux/kref.h> 36#include <linux/kref.h>
@@ -274,19 +275,16 @@ enum ocfs2_mount_options
274 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 275 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
275}; 276};
276 277
277#define OCFS2_OSB_SOFT_RO 0x0001 278#define OCFS2_OSB_SOFT_RO 0x0001
278#define OCFS2_OSB_HARD_RO 0x0002 279#define OCFS2_OSB_HARD_RO 0x0002
279#define OCFS2_OSB_ERROR_FS 0x0004 280#define OCFS2_OSB_ERROR_FS 0x0004
280#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 281#define OCFS2_DEFAULT_ATIME_QUANTUM 60
281
282#define OCFS2_DEFAULT_ATIME_QUANTUM 60
283 282
284struct ocfs2_journal; 283struct ocfs2_journal;
285struct ocfs2_slot_info; 284struct ocfs2_slot_info;
286struct ocfs2_recovery_map; 285struct ocfs2_recovery_map;
287struct ocfs2_replay_map; 286struct ocfs2_replay_map;
288struct ocfs2_quota_recovery; 287struct ocfs2_quota_recovery;
289struct ocfs2_dentry_lock;
290struct ocfs2_super 288struct ocfs2_super
291{ 289{
292 struct task_struct *commit_task; 290 struct task_struct *commit_task;
@@ -414,10 +412,9 @@ struct ocfs2_super
414 struct list_head blocked_lock_list; 412 struct list_head blocked_lock_list;
415 unsigned long blocked_lock_count; 413 unsigned long blocked_lock_count;
416 414
417 /* List of dentry locks to release. Anyone can add locks to 415 /* List of dquot structures to drop last reference to */
418 * the list, ocfs2_wq processes the list */ 416 struct llist_head dquot_drop_list;
419 struct ocfs2_dentry_lock *dentry_lock_list; 417 struct work_struct dquot_drop_work;
420 struct work_struct dentry_lock_work;
421 418
422 wait_queue_head_t osb_mount_event; 419 wait_queue_head_t osb_mount_event;
423 420
@@ -449,6 +446,8 @@ struct ocfs2_super
449 /* rb tree root for refcount lock. */ 446 /* rb tree root for refcount lock. */
450 struct rb_root osb_rf_lock_tree; 447 struct rb_root osb_rf_lock_tree;
451 struct ocfs2_refcount_tree *osb_ref_tree_lru; 448 struct ocfs2_refcount_tree *osb_ref_tree_lru;
449
450 struct mutex system_file_mutex;
452}; 451};
453 452
454#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 453#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -579,18 +578,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
579 spin_unlock(&osb->osb_lock); 578 spin_unlock(&osb->osb_lock);
580} 579}
581 580
582
583static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
584 unsigned long flag)
585{
586 unsigned long ret;
587
588 spin_lock(&osb->osb_lock);
589 ret = osb->osb_flags & flag;
590 spin_unlock(&osb->osb_lock);
591 return ret;
592}
593
594static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, 581static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
595 int hard) 582 int hard)
596{ 583{
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index d5ab56cbe5c5..f266d67df3c6 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -28,6 +28,7 @@ struct ocfs2_dquot {
28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
29 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
30 s64 dq_originodes; /* Last globally synced inode usage */ 30 s64 dq_originodes; /* Last globally synced inode usage */
31 struct llist_node list; /* Member of list of dquots to drop */
31}; 32};
32 33
33/* Description of one chunk to recover in memory */ 34/* Description of one chunk to recover in memory */
@@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
110int ocfs2_create_local_dquot(struct dquot *dquot); 111int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); 112int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot); 113int ocfs2_local_write_dquot(struct dquot *dquot);
114void ocfs2_drop_dquot_refs(struct work_struct *work);
113 115
114extern const struct dquot_operations ocfs2_quota_operations; 116extern const struct dquot_operations ocfs2_quota_operations;
115extern struct quota_format_type ocfs2_quota_format; 117extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index d7b5108789e2..b990a62cff50 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -10,6 +10,7 @@
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/workqueue.h> 12#include <linux/workqueue.h>
13#include <linux/llist.h>
13 14
14#include <cluster/masklog.h> 15#include <cluster/masklog.h>
15 16
@@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
679 OCFS2_INODE_UPDATE_CREDITS; 680 OCFS2_INODE_UPDATE_CREDITS;
680} 681}
681 682
683void ocfs2_drop_dquot_refs(struct work_struct *work)
684{
685 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
686 dquot_drop_work);
687 struct llist_node *list;
688 struct ocfs2_dquot *odquot, *next_odquot;
689
690 list = llist_del_all(&osb->dquot_drop_list);
691 llist_for_each_entry_safe(odquot, next_odquot, list, list) {
692 /* Drop the reference we acquired in ocfs2_dquot_release() */
693 dqput(&odquot->dq_dquot);
694 }
695}
696
697/*
 698 * Called when the last reference to a dquot is dropped. If we are called
 699 * from the downconvert thread, we cannot do all the handling here because
 700 * grabbing the quota lock could deadlock (the node holding the quota lock
 701 * could need some other cluster lock to proceed, but with the downconvert
 702 * thread blocked we cannot release any lock).
703 */
682static int ocfs2_release_dquot(struct dquot *dquot) 704static int ocfs2_release_dquot(struct dquot *dquot)
683{ 705{
684 handle_t *handle; 706 handle_t *handle;
@@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot)
694 /* Check whether we are not racing with some other dqget() */ 716 /* Check whether we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1) 717 if (atomic_read(&dquot->dq_count) > 1)
696 goto out; 718 goto out;
719 /* Running from downconvert thread? Postpone quota processing to wq */
720 if (current == osb->dc_task) {
721 /*
722 * Grab our own reference to dquot and queue it for delayed
723 * dropping. Quota code rechecks after calling
724 * ->release_dquot() and won't free dquot structure.
725 */
726 dqgrab(dquot);
727 /* First entry on list -> queue work */
728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
729 queue_work(ocfs2_wq, &osb->dquot_drop_work);
730 goto out;
731 }
697 status = ocfs2_lock_global_qf(oinfo, 1); 732 status = ocfs2_lock_global_qf(oinfo, 1);
698 if (status < 0) 733 if (status < 0)
699 goto out; 734 goto out;
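
llist_add() returns true only when the node lands on a previously empty list, so the work item is queued exactly once per batch, and ocfs2_drop_dquot_refs() then grabs the whole batch atomically with llist_del_all(). The same lock-free deferral pattern in isolation (generic names, illustrative):

	struct defer_ctx {
		struct llist_head list;
		struct work_struct work;
	};

	static void defer_release(struct defer_ctx *ctx, struct llist_node *n)
	{
		/* First entry on an empty list -> queue the worker once. */
		if (llist_add(n, &ctx->list))
			schedule_work(&ctx->work);
	}

	static void defer_worker(struct work_struct *work)
	{
		struct defer_ctx *ctx = container_of(work, struct defer_ctx, work);
		struct llist_node *batch = llist_del_all(&ctx->list);

		/* Walk batch with llist_for_each_entry_safe() and release
		 * each entry; new arrivals re-queue the work themselves. */
	}
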
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index ca5ce14cbddc..83f1a665ae97 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -496,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
496} 496}
497 497
498static struct kobj_attribute ocfs2_attr_max_locking_protocol = 498static struct kobj_attribute ocfs2_attr_max_locking_protocol =
499 __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, 499 __ATTR(max_locking_protocol, S_IRUGO,
500 ocfs2_max_locking_protocol_show, NULL); 500 ocfs2_max_locking_protocol_show, NULL);
501 501
502static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, 502static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
@@ -528,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
528} 528}
529 529
530static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = 530static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
531 __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, 531 __ATTR(loaded_cluster_plugins, S_IRUGO,
532 ocfs2_loaded_cluster_plugins_show, NULL); 532 ocfs2_loaded_cluster_plugins_show, NULL);
533 533
534static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, 534static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
@@ -550,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
550} 550}
551 551
552static struct kobj_attribute ocfs2_attr_active_cluster_plugin = 552static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
553 __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, 553 __ATTR(active_cluster_plugin, S_IRUGO,
554 ocfs2_active_cluster_plugin_show, NULL); 554 ocfs2_active_cluster_plugin_show, NULL);
555 555
556static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, 556static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
@@ -599,15 +599,29 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
599 599
600 600
601static struct kobj_attribute ocfs2_attr_cluster_stack = 601static struct kobj_attribute ocfs2_attr_cluster_stack =
602 __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, 602 __ATTR(cluster_stack, S_IRUGO | S_IWUSR,
603 ocfs2_cluster_stack_show, 603 ocfs2_cluster_stack_show,
604 ocfs2_cluster_stack_store); 604 ocfs2_cluster_stack_store);
605 605
606
607
608static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
609 struct kobj_attribute *attr,
610 char *buf)
611{
612 return snprintf(buf, PAGE_SIZE, "1\n");
613}
614
615static struct kobj_attribute ocfs2_attr_dlm_recover_support =
616 __ATTR(dlm_recover_callback_support, S_IRUGO,
617 ocfs2_dlm_recover_show, NULL);
618
606static struct attribute *ocfs2_attrs[] = { 619static struct attribute *ocfs2_attrs[] = {
607 &ocfs2_attr_max_locking_protocol.attr, 620 &ocfs2_attr_max_locking_protocol.attr,
608 &ocfs2_attr_loaded_cluster_plugins.attr, 621 &ocfs2_attr_loaded_cluster_plugins.attr,
609 &ocfs2_attr_active_cluster_plugin.attr, 622 &ocfs2_attr_active_cluster_plugin.attr,
610 &ocfs2_attr_cluster_stack.attr, 623 &ocfs2_attr_cluster_stack.attr,
624 &ocfs2_attr_dlm_recover_support.attr,
611 NULL, 625 NULL,
612}; 626};
613 627
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 47ae2663a6f5..0cb889a17ae1 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -771,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
771 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 771 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
772 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 772 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
773 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 773 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
774 ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
774 775
775 status = 0; 776 status = 0;
776 777
@@ -1607,6 +1608,21 @@ out:
1607 return ret; 1608 return ret;
1608} 1609}
1609 1610
1611void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1612 struct buffer_head *di_bh,
1613 u32 num_bits,
1614 u16 chain)
1615{
1616 u32 tmp_used;
1617 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1618 struct ocfs2_chain_list *cl;
1619
1620 cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1621 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1622 di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1623 le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1624}
1625
1610static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, 1626static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1611 struct ocfs2_extent_rec *rec, 1627 struct ocfs2_extent_rec *rec,
1612 struct ocfs2_chain_list *cl) 1628 struct ocfs2_chain_list *cl)
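
ocfs2_rollback_alloc_dinode_counts() is the exact inverse of ocfs2_alloc_dinode_update_counts(): it gives num_bits back to the chain record's c_free and subtracts them from bitmap1.i_used. The allocation sites above now follow an update-then-undo-on-failure shape (condensed sketch of the pattern the hunks apply):

	status = ocfs2_alloc_dinode_update_counts(alloc_inode, ac->ac_bh,
						  res->sr_bits, chain);
	if (status)
		goto bail;

	status = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
					    res->sr_bit_offset, res->sr_bits);
	if (status < 0) {
		/* Keep the dinode counters consistent with the bitmap. */
		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
						   res->sr_bits, chain);
		mlog_errno(status);
		goto bail;
	}
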
@@ -1707,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1707 1723
1708 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1724 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1709 res->sr_bit_offset, res->sr_bits); 1725 res->sr_bit_offset, res->sr_bits);
1710 if (ret < 0) 1726 if (ret < 0) {
1727 ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1728 res->sr_bits,
1729 le16_to_cpu(gd->bg_chain));
1711 mlog_errno(ret); 1730 mlog_errno(ret);
1731 }
1712 1732
1713out_loc_only: 1733out_loc_only:
1714 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1734 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
@@ -1838,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1838 res->sr_bit_offset, 1858 res->sr_bit_offset,
1839 res->sr_bits); 1859 res->sr_bits);
1840 if (status < 0) { 1860 if (status < 0) {
1861 ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1862 ac->ac_bh, res->sr_bits, chain);
1841 mlog_errno(status); 1863 mlog_errno(status);
1842 goto bail; 1864 goto bail;
1843 } 1865 }
@@ -2091,7 +2113,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir,
2091 2113
2092 ac->ac_find_loc_priv = res; 2114 ac->ac_find_loc_priv = res;
2093 *fe_blkno = res->sr_blkno; 2115 *fe_blkno = res->sr_blkno;
2094 2116 ocfs2_update_inode_fsync_trans(handle, dir, 0);
2095out: 2117out:
2096 if (handle) 2118 if (handle)
2097 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); 2119 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
@@ -2149,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2149 res->sr_bit_offset, 2171 res->sr_bit_offset,
2150 res->sr_bits); 2172 res->sr_bits);
2151 if (ret < 0) { 2173 if (ret < 0) {
2174 ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2175 ac->ac_bh, res->sr_bits, chain);
2152 mlog_errno(ret); 2176 mlog_errno(ret);
2153 goto out; 2177 goto out;
2154 } 2178 }
@@ -2870,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2870 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); 2894 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2871 if (status < 0) { 2895 if (status < 0) {
2872 mutex_unlock(&inode_alloc_inode->i_mutex); 2896 mutex_unlock(&inode_alloc_inode->i_mutex);
2897 iput(inode_alloc_inode);
2873 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", 2898 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2874 (u32)suballoc_slot, status); 2899 (u32)suballoc_slot, status);
2875 goto bail; 2900 goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 218d8036b3e7..2d2501767c0c 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -91,6 +91,10 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode,
91 struct buffer_head *di_bh, 91 struct buffer_head *di_bh,
92 u32 num_bits, 92 u32 num_bits,
93 u16 chain); 93 u16 chain);
94void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
95 struct buffer_head *di_bh,
96 u32 num_bits,
97 u16 chain);
94int ocfs2_block_group_set_bits(handle_t *handle, 98int ocfs2_block_group_set_bits(handle_t *handle,
95 struct inode *alloc_inode, 99 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg, 100 struct ocfs2_group_desc *bg,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 49d84f80f36c..a7cdd56f4c79 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
561 if (!oi) 561 if (!oi)
562 return NULL; 562 return NULL;
563 563
564 oi->i_sync_tid = 0;
565 oi->i_datasync_tid = 0;
566
564 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); 567 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
565 return &oi->vfs_inode; 568 return &oi->vfs_inode;
566} 569}
@@ -631,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
631 struct ocfs2_super *osb = OCFS2_SB(sb); 634 struct ocfs2_super *osb = OCFS2_SB(sb);
632 u32 tmp; 635 u32 tmp;
633 636
637 sync_filesystem(sb);
638
634 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
635 !ocfs2_check_set_options(sb, &parsed_options)) { 640 !ocfs2_check_set_options(sb, &parsed_options)) {
636 ret = -EINVAL; 641 ret = -EINVAL;
@@ -1238,30 +1243,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1238 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); 1243 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1239} 1244}
1240 1245
1241static void ocfs2_kill_sb(struct super_block *sb)
1242{
1243 struct ocfs2_super *osb = OCFS2_SB(sb);
1244
1245 /* Failed mount? */
1246 if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
1247 goto out;
1248
1249 /* Prevent further queueing of inode drop events */
1250 spin_lock(&dentry_list_lock);
1251 ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
1252 spin_unlock(&dentry_list_lock);
1253 /* Wait for work to finish and/or remove it */
1254 cancel_work_sync(&osb->dentry_lock_work);
1255out:
1256 kill_block_super(sb);
1257}
1258
1259static struct file_system_type ocfs2_fs_type = { 1246static struct file_system_type ocfs2_fs_type = {
1260 .owner = THIS_MODULE, 1247 .owner = THIS_MODULE,
1261 .name = "ocfs2", 1248 .name = "ocfs2",
1262 .mount = ocfs2_mount, 1249 .mount = ocfs2_mount,
1263 .kill_sb = ocfs2_kill_sb, 1250 .kill_sb = kill_block_super,
1264
1265 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1251 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
1266 .next = NULL 1252 .next = NULL
1267}; 1253};
@@ -1612,14 +1598,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1612 return 0; 1598 return 0;
1613} 1599}
1614 1600
1615wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
1616
1617static int __init ocfs2_init(void) 1601static int __init ocfs2_init(void)
1618{ 1602{
1619 int status, i; 1603 int status;
1620
1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1622 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1623 1604
1624 status = init_ocfs2_uptodate_cache(); 1605 status = init_ocfs2_uptodate_cache();
1625 if (status < 0) 1606 if (status < 0)
@@ -1761,7 +1742,7 @@ static void ocfs2_inode_init_once(void *data)
1761 ocfs2_extent_map_init(&oi->vfs_inode); 1742 ocfs2_extent_map_init(&oi->vfs_inode);
1762 INIT_LIST_HEAD(&oi->ip_io_markers); 1743 INIT_LIST_HEAD(&oi->ip_io_markers);
1763 oi->ip_dir_start_lookup = 0; 1744 oi->ip_dir_start_lookup = 0;
1764 atomic_set(&oi->ip_unaligned_aio, 0); 1745 mutex_init(&oi->ip_unaligned_aio);
1765 init_rwsem(&oi->ip_alloc_sem); 1746 init_rwsem(&oi->ip_alloc_sem);
1766 init_rwsem(&oi->ip_xattr_sem); 1747 init_rwsem(&oi->ip_xattr_sem);
1767 mutex_init(&oi->ip_io_mutex); 1748 mutex_init(&oi->ip_io_mutex);
@@ -1932,17 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1932 1913
1933 debugfs_remove(osb->osb_ctxt); 1914 debugfs_remove(osb->osb_ctxt);
1934 1915
1935 /*
1936 * Flush inode dropping work queue so that deletes are
1937 * performed while the filesystem is still working
1938 */
1939 ocfs2_drop_all_dl_inodes(osb);
1940
1941 /* Orphan scan should be stopped as early as possible */ 1916 /* Orphan scan should be stopped as early as possible */
1942 ocfs2_orphan_scan_stop(osb); 1917 ocfs2_orphan_scan_stop(osb);
1943 1918
1944 ocfs2_disable_quotas(osb); 1919 ocfs2_disable_quotas(osb);
1945 1920
1921 /* All dquots should be freed by now */
1922 WARN_ON(!llist_empty(&osb->dquot_drop_list));
1923 /* Wait for worker to be done with the work structure in osb */
1924 cancel_work_sync(&osb->dquot_drop_work);
1925
1946 ocfs2_shutdown_local_alloc(osb); 1926 ocfs2_shutdown_local_alloc(osb);
1947 1927
1948 /* This will disable recovery and flush any recovery work. */ 1928 /* This will disable recovery and flush any recovery work. */
@@ -2077,7 +2057,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2077 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 2057 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2078 struct inode *inode = NULL; 2058 struct inode *inode = NULL;
2079 struct ocfs2_journal *journal; 2059 struct ocfs2_journal *journal;
2080 __le32 uuid_net_key;
2081 struct ocfs2_super *osb; 2060 struct ocfs2_super *osb;
2082 u64 total_blocks; 2061 u64 total_blocks;
2083 2062
@@ -2123,6 +2102,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2123 spin_lock_init(&osb->osb_xattr_lock); 2102 spin_lock_init(&osb->osb_xattr_lock);
2124 ocfs2_init_steal_slots(osb); 2103 ocfs2_init_steal_slots(osb);
2125 2104
2105 mutex_init(&osb->system_file_mutex);
2106
2126 atomic_set(&osb->alloc_stats.moves, 0); 2107 atomic_set(&osb->alloc_stats.moves, 0);
2127 atomic_set(&osb->alloc_stats.local_data, 0); 2108 atomic_set(&osb->alloc_stats.local_data, 0);
2128 atomic_set(&osb->alloc_stats.bitmap_data, 0); 2109 atomic_set(&osb->alloc_stats.bitmap_data, 0);
@@ -2276,8 +2257,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2276 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 2257 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
2277 journal->j_state = OCFS2_JOURNAL_FREE; 2258 journal->j_state = OCFS2_JOURNAL_FREE;
2278 2259
2279 INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); 2260 INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
2280 osb->dentry_lock_list = NULL; 2261 init_llist_head(&osb->dquot_drop_list);
2281 2262
2282 /* get some pseudo constants for clustersize bits */ 2263 /* get some pseudo constants for clustersize bits */
2283 osb->s_clustersize_bits = 2264 osb->s_clustersize_bits =
@@ -2311,8 +2292,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2311 goto bail; 2292 goto bail;
2312 } 2293 }
2313 2294
2314 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
2315
2316 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 2295 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
2317 osb->vol_label[63] = '\0'; 2296 osb->vol_label[63] = '\0';
2318 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); 2297 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index f053688d22a3..af155c183123 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
113 } else 113 } else
114 arr = get_local_system_inode(osb, type, slot); 114 arr = get_local_system_inode(osb, type, slot);
115 115
116 mutex_lock(&osb->system_file_mutex);
116 if (arr && ((inode = *arr) != NULL)) { 117 if (arr && ((inode = *arr) != NULL)) {
117 /* get a ref in addition to the array ref */ 118 /* get a ref in addition to the array ref */
118 inode = igrab(inode); 119 inode = igrab(inode);
120 mutex_unlock(&osb->system_file_mutex);
119 BUG_ON(!inode); 121 BUG_ON(!inode);
120 122
121 return inode; 123 return inode;
@@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
129 *arr = igrab(inode); 131 *arr = igrab(inode);
130 BUG_ON(!*arr); 132 BUG_ON(!*arr);
131 } 133 }
134 mutex_unlock(&osb->system_file_mutex);
132 return inode; 135 return inode;
133} 136}
134 137
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 185fa3b7f962..016f01df3825 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
369 * them fully. 369 * them fully.
370 */ 370 */
371static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, 371static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
372 u64 xb_blkno) 372 u64 xb_blkno, int new)
373{ 373{
374 int i, rc = 0; 374 int i, rc = 0;
375 375
@@ -383,9 +383,16 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
383 } 383 }
384 384
385 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), 385 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
386 bucket->bu_bhs[i])) 386 bucket->bu_bhs[i])) {
387 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), 387 if (new)
388 bucket->bu_bhs[i]); 388 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
389 bucket->bu_bhs[i]);
390 else {
391 set_buffer_uptodate(bucket->bu_bhs[i]);
392 ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
393 bucket->bu_bhs[i]);
394 }
395 }
389 } 396 }
390 397
391 if (rc) 398 if (rc)
@@ -2602,6 +2609,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2602 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); 2609 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
2603 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2610 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2604 spin_unlock(&oi->ip_lock); 2611 spin_unlock(&oi->ip_lock);
2612 ocfs2_update_inode_fsync_trans(handle, inode, 0);
2605 2613
2606 ocfs2_journal_dirty(handle, di_bh); 2614 ocfs2_journal_dirty(handle, di_bh);
2607out_commit: 2615out_commit:
@@ -3200,8 +3208,15 @@ meta_guess:
3200 clusters_add += 1; 3208 clusters_add += 1;
3201 } 3209 }
3202 } else { 3210 } else {
3203 meta_add += 1;
3204 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 3211 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
3212 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
3213 struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
3214 meta_add += ocfs2_extend_meta_needed(el);
3215 credits += ocfs2_calc_extend_credits(inode->i_sb,
3216 el);
3217 } else {
3218 meta_add += 1;
3219 }
3205 } 3220 }
3206out: 3221out:
3207 if (clusters_need) 3222 if (clusters_need)
@@ -3614,6 +3629,7 @@ int ocfs2_xattr_set(struct inode *inode,
3614 } 3629 }
3615 3630
3616 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); 3631 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
3632 ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
3617 3633
3618 ocfs2_commit_trans(osb, ctxt.handle); 3634 ocfs2_commit_trans(osb, ctxt.handle);
3619 3635
@@ -4294,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4294 4310
4295 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); 4311 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
4296 4312
4297 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); 4313 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
4298 if (ret) { 4314 if (ret) {
4299 mlog_errno(ret); 4315 mlog_errno(ret);
4300 goto out; 4316 goto out;
@@ -4638,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4638 * Even if !new_bucket_head, we're overwriting t_bucket. Thus, 4654 * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
4639 * there's no need to read it. 4655 * there's no need to read it.
4640 */ 4656 */
4641 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); 4657 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
4642 if (ret) { 4658 if (ret) {
4643 mlog_errno(ret); 4659 mlog_errno(ret);
4644 goto out; 4660 goto out;
@@ -4804,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
4804 * Even if !t_is_new, we're overwriting t_bucket. Thus, 4820 * Even if !t_is_new, we're overwriting t_bucket. Thus,
4805 * there's no need to read it. 4821 * there's no need to read it.
4806 */ 4822 */
4807 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); 4823 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
4808 if (ret) 4824 if (ret)
4809 goto out; 4825 goto out;
4810 4826
@@ -5476,6 +5492,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5476 ret = ocfs2_truncate_log_append(osb, handle, blkno, len); 5492 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
5477 if (ret) 5493 if (ret)
5478 mlog_errno(ret); 5494 mlog_errno(ret);
5495 ocfs2_update_inode_fsync_trans(handle, inode, 0);
5479 5496
5480out_commit: 5497out_commit:
5481 ocfs2_commit_trans(osb, handle); 5498 ocfs2_commit_trans(osb, handle);
@@ -6830,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle,
6830 break; 6847 break;
6831 } 6848 }
6832 6849
6833 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); 6850 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
6834 if (ret) { 6851 if (ret) {
6835 mlog_errno(ret); 6852 mlog_errno(ret);
6836 break; 6853 break;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index d8b0afde2179..ec58c7659183 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -183,7 +183,7 @@ int omfs_sync_inode(struct inode *inode)
183 */ 183 */
184static void omfs_evict_inode(struct inode *inode) 184static void omfs_evict_inode(struct inode *inode)
185{ 185{
186 truncate_inode_pages(&inode->i_data, 0); 186 truncate_inode_pages_final(&inode->i_data);
187 clear_inode(inode); 187 clear_inode(inode);
188 188
189 if (inode->i_nlink) 189 if (inode->i_nlink)
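
This is one of several conversions in this merge (ocfs2, omfs, proc) from truncate_inode_pages() to truncate_inode_pages_final() in eviction paths. Besides truncating, the _final variant marks the mapping as dying so lockless page-cache lookups cannot race with the inode being freed, and it must be called even when the mapping looks empty. The canonical shape for a simple filesystem (sketch):

	static void example_evict_inode(struct inode *inode)
	{
		/* Always call this, even for an apparently empty mapping. */
		truncate_inode_pages_final(&inode->i_data);
		clear_inode(inode);

		/* ... release on-disk resources when i_nlink == 0 ... */
	}
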
diff --git a/fs/open.c b/fs/open.c
index b9ed8b25c108..631aea815def 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -231,7 +231,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
231 return -EINVAL; 231 return -EINVAL;
232 232
233 /* Return error if mode is not supported */ 233 /* Return error if mode is not supported */
234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
235 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
236 return -EOPNOTSUPP;
237
238 /* Punch hole and zero range are mutually exclusive */
239 if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
240 (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
235 return -EOPNOTSUPP; 241 return -EOPNOTSUPP;
236 242
237 /* Punch hole must have keep size set */ 243 /* Punch hole must have keep size set */
@@ -239,11 +245,20 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
239 !(mode & FALLOC_FL_KEEP_SIZE)) 245 !(mode & FALLOC_FL_KEEP_SIZE))
240 return -EOPNOTSUPP; 246 return -EOPNOTSUPP;
241 247
248 /* Collapse range should only be used exclusively. */
249 if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
250 (mode & ~FALLOC_FL_COLLAPSE_RANGE))
251 return -EINVAL;
252
242 if (!(file->f_mode & FMODE_WRITE)) 253 if (!(file->f_mode & FMODE_WRITE))
243 return -EBADF; 254 return -EBADF;
244 255
245 /* It's not possible punch hole on append only file */ 256 /*
246 if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) 257 * It's not possible to punch hole or perform collapse range
 258 * on an append-only file
259 */
260 if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)
261 && IS_APPEND(inode))
247 return -EPERM; 262 return -EPERM;
248 263
249 if (IS_IMMUTABLE(inode)) 264 if (IS_IMMUTABLE(inode))
@@ -271,6 +286,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
271 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 286 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
272 return -EFBIG; 287 return -EFBIG;
273 288
289 /*
 290 * A collapse range that reaches or extends past EOF is effectively a
 291 * truncate operation, so reject it here
292 */
293 if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
294 (offset + len >= i_size_read(inode)))
295 return -EINVAL;
296
274 if (!file->f_op->fallocate) 297 if (!file->f_op->fallocate)
275 return -EOPNOTSUPP; 298 return -EOPNOTSUPP;
276 299
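
From userspace the new flag goes straight into fallocate(2); per the checks above it must be used alone, fails on append-only files, and the collapsed range must end strictly before EOF. A hedged usage example (filesystems additionally require offset and len to be block-aligned, which this snippet assumes):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>

	/* Remove bytes [offset, offset + len) and shift the tail down,
	 * shrinking the file; EINVAL if the range touches EOF or is
	 * misaligned for the filesystem. */
	static int collapse_range(int fd, off_t offset, off_t len)
	{
		return fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, offset, len);
	}
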
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 8c0ceb8dd1f7..15e4500cda3e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
368 368
369static int openprom_remount(struct super_block *sb, int *flags, char *data) 369static int openprom_remount(struct super_block *sb, int *flags, char *data)
370{ 370{
371 sync_filesystem(sb);
371 *flags |= MS_NOATIME; 372 *flags |= MS_NOATIME;
372 return 0; 373 return 0;
373} 374}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 11c54fd51e16..9e363e41dacc 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -723,7 +723,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
723 void *buffer, size_t size) 723 void *buffer, size_t size)
724{ 724{
725 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; 725 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
726 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; 726 posix_acl_xattr_entry *ext_entry;
727 int real_size, n; 727 int real_size, n;
728 728
729 real_size = posix_acl_xattr_size(acl->a_count); 729 real_size = posix_acl_xattr_size(acl->a_count);
@@ -731,7 +731,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
731 return real_size; 731 return real_size;
732 if (real_size > size) 732 if (real_size > size)
733 return -ERANGE; 733 return -ERANGE;
734 734
735 ext_entry = ext_acl->a_entries;
735 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); 736 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
736 737
737 for (n=0; n < acl->a_count; n++, ext_entry++) { 738 for (n=0; n < acl->a_count; n++, ext_entry++) {
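
The fix matters because posix_acl_to_xattr() accepts a NULL buffer as a pure size query; the old code dereferenced a_entries before the size checks and so touched the NULL pointer. The intended two-call pattern (kernel-internal sketch):

	int size = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
	void *buf;

	if (size < 0)
		return size;
	buf = kmalloc(size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	/* Second call actually fills the buffer. */
	size = posix_acl_to_xattr(&init_user_ns, acl, buf, size);
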
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ab30716584f5..239493ec718e 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -27,6 +27,5 @@ proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
27proc-$(CONFIG_NET) += proc_net.o 27proc-$(CONFIG_NET) += proc_net.o
28proc-$(CONFIG_PROC_KCORE) += kcore.o 28proc-$(CONFIG_PROC_KCORE) += kcore.o
29proc-$(CONFIG_PROC_VMCORE) += vmcore.o 29proc-$(CONFIG_PROC_VMCORE) += vmcore.o
30proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
31proc-$(CONFIG_PRINTK) += kmsg.o 30proc-$(CONFIG_PRINTK) += kmsg.o
32proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o 31proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 656e401794de..64db2bceac59 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -138,8 +138,8 @@ static const char * const task_state_array[] = {
138 "D (disk sleep)", /* 2 */ 138 "D (disk sleep)", /* 2 */
139 "T (stopped)", /* 4 */ 139 "T (stopped)", /* 4 */
140 "t (tracing stop)", /* 8 */ 140 "t (tracing stop)", /* 8 */
141 "Z (zombie)", /* 16 */ 141 "X (dead)", /* 16 */
142 "X (dead)", /* 32 */ 142 "Z (zombie)", /* 32 */
143}; 143};
144 144
145static inline const char *get_task_state(struct task_struct *tsk) 145static inline const char *get_task_state(struct task_struct *tsk)
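
task_state_array is indexed by the bit position of the task's state value, so the strings must follow the numeric order of the state flags; with the exit-state constants now ordered EXIT_DEAD = 16 before EXIT_ZOMBIE = 32, the old table printed the two swapped. A sketch of the index computation (the in-tree get_task_state() differs in detail):

	/* Map a one-bit state value to its table entry: 0 -> "R",
	 * 1 -> "S", 2 -> "D", 4 -> "T", 8 -> "t", 16 -> "X", 32 -> "Z". */
	static const char *state_name(unsigned int state)
	{
		int idx = state ? __builtin_ctz(state) + 1 : 0;

		return task_state_array[idx];
	}
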
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b9760628e1fd..6b7087e2e8fb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1236,6 +1236,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
1236 make_it_fail = simple_strtol(strstrip(buffer), &end, 0); 1236 make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
1237 if (*end) 1237 if (*end)
1238 return -EINVAL; 1238 return -EINVAL;
1239 if (make_it_fail < 0 || make_it_fail > 1)
1240 return -EINVAL;
1241
1239 task = get_proc_task(file_inode(file)); 1242 task = get_proc_task(file_inode(file));
1240 if (!task) 1243 if (!task)
1241 return -ESRCH; 1244 return -ESRCH;
@@ -2588,7 +2591,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2588 REG("environ", S_IRUSR, proc_environ_operations), 2591 REG("environ", S_IRUSR, proc_environ_operations),
2589 INF("auxv", S_IRUSR, proc_pid_auxv), 2592 INF("auxv", S_IRUSR, proc_pid_auxv),
2590 ONE("status", S_IRUGO, proc_pid_status), 2593 ONE("status", S_IRUGO, proc_pid_status),
2591 ONE("personality", S_IRUGO, proc_pid_personality), 2594 ONE("personality", S_IRUSR, proc_pid_personality),
2592 INF("limits", S_IRUGO, proc_pid_limits), 2595 INF("limits", S_IRUGO, proc_pid_limits),
2593#ifdef CONFIG_SCHED_DEBUG 2596#ifdef CONFIG_SCHED_DEBUG
2594 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2597 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2598,7 +2601,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2598#endif 2601#endif
2599 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2602 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2600#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2603#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2601 INF("syscall", S_IRUGO, proc_pid_syscall), 2604 INF("syscall", S_IRUSR, proc_pid_syscall),
2602#endif 2605#endif
2603 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2606 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2604 ONE("stat", S_IRUGO, proc_tgid_stat), 2607 ONE("stat", S_IRUGO, proc_tgid_stat),
@@ -2617,7 +2620,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2617#ifdef CONFIG_PROC_PAGE_MONITOR 2620#ifdef CONFIG_PROC_PAGE_MONITOR
2618 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2621 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2619 REG("smaps", S_IRUGO, proc_pid_smaps_operations), 2622 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
2620 REG("pagemap", S_IRUGO, proc_pagemap_operations), 2623 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2621#endif 2624#endif
2622#ifdef CONFIG_SECURITY 2625#ifdef CONFIG_SECURITY
2623 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2626 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2626,7 +2629,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2626 INF("wchan", S_IRUGO, proc_pid_wchan), 2629 INF("wchan", S_IRUGO, proc_pid_wchan),
2627#endif 2630#endif
2628#ifdef CONFIG_STACKTRACE 2631#ifdef CONFIG_STACKTRACE
2629 ONE("stack", S_IRUGO, proc_pid_stack), 2632 ONE("stack", S_IRUSR, proc_pid_stack),
2630#endif 2633#endif
2631#ifdef CONFIG_SCHEDSTATS 2634#ifdef CONFIG_SCHEDSTATS
2632 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2635 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -2927,14 +2930,14 @@ static const struct pid_entry tid_base_stuff[] = {
2927 REG("environ", S_IRUSR, proc_environ_operations), 2930 REG("environ", S_IRUSR, proc_environ_operations),
2928 INF("auxv", S_IRUSR, proc_pid_auxv), 2931 INF("auxv", S_IRUSR, proc_pid_auxv),
2929 ONE("status", S_IRUGO, proc_pid_status), 2932 ONE("status", S_IRUGO, proc_pid_status),
2930 ONE("personality", S_IRUGO, proc_pid_personality), 2933 ONE("personality", S_IRUSR, proc_pid_personality),
2931 INF("limits", S_IRUGO, proc_pid_limits), 2934 INF("limits", S_IRUGO, proc_pid_limits),
2932#ifdef CONFIG_SCHED_DEBUG 2935#ifdef CONFIG_SCHED_DEBUG
2933 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2936 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2934#endif 2937#endif
2935 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2938 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2936#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2939#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2937 INF("syscall", S_IRUGO, proc_pid_syscall), 2940 INF("syscall", S_IRUSR, proc_pid_syscall),
2938#endif 2941#endif
2939 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2942 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2940 ONE("stat", S_IRUGO, proc_tid_stat), 2943 ONE("stat", S_IRUGO, proc_tid_stat),
@@ -2955,7 +2958,7 @@ static const struct pid_entry tid_base_stuff[] = {
2955#ifdef CONFIG_PROC_PAGE_MONITOR 2958#ifdef CONFIG_PROC_PAGE_MONITOR
2956 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2959 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2957 REG("smaps", S_IRUGO, proc_tid_smaps_operations), 2960 REG("smaps", S_IRUGO, proc_tid_smaps_operations),
2958 REG("pagemap", S_IRUGO, proc_pagemap_operations), 2961 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2959#endif 2962#endif
2960#ifdef CONFIG_SECURITY 2963#ifdef CONFIG_SECURITY
2961 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2964 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2964,7 +2967,7 @@ static const struct pid_entry tid_base_stuff[] = {
2964 INF("wchan", S_IRUGO, proc_pid_wchan), 2967 INF("wchan", S_IRUGO, proc_pid_wchan),
2965#endif 2968#endif
2966#ifdef CONFIG_STACKTRACE 2969#ifdef CONFIG_STACKTRACE
2967 ONE("stack", S_IRUGO, proc_pid_stack), 2970 ONE("stack", S_IRUSR, proc_pid_stack),
2968#endif 2971#endif
2969#ifdef CONFIG_SCHEDSTATS 2972#ifdef CONFIG_SCHEDSTATS
2970 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2973 INF("schedstat", S_IRUGO, proc_pid_schedstat),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 985ea881b5bc..0788d093f5d8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/proc_fs.h> 12#include <linux/proc_fs.h>
13 13
14#include "../mount.h"
14#include "internal.h" 15#include "internal.h"
15#include "fd.h" 16#include "fd.h"
16 17
@@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v)
48 } 49 }
49 50
50 if (!ret) { 51 if (!ret) {
51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", 52 seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
52 (long long)file->f_pos, f_flags); 53 (long long)file->f_pos, f_flags,
54 real_mount(file->f_path.mnt)->mnt_id);
53 if (file->f_op->show_fdinfo) 55 if (file->f_op->show_fdinfo)
54 ret = file->f_op->show_fdinfo(m, file); 56 ret = file->f_op->show_fdinfo(m, file);
55 fput(file); 57 fput(file);
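
With the hunk above, every /proc/&lt;pid&gt;/fdinfo/&lt;fd&gt; entry grows an mnt_id: field identifying the mount the file lives on (the same id that keys /proc/&lt;pid&gt;/mountinfo). A minimal reader, assuming a patched kernel:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		char path[64], line[128];
		int fd = open("/proc/self/status", O_RDONLY);
		FILE *info;

		if (fd < 0)
			return 1;
		snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
		info = fopen(path, "r");
		if (!info)
			return 1;
		while (fgets(line, sizeof(line), info))
			fputs(line, stdout);	/* pos:, flags:, and now mnt_id: */
		fclose(info);
		close(fd);
		return 0;
	}
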
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 124fc43c7090..0adbc02d60e3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,7 +35,7 @@ static void proc_evict_inode(struct inode *inode)
35 const struct proc_ns_operations *ns_ops; 35 const struct proc_ns_operations *ns_ops;
36 void *ns; 36 void *ns;
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages_final(&inode->i_data);
39 clear_inode(inode); 39 clear_inode(inode);
40 40
41 /* Stop tracking associated processes */ 41 /* Stop tracking associated processes */
@@ -47,7 +47,7 @@ static void proc_evict_inode(struct inode *inode)
47 pde_put(de); 47 pde_put(de);
48 head = PROC_I(inode)->sysctl; 48 head = PROC_I(inode)->sysctl;
49 if (head) { 49 if (head) {
50 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); 50 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
51 sysctl_head_put(head); 51 sysctl_head_put(head);
52 } 52 }
53 /* Release any associated namespace */ 53 /* Release any associated namespace */
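
The rcu_assign_pointer() -> RCU_INIT_POINTER() switch above is the standard rule for publishing NULL: the write barrier in rcu_assign_pointer() only matters when readers could dereference freshly initialized data through the new pointer, and NULL points at nothing. A minimal kernel-style sketch of the rule (struct foo and gp are illustrative, not from this patch):

	struct foo { int data; };
	struct foo __rcu *gp;

	void publish(struct foo *p)
	{
		p->data = 42;
		rcu_assign_pointer(gp, p);	/* barrier: readers must see ->data */
	}

	void retract(void)
	{
		/* NULL carries no pointed-to data, so no ordering is needed:
		 * RCU_INIT_POINTER() skips the memory barrier. */
		RCU_INIT_POINTER(gp, NULL);
	}
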
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 651d09a11dde..3ab6d14e71c5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -211,13 +211,6 @@ extern int proc_fill_super(struct super_block *);
211extern void proc_entry_rundown(struct proc_dir_entry *); 211extern void proc_entry_rundown(struct proc_dir_entry *);
212 212
213/* 213/*
214 * proc_devtree.c
215 */
216#ifdef CONFIG_PROC_DEVICETREE
217extern void proc_device_tree_init(void);
218#endif
219
220/*
221 * proc_namespaces.c 214 * proc_namespaces.c
222 */ 215 */
223extern const struct inode_operations proc_ns_dir_inode_operations; 216extern const struct inode_operations proc_ns_dir_inode_operations;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 136e548d9567..7445af0b1aa3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -73,7 +73,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
73 available += pagecache; 73 available += pagecache;
74 74
75 /* 75 /*
76 * Part of the reclaimable swap consists of items that are in use, 76 * Part of the reclaimable slab consists of items that are in use,
77 * and cannot be freed. Cap this estimate at the low watermark. 77 * and cannot be freed. Cap this estimate at the low watermark.
78 */ 78 */
79 available += global_page_state(NR_SLAB_RECLAIMABLE) - 79 available += global_page_state(NR_SLAB_RECLAIMABLE) -
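
For context on the comment fixed above: the MemAvailable estimate adds up free pages above the low watermark plus the freeable parts of page cache and reclaimable slab, each capped at the low watermark. A hedged userspace restatement (units are pages; the function and variable names are illustrative, not the kernel's):

	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	long mem_available(long free, long wmark_low, long pagecache,
			   long slab_reclaimable)
	{
		long available = free - wmark_low;

		/* Not all page cache is freeable; keep a low-watermark cushion. */
		available += pagecache - MIN(pagecache / 2, wmark_low);

		/* Part of the reclaimable slab consists of items that are in use
		 * and cannot be freed; cap this estimate at the low watermark. */
		available += slab_reclaimable - MIN(slab_reclaimable / 2, wmark_low);

		return available < 0 ? 0 : available;
	}
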
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
deleted file mode 100644
index c82dd5147845..000000000000
--- a/fs/proc/proc_devtree.c
+++ /dev/null
@@ -1,241 +0,0 @@
1/*
2 * proc_devtree.c - handles /proc/device-tree
3 *
4 * Copyright 1997 Paul Mackerras
5 */
6#include <linux/errno.h>
7#include <linux/init.h>
8#include <linux/time.h>
9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
11#include <linux/printk.h>
12#include <linux/stat.h>
13#include <linux/string.h>
14#include <linux/of.h>
15#include <linux/export.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h>
18#include "internal.h"
19
20static inline void set_node_proc_entry(struct device_node *np,
21 struct proc_dir_entry *de)
22{
23 np->pde = de;
24}
25
26static struct proc_dir_entry *proc_device_tree;
27
28/*
29 * Supply data on a read from /proc/device-tree/node/property.
30 */
31static int property_proc_show(struct seq_file *m, void *v)
32{
33 struct property *pp = m->private;
34
35 seq_write(m, pp->value, pp->length);
36 return 0;
37}
38
39static int property_proc_open(struct inode *inode, struct file *file)
40{
41 return single_open(file, property_proc_show, __PDE_DATA(inode));
42}
43
44static const struct file_operations property_proc_fops = {
45 .owner = THIS_MODULE,
46 .open = property_proc_open,
47 .read = seq_read,
48 .llseek = seq_lseek,
49 .release = single_release,
50};
51
52/*
53 * For a node with a name like "gc@10", we make symlinks called "gc"
54 * and "@10" to it.
55 */
56
57/*
58 * Add a property to a node
59 */
60static struct proc_dir_entry *
61__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
62 const char *name)
63{
64 struct proc_dir_entry *ent;
65
66 /*
67 * Unfortunately proc_register puts each new entry
68 * at the beginning of the list. So we rearrange them.
69 */
70 ent = proc_create_data(name,
71 strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
72 de, &property_proc_fops, pp);
73 if (ent == NULL)
74 return NULL;
75
76 if (!strncmp(name, "security-", 9))
77 proc_set_size(ent, 0); /* don't leak number of password chars */
78 else
79 proc_set_size(ent, pp->length);
80
81 return ent;
82}
83
84
85void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop)
86{
87 __proc_device_tree_add_prop(pde, prop, prop->name);
88}
89
90void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
91 struct property *prop)
92{
93 remove_proc_entry(prop->name, pde);
94}
95
96void proc_device_tree_update_prop(struct proc_dir_entry *pde,
97 struct property *newprop,
98 struct property *oldprop)
99{
100 struct proc_dir_entry *ent;
101
102 if (!oldprop) {
103 proc_device_tree_add_prop(pde, newprop);
104 return;
105 }
106
107 for (ent = pde->subdir; ent != NULL; ent = ent->next)
108 if (ent->data == oldprop)
109 break;
110 if (ent == NULL) {
111 pr_warn("device-tree: property \"%s\" does not exist\n",
112 oldprop->name);
113 } else {
114 ent->data = newprop;
115 ent->size = newprop->length;
116 }
117}
118
119/*
120 * Various dodgy firmware might give us nodes and/or properties with
121 * conflicting names. That's generally ok, except for exporting via /proc,
122 * so munge names here to ensure they're unique.
123 */
124
125static int duplicate_name(struct proc_dir_entry *de, const char *name)
126{
127 struct proc_dir_entry *ent;
128 int found = 0;
129
130 spin_lock(&proc_subdir_lock);
131
132 for (ent = de->subdir; ent != NULL; ent = ent->next) {
133 if (strcmp(ent->name, name) == 0) {
134 found = 1;
135 break;
136 }
137 }
138
139 spin_unlock(&proc_subdir_lock);
140
141 return found;
142}
143
144static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,
145 const char *name)
146{
147 char *fixed_name;
148 int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */
149 int i = 1, size;
150
151realloc:
152 fixed_name = kmalloc(fixup_len, GFP_KERNEL);
153 if (fixed_name == NULL) {
154 pr_err("device-tree: Out of memory trying to fixup "
155 "name \"%s\"\n", name);
156 return name;
157 }
158
159retry:
160 size = snprintf(fixed_name, fixup_len, "%s#%d", name, i);
161 size++; /* account for NULL */
162
163 if (size > fixup_len) {
164 /* We ran out of space, free and reallocate. */
165 kfree(fixed_name);
166 fixup_len = size;
167 goto realloc;
168 }
169
170 if (duplicate_name(de, fixed_name)) {
171 /* Multiple duplicates. Retry with a different offset. */
172 i++;
173 goto retry;
174 }
175
176 pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n",
177 np->full_name, fixed_name);
178
179 return fixed_name;
180}
181
182/*
183 * Process a node, adding entries for its children and its properties.
184 */
185void proc_device_tree_add_node(struct device_node *np,
186 struct proc_dir_entry *de)
187{
188 struct property *pp;
189 struct proc_dir_entry *ent;
190 struct device_node *child;
191 const char *p;
192
193 set_node_proc_entry(np, de);
194 for (child = NULL; (child = of_get_next_child(np, child));) {
195 /* Use everything after the last slash, or the full name */
196 p = kbasename(child->full_name);
197
198 if (duplicate_name(de, p))
199 p = fixup_name(np, de, p);
200
201 ent = proc_mkdir(p, de);
202 if (ent == NULL)
203 break;
204 proc_device_tree_add_node(child, ent);
205 }
206 of_node_put(child);
207
208 for (pp = np->properties; pp != NULL; pp = pp->next) {
209 p = pp->name;
210
211 if (strchr(p, '/'))
212 continue;
213
214 if (duplicate_name(de, p))
215 p = fixup_name(np, de, p);
216
217 ent = __proc_device_tree_add_prop(de, pp, p);
218 if (ent == NULL)
219 break;
220 }
221}
222
223/*
224 * Called on initialization to set up the /proc/device-tree subtree
225 */
226void __init proc_device_tree_init(void)
227{
228 struct device_node *root;
229
230 proc_device_tree = proc_mkdir("device-tree", NULL);
231 if (proc_device_tree == NULL)
232 return;
233 root = of_find_node_by_path("/");
234 if (root == NULL) {
235 remove_proc_entry("device-tree", NULL);
236 pr_debug("/proc/device-tree: can't find root\n");
237 return;
238 }
239 proc_device_tree_add_node(root, proc_device_tree);
240 of_node_put(root);
241}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 87dbcbef7fe4..5dbadecb234d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
92int proc_remount(struct super_block *sb, int *flags, char *data) 92int proc_remount(struct super_block *sb, int *flags, char *data)
93{ 93{
94 struct pid_namespace *pid = sb->s_fs_info; 94 struct pid_namespace *pid = sb->s_fs_info;
95
96 sync_filesystem(sb);
95 return !proc_parse_options(data, pid); 97 return !proc_parse_options(data, pid);
96} 98}
97 99
@@ -183,9 +185,6 @@ void __init proc_root_init(void)
183 proc_mkdir("openprom", NULL); 185 proc_mkdir("openprom", NULL);
184#endif 186#endif
185 proc_tty_init(); 187 proc_tty_init();
186#ifdef CONFIG_PROC_DEVICETREE
187 proc_device_tree_init();
188#endif
189 proc_mkdir("bus", NULL); 188 proc_mkdir("bus", NULL);
190 proc_sys_init(); 189 proc_sys_init();
191} 190}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6f599c62f0cc..9d231e9e5f0e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,7 +9,7 @@
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/irqnr.h> 11#include <linux/irqnr.h>
12#include <asm/cputime.h> 12#include <linux/cputime.h>
13#include <linux/tick.h> 13#include <linux/tick.h>
14 14
15#ifndef arch_irq_stat_cpu 15#ifndef arch_irq_stat_cpu
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fb52b548080d..442177b1119a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/vmacache.h>
2#include <linux/hugetlb.h> 3#include <linux/hugetlb.h>
3#include <linux/huge_mm.h> 4#include <linux/huge_mm.h>
4#include <linux/mount.h> 5#include <linux/mount.h>
@@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
152 153
153 /* 154 /*
154 * We remember last_addr rather than next_addr to hit with 155 * We remember last_addr rather than next_addr to hit with
155 * mmap_cache most of the time. We have zero last_addr at 156 * vmacache most of the time. We have zero last_addr at
156 * the beginning and also after lseek. We will have -1 last_addr 157 * the beginning and also after lseek. We will have -1 last_addr
157 * after the end of the vmas. 158 * after the end of the vmas.
158 */ 159 */
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 7141b8d0ca9e..33de567c25af 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,7 +5,7 @@
5#include <linux/seq_file.h> 5#include <linux/seq_file.h>
6#include <linux/time.h> 6#include <linux/time.h>
7#include <linux/kernel_stat.h> 7#include <linux/kernel_stat.h>
8#include <asm/cputime.h> 8#include <linux/cputime.h>
9 9
10static int uptime_proc_show(struct seq_file *m, void *v) 10static int uptime_proc_show(struct seq_file *m, void *v)
11{ 11{
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 88d4585b30f1..6a8e785b29da 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -484,7 +484,6 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
484 phdr_ptr->p_memsz = real_sz; 484 phdr_ptr->p_memsz = real_sz;
485 if (real_sz == 0) { 485 if (real_sz == 0) {
486 pr_warn("Warning: Zero PT_NOTE entries found\n"); 486 pr_warn("Warning: Zero PT_NOTE entries found\n");
487 return -EINVAL;
488 } 487 }
489 } 488 }
490 489
@@ -671,7 +670,6 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
671 phdr_ptr->p_memsz = real_sz; 670 phdr_ptr->p_memsz = real_sz;
672 if (real_sz == 0) { 671 if (real_sz == 0) {
673 pr_warn("Warning: Zero PT_NOTE entries found\n"); 672 pr_warn("Warning: Zero PT_NOTE entries found\n");
674 return -EINVAL;
675 } 673 }
676 } 674 }
677 675
@@ -1118,4 +1116,3 @@ void vmcore_cleanup(void)
1118 } 1116 }
1119 free_elfcorebuf(); 1117 free_elfcorebuf();
1120} 1118}
1121EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 12823845d324..192297b0090d 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -249,6 +249,7 @@ static void parse_options(char *options)
249 249
250static int pstore_remount(struct super_block *sb, int *flags, char *data) 250static int pstore_remount(struct super_block *sb, int *flags, char *data)
251{ 251{
252 sync_filesystem(sb);
252 parse_options(data); 253 parse_options(data);
253 254
254 return 0; 255 return 0;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 78c3c2097787..46d269e38706 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -497,6 +497,7 @@ void pstore_get_records(int quiet)
497 big_oops_buf_sz); 497 big_oops_buf_sz);
498 498
499 if (unzipped_len > 0) { 499 if (unzipped_len > 0) {
500 kfree(buf);
500 buf = big_oops_buf; 501 buf = big_oops_buf;
501 size = unzipped_len; 502 size = unzipped_len;
502 compressed = false; 503 compressed = false;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index fa8cef2cca3a..3b5744306ed8 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -86,6 +86,7 @@ struct ramoops_context {
86 struct persistent_ram_ecc_info ecc_info; 86 struct persistent_ram_ecc_info ecc_info;
87 unsigned int max_dump_cnt; 87 unsigned int max_dump_cnt;
88 unsigned int dump_write_cnt; 88 unsigned int dump_write_cnt;
 89 /* _read_cnt must be cleared in ramoops_pstore_open() */
89 unsigned int dump_read_cnt; 90 unsigned int dump_read_cnt;
90 unsigned int console_read_cnt; 91 unsigned int console_read_cnt;
91 unsigned int ftrace_read_cnt; 92 unsigned int ftrace_read_cnt;
@@ -101,6 +102,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
101 102
102 cxt->dump_read_cnt = 0; 103 cxt->dump_read_cnt = 0;
103 cxt->console_read_cnt = 0; 104 cxt->console_read_cnt = 0;
105 cxt->ftrace_read_cnt = 0;
104 return 0; 106 return 0;
105} 107}
106 108
@@ -117,13 +119,15 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
117 return NULL; 119 return NULL;
118 120
119 prz = przs[i]; 121 prz = przs[i];
122 if (!prz)
123 return NULL;
120 124
121 if (update) { 125 /* Update old/shadowed buffer. */
122 /* Update old/shadowed buffer. */ 126 if (update)
123 persistent_ram_save_old(prz); 127 persistent_ram_save_old(prz);
124 if (!persistent_ram_old_size(prz)) 128
125 return NULL; 129 if (!persistent_ram_old_size(prz))
126 } 130 return NULL;
127 131
128 *typep = type; 132 *typep = type;
129 *id = i; 133 *id = i;
@@ -316,6 +320,7 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
316{ 320{
317 int i; 321 int i;
318 322
323 cxt->max_dump_cnt = 0;
319 if (!cxt->przs) 324 if (!cxt->przs)
320 return; 325 return;
321 326
@@ -346,7 +351,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
346 GFP_KERNEL); 351 GFP_KERNEL);
347 if (!cxt->przs) { 352 if (!cxt->przs) {
348 dev_err(dev, "failed to initialize a prz array for dumps\n"); 353 dev_err(dev, "failed to initialize a prz array for dumps\n");
349 return -ENOMEM; 354 goto fail_prz;
350 } 355 }
351 356
352 for (i = 0; i < cxt->max_dump_cnt; i++) { 357 for (i = 0; i < cxt->max_dump_cnt; i++) {
@@ -428,7 +433,6 @@ static int ramoops_probe(struct platform_device *pdev)
428 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) 433 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
429 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 434 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
430 435
431 cxt->dump_read_cnt = 0;
432 cxt->size = pdata->mem_size; 436 cxt->size = pdata->mem_size;
433 cxt->phys_addr = pdata->mem_address; 437 cxt->phys_addr = pdata->mem_address;
434 cxt->record_size = pdata->record_size; 438 cxt->record_size = pdata->record_size;
@@ -505,7 +509,6 @@ fail_buf:
505 kfree(cxt->pstore.buf); 509 kfree(cxt->pstore.buf);
506fail_clear: 510fail_clear:
507 cxt->pstore.bufsize = 0; 511 cxt->pstore.bufsize = 0;
508 cxt->max_dump_cnt = 0;
509fail_cnt: 512fail_cnt:
510 kfree(cxt->fprz); 513 kfree(cxt->fprz);
511fail_init_fprz: 514fail_init_fprz:
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index de272d426763..ff7e3d4df5a1 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -54,7 +54,7 @@ static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
54 do { 54 do {
55 old = atomic_read(&prz->buffer->start); 55 old = atomic_read(&prz->buffer->start);
56 new = old + a; 56 new = old + a;
57 while (unlikely(new > prz->buffer_size)) 57 while (unlikely(new >= prz->buffer_size))
58 new -= prz->buffer_size; 58 new -= prz->buffer_size;
59 } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); 59 } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old);
60 60
@@ -91,7 +91,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
91 91
92 old = atomic_read(&prz->buffer->start); 92 old = atomic_read(&prz->buffer->start);
93 new = old + a; 93 new = old + a;
94 while (unlikely(new > prz->buffer_size)) 94 while (unlikely(new >= prz->buffer_size))
95 new -= prz->buffer_size; 95 new -= prz->buffer_size;
96 atomic_set(&prz->buffer->start, new); 96 atomic_set(&prz->buffer->start, new);
97 97
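
The ram_core.c change above fixes an off-by-one in the ring-buffer wrap check: with `>` the start offset can land exactly at buffer_size, one past the last valid byte, while `>=` wraps it back to 0. A runnable demo of the two variants (BUFFER_SIZE is an arbitrary stand-in):

	#include <stdio.h>

	#define BUFFER_SIZE 8u

	static unsigned advance_buggy(unsigned old, unsigned a)
	{
		unsigned new = old + a;
		while (new > BUFFER_SIZE)	/* buggy: allows new == BUFFER_SIZE */
			new -= BUFFER_SIZE;
		return new;
	}

	static unsigned advance_fixed(unsigned old, unsigned a)
	{
		unsigned new = old + a;
		while (new >= BUFFER_SIZE)	/* fixed: offsets stay in [0, SIZE) */
			new -= BUFFER_SIZE;
		return new;
	}

	int main(void)
	{
		printf("buggy: %u\n", advance_buggy(6, 2));	/* prints 8 (!) */
		printf("fixed: %u\n", advance_fixed(6, 2));	/* prints 0 */
		return 0;
	}
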
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 89558810381c..c4bcb778886e 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -44,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
44{ 44{
45 struct qnx4_sb_info *qs; 45 struct qnx4_sb_info *qs;
46 46
47 sync_filesystem(sb);
47 qs = qnx4_sb(sb); 48 qs = qnx4_sb(sb);
48 qs->Version = QNX4_VERSION; 49 qs->Version = QNX4_VERSION;
49 *flags |= MS_RDONLY; 50 *flags |= MS_RDONLY;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 8d941edfefa1..65cdaab3ed49 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
55 55
56static int qnx6_remount(struct super_block *sb, int *flags, char *data) 56static int qnx6_remount(struct super_block *sb, int *flags, char *data)
57{ 57{
58 sync_filesystem(sb);
58 *flags |= MS_RDONLY; 59 *flags |= MS_RDONLY;
59 return 0; 60 return 0;
60} 61}
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 880fd9884366..c51df1dd237e 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -8,9 +8,10 @@ config QUOTA
8 help 8 help
9 If you say Y here, you will be able to set per user limits for disk 9 If you say Y here, you will be able to set per user limits for disk
10 usage (also called disk quotas). Currently, it works for the 10 usage (also called disk quotas). Currently, it works for the
11 ext2, ext3, and reiserfs file system. ext3 also supports journalled 11 ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems.
12 quotas for which you don't need to run quotacheck(8) after an unclean 12 Note that gfs2 and xfs use their own quota system.
13 shutdown. 13 Ext3, ext4 and reiserfs also support journaled quotas for which
14 you don't need to run quotacheck(8) after an unclean shutdown.
14 For further details, read the Quota mini-HOWTO, available from 15 For further details, read the Quota mini-HOWTO, available from
15 <http://www.tldp.org/docs.html#howto>, or the documentation provided 16 <http://www.tldp.org/docs.html#howto>, or the documentation provided
16 with the quota tools. Probably the quota support is only useful for 17 with the quota tools. Probably the quota support is only useful for
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index cfc8dcc16043..9cd5f63715c0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -528,7 +528,7 @@ restart:
528 if (atomic_read(&dquot->dq_count)) { 528 if (atomic_read(&dquot->dq_count)) {
529 DEFINE_WAIT(wait); 529 DEFINE_WAIT(wait);
530 530
531 atomic_inc(&dquot->dq_count); 531 dqgrab(dquot);
532 prepare_to_wait(&dquot->dq_wait_unused, &wait, 532 prepare_to_wait(&dquot->dq_wait_unused, &wait,
533 TASK_UNINTERRUPTIBLE); 533 TASK_UNINTERRUPTIBLE);
534 spin_unlock(&dq_list_lock); 534 spin_unlock(&dq_list_lock);
@@ -632,7 +632,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
632 /* Now we have active dquot from which someone is 632 /* Now we have active dquot from which someone is
633 * holding reference so we can safely just increase 633 * holding reference so we can safely just increase
634 * use count */ 634 * use count */
635 atomic_inc(&dquot->dq_count); 635 dqgrab(dquot);
636 spin_unlock(&dq_list_lock); 636 spin_unlock(&dq_list_lock);
637 dqstats_inc(DQST_LOOKUPS); 637 dqstats_inc(DQST_LOOKUPS);
638 err = sb->dq_op->write_dquot(dquot); 638 err = sb->dq_op->write_dquot(dquot);
diff --git a/fs/read_write.c b/fs/read_write.c
index 28cc9c810744..31c6efa43183 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -994,9 +994,9 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
994 return ret; 994 return ret;
995} 995}
996 996
997COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, 997static long __compat_sys_preadv64(unsigned long fd,
998 const struct compat_iovec __user *,vec, 998 const struct compat_iovec __user *vec,
999 unsigned long, vlen, loff_t, pos) 999 unsigned long vlen, loff_t pos)
1000{ 1000{
1001 struct fd f; 1001 struct fd f;
1002 ssize_t ret; 1002 ssize_t ret;
@@ -1013,12 +1013,22 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1013 return ret; 1013 return ret;
1014} 1014}
1015 1015
1016#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1017COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1018 const struct compat_iovec __user *,vec,
1019 unsigned long, vlen, loff_t, pos)
1020{
1021 return __compat_sys_preadv64(fd, vec, vlen, pos);
1022}
1023#endif
1024
1016COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, 1025COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1017 const struct compat_iovec __user *,vec, 1026 const struct compat_iovec __user *,vec,
1018 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1027 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1019{ 1028{
1020 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1029 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1021 return compat_sys_preadv64(fd, vec, vlen, pos); 1030
1031 return __compat_sys_preadv64(fd, vec, vlen, pos);
1022} 1032}
1023 1033
1024static size_t compat_writev(struct file *file, 1034static size_t compat_writev(struct file *file,
@@ -1061,9 +1071,9 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1061 return ret; 1071 return ret;
1062} 1072}
1063 1073
1064COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, 1074static long __compat_sys_pwritev64(unsigned long fd,
1065 const struct compat_iovec __user *,vec, 1075 const struct compat_iovec __user *vec,
1066 unsigned long, vlen, loff_t, pos) 1076 unsigned long vlen, loff_t pos)
1067{ 1077{
1068 struct fd f; 1078 struct fd f;
1069 ssize_t ret; 1079 ssize_t ret;
@@ -1080,12 +1090,22 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1080 return ret; 1090 return ret;
1081} 1091}
1082 1092
1093#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1094COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1095 const struct compat_iovec __user *,vec,
1096 unsigned long, vlen, loff_t, pos)
1097{
1098 return __compat_sys_pwritev64(fd, vec, vlen, pos);
1099}
1100#endif
1101
1083COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, 1102COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1084 const struct compat_iovec __user *,vec, 1103 const struct compat_iovec __user *,vec,
1085 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1104 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1086{ 1105{
1087 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1106 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1088 return compat_sys_pwritev64(fd, vec, vlen, pos); 1107
1108 return __compat_sys_pwritev64(fd, vec, vlen, pos);
1089} 1109}
1090#endif 1110#endif
1091 1111
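
The read_write.c refactor above moves the shared body into static helpers and emits the preadv64/pwritev64 compat entry points only where an architecture opts in via __ARCH_WANT_COMPAT_SYS_PREADV64 / __ARCH_WANT_COMPAT_SYS_PWRITEV64; the 32-bit preadv/pwritev variants keep recombining the file position from two halves. A runnable demo of that recombination:

	#include <stdio.h>

	int main(void)
	{
		unsigned int pos_low = 0x89abcdefu, pos_high = 0x01234567u;
		long long pos = ((long long)pos_high << 32) | pos_low;

		printf("pos = 0x%llx\n", (unsigned long long)pos); /* 0x123456789abcdef */
		return 0;
	}
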
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 1fd2051109a3..af677353a3f5 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -125,6 +125,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
125 int d_reclen; 125 int d_reclen;
126 char *d_name; 126 char *d_name;
127 ino_t d_ino; 127 ino_t d_ino;
128 loff_t cur_pos = deh_offset(deh);
128 129
129 if (!de_visible(deh)) 130 if (!de_visible(deh))
130 /* it is hidden entry */ 131 /* it is hidden entry */
@@ -196,8 +197,9 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
196 if (local_buf != small_buf) { 197 if (local_buf != small_buf) {
197 kfree(local_buf); 198 kfree(local_buf);
198 } 199 }
199 // next entry should be looked for with such offset 200
200 next_pos = deh_offset(deh) + 1; 201 /* deh_offset(deh) may be invalid now. */
202 next_pos = cur_pos + 1;
201 203
202 if (item_moved(&tmp_ih, &path_to_entry)) { 204 if (item_moved(&tmp_ih, &path_to_entry)) {
203 set_cpu_key_k_offset(&pos_key, 205 set_cpu_key_k_offset(&pos_key,
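
The reiserfs hunk above snapshots deh_offset(deh) into cur_pos before the entry is handed to the directory callback, because the callback can block and the item can move (item_moved()), after which deh points at stale memory. The general shape, with illustrative stand-in types and helpers (entry_view, view_offset, emit_entry are not kernel APIs):

	struct entry_view;				/* may be relocated by callbacks */
	extern long long view_offset(const struct entry_view *v);
	extern void emit_entry(void *ctx, const struct entry_view *v);

	long long emit_and_advance(void *ctx, const struct entry_view *v)
	{
		long long cur_pos = view_offset(v);	/* snapshot while v is valid */

		emit_entry(ctx, v);			/* may invalidate *v */

		return cur_pos + 1;			/* use the copy, never v */
	}
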
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ad62bdbb451e..bc8b8009897d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -35,7 +35,7 @@ void reiserfs_evict_inode(struct inode *inode)
35 if (!inode->i_nlink && !is_bad_inode(inode)) 35 if (!inode->i_nlink && !is_bad_inode(inode))
36 dquot_initialize(inode); 36 dquot_initialize(inode);
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages_final(&inode->i_data);
39 if (inode->i_nlink) 39 if (inode->i_nlink)
40 goto no_delete; 40 goto no_delete;
41 41
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 8d06adf89948..83d4eac8059a 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2831,6 +2831,7 @@ void reiserfs_init_alloc_options(struct super_block *s);
2831 */ 2831 */
2832__le32 reiserfs_choose_packing(struct inode *dir); 2832__le32 reiserfs_choose_packing(struct inode *dir);
2833 2833
2834void show_alloc_options(struct seq_file *seq, struct super_block *s);
2834int reiserfs_init_bitmap_cache(struct super_block *sb); 2835int reiserfs_init_bitmap_cache(struct super_block *sb);
2835void reiserfs_free_bitmap_cache(struct super_block *sb); 2836void reiserfs_free_bitmap_cache(struct super_block *sb);
2836void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); 2837void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2c803353f8ac..9fb20426005e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -62,7 +62,6 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
62 62
63static int reiserfs_remount(struct super_block *s, int *flags, char *data); 63static int reiserfs_remount(struct super_block *s, int *flags, char *data);
64static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); 64static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
65void show_alloc_options(struct seq_file *seq, struct super_block *s);
66 65
67static int reiserfs_sync_fs(struct super_block *s, int wait) 66static int reiserfs_sync_fs(struct super_block *s, int wait)
68{ 67{
@@ -597,7 +596,7 @@ static void init_once(void *foo)
597 inode_init_once(&ei->vfs_inode); 596 inode_init_once(&ei->vfs_inode);
598} 597}
599 598
600static int init_inodecache(void) 599static int __init init_inodecache(void)
601{ 600{
602 reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", 601 reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
603 sizeof(struct 602 sizeof(struct
@@ -1319,6 +1318,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1319 int i; 1318 int i;
1320#endif 1319#endif
1321 1320
1321 sync_filesystem(s);
1322 reiserfs_write_lock(s); 1322 reiserfs_write_lock(s);
1323 1323
1324#ifdef CONFIG_QUOTA 1324#ifdef CONFIG_QUOTA
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index d8418782862b..ef90e8bca95a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
432 */ 432 */
433static int romfs_remount(struct super_block *sb, int *flags, char *data) 433static int romfs_remount(struct super_block *sb, int *flags, char *data)
434{ 434{
435 sync_filesystem(sb);
435 *flags |= MS_RDONLY; 436 *flags |= MS_RDONLY;
436 return 0; 437 return 0;
437} 438}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 202df6312d4e..031c8d67fd51 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
371 371
372static int squashfs_remount(struct super_block *sb, int *flags, char *data) 372static int squashfs_remount(struct super_block *sb, int *flags, char *data)
373{ 373{
374 sync_filesystem(sb);
374 *flags |= MS_RDONLY; 375 *flags |= MS_RDONLY;
375 return 0; 376 return 0;
376} 377}
diff --git a/fs/super.c b/fs/super.c
index 80d5cf2ca765..e9dc3c3fe159 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -719,8 +719,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
719 } 719 }
720 } 720 }
721 721
722 sync_filesystem(sb);
723
724 if (sb->s_op->remount_fs) { 722 if (sb->s_op->remount_fs) {
725 retval = sb->s_op->remount_fs(sb, &flags, data); 723 retval = sb->s_op->remount_fs(sb, &flags, data);
726 if (retval) { 724 if (retval) {
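
This fs/super.c hunk is the other half of the sync_filesystem() additions scattered through this series (proc, pstore, qnx4, qnx6, reiserfs, romfs, squashfs, sysv, ubifs, udf, ufs): the unconditional sync is removed from do_remount_sb() and pushed down into each filesystem's ->remount_fs(). A skeletal example of the resulting handler shape, assuming a trivially read-only filesystem:

	static int example_remount(struct super_block *sb, int *flags, char *data)
	{
		sync_filesystem(sb);	/* flush dirty state before flags change */
		*flags |= MS_RDONLY;
		return 0;
	}
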
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index 8c41feacbac5..b2756014508c 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,6 +1,7 @@
1config SYSFS 1config SYSFS
2 bool "sysfs file system support" if EXPERT 2 bool "sysfs file system support" if EXPERT
3 default y 3 default y
4 select KERNFS
4 help 5 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to 6 The sysfs filesystem is a virtual filesystem that the kernel uses to
6 export internal kernel objects, their attributes, and their 7 export internal kernel objects, their attributes, and their
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ee0d761c3179..0b45ff42f374 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -19,39 +19,18 @@
19 19
20DEFINE_SPINLOCK(sysfs_symlink_target_lock); 20DEFINE_SPINLOCK(sysfs_symlink_target_lock);
21 21
22/**
23 * sysfs_pathname - return full path to sysfs dirent
24 * @kn: kernfs_node whose path we want
25 * @path: caller allocated buffer of size PATH_MAX
26 *
27 * Gives the name "/" to the sysfs_root entry; any path returned
28 * is relative to wherever sysfs is mounted.
29 */
30static char *sysfs_pathname(struct kernfs_node *kn, char *path)
31{
32 if (kn->parent) {
33 sysfs_pathname(kn->parent, path);
34 strlcat(path, "/", PATH_MAX);
35 }
36 strlcat(path, kn->name, PATH_MAX);
37 return path;
38}
39
40void sysfs_warn_dup(struct kernfs_node *parent, const char *name) 22void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
41{ 23{
42 char *path; 24 char *buf, *path = NULL;
43 25
44 path = kzalloc(PATH_MAX, GFP_KERNEL); 26 buf = kzalloc(PATH_MAX, GFP_KERNEL);
45 if (path) { 27 if (buf)
46 sysfs_pathname(parent, path); 28 path = kernfs_path(parent, buf, PATH_MAX);
47 strlcat(path, "/", PATH_MAX);
48 strlcat(path, name, PATH_MAX);
49 }
50 29
51 WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n", 30 WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n",
52 path ? path : name); 31 path, name);
53 32
54 kfree(path); 33 kfree(buf);
55} 34}
56 35
57/** 36/**
@@ -122,9 +101,13 @@ void sysfs_remove_dir(struct kobject *kobj)
122int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, 101int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
123 const void *new_ns) 102 const void *new_ns)
124{ 103{
125 struct kernfs_node *parent = kobj->sd->parent; 104 struct kernfs_node *parent;
105 int ret;
126 106
127 return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); 107 parent = kernfs_get_parent(kobj->sd);
108 ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
109 kernfs_put(parent);
110 return ret;
128} 111}
129 112
130int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, 113int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
@@ -133,7 +116,6 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
133 struct kernfs_node *kn = kobj->sd; 116 struct kernfs_node *kn = kobj->sd;
134 struct kernfs_node *new_parent; 117 struct kernfs_node *new_parent;
135 118
136 BUG_ON(!kn->parent);
137 new_parent = new_parent_kobj && new_parent_kobj->sd ? 119 new_parent = new_parent_kobj && new_parent_kobj->sd ?
138 new_parent_kobj->sd : sysfs_root_kn; 120 new_parent_kobj->sd : sysfs_root_kn;
139 121
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 810cf6e613e5..1b8b91b67fdb 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -372,6 +372,29 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
372} 372}
373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); 373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
374 374
375/**
376 * sysfs_remove_file_self - remove an object attribute from its own method
377 * @kobj: object we're acting for
378 * @attr: attribute descriptor
379 *
380 * See kernfs_remove_self() for details.
381 */
382bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
383{
384 struct kernfs_node *parent = kobj->sd;
385 struct kernfs_node *kn;
386 bool ret;
387
388 kn = kernfs_find_and_get(parent, attr->name);
389 if (WARN_ON_ONCE(!kn))
390 return false;
391
392 ret = kernfs_remove_self(kn);
393
394 kernfs_put(kn);
395 return ret;
396}
397
375void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) 398void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
376{ 399{
377 int i; 400 int i;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 6b579387c67a..aa0406895b53 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -70,8 +70,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
70 if (grp->bin_attrs) { 70 if (grp->bin_attrs) {
71 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { 71 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
72 if (update) 72 if (update)
73 sysfs_remove_bin_file(kobj, *bin_attr); 73 kernfs_remove_by_name(parent,
74 error = sysfs_create_bin_file(kobj, *bin_attr); 74 (*bin_attr)->attr.name);
75 error = sysfs_add_file_mode_ns(parent,
76 &(*bin_attr)->attr, true,
77 (*bin_attr)->attr.mode, NULL);
75 if (error) 78 if (error)
76 break; 79 break;
77 } 80 }
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 3eaf5c6622eb..a66ad6196f59 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -63,7 +63,7 @@ int __init sysfs_init(void)
63{ 63{
64 int err; 64 int err;
65 65
66 sysfs_root = kernfs_create_root(NULL, NULL); 66 sysfs_root = kernfs_create_root(NULL, 0, NULL);
67 if (IS_ERR(sysfs_root)) 67 if (IS_ERR(sysfs_root))
68 return PTR_ERR(sysfs_root); 68 return PTR_ERR(sysfs_root);
69 69
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c327d4ee1235..88956309cc86 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
60{ 60{
61 struct sysv_sb_info *sbi = SYSV_SB(sb); 61 struct sysv_sb_info *sbi = SYSV_SB(sb);
62 62
63 sync_filesystem(sb);
63 if (sbi->s_forced_ro) 64 if (sbi->s_forced_ro)
64 *flags |= MS_RDONLY; 65 *flags |= MS_RDONLY;
65 return 0; 66 return 0;
@@ -295,7 +296,7 @@ int sysv_sync_inode(struct inode *inode)
295 296
296static void sysv_evict_inode(struct inode *inode) 297static void sysv_evict_inode(struct inode *inode)
297{ 298{
298 truncate_inode_pages(&inode->i_data, 0); 299 truncate_inode_pages_final(&inode->i_data);
299 if (!inode->i_nlink) { 300 if (!inode->i_nlink) {
300 inode->i_size = 0; 301 inode->i_size = 0;
301 sysv_truncate(inode); 302 sysv_truncate(inode);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 929312180dd0..0013142c0475 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -317,6 +317,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
317 (clockid != CLOCK_MONOTONIC && 317 (clockid != CLOCK_MONOTONIC &&
318 clockid != CLOCK_REALTIME && 318 clockid != CLOCK_REALTIME &&
319 clockid != CLOCK_REALTIME_ALARM && 319 clockid != CLOCK_REALTIME_ALARM &&
320 clockid != CLOCK_BOOTTIME &&
320 clockid != CLOCK_BOOTTIME_ALARM)) 321 clockid != CLOCK_BOOTTIME_ALARM))
321 return -EINVAL; 322 return -EINVAL;
322 323
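
The timerfd hunk above adds CLOCK_BOOTTIME to the accepted clockids; previously the plain (non-alarm) boottime clock was rejected by this validation list even though CLOCK_BOOTTIME_ALARM was allowed. A quick runnable check, which should succeed on a patched kernel:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/timerfd.h>

	int main(void)
	{
		int fd = timerfd_create(CLOCK_BOOTTIME, 0);

		if (fd < 0) {
			perror("timerfd_create(CLOCK_BOOTTIME)");
			return 1;
		}
		puts("CLOCK_BOOTTIME timerfd created");
		close(fd);
		return 0;
	}
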
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 123c79b7261e..4f34dbae823d 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1538,6 +1538,7 @@ out_unlock:
1538 1538
1539static const struct vm_operations_struct ubifs_file_vm_ops = { 1539static const struct vm_operations_struct ubifs_file_vm_ops = {
1540 .fault = filemap_fault, 1540 .fault = filemap_fault,
1541 .map_pages = filemap_map_pages,
1541 .page_mkwrite = ubifs_vm_page_mkwrite, 1542 .page_mkwrite = ubifs_vm_page_mkwrite,
1542 .remap_pages = generic_file_remap_pages, 1543 .remap_pages = generic_file_remap_pages,
1543}; 1544};
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 5ded8490c0c6..a1266089eca1 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -351,7 +351,7 @@ static void ubifs_evict_inode(struct inode *inode)
351 dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); 351 dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
352 ubifs_assert(!atomic_read(&inode->i_count)); 352 ubifs_assert(!atomic_read(&inode->i_count));
353 353
354 truncate_inode_pages(&inode->i_data, 0); 354 truncate_inode_pages_final(&inode->i_data);
355 355
356 if (inode->i_nlink) 356 if (inode->i_nlink)
357 goto done; 357 goto done;
@@ -1827,6 +1827,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1827 int err; 1827 int err;
1828 struct ubifs_info *c = sb->s_fs_info; 1828 struct ubifs_info *c = sb->s_fs_info;
1829 1829
1830 sync_filesystem(sb);
1830 dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); 1831 dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
1831 1832
1832 err = ubifs_parse_options(c, data, 1); 1833 err = ubifs_parse_options(c, data, 1);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 982ce05c87ed..5d643706212f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -146,8 +146,8 @@ void udf_evict_inode(struct inode *inode)
146 want_delete = 1; 146 want_delete = 1;
147 udf_setsize(inode, 0); 147 udf_setsize(inode, 0);
148 udf_update_inode(inode, IS_SYNC(inode)); 148 udf_update_inode(inode, IS_SYNC(inode));
149 } else 149 }
150 truncate_inode_pages(&inode->i_data, 0); 150 truncate_inode_pages_final(&inode->i_data);
151 invalidate_inode_buffers(inode); 151 invalidate_inode_buffers(inode);
152 clear_inode(inode); 152 clear_inode(inode);
153 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 153 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
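
The truncate_inode_pages(&inode->i_data, 0) -> truncate_inode_pages_final(&inode->i_data) conversions recurring through this series (proc, reiserfs, sysv, ubifs, udf, ufs) switch ->evict_inode() to the variant intended for final eviction, which, as I read it, also marks the mapping as going away for good rather than merely emptying it. A skeletal example of where the call sits (illustrative, not a complete filesystem):

	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages_final(&inode->i_data); /* empty + retire mapping */
		clear_inode(inode);
		/* ...filesystem-specific freeing of on-disk state... */
	}
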
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3306b9f69bed..3286db047a40 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -175,7 +175,7 @@ static void init_once(void *foo)
175 inode_init_once(&ei->vfs_inode); 175 inode_init_once(&ei->vfs_inode);
176} 176}
177 177
178static int init_inodecache(void) 178static int __init init_inodecache(void)
179{ 179{
180 udf_inode_cachep = kmem_cache_create("udf_inode_cache", 180 udf_inode_cachep = kmem_cache_create("udf_inode_cache",
181 sizeof(struct udf_inode_info), 181 sizeof(struct udf_inode_info),
@@ -505,6 +505,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
505 while ((p = strsep(&options, ",")) != NULL) { 505 while ((p = strsep(&options, ",")) != NULL) {
506 substring_t args[MAX_OPT_ARGS]; 506 substring_t args[MAX_OPT_ARGS];
507 int token; 507 int token;
508 unsigned n;
508 if (!*p) 509 if (!*p)
509 continue; 510 continue;
510 511
@@ -516,7 +517,10 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
516 case Opt_bs: 517 case Opt_bs:
517 if (match_int(&args[0], &option)) 518 if (match_int(&args[0], &option))
518 return 0; 519 return 0;
519 uopt->blocksize = option; 520 n = option;
521 if (n != 512 && n != 1024 && n != 2048 && n != 4096)
522 return 0;
523 uopt->blocksize = n;
520 uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); 524 uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
521 break; 525 break;
522 case Opt_unhide: 526 case Opt_unhide:
@@ -646,6 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
646 int error = 0; 650 int error = 0;
647 struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); 651 struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
648 652
653 sync_filesystem(sb);
649 if (lvidiu) { 654 if (lvidiu) {
650 int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); 655 int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
651 if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) 656 if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
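
The udf_parse_options() hunk above stops storing an arbitrary match_int() result as the blocksize and accepts only the sizes UDF can actually use. A runnable restatement of the added check:

	#include <stdio.h>

	static int udf_blocksize_ok(unsigned n)
	{
		return n == 512 || n == 1024 || n == 2048 || n == 4096;
	}

	int main(void)
	{
		unsigned tests[] = { 512, 1000, 2048, 8192 };
		unsigned i;

		for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
			printf("bs=%u -> %s\n", tests[i],
			       udf_blocksize_ok(tests[i]) ? "ok" : "rejected");
		return 0;
	}
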
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index a7ea492ae660..0ab1de4b39a5 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -38,7 +38,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
38{ 38{
39 struct super_block * sb; 39 struct super_block * sb;
40 struct ufs_sb_private_info * uspi; 40 struct ufs_sb_private_info * uspi;
41 struct ufs_super_block_first * usb1;
42 struct ufs_cg_private_info * ucpi; 41 struct ufs_cg_private_info * ucpi;
43 struct ufs_cylinder_group * ucg; 42 struct ufs_cylinder_group * ucg;
44 unsigned cgno, bit, end_bit, bbase, blkmap, i; 43 unsigned cgno, bit, end_bit, bbase, blkmap, i;
@@ -46,7 +45,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
46 45
47 sb = inode->i_sb; 46 sb = inode->i_sb;
48 uspi = UFS_SB(sb)->s_uspi; 47 uspi = UFS_SB(sb)->s_uspi;
49 usb1 = ubh_get_usb_first(uspi);
50 48
51 UFSD("ENTER, fragment %llu, count %u\n", 49 UFSD("ENTER, fragment %llu, count %u\n",
52 (unsigned long long)fragment, count); 50 (unsigned long long)fragment, count);
@@ -135,7 +133,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
135{ 133{
136 struct super_block * sb; 134 struct super_block * sb;
137 struct ufs_sb_private_info * uspi; 135 struct ufs_sb_private_info * uspi;
138 struct ufs_super_block_first * usb1;
139 struct ufs_cg_private_info * ucpi; 136 struct ufs_cg_private_info * ucpi;
140 struct ufs_cylinder_group * ucg; 137 struct ufs_cylinder_group * ucg;
141 unsigned overflow, cgno, bit, end_bit, i; 138 unsigned overflow, cgno, bit, end_bit, i;
@@ -143,7 +140,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
143 140
144 sb = inode->i_sb; 141 sb = inode->i_sb;
145 uspi = UFS_SB(sb)->s_uspi; 142 uspi = UFS_SB(sb)->s_uspi;
146 usb1 = ubh_get_usb_first(uspi);
147 143
148 UFSD("ENTER, fragment %llu, count %u\n", 144 UFSD("ENTER, fragment %llu, count %u\n",
149 (unsigned long long)fragment, count); 145 (unsigned long long)fragment, count);
@@ -499,7 +495,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
499{ 495{
500 struct super_block * sb; 496 struct super_block * sb;
501 struct ufs_sb_private_info * uspi; 497 struct ufs_sb_private_info * uspi;
502 struct ufs_super_block_first * usb1;
503 struct ufs_cg_private_info * ucpi; 498 struct ufs_cg_private_info * ucpi;
504 struct ufs_cylinder_group * ucg; 499 struct ufs_cylinder_group * ucg;
505 unsigned cgno, fragno, fragoff, count, fragsize, i; 500 unsigned cgno, fragno, fragoff, count, fragsize, i;
@@ -509,7 +504,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
509 504
510 sb = inode->i_sb; 505 sb = inode->i_sb;
511 uspi = UFS_SB(sb)->s_uspi; 506 uspi = UFS_SB(sb)->s_uspi;
512 usb1 = ubh_get_usb_first (uspi);
513 count = newcount - oldcount; 507 count = newcount - oldcount;
514 508
515 cgno = ufs_dtog(uspi, fragment); 509 cgno = ufs_dtog(uspi, fragment);
@@ -577,7 +571,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
577{ 571{
578 struct super_block * sb; 572 struct super_block * sb;
579 struct ufs_sb_private_info * uspi; 573 struct ufs_sb_private_info * uspi;
580 struct ufs_super_block_first * usb1;
581 struct ufs_cg_private_info * ucpi; 574 struct ufs_cg_private_info * ucpi;
582 struct ufs_cylinder_group * ucg; 575 struct ufs_cylinder_group * ucg;
583 unsigned oldcg, i, j, k, allocsize; 576 unsigned oldcg, i, j, k, allocsize;
@@ -588,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
588 581
589 sb = inode->i_sb; 582 sb = inode->i_sb;
590 uspi = UFS_SB(sb)->s_uspi; 583 uspi = UFS_SB(sb)->s_uspi;
591 usb1 = ubh_get_usb_first(uspi);
592 oldcg = cgno; 584 oldcg = cgno;
593 585
594 /* 586 /*
@@ -690,7 +682,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
690{ 682{
691 struct super_block * sb; 683 struct super_block * sb;
692 struct ufs_sb_private_info * uspi; 684 struct ufs_sb_private_info * uspi;
693 struct ufs_super_block_first * usb1;
694 struct ufs_cylinder_group * ucg; 685 struct ufs_cylinder_group * ucg;
695 u64 result, blkno; 686 u64 result, blkno;
696 687
@@ -698,7 +689,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
698 689
699 sb = inode->i_sb; 690 sb = inode->i_sb;
700 uspi = UFS_SB(sb)->s_uspi; 691 uspi = UFS_SB(sb)->s_uspi;
701 usb1 = ubh_get_usb_first(uspi);
702 ucg = ubh_get_ucg(UCPI_UBH(ucpi)); 692 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
703 693
704 if (goal == 0) { 694 if (goal == 0) {
@@ -794,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
794 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe 784 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
795 }; 785 };
796 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; 786 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
797 struct ufs_super_block_first *usb1;
798 struct ufs_cylinder_group *ucg; 787 struct ufs_cylinder_group *ucg;
799 unsigned start, length, loc; 788 unsigned start, length, loc;
800 unsigned pos, want, blockmap, mask, end; 789 unsigned pos, want, blockmap, mask, end;
@@ -803,7 +792,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
803 UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, 792 UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx,
804 (unsigned long long)goal, count); 793 (unsigned long long)goal, count);
805 794
806 usb1 = ubh_get_usb_first (uspi);
807 ucg = ubh_get_ucg(UCPI_UBH(ucpi)); 795 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
808 796
809 if (goal) 797 if (goal)
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index d0426d74817b..98f7211599ff 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -57,7 +57,6 @@ void ufs_free_inode (struct inode * inode)
57{ 57{
58 struct super_block * sb; 58 struct super_block * sb;
59 struct ufs_sb_private_info * uspi; 59 struct ufs_sb_private_info * uspi;
60 struct ufs_super_block_first * usb1;
61 struct ufs_cg_private_info * ucpi; 60 struct ufs_cg_private_info * ucpi;
62 struct ufs_cylinder_group * ucg; 61 struct ufs_cylinder_group * ucg;
63 int is_directory; 62 int is_directory;
@@ -67,7 +66,6 @@ void ufs_free_inode (struct inode * inode)
67 66
68 sb = inode->i_sb; 67 sb = inode->i_sb;
69 uspi = UFS_SB(sb)->s_uspi; 68 uspi = UFS_SB(sb)->s_uspi;
70 usb1 = ubh_get_usb_first(uspi);
71 69
72 ino = inode->i_ino; 70 ino = inode->i_ino;
73 71
@@ -175,7 +173,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
175 struct super_block * sb; 173 struct super_block * sb;
176 struct ufs_sb_info * sbi; 174 struct ufs_sb_info * sbi;
177 struct ufs_sb_private_info * uspi; 175 struct ufs_sb_private_info * uspi;
178 struct ufs_super_block_first * usb1;
179 struct ufs_cg_private_info * ucpi; 176 struct ufs_cg_private_info * ucpi;
180 struct ufs_cylinder_group * ucg; 177 struct ufs_cylinder_group * ucg;
181 struct inode * inode; 178 struct inode * inode;
@@ -195,7 +192,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
195 ufsi = UFS_I(inode); 192 ufsi = UFS_I(inode);
196 sbi = UFS_SB(sb); 193 sbi = UFS_SB(sb);
197 uspi = sbi->s_uspi; 194 uspi = sbi->s_uspi;
198 usb1 = ubh_get_usb_first(uspi);
199 195
200 mutex_lock(&sbi->s_lock); 196 mutex_lock(&sbi->s_lock);
201 197
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c8ca96086784..61e8a9b021dd 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -885,7 +885,7 @@ void ufs_evict_inode(struct inode * inode)
885 if (!inode->i_nlink && !is_bad_inode(inode)) 885 if (!inode->i_nlink && !is_bad_inode(inode))
886 want_delete = 1; 886 want_delete = 1;
887 887
888 truncate_inode_pages(&inode->i_data, 0); 888 truncate_inode_pages_final(&inode->i_data);
889 if (want_delete) { 889 if (want_delete) {
890 loff_t old_i_size; 890 loff_t old_i_size;
891 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 891 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 329f2f53b7ed..c1183f9f69dc 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -524,11 +524,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
524 struct ufs_buffer_head * ubh; 524 struct ufs_buffer_head * ubh;
525 unsigned char * base, * space; 525 unsigned char * base, * space;
526 unsigned size, blks, i; 526 unsigned size, blks, i;
527 struct ufs_super_block_third *usb3;
528 527
529 UFSD("ENTER\n"); 528 UFSD("ENTER\n");
530 529
531 usb3 = ubh_get_usb_third(uspi);
532 /* 530 /*
533 * Read cs structures from (usually) first data block 531 * Read cs structures from (usually) first data block
534 * on the device. 532 * on the device.
@@ -1280,6 +1278,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 unsigned new_mount_opt, ufstype; 1278 unsigned new_mount_opt, ufstype;
1281 unsigned flags; 1279 unsigned flags;
1282 1280
1281 sync_filesystem(sb);
1283 lock_ufs(sb); 1282 lock_ufs(sb);
1284 mutex_lock(&UFS_SB(sb)->s_lock); 1283 mutex_lock(&UFS_SB(sb)->s_lock);
1285 uspi = UFS_SB(sb)->s_uspi; 1284 uspi = UFS_SB(sb)->s_uspi;
@@ -1389,15 +1388,11 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1389 struct super_block *sb = dentry->d_sb; 1388 struct super_block *sb = dentry->d_sb;
1390 struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; 1389 struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
1391 unsigned flags = UFS_SB(sb)->s_flags; 1390 unsigned flags = UFS_SB(sb)->s_flags;
1392 struct ufs_super_block_first *usb1;
1393 struct ufs_super_block_second *usb2;
1394 struct ufs_super_block_third *usb3; 1391 struct ufs_super_block_third *usb3;
1395 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 1392 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
1396 1393
1397 lock_ufs(sb); 1394 lock_ufs(sb);
1398 1395
1399 usb1 = ubh_get_usb_first(uspi);
1400 usb2 = ubh_get_usb_second(uspi);
1401 usb3 = ubh_get_usb_third(uspi); 1396 usb3 = ubh_get_usb_third(uspi);
1402 1397
1403 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 1398 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
@@ -1453,7 +1448,7 @@ static void init_once(void *foo)
1453 inode_init_once(&ei->vfs_inode); 1448 inode_init_once(&ei->vfs_inode);
1454} 1449}
1455 1450
1456static int init_inodecache(void) 1451static int __init init_inodecache(void)
1457{ 1452{
1458 ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", 1453 ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
1459 sizeof(struct ufs_inode_info), 1454 sizeof(struct ufs_inode_info),
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 66a36befc5c0..844e288b9576 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -65,12 +65,31 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
65void * 65void *
66kmem_zalloc_large(size_t size, xfs_km_flags_t flags) 66kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
67{ 67{
68 unsigned noio_flag = 0;
68 void *ptr; 69 void *ptr;
70 gfp_t lflags;
69 71
70 ptr = kmem_zalloc(size, flags | KM_MAYFAIL); 72 ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
71 if (ptr) 73 if (ptr)
72 return ptr; 74 return ptr;
73 return vzalloc(size); 75
76 /*
 77 * __vmalloc() will allocate data pages and auxiliary structures (e.g.
78 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
79 * here. Hence we need to tell memory reclaim that we are in such a
80 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
81 * the filesystem here and potentially deadlocking.
82 */
83 if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
84 noio_flag = memalloc_noio_save();
85
86 lflags = kmem_flags_convert(flags);
87 ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
88
89 if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
90 memalloc_noio_restore(noio_flag);
91
92 return ptr;
74} 93}
75 94
76void 95void
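
The kmem.c hunk above brackets the __vmalloc() fallback with memalloc_noio_save()/memalloc_noio_restore() whenever the caller is inside a filesystem transaction (PF_FSTRANS) or asked for KM_NOFS, so that reclaim triggered by __vmalloc()'s internal GFP_KERNEL allocations cannot re-enter the filesystem and deadlock. A minimal kernel-style sketch of the bracket, trimmed from the hunk (the function name is illustrative):

	void *nofs_safe_vzalloc(size_t size, bool nofs_context)
	{
		unsigned noio_flag = 0;
		void *ptr;

		if (nofs_context)
			noio_flag = memalloc_noio_save();	/* sets PF_MEMALLOC_NOIO */

		ptr = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
				PAGE_KERNEL);

		if (nofs_context)
			memalloc_noio_restore(noio_flag);	/* restore prior state */

		return ptr;
	}
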
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 0ecec1896f25..6888ad886ff6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -281,7 +281,7 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
281 if (!acl) 281 if (!acl)
282 goto set_acl; 282 goto set_acl;
283 283
284 error = -EINVAL; 284 error = -E2BIG;
285 if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) 285 if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
286 return error; 286 return error;
287 287
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 3fc109819c34..0fdd4109c624 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -89,6 +89,8 @@ typedef struct xfs_agf {
89 /* structure must be padded to 64 bit alignment */ 89 /* structure must be padded to 64 bit alignment */
90} xfs_agf_t; 90} xfs_agf_t;
91 91
92#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
93
92#define XFS_AGF_MAGICNUM 0x00000001 94#define XFS_AGF_MAGICNUM 0x00000001
93#define XFS_AGF_VERSIONNUM 0x00000002 95#define XFS_AGF_VERSIONNUM 0x00000002
94#define XFS_AGF_SEQNO 0x00000004 96#define XFS_AGF_SEQNO 0x00000004
@@ -167,6 +169,8 @@ typedef struct xfs_agi {
167 /* structure must be padded to 64 bit alignment */ 169 /* structure must be padded to 64 bit alignment */
168} xfs_agi_t; 170} xfs_agi_t;
169 171
172#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
173
170#define XFS_AGI_MAGICNUM 0x00000001 174#define XFS_AGI_MAGICNUM 0x00000001
171#define XFS_AGI_VERSIONNUM 0x00000002 175#define XFS_AGI_VERSIONNUM 0x00000002
172#define XFS_AGI_SEQNO 0x00000004 176#define XFS_AGI_SEQNO 0x00000004
@@ -222,6 +226,8 @@ typedef struct xfs_agfl {
222 __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ 226 __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
223} xfs_agfl_t; 227} xfs_agfl_t;
224 228
229#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
230
225/* 231/*
226 * tags for inode radix tree 232 * tags for inode radix tree
227 */ 233 */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 9eab2dfdcbb5..c1cf6a336a72 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -474,7 +474,6 @@ xfs_agfl_read_verify(
474 struct xfs_buf *bp) 474 struct xfs_buf *bp)
475{ 475{
476 struct xfs_mount *mp = bp->b_target->bt_mount; 476 struct xfs_mount *mp = bp->b_target->bt_mount;
477 int agfl_ok = 1;
478 477
479 /* 478 /*
480 * There is no verification of non-crc AGFLs because mkfs does not 479 * There is no verification of non-crc AGFLs because mkfs does not
@@ -485,15 +484,13 @@ xfs_agfl_read_verify(
485 if (!xfs_sb_version_hascrc(&mp->m_sb)) 484 if (!xfs_sb_version_hascrc(&mp->m_sb))
486 return; 485 return;
487 486
488 agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 487 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
489 offsetof(struct xfs_agfl, agfl_crc)); 488 xfs_buf_ioerror(bp, EFSBADCRC);
490 489 else if (!xfs_agfl_verify(bp))
491 agfl_ok = agfl_ok && xfs_agfl_verify(bp);
492
493 if (!agfl_ok) {
494 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
495 xfs_buf_ioerror(bp, EFSCORRUPTED); 490 xfs_buf_ioerror(bp, EFSCORRUPTED);
496 } 491
492 if (bp->b_error)
493 xfs_verifier_error(bp);
497} 494}
498 495
499static void 496static void
@@ -508,16 +505,15 @@ xfs_agfl_write_verify(
508 return; 505 return;
509 506
510 if (!xfs_agfl_verify(bp)) { 507 if (!xfs_agfl_verify(bp)) {
511 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
512 xfs_buf_ioerror(bp, EFSCORRUPTED); 508 xfs_buf_ioerror(bp, EFSCORRUPTED);
509 xfs_verifier_error(bp);
513 return; 510 return;
514 } 511 }
515 512
516 if (bip) 513 if (bip)
517 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 514 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
518 515
519 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 516 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
520 offsetof(struct xfs_agfl, agfl_crc));
521} 517}
522 518
523const struct xfs_buf_ops xfs_agfl_buf_ops = { 519const struct xfs_buf_ops xfs_agfl_buf_ops = {
@@ -2238,19 +2234,17 @@ xfs_agf_read_verify(
2238 struct xfs_buf *bp) 2234 struct xfs_buf *bp)
2239{ 2235{
2240 struct xfs_mount *mp = bp->b_target->bt_mount; 2236 struct xfs_mount *mp = bp->b_target->bt_mount;
2241 int agf_ok = 1;
2242
2243 if (xfs_sb_version_hascrc(&mp->m_sb))
2244 agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
2245 offsetof(struct xfs_agf, agf_crc));
2246 2237
2247 agf_ok = agf_ok && xfs_agf_verify(mp, bp); 2238 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2248 2239 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2249 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, 2240 xfs_buf_ioerror(bp, EFSBADCRC);
2250 XFS_RANDOM_ALLOC_READ_AGF))) { 2241 else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
2251 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 2242 XFS_ERRTAG_ALLOC_READ_AGF,
2243 XFS_RANDOM_ALLOC_READ_AGF))
2252 xfs_buf_ioerror(bp, EFSCORRUPTED); 2244 xfs_buf_ioerror(bp, EFSCORRUPTED);
2253 } 2245
2246 if (bp->b_error)
2247 xfs_verifier_error(bp);
2254} 2248}
2255 2249
2256static void 2250static void
@@ -2261,8 +2255,8 @@ xfs_agf_write_verify(
2261 struct xfs_buf_log_item *bip = bp->b_fspriv; 2255 struct xfs_buf_log_item *bip = bp->b_fspriv;
2262 2256
2263 if (!xfs_agf_verify(mp, bp)) { 2257 if (!xfs_agf_verify(mp, bp)) {
2264 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
2265 xfs_buf_ioerror(bp, EFSCORRUPTED); 2258 xfs_buf_ioerror(bp, EFSCORRUPTED);
2259 xfs_verifier_error(bp);
2266 return; 2260 return;
2267 } 2261 }
2268 2262
@@ -2272,8 +2266,7 @@ xfs_agf_write_verify(
2272 if (bip) 2266 if (bip)
2273 XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 2267 XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2274 2268
2275 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 2269 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
2276 offsetof(struct xfs_agf, agf_crc));
2277} 2270}
2278 2271
2279const struct xfs_buf_ops xfs_agf_buf_ops = { 2272const struct xfs_buf_ops xfs_agf_buf_ops = {
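
[Editor's note: the xfs_alloc.c hunks above establish the verifier shape repeated throughout this series: classify the failure first (bad CRC vs. failed structural checks), record it on the buffer, and report once at the end. A condensed sketch of that shape; demo_verify_cksum() and demo_verify_structure() are stand-ins, not kernel functions.]

static void demo_read_verify(struct xfs_buf *bp)
{
	/* CRC mismatch: the buffer did not survive the trip to/from disk */
	if (!demo_verify_cksum(bp))
		xfs_buf_ioerror(bp, EFSBADCRC);
	/* CRC is fine but the structure fails logical checks */
	else if (!demo_verify_structure(bp))
		xfs_buf_ioerror(bp, EFSCORRUPTED);

	/* single reporting point replaces per-branch XFS_CORRUPTION_ERROR */
	if (bp->b_error)
		xfs_verifier_error(bp);
}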
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 13085429e523..cc1eadcbb049 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -355,12 +355,14 @@ static void
355xfs_allocbt_read_verify( 355xfs_allocbt_read_verify(
356 struct xfs_buf *bp) 356 struct xfs_buf *bp)
357{ 357{
358 if (!(xfs_btree_sblock_verify_crc(bp) && 358 if (!xfs_btree_sblock_verify_crc(bp))
359 xfs_allocbt_verify(bp))) { 359 xfs_buf_ioerror(bp, EFSBADCRC);
360 trace_xfs_btree_corrupt(bp, _RET_IP_); 360 else if (!xfs_allocbt_verify(bp))
361 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
362 bp->b_target->bt_mount, bp->b_addr);
363 xfs_buf_ioerror(bp, EFSCORRUPTED); 361 xfs_buf_ioerror(bp, EFSCORRUPTED);
362
363 if (bp->b_error) {
364 trace_xfs_btree_corrupt(bp, _RET_IP_);
365 xfs_verifier_error(bp);
364 } 366 }
365} 367}
366 368
@@ -370,9 +372,9 @@ xfs_allocbt_write_verify(
370{ 372{
371 if (!xfs_allocbt_verify(bp)) { 373 if (!xfs_allocbt_verify(bp)) {
372 trace_xfs_btree_corrupt(bp, _RET_IP_); 374 trace_xfs_btree_corrupt(bp, _RET_IP_);
373 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
374 bp->b_target->bt_mount, bp->b_addr);
375 xfs_buf_ioerror(bp, EFSCORRUPTED); 375 xfs_buf_ioerror(bp, EFSCORRUPTED);
376 xfs_verifier_error(bp);
377 return;
376 } 378 }
377 xfs_btree_sblock_calc_crc(bp); 379 xfs_btree_sblock_calc_crc(bp);
378 380
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index db2cfb067d0b..75df77d09f75 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -632,38 +632,46 @@ xfs_map_at_offset(
632} 632}
633 633
634/* 634/*
635 * Test if a given page is suitable for writing as part of an unwritten 635 * Test if a given page contains at least one buffer of a given @type.
636 * or delayed allocate extent. 636 * If @check_all_buffers is true, then we walk all the buffers in the page to
637 * try to find one of the type passed in. If it is not set, then the caller only
638 * needs to check the first buffer on the page for a match.
637 */ 639 */
638STATIC int 640STATIC bool
639xfs_check_page_type( 641xfs_check_page_type(
640 struct page *page, 642 struct page *page,
641 unsigned int type) 643 unsigned int type,
644 bool check_all_buffers)
642{ 645{
643 if (PageWriteback(page)) 646 struct buffer_head *bh;
644 return 0; 647 struct buffer_head *head;
645 648
646 if (page->mapping && page_has_buffers(page)) { 649 if (PageWriteback(page))
647 struct buffer_head *bh, *head; 650 return false;
648 int acceptable = 0; 651 if (!page->mapping)
652 return false;
653 if (!page_has_buffers(page))
654 return false;
649 655
650 bh = head = page_buffers(page); 656 bh = head = page_buffers(page);
651 do { 657 do {
652 if (buffer_unwritten(bh)) 658 if (buffer_unwritten(bh)) {
653 acceptable += (type == XFS_IO_UNWRITTEN); 659 if (type == XFS_IO_UNWRITTEN)
654 else if (buffer_delay(bh)) 660 return true;
655 acceptable += (type == XFS_IO_DELALLOC); 661 } else if (buffer_delay(bh)) {
656 else if (buffer_dirty(bh) && buffer_mapped(bh)) 662 if (type == XFS_IO_DELALLOC)
657 acceptable += (type == XFS_IO_OVERWRITE); 663 return true;
658 else 664 } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
659 break; 665 if (type == XFS_IO_OVERWRITE)
660 } while ((bh = bh->b_this_page) != head); 666 return true;
667 }
661 668
662 if (acceptable) 669 /* If we are only checking the first buffer, we are done now. */
663 return 1; 670 if (!check_all_buffers)
664 } 671 break;
672 } while ((bh = bh->b_this_page) != head);
665 673
666 return 0; 674 return false;
667} 675}
668 676
669/* 677/*
@@ -697,7 +705,7 @@ xfs_convert_page(
697 goto fail_unlock_page; 705 goto fail_unlock_page;
698 if (page->mapping != inode->i_mapping) 706 if (page->mapping != inode->i_mapping)
699 goto fail_unlock_page; 707 goto fail_unlock_page;
700 if (!xfs_check_page_type(page, (*ioendp)->io_type)) 708 if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
701 goto fail_unlock_page; 709 goto fail_unlock_page;
702 710
703 /* 711 /*
@@ -742,6 +750,15 @@ xfs_convert_page(
742 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; 750 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
743 page_dirty = p_offset / len; 751 page_dirty = p_offset / len;
744 752
753 /*
754 * The moment we find a buffer that doesn't match our current type
755 * specification or can't be written, abort the loop and start
756 * writeback. As per the above xfs_imap_valid() check, only
757 * xfs_vm_writepage() can handle partial page writeback fully - we are
758 * limited here to the buffers that are contiguous with the current
759 * ioend, and hence a buffer we can't write breaks that contiguity and
760 * we have to defer the rest of the IO to xfs_vm_writepage().
761 */
745 bh = head = page_buffers(page); 762 bh = head = page_buffers(page);
746 do { 763 do {
747 if (offset >= end_offset) 764 if (offset >= end_offset)
@@ -750,7 +767,7 @@ xfs_convert_page(
750 uptodate = 0; 767 uptodate = 0;
751 if (!(PageUptodate(page) || buffer_uptodate(bh))) { 768 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
752 done = 1; 769 done = 1;
753 continue; 770 break;
754 } 771 }
755 772
756 if (buffer_unwritten(bh) || buffer_delay(bh) || 773 if (buffer_unwritten(bh) || buffer_delay(bh) ||
@@ -762,10 +779,11 @@ xfs_convert_page(
762 else 779 else
763 type = XFS_IO_OVERWRITE; 780 type = XFS_IO_OVERWRITE;
764 781
765 if (!xfs_imap_valid(inode, imap, offset)) { 782 /*
766 done = 1; 783 * imap should always be valid because of the above
767 continue; 784 * partial page end_offset check on the imap.
768 } 785 */
786 ASSERT(xfs_imap_valid(inode, imap, offset));
769 787
770 lock_buffer(bh); 788 lock_buffer(bh);
771 if (type != XFS_IO_OVERWRITE) 789 if (type != XFS_IO_OVERWRITE)
@@ -777,6 +795,7 @@ xfs_convert_page(
777 count++; 795 count++;
778 } else { 796 } else {
779 done = 1; 797 done = 1;
798 break;
780 } 799 }
781 } while (offset += len, (bh = bh->b_this_page) != head); 800 } while (offset += len, (bh = bh->b_this_page) != head);
782 801
@@ -868,7 +887,7 @@ xfs_aops_discard_page(
868 struct buffer_head *bh, *head; 887 struct buffer_head *bh, *head;
869 loff_t offset = page_offset(page); 888 loff_t offset = page_offset(page);
870 889
871 if (!xfs_check_page_type(page, XFS_IO_DELALLOC)) 890 if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
872 goto out_invalidate; 891 goto out_invalidate;
873 892
874 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 893 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1441,7 +1460,8 @@ xfs_vm_direct_IO(
1441 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1460 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1442 offset, nr_segs, 1461 offset, nr_segs,
1443 xfs_get_blocks_direct, 1462 xfs_get_blocks_direct,
1444 xfs_end_io_direct_write, NULL, 0); 1463 xfs_end_io_direct_write, NULL,
1464 DIO_ASYNC_EXTEND);
1445 if (ret != -EIOCBQUEUED && iocb->private) 1465 if (ret != -EIOCBQUEUED && iocb->private)
1446 goto out_destroy_ioend; 1466 goto out_destroy_ioend;
1447 } else { 1467 } else {
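
[Editor's note: the reworked xfs_check_page_type() above returns on the first matching buffer and only walks the whole page when asked. A condensed sketch of that walk; buffer_matches_type() is a hypothetical stand-in for the unwritten/delalloc/overwrite checks.]

static bool page_has_buffer_of_type(struct page *page, unsigned int type,
				    bool check_all_buffers)
{
	struct buffer_head *bh, *head;

	if (PageWriteback(page) || !page->mapping || !page_has_buffers(page))
		return false;

	bh = head = page_buffers(page);
	do {
		if (buffer_matches_type(bh, type))	/* hypothetical */
			return true;
		/* callers that only care about the first buffer stop here */
		if (!check_all_buffers)
			break;
	} while ((bh = bh->b_this_page) != head);

	return false;
}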
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 7b126f46a2f9..fe9587fab17a 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -213,8 +213,8 @@ xfs_attr3_leaf_write_verify(
213 struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; 213 struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
214 214
215 if (!xfs_attr3_leaf_verify(bp)) { 215 if (!xfs_attr3_leaf_verify(bp)) {
216 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
217 xfs_buf_ioerror(bp, EFSCORRUPTED); 216 xfs_buf_ioerror(bp, EFSCORRUPTED);
217 xfs_verifier_error(bp);
218 return; 218 return;
219 } 219 }
220 220
@@ -224,7 +224,7 @@ xfs_attr3_leaf_write_verify(
224 if (bip) 224 if (bip)
225 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 225 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
226 226
227 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF); 227 xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
228} 228}
229 229
230/* 230/*
@@ -239,13 +239,14 @@ xfs_attr3_leaf_read_verify(
239{ 239{
240 struct xfs_mount *mp = bp->b_target->bt_mount; 240 struct xfs_mount *mp = bp->b_target->bt_mount;
241 241
242 if ((xfs_sb_version_hascrc(&mp->m_sb) && 242 if (xfs_sb_version_hascrc(&mp->m_sb) &&
243 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 243 !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
244 XFS_ATTR3_LEAF_CRC_OFF)) || 244 xfs_buf_ioerror(bp, EFSBADCRC);
245 !xfs_attr3_leaf_verify(bp)) { 245 else if (!xfs_attr3_leaf_verify(bp))
246 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
247 xfs_buf_ioerror(bp, EFSCORRUPTED); 246 xfs_buf_ioerror(bp, EFSCORRUPTED);
248 } 247
248 if (bp->b_error)
249 xfs_verifier_error(bp);
249} 250}
250 251
251const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { 252const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 5549d69ddb45..6e37823e2932 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -125,7 +125,6 @@ xfs_attr3_rmt_read_verify(
125 struct xfs_mount *mp = bp->b_target->bt_mount; 125 struct xfs_mount *mp = bp->b_target->bt_mount;
126 char *ptr; 126 char *ptr;
127 int len; 127 int len;
128 bool corrupt = false;
129 xfs_daddr_t bno; 128 xfs_daddr_t bno;
130 129
131 /* no verification of non-crc buffers */ 130 /* no verification of non-crc buffers */
@@ -140,11 +139,11 @@ xfs_attr3_rmt_read_verify(
140 while (len > 0) { 139 while (len > 0) {
141 if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), 140 if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
142 XFS_ATTR3_RMT_CRC_OFF)) { 141 XFS_ATTR3_RMT_CRC_OFF)) {
143 corrupt = true; 142 xfs_buf_ioerror(bp, EFSBADCRC);
144 break; 143 break;
145 } 144 }
146 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { 145 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
147 corrupt = true; 146 xfs_buf_ioerror(bp, EFSCORRUPTED);
148 break; 147 break;
149 } 148 }
150 len -= XFS_LBSIZE(mp); 149 len -= XFS_LBSIZE(mp);
@@ -152,10 +151,9 @@ xfs_attr3_rmt_read_verify(
152 bno += mp->m_bsize; 151 bno += mp->m_bsize;
153 } 152 }
154 153
155 if (corrupt) { 154 if (bp->b_error)
156 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 155 xfs_verifier_error(bp);
157 xfs_buf_ioerror(bp, EFSCORRUPTED); 156 else
158 } else
159 ASSERT(len == 0); 157 ASSERT(len == 0);
160} 158}
161 159
@@ -180,9 +178,8 @@ xfs_attr3_rmt_write_verify(
180 178
181 while (len > 0) { 179 while (len > 0) {
182 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { 180 if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
183 XFS_CORRUPTION_ERROR(__func__,
184 XFS_ERRLEVEL_LOW, mp, bp->b_addr);
185 xfs_buf_ioerror(bp, EFSCORRUPTED); 181 xfs_buf_ioerror(bp, EFSCORRUPTED);
182 xfs_verifier_error(bp);
186 return; 183 return;
187 } 184 }
188 if (bip) { 185 if (bip) {
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 152543c4ca70..5b6092ef51ef 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5378,3 +5378,196 @@ error0:
5378 } 5378 }
5379 return error; 5379 return error;
5380} 5380}
5381
5382/*
5383 * Shift extent records to the left to cover a hole.
5384 *
5385 * The maximum number of extents to be shifted in a single operation
5386 * is @num_exts, and @current_ext keeps track of the current extent
5387 * index we have shifted. @offset_shift_fsb is the length by which each
5388 * extent is shifted. If there is no hole to shift the extents
5389 * into, this is considered an invalid operation and we abort immediately.
5390 */
5391int
5392xfs_bmap_shift_extents(
5393 struct xfs_trans *tp,
5394 struct xfs_inode *ip,
5395 int *done,
5396 xfs_fileoff_t start_fsb,
5397 xfs_fileoff_t offset_shift_fsb,
5398 xfs_extnum_t *current_ext,
5399 xfs_fsblock_t *firstblock,
5400 struct xfs_bmap_free *flist,
5401 int num_exts)
5402{
5403 struct xfs_btree_cur *cur;
5404 struct xfs_bmbt_rec_host *gotp;
5405 struct xfs_bmbt_irec got;
5406 struct xfs_bmbt_irec left;
5407 struct xfs_mount *mp = ip->i_mount;
5408 struct xfs_ifork *ifp;
5409 xfs_extnum_t nexts = 0;
5410 xfs_fileoff_t startoff;
5411 int error = 0;
5412 int i;
5413 int whichfork = XFS_DATA_FORK;
5414 int logflags;
5415 xfs_filblks_t blockcount = 0;
5416
5417 if (unlikely(XFS_TEST_ERROR(
5418 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
5419 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
5420 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
5421 XFS_ERROR_REPORT("xfs_bmap_shift_extents",
5422 XFS_ERRLEVEL_LOW, mp);
5423 return XFS_ERROR(EFSCORRUPTED);
5424 }
5425
5426 if (XFS_FORCED_SHUTDOWN(mp))
5427 return XFS_ERROR(EIO);
5428
5429 ASSERT(current_ext != NULL);
5430
5431 ifp = XFS_IFORK_PTR(ip, whichfork);
5432
5433 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
5434 /* Read in all the extents */
5435 error = xfs_iread_extents(tp, ip, whichfork);
5436 if (error)
5437 return error;
5438 }
5439
5440 /*
5441 * If *current_ext is 0, we need to look up the extent
5442 * at which shifting starts and store it in gotp.
5443 */
5444 if (!*current_ext) {
5445 gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
5446 /*
5447 * gotp can be null in 2 cases: 1) if there are no extents
5448 * or 2) start_fsb lies in a hole beyond which there are
5449 * no extents. Either way, we are done.
5450 */
5451 if (!gotp) {
5452 *done = 1;
5453 return 0;
5454 }
5455 }
5456
5457 /* We are going to change core inode */
5458 logflags = XFS_ILOG_CORE;
5459
5460 if (ifp->if_flags & XFS_IFBROOT) {
5461 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5462 cur->bc_private.b.firstblock = *firstblock;
5463 cur->bc_private.b.flist = flist;
5464 cur->bc_private.b.flags = 0;
5465 } else {
5466 cur = NULL;
5467 logflags |= XFS_ILOG_DEXT;
5468 }
5469
5470 while (nexts++ < num_exts &&
5471 *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) {
5472
5473 gotp = xfs_iext_get_ext(ifp, *current_ext);
5474 xfs_bmbt_get_all(gotp, &got);
5475 startoff = got.br_startoff - offset_shift_fsb;
5476
5477 /*
5478 * Before shifting extent into hole, make sure that the hole
5479 * is large enough to accomodate the shift.
5480 */
5481 if (*current_ext) {
5482 xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
5483 *current_ext - 1), &left);
5484
5485 if (startoff < left.br_startoff + left.br_blockcount)
5486 error = XFS_ERROR(EINVAL);
5487 } else if (offset_shift_fsb > got.br_startoff) {
5488 /*
5489 * When the first extent is shifted, offset_shift_fsb
5490 * must not exceed the starting offset of
5491 * the first extent.
5492 */
5493 error = XFS_ERROR(EINVAL);
5494 }
5495
5496 if (error)
5497 goto del_cursor;
5498
5499 if (cur) {
5500 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5501 got.br_startblock,
5502 got.br_blockcount,
5503 &i);
5504 if (error)
5505 goto del_cursor;
5506 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5507 }
5508
5509 /* Check if we can merge 2 adjacent extents */
5510 if (*current_ext &&
5511 left.br_startoff + left.br_blockcount == startoff &&
5512 left.br_startblock + left.br_blockcount ==
5513 got.br_startblock &&
5514 left.br_state == got.br_state &&
5515 left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
5516 blockcount = left.br_blockcount +
5517 got.br_blockcount;
5518 xfs_iext_remove(ip, *current_ext, 1, 0);
5519 if (cur) {
5520 error = xfs_btree_delete(cur, &i);
5521 if (error)
5522 goto del_cursor;
5523 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5524 }
5525 XFS_IFORK_NEXT_SET(ip, whichfork,
5526 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
5527 gotp = xfs_iext_get_ext(ifp, --*current_ext);
5528 xfs_bmbt_get_all(gotp, &got);
5529
5530 /* Make cursor point to the extent we will update */
5531 if (cur) {
5532 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5533 got.br_startblock,
5534 got.br_blockcount,
5535 &i);
5536 if (error)
5537 goto del_cursor;
5538 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5539 }
5540
5541 xfs_bmbt_set_blockcount(gotp, blockcount);
5542 got.br_blockcount = blockcount;
5543 } else {
5544 /* We have to update the startoff */
5545 xfs_bmbt_set_startoff(gotp, startoff);
5546 got.br_startoff = startoff;
5547 }
5548
5549 if (cur) {
5550 error = xfs_bmbt_update(cur, got.br_startoff,
5551 got.br_startblock,
5552 got.br_blockcount,
5553 got.br_state);
5554 if (error)
5555 goto del_cursor;
5556 }
5557
5558 (*current_ext)++;
5559 }
5560
5561 /* Check if we are done */
5562 if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork))
5563 *done = 1;
5564
5565del_cursor:
5566 if (cur)
5567 xfs_btree_del_cursor(cur,
5568 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5569
5570 xfs_trans_log_inode(tp, ip, logflags);
5571
5572 return error;
5573}
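
[Editor's note: to make the shift logic above easier to follow, here is a userspace toy model of one iteration: validate that the hole to the left can absorb the shift, then either merge with a physically contiguous left neighbour or simply move the record. Types and names are illustrative only.]

#include <stdbool.h>
#include <stddef.h>

struct ext { unsigned long off, block, len; };	/* toy extent record */

static bool shift_left_one(struct ext *e, size_t i, unsigned long shift)
{
	unsigned long newoff = e[i].off - shift;

	/* hole too small: shifted extent would overlap its left neighbour */
	if (i > 0 && newoff < e[i - 1].off + e[i - 1].len)
		return false;			/* kernel returns EINVAL */
	/* first extent cannot be shifted past file offset zero */
	if (i == 0 && shift > e[i].off)
		return false;

	if (i > 0 && e[i - 1].off + e[i - 1].len == newoff &&
	    e[i - 1].block + e[i - 1].len == e[i].block) {
		e[i - 1].len += e[i].len;	/* merge with left neighbour */
		e[i].len = 0;			/* record would be removed */
	} else {
		e[i].off = newoff;		/* plain move */
	}
	return true;
}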
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 33b41f351225..f84bd7af43be 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -127,6 +127,16 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
127 { BMAP_RIGHT_FILLING, "RF" }, \ 127 { BMAP_RIGHT_FILLING, "RF" }, \
128 { BMAP_ATTRFORK, "ATTR" } 128 { BMAP_ATTRFORK, "ATTR" }
129 129
130
131/*
132 * This macro determines how many extents will be shifted in one
133 * write transaction. A single shift could require two splits:
134 * an extent move on the first and an extent merge on the second,
135 * so it is safest to shift only one extent inside a write
136 * transaction at a time.
137 */
138#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
139
130#ifdef DEBUG 140#ifdef DEBUG
131void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, 141void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
132 int whichfork, unsigned long caller_ip); 142 int whichfork, unsigned long caller_ip);
@@ -169,5 +179,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
169int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, 179int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
170 xfs_extnum_t num); 180 xfs_extnum_t num);
171uint xfs_default_attroffset(struct xfs_inode *ip); 181uint xfs_default_attroffset(struct xfs_inode *ip);
182int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
183 int *done, xfs_fileoff_t start_fsb,
184 xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
185 xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
186 int num_exts);
172 187
173#endif /* __XFS_BMAP_H__ */ 188#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 706bc3f777cb..818d546664e7 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -780,12 +780,14 @@ static void
780xfs_bmbt_read_verify( 780xfs_bmbt_read_verify(
781 struct xfs_buf *bp) 781 struct xfs_buf *bp)
782{ 782{
783 if (!(xfs_btree_lblock_verify_crc(bp) && 783 if (!xfs_btree_lblock_verify_crc(bp))
784 xfs_bmbt_verify(bp))) { 784 xfs_buf_ioerror(bp, EFSBADCRC);
785 trace_xfs_btree_corrupt(bp, _RET_IP_); 785 else if (!xfs_bmbt_verify(bp))
786 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
787 bp->b_target->bt_mount, bp->b_addr);
788 xfs_buf_ioerror(bp, EFSCORRUPTED); 786 xfs_buf_ioerror(bp, EFSCORRUPTED);
787
788 if (bp->b_error) {
789 trace_xfs_btree_corrupt(bp, _RET_IP_);
790 xfs_verifier_error(bp);
789 } 791 }
790} 792}
791 793
@@ -794,11 +796,9 @@ xfs_bmbt_write_verify(
794 struct xfs_buf *bp) 796 struct xfs_buf *bp)
795{ 797{
796 if (!xfs_bmbt_verify(bp)) { 798 if (!xfs_bmbt_verify(bp)) {
797 xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn);
798 trace_xfs_btree_corrupt(bp, _RET_IP_); 799 trace_xfs_btree_corrupt(bp, _RET_IP_);
799 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
800 bp->b_target->bt_mount, bp->b_addr);
801 xfs_buf_ioerror(bp, EFSCORRUPTED); 800 xfs_buf_ioerror(bp, EFSCORRUPTED);
801 xfs_verifier_error(bp);
802 return; 802 return;
803 } 803 }
804 xfs_btree_lblock_calc_crc(bp); 804 xfs_btree_lblock_calc_crc(bp);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index f264616080ca..01f6a646caa1 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1349,7 +1349,6 @@ xfs_free_file_space(
1349 * the freeing of the space succeeds at ENOSPC. 1349 * the freeing of the space succeeds at ENOSPC.
1350 */ 1350 */
1351 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 1351 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1352 tp->t_flags |= XFS_TRANS_RESERVE;
1353 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); 1352 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
1354 1353
1355 /* 1354 /*
@@ -1468,6 +1467,102 @@ out:
1468} 1467}
1469 1468
1470/* 1469/*
1470 * xfs_collapse_file_space()
1471 * This routine frees disk space and shifts extents for the given file.
1472 * It first frees the data blocks in the specified range by calling
1473 * xfs_free_file_space(), which also syncs dirty data and invalidates
1474 * the page cache over the region being collapsed. It then shifts the
1475 * remaining extent records to the left to cover the hole.
1476 * RETURNS:
1477 * 0 on success
1478 * errno on error
1479 *
1480 */
1481int
1482xfs_collapse_file_space(
1483 struct xfs_inode *ip,
1484 xfs_off_t offset,
1485 xfs_off_t len)
1486{
1487 int done = 0;
1488 struct xfs_mount *mp = ip->i_mount;
1489 struct xfs_trans *tp;
1490 int error;
1491 xfs_extnum_t current_ext = 0;
1492 struct xfs_bmap_free free_list;
1493 xfs_fsblock_t first_block;
1494 int committed;
1495 xfs_fileoff_t start_fsb;
1496 xfs_fileoff_t shift_fsb;
1497
1498 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1499
1500 trace_xfs_collapse_file_space(ip);
1501
1502 start_fsb = XFS_B_TO_FSB(mp, offset + len);
1503 shift_fsb = XFS_B_TO_FSB(mp, len);
1504
1505 error = xfs_free_file_space(ip, offset, len);
1506 if (error)
1507 return error;
1508
1509 while (!error && !done) {
1510 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1511 tp->t_flags |= XFS_TRANS_RESERVE;
1512 /*
1513 * We need to reserve permanent blocks for the transaction:
1514 * after shifting an extent into the hole, adjacent extents
1515 * may turn out to be mergeable, which can lead to freeing
1516 * a block during the record update.
1517 */
1518 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1519 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
1520 if (error) {
1521 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1522 xfs_trans_cancel(tp, 0);
1523 break;
1524 }
1525
1526 xfs_ilock(ip, XFS_ILOCK_EXCL);
1527 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
1528 ip->i_gdquot, ip->i_pdquot,
1529 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
1530 XFS_QMOPT_RES_REGBLKS);
1531 if (error)
1532 goto out;
1533
1534 xfs_trans_ijoin(tp, ip, 0);
1535
1536 xfs_bmap_init(&free_list, &first_block);
1537
1538 /*
1539 * We are using the write transaction, in which at most 2 bmbt
1540 * updates are allowed.
1541 */
1542 error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
1543 shift_fsb, &current_ext,
1544 &first_block, &free_list,
1545 XFS_BMAP_MAX_SHIFT_EXTENTS);
1546 if (error)
1547 goto out;
1548
1549 error = xfs_bmap_finish(&tp, &free_list, &committed);
1550 if (error)
1551 goto out;
1552
1553 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1554 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1555 }
1556
1557 return error;
1558
1559out:
1560 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1561 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1562 return error;
1563}
1564
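[Editor's note: xfs_collapse_file_space() above is the XFS backend for the FALLOC_FL_COLLAPSE_RANGE fallocate operation introduced alongside this series (the VFS wiring is outside this fs/xfs diff, so treat that linkage as an assumption from context). From userspace the whole loop reduces to one call; offset and len must be multiples of the filesystem block size. A minimal usage sketch:]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <linux/falloc.h>

/* remove [offset, offset+len) from the file and shift the tail down */
static int collapse_range(int fd, off_t offset, off_t len)
{
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, offset, len) < 0) {
		perror("fallocate(FALLOC_FL_COLLAPSE_RANGE)");
		return -1;
	}
	return 0;	/* file is now len bytes shorter */
}
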
1565/*
1471 * We need to check that the format of the data fork in the temporary inode is 1566 * We need to check that the format of the data fork in the temporary inode is
1472 * valid for the target inode before doing the swap. This is not a problem with 1567 * valid for the target inode before doing the swap. This is not a problem with
1473 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized 1568 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 900747b25772..935ed2b24edf 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -99,6 +99,8 @@ int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
99 xfs_off_t len); 99 xfs_off_t len);
100int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, 100int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
101 xfs_off_t len); 101 xfs_off_t len);
102int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
103 xfs_off_t len);
102 104
103/* EOF block manipulation functions */ 105/* EOF block manipulation functions */
104bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); 106bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 9adaae4f3e2f..e80d59fdf89a 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -234,8 +234,7 @@ xfs_btree_lblock_calc_crc(
234 return; 234 return;
235 if (bip) 235 if (bip)
236 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 236 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
237 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 237 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
238 XFS_BTREE_LBLOCK_CRC_OFF);
239} 238}
240 239
241bool 240bool
@@ -243,8 +242,8 @@ xfs_btree_lblock_verify_crc(
243 struct xfs_buf *bp) 242 struct xfs_buf *bp)
244{ 243{
245 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 244 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
246 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 245 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
247 XFS_BTREE_LBLOCK_CRC_OFF); 246
248 return true; 247 return true;
249} 248}
250 249
@@ -267,8 +266,7 @@ xfs_btree_sblock_calc_crc(
267 return; 266 return;
268 if (bip) 267 if (bip)
269 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 268 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
270 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 269 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
271 XFS_BTREE_SBLOCK_CRC_OFF);
272} 270}
273 271
274bool 272bool
@@ -276,8 +274,8 @@ xfs_btree_sblock_verify_crc(
276 struct xfs_buf *bp) 274 struct xfs_buf *bp)
277{ 275{
278 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 276 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
279 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 277 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
280 XFS_BTREE_SBLOCK_CRC_OFF); 278
281 return true; 279 return true;
282} 280}
283 281
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9c061ef2b0d9..107f2fdfe41f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -396,7 +396,17 @@ _xfs_buf_map_pages(
396 bp->b_addr = NULL; 396 bp->b_addr = NULL;
397 } else { 397 } else {
398 int retried = 0; 398 int retried = 0;
399 unsigned noio_flag;
399 400
401 /*
402 * vm_map_ram() will allocate auxiliary structures (e.g.
403 * pagetables) with GFP_KERNEL, yet we are likely to be under
404 * GFP_NOFS context here. Hence we need to tell memory reclaim
405 * that we are in such a context via PF_MEMALLOC_NOIO to prevent
406 * memory reclaim re-entering the filesystem here and
407 * potentially deadlocking.
408 */
409 noio_flag = memalloc_noio_save();
400 do { 410 do {
401 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 411 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
402 -1, PAGE_KERNEL); 412 -1, PAGE_KERNEL);
@@ -404,6 +414,7 @@ _xfs_buf_map_pages(
404 break; 414 break;
405 vm_unmap_aliases(); 415 vm_unmap_aliases();
406 } while (retried++ <= 1); 416 } while (retried++ <= 1);
417 memalloc_noio_restore(noio_flag);
407 418
408 if (!bp->b_addr) 419 if (!bp->b_addr)
409 return -ENOMEM; 420 return -ENOMEM;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 995339534db6..b8a3abf6cf47 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -369,6 +369,20 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
369 xfs_buf_rele(bp); 369 xfs_buf_rele(bp);
370} 370}
371 371
372static inline int
373xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
374{
375 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
376 cksum_offset);
377}
378
379static inline void
380xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
381{
382 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
383 cksum_offset);
384}
385
372/* 386/*
373 * Handling of buftargs. 387 * Handling of buftargs.
374 */ 388 */
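
[Editor's note: the two inline helpers above fold the recurring (bp->b_addr, BBTOB(bp->b_length), offset) triple into one call, which is what lets every verifier in this diff shrink to a single line. Call-site shape, taken from the hunks above:]

/* read side */
if (!xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
	xfs_buf_ioerror(bp, EFSBADCRC);

/* write side, after the LSN is stamped */
xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);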
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 33149113e333..8752821443be 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -796,20 +796,6 @@ xfs_buf_item_init(
796 bip->bli_formats[i].blf_map_size = map_size; 796 bip->bli_formats[i].blf_map_size = map_size;
797 } 797 }
798 798
799#ifdef XFS_TRANS_DEBUG
800 /*
801 * Allocate the arrays for tracking what needs to be logged
802 * and what our callers request to be logged. bli_orig
803 * holds a copy of the original, clean buffer for comparison
804 * against, and bli_logged keeps a 1 bit flag per byte in
805 * the buffer to indicate which bytes the callers have asked
806 * to have logged.
807 */
808 bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
809 memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
810 bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
811#endif
812
813 /* 799 /*
814 * Put the buf item into the list of items attached to the 800 * Put the buf item into the list of items attached to the
815 * buffer at the front. 801 * buffer at the front.
@@ -957,11 +943,6 @@ STATIC void
957xfs_buf_item_free( 943xfs_buf_item_free(
958 xfs_buf_log_item_t *bip) 944 xfs_buf_log_item_t *bip)
959{ 945{
960#ifdef XFS_TRANS_DEBUG
961 kmem_free(bip->bli_orig);
962 kmem_free(bip->bli_logged);
963#endif /* XFS_TRANS_DEBUG */
964
965 xfs_buf_item_free_format(bip); 946 xfs_buf_item_free_format(bip);
966 kmem_zone_free(xfs_buf_item_zone, bip); 947 kmem_zone_free(xfs_buf_item_zone, bip);
967} 948}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 796272a2e129..6cc5f6785a77 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -185,8 +185,8 @@ xfs_da3_node_write_verify(
185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
186 186
187 if (!xfs_da3_node_verify(bp)) { 187 if (!xfs_da3_node_verify(bp)) {
188 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
189 xfs_buf_ioerror(bp, EFSCORRUPTED); 188 xfs_buf_ioerror(bp, EFSCORRUPTED);
189 xfs_verifier_error(bp);
190 return; 190 return;
191 } 191 }
192 192
@@ -196,7 +196,7 @@ xfs_da3_node_write_verify(
196 if (bip) 196 if (bip)
197 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 197 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
198 198
199 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF); 199 xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
200} 200}
201 201
202/* 202/*
@@ -209,18 +209,20 @@ static void
209xfs_da3_node_read_verify( 209xfs_da3_node_read_verify(
210 struct xfs_buf *bp) 210 struct xfs_buf *bp)
211{ 211{
212 struct xfs_mount *mp = bp->b_target->bt_mount;
213 struct xfs_da_blkinfo *info = bp->b_addr; 212 struct xfs_da_blkinfo *info = bp->b_addr;
214 213
215 switch (be16_to_cpu(info->magic)) { 214 switch (be16_to_cpu(info->magic)) {
216 case XFS_DA3_NODE_MAGIC: 215 case XFS_DA3_NODE_MAGIC:
217 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 216 if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
218 XFS_DA3_NODE_CRC_OFF)) 217 xfs_buf_ioerror(bp, EFSBADCRC);
219 break; 218 break;
219 }
220 /* fall through */ 220 /* fall through */
221 case XFS_DA_NODE_MAGIC: 221 case XFS_DA_NODE_MAGIC:
222 if (!xfs_da3_node_verify(bp)) 222 if (!xfs_da3_node_verify(bp)) {
223 xfs_buf_ioerror(bp, EFSCORRUPTED);
223 break; 224 break;
225 }
224 return; 226 return;
225 case XFS_ATTR_LEAF_MAGIC: 227 case XFS_ATTR_LEAF_MAGIC:
226 case XFS_ATTR3_LEAF_MAGIC: 228 case XFS_ATTR3_LEAF_MAGIC:
@@ -237,8 +239,7 @@ xfs_da3_node_read_verify(
237 } 239 }
238 240
239 /* corrupt block */ 241 /* corrupt block */
240 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 242 xfs_verifier_error(bp);
241 xfs_buf_ioerror(bp, EFSCORRUPTED);
242} 243}
243 244
244const struct xfs_buf_ops xfs_da3_node_buf_ops = { 245const struct xfs_buf_ops xfs_da3_node_buf_ops = {
@@ -1295,7 +1296,7 @@ xfs_da3_fixhashpath(
1295 node = blk->bp->b_addr; 1296 node = blk->bp->b_addr;
1296 dp->d_ops->node_hdr_from_disk(&nodehdr, node); 1297 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1297 btree = dp->d_ops->node_tree_p(node); 1298 btree = dp->d_ops->node_tree_p(node);
1298 if (be32_to_cpu(btree->hashval) == lasthash) 1299 if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
1299 break; 1300 break;
1300 blk->hashval = lasthash; 1301 blk->hashval = lasthash;
1301 btree[blk->index].hashval = cpu_to_be32(lasthash); 1302 btree[blk->index].hashval = cpu_to_be32(lasthash);
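
[Editor's note: the one-line change in xfs_da3_fixhashpath() above is a genuine bug fix: the old code dereferenced the array base (btree->hashval, i.e. entry 0) when it meant the entry at the current path index. In miniature, with a hypothetical entry type:]

struct demo_entry { unsigned int hashval; };

static int demo_hash_matches(struct demo_entry *entries, int index,
			     unsigned int lasthash)
{
	/* buggy form compared entries->hashval, i.e. entries[0].hashval */
	return entries[index].hashval == lasthash;
}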
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5869b50dc41..623bbe8fd921 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -89,6 +89,8 @@ typedef struct xfs_dinode {
89 /* structure must be padded to 64 bit alignment */ 89 /* structure must be padded to 64 bit alignment */
90} xfs_dinode_t; 90} xfs_dinode_t;
91 91
92#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
93
92#define DI_MAX_FLUSH 0xffff 94#define DI_MAX_FLUSH 0xffff
93 95
94/* 96/*
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index ce16ef02997a..fda46253966a 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -180,16 +180,23 @@ xfs_dir_init(
180 xfs_inode_t *dp, 180 xfs_inode_t *dp,
181 xfs_inode_t *pdp) 181 xfs_inode_t *pdp)
182{ 182{
183 xfs_da_args_t args; 183 struct xfs_da_args *args;
184 int error; 184 int error;
185 185
186 memset((char *)&args, 0, sizeof(args));
187 args.dp = dp;
188 args.trans = tp;
189 ASSERT(S_ISDIR(dp->i_d.di_mode)); 186 ASSERT(S_ISDIR(dp->i_d.di_mode));
190 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) 187 error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
188 if (error)
191 return error; 189 return error;
192 return xfs_dir2_sf_create(&args, pdp->i_ino); 190
191 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
192 if (!args)
193 return ENOMEM;
194
195 args->dp = dp;
196 args->trans = tp;
197 error = xfs_dir2_sf_create(args, pdp->i_ino);
198 kmem_free(args);
199 return error;
193} 200}
194 201
195/* 202/*
@@ -205,41 +212,56 @@ xfs_dir_createname(
205 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 212 xfs_bmap_free_t *flist, /* bmap's freeblock list */
206 xfs_extlen_t total) /* bmap's total block count */ 213 xfs_extlen_t total) /* bmap's total block count */
207{ 214{
208 xfs_da_args_t args; 215 struct xfs_da_args *args;
209 int rval; 216 int rval;
210 int v; /* type-checking value */ 217 int v; /* type-checking value */
211 218
212 ASSERT(S_ISDIR(dp->i_d.di_mode)); 219 ASSERT(S_ISDIR(dp->i_d.di_mode));
213 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 220 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
221 if (rval)
214 return rval; 222 return rval;
215 XFS_STATS_INC(xs_dir_create); 223 XFS_STATS_INC(xs_dir_create);
216 224
217 memset(&args, 0, sizeof(xfs_da_args_t)); 225 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
218 args.name = name->name; 226 if (!args)
219 args.namelen = name->len; 227 return ENOMEM;
220 args.filetype = name->type; 228
221 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 229 args->name = name->name;
222 args.inumber = inum; 230 args->namelen = name->len;
223 args.dp = dp; 231 args->filetype = name->type;
224 args.firstblock = first; 232 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
225 args.flist = flist; 233 args->inumber = inum;
226 args.total = total; 234 args->dp = dp;
227 args.whichfork = XFS_DATA_FORK; 235 args->firstblock = first;
228 args.trans = tp; 236 args->flist = flist;
229 args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 237 args->total = total;
230 238 args->whichfork = XFS_DATA_FORK;
231 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 239 args->trans = tp;
232 rval = xfs_dir2_sf_addname(&args); 240 args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
233 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 241
234 return rval; 242 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
235 else if (v) 243 rval = xfs_dir2_sf_addname(args);
236 rval = xfs_dir2_block_addname(&args); 244 goto out_free;
237 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 245 }
238 return rval; 246
239 else if (v) 247 rval = xfs_dir2_isblock(tp, dp, &v);
240 rval = xfs_dir2_leaf_addname(&args); 248 if (rval)
249 goto out_free;
250 if (v) {
251 rval = xfs_dir2_block_addname(args);
252 goto out_free;
253 }
254
255 rval = xfs_dir2_isleaf(tp, dp, &v);
256 if (rval)
257 goto out_free;
258 if (v)
259 rval = xfs_dir2_leaf_addname(args);
241 else 260 else
242 rval = xfs_dir2_node_addname(&args); 261 rval = xfs_dir2_node_addname(args);
262
263out_free:
264 kmem_free(args);
243 return rval; 265 return rval;
244} 266}
245 267
@@ -282,46 +304,66 @@ xfs_dir_lookup(
282 xfs_ino_t *inum, /* out: inode number */ 304 xfs_ino_t *inum, /* out: inode number */
283 struct xfs_name *ci_name) /* out: actual name if CI match */ 305 struct xfs_name *ci_name) /* out: actual name if CI match */
284{ 306{
285 xfs_da_args_t args; 307 struct xfs_da_args *args;
286 int rval; 308 int rval;
287 int v; /* type-checking value */ 309 int v; /* type-checking value */
288 310
289 ASSERT(S_ISDIR(dp->i_d.di_mode)); 311 ASSERT(S_ISDIR(dp->i_d.di_mode));
290 XFS_STATS_INC(xs_dir_lookup); 312 XFS_STATS_INC(xs_dir_lookup);
291 313
292 memset(&args, 0, sizeof(xfs_da_args_t)); 314 /*
293 args.name = name->name; 315 * We need to use KM_NOFS here so that lockdep will not throw false
294 args.namelen = name->len; 316 * positive deadlock warnings on a non-transactional lookup path. It is
295 args.filetype = name->type; 317 * safe to recurse into inode reclaim in that case, but lockdep can't
296 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 318 * easily be taught about it. Hence KM_NOFS avoids having to add a
297 args.dp = dp; 319 * bunch of lockdep class
298 args.whichfork = XFS_DATA_FORK; 320 * annotations into the reclaim path for the ilock.
299 args.trans = tp; 321 */
300 args.op_flags = XFS_DA_OP_OKNOENT; 322 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
323 args->name = name->name;
324 args->namelen = name->len;
325 args->filetype = name->type;
326 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
327 args->dp = dp;
328 args->whichfork = XFS_DATA_FORK;
329 args->trans = tp;
330 args->op_flags = XFS_DA_OP_OKNOENT;
301 if (ci_name) 331 if (ci_name)
302 args.op_flags |= XFS_DA_OP_CILOOKUP; 332 args->op_flags |= XFS_DA_OP_CILOOKUP;
303 333
304 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 334 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
305 rval = xfs_dir2_sf_lookup(&args); 335 rval = xfs_dir2_sf_lookup(args);
306 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 336 goto out_check_rval;
307 return rval; 337 }
308 else if (v) 338
309 rval = xfs_dir2_block_lookup(&args); 339 rval = xfs_dir2_isblock(tp, dp, &v);
310 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 340 if (rval)
311 return rval; 341 goto out_free;
312 else if (v) 342 if (v) {
313 rval = xfs_dir2_leaf_lookup(&args); 343 rval = xfs_dir2_block_lookup(args);
344 goto out_check_rval;
345 }
346
347 rval = xfs_dir2_isleaf(tp, dp, &v);
348 if (rval)
349 goto out_free;
350 if (v)
351 rval = xfs_dir2_leaf_lookup(args);
314 else 352 else
315 rval = xfs_dir2_node_lookup(&args); 353 rval = xfs_dir2_node_lookup(args);
354
355out_check_rval:
316 if (rval == EEXIST) 356 if (rval == EEXIST)
317 rval = 0; 357 rval = 0;
318 if (!rval) { 358 if (!rval) {
319 *inum = args.inumber; 359 *inum = args->inumber;
320 if (ci_name) { 360 if (ci_name) {
321 ci_name->name = args.value; 361 ci_name->name = args->value;
322 ci_name->len = args.valuelen; 362 ci_name->len = args->valuelen;
323 } 363 }
324 } 364 }
365out_free:
366 kmem_free(args);
325 return rval; 367 return rval;
326} 368}
327 369
@@ -338,38 +380,51 @@ xfs_dir_removename(
338 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 380 xfs_bmap_free_t *flist, /* bmap's freeblock list */
339 xfs_extlen_t total) /* bmap's total block count */ 381 xfs_extlen_t total) /* bmap's total block count */
340{ 382{
341 xfs_da_args_t args; 383 struct xfs_da_args *args;
342 int rval; 384 int rval;
343 int v; /* type-checking value */ 385 int v; /* type-checking value */
344 386
345 ASSERT(S_ISDIR(dp->i_d.di_mode)); 387 ASSERT(S_ISDIR(dp->i_d.di_mode));
346 XFS_STATS_INC(xs_dir_remove); 388 XFS_STATS_INC(xs_dir_remove);
347 389
348 memset(&args, 0, sizeof(xfs_da_args_t)); 390 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
349 args.name = name->name; 391 if (!args)
350 args.namelen = name->len; 392 return ENOMEM;
351 args.filetype = name->type; 393
352 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 394 args->name = name->name;
353 args.inumber = ino; 395 args->namelen = name->len;
354 args.dp = dp; 396 args->filetype = name->type;
355 args.firstblock = first; 397 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
356 args.flist = flist; 398 args->inumber = ino;
357 args.total = total; 399 args->dp = dp;
358 args.whichfork = XFS_DATA_FORK; 400 args->firstblock = first;
359 args.trans = tp; 401 args->flist = flist;
360 402 args->total = total;
361 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 403 args->whichfork = XFS_DATA_FORK;
362 rval = xfs_dir2_sf_removename(&args); 404 args->trans = tp;
363 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 405
364 return rval; 406 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
365 else if (v) 407 rval = xfs_dir2_sf_removename(args);
366 rval = xfs_dir2_block_removename(&args); 408 goto out_free;
367 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 409 }
368 return rval; 410
369 else if (v) 411 rval = xfs_dir2_isblock(tp, dp, &v);
370 rval = xfs_dir2_leaf_removename(&args); 412 if (rval)
413 goto out_free;
414 if (v) {
415 rval = xfs_dir2_block_removename(args);
416 goto out_free;
417 }
418
419 rval = xfs_dir2_isleaf(tp, dp, &v);
420 if (rval)
421 goto out_free;
422 if (v)
423 rval = xfs_dir2_leaf_removename(args);
371 else 424 else
372 rval = xfs_dir2_node_removename(&args); 425 rval = xfs_dir2_node_removename(args);
426out_free:
427 kmem_free(args);
373 return rval; 428 return rval;
374} 429}
375 430
@@ -386,40 +441,54 @@ xfs_dir_replace(
386 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 441 xfs_bmap_free_t *flist, /* bmap's freeblock list */
387 xfs_extlen_t total) /* bmap's total block count */ 442 xfs_extlen_t total) /* bmap's total block count */
388{ 443{
389 xfs_da_args_t args; 444 struct xfs_da_args *args;
390 int rval; 445 int rval;
391 int v; /* type-checking value */ 446 int v; /* type-checking value */
392 447
393 ASSERT(S_ISDIR(dp->i_d.di_mode)); 448 ASSERT(S_ISDIR(dp->i_d.di_mode));
394 449
395 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 450 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
451 if (rval)
396 return rval; 452 return rval;
397 453
398 memset(&args, 0, sizeof(xfs_da_args_t)); 454 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
399 args.name = name->name; 455 if (!args)
400 args.namelen = name->len; 456 return ENOMEM;
401 args.filetype = name->type; 457
402 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 458 args->name = name->name;
403 args.inumber = inum; 459 args->namelen = name->len;
404 args.dp = dp; 460 args->filetype = name->type;
405 args.firstblock = first; 461 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
406 args.flist = flist; 462 args->inumber = inum;
407 args.total = total; 463 args->dp = dp;
408 args.whichfork = XFS_DATA_FORK; 464 args->firstblock = first;
409 args.trans = tp; 465 args->flist = flist;
410 466 args->total = total;
411 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 467 args->whichfork = XFS_DATA_FORK;
412 rval = xfs_dir2_sf_replace(&args); 468 args->trans = tp;
413 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 469
414 return rval; 470 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
415 else if (v) 471 rval = xfs_dir2_sf_replace(args);
416 rval = xfs_dir2_block_replace(&args); 472 goto out_free;
417 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 473 }
418 return rval; 474
419 else if (v) 475 rval = xfs_dir2_isblock(tp, dp, &v);
420 rval = xfs_dir2_leaf_replace(&args); 476 if (rval)
477 goto out_free;
478 if (v) {
479 rval = xfs_dir2_block_replace(args);
480 goto out_free;
481 }
482
483 rval = xfs_dir2_isleaf(tp, dp, &v);
484 if (rval)
485 goto out_free;
486 if (v)
487 rval = xfs_dir2_leaf_replace(args);
421 else 488 else
422 rval = xfs_dir2_node_replace(&args); 489 rval = xfs_dir2_node_replace(args);
490out_free:
491 kmem_free(args);
423 return rval; 492 return rval;
424} 493}
425 494
@@ -434,7 +503,7 @@ xfs_dir_canenter(
434 struct xfs_name *name, /* name of entry to add */ 503 struct xfs_name *name, /* name of entry to add */
435 uint resblks) 504 uint resblks)
436{ 505{
437 xfs_da_args_t args; 506 struct xfs_da_args *args;
438 int rval; 507 int rval;
439 int v; /* type-checking value */ 508 int v; /* type-checking value */
440 509
@@ -443,29 +512,42 @@ xfs_dir_canenter(
443 512
444 ASSERT(S_ISDIR(dp->i_d.di_mode)); 513 ASSERT(S_ISDIR(dp->i_d.di_mode));
445 514
446 memset(&args, 0, sizeof(xfs_da_args_t)); 515 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
447 args.name = name->name; 516 if (!args)
448 args.namelen = name->len; 517 return ENOMEM;
449 args.filetype = name->type; 518
450 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 519 args->name = name->name;
451 args.dp = dp; 520 args->namelen = name->len;
452 args.whichfork = XFS_DATA_FORK; 521 args->filetype = name->type;
453 args.trans = tp; 522 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
454 args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | 523 args->dp = dp;
524 args->whichfork = XFS_DATA_FORK;
525 args->trans = tp;
526 args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
455 XFS_DA_OP_OKNOENT; 527 XFS_DA_OP_OKNOENT;
456 528
457 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 529 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
458 rval = xfs_dir2_sf_addname(&args); 530 rval = xfs_dir2_sf_addname(args);
459 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 531 goto out_free;
460 return rval; 532 }
461 else if (v) 533
462 rval = xfs_dir2_block_addname(&args); 534 rval = xfs_dir2_isblock(tp, dp, &v);
463 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 535 if (rval)
464 return rval; 536 goto out_free;
465 else if (v) 537 if (v) {
466 rval = xfs_dir2_leaf_addname(&args); 538 rval = xfs_dir2_block_addname(args);
539 goto out_free;
540 }
541
542 rval = xfs_dir2_isleaf(tp, dp, &v);
543 if (rval)
544 goto out_free;
545 if (v)
546 rval = xfs_dir2_leaf_addname(args);
467 else 547 else
468 rval = xfs_dir2_node_addname(&args); 548 rval = xfs_dir2_node_addname(args);
549out_free:
550 kmem_free(args);
469 return rval; 551 return rval;
470} 552}
471 553
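[Editor's note: all five xfs_dir_* conversions above follow the same recipe: move the large struct xfs_da_args off the kernel stack onto the heap (KM_NOFS to stay reclaim-safe) and funnel every exit through one free point. Condensed to its skeleton; demo_dispatch() stands in for the sf/block/leaf/node format dispatch and is hypothetical.]

static int demo_dir_op(struct xfs_inode *dp, struct xfs_name *name)
{
	struct xfs_da_args *args;
	int rval;

	/* heap allocation replaces a large on-stack struct */
	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
	if (!args)
		return ENOMEM;

	args->dp = dp;
	args->name = name->name;
	args->namelen = name->len;

	rval = demo_dispatch(args);	/* hypothetical format dispatch */

	kmem_free(args);
	return rval;
}
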
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 90cdbf4b5f19..4f6a38cb83a4 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -89,13 +89,14 @@ xfs_dir3_block_read_verify(
89{ 89{
90 struct xfs_mount *mp = bp->b_target->bt_mount; 90 struct xfs_mount *mp = bp->b_target->bt_mount;
91 91
92 if ((xfs_sb_version_hascrc(&mp->m_sb) && 92 if (xfs_sb_version_hascrc(&mp->m_sb) &&
93 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 93 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
94 XFS_DIR3_DATA_CRC_OFF)) || 94 xfs_buf_ioerror(bp, EFSBADCRC);
95 !xfs_dir3_block_verify(bp)) { 95 else if (!xfs_dir3_block_verify(bp))
96 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
97 xfs_buf_ioerror(bp, EFSCORRUPTED); 96 xfs_buf_ioerror(bp, EFSCORRUPTED);
98 } 97
98 if (bp->b_error)
99 xfs_verifier_error(bp);
99} 100}
100 101
101static void 102static void
@@ -107,8 +108,8 @@ xfs_dir3_block_write_verify(
107 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 108 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
108 109
109 if (!xfs_dir3_block_verify(bp)) { 110 if (!xfs_dir3_block_verify(bp)) {
110 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
111 xfs_buf_ioerror(bp, EFSCORRUPTED); 111 xfs_buf_ioerror(bp, EFSCORRUPTED);
112 xfs_verifier_error(bp);
112 return; 113 return;
113 } 114 }
114 115
@@ -118,7 +119,7 @@ xfs_dir3_block_write_verify(
118 if (bip) 119 if (bip)
119 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 120 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
120 121
121 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); 122 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
122} 123}
123 124
124const struct xfs_buf_ops xfs_dir3_block_buf_ops = { 125const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 70acff4ee173..afa4ad523f3f 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -241,7 +241,6 @@ static void
241xfs_dir3_data_reada_verify( 241xfs_dir3_data_reada_verify(
242 struct xfs_buf *bp) 242 struct xfs_buf *bp)
243{ 243{
244 struct xfs_mount *mp = bp->b_target->bt_mount;
245 struct xfs_dir2_data_hdr *hdr = bp->b_addr; 244 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
246 245
247 switch (hdr->magic) { 246 switch (hdr->magic) {
@@ -255,8 +254,8 @@ xfs_dir3_data_reada_verify(
255 xfs_dir3_data_verify(bp); 254 xfs_dir3_data_verify(bp);
256 return; 255 return;
257 default: 256 default:
258 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
259 xfs_buf_ioerror(bp, EFSCORRUPTED); 257 xfs_buf_ioerror(bp, EFSCORRUPTED);
258 xfs_verifier_error(bp);
260 break; 259 break;
261 } 260 }
262} 261}
@@ -267,13 +266,14 @@ xfs_dir3_data_read_verify(
267{ 266{
268 struct xfs_mount *mp = bp->b_target->bt_mount; 267 struct xfs_mount *mp = bp->b_target->bt_mount;
269 268
270 if ((xfs_sb_version_hascrc(&mp->m_sb) && 269 if (xfs_sb_version_hascrc(&mp->m_sb) &&
271 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 270 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
272 XFS_DIR3_DATA_CRC_OFF)) || 271 xfs_buf_ioerror(bp, EFSBADCRC);
273 !xfs_dir3_data_verify(bp)) { 272 else if (!xfs_dir3_data_verify(bp))
274 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
275 xfs_buf_ioerror(bp, EFSCORRUPTED); 273 xfs_buf_ioerror(bp, EFSCORRUPTED);
276 } 274
275 if (bp->b_error)
276 xfs_verifier_error(bp);
277} 277}
278 278
279static void 279static void
@@ -285,8 +285,8 @@ xfs_dir3_data_write_verify(
285 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 285 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
286 286
287 if (!xfs_dir3_data_verify(bp)) { 287 if (!xfs_dir3_data_verify(bp)) {
288 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
289 xfs_buf_ioerror(bp, EFSCORRUPTED); 288 xfs_buf_ioerror(bp, EFSCORRUPTED);
289 xfs_verifier_error(bp);
290 return; 290 return;
291 } 291 }
292 292
@@ -296,7 +296,7 @@ xfs_dir3_data_write_verify(
296 if (bip) 296 if (bip)
297 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 297 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
298 298
299 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); 299 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
300} 300}
301 301
302const struct xfs_buf_ops xfs_dir3_data_buf_ops = { 302const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ae47ec6e16c4..d36e97df1187 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -179,13 +179,14 @@ __read_verify(
179{ 179{
180 struct xfs_mount *mp = bp->b_target->bt_mount; 180 struct xfs_mount *mp = bp->b_target->bt_mount;
181 181
182 if ((xfs_sb_version_hascrc(&mp->m_sb) && 182 if (xfs_sb_version_hascrc(&mp->m_sb) &&
183 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 183 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
184 XFS_DIR3_LEAF_CRC_OFF)) || 184 xfs_buf_ioerror(bp, EFSBADCRC);
185 !xfs_dir3_leaf_verify(bp, magic)) { 185 else if (!xfs_dir3_leaf_verify(bp, magic))
186 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
187 xfs_buf_ioerror(bp, EFSCORRUPTED); 186 xfs_buf_ioerror(bp, EFSCORRUPTED);
188 } 187
188 if (bp->b_error)
189 xfs_verifier_error(bp);
189} 190}
190 191
191static void 192static void
@@ -198,8 +199,8 @@ __write_verify(
198 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; 199 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
199 200
200 if (!xfs_dir3_leaf_verify(bp, magic)) { 201 if (!xfs_dir3_leaf_verify(bp, magic)) {
201 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
202 xfs_buf_ioerror(bp, EFSCORRUPTED); 202 xfs_buf_ioerror(bp, EFSCORRUPTED);
203 xfs_verifier_error(bp);
203 return; 204 return;
204 } 205 }
205 206
@@ -209,7 +210,7 @@ __write_verify(
209 if (bip) 210 if (bip)
210 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 211 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
211 212
212 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF); 213 xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
213} 214}
214 215
215static void 216static void
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 48c7d18f68c3..cb434d732681 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -115,13 +115,14 @@ xfs_dir3_free_read_verify(
115{ 115{
116 struct xfs_mount *mp = bp->b_target->bt_mount; 116 struct xfs_mount *mp = bp->b_target->bt_mount;
117 117
118 if ((xfs_sb_version_hascrc(&mp->m_sb) && 118 if (xfs_sb_version_hascrc(&mp->m_sb) &&
119 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 119 !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
120 XFS_DIR3_FREE_CRC_OFF)) || 120 xfs_buf_ioerror(bp, EFSBADCRC);
121 !xfs_dir3_free_verify(bp)) { 121 else if (!xfs_dir3_free_verify(bp))
122 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
123 xfs_buf_ioerror(bp, EFSCORRUPTED); 122 xfs_buf_ioerror(bp, EFSCORRUPTED);
124 } 123
124 if (bp->b_error)
125 xfs_verifier_error(bp);
125} 126}
126 127
127static void 128static void
@@ -133,8 +134,8 @@ xfs_dir3_free_write_verify(
133 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 134 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
134 135
135 if (!xfs_dir3_free_verify(bp)) { 136 if (!xfs_dir3_free_verify(bp)) {
136 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
137 xfs_buf_ioerror(bp, EFSCORRUPTED); 137 xfs_buf_ioerror(bp, EFSCORRUPTED);
138 xfs_verifier_error(bp);
138 return; 139 return;
139 } 140 }
140 141
@@ -144,7 +145,7 @@ xfs_dir3_free_write_verify(
144 if (bip) 145 if (bip)
145 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 146 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
146 147
147 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF); 148 xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
148} 149}
149 150
150const struct xfs_buf_ops xfs_dir3_free_buf_ops = { 151const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 7aeb4c895b32..868b19f096bf 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -615,7 +615,7 @@ xfs_qm_dqread(
615 615
616 if (flags & XFS_QMOPT_DQALLOC) { 616 if (flags & XFS_QMOPT_DQALLOC) {
617 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 617 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
618 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm, 618 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
619 XFS_QM_DQALLOC_SPACE_RES(mp), 0); 619 XFS_QM_DQALLOC_SPACE_RES(mp), 0);
620 if (error) 620 if (error)
621 goto error1; 621 goto error1;
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
index d401457d2f25..610da8177737 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -257,10 +257,13 @@ xfs_dquot_buf_read_verify(
257{ 257{
258 struct xfs_mount *mp = bp->b_target->bt_mount; 258 struct xfs_mount *mp = bp->b_target->bt_mount;
259 259
260 if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) { 260 if (!xfs_dquot_buf_verify_crc(mp, bp))
261 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 261 xfs_buf_ioerror(bp, EFSBADCRC);
262 else if (!xfs_dquot_buf_verify(mp, bp))
262 xfs_buf_ioerror(bp, EFSCORRUPTED); 263 xfs_buf_ioerror(bp, EFSCORRUPTED);
263 } 264
265 if (bp->b_error)
266 xfs_verifier_error(bp);
264} 267}
265 268
266/* 269/*
@@ -275,8 +278,8 @@ xfs_dquot_buf_write_verify(
275 struct xfs_mount *mp = bp->b_target->bt_mount; 278 struct xfs_mount *mp = bp->b_target->bt_mount;
276 279
277 if (!xfs_dquot_buf_verify(mp, bp)) { 280 if (!xfs_dquot_buf_verify(mp, bp)) {
278 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
279 xfs_buf_ioerror(bp, EFSCORRUPTED); 281 xfs_buf_ioerror(bp, EFSCORRUPTED);
282 xfs_verifier_error(bp);
280 return; 283 return;
281 } 284 }
282} 285}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9995b807d627..edac5b057d28 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -156,7 +156,7 @@ xfs_error_report(
156{ 156{
157 if (level <= xfs_error_level) { 157 if (level <= xfs_error_level) {
158 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, 158 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
159 "Internal error %s at line %d of file %s. Caller 0x%p", 159 "Internal error %s at line %d of file %s. Caller %pF",
160 tag, linenum, filename, ra); 160 tag, linenum, filename, ra);
161 161
162 xfs_stack_trace(); 162 xfs_stack_trace();
@@ -178,3 +178,28 @@ xfs_corruption_error(
178 xfs_error_report(tag, level, mp, filename, linenum, ra); 178 xfs_error_report(tag, level, mp, filename, linenum, ra);
179 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); 179 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
180} 180}
181
182/*
183 * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
184 * values, and omit the stack trace unless the error level is tuned high.
185 */
186void
187xfs_verifier_error(
188 struct xfs_buf *bp)
189{
190 struct xfs_mount *mp = bp->b_target->bt_mount;
191
192 xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
193 bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
194 __return_address, bp->b_bn);
195
196 xfs_alert(mp, "Unmount and run xfs_repair");
197
198 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
199 xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:");
200 xfs_hex_dump(xfs_buf_offset(bp, 0), 64);
201 }
202
203 if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
204 xfs_stack_trace();
205}
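Taken together with the verifier conversions elsewhere in this series, the read-side pattern that xfs_verifier_error() enables distils to the template below (the "foo" names and XFS_FOO_CRC_OFF are placeholders, not a real XFS verifier):

    static void
    xfs_foo_read_verify(
            struct xfs_buf          *bp)
    {
            struct xfs_mount        *mp = bp->b_target->bt_mount;

            /* CRC failures and structural corruption get distinct errors */
            if (xfs_sb_version_hascrc(&mp->m_sb) &&
                !xfs_buf_verify_cksum(bp, XFS_FOO_CRC_OFF))
                    xfs_buf_ioerror(bp, EFSBADCRC);
            else if (!xfs_foo_verify(bp))
                    xfs_buf_ioerror(bp, EFSCORRUPTED);

            /* ...while a single helper does all of the reporting */
            if (bp->b_error)
                    xfs_verifier_error(bp);
    }

Setting bp->b_error first and reporting from it afterwards is what lets the CRC and corruption cases share one exit path instead of duplicating XFS_CORRUPTION_ERROR() at every site.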
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 079a367f44ee..c1c57d4a4b5d 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -34,6 +34,7 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
34extern void xfs_corruption_error(const char *tag, int level, 34extern void xfs_corruption_error(const char *tag, int level,
35 struct xfs_mount *mp, void *p, const char *filename, 35 struct xfs_mount *mp, void *p, const char *filename,
36 int linenum, inst_t *ra); 36 int linenum, inst_t *ra);
37extern void xfs_verifier_error(struct xfs_buf *bp);
37 38
38#define XFS_ERROR_REPORT(e, lvl, mp) \ 39#define XFS_ERROR_REPORT(e, lvl, mp) \
39 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) 40 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 64b48eade91d..003c0051b62f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -823,7 +823,8 @@ xfs_file_fallocate(
823 823
824 if (!S_ISREG(inode->i_mode)) 824 if (!S_ISREG(inode->i_mode))
825 return -EINVAL; 825 return -EINVAL;
826 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 826 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
827 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
827 return -EOPNOTSUPP; 828 return -EOPNOTSUPP;
828 829
829 xfs_ilock(ip, XFS_IOLOCK_EXCL); 830 xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -831,6 +832,20 @@ xfs_file_fallocate(
831 error = xfs_free_file_space(ip, offset, len); 832 error = xfs_free_file_space(ip, offset, len);
832 if (error) 833 if (error)
833 goto out_unlock; 834 goto out_unlock;
835 } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
836 unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
837
838 if (offset & blksize_mask || len & blksize_mask) {
839 error = -EINVAL;
840 goto out_unlock;
841 }
842
843 ASSERT(offset + len < i_size_read(inode));
844 new_size = i_size_read(inode) - len;
845
846 error = xfs_collapse_file_space(ip, offset, len);
847 if (error)
848 goto out_unlock;
834 } else { 849 } else {
835 if (!(mode & FALLOC_FL_KEEP_SIZE) && 850 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
836 offset + len > i_size_read(inode)) { 851 offset + len > i_size_read(inode)) {
@@ -840,8 +855,11 @@ xfs_file_fallocate(
840 goto out_unlock; 855 goto out_unlock;
841 } 856 }
842 857
843 error = xfs_alloc_file_space(ip, offset, len, 858 if (mode & FALLOC_FL_ZERO_RANGE)
844 XFS_BMAPI_PREALLOC); 859 error = xfs_zero_file_space(ip, offset, len);
860 else
861 error = xfs_alloc_file_space(ip, offset, len,
862 XFS_BMAPI_PREALLOC);
845 if (error) 863 if (error)
846 goto out_unlock; 864 goto out_unlock;
847 } 865 }
@@ -859,7 +877,7 @@ xfs_file_fallocate(
859 if (ip->i_d.di_mode & S_IXGRP) 877 if (ip->i_d.di_mode & S_IXGRP)
860 ip->i_d.di_mode &= ~S_ISGID; 878 ip->i_d.di_mode &= ~S_ISGID;
861 879
862 if (!(mode & FALLOC_FL_PUNCH_HOLE)) 880 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
863 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; 881 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
864 882
865 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 883 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1465,6 +1483,7 @@ const struct file_operations xfs_dir_file_operations = {
1465 1483
1466static const struct vm_operations_struct xfs_file_vm_ops = { 1484static const struct vm_operations_struct xfs_file_vm_ops = {
1467 .fault = filemap_fault, 1485 .fault = filemap_fault,
1486 .map_pages = filemap_map_pages,
1468 .page_mkwrite = xfs_vm_page_mkwrite, 1487 .page_mkwrite = xfs_vm_page_mkwrite,
1469 .remap_pages = generic_file_remap_pages, 1488 .remap_pages = generic_file_remap_pages,
1470}; 1489};
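For context, the two newly accepted modes are driven from userspace through plain fallocate(2). A minimal sketch follows (the path and sizes are arbitrary assumptions; both flags fail with EOPNOTSUPP on kernels or filesystems without support, and FALLOC_FL_COLLAPSE_RANGE additionally requires offset and length to be block-size multiples, as the blksize_mask check above enforces):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/mnt/test/file", O_RDWR);   /* assumed path */
            if (fd < 0) { perror("open"); return 1; }

            /* drop bytes [4096, 12288) and shift the rest of the file down */
            if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 8192) < 0)
                    perror("collapse");

            /* convert bytes [0, 4096) into an extent that reads as zeroes */
            if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 0, 4096) < 0)
                    perror("zero range");

            close(fd);
            return 0;
    }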
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
index b6ab5a3cfa12..9898f31d05d8 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/xfs_format.h
@@ -145,6 +145,8 @@ struct xfs_dsymlink_hdr {
145 __be64 sl_lsn; 145 __be64 sl_lsn;
146}; 146};
147 147
148#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
149
148/* 150/*
149 * The maximum pathlen is 1024 bytes. Since the minimum file system 151 * The maximum pathlen is 1024 bytes. Since the minimum file system
150 * blocksize is 512 bytes, we can get a max of 3 extents back from 152 * blocksize is 512 bytes, we can get a max of 3 extents back from
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5d7f105a1c82..8f711db61a0c 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -363,6 +363,18 @@ xfs_ialloc_ag_alloc(
363 args.minleft = args.mp->m_in_maxlevels - 1; 363 args.minleft = args.mp->m_in_maxlevels - 1;
364 if ((error = xfs_alloc_vextent(&args))) 364 if ((error = xfs_alloc_vextent(&args)))
365 return error; 365 return error;
366
367 /*
368 * This request might have dirtied the transaction if the AG can
369 * satisfy the request, but the exact block was not available.
370 * If the allocation did fail, subsequent requests will relax
371 * the exact agbno requirement and increase the alignment
372 * instead. It is critical that the total size of the request
373 * (len + alignment + slop) does not increase from this point
374 * on, so reset minalignslop to ensure it is not included in
375 * subsequent requests.
376 */
377 args.minalignslop = 0;
366 } else 378 } else
367 args.fsbno = NULLFSBLOCK; 379 args.fsbno = NULLFSBLOCK;
368 380
@@ -1568,18 +1580,17 @@ xfs_agi_read_verify(
1568 struct xfs_buf *bp) 1580 struct xfs_buf *bp)
1569{ 1581{
1570 struct xfs_mount *mp = bp->b_target->bt_mount; 1582 struct xfs_mount *mp = bp->b_target->bt_mount;
1571 int agi_ok = 1;
1572
1573 if (xfs_sb_version_hascrc(&mp->m_sb))
1574 agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
1575 offsetof(struct xfs_agi, agi_crc));
1576 agi_ok = agi_ok && xfs_agi_verify(bp);
1577 1583
1578 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, 1584 if (xfs_sb_version_hascrc(&mp->m_sb) &&
1579 XFS_RANDOM_IALLOC_READ_AGI))) { 1585 !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
1580 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 1586 xfs_buf_ioerror(bp, EFSBADCRC);
1587 else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
1588 XFS_ERRTAG_IALLOC_READ_AGI,
1589 XFS_RANDOM_IALLOC_READ_AGI))
1581 xfs_buf_ioerror(bp, EFSCORRUPTED); 1590 xfs_buf_ioerror(bp, EFSCORRUPTED);
1582 } 1591
1592 if (bp->b_error)
1593 xfs_verifier_error(bp);
1583} 1594}
1584 1595
1585static void 1596static void
@@ -1590,8 +1601,8 @@ xfs_agi_write_verify(
1590 struct xfs_buf_log_item *bip = bp->b_fspriv; 1601 struct xfs_buf_log_item *bip = bp->b_fspriv;
1591 1602
1592 if (!xfs_agi_verify(bp)) { 1603 if (!xfs_agi_verify(bp)) {
1593 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
1594 xfs_buf_ioerror(bp, EFSCORRUPTED); 1604 xfs_buf_ioerror(bp, EFSCORRUPTED);
1605 xfs_verifier_error(bp);
1595 return; 1606 return;
1596 } 1607 }
1597 1608
@@ -1600,8 +1611,7 @@ xfs_agi_write_verify(
1600 1611
1601 if (bip) 1612 if (bip)
1602 XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); 1613 XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
1603 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 1614 xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
1604 offsetof(struct xfs_agi, agi_crc));
1605} 1615}
1606 1616
1607const struct xfs_buf_ops xfs_agi_buf_ops = { 1617const struct xfs_buf_ops xfs_agi_buf_ops = {
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c8fa5bbb36de..7e309b11e87d 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -243,12 +243,14 @@ static void
243xfs_inobt_read_verify( 243xfs_inobt_read_verify(
244 struct xfs_buf *bp) 244 struct xfs_buf *bp)
245{ 245{
246 if (!(xfs_btree_sblock_verify_crc(bp) && 246 if (!xfs_btree_sblock_verify_crc(bp))
247 xfs_inobt_verify(bp))) { 247 xfs_buf_ioerror(bp, EFSBADCRC);
248 trace_xfs_btree_corrupt(bp, _RET_IP_); 248 else if (!xfs_inobt_verify(bp))
249 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
250 bp->b_target->bt_mount, bp->b_addr);
251 xfs_buf_ioerror(bp, EFSCORRUPTED); 249 xfs_buf_ioerror(bp, EFSCORRUPTED);
250
251 if (bp->b_error) {
252 trace_xfs_btree_corrupt(bp, _RET_IP_);
253 xfs_verifier_error(bp);
252 } 254 }
253} 255}
254 256
@@ -258,9 +260,9 @@ xfs_inobt_write_verify(
258{ 260{
259 if (!xfs_inobt_verify(bp)) { 261 if (!xfs_inobt_verify(bp)) {
260 trace_xfs_btree_corrupt(bp, _RET_IP_); 262 trace_xfs_btree_corrupt(bp, _RET_IP_);
261 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
262 bp->b_target->bt_mount, bp->b_addr);
263 xfs_buf_ioerror(bp, EFSCORRUPTED); 263 xfs_buf_ioerror(bp, EFSCORRUPTED);
264 xfs_verifier_error(bp);
265 return;
264 } 266 }
265 xfs_btree_sblock_calc_crc(bp); 267 xfs_btree_sblock_calc_crc(bp);
266 268
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3a137e9f9a7d..5e7a38fa6ee6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,7 +42,6 @@
42#include "xfs_bmap_util.h" 42#include "xfs_bmap_util.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_quota.h" 44#include "xfs_quota.h"
45#include "xfs_dinode.h"
46#include "xfs_filestream.h" 45#include "xfs_filestream.h"
47#include "xfs_cksum.h" 46#include "xfs_cksum.h"
48#include "xfs_trace.h" 47#include "xfs_trace.h"
@@ -62,6 +61,8 @@ kmem_zone_t *xfs_inode_zone;
62 61
63STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 62STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
64 63
64STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
65
65/* 66/*
66 * helper function to extract extent size hint from inode 67 * helper function to extract extent size hint from inode
67 */ 68 */
@@ -1115,7 +1116,7 @@ xfs_bumplink(
1115{ 1116{
1116 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1117 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1117 1118
1118 ASSERT(ip->i_d.di_nlink > 0); 1119 ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
1119 ip->i_d.di_nlink++; 1120 ip->i_d.di_nlink++;
1120 inc_nlink(VFS_I(ip)); 1121 inc_nlink(VFS_I(ip));
1121 if ((ip->i_d.di_version == 1) && 1122 if ((ip->i_d.di_version == 1) &&
@@ -1165,10 +1166,7 @@ xfs_create(
1165 if (XFS_FORCED_SHUTDOWN(mp)) 1166 if (XFS_FORCED_SHUTDOWN(mp))
1166 return XFS_ERROR(EIO); 1167 return XFS_ERROR(EIO);
1167 1168
1168 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1169 prid = xfs_get_initial_prid(dp);
1169 prid = xfs_get_projid(dp);
1170 else
1171 prid = XFS_PROJID_DEFAULT;
1172 1170
1173 /* 1171 /*
1174 * Make sure that we have allocated dquot(s) on disk. 1172 * Make sure that we have allocated dquot(s) on disk.
@@ -1333,6 +1331,113 @@ xfs_create(
1333} 1331}
1334 1332
1335int 1333int
1334xfs_create_tmpfile(
1335 struct xfs_inode *dp,
1336 struct dentry *dentry,
1337 umode_t mode)
1338{
1339 struct xfs_mount *mp = dp->i_mount;
1340 struct xfs_inode *ip = NULL;
1341 struct xfs_trans *tp = NULL;
1342 int error;
1343 uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1344 prid_t prid;
1345 struct xfs_dquot *udqp = NULL;
1346 struct xfs_dquot *gdqp = NULL;
1347 struct xfs_dquot *pdqp = NULL;
1348 struct xfs_trans_res *tres;
1349 uint resblks;
1350
1351 if (XFS_FORCED_SHUTDOWN(mp))
1352 return XFS_ERROR(EIO);
1353
1354 prid = xfs_get_initial_prid(dp);
1355
1356 /*
1357 * Make sure that we have allocated dquot(s) on disk.
1358 */
1359 error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1360 xfs_kgid_to_gid(current_fsgid()), prid,
1361 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1362 &udqp, &gdqp, &pdqp);
1363 if (error)
1364 return error;
1365
1366 resblks = XFS_IALLOC_SPACE_RES(mp);
1367 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
1368
1369 tres = &M_RES(mp)->tr_create_tmpfile;
1370 error = xfs_trans_reserve(tp, tres, resblks, 0);
1371 if (error == ENOSPC) {
1372 /* No space at all so try a "no-allocation" reservation */
1373 resblks = 0;
1374 error = xfs_trans_reserve(tp, tres, 0, 0);
1375 }
1376 if (error) {
1377 cancel_flags = 0;
1378 goto out_trans_cancel;
1379 }
1380
1381 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1382 pdqp, resblks, 1, 0);
1383 if (error)
1384 goto out_trans_cancel;
1385
1386 error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
1387 prid, resblks > 0, &ip, NULL);
1388 if (error) {
1389 if (error == ENOSPC)
1390 goto out_trans_cancel;
1391 goto out_trans_abort;
1392 }
1393
1394 if (mp->m_flags & XFS_MOUNT_WSYNC)
1395 xfs_trans_set_sync(tp);
1396
1397 /*
1398 * Attach the dquot(s) to the inodes and modify them incore.
 1399	 * The ids of the inode couldn't have changed since the new
1400 * inode has been locked ever since it was created.
1401 */
1402 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1403
1404 ip->i_d.di_nlink--;
1405 d_tmpfile(dentry, VFS_I(ip));
1406 error = xfs_iunlink(tp, ip);
1407 if (error)
1408 goto out_trans_abort;
1409
1410 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1411 if (error)
1412 goto out_release_inode;
1413
1414 xfs_qm_dqrele(udqp);
1415 xfs_qm_dqrele(gdqp);
1416 xfs_qm_dqrele(pdqp);
1417
1418 return 0;
1419
1420 out_trans_abort:
1421 cancel_flags |= XFS_TRANS_ABORT;
1422 out_trans_cancel:
1423 xfs_trans_cancel(tp, cancel_flags);
1424 out_release_inode:
1425 /*
1426 * Wait until after the current transaction is aborted to
1427 * release the inode. This prevents recursive transactions
1428 * and deadlocks from xfs_inactive.
1429 */
1430 if (ip)
1431 IRELE(ip);
1432
1433 xfs_qm_dqrele(udqp);
1434 xfs_qm_dqrele(gdqp);
1435 xfs_qm_dqrele(pdqp);
1436
1437 return error;
1438}
1439
1440int
1336xfs_link( 1441xfs_link(
1337 xfs_inode_t *tdp, 1442 xfs_inode_t *tdp,
1338 xfs_inode_t *sip, 1443 xfs_inode_t *sip,
@@ -1397,6 +1502,12 @@ xfs_link(
1397 1502
1398 xfs_bmap_init(&free_list, &first_block); 1503 xfs_bmap_init(&free_list, &first_block);
1399 1504
1505 if (sip->i_d.di_nlink == 0) {
1506 error = xfs_iunlink_remove(tp, sip);
1507 if (error)
1508 goto abort_return;
1509 }
1510
1400 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, 1511 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1401 &first_block, &free_list, resblks); 1512 &first_block, &free_list, resblks);
1402 if (error) 1513 if (error)
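xfs_create_tmpfile() and the unlinked-list handling added to xfs_link() are the two halves of O_TMPFILE support: the file is born on the AGI unlinked list with a zero link count, and linkat(2) later pulls it off that list when it gains a name. A userspace sketch of that flow (the mount point is an assumption):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char path[64];
            int fd = open("/mnt/xfs", O_TMPFILE | O_RDWR, 0600); /* assumed mount */
            if (fd < 0) { perror("O_TMPFILE"); return 1; }

            if (write(fd, "data", 4) != 4)
                    perror("write");

            /* naming the file exercises the new xfs_iunlink_remove() call
             * in xfs_link(), since the inode still has di_nlink == 0 */
            snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
            if (linkat(AT_FDCWD, path, AT_FDCWD, "/mnt/xfs/file",
                       AT_SYMLINK_FOLLOW) < 0)
                    perror("linkat");

            close(fd);
            return 0;
    }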
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65e2350f449c..396cc1fafd0d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -20,6 +20,7 @@
20 20
21#include "xfs_inode_buf.h" 21#include "xfs_inode_buf.h"
22#include "xfs_inode_fork.h" 22#include "xfs_inode_fork.h"
23#include "xfs_dinode.h"
23 24
24/* 25/*
25 * Kernel only inode definitions 26 * Kernel only inode definitions
@@ -192,6 +193,15 @@ xfs_set_projid(struct xfs_inode *ip,
192 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); 193 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
193} 194}
194 195
196static inline prid_t
197xfs_get_initial_prid(struct xfs_inode *dp)
198{
199 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
200 return xfs_get_projid(dp);
201
202 return XFS_PROJID_DEFAULT;
203}
204
195/* 205/*
196 * In-core inode flags. 206 * In-core inode flags.
197 */ 207 */
@@ -323,6 +333,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
323 struct xfs_inode **ipp, struct xfs_name *ci_name); 333 struct xfs_inode **ipp, struct xfs_name *ci_name);
324int xfs_create(struct xfs_inode *dp, struct xfs_name *name, 334int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
325 umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); 335 umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
336int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
337 umode_t mode);
326int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, 338int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
327 struct xfs_inode *ip); 339 struct xfs_inode *ip);
328int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, 340int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index 4fc9f39dd89e..24e993996bdc 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -102,8 +102,7 @@ xfs_inode_buf_verify(
102 } 102 }
103 103
104 xfs_buf_ioerror(bp, EFSCORRUPTED); 104 xfs_buf_ioerror(bp, EFSCORRUPTED);
105 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, 105 xfs_verifier_error(bp);
106 mp, dip);
107#ifdef DEBUG 106#ifdef DEBUG
108 xfs_alert(mp, 107 xfs_alert(mp,
109 "bad inode magic/vsn daddr %lld #%d (magic=%x)", 108 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
@@ -306,7 +305,7 @@ xfs_dinode_verify(
306 if (!xfs_sb_version_hascrc(&mp->m_sb)) 305 if (!xfs_sb_version_hascrc(&mp->m_sb))
307 return false; 306 return false;
308 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 307 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
309 offsetof(struct xfs_dinode, di_crc))) 308 XFS_DINODE_CRC_OFF))
310 return false; 309 return false;
311 if (be64_to_cpu(dip->di_ino) != ip->i_ino) 310 if (be64_to_cpu(dip->di_ino) != ip->i_ino)
312 return false; 311 return false;
@@ -327,7 +326,7 @@ xfs_dinode_calc_crc(
327 326
328 ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); 327 ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
329 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, 328 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
330 offsetof(struct xfs_dinode, di_crc)); 329 XFS_DINODE_CRC_OFF);
331 dip->di_crc = xfs_end_cksum(crc); 330 dip->di_crc = xfs_end_cksum(crc);
332} 331}
333 332
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 22d1cbea283d..3b80ebae05f5 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -128,7 +128,6 @@ xfs_iomap_write_direct(
128 xfs_fsblock_t firstfsb; 128 xfs_fsblock_t firstfsb;
129 xfs_extlen_t extsz, temp; 129 xfs_extlen_t extsz, temp;
130 int nimaps; 130 int nimaps;
131 int bmapi_flag;
132 int quota_flag; 131 int quota_flag;
133 int rt; 132 int rt;
134 xfs_trans_t *tp; 133 xfs_trans_t *tp;
@@ -200,18 +199,15 @@ xfs_iomap_write_direct(
200 199
201 xfs_trans_ijoin(tp, ip, 0); 200 xfs_trans_ijoin(tp, ip, 0);
202 201
203 bmapi_flag = 0;
204 if (offset < XFS_ISIZE(ip) || extsz)
205 bmapi_flag |= XFS_BMAPI_PREALLOC;
206
207 /* 202 /*
208 * From this point onwards we overwrite the imap pointer that the 203 * From this point onwards we overwrite the imap pointer that the
209 * caller gave to us. 204 * caller gave to us.
210 */ 205 */
211 xfs_bmap_init(&free_list, &firstfsb); 206 xfs_bmap_init(&free_list, &firstfsb);
212 nimaps = 1; 207 nimaps = 1;
213 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, 208 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
214 &firstfsb, 0, imap, &nimaps, &free_list); 209 XFS_BMAPI_PREALLOC, &firstfsb, 0,
210 imap, &nimaps, &free_list);
215 if (error) 211 if (error)
216 goto out_bmap_cancel; 212 goto out_bmap_cancel;
217 213
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 9ddfb8190ca1..89b07e43ca28 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -39,6 +39,7 @@
39#include "xfs_da_btree.h" 39#include "xfs_da_btree.h"
40#include "xfs_dir2_priv.h" 40#include "xfs_dir2_priv.h"
41#include "xfs_dinode.h" 41#include "xfs_dinode.h"
42#include "xfs_trans_space.h"
42 43
43#include <linux/capability.h> 44#include <linux/capability.h>
44#include <linux/xattr.h> 45#include <linux/xattr.h>
@@ -48,6 +49,18 @@
48#include <linux/fiemap.h> 49#include <linux/fiemap.h>
49#include <linux/slab.h> 50#include <linux/slab.h>
50 51
52/*
53 * Directories have different lock order w.r.t. mmap_sem compared to regular
54 * files. This is due to readdir potentially triggering page faults on a user
55 * buffer inside filldir(), and this happens with the ilock on the directory
56 * held. For regular files, the lock order is the other way around - the
57 * mmap_sem is taken during the page fault, and then we lock the ilock to do
58 * block mapping. Hence we need a different class for the directory ilock so
59 * that lockdep can tell them apart.
60 */
61static struct lock_class_key xfs_nondir_ilock_class;
62static struct lock_class_key xfs_dir_ilock_class;
63
51static int 64static int
52xfs_initxattrs( 65xfs_initxattrs(
53 struct inode *inode, 66 struct inode *inode,
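The same class-splitting trick works for any lockdep-tracked lock. A generic sketch of the pattern the two keys above implement (hypothetical structure and helper names; XFS applies it to ip->i_lock.mr_lock in xfs_setup_inode() below):

    #include <linux/lockdep.h>
    #include <linux/mutex.h>

    static struct lock_class_key nondir_class;
    static struct lock_class_key dir_class;

    struct foo_inode {                      /* hypothetical inode type */
            struct mutex    lock;
            bool            is_dir;
    };

    static void foo_inode_init(struct foo_inode *fi)
    {
            mutex_init(&fi->lock);
            /* give directory locks their own lockdep class so their
             * inverted ordering against mmap_sem is not flagged as a
             * deadlock against the regular-file ordering */
            lockdep_set_class(&fi->lock,
                              fi->is_dir ? &dir_class : &nondir_class);
    }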
@@ -1034,6 +1047,19 @@ xfs_vn_fiemap(
1034 return 0; 1047 return 0;
1035} 1048}
1036 1049
1050STATIC int
1051xfs_vn_tmpfile(
1052 struct inode *dir,
1053 struct dentry *dentry,
1054 umode_t mode)
1055{
1056 int error;
1057
1058 error = xfs_create_tmpfile(XFS_I(dir), dentry, mode);
1059
1060 return -error;
1061}
1062
1037static const struct inode_operations xfs_inode_operations = { 1063static const struct inode_operations xfs_inode_operations = {
1038 .get_acl = xfs_get_acl, 1064 .get_acl = xfs_get_acl,
1039 .set_acl = xfs_set_acl, 1065 .set_acl = xfs_set_acl,
@@ -1072,6 +1098,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
1072 .removexattr = generic_removexattr, 1098 .removexattr = generic_removexattr,
1073 .listxattr = xfs_vn_listxattr, 1099 .listxattr = xfs_vn_listxattr,
1074 .update_time = xfs_vn_update_time, 1100 .update_time = xfs_vn_update_time,
1101 .tmpfile = xfs_vn_tmpfile,
1075}; 1102};
1076 1103
1077static const struct inode_operations xfs_dir_ci_inode_operations = { 1104static const struct inode_operations xfs_dir_ci_inode_operations = {
@@ -1099,6 +1126,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
1099 .removexattr = generic_removexattr, 1126 .removexattr = generic_removexattr,
1100 .listxattr = xfs_vn_listxattr, 1127 .listxattr = xfs_vn_listxattr,
1101 .update_time = xfs_vn_update_time, 1128 .update_time = xfs_vn_update_time,
1129 .tmpfile = xfs_vn_tmpfile,
1102}; 1130};
1103 1131
1104static const struct inode_operations xfs_symlink_inode_operations = { 1132static const struct inode_operations xfs_symlink_inode_operations = {
@@ -1191,6 +1219,7 @@ xfs_setup_inode(
1191 xfs_diflags_to_iflags(inode, ip); 1219 xfs_diflags_to_iflags(inode, ip);
1192 1220
1193 ip->d_ops = ip->i_mount->m_nondir_inode_ops; 1221 ip->d_ops = ip->i_mount->m_nondir_inode_ops;
1222 lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
1194 switch (inode->i_mode & S_IFMT) { 1223 switch (inode->i_mode & S_IFMT) {
1195 case S_IFREG: 1224 case S_IFREG:
1196 inode->i_op = &xfs_inode_operations; 1225 inode->i_op = &xfs_inode_operations;
@@ -1198,6 +1227,7 @@ xfs_setup_inode(
1198 inode->i_mapping->a_ops = &xfs_address_space_operations; 1227 inode->i_mapping->a_ops = &xfs_address_space_operations;
1199 break; 1228 break;
1200 case S_IFDIR: 1229 case S_IFDIR:
1230 lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
1201 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) 1231 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
1202 inode->i_op = &xfs_dir_ci_inode_operations; 1232 inode->i_op = &xfs_dir_ci_inode_operations;
1203 else 1233 else
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index f9bb590acc0e..825249d2dfc1 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -119,6 +119,7 @@ typedef __uint64_t __psunsigned_t;
119#include "xfs_iops.h" 119#include "xfs_iops.h"
120#include "xfs_aops.h" 120#include "xfs_aops.h"
121#include "xfs_super.h" 121#include "xfs_super.h"
122#include "xfs_cksum.h"
122#include "xfs_buf.h" 123#include "xfs_buf.h"
123#include "xfs_message.h" 124#include "xfs_message.h"
124 125
@@ -178,6 +179,7 @@ typedef __uint64_t __psunsigned_t;
178#define ENOATTR ENODATA /* Attribute not found */ 179#define ENOATTR ENODATA /* Attribute not found */
179#define EWRONGFS EINVAL /* Mount with wrong filesystem type */ 180#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
180#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 181#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
182#define EFSBADCRC EBADMSG /* Bad CRC detected */
181 183
182#define SYNCHRONIZE() barrier() 184#define SYNCHRONIZE() barrier()
183#define __return_address __builtin_return_address(0) 185#define __return_address __builtin_return_address(0)
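Since EFSBADCRC aliases EBADMSG, nothing here changes the errno values userspace can already see; the value of the new code is in-kernel discrimination. The hunks for xfs_mount.c, xfs_symlink.c and xfs_trans_buf.c later in this diff fold EFSBADCRC back into EFSCORRUPTED before the error escapes the read paths, so the split mainly sharpens the diagnostics printed by xfs_verifier_error().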
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index b0f4ef77fa70..2c4004475e71 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -175,7 +175,7 @@ void xlog_iodone(struct xfs_buf *);
175struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); 175struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
176void xfs_log_ticket_put(struct xlog_ticket *ticket); 176void xfs_log_ticket_put(struct xlog_ticket *ticket);
177 177
178int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 178void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
179 xfs_lsn_t *commit_lsn, int flags); 179 xfs_lsn_t *commit_lsn, int flags);
180bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 180bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
181 181
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4ef6fdbced78..7e5455391176 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -499,13 +499,6 @@ xlog_cil_push(
499 cil->xc_ctx = new_ctx; 499 cil->xc_ctx = new_ctx;
500 500
501 /* 501 /*
502 * mirror the new sequence into the cil structure so that we can do
503 * unlocked checks against the current sequence in log forces without
504 * risking deferencing a freed context pointer.
505 */
506 cil->xc_current_sequence = new_ctx->sequence;
507
508 /*
509 * The switch is now done, so we can drop the context lock and move out 502 * The switch is now done, so we can drop the context lock and move out
510 * of a shared context. We can't just go straight to the commit record, 503 * of a shared context. We can't just go straight to the commit record,
511 * though - we need to synchronise with previous and future commits so 504 * though - we need to synchronise with previous and future commits so
@@ -523,8 +516,15 @@ xlog_cil_push(
523 * Hence we need to add this context to the committing context list so 516 * Hence we need to add this context to the committing context list so
524 * that higher sequences will wait for us to write out a commit record 517 * that higher sequences will wait for us to write out a commit record
525 * before they do. 518 * before they do.
519 *
520 * xfs_log_force_lsn requires us to mirror the new sequence into the cil
521 * structure atomically with the addition of this sequence to the
522 * committing list. This also ensures that we can do unlocked checks
523 * against the current sequence in log forces without risking
 524	 * dereferencing a freed context pointer.
526 */ 525 */
527 spin_lock(&cil->xc_push_lock); 526 spin_lock(&cil->xc_push_lock);
527 cil->xc_current_sequence = new_ctx->sequence;
528 list_add(&ctx->committing, &cil->xc_committing); 528 list_add(&ctx->committing, &cil->xc_committing);
529 spin_unlock(&cil->xc_push_lock); 529 spin_unlock(&cil->xc_push_lock);
530 up_write(&cil->xc_ctx_lock); 530 up_write(&cil->xc_ctx_lock);
@@ -662,8 +662,14 @@ xlog_cil_push_background(
662 662
663} 663}
664 664
665/*
666 * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
667 * number that is passed. When it returns, the work will be queued for
668 * @push_seq, but it won't be completed. The caller is expected to do any
669 * waiting for push_seq to complete if it is required.
670 */
665static void 671static void
666xlog_cil_push_foreground( 672xlog_cil_push_now(
667 struct xlog *log, 673 struct xlog *log,
668 xfs_lsn_t push_seq) 674 xfs_lsn_t push_seq)
669{ 675{
@@ -688,10 +694,8 @@ xlog_cil_push_foreground(
688 } 694 }
689 695
690 cil->xc_push_seq = push_seq; 696 cil->xc_push_seq = push_seq;
697 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
691 spin_unlock(&cil->xc_push_lock); 698 spin_unlock(&cil->xc_push_lock);
692
693 /* do the push now */
694 xlog_cil_push(log);
695} 699}
696 700
697bool 701bool
@@ -721,7 +725,7 @@ xlog_cil_empty(
721 * background commit, returns without it held once background commits are 725 * background commit, returns without it held once background commits are
722 * allowed again. 726 * allowed again.
723 */ 727 */
724int 728void
725xfs_log_commit_cil( 729xfs_log_commit_cil(
726 struct xfs_mount *mp, 730 struct xfs_mount *mp,
727 struct xfs_trans *tp, 731 struct xfs_trans *tp,
@@ -767,7 +771,6 @@ xfs_log_commit_cil(
767 xlog_cil_push_background(log); 771 xlog_cil_push_background(log);
768 772
769 up_read(&cil->xc_ctx_lock); 773 up_read(&cil->xc_ctx_lock);
770 return 0;
771} 774}
772 775
773/* 776/*
@@ -796,7 +799,8 @@ xlog_cil_force_lsn(
796 * xlog_cil_push() handles racing pushes for the same sequence, 799 * xlog_cil_push() handles racing pushes for the same sequence,
797 * so no need to deal with it here. 800 * so no need to deal with it here.
798 */ 801 */
799 xlog_cil_push_foreground(log, sequence); 802restart:
803 xlog_cil_push_now(log, sequence);
800 804
801 /* 805 /*
802 * See if we can find a previous sequence still committing. 806 * See if we can find a previous sequence still committing.
@@ -804,7 +808,6 @@ xlog_cil_force_lsn(
804 * before allowing the force of push_seq to go ahead. Hence block 808 * before allowing the force of push_seq to go ahead. Hence block
805 * on commits for those as well. 809 * on commits for those as well.
806 */ 810 */
807restart:
808 spin_lock(&cil->xc_push_lock); 811 spin_lock(&cil->xc_push_lock);
809 list_for_each_entry(ctx, &cil->xc_committing, committing) { 812 list_for_each_entry(ctx, &cil->xc_committing, committing) {
810 if (ctx->sequence > sequence) 813 if (ctx->sequence > sequence)
@@ -822,6 +825,28 @@ restart:
822 /* found it! */ 825 /* found it! */
823 commit_lsn = ctx->commit_lsn; 826 commit_lsn = ctx->commit_lsn;
824 } 827 }
828
829 /*
830 * The call to xlog_cil_push_now() executes the push in the background.
 831	 * Hence by the time we have got here our sequence may not have been
832 * pushed yet. This is true if the current sequence still matches the
833 * push sequence after the above wait loop and the CIL still contains
834 * dirty objects.
835 *
836 * When the push occurs, it will empty the CIL and
 837	 * atomically increment the current sequence past the push sequence and
838 * move it into the committing list. Of course, if the CIL is clean at
839 * the time of the push, it won't have pushed the CIL at all, so in that
840 * case we should try the push for this sequence again from the start
841 * just in case.
842 */
843
844 if (sequence == cil->xc_current_sequence &&
845 !list_empty(&cil->xc_cil)) {
846 spin_unlock(&cil->xc_push_lock);
847 goto restart;
848 }
849
825 spin_unlock(&cil->xc_push_lock); 850 spin_unlock(&cil->xc_push_lock);
826 return commit_lsn; 851 return commit_lsn;
827} 852}
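Spelled out, the window the recheck closes runs roughly like this (an illustrative interleaving, not text from the patch): the caller samples xc_current_sequence == N and calls xlog_cil_push_now(N); the push work is queued but has not yet run, so sequence N is neither on xc_committing nor beyond xc_current_sequence; the wait loop above therefore finds nothing to wait on. With the old synchronous xlog_cil_push_foreground() that was safe, because the push had already happened by this point; with the push deferred to a workqueue, returning here could mean N is never forced. Seeing sequence == xc_current_sequence with a non-empty CIL identifies exactly that state, and the goto retries from the top.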
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f96c05669a9e..993cb19e7d39 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -314,6 +314,9 @@ reread:
314 error = bp->b_error; 314 error = bp->b_error;
315 if (loud) 315 if (loud)
316 xfs_warn(mp, "SB validate failed with error %d.", error); 316 xfs_warn(mp, "SB validate failed with error %d.", error);
317 /* bad CRC means corrupted metadata */
318 if (error == EFSBADCRC)
319 error = EFSCORRUPTED;
317 goto release_buf; 320 goto release_buf;
318 } 321 }
319 322
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a6a76b2b6a85..ec5ca65c6211 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -842,7 +842,7 @@ xfs_growfs_rt_alloc(
842 /* 842 /*
843 * Reserve space & log for one extent added to the file. 843 * Reserve space & log for one extent added to the file.
844 */ 844 */
845 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, 845 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
846 resblks, 0); 846 resblks, 0);
847 if (error) 847 if (error)
848 goto error_cancel; 848 goto error_cancel;
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 1e116794bb66..0c0e41bbe4e3 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -288,6 +288,7 @@ xfs_mount_validate_sb(
288 sbp->sb_inodelog < XFS_DINODE_MIN_LOG || 288 sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
289 sbp->sb_inodelog > XFS_DINODE_MAX_LOG || 289 sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
290 sbp->sb_inodesize != (1 << sbp->sb_inodelog) || 290 sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
291 sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
291 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || 292 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
292 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 293 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
293 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 294 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
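(As a worked example of the new check: with sb_blocksize = 4096 and sb_inodesize = 256, the only value that validates is sb_inopblock = howmany(4096, 256) = 16; a superblock advertising any other inodes-per-block count is now rejected up front instead of tripping over the mismatch later.)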
@@ -610,12 +611,11 @@ xfs_sb_read_verify(
610 XFS_SB_VERSION_5) || 611 XFS_SB_VERSION_5) ||
611 dsb->sb_crc != 0)) { 612 dsb->sb_crc != 0)) {
612 613
613 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 614 if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
614 offsetof(struct xfs_sb, sb_crc))) {
615 /* Only fail bad secondaries on a known V5 filesystem */ 615 /* Only fail bad secondaries on a known V5 filesystem */
616 if (bp->b_bn == XFS_SB_DADDR || 616 if (bp->b_bn == XFS_SB_DADDR ||
617 xfs_sb_version_hascrc(&mp->m_sb)) { 617 xfs_sb_version_hascrc(&mp->m_sb)) {
618 error = EFSCORRUPTED; 618 error = EFSBADCRC;
619 goto out_error; 619 goto out_error;
620 } 620 }
621 } 621 }
@@ -624,10 +624,9 @@ xfs_sb_read_verify(
624 624
625out_error: 625out_error:
626 if (error) { 626 if (error) {
627 if (error == EFSCORRUPTED)
628 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
629 mp, bp->b_addr);
630 xfs_buf_ioerror(bp, error); 627 xfs_buf_ioerror(bp, error);
628 if (error == EFSCORRUPTED || error == EFSBADCRC)
629 xfs_verifier_error(bp);
631 } 630 }
632} 631}
633 632
@@ -662,9 +661,8 @@ xfs_sb_write_verify(
662 661
663 error = xfs_sb_verify(bp, false); 662 error = xfs_sb_verify(bp, false);
664 if (error) { 663 if (error) {
665 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
666 mp, bp->b_addr);
667 xfs_buf_ioerror(bp, error); 664 xfs_buf_ioerror(bp, error);
665 xfs_verifier_error(bp);
668 return; 666 return;
669 } 667 }
670 668
@@ -674,8 +672,7 @@ xfs_sb_write_verify(
674 if (bip) 672 if (bip)
675 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 673 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
676 674
677 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 675 xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
678 offsetof(struct xfs_sb, sb_crc));
679} 676}
680 677
681const struct xfs_buf_ops xfs_sb_buf_ops = { 678const struct xfs_buf_ops xfs_sb_buf_ops = {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 35061d4b614c..f7b2fe77c5a5 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -182,6 +182,8 @@ typedef struct xfs_sb {
182 /* must be padded to 64 bit alignment */ 182 /* must be padded to 64 bit alignment */
183} xfs_sb_t; 183} xfs_sb_t;
184 184
185#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
186
185/* 187/*
186 * Superblock - on disk version. Must match the in core version above. 188 * Superblock - on disk version. Must match the in core version above.
187 * Must be padded to 64 bit alignment. 189 * Must be padded to 64 bit alignment.
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h
index 8c5035a13df1..4484e5151395 100644
--- a/fs/xfs/xfs_shared.h
+++ b/fs/xfs/xfs_shared.h
@@ -104,7 +104,8 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
104#define XFS_TRANS_SB_COUNT 41 104#define XFS_TRANS_SB_COUNT 41
105#define XFS_TRANS_CHECKPOINT 42 105#define XFS_TRANS_CHECKPOINT 42
106#define XFS_TRANS_ICREATE 43 106#define XFS_TRANS_ICREATE 43
107#define XFS_TRANS_TYPE_MAX 43 107#define XFS_TRANS_CREATE_TMPFILE 44
108#define XFS_TRANS_TYPE_MAX 44
108/* new transaction types need to be reflected in xfs_logprint(8) */ 109/* new transaction types need to be reflected in xfs_logprint(8) */
109 110
110#define XFS_TRANS_TYPES \ 111#define XFS_TRANS_TYPES \
@@ -112,6 +113,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
112 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ 113 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
113 { XFS_TRANS_INACTIVE, "INACTIVE" }, \ 114 { XFS_TRANS_INACTIVE, "INACTIVE" }, \
114 { XFS_TRANS_CREATE, "CREATE" }, \ 115 { XFS_TRANS_CREATE, "CREATE" }, \
116 { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
115 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ 117 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
116 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ 118 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
117 { XFS_TRANS_REMOVE, "REMOVE" }, \ 119 { XFS_TRANS_REMOVE, "REMOVE" }, \
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d971f4932b5d..205376776377 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -996,7 +996,7 @@ xfs_fs_evict_inode(
996 996
997 trace_xfs_evict_inode(ip); 997 trace_xfs_evict_inode(ip);
998 998
999 truncate_inode_pages(&inode->i_data, 0); 999 truncate_inode_pages_final(&inode->i_data);
1000 clear_inode(inode); 1000 clear_inode(inode);
1001 XFS_STATS_INC(vn_rele); 1001 XFS_STATS_INC(vn_rele);
1002 XFS_STATS_INC(vn_remove); 1002 XFS_STATS_INC(vn_remove);
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
1197 char *p; 1197 char *p;
1198 int error; 1198 int error;
1199 1199
1200 sync_filesystem(sb);
1200 while ((p = strsep(&options, ",")) != NULL) { 1201 while ((p = strsep(&options, ",")) != NULL) {
1201 int token; 1202 int token;
1202 1203
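truncate_inode_pages_final() is the variant of truncate_inode_pages(..., 0) intended specifically for inode eviction; the generic shape every ->evict_inode now follows is (a sketch of the common pattern, not XFS's full implementation):

    #include <linux/fs.h>
    #include <linux/mm.h>

    static void
    foo_evict_inode(
            struct inode    *inode)         /* hypothetical filesystem */
    {
            truncate_inode_pages_final(&inode->i_data);
            clear_inode(inode);
            /* filesystem-specific teardown follows */
    }

The sync_filesystem() call added to ->remount_fs is likewise the new VFS convention: the filesystem flushes itself before option parsing can change write behaviour, rather than relying on the VFS to have done it.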
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 14e58f2c96bd..52979aa90986 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -80,6 +80,10 @@ xfs_readlink_bmap(
80 if (error) { 80 if (error) {
81 xfs_buf_ioerror_alert(bp, __func__); 81 xfs_buf_ioerror_alert(bp, __func__);
82 xfs_buf_relse(bp); 82 xfs_buf_relse(bp);
83
84 /* bad CRC means corrupted metadata */
85 if (error == EFSBADCRC)
86 error = EFSCORRUPTED;
83 goto out; 87 goto out;
84 } 88 }
85 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); 89 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -208,10 +212,7 @@ xfs_symlink(
208 return XFS_ERROR(ENAMETOOLONG); 212 return XFS_ERROR(ENAMETOOLONG);
209 213
210 udqp = gdqp = NULL; 214 udqp = gdqp = NULL;
211 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 215 prid = xfs_get_initial_prid(dp);
212 prid = xfs_get_projid(dp);
213 else
214 prid = XFS_PROJID_DEFAULT;
215 216
216 /* 217 /*
217 * Make sure that we have allocated dquot(s) on disk. 218 * Make sure that we have allocated dquot(s) on disk.
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
index bf59a2b45f8c..9b32052ff65e 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -133,12 +133,13 @@ xfs_symlink_read_verify(
133 if (!xfs_sb_version_hascrc(&mp->m_sb)) 133 if (!xfs_sb_version_hascrc(&mp->m_sb))
134 return; 134 return;
135 135
136 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 136 if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
137 offsetof(struct xfs_dsymlink_hdr, sl_crc)) || 137 xfs_buf_ioerror(bp, EFSBADCRC);
138 !xfs_symlink_verify(bp)) { 138 else if (!xfs_symlink_verify(bp))
139 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
140 xfs_buf_ioerror(bp, EFSCORRUPTED); 139 xfs_buf_ioerror(bp, EFSCORRUPTED);
141 } 140
141 if (bp->b_error)
142 xfs_verifier_error(bp);
142} 143}
143 144
144static void 145static void
@@ -153,8 +154,8 @@ xfs_symlink_write_verify(
153 return; 154 return;
154 155
155 if (!xfs_symlink_verify(bp)) { 156 if (!xfs_symlink_verify(bp)) {
156 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
157 xfs_buf_ioerror(bp, EFSCORRUPTED); 157 xfs_buf_ioerror(bp, EFSCORRUPTED);
158 xfs_verifier_error(bp);
158 return; 159 return;
159 } 160 }
160 161
@@ -162,8 +163,7 @@ xfs_symlink_write_verify(
162 struct xfs_dsymlink_hdr *dsl = bp->b_addr; 163 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
163 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 164 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
164 } 165 }
165 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 166 xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
166 offsetof(struct xfs_dsymlink_hdr, sl_crc));
167} 167}
168 168
169const struct xfs_buf_ops xfs_symlink_buf_ops = { 169const struct xfs_buf_ops xfs_symlink_buf_ops = {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 425dfa45b9a0..a4ae41c179a8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink);
603DEFINE_INODE_EVENT(xfs_inactive_symlink); 603DEFINE_INODE_EVENT(xfs_inactive_symlink);
604DEFINE_INODE_EVENT(xfs_alloc_file_space); 604DEFINE_INODE_EVENT(xfs_alloc_file_space);
605DEFINE_INODE_EVENT(xfs_free_file_space); 605DEFINE_INODE_EVENT(xfs_free_file_space);
606DEFINE_INODE_EVENT(xfs_collapse_file_space);
606DEFINE_INODE_EVENT(xfs_readdir); 607DEFINE_INODE_EVENT(xfs_readdir);
607#ifdef CONFIG_XFS_POSIX_ACL 608#ifdef CONFIG_XFS_POSIX_ACL
608DEFINE_INODE_EVENT(xfs_get_acl); 609DEFINE_INODE_EVENT(xfs_get_acl);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c812c5c060de..54a57326d85b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -887,12 +887,7 @@ xfs_trans_commit(
887 xfs_trans_apply_sb_deltas(tp); 887 xfs_trans_apply_sb_deltas(tp);
888 xfs_trans_apply_dquot_deltas(tp); 888 xfs_trans_apply_dquot_deltas(tp);
889 889
890 error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags); 890 xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
891 if (error == ENOMEM) {
892 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
893 error = XFS_ERROR(EIO);
894 goto out_unreserve;
895 }
896 891
897 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 892 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
898 xfs_trans_free(tp); 893 xfs_trans_free(tp);
@@ -902,10 +897,7 @@ xfs_trans_commit(
902 * log out now and wait for it. 897 * log out now and wait for it.
903 */ 898 */
904 if (sync) { 899 if (sync) {
905 if (!error) { 900 error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
906 error = _xfs_log_force_lsn(mp, commit_lsn,
907 XFS_LOG_SYNC, NULL);
908 }
909 XFS_STATS_INC(xs_trans_sync); 901 XFS_STATS_INC(xs_trans_sync);
910 } else { 902 } else {
911 XFS_STATS_INC(xs_trans_async); 903 XFS_STATS_INC(xs_trans_async);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 647b6f1d8923..b8eef0549f3f 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -275,6 +275,10 @@ xfs_trans_read_buf_map(
275 XFS_BUF_UNDONE(bp); 275 XFS_BUF_UNDONE(bp);
276 xfs_buf_stale(bp); 276 xfs_buf_stale(bp);
277 xfs_buf_relse(bp); 277 xfs_buf_relse(bp);
278
279 /* bad CRC means corrupted metadata */
280 if (error == EFSBADCRC)
281 error = EFSCORRUPTED;
278 return error; 282 return error;
279 } 283 }
280#ifdef DEBUG 284#ifdef DEBUG
@@ -338,6 +342,9 @@ xfs_trans_read_buf_map(
338 if (tp->t_flags & XFS_TRANS_DIRTY) 342 if (tp->t_flags & XFS_TRANS_DIRTY)
339 xfs_force_shutdown(tp->t_mountp, 343 xfs_force_shutdown(tp->t_mountp,
340 SHUTDOWN_META_IO_ERROR); 344 SHUTDOWN_META_IO_ERROR);
345 /* bad CRC means corrupted metadata */
346 if (error == EFSBADCRC)
347 error = EFSCORRUPTED;
341 return error; 348 return error;
342 } 349 }
343 } 350 }
@@ -375,6 +382,10 @@ xfs_trans_read_buf_map(
375 if (tp->t_flags & XFS_TRANS_DIRTY) 382 if (tp->t_flags & XFS_TRANS_DIRTY)
376 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); 383 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
377 xfs_buf_relse(bp); 384 xfs_buf_relse(bp);
385
386 /* bad CRC means corrupted metadata */
387 if (error == EFSBADCRC)
388 error = EFSCORRUPTED;
378 return error; 389 return error;
379 } 390 }
380#ifdef DEBUG 391#ifdef DEBUG
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 2ffd3e331b49..ae368165244d 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -81,20 +81,28 @@ xfs_calc_buf_res(
81 * on disk. Hence we need an inode reservation function that calculates all this 81 * on disk. Hence we need an inode reservation function that calculates all this
82 * correctly. So, we log: 82 * correctly. So, we log:
83 * 83 *
84 * - log op headers for object 84 * - 4 log op headers for object
85 * - for the ilf, the inode core and 2 forks
85 * - inode log format object 86 * - inode log format object
86 * - the entire inode contents (core + 2 forks) 87 * - the inode core
87 * - two bmap btree block headers 88 * - two inode forks containing bmap btree root blocks.
89 * - the btree data contained by both forks will fit into the inode size,
90 * hence when combined with the inode core above, we have a total of the
91 * actual inode size.
92 * - the BMBT headers need to be accounted separately, as they are
93 * additional to the records and pointers that fit inside the inode
94 * forks.
88 */ 95 */
89STATIC uint 96STATIC uint
90xfs_calc_inode_res( 97xfs_calc_inode_res(
91 struct xfs_mount *mp, 98 struct xfs_mount *mp,
92 uint ninodes) 99 uint ninodes)
93{ 100{
94 return ninodes * (sizeof(struct xlog_op_header) + 101 return ninodes *
95 sizeof(struct xfs_inode_log_format) + 102 (4 * sizeof(struct xlog_op_header) +
96 mp->m_sb.sb_inodesize + 103 sizeof(struct xfs_inode_log_format) +
97 2 * XFS_BMBT_BLOCK_LEN(mp)); 104 mp->m_sb.sb_inodesize +
105 2 * XFS_BMBT_BLOCK_LEN(mp));
98} 106}
99 107
100/* 108/*
@@ -204,6 +212,19 @@ xfs_calc_rename_reservation(
204} 212}
205 213
206/* 214/*
 215 * For removing an inode from the unlinked list, we can modify:
216 * the agi hash list and counters: sector size
217 * the on disk inode before ours in the agi hash list: inode cluster size
218 */
219STATIC uint
220xfs_calc_iunlink_remove_reservation(
221 struct xfs_mount *mp)
222{
223 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
224 max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
225}
226
227/*
207 * For creating a link to an inode: 228 * For creating a link to an inode:
208 * the parent directory inode: inode size 229 * the parent directory inode: inode size
209 * the linked inode: inode size 230 * the linked inode: inode size
@@ -220,6 +241,7 @@ xfs_calc_link_reservation(
220 struct xfs_mount *mp) 241 struct xfs_mount *mp)
221{ 242{
222 return XFS_DQUOT_LOGRES(mp) + 243 return XFS_DQUOT_LOGRES(mp) +
244 xfs_calc_iunlink_remove_reservation(mp) +
223 MAX((xfs_calc_inode_res(mp, 2) + 245 MAX((xfs_calc_inode_res(mp, 2) +
224 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 246 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
225 XFS_FSB_TO_B(mp, 1))), 247 XFS_FSB_TO_B(mp, 1))),
@@ -229,6 +251,18 @@ xfs_calc_link_reservation(
229} 251}
230 252
231/* 253/*
 254 * For adding an inode to the unlinked list we can modify:
255 * the agi hash list: sector size
256 * the unlinked inode: inode size
257 */
258STATIC uint
259xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
260{
261 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
262 xfs_calc_inode_res(mp, 1);
263}
264
265/*
232 * For removing a directory entry we can modify: 266 * For removing a directory entry we can modify:
233 * the parent directory inode: inode size 267 * the parent directory inode: inode size
234 * the removed inode: inode size 268 * the removed inode: inode size
@@ -245,10 +279,11 @@ xfs_calc_remove_reservation(
245 struct xfs_mount *mp) 279 struct xfs_mount *mp)
246{ 280{
247 return XFS_DQUOT_LOGRES(mp) + 281 return XFS_DQUOT_LOGRES(mp) +
248 MAX((xfs_calc_inode_res(mp, 2) + 282 xfs_calc_iunlink_add_reservation(mp) +
283 MAX((xfs_calc_inode_res(mp, 1) +
249 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 284 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
250 XFS_FSB_TO_B(mp, 1))), 285 XFS_FSB_TO_B(mp, 1))),
251 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + 286 (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
252 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), 287 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
253 XFS_FSB_TO_B(mp, 1)))); 288 XFS_FSB_TO_B(mp, 1))));
254} 289}
@@ -343,6 +378,20 @@ xfs_calc_create_reservation(
343 378
344} 379}
345 380
381STATIC uint
382xfs_calc_create_tmpfile_reservation(
383 struct xfs_mount *mp)
384{
385 uint res = XFS_DQUOT_LOGRES(mp);
386
387 if (xfs_sb_version_hascrc(&mp->m_sb))
388 res += xfs_calc_icreate_resv_alloc(mp);
389 else
390 res += xfs_calc_create_resv_alloc(mp);
391
392 return res + xfs_calc_iunlink_add_reservation(mp);
393}
394
346/* 395/*
347 * Making a new directory is the same as creating a new file. 396 * Making a new directory is the same as creating a new file.
348 */ 397 */
@@ -383,9 +432,9 @@ xfs_calc_ifree_reservation(
383{ 432{
384 return XFS_DQUOT_LOGRES(mp) + 433 return XFS_DQUOT_LOGRES(mp) +
385 xfs_calc_inode_res(mp, 1) + 434 xfs_calc_inode_res(mp, 1) +
386 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + 435 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
387 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + 436 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
388 max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) + 437 xfs_calc_iunlink_remove_reservation(mp) +
389 xfs_calc_buf_res(1, 0) + 438 xfs_calc_buf_res(1, 0) +
390 xfs_calc_buf_res(2 + mp->m_ialloc_blks + 439 xfs_calc_buf_res(2 + mp->m_ialloc_blks +
391 mp->m_in_maxlevels, 0) + 440 mp->m_in_maxlevels, 0) +
@@ -644,15 +693,14 @@ xfs_calc_qm_setqlim_reservation(
644 693
645/* 694/*
646 * Allocating quota on disk if needed. 695 * Allocating quota on disk if needed.
647 * the write transaction log space: M_RES(mp)->tr_write.tr_logres 696 * the write transaction log space for quota file extent allocation
648 * the unit of quota allocation: one system block size 697 * the unit of quota allocation: one system block size
649 */ 698 */
650STATIC uint 699STATIC uint
651xfs_calc_qm_dqalloc_reservation( 700xfs_calc_qm_dqalloc_reservation(
652 struct xfs_mount *mp) 701 struct xfs_mount *mp)
653{ 702{
654 ASSERT(M_RES(mp)->tr_write.tr_logres); 703 return xfs_calc_write_reservation(mp) +
655 return M_RES(mp)->tr_write.tr_logres +
656 xfs_calc_buf_res(1, 704 xfs_calc_buf_res(1,
657 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); 705 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
658} 706}
@@ -729,6 +777,11 @@ xfs_trans_resv_calc(
729 resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; 777 resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
730 resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; 778 resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
731 779
780 resp->tr_create_tmpfile.tr_logres =
781 xfs_calc_create_tmpfile_reservation(mp);
782 resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
783 resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
784
732 resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); 785 resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
733 resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; 786 resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
734 resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; 787 resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
@@ -784,7 +837,6 @@ xfs_trans_resv_calc(
 784	/* The following transactions are logged in logical format */ 837	/* The following transactions are logged in logical format */
785 resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); 838 resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
786 resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); 839 resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
787 resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
788 resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); 840 resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
789 resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); 841 resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
790 resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); 842 resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
index de7de9aaad8a..1097d14cd583 100644
--- a/fs/xfs/xfs_trans_resv.h
+++ b/fs/xfs/xfs_trans_resv.h
@@ -38,11 +38,11 @@ struct xfs_trans_resv {
38 struct xfs_trans_res tr_remove; /* unlink trans */ 38 struct xfs_trans_res tr_remove; /* unlink trans */
39 struct xfs_trans_res tr_symlink; /* symlink trans */ 39 struct xfs_trans_res tr_symlink; /* symlink trans */
40 struct xfs_trans_res tr_create; /* create trans */ 40 struct xfs_trans_res tr_create; /* create trans */
41 struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */
41 struct xfs_trans_res tr_mkdir; /* mkdir trans */ 42 struct xfs_trans_res tr_mkdir; /* mkdir trans */
42 struct xfs_trans_res tr_ifree; /* inode free trans */ 43 struct xfs_trans_res tr_ifree; /* inode free trans */
43 struct xfs_trans_res tr_ichange; /* inode update trans */ 44 struct xfs_trans_res tr_ichange; /* inode update trans */
44 struct xfs_trans_res tr_growdata; /* fs data section grow trans */ 45 struct xfs_trans_res tr_growdata; /* fs data section grow trans */
45 struct xfs_trans_res tr_swrite; /* sync write inode trans */
46 struct xfs_trans_res tr_addafork; /* add inode attr fork trans */ 46 struct xfs_trans_res tr_addafork; /* add inode attr fork trans */
47 struct xfs_trans_res tr_writeid; /* write setuid/setgid file */ 47 struct xfs_trans_res tr_writeid; /* write setuid/setgid file */
48 struct xfs_trans_res tr_attrinval; /* attr fork buffer 48 struct xfs_trans_res tr_attrinval; /* attr fork buffer
@@ -100,6 +100,7 @@ struct xfs_trans_resv {
100#define XFS_ITRUNCATE_LOG_COUNT 2 100#define XFS_ITRUNCATE_LOG_COUNT 2
101#define XFS_INACTIVE_LOG_COUNT 2 101#define XFS_INACTIVE_LOG_COUNT 2
102#define XFS_CREATE_LOG_COUNT 2 102#define XFS_CREATE_LOG_COUNT 2
103#define XFS_CREATE_TMPFILE_LOG_COUNT 2
103#define XFS_MKDIR_LOG_COUNT 3 104#define XFS_MKDIR_LOG_COUNT 3
104#define XFS_SYMLINK_LOG_COUNT 3 105#define XFS_SYMLINK_LOG_COUNT 3
105#define XFS_REMOVE_LOG_COUNT 2 106#define XFS_REMOVE_LOG_COUNT 2