author	James Morris <james.l.morris@oracle.com>	2014-06-24 04:46:07 -0400
committer	James Morris <james.l.morris@oracle.com>	2014-06-24 04:46:07 -0400
commit	f01387d2693813eb5271a3448e6a082322c7d75d (patch)
tree	b591ca73c85276bae53d7db57ff1565be45a29da /fs
parent	92953ff38ba59b4f7b1a54ab28b84be35fafaecc (diff)
parent	1860e379875dfe7271c649058aeddffe5afd9d0d (diff)
Merge commit 'v3.15' into next
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/vfs_file.c	2
-rw-r--r--	fs/9p/vfs_inode.c	2
-rw-r--r--	fs/Kconfig	1
-rw-r--r--	fs/Makefile	3
-rw-r--r--	fs/adfs/super.c	3
-rw-r--r--	fs/affs/affs.h	20
-rw-r--r--	fs/affs/amigaffs.c	23
-rw-r--r--	fs/affs/dir.c	28
-rw-r--r--	fs/affs/inode.c	2
-rw-r--r--	fs/affs/namei.c	32
-rw-r--r--	fs/affs/super.c	11
-rw-r--r--	fs/afs/cmservice.c	19
-rw-r--r--	fs/afs/inode.c	2
-rw-r--r--	fs/afs/internal.h	1
-rw-r--r--	fs/afs/rxrpc.c	84
-rw-r--r--	fs/aio.c	162
-rw-r--r--	fs/autofs4/dev-ioctl.c	3
-rw-r--r--	fs/autofs4/root.c	4
-rw-r--r--	fs/befs/Makefile	2
-rw-r--r--	fs/befs/befs.h	3
-rw-r--r--	fs/befs/btree.c	93
-rw-r--r--	fs/befs/datastream.c	87
-rw-r--r--	fs/befs/debug.c	74
-rw-r--r--	fs/befs/inode.c	10
-rw-r--r--	fs/befs/io.c	24
-rw-r--r--	fs/befs/linuxvfs.c	113
-rw-r--r--	fs/bfs/inode.c	4
-rw-r--r--	fs/binfmt_elf.c	13
-rw-r--r--	fs/binfmt_misc.c	1
-rw-r--r--	fs/bio-integrity.c	100
-rw-r--r--	fs/bio.c	13
-rw-r--r--	fs/block_dev.c	8
-rw-r--r--	fs/btrfs/async-thread.c	850
-rw-r--r--	fs/btrfs/async-thread.h	121
-rw-r--r--	fs/btrfs/backref.c	117
-rw-r--r--	fs/btrfs/btrfs_inode.h	14
-rw-r--r--	fs/btrfs/compression.c	2
-rw-r--r--	fs/btrfs/ctree.c	105
-rw-r--r--	fs/btrfs/ctree.h	100
-rw-r--r--	fs/btrfs/delayed-inode.c	6
-rw-r--r--	fs/btrfs/delayed-ref.c	29
-rw-r--r--	fs/btrfs/dev-replace.c	79
-rw-r--r--	fs/btrfs/disk-io.c	314
-rw-r--r--	fs/btrfs/extent-tree.c	99
-rw-r--r--	fs/btrfs/extent_io.c	23
-rw-r--r--	fs/btrfs/extent_io.h	1
-rw-r--r--	fs/btrfs/extent_map.c	56
-rw-r--r--	fs/btrfs/extent_map.h	10
-rw-r--r--	fs/btrfs/file.c	182
-rw-r--r--	fs/btrfs/inode-map.c	38
-rw-r--r--	fs/btrfs/inode.c	151
-rw-r--r--	fs/btrfs/ioctl.c	245
-rw-r--r--	fs/btrfs/ordered-data.c	68
-rw-r--r--	fs/btrfs/ordered-data.h	6
-rw-r--r--	fs/btrfs/qgroup.c	15
-rw-r--r--	fs/btrfs/raid56.c	21
-rw-r--r--	fs/btrfs/reada.c	4
-rw-r--r--	fs/btrfs/relocation.c	23
-rw-r--r--	fs/btrfs/root-tree.c	3
-rw-r--r--	fs/btrfs/scrub.c	205
-rw-r--r--	fs/btrfs/send.c	874
-rw-r--r--	fs/btrfs/super.c	78
-rw-r--r--	fs/btrfs/sysfs.c	33
-rw-r--r--	fs/btrfs/sysfs.h	5
-rw-r--r--	fs/btrfs/transaction.c	87
-rw-r--r--	fs/btrfs/transaction.h	3
-rw-r--r--	fs/btrfs/tree-log.c	236
-rw-r--r--	fs/btrfs/tree-log.h	18
-rw-r--r--	fs/btrfs/volumes.c	81
-rw-r--r--	fs/btrfs/volumes.h	1
-rw-r--r--	fs/buffer.c	8
-rw-r--r--	fs/cachefiles/bind.c	1
-rw-r--r--	fs/cachefiles/namei.c	7
-rw-r--r--	fs/cachefiles/rdwr.c	33
-rw-r--r--	fs/ceph/cache.c	1
-rw-r--r--	fs/ceph/cache.h	10
-rw-r--r--	fs/ceph/caps.c	11
-rw-r--r--	fs/ceph/debugfs.c	5
-rw-r--r--	fs/ceph/dir.c	86
-rw-r--r--	fs/ceph/export.c	267
-rw-r--r--	fs/ceph/file.c	23
-rw-r--r--	fs/ceph/inode.c	147
-rw-r--r--	fs/ceph/ioctl.c	11
-rw-r--r--	fs/ceph/locks.c	99
-rw-r--r--	fs/ceph/mds_client.c	97
-rw-r--r--	fs/ceph/mds_client.h	4
-rw-r--r--	fs/ceph/strings.c	1
-rw-r--r--	fs/ceph/super.c	1
-rw-r--r--	fs/ceph/super.h	4
-rw-r--r--	fs/ceph/xattr.c	48
-rw-r--r--	fs/cifs/cifsfs.c	20
-rw-r--r--	fs/cifs/cifsglob.h	8
-rw-r--r--	fs/cifs/cifsproto.h	3
-rw-r--r--	fs/cifs/cifssmb.c	3
-rw-r--r--	fs/cifs/file.c	162
-rw-r--r--	fs/cifs/inode.c	3
-rw-r--r--	fs/cifs/misc.c	74
-rw-r--r--	fs/cifs/smb1ops.c	11
-rw-r--r--	fs/cifs/smb2misc.c	18
-rw-r--r--	fs/cifs/smb2ops.c	14
-rw-r--r--	fs/cifs/smb2pdu.c	2
-rw-r--r--	fs/coda/coda_int.h	2
-rw-r--r--	fs/coda/inode.c	5
-rw-r--r--	fs/compat.c	162
-rw-r--r--	fs/compat_binfmt_elf.c	5
-rw-r--r--	fs/compat_ioctl.c	5
-rw-r--r--	fs/coredump.c	7
-rw-r--r--	fs/cramfs/inode.c	4
-rw-r--r--	fs/dcache.c	484
-rw-r--r--	fs/debugfs/inode.c	7
-rw-r--r--	fs/devpts/inode.c	1
-rw-r--r--	fs/direct-io.c	19
-rw-r--r--	fs/dlm/ast.c	3
-rw-r--r--	fs/dlm/dir.c	4
-rw-r--r--	fs/dlm/dlm_internal.h	2
-rw-r--r--	fs/dlm/lock.c	7
-rw-r--r--	fs/dlm/lockspace.c	8
-rw-r--r--	fs/dlm/lowcomms.c	2
-rw-r--r--	fs/dlm/member.c	27
-rw-r--r--	fs/dlm/recover.c	10
-rw-r--r--	fs/dlm/recoverd.c	34
-rw-r--r--	fs/drop_caches.c	16
-rw-r--r--	fs/ecryptfs/inode.c	2
-rw-r--r--	fs/ecryptfs/super.c	2
-rw-r--r--	fs/efivarfs/file.c	13
-rw-r--r--	fs/efs/super.c	3
-rw-r--r--	fs/exec.c	42
-rw-r--r--	fs/exofs/inode.c	2
-rw-r--r--	fs/exofs/ore_raid.c	4
-rw-r--r--	fs/exofs/super.c	2
-rw-r--r--	fs/ext2/acl.c	1
-rw-r--r--	fs/ext2/ialloc.c	2
-rw-r--r--	fs/ext2/inode.c	2
-rw-r--r--	fs/ext2/super.c	3
-rw-r--r--	fs/ext2/xattr_security.c	4
-rw-r--r--	fs/ext3/balloc.c	5
-rw-r--r--	fs/ext3/dir.c	2
-rw-r--r--	fs/ext3/ialloc.c	2
-rw-r--r--	fs/ext3/inode.c	88
-rw-r--r--	fs/ext3/super.c	4
-rw-r--r--	fs/ext3/xattr_security.c	5
-rw-r--r--	fs/ext4/balloc.c	2
-rw-r--r--	fs/ext4/ext4.h	28
-rw-r--r--	fs/ext4/ext4_jbd2.c	10
-rw-r--r--	fs/ext4/extents.c	861
-rw-r--r--	fs/ext4/extents_status.c	28
-rw-r--r--	fs/ext4/extents_status.h	9
-rw-r--r--	fs/ext4/file.c	7
-rw-r--r--	fs/ext4/inode.c	177
-rw-r--r--	fs/ext4/ioctl.c	24
-rw-r--r--	fs/ext4/mballoc.c	25
-rw-r--r--	fs/ext4/mballoc.h	4
-rw-r--r--	fs/ext4/move_extent.c	5
-rw-r--r--	fs/ext4/namei.c	480
-rw-r--r--	fs/ext4/page-io.c	5
-rw-r--r--	fs/ext4/super.c	91
-rw-r--r--	fs/ext4/xattr.c	82
-rw-r--r--	fs/ext4/xattr.h	6
-rw-r--r--	fs/f2fs/acl.c	8
-rw-r--r--	fs/f2fs/checkpoint.c	208
-rw-r--r--	fs/f2fs/data.c	106
-rw-r--r--	fs/f2fs/debug.c	12
-rw-r--r--	fs/f2fs/dir.c	85
-rw-r--r--	fs/f2fs/f2fs.h	105
-rw-r--r--	fs/f2fs/file.c	32
-rw-r--r--	fs/f2fs/gc.c	16
-rw-r--r--	fs/f2fs/inline.c	4
-rw-r--r--	fs/f2fs/inode.c	29
-rw-r--r--	fs/f2fs/namei.c	9
-rw-r--r--	fs/f2fs/node.c	334
-rw-r--r--	fs/f2fs/node.h	25
-rw-r--r--	fs/f2fs/recovery.c	37
-rw-r--r--	fs/f2fs/segment.c	222
-rw-r--r--	fs/f2fs/segment.h	75
-rw-r--r--	fs/f2fs/super.c	99
-rw-r--r--	fs/f2fs/xattr.c	7
-rw-r--r--	fs/fat/inode.c	4
-rw-r--r--	fs/fcntl.c	37
-rw-r--r--	fs/file.c	13
-rw-r--r--	fs/file_table.c	45
-rw-r--r--	fs/filesystems.c	2
-rw-r--r--	fs/freevxfs/vxfs_inode.c	2
-rw-r--r--	fs/freevxfs/vxfs_lookup.c	2
-rw-r--r--	fs/freevxfs/vxfs_super.c	1
-rw-r--r--	fs/fs-writeback.c	33
-rw-r--r--	fs/fuse/control.c	2
-rw-r--r--	fs/fuse/cuse.c	9
-rw-r--r--	fs/fuse/dev.c	14
-rw-r--r--	fs/fuse/dir.c	193
-rw-r--r--	fs/fuse/file.c	344
-rw-r--r--	fs/fuse/fuse_i.h	26
-rw-r--r--	fs/fuse/inode.c	46
-rw-r--r--	fs/gfs2/acl.c	23
-rw-r--r--	fs/gfs2/acl.h	2
-rw-r--r--	fs/gfs2/aops.c	132
-rw-r--r--	fs/gfs2/bmap.c	115
-rw-r--r--	fs/gfs2/bmap.h	2
-rw-r--r--	fs/gfs2/dir.c	23
-rw-r--r--	fs/gfs2/file.c	14
-rw-r--r--	fs/gfs2/glock.c	28
-rw-r--r--	fs/gfs2/glops.c	2
-rw-r--r--	fs/gfs2/incore.h	37
-rw-r--r--	fs/gfs2/inode.c	75
-rw-r--r--	fs/gfs2/lock_dlm.c	10
-rw-r--r--	fs/gfs2/log.c	102
-rw-r--r--	fs/gfs2/lops.c	85
-rw-r--r--	fs/gfs2/lops.h	5
-rw-r--r--	fs/gfs2/main.c	4
-rw-r--r--	fs/gfs2/meta_io.c	14
-rw-r--r--	fs/gfs2/meta_io.h	3
-rw-r--r--	fs/gfs2/ops_fstype.c	89
-rw-r--r--	fs/gfs2/quota.c	18
-rw-r--r--	fs/gfs2/recovery.c	16
-rw-r--r--	fs/gfs2/recovery.h	6
-rw-r--r--	fs/gfs2/rgrp.c	32
-rw-r--r--	fs/gfs2/super.c	41
-rw-r--r--	fs/gfs2/sys.c	7
-rw-r--r--	fs/gfs2/trans.c	29
-rw-r--r--	fs/gfs2/util.c	101
-rw-r--r--	fs/gfs2/util.h	31
-rw-r--r--	fs/hfs/inode.c	2
-rw-r--r--	fs/hfs/super.c	1
-rw-r--r--	fs/hfsplus/attributes.c	2
-rw-r--r--	fs/hfsplus/extents.c	16
-rw-r--r--	fs/hfsplus/hfsplus_fs.h	2
-rw-r--r--	fs/hfsplus/super.c	3
-rw-r--r--	fs/hostfs/hostfs_kern.c	2
-rw-r--r--	fs/hpfs/inode.c	2
-rw-r--r--	fs/hpfs/super.c	2
-rw-r--r--	fs/hugetlbfs/inode.c	22
-rw-r--r--	fs/inode.c	60
-rw-r--r--	fs/isofs/inode.c	3
-rw-r--r--	fs/jbd2/commit.c	77
-rw-r--r--	fs/jbd2/journal.c	10
-rw-r--r--	fs/jbd2/transaction.c	46
-rw-r--r--	fs/jffs2/compr_rtime.c	4
-rw-r--r--	fs/jffs2/fs.c	13
-rw-r--r--	fs/jffs2/nodelist.h	2
-rw-r--r--	fs/jffs2/nodemgmt.c	14
-rw-r--r--	fs/jffs2/super.c	1
-rw-r--r--	fs/jfs/inode.c	4
-rw-r--r--	fs/jfs/super.c	1
-rw-r--r--	fs/kernfs/Kconfig	7
-rw-r--r--	fs/kernfs/dir.c	762
-rw-r--r--	fs/kernfs/file.c	41
-rw-r--r--	fs/kernfs/inode.c	16
-rw-r--r--	fs/kernfs/kernfs-internal.h	15
-rw-r--r--	fs/kernfs/mount.c	50
-rw-r--r--	fs/kernfs/symlink.c	6
-rw-r--r--	fs/lockd/svc.c	1
-rw-r--r--	fs/locks.c	408
-rw-r--r--	fs/logfs/readwrite.c	2
-rw-r--r--	fs/mbcache.c	540
-rw-r--r--	fs/minix/inode.c	5
-rw-r--r--	fs/mount.h	5
-rw-r--r--	fs/namei.c	390
-rw-r--r--	fs/namespace.c	56
-rw-r--r--	fs/ncpfs/dir.c	69
-rw-r--r--	fs/ncpfs/file.c	24
-rw-r--r--	fs/ncpfs/getopt.c	12
-rw-r--r--	fs/ncpfs/inode.c	85
-rw-r--r--	fs/ncpfs/ioctl.c	17
-rw-r--r--	fs/ncpfs/mmap.c	2
-rw-r--r--	fs/ncpfs/ncp_fs.h	30
-rw-r--r--	fs/ncpfs/ncp_fs_sb.h	6
-rw-r--r--	fs/ncpfs/ncplib_kernel.c	28
-rw-r--r--	fs/ncpfs/sock.c	53
-rw-r--r--	fs/ncpfs/symlink.c	2
-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	2
-rw-r--r--	fs/nfs/callback_proc.c	19
-rw-r--r--	fs/nfs/dir.c	62
-rw-r--r--	fs/nfs/file.c	1
-rw-r--r--	fs/nfs/inode.c	36
-rw-r--r--	fs/nfs/internal.h	8
-rw-r--r--	fs/nfs/nfs3proc.c	36
-rw-r--r--	fs/nfs/nfs4_fs.h	11
-rw-r--r--	fs/nfs/nfs4client.c	7
-rw-r--r--	fs/nfs/nfs4proc.c	197
-rw-r--r--	fs/nfs/nfs4state.c	6
-rw-r--r--	fs/nfs/nfs4super.c	2
-rw-r--r--	fs/nfs/nfs4xdr.c	3
-rw-r--r--	fs/nfs/pnfs.c	17
-rw-r--r--	fs/nfs/proc.c	25
-rw-r--r--	fs/nfs/super.c	2
-rw-r--r--	fs/nfs/unlink.c	35
-rw-r--r--	fs/nfsd/acl.h	10
-rw-r--r--	fs/nfsd/auth.c	5
-rw-r--r--	fs/nfsd/nfs4acl.c	30
-rw-r--r--	fs/nfsd/nfs4callback.c	23
-rw-r--r--	fs/nfsd/nfs4proc.c	39
-rw-r--r--	fs/nfsd/nfs4state.c	68
-rw-r--r--	fs/nfsd/nfs4xdr.c	22
-rw-r--r--	fs/nfsd/nfsctl.c	5
-rw-r--r--	fs/nfsd/nfsd.h	2
-rw-r--r--	fs/nfsd/nfsfh.h	14
-rw-r--r--	fs/nfsd/nfsxdr.c	2
-rw-r--r--	fs/nfsd/vfs.c	17
-rw-r--r--	fs/nfsd/xdr4.h	2
-rw-r--r--	fs/nilfs2/cpfile.c	12
-rw-r--r--	fs/nilfs2/dat.c	12
-rw-r--r--	fs/nilfs2/file.c	1
-rw-r--r--	fs/nilfs2/inode.c	6
-rw-r--r--	fs/nilfs2/ioctl.c	137
-rw-r--r--	fs/nilfs2/sufile.c	295
-rw-r--r--	fs/nilfs2/sufile.h	2
-rw-r--r--	fs/nilfs2/super.c	1
-rw-r--r--	fs/nilfs2/the_nilfs.c	10
-rw-r--r--	fs/notify/fanotify/fanotify.c	63
-rw-r--r--	fs/notify/fanotify/fanotify.h	34
-rw-r--r--	fs/notify/fanotify/fanotify_user.c	199
-rw-r--r--	fs/ntfs/debug.c	58
-rw-r--r--	fs/ntfs/debug.h	7
-rw-r--r--	fs/ntfs/inode.c	4
-rw-r--r--	fs/ntfs/super.c	30
-rw-r--r--	fs/ocfs2/acl.c	1
-rw-r--r--	fs/ocfs2/alloc.c	3
-rw-r--r--	fs/ocfs2/aops.c	7
-rw-r--r--	fs/ocfs2/aops.h	5
-rw-r--r--	fs/ocfs2/buffer_head_io.c	2
-rw-r--r--	fs/ocfs2/cluster/sys.c	2
-rw-r--r--	fs/ocfs2/cluster/tcp.c	90
-rw-r--r--	fs/ocfs2/cluster/tcp_internal.h	2
-rw-r--r--	fs/ocfs2/dcache.c	61
-rw-r--r--	fs/ocfs2/dcache.h	12
-rw-r--r--	fs/ocfs2/dir.c	6
-rw-r--r--	fs/ocfs2/dlm/dlmdomain.c	27
-rw-r--r--	fs/ocfs2/dlm/dlmmaster.c	8
-rw-r--r--	fs/ocfs2/dlm/dlmrecovery.c	29
-rw-r--r--	fs/ocfs2/dlmglue.c	44
-rw-r--r--	fs/ocfs2/dlmglue.h	3
-rw-r--r--	fs/ocfs2/file.c	78
-rw-r--r--	fs/ocfs2/inode.c	61
-rw-r--r--	fs/ocfs2/inode.h	17
-rw-r--r--	fs/ocfs2/ioctl.c	5
-rw-r--r--	fs/ocfs2/journal.c	6
-rw-r--r--	fs/ocfs2/journal.h	11
-rw-r--r--	fs/ocfs2/locks.c	2
-rw-r--r--	fs/ocfs2/move_extents.c	7
-rw-r--r--	fs/ocfs2/namei.c	8
-rw-r--r--	fs/ocfs2/ocfs2.h	33
-rw-r--r--	fs/ocfs2/quota.h	2
-rw-r--r--	fs/ocfs2/quota_global.c	35
-rw-r--r--	fs/ocfs2/stackglue.c	22
-rw-r--r--	fs/ocfs2/suballoc.c	29
-rw-r--r--	fs/ocfs2/suballoc.h	4
-rw-r--r--	fs/ocfs2/super.c	55
-rw-r--r--	fs/ocfs2/sysfile.c	3
-rw-r--r--	fs/ocfs2/xattr.c	35
-rw-r--r--	fs/omfs/inode.c	2
-rw-r--r--	fs/open.c	94
-rw-r--r--	fs/openpromfs/inode.c	1
-rw-r--r--	fs/pipe.c	133
-rw-r--r--	fs/pnode.c	198
-rw-r--r--	fs/pnode.h	3
-rw-r--r--	fs/posix_acl.c	11
-rw-r--r--	fs/proc/Makefile	1
-rw-r--r--	fs/proc/array.c	4
-rw-r--r--	fs/proc/base.c	55
-rw-r--r--	fs/proc/fd.c	6
-rw-r--r--	fs/proc/inode.c	4
-rw-r--r--	fs/proc/internal.h	7
-rw-r--r--	fs/proc/meminfo.c	2
-rw-r--r--	fs/proc/namespaces.c	14
-rw-r--r--	fs/proc/proc_devtree.c	241
-rw-r--r--	fs/proc/root.c	5
-rw-r--r--	fs/proc/self.c	2
-rw-r--r--	fs/proc/stat.c	2
-rw-r--r--	fs/proc/task_mmu.c	5
-rw-r--r--	fs/proc/uptime.c	2
-rw-r--r--	fs/proc/vmcore.c	3
-rw-r--r--	fs/proc_namespace.c	1
-rw-r--r--	fs/pstore/inode.c	1
-rw-r--r--	fs/pstore/platform.c	1
-rw-r--r--	fs/pstore/ram.c	19
-rw-r--r--	fs/pstore/ram_core.c	4
-rw-r--r--	fs/qnx4/inode.c	1
-rw-r--r--	fs/qnx6/inode.c	1
-rw-r--r--	fs/quota/Kconfig	7
-rw-r--r--	fs/quota/dquot.c	4
-rw-r--r--	fs/read_write.c	36
-rw-r--r--	fs/reiserfs/dir.c	6
-rw-r--r--	fs/reiserfs/inode.c	2
-rw-r--r--	fs/reiserfs/reiserfs.h	1
-rw-r--r--	fs/reiserfs/super.c	4
-rw-r--r--	fs/romfs/super.c	1
-rw-r--r--	fs/splice.c	128
-rw-r--r--	fs/squashfs/super.c	1
-rw-r--r--	fs/super.c	7
-rw-r--r--	fs/sysfs/Kconfig	1
-rw-r--r--	fs/sysfs/dir.c	44
-rw-r--r--	fs/sysfs/file.c	118
-rw-r--r--	fs/sysfs/group.c	7
-rw-r--r--	fs/sysfs/mount.c	7
-rw-r--r--	fs/sysv/inode.c	3
-rw-r--r--	fs/timerfd.c	1
-rw-r--r--	fs/ubifs/file.c	1
-rw-r--r--	fs/ubifs/super.c	5
-rw-r--r--	fs/udf/file.c	2
-rw-r--r--	fs/udf/inode.c	4
-rw-r--r--	fs/udf/super.c	9
-rw-r--r--	fs/ufs/balloc.c	12
-rw-r--r--	fs/ufs/ialloc.c	4
-rw-r--r--	fs/ufs/inode.c	2
-rw-r--r--	fs/ufs/super.c	9
-rw-r--r--	fs/xfs/kmem.c	21
-rw-r--r--	fs/xfs/xfs_acl.c	2
-rw-r--r--	fs/xfs/xfs_ag.h	6
-rw-r--r--	fs/xfs/xfs_alloc.c	45
-rw-r--r--	fs/xfs/xfs_alloc_btree.c	16
-rw-r--r--	fs/xfs/xfs_aops.c	135
-rw-r--r--	fs/xfs/xfs_attr.c	24
-rw-r--r--	fs/xfs/xfs_attr_leaf.c	38
-rw-r--r--	fs/xfs/xfs_attr_list.c	1
-rw-r--r--	fs/xfs/xfs_attr_remote.c	23
-rw-r--r--	fs/xfs/xfs_bmap.c	198
-rw-r--r--	fs/xfs/xfs_bmap.h	15
-rw-r--r--	fs/xfs/xfs_bmap_btree.c	16
-rw-r--r--	fs/xfs/xfs_bmap_util.c	110
-rw-r--r--	fs/xfs/xfs_bmap_util.h	2
-rw-r--r--	fs/xfs/xfs_btree.c	14
-rw-r--r--	fs/xfs/xfs_buf.c	27
-rw-r--r--	fs/xfs/xfs_buf.h	14
-rw-r--r--	fs/xfs/xfs_buf_item.c	19
-rw-r--r--	fs/xfs/xfs_da_btree.c	19
-rw-r--r--	fs/xfs/xfs_da_btree.h	2
-rw-r--r--	fs/xfs/xfs_dinode.h	2
-rw-r--r--	fs/xfs/xfs_dir2.c	342
-rw-r--r--	fs/xfs/xfs_dir2_block.c	17
-rw-r--r--	fs/xfs/xfs_dir2_data.c	20
-rw-r--r--	fs/xfs/xfs_dir2_leaf.c	17
-rw-r--r--	fs/xfs/xfs_dir2_node.c	17
-rw-r--r--	fs/xfs/xfs_dquot.c	2
-rw-r--r--	fs/xfs/xfs_dquot_buf.c	11
-rw-r--r--	fs/xfs/xfs_error.c	27
-rw-r--r--	fs/xfs/xfs_error.h	1
-rw-r--r--	fs/xfs/xfs_export.c	2
-rw-r--r--	fs/xfs/xfs_file.c	54
-rw-r--r--	fs/xfs/xfs_format.h	2
-rw-r--r--	fs/xfs/xfs_ialloc.c	36
-rw-r--r--	fs/xfs/xfs_ialloc_btree.c	16
-rw-r--r--	fs/xfs/xfs_inode.c	124
-rw-r--r--	fs/xfs/xfs_inode.h	12
-rw-r--r--	fs/xfs/xfs_inode_buf.c	7
-rw-r--r--	fs/xfs/xfs_ioctl.c	28
-rw-r--r--	fs/xfs/xfs_iomap.c	10
-rw-r--r--	fs/xfs/xfs_iops.c	73
-rw-r--r--	fs/xfs/xfs_linux.h	2
-rw-r--r--	fs/xfs/xfs_log.c	63
-rw-r--r--	fs/xfs/xfs_log.h	2
-rw-r--r--	fs/xfs/xfs_log_cil.c	55
-rw-r--r--	fs/xfs/xfs_mount.c	5
-rw-r--r--	fs/xfs/xfs_qm.c	26
-rw-r--r--	fs/xfs/xfs_rtalloc.c	2
-rw-r--r--	fs/xfs/xfs_sb.c	21
-rw-r--r--	fs/xfs/xfs_sb.h	2
-rw-r--r--	fs/xfs/xfs_shared.h	4
-rw-r--r--	fs/xfs/xfs_super.c	7
-rw-r--r--	fs/xfs/xfs_symlink.c	9
-rw-r--r--	fs/xfs/xfs_symlink_remote.c	16
-rw-r--r--	fs/xfs/xfs_trace.h	2
-rw-r--r--	fs/xfs/xfs_trans.c	12
-rw-r--r--	fs/xfs/xfs_trans_buf.c	11
-rw-r--r--	fs/xfs/xfs_trans_resv.c	82
-rw-r--r--	fs/xfs/xfs_trans_resv.h	3
464 files changed, 12718 insertions, 8147 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a16b0ff497ca..d8223209d4b1 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -832,6 +832,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
 
 static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
@@ -839,6 +840,7 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
 	.close = v9fs_mmap_vm_close,
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index bb7991c7e5c7..53161ec058a7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -451,7 +451,7 @@ void v9fs_evict_inode(struct inode *inode)
 {
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	truncate_inode_pages(inode->i_mapping, 0);
+	truncate_inode_pages_final(inode->i_mapping);
 	clear_inode(inode);
 	filemap_fdatawrite(inode->i_mapping);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 7385e54be4b9..312393f32948 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -96,6 +96,7 @@ endif # BLOCK
 menu "Pseudo filesystems"
 
 source "fs/proc/Kconfig"
+source "fs/kernfs/Kconfig"
 source "fs/sysfs/Kconfig"
 
 config TMPFS
diff --git a/fs/Makefile b/fs/Makefile
index 47ac07bb4acc..f9cb9876e466 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -52,7 +52,8 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
+obj-$(CONFIG_KERNFS)		+= kernfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7b3003cb6f1b..9852bdf34d76 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -212,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options)
 
 static int adfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return parse_options(sb, data);
 }
@@ -265,7 +266,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					     sizeof(struct adfs_inode_info),
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 3952121f2f28..25b23b1e7f22 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -5,14 +5,6 @@
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 
-/* AmigaOS allows file names with up to 30 characters length.
- * Names longer than that will be silently truncated. If you
- * want to disallow this, comment out the following #define.
- * Creating filesystem objects with longer names will then
- * result in an error (ENAMETOOLONG).
- */
-/*#define AFFS_NO_TRUNCATE */
-
 /* Ugly macros make the code more pretty. */
 
 #define GET_END_PTR(st,p,sz)		((st *)((char *)(p)+((sz)-sizeof(st))))
@@ -28,7 +20,6 @@
 
 #define AFFS_CACHE_SIZE		PAGE_SIZE
 
-#define AFFS_MAX_PREALLOC	32
 #define AFFS_LC_SIZE		(AFFS_CACHE_SIZE/sizeof(u32)/2)
 #define AFFS_AC_SIZE		(AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
 #define AFFS_AC_MASK		(AFFS_AC_SIZE-1)
@@ -118,6 +109,7 @@ struct affs_sb_info {
 #define SF_OFS		0x0200		/* Old filesystem */
 #define SF_PREFIX	0x0400		/* Buffer for prefix is allocated */
 #define SF_VERBOSE	0x0800		/* Talk about fs when mounting */
+#define SF_NO_TRUNCATE	0x1000		/* Don't truncate filenames */
 
 /* short cut to get to the affs specific sb data */
 static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
@@ -137,9 +129,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
 extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
 extern umode_t prot_to_mode(u32 prot);
 extern void mode_to_prot(struct inode *inode);
-extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...);
-extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...);
-extern int affs_check_name(const unsigned char *name, int len);
+extern void affs_error(struct super_block *sb, const char *function,
+		       const char *fmt, ...);
+extern void affs_warning(struct super_block *sb, const char *function,
+			 const char *fmt, ...);
+extern bool affs_nofilenametruncate(const struct dentry *dentry);
+extern int affs_check_name(const unsigned char *name, int len,
+			   bool notruncate);
 extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry);
 
 /* bitmap. c */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index d9a43674cb94..533a322c41c0 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -471,20 +471,27 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
 		function,ErrorBuffer);
 }
 
+bool
+affs_nofilenametruncate(const struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE;
+
+}
+
 /* Check if the name is valid for a affs object. */
 
 int
-affs_check_name(const unsigned char *name, int len)
+affs_check_name(const unsigned char *name, int len, bool notruncate)
 {
 	int i;
 
-	if (len > 30)
-#ifdef AFFS_NO_TRUNCATE
-		return -ENAMETOOLONG;
-#else
-		len = 30;
-#endif
-
+	if (len > 30) {
+		if (notruncate)
+			return -ENAMETOOLONG;
+		else
+			len = 30;
+	}
 	for (i = 0; i < len; i++) {
 		if (name[i] < ' ' || name[i] == ':'
 		    || (name[i] > 0x7e && name[i] < 0xa0))
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index f1eba8c3644e..cbbda476a805 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -52,8 +52,10 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	int			 hash_pos;
 	int			 chain_pos;
 	u32			 ino;
+	int			 error = 0;
 
-	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
+	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",
+		 inode->i_ino, (unsigned long)ctx->pos);
 
 	if (ctx->pos < 2) {
 		file->private_data = (void *)0;
@@ -72,7 +74,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	}
 	dir_bh = affs_bread(sb, inode->i_ino);
 	if (!dir_bh)
-		goto readdir_out;
+		goto out_unlock_dir;
 
 	/* If the directory hasn't changed since the last call to readdir(),
 	 * we can jump directly to where we left off.
@@ -88,7 +90,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 		fh_bh = affs_bread(sb, ino);
 		if (!fh_bh) {
 			affs_error(sb, "readdir","Cannot read block %d", i);
-			return -EIO;
+			error = -EIO;
+			goto out_brelse_dir;
 		}
 		ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 		affs_brelse(fh_bh);
@@ -107,29 +110,34 @@ inside:
 		do {
 			fh_bh = affs_bread(sb, ino);
 			if (!fh_bh) {
-				affs_error(sb, "readdir","Cannot read block %d", ino);
+				affs_error(sb, "readdir",
+					   "Cannot read block %d", ino);
 				break;
 			}
 
 			namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
 			name = AFFS_TAIL(sb, fh_bh)->name + 1;
-			pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
+			pr_debug("AFFS: readdir(): dir_emit(\"%.*s\", "
+				 "ino=%u), hash=%d, f_pos=%x\n",
 				 namelen, name, ino, hash_pos, (u32)ctx->pos);
+
 			if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
-				goto readdir_done;
+				goto done;
 			ctx->pos++;
 			ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 			affs_brelse(fh_bh);
 			fh_bh = NULL;
 		} while (ino);
 	}
-readdir_done:
+done:
 	file->f_version = inode->i_version;
 	file->private_data = (void *)(long)ino;
+	affs_brelse(fh_bh);
 
-readdir_out:
+out_brelse_dir:
 	affs_brelse(dir_bh);
-	affs_brelse(fh_bh);
+
+out_unlock_dir:
 	affs_unlock_dir(inode);
-	return 0;
+	return error;
 }
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0e092d08680e..96df91e8c334 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -259,7 +259,7 @@ affs_evict_inode(struct inode *inode)
 {
 	unsigned long cache_page;
 	pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index c36cbb4537a2..6dae1ccd176d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb)
  * Note: the dentry argument is the parent dentry.
  */
 static inline int
-__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
 	int i;
 
-	i = affs_check_name(qstr->name, qstr->len);
+	i = affs_check_name(qstr->name, qstr->len, notruncate);
 	if (i)
 		return i;
 
@@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 static int
 affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
-	return __affs_hash_dentry(qstr, affs_toupper);
+	return __affs_hash_dentry(qstr, affs_toupper,
+				  affs_nofilenametruncate(dentry));
+
 }
+
 static int
 affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
-	return __affs_hash_dentry(qstr, affs_intl_toupper);
+	return __affs_hash_dentry(qstr, affs_intl_toupper,
+				  affs_nofilenametruncate(dentry));
+
 }
 
 static inline int __affs_compare_dentry(unsigned int len,
-		const char *str, const struct qstr *name, toupper_t toupper)
+		const char *str, const struct qstr *name, toupper_t toupper,
+		bool notruncate)
 {
 	const u8 *aname = str;
 	const u8 *bname = name->name;
@@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len,
 	 * must be valid. 'name' must be validated first.
 	 */
 
-	if (affs_check_name(name->name, name->len))
+	if (affs_check_name(name->name, name->len, notruncate))
 		return 1;
 
 	/*
@@ -126,13 +132,18 @@ static int
 affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(len, str, name, affs_toupper);
+
+	return __affs_compare_dentry(len, str, name, affs_toupper,
+				     affs_nofilenametruncate(parent));
 }
+
 static int
 affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
+	return __affs_compare_dentry(len, str, name, affs_intl_toupper,
+				     affs_nofilenametruncate(parent));
+
 }
 
 /*
@@ -411,7 +422,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		(u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
 		(u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
 
-	retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len);
+	retval = affs_check_name(new_dentry->d_name.name,
+				 new_dentry->d_name.len,
+				 affs_nofilenametruncate(old_dentry));
+
 	if (retval)
 		return retval;
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d098731b82ff..895ac7dc9dbf 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -128,7 +128,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
 					     sizeof(struct affs_inode_info),
@@ -163,7 +163,7 @@ static const struct super_operations affs_sops = {
 };
 
 enum {
-	Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect,
+	Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
 	Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
 	Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
 };
@@ -172,6 +172,7 @@ static const match_table_t tokens = {
 	{Opt_bs, "bs=%u"},
 	{Opt_mode, "mode=%o"},
 	{Opt_mufs, "mufs"},
+	{Opt_notruncate, "nofilenametruncate"},
 	{Opt_prefix, "prefix=%s"},
 	{Opt_protect, "protect"},
 	{Opt_reserved, "reserved=%u"},
@@ -233,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
 		case Opt_mufs:
 			*mount_opts |= SF_MUFS;
 			break;
+		case Opt_notruncate:
+			*mount_opts |= SF_NO_TRUNCATE;
+			break;
 		case Opt_prefix:
 			*prefix = match_strdup(&args[0]);
 			if (!*prefix)
@@ -336,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 			   &blocksize,&sbi->s_prefix,
 			   sbi->s_volume, &mount_flags)) {
 		printk(KERN_ERR "AFFS: Error parsing options\n");
-		kfree(sbi->s_prefix);
-		kfree(sbi);
 		return -EINVAL;
 	}
 	/* N.B. after this point s_prefix must be released */
@@ -530,6 +532,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 
 	pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
 
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 
 	memcpy(volume, sbi->s_volume, 32);
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 1c8c6cc6de30..4b0eff6da674 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call)
 {
 	_enter("");
 
+	/* Break the callbacks here so that we do it after the final ACK is
+	 * received.  The step number here must match the final number in
+	 * afs_deliver_cb_callback().
+	 */
+	if (call->unmarshall == 6) {
+		ASSERT(call->server && call->count && call->request);
+		afs_break_callbacks(call->server, call->count, call->request);
+	}
+
 	afs_put_server(call->server);
 	call->server = NULL;
 	kfree(call->buffer);
@@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 		_debug("trailer");
 		if (skb->len != 0)
 			return -EBADMSG;
+
+		/* Record that the message was unmarshalled successfully so
+		 * that the call destructor can know do the callback breaking
+		 * work, even if the final ACK isn't received.
+		 *
+		 * If the step number changes, then afs_cm_destructor() must be
+		 * updated also.
+		 */
+		call->unmarshall++;
+	case 6:
 		break;
 	}
 
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index ce25d755b7aa..294671288449 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -422,7 +422,7 @@ void afs_evict_inode(struct inode *inode)
 
 	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 
 	afs_give_up_callback(vnode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6621f8008122..590b55f46d61 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -75,6 +75,7 @@ struct afs_call {
 	const struct afs_call_type *type;	/* type of call */
 	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
 	wait_queue_head_t	waitq;		/* processes awaiting completion */
+	void (*async_workfn)(struct afs_call *call); /* asynchronous work function */
 	struct work_struct	async_work;	/* asynchronous work processor */
 	struct work_struct	work;		/* actual work processor */
 	struct sk_buff_head	rx_queue;	/* received packets */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8ad8c2a0703a..03a3beb17004 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *);
 static int afs_wait_for_call_to_complete(struct afs_call *);
 static void afs_wake_up_async_call(struct afs_call *);
 static int afs_dont_wait_for_call_to_complete(struct afs_call *);
-static void afs_process_async_call(struct work_struct *);
+static void afs_process_async_call(struct afs_call *);
 static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
 static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
 
@@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *);
 static struct sk_buff_head afs_incoming_calls;
 static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
 
+static void afs_async_workfn(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+	call->async_workfn(call);
+}
+
 /*
  * open an RxRPC socket and bind it to be a server for callback notifications
  * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
@@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call)
 }
 
 /*
+ * End a call but do not free it
+ */
+static void afs_end_call_nofree(struct afs_call *call)
+{
+	if (call->rxcall) {
+		rxrpc_kernel_end_call(call->rxcall);
+		call->rxcall = NULL;
+	}
+	if (call->type->destructor)
+		call->type->destructor(call);
+}
+
+/*
+ * End a call and free it
+ */
+static void afs_end_call(struct afs_call *call)
+{
+	afs_end_call_nofree(call);
+	afs_free_call(call);
+}
+
+/*
  * allocate a call with flat request and reply buffers
  */
 struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
@@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	       atomic_read(&afs_outstanding_calls));
 
 	call->wait_mode = wait_mode;
-	INIT_WORK(&call->async_work, afs_process_async_call);
+	call->async_workfn = afs_process_async_call;
+	INIT_WORK(&call->async_work, afs_async_workfn);
 
 	memset(&srx, 0, sizeof(srx));
 	srx.srx_family = AF_RXRPC;
@@ -383,11 +413,8 @@ error_do_abort:
 	rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
 	while ((skb = skb_dequeue(&call->rx_queue)))
 		afs_free_skb(skb);
-	rxrpc_kernel_end_call(rxcall);
-	call->rxcall = NULL;
 error_kill_call:
-	call->type->destructor(call);
-	afs_free_call(call);
+	afs_end_call(call);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call)
 	if (call->state >= AFS_CALL_COMPLETE) {
 		while ((skb = skb_dequeue(&call->rx_queue)))
 			afs_free_skb(skb);
-		if (call->incoming) {
-			rxrpc_kernel_end_call(call->rxcall);
-			call->rxcall = NULL;
-			call->type->destructor(call);
-			afs_free_call(call);
-		}
+		if (call->incoming)
+			afs_end_call(call);
 	}
 
 	_leave("");
@@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 	}
 
 	_debug("call complete");
-	rxrpc_kernel_end_call(call->rxcall);
-	call->rxcall = NULL;
-	call->type->destructor(call);
-	afs_free_call(call);
+	afs_end_call(call);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
 /*
  * delete an asynchronous call
  */
-static void afs_delete_async_call(struct work_struct *work)
+static void afs_delete_async_call(struct afs_call *call)
 {
-	struct afs_call *call =
-		container_of(work, struct afs_call, async_work);
-
 	_enter("");
 
 	afs_free_call(call);
@@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work)
  * - on a multiple-thread workqueue this work item may try to run on several
  *   CPUs at the same time
  */
-static void afs_process_async_call(struct work_struct *work)
+static void afs_process_async_call(struct afs_call *call)
 {
-	struct afs_call *call =
-		container_of(work, struct afs_call, async_work);
-
 	_enter("");
 
 	if (!skb_queue_empty(&call->rx_queue))
@@ -637,14 +651,11 @@ static void afs_process_async_call(struct work_struct *work)
 		call->reply = NULL;
 
 		/* kill the call */
-		rxrpc_kernel_end_call(call->rxcall);
-		call->rxcall = NULL;
-		if (call->type->destructor)
-			call->type->destructor(call);
+		afs_end_call_nofree(call);
 
 		/* we can't just delete the call because the work item may be
 		 * queued */
-		PREPARE_WORK(&call->async_work, afs_delete_async_call);
+		call->async_workfn = afs_delete_async_call;
 		queue_work(afs_async_calls, &call->async_work);
 	}
 
@@ -685,7 +696,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
 		return;
 	}
 
-	INIT_WORK(&call->async_work, afs_process_async_call);
+	call->async_workfn = afs_process_async_call;
+	INIT_WORK(&call->async_work, afs_async_workfn);
 	call->wait_mode = &afs_async_incoming_call;
 	call->type = &afs_RXCMxxxx;
 	init_waitqueue_head(&call->waitq);
@@ -782,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call)
 		_debug("oom");
 		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
 	default:
-		rxrpc_kernel_end_call(call->rxcall);
-		call->rxcall = NULL;
-		call->type->destructor(call);
-		afs_free_call(call);
+		afs_end_call(call);
 		_leave(" [error]");
 		return;
 	}
@@ -815,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	call->state = AFS_CALL_AWAIT_ACK;
 	n = rxrpc_kernel_send_data(call->rxcall, &msg, len);
 	if (n >= 0) {
+		/* Success */
 		_leave(" [replied]");
 		return;
 	}
+
 	if (n == -ENOMEM) {
 		_debug("oom");
 		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
 	}
-	rxrpc_kernel_end_call(call->rxcall);
-	call->rxcall = NULL;
-	call->type->destructor(call);
-	afs_free_call(call);
+	afs_end_call(call);
 	_leave(" [error]");
 }
 
diff --git a/fs/aio.c b/fs/aio.c
index 062a5f6a1448..a0ed6c7d2cd2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -52,7 +52,8 @@
 struct aio_ring {
 	unsigned	id;	/* kernel internal index number */
 	unsigned	nr;	/* number of io_events */
-	unsigned	head;
+	unsigned	head;	/* Written to by userland or under ring_lock
+				 * mutex by aio_read_events_ring(). */
 	unsigned	tail;
 
 	unsigned	magic;
@@ -111,6 +112,11 @@ struct kioctx {
 
 	struct work_struct	free_work;
 
+	/*
+	 * signals when all in-flight requests are done
+	 */
+	struct completion *requests_done;
+
 	struct {
 		/*
 		 * This counts the number of available slots in the ringbuffer,
@@ -243,6 +249,11 @@ static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;
 
+	/* Disconnect the kiotx from the ring file.  This prevents future
+	 * accesses to the kioctx from page migration.
+	 */
+	put_aio_ring_file(ctx);
+
 	for (i = 0; i < ctx->nr_pages; i++) {
 		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -254,8 +265,6 @@ static void aio_free_ring(struct kioctx *ctx)
 		put_page(page);
 	}
 
-	put_aio_ring_file(ctx);
-
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
 		kfree(ctx->ring_pages);
 		ctx->ring_pages = NULL;
@@ -283,29 +292,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 {
 	struct kioctx *ctx;
 	unsigned long flags;
+	pgoff_t idx;
 	int rc;
 
 	rc = 0;
 
-	/* Make sure the old page hasn't already been changed */
+	/* mapping->private_lock here protects against the kioctx teardown.  */
 	spin_lock(&mapping->private_lock);
 	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			if (ctx->ring_pages[idx] != old)
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	if (!ctx) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* The ring_lock mutex.  The prevents aio_read_events() from writing
+	 * to the ring's head, and prevents page migration from mucking in
+	 * a partially initialized kiotx.
+	 */
+	if (!mutex_trylock(&ctx->ring_lock)) {
+		rc = -EAGAIN;
+		goto out;
+	}
+
+	idx = old->index;
+	if (idx < (pgoff_t)ctx->nr_pages) {
+		/* Make sure the old page hasn't already been changed */
+		if (ctx->ring_pages[idx] != old)
+			rc = -EAGAIN;
 	} else
 		rc = -EINVAL;
-	spin_unlock(&mapping->private_lock);
 
 	if (rc != 0)
-		return rc;
+		goto out_unlock;
 
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
@@ -314,38 +332,26 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
-		return rc;
+		goto out_unlock;
 	}
 
-	/* We can potentially race against kioctx teardown here.  Use the
-	 * address_space's private data lock to protect the mapping's
-	 * private_data.
+	/* Take completion_lock to prevent other writes to the ring buffer
+	 * while the old page is copied to the new.  This prevents new
+	 * events from being lost.
 	 */
-	spin_lock(&mapping->private_lock);
-	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		migrate_page_copy(new, old);
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			/* And only do the move if things haven't changed */
-			if (ctx->ring_pages[idx] == old)
-				ctx->ring_pages[idx] = new;
-			else
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	} else
-		rc = -EBUSY;
-	spin_unlock(&mapping->private_lock);
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	BUG_ON(ctx->ring_pages[idx] != old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	if (rc == MIGRATEPAGE_SUCCESS)
-		put_page(old);
-	else
-		put_page(new);
+	/* The old page is no longer accessible. */
+	put_page(old);
 
+out_unlock:
+	mutex_unlock(&ctx->ring_lock);
+out:
+	spin_unlock(&mapping->private_lock);
 	return rc;
 }
 #endif
@@ -380,7 +386,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	file = aio_private_file(ctx, nr_pages);
 	if (IS_ERR(file)) {
 		ctx->aio_ring_file = NULL;
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->aio_ring_file = file;
@@ -415,7 +421,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	if (unlikely(i != nr_pages)) {
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -429,7 +435,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
@@ -507,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 {
 	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
 
+	/* At this point we know that there are no any in-flight requests */
+	if (ctx->requests_done)
+		complete(ctx->requests_done);
+
 	INIT_WORK(&ctx->free_work, free_ioctx);
 	schedule_work(&ctx->free_work);
 }
@@ -556,6 +566,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	rcu_read_unlock();
 	spin_unlock(&mm->ioctx_lock);
 
+	/* While kioctx setup is in progress,
+	 * we are protected from page migration
+	 * changes ring_pages by ->ring_lock.
+	 */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->id = ctx->id;
 	kunmap_atomic(ring);
@@ -640,24 +654,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->max_reqs = nr_events;
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users))
-		goto err;
-
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
-		goto err;
-
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
+	/* Protect against page migration throughout kiotx setup by keeping
+	 * the ring_lock mutex held until setup is complete. */
+	mutex_lock(&ctx->ring_lock);
 	init_waitqueue_head(&ctx->wait);
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
+	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+		goto err;
+
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+		goto err;
+
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
 	if (!ctx->cpu)
 		goto err;
 
-	if (aio_setup_ring(ctx) < 0)
+	err = aio_setup_ring(ctx);
+	if (err < 0)
 		goto err;
 
 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
@@ -683,6 +701,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (err)
 		goto err_cleanup;
 
+	/* Release the ring_lock mutex now that all setup is complete. */
+	mutex_unlock(&ctx->ring_lock);
+
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
@@ -692,6 +713,7 @@ err_cleanup:
 err_ctx:
 	aio_free_ring(ctx);
 err:
+	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
 	free_percpu(ctx->reqs.pcpu_count);
 	free_percpu(ctx->users.pcpu_count);
@@ -705,7 +727,8 @@ err:
  * when the processes owning a context have all exited to encourage
  * the rapid destruction of the kioctx.
  */
-static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+		struct completion *requests_done)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
 		struct kioctx_table *table;
@@ -734,7 +757,11 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
+		ctx->requests_done = requests_done;
 		percpu_ref_kill(&ctx->users);
+	} else {
+		if (requests_done)
+			complete(requests_done);
 	}
 }
 
@@ -796,7 +823,7 @@ void exit_aio(struct mm_struct *mm)
 		 */
 		ctx->mmap_size = 0;
 
-		kill_ioctx(mm, ctx);
+		kill_ioctx(mm, ctx, NULL);
 	}
 }
 
@@ -1024,6 +1051,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	mutex_lock(&ctx->ring_lock);
 
+	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
 	tail = ring->tail;
@@ -1171,7 +1199,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 	if (!IS_ERR(ioctx)) {
 		ret = put_user(ioctx->user_id, ctxp);
 		if (ret)
-			kill_ioctx(current->mm, ioctx);
+			kill_ioctx(current->mm, ioctx, NULL);
 		percpu_ref_put(&ioctx->users);
 	}
 
@@ -1189,8 +1217,22 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1189{ 1217{
1190 struct kioctx *ioctx = lookup_ioctx(ctx); 1218 struct kioctx *ioctx = lookup_ioctx(ctx);
1191 if (likely(NULL != ioctx)) { 1219 if (likely(NULL != ioctx)) {
1192 kill_ioctx(current->mm, ioctx); 1220 struct completion requests_done =
1221 COMPLETION_INITIALIZER_ONSTACK(requests_done);
1222
1223 /* Pass requests_done to kill_ioctx() where it can be set
1224 * in a thread-safe way. If we try to set it here then we have
1225	 * a race condition if two io_destroy() calls run simultaneously.
1226 */
1227 kill_ioctx(current->mm, ioctx, &requests_done);
1193 percpu_ref_put(&ioctx->users); 1228 percpu_ref_put(&ioctx->users);
1229
1230	/* Wait until all IO for the context is done. Otherwise the kernel
1231	 * keeps using user-space buffers even if the user thinks the context
1232	 * is destroyed.
1233 */
1234 wait_for_completion(&requests_done);
1235
1194 return 0; 1236 return 0;
1195 } 1237 }
1196 pr_debug("EINVAL: io_destroy: invalid context id\n"); 1238 pr_debug("EINVAL: io_destroy: invalid context id\n");
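
io_destroy() above declares the completion on its own stack, hands it to kill_ioctx(), and blocks until the last in-flight request fires it, so the kernel stops touching user buffers before the syscall returns. A userspace analogue built on pthreads (the completion type is hand-rolled here; the kernel's <linux/completion.h> differs in detail):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

#define COMPLETION_INIT \
	{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 }

static void complete(struct completion *x)
{
	pthread_mutex_lock(&x->lock);
	x->done = 1;
	pthread_cond_signal(&x->cond);
	pthread_mutex_unlock(&x->lock);
}

static void wait_for_completion(struct completion *x)
{
	pthread_mutex_lock(&x->lock);
	while (!x->done)
		pthread_cond_wait(&x->cond, &x->lock);
	pthread_mutex_unlock(&x->lock);
}

static void *last_request(void *arg)
{
	usleep(1000);		/* stand-in for in-flight aio finishing */
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion requests_done = COMPLETION_INIT;
	pthread_t t;

	pthread_create(&t, NULL, last_request, &requests_done);
	wait_for_completion(&requests_done);	/* io_destroy() blocks here */
	puts("all requests drained; safe to reuse user buffers");
	pthread_join(t, NULL);
	return 0;
}
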
@@ -1285,10 +1327,8 @@ rw_common:
1285 &iovec, compat) 1327 &iovec, compat)
1286 : aio_setup_single_vector(req, rw, buf, &nr_segs, 1328 : aio_setup_single_vector(req, rw, buf, &nr_segs,
1287 iovec); 1329 iovec);
1288 if (ret) 1330 if (!ret)
1289 return ret; 1331 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
1290
1291 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
1292 if (ret < 0) { 1332 if (ret < 0) {
1293 if (iovec != &inline_vec) 1333 if (iovec != &inline_vec)
1294 kfree(iovec); 1334 kfree(iovec);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 3182c0e68b42..232e03d4780d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -103,6 +103,9 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
103 if (tmp.size < sizeof(tmp)) 103 if (tmp.size < sizeof(tmp))
104 return ERR_PTR(-EINVAL); 104 return ERR_PTR(-EINVAL);
105 105
106 if (tmp.size > (PATH_MAX + sizeof(tmp)))
107 return ERR_PTR(-ENAMETOOLONG);
108
106 return memdup_user(in, tmp.size); 109 return memdup_user(in, tmp.size);
107} 110}
108 111
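
The new check bounds the caller-supplied size to the fixed header plus PATH_MAX before memdup_user() trusts it, turning an unbounded allocation request into -ENAMETOOLONG. A standalone sketch of the same validate-then-copy pattern (the header struct and the 4096 PATH_MAX stand-in are assumptions):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

#define PATH_MAX_STANDIN 4096

struct ioctl_hdr { unsigned int size; /* ... */ };

static void *copy_ioctl(const void *user_buf, unsigned int size)
{
	void *p;

	if (size < sizeof(struct ioctl_hdr)) {
		errno = EINVAL;		/* too small to hold the header */
		return NULL;
	}
	if (size > PATH_MAX_STANDIN + sizeof(struct ioctl_hdr)) {
		errno = ENAMETOOLONG;	/* claims a path longer than allowed */
		return NULL;
	}
	p = malloc(size);		/* memdup_user() analogue */
	if (p)
		memcpy(p, user_buf, size);
	return p;
}

int main(void)
{
	struct ioctl_hdr hdr = { .size = sizeof(hdr) };
	void *p = copy_ioctl(&hdr, hdr.size);

	free(p);
	return p ? 0 : 1;
}
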
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2caf36ac3e93..cc87c1abac97 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
179 spin_lock(&active->d_lock); 179 spin_lock(&active->d_lock);
180 180
181 /* Already gone? */ 181 /* Already gone? */
182 if (!d_count(active)) 182 if ((int) d_count(active) <= 0)
183 goto next; 183 goto next;
184 184
185 qstr = &active->d_name; 185 qstr = &active->d_name;
@@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
230 230
231 spin_lock(&expiring->d_lock); 231 spin_lock(&expiring->d_lock);
232 232
233 /* Bad luck, we've already been dentry_iput */ 233 /* We've already been dentry_iput or unlinked */
234 if (!expiring->d_inode) 234 if (!expiring->d_inode)
235 goto next; 235 goto next;
236 236
diff --git a/fs/befs/Makefile b/fs/befs/Makefile
index 2f370bd7a50d..8b9f66642a83 100644
--- a/fs/befs/Makefile
+++ b/fs/befs/Makefile
@@ -3,5 +3,5 @@
3# 3#
4 4
5obj-$(CONFIG_BEFS_FS) += befs.o 5obj-$(CONFIG_BEFS_FS) += befs.o
6 6ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG
7befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o 7befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index b26642839156..3a7813ab8c95 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -88,8 +88,11 @@ enum befs_err {
88 88
89/****************************/ 89/****************************/
90/* debug.c */ 90/* debug.c */
91__printf(2, 3)
91void befs_error(const struct super_block *sb, const char *fmt, ...); 92void befs_error(const struct super_block *sb, const char *fmt, ...);
93__printf(2, 3)
92void befs_warning(const struct super_block *sb, const char *fmt, ...); 94void befs_warning(const struct super_block *sb, const char *fmt, ...);
95__printf(2, 3)
93void befs_debug(const struct super_block *sb, const char *fmt, ...); 96void befs_debug(const struct super_block *sb, const char *fmt, ...);
94 97
95void befs_dump_super_block(const struct super_block *sb, befs_super_block *); 98void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
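
__printf(a, b) is the kernel's wrapper around GCC's format attribute; with it on the prototypes above, every befs_error()/befs_warning()/befs_debug() call site has its varargs type-checked against the format string, which is what surfaces the %Lu/%u mismatches fixed throughout the befs hunks below. A compilable sketch with the attribute spelled out locally (the demo function and local macro are assumptions, not the kernel definitions):

#include <stdarg.h>
#include <stdio.h>

#define __printf(a, b) __attribute__((format(printf, a, b)))

struct super_block;	/* opaque here, just to mirror the prototype */

__printf(2, 3)
static void befs_error_demo(const struct super_block *sb,
			    const char *fmt, ...)
{
	va_list args;

	(void)sb;
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}

int main(void)
{
	/* befs_error_demo(NULL, "block %llu\n", 1) would now warn:
	 * int passed where %llu expects unsigned long long. */
	befs_error_demo(NULL, "block %llu\n", 1ULL);
	return 0;
}
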
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 74e397db0b8b..a2cd305a993a 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -137,7 +137,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
137 struct buffer_head *bh = NULL; 137 struct buffer_head *bh = NULL;
138 befs_disk_btree_super *od_sup = NULL; 138 befs_disk_btree_super *od_sup = NULL;
139 139
140 befs_debug(sb, "---> befs_btree_read_super()"); 140 befs_debug(sb, "---> %s", __func__);
141 141
142 bh = befs_read_datastream(sb, ds, 0, NULL); 142 bh = befs_read_datastream(sb, ds, 0, NULL);
143 143
@@ -162,11 +162,11 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
162 goto error; 162 goto error;
163 } 163 }
164 164
165 befs_debug(sb, "<--- befs_btree_read_super()"); 165 befs_debug(sb, "<--- %s", __func__);
166 return BEFS_OK; 166 return BEFS_OK;
167 167
168 error: 168 error:
169 befs_debug(sb, "<--- befs_btree_read_super() ERROR"); 169 befs_debug(sb, "<--- %s ERROR", __func__);
170 return BEFS_ERR; 170 return BEFS_ERR;
171} 171}
172 172
@@ -195,16 +195,16 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
195{ 195{
196 uint off = 0; 196 uint off = 0;
197 197
198 befs_debug(sb, "---> befs_bt_read_node()"); 198 befs_debug(sb, "---> %s", __func__);
199 199
200 if (node->bh) 200 if (node->bh)
201 brelse(node->bh); 201 brelse(node->bh);
202 202
203 node->bh = befs_read_datastream(sb, ds, node_off, &off); 203 node->bh = befs_read_datastream(sb, ds, node_off, &off);
204 if (!node->bh) { 204 if (!node->bh) {
205 befs_error(sb, "befs_bt_read_node() failed to read " 205 befs_error(sb, "%s failed to read "
206 "node at %Lu", node_off); 206 "node at %llu", __func__, node_off);
207 befs_debug(sb, "<--- befs_bt_read_node() ERROR"); 207 befs_debug(sb, "<--- %s ERROR", __func__);
208 208
209 return BEFS_ERR; 209 return BEFS_ERR;
210 } 210 }
@@ -221,7 +221,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
221 node->head.all_key_length = 221 node->head.all_key_length =
222 fs16_to_cpu(sb, node->od_node->all_key_length); 222 fs16_to_cpu(sb, node->od_node->all_key_length);
223 223
224 befs_debug(sb, "<--- befs_btree_read_node()"); 224 befs_debug(sb, "<--- %s", __func__);
225 return BEFS_OK; 225 return BEFS_OK;
226} 226}
227 227
@@ -252,7 +252,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
252 befs_off_t node_off; 252 befs_off_t node_off;
253 int res; 253 int res;
254 254
255 befs_debug(sb, "---> befs_btree_find() Key: %s", key); 255 befs_debug(sb, "---> %s Key: %s", __func__, key);
256 256
257 if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { 257 if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
258 befs_error(sb, 258 befs_error(sb,
@@ -263,7 +263,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
263 this_node = kmalloc(sizeof (befs_btree_node), 263 this_node = kmalloc(sizeof (befs_btree_node),
264 GFP_NOFS); 264 GFP_NOFS);
265 if (!this_node) { 265 if (!this_node) {
266 befs_error(sb, "befs_btree_find() failed to allocate %u " 266 befs_error(sb, "befs_btree_find() failed to allocate %zu "
267 "bytes of memory", sizeof (befs_btree_node)); 267 "bytes of memory", sizeof (befs_btree_node));
268 goto error; 268 goto error;
269 } 269 }
@@ -274,7 +274,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
274 node_off = bt_super.root_node_ptr; 274 node_off = bt_super.root_node_ptr;
275 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { 275 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
276 befs_error(sb, "befs_btree_find() failed to read " 276 befs_error(sb, "befs_btree_find() failed to read "
277 "node at %Lu", node_off); 277 "node at %llu", node_off);
278 goto error_alloc; 278 goto error_alloc;
279 } 279 }
280 280
@@ -285,7 +285,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
285 /* if no match, go to overflow node */ 285 /* if no match, go to overflow node */
286 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { 286 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
287 befs_error(sb, "befs_btree_find() failed to read " 287 befs_error(sb, "befs_btree_find() failed to read "
288 "node at %Lu", node_off); 288 "node at %llu", node_off);
289 goto error_alloc; 289 goto error_alloc;
290 } 290 }
291 } 291 }
@@ -298,11 +298,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
298 kfree(this_node); 298 kfree(this_node);
299 299
300 if (res != BEFS_BT_MATCH) { 300 if (res != BEFS_BT_MATCH) {
301 befs_debug(sb, "<--- befs_btree_find() Key %s not found", key); 301 befs_debug(sb, "<--- %s Key %s not found", __func__, key);
302 *value = 0; 302 *value = 0;
303 return BEFS_BT_NOT_FOUND; 303 return BEFS_BT_NOT_FOUND;
304 } 304 }
305 befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu", 305 befs_debug(sb, "<--- %s Found key %s, value %llu", __func__,
306 key, *value); 306 key, *value);
307 return BEFS_OK; 307 return BEFS_OK;
308 308
@@ -310,7 +310,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
310 kfree(this_node); 310 kfree(this_node);
311 error: 311 error:
312 *value = 0; 312 *value = 0;
313 befs_debug(sb, "<--- befs_btree_find() ERROR"); 313 befs_debug(sb, "<--- %s ERROR", __func__);
314 return BEFS_ERR; 314 return BEFS_ERR;
315} 315}
316 316
@@ -343,7 +343,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
343 char *thiskey; 343 char *thiskey;
344 fs64 *valarray; 344 fs64 *valarray;
345 345
346 befs_debug(sb, "---> befs_find_key() %s", findkey); 346 befs_debug(sb, "---> %s %s", __func__, findkey);
347 347
348 *value = 0; 348 *value = 0;
349 349
@@ -355,7 +355,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
355 355
356 eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); 356 eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len);
357 if (eq < 0) { 357 if (eq < 0) {
358 befs_debug(sb, "<--- befs_find_key() %s not found", findkey); 358 befs_debug(sb, "<--- %s %s not found", __func__, findkey);
359 return BEFS_BT_NOT_FOUND; 359 return BEFS_BT_NOT_FOUND;
360 } 360 }
361 361
@@ -373,8 +373,8 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
373 findkey_len); 373 findkey_len);
374 374
375 if (eq == 0) { 375 if (eq == 0) {
376 befs_debug(sb, "<--- befs_find_key() found %s at %d", 376 befs_debug(sb, "<--- %s found %s at %d",
377 thiskey, mid); 377 __func__, thiskey, mid);
378 378
379 *value = fs64_to_cpu(sb, valarray[mid]); 379 *value = fs64_to_cpu(sb, valarray[mid]);
380 return BEFS_BT_MATCH; 380 return BEFS_BT_MATCH;
@@ -388,7 +388,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
388 *value = fs64_to_cpu(sb, valarray[mid + 1]); 388 *value = fs64_to_cpu(sb, valarray[mid + 1]);
389 else 389 else
390 *value = fs64_to_cpu(sb, valarray[mid]); 390 *value = fs64_to_cpu(sb, valarray[mid]);
391 befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid); 391 befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid);
392 return BEFS_BT_PARMATCH; 392 return BEFS_BT_PARMATCH;
393} 393}
394 394
@@ -428,7 +428,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
428 428
429 uint key_sum = 0; 429 uint key_sum = 0;
430 430
431 befs_debug(sb, "---> befs_btree_read()"); 431 befs_debug(sb, "---> %s", __func__);
432 432
433 if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { 433 if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
434 befs_error(sb, 434 befs_error(sb,
@@ -437,7 +437,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
437 } 437 }
438 438
439 if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { 439 if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
440 befs_error(sb, "befs_btree_read() failed to allocate %u " 440 befs_error(sb, "befs_btree_read() failed to allocate %zu "
441 "bytes of memory", sizeof (befs_btree_node)); 441 "bytes of memory", sizeof (befs_btree_node));
442 goto error; 442 goto error;
443 } 443 }
@@ -452,7 +452,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
452 kfree(this_node); 452 kfree(this_node);
453 *value = 0; 453 *value = 0;
454 *keysize = 0; 454 *keysize = 0;
455 befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY"); 455 befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
456 return BEFS_BT_EMPTY; 456 return BEFS_BT_EMPTY;
457 } else if (res == BEFS_ERR) { 457 } else if (res == BEFS_ERR) {
458 goto error_alloc; 458 goto error_alloc;
@@ -467,7 +467,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
467 *keysize = 0; 467 *keysize = 0;
468 *value = 0; 468 *value = 0;
469 befs_debug(sb, 469 befs_debug(sb,
470 "<--- befs_btree_read() END of keys at %Lu", 470 "<--- %s END of keys at %llu", __func__,
471 (unsigned long long)
471 key_sum + this_node->head.all_key_count); 472 key_sum + this_node->head.all_key_count);
472 brelse(this_node->bh); 473 brelse(this_node->bh);
473 kfree(this_node); 474 kfree(this_node);
@@ -478,8 +479,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
478 node_off = this_node->head.right; 479 node_off = this_node->head.right;
479 480
480 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { 481 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
481 befs_error(sb, "befs_btree_read() failed to read " 482 befs_error(sb, "%s failed to read node at %llu",
482 "node at %Lu", node_off); 483 __func__, (unsigned long long)node_off);
483 goto error_alloc; 484 goto error_alloc;
484 } 485 }
485 } 486 }
@@ -492,11 +493,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
492 493
493 keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen); 494 keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen);
494 495
495 befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen); 496 befs_debug(sb, "Read [%llu,%d]: keysize %d",
497 (long long unsigned int)node_off, (int)cur_key,
498 (int)keylen);
496 499
497 if (bufsize < keylen + 1) { 500 if (bufsize < keylen + 1) {
498 befs_error(sb, "befs_btree_read() keybuf too small (%u) " 501 befs_error(sb, "%s keybuf too small (%zu) "
499 "for key of size %d", bufsize, keylen); 502 "for key of size %d", __func__, bufsize, keylen);
500 brelse(this_node->bh); 503 brelse(this_node->bh);
501 goto error_alloc; 504 goto error_alloc;
502 }; 505 };
@@ -506,13 +509,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
506 *keysize = keylen; 509 *keysize = keylen;
507 keybuf[keylen] = '\0'; 510 keybuf[keylen] = '\0';
508 511
509 befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off, 512 befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off,
510 cur_key, keylen, keybuf, *value); 513 cur_key, keylen, keybuf, *value);
511 514
512 brelse(this_node->bh); 515 brelse(this_node->bh);
513 kfree(this_node); 516 kfree(this_node);
514 517
515 befs_debug(sb, "<--- befs_btree_read()"); 518 befs_debug(sb, "<--- %s", __func__);
516 519
517 return BEFS_OK; 520 return BEFS_OK;
518 521
@@ -522,7 +525,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
522 error: 525 error:
523 *keysize = 0; 526 *keysize = 0;
524 *value = 0; 527 *value = 0;
525 befs_debug(sb, "<--- befs_btree_read() ERROR"); 528 befs_debug(sb, "<--- %s ERROR", __func__);
526 return BEFS_ERR; 529 return BEFS_ERR;
527} 530}
528 531
@@ -547,26 +550,26 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
547 befs_off_t * node_off) 550 befs_off_t * node_off)
548{ 551{
549 552
550 befs_debug(sb, "---> befs_btree_seekleaf()"); 553 befs_debug(sb, "---> %s", __func__);
551 554
552 if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { 555 if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
553 befs_error(sb, "befs_btree_seekleaf() failed to read " 556 befs_error(sb, "%s failed to read "
554 "node at %Lu", *node_off); 557 "node at %llu", __func__, *node_off);
555 goto error; 558 goto error;
556 } 559 }
557 befs_debug(sb, "Seekleaf to root node %Lu", *node_off); 560 befs_debug(sb, "Seekleaf to root node %llu", *node_off);
558 561
559 if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) { 562 if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) {
560 befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY"); 563 befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
561 return BEFS_BT_EMPTY; 564 return BEFS_BT_EMPTY;
562 } 565 }
563 566
564 while (!befs_leafnode(this_node)) { 567 while (!befs_leafnode(this_node)) {
565 568
566 if (this_node->head.all_key_count == 0) { 569 if (this_node->head.all_key_count == 0) {
567 befs_debug(sb, "befs_btree_seekleaf() encountered " 570 befs_debug(sb, "%s encountered "
568 "an empty interior node: %Lu. Using Overflow " 571 "an empty interior node: %llu. Using Overflow "
569 "node: %Lu", *node_off, 572 "node: %llu", __func__, *node_off,
570 this_node->head.overflow); 573 this_node->head.overflow);
571 *node_off = this_node->head.overflow; 574 *node_off = this_node->head.overflow;
572 } else { 575 } else {
@@ -574,19 +577,19 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
574 *node_off = fs64_to_cpu(sb, valarray[0]); 577 *node_off = fs64_to_cpu(sb, valarray[0]);
575 } 578 }
576 if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { 579 if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
577 befs_error(sb, "befs_btree_seekleaf() failed to read " 580 befs_error(sb, "%s failed to read "
578 "node at %Lu", *node_off); 581 "node at %llu", __func__, *node_off);
579 goto error; 582 goto error;
580 } 583 }
581 584
582 befs_debug(sb, "Seekleaf to child node %Lu", *node_off); 585 befs_debug(sb, "Seekleaf to child node %llu", *node_off);
583 } 586 }
584 befs_debug(sb, "Node %Lu is a leaf node", *node_off); 587 befs_debug(sb, "Node %llu is a leaf node", *node_off);
585 588
586 return BEFS_OK; 589 return BEFS_OK;
587 590
588 error: 591 error:
589 befs_debug(sb, "<--- befs_btree_seekleaf() ERROR"); 592 befs_debug(sb, "<--- %s ERROR", __func__);
590 return BEFS_ERR; 593 return BEFS_ERR;
591} 594}
592 595
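
The btree.c hunks replace hard-coded function names in the trace strings with __func__, so the messages can never drift from the function they live in after a rename. The pattern in isolation (the trace macros are illustrative, not befs code):

#include <stdio.h>

#define trace_enter()	printf("---> %s\n", __func__)
#define trace_exit()	printf("<--- %s\n", __func__)

static void btree_find_demo(void)
{
	trace_enter();
	/* ... lookup work would happen here ... */
	trace_exit();
}

int main(void)
{
	btree_find_demo();
	return 0;
}
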
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index 59096b5e0fc7..c467bebd50af 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -52,26 +52,25 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds,
52 befs_block_run run; 52 befs_block_run run;
53	befs_blocknr_t block;	/* block corresponding to pos */	53	befs_blocknr_t block;	/* block corresponding to pos */
54 54
55 befs_debug(sb, "---> befs_read_datastream() %Lu", pos); 55 befs_debug(sb, "---> %s %llu", __func__, pos);
56 block = pos >> BEFS_SB(sb)->block_shift; 56 block = pos >> BEFS_SB(sb)->block_shift;
57 if (off) 57 if (off)
58 *off = pos - (block << BEFS_SB(sb)->block_shift); 58 *off = pos - (block << BEFS_SB(sb)->block_shift);
59 59
60 if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) { 60 if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) {
61 befs_error(sb, "BeFS: Error finding disk addr of block %lu", 61 befs_error(sb, "BeFS: Error finding disk addr of block %lu",
62 block); 62 (unsigned long)block);
63 befs_debug(sb, "<--- befs_read_datastream() ERROR"); 63 befs_debug(sb, "<--- %s ERROR", __func__);
64 return NULL; 64 return NULL;
65 } 65 }
66 bh = befs_bread_iaddr(sb, run); 66 bh = befs_bread_iaddr(sb, run);
67 if (!bh) { 67 if (!bh) {
68 befs_error(sb, "BeFS: Error reading block %lu from datastream", 68 befs_error(sb, "BeFS: Error reading block %lu from datastream",
69 block); 69 (unsigned long)block);
70 return NULL; 70 return NULL;
71 } 71 }
72 72
73 befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu", 73 befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos);
74 pos);
75 74
76 return bh; 75 return bh;
77} 76}
@@ -106,7 +105,8 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
106 } else { 105 } else {
107 befs_error(sb, 106 befs_error(sb,
108 "befs_fblock2brun() was asked to find block %lu, " 107 "befs_fblock2brun() was asked to find block %lu, "
109 "which is not mapped by the datastream\n", fblock); 108 "which is not mapped by the datastream\n",
109 (unsigned long)fblock);
110 err = BEFS_ERR; 110 err = BEFS_ERR;
111 } 111 }
112 return err; 112 return err;
@@ -128,14 +128,14 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
128	befs_off_t bytes_read = 0;	/* bytes read */	128	befs_off_t bytes_read = 0;	/* bytes read */
129 u16 plen; 129 u16 plen;
130 struct buffer_head *bh = NULL; 130 struct buffer_head *bh = NULL;
131 befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len); 131 befs_debug(sb, "---> %s length: %llu", __func__, len);
132 132
133 while (bytes_read < len) { 133 while (bytes_read < len) {
134 bh = befs_read_datastream(sb, ds, bytes_read, NULL); 134 bh = befs_read_datastream(sb, ds, bytes_read, NULL);
135 if (!bh) { 135 if (!bh) {
136 befs_error(sb, "BeFS: Error reading datastream block " 136 befs_error(sb, "BeFS: Error reading datastream block "
137 "starting from %Lu", bytes_read); 137 "starting from %llu", bytes_read);
138 befs_debug(sb, "<--- befs_read_lsymlink() ERROR"); 138 befs_debug(sb, "<--- %s ERROR", __func__);
139 return bytes_read; 139 return bytes_read;
140 140
141 } 141 }
@@ -146,7 +146,8 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
146 bytes_read += plen; 146 bytes_read += plen;
147 } 147 }
148 148
149 befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read); 149 befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int)
150 bytes_read);
150 return bytes_read; 151 return bytes_read;
151} 152}
152 153
@@ -169,7 +170,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
169 befs_blocknr_t metablocks; /* FS metadata blocks */ 170 befs_blocknr_t metablocks; /* FS metadata blocks */
170 befs_sb_info *befs_sb = BEFS_SB(sb); 171 befs_sb_info *befs_sb = BEFS_SB(sb);
171 172
172 befs_debug(sb, "---> befs_count_blocks()"); 173 befs_debug(sb, "---> %s", __func__);
173 174
174 datablocks = ds->size >> befs_sb->block_shift; 175 datablocks = ds->size >> befs_sb->block_shift;
175 if (ds->size & (befs_sb->block_size - 1)) 176 if (ds->size & (befs_sb->block_size - 1))
@@ -206,7 +207,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
206 } 207 }
207 208
208 blocks = datablocks + metablocks; 209 blocks = datablocks + metablocks;
209 befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks); 210 befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks);
210 211
211 return blocks; 212 return blocks;
212} 213}
@@ -251,11 +252,11 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
251 befs_blocknr_t max_block = 252 befs_blocknr_t max_block =
252 data->max_direct_range >> BEFS_SB(sb)->block_shift; 253 data->max_direct_range >> BEFS_SB(sb)->block_shift;
253 254
254 befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno); 255 befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
255 256
256 if (blockno > max_block) { 257 if (blockno > max_block) {
257 befs_error(sb, "befs_find_brun_direct() passed block outside of" 258 befs_error(sb, "%s passed block outside of direct region",
258 "direct region"); 259 __func__);
259 return BEFS_ERR; 260 return BEFS_ERR;
260 } 261 }
261 262
@@ -267,13 +268,14 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
267 run->start = array[i].start + offset; 268 run->start = array[i].start + offset;
268 run->len = array[i].len - offset; 269 run->len = array[i].len - offset;
269 270
270 befs_debug(sb, "---> befs_find_brun_direct(), " 271 befs_debug(sb, "---> %s, "
271 "found %lu at direct[%d]", blockno, i); 272 "found %lu at direct[%d]", __func__,
273 (unsigned long)blockno, i);
272 return BEFS_OK; 274 return BEFS_OK;
273 } 275 }
274 } 276 }
275 277
276 befs_debug(sb, "---> befs_find_brun_direct() ERROR"); 278 befs_debug(sb, "---> %s ERROR", __func__);
277 return BEFS_ERR; 279 return BEFS_ERR;
278} 280}
279 281
@@ -316,7 +318,7 @@ befs_find_brun_indirect(struct super_block *sb,
316 befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect); 318 befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
317 int arraylen = befs_iaddrs_per_block(sb); 319 int arraylen = befs_iaddrs_per_block(sb);
318 320
319 befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno); 321 befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
320 322
321 indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift; 323 indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift;
322 search_blk = blockno - indir_start_blk; 324 search_blk = blockno - indir_start_blk;
@@ -325,10 +327,9 @@ befs_find_brun_indirect(struct super_block *sb,
325 for (i = 0; i < indirect.len; i++) { 327 for (i = 0; i < indirect.len; i++) {
326 indirblock = befs_bread(sb, indirblockno + i); 328 indirblock = befs_bread(sb, indirblockno + i);
327 if (indirblock == NULL) { 329 if (indirblock == NULL) {
328 befs_debug(sb, 330 befs_debug(sb, "---> %s failed to read "
329 "---> befs_find_brun_indirect() failed to " 331 "disk block %lu from the indirect brun",
330 "read disk block %lu from the indirect brun", 332 __func__, (unsigned long)indirblockno + i);
331 indirblockno + i);
332 return BEFS_ERR; 333 return BEFS_ERR;
333 } 334 }
334 335
@@ -348,9 +349,10 @@ befs_find_brun_indirect(struct super_block *sb,
348 349
349 brelse(indirblock); 350 brelse(indirblock);
350 befs_debug(sb, 351 befs_debug(sb,
351 "<--- befs_find_brun_indirect() found " 352 "<--- %s found file block "
352 "file block %lu at indirect[%d]", 353 "%lu at indirect[%d]", __func__,
353 blockno, j + (i * arraylen)); 354 (unsigned long)blockno,
355 j + (i * arraylen));
354 return BEFS_OK; 356 return BEFS_OK;
355 } 357 }
356 sum += len; 358 sum += len;
@@ -360,10 +362,10 @@ befs_find_brun_indirect(struct super_block *sb,
360 } 362 }
361 363
362 /* Only fallthrough is an error */ 364 /* Only fallthrough is an error */
363 befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find " 365 befs_error(sb, "BeFS: %s failed to find "
364 "file block %lu", blockno); 366 "file block %lu", __func__, (unsigned long)blockno);
365 367
366 befs_debug(sb, "<--- befs_find_brun_indirect() ERROR"); 368 befs_debug(sb, "<--- %s ERROR", __func__);
367 return BEFS_ERR; 369 return BEFS_ERR;
368} 370}
369 371
@@ -444,7 +446,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
444 size_t diblklen = iblklen * befs_iaddrs_per_block(sb) 446 size_t diblklen = iblklen * befs_iaddrs_per_block(sb)
445 * BEFS_DBLINDIR_BRUN_LEN; 447 * BEFS_DBLINDIR_BRUN_LEN;
446 448
447 befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno); 449 befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno);
448 450
449 /* First, discover which of the double_indir->indir blocks 451 /* First, discover which of the double_indir->indir blocks
450 * contains pos. Then figure out how much of pos that 452 * contains pos. Then figure out how much of pos that
@@ -460,8 +462,9 @@ befs_find_brun_dblindirect(struct super_block *sb,
460 dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb); 462 dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb);
461 if (dbl_which_block > data->double_indirect.len) { 463 if (dbl_which_block > data->double_indirect.len) {
462 befs_error(sb, "The double-indirect index calculated by " 464 befs_error(sb, "The double-indirect index calculated by "
463 "befs_read_brun_dblindirect(), %d, is outside the range " 465 "%s, %d, is outside the range "
464 "of the double-indirect block", dblindir_indx); 466 "of the double-indirect block", __func__,
467 dblindir_indx);
465 return BEFS_ERR; 468 return BEFS_ERR;
466 } 469 }
467 470
@@ -469,10 +472,10 @@ befs_find_brun_dblindirect(struct super_block *sb,
469 befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + 472 befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) +
470 dbl_which_block); 473 dbl_which_block);
471 if (dbl_indir_block == NULL) { 474 if (dbl_indir_block == NULL) {
472 befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " 475 befs_error(sb, "%s couldn't read the "
473 "double-indirect block at blockno %lu", 476 "double-indirect block at blockno %lu", __func__,
474 iaddr2blockno(sb, 477 (unsigned long)
475 &data->double_indirect) + 478 iaddr2blockno(sb, &data->double_indirect) +
476 dbl_which_block); 479 dbl_which_block);
477 brelse(dbl_indir_block); 480 brelse(dbl_indir_block);
478 return BEFS_ERR; 481 return BEFS_ERR;
@@ -489,16 +492,16 @@ befs_find_brun_dblindirect(struct super_block *sb,
489 which_block = indir_indx / befs_iaddrs_per_block(sb); 492 which_block = indir_indx / befs_iaddrs_per_block(sb);
490 if (which_block > indir_run.len) { 493 if (which_block > indir_run.len) {
491 befs_error(sb, "The indirect index calculated by " 494 befs_error(sb, "The indirect index calculated by "
492 "befs_read_brun_dblindirect(), %d, is outside the range " 495 "%s, %d, is outside the range "
493 "of the indirect block", indir_indx); 496 "of the indirect block", __func__, indir_indx);
494 return BEFS_ERR; 497 return BEFS_ERR;
495 } 498 }
496 499
497 indir_block = 500 indir_block =
498 befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); 501 befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block);
499 if (indir_block == NULL) { 502 if (indir_block == NULL) {
500 befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " 503 befs_error(sb, "%s couldn't read the indirect block "
501 "indirect block at blockno %lu", 504 "at blockno %lu", __func__, (unsigned long)
502 iaddr2blockno(sb, &indir_run) + which_block); 505 iaddr2blockno(sb, &indir_run) + which_block);
503 brelse(indir_block); 506 brelse(indir_block);
504 return BEFS_ERR; 507 return BEFS_ERR;
@@ -519,7 +522,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
519 run->len -= offset; 522 run->len -= offset;
520 523
521 befs_debug(sb, "Found file block %lu in double_indirect[%d][%d]," 524 befs_debug(sb, "Found file block %lu in double_indirect[%d][%d],"
522 " double_indirect_leftover = %lu", 525 " double_indirect_leftover = %lu", (unsigned long)
523 blockno, dblindir_indx, indir_indx, dblindir_leftover); 526 blockno, dblindir_indx, indir_indx, dblindir_leftover);
524 527
525 return BEFS_OK; 528 return BEFS_OK;
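
datastream.c adds explicit casts because befs_blocknr_t and befs_off_t are config-dependent typedefs: each value is widened to the standard type that matches its format specifier, so the pairing stays valid even if the typedef changes width. A self-contained illustration (the typedef widths chosen here are assumptions):

#include <stdio.h>

typedef unsigned long long befs_off_t;	/* assumed 64-bit */
typedef unsigned long befs_blocknr_t;	/* assumed long-sized */

int main(void)
{
	befs_off_t pos = 123456789ULL;
	befs_blocknr_t block = 42;

	/* %llu pairs with unsigned long long, %lu with unsigned long;
	 * the casts keep that true regardless of the typedefs. */
	printf("pos %llu block %lu\n",
	       (unsigned long long)pos, (unsigned long)block);
	return 0;
}
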
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 622e73775c83..4de7cffcd662 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -10,6 +10,7 @@
10 * debug functions 10 * debug functions
11 */ 11 */
12 12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#ifdef __KERNEL__ 14#ifdef __KERNEL__
14 15
15#include <stdarg.h> 16#include <stdarg.h>
@@ -23,43 +24,30 @@
23 24
24#include "befs.h" 25#include "befs.h"
25 26
26#define ERRBUFSIZE 1024
27
28void 27void
29befs_error(const struct super_block *sb, const char *fmt, ...) 28befs_error(const struct super_block *sb, const char *fmt, ...)
30{ 29{
30 struct va_format vaf;
31 va_list args; 31 va_list args;
32 char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
33 if (err_buf == NULL) {
34 printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
35 return;
36 }
37 32
38 va_start(args, fmt); 33 va_start(args, fmt);
39 vsnprintf(err_buf, ERRBUFSIZE, fmt, args); 34 vaf.fmt = fmt;
35 vaf.va = &args;
36 pr_err("(%s): %pV\n", sb->s_id, &vaf);
40 va_end(args); 37 va_end(args);
41
42 printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf);
43 kfree(err_buf);
44} 38}
45 39
46void 40void
47befs_warning(const struct super_block *sb, const char *fmt, ...) 41befs_warning(const struct super_block *sb, const char *fmt, ...)
48{ 42{
43 struct va_format vaf;
49 va_list args; 44 va_list args;
50 char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
51 if (err_buf == NULL) {
52 printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
53 return;
54 }
55 45
56 va_start(args, fmt); 46 va_start(args, fmt);
57 vsnprintf(err_buf, ERRBUFSIZE, fmt, args); 47 vaf.fmt = fmt;
48 vaf.va = &args;
49 pr_warn("(%s): %pV\n", sb->s_id, &vaf);
58 va_end(args); 50 va_end(args);
59
60 printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf);
61
62 kfree(err_buf);
63} 51}
64 52
65void 53void
@@ -67,25 +55,13 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
67{ 55{
68#ifdef CONFIG_BEFS_DEBUG 56#ifdef CONFIG_BEFS_DEBUG
69 57
58 struct va_format vaf;
70 va_list args; 59 va_list args;
71 char *err_buf = NULL; 60 va_start(args, fmt);
72 61 vaf.fmt = fmt;
73 if (BEFS_SB(sb)->mount_opts.debug) { 62 vaf.va = &args;
74 err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); 63 pr_debug("(%s): %pV\n", sb->s_id, &vaf);
75 if (err_buf == NULL) { 64 va_end(args);
76 printk(KERN_ERR "could not allocate %d bytes\n",
77 ERRBUFSIZE);
78 return;
79 }
80
81 va_start(args, fmt);
82 vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
83 va_end(args);
84
85 printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf);
86
87 kfree(err_buf);
88 }
89 65
90#endif //CONFIG_BEFS_DEBUG 66#endif //CONFIG_BEFS_DEBUG
91} 67}
@@ -109,9 +85,9 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
109 befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); 85 befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid));
110 befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); 86 befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode));
111 befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); 87 befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags));
112 befs_debug(sb, " create_time %Lu", 88 befs_debug(sb, " create_time %llu",
113 fs64_to_cpu(sb, inode->create_time)); 89 fs64_to_cpu(sb, inode->create_time));
114 befs_debug(sb, " last_modified_time %Lu", 90 befs_debug(sb, " last_modified_time %llu",
115 fs64_to_cpu(sb, inode->last_modified_time)); 91 fs64_to_cpu(sb, inode->last_modified_time));
116 92
117 tmp_run = fsrun_to_cpu(sb, inode->parent); 93 tmp_run = fsrun_to_cpu(sb, inode->parent);
@@ -137,7 +113,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
137 tmp_run.allocation_group, tmp_run.start, 113 tmp_run.allocation_group, tmp_run.start,
138 tmp_run.len); 114 tmp_run.len);
139 } 115 }
140 befs_debug(sb, " max_direct_range %Lu", 116 befs_debug(sb, " max_direct_range %llu",
141 fs64_to_cpu(sb, 117 fs64_to_cpu(sb,
142 inode->data.datastream. 118 inode->data.datastream.
143 max_direct_range)); 119 max_direct_range));
@@ -147,7 +123,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
147 tmp_run.allocation_group, 123 tmp_run.allocation_group,
148 tmp_run.start, tmp_run.len); 124 tmp_run.start, tmp_run.len);
149 125
150 befs_debug(sb, " max_indirect_range %Lu", 126 befs_debug(sb, " max_indirect_range %llu",
151 fs64_to_cpu(sb, 127 fs64_to_cpu(sb,
152 inode->data.datastream. 128 inode->data.datastream.
153 max_indirect_range)); 129 max_indirect_range));
@@ -158,12 +134,12 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
158 tmp_run.allocation_group, tmp_run.start, 134 tmp_run.allocation_group, tmp_run.start,
159 tmp_run.len); 135 tmp_run.len);
160 136
161 befs_debug(sb, " max_double_indirect_range %Lu", 137 befs_debug(sb, " max_double_indirect_range %llu",
162 fs64_to_cpu(sb, 138 fs64_to_cpu(sb,
163 inode->data.datastream. 139 inode->data.datastream.
164 max_double_indirect_range)); 140 max_double_indirect_range));
165 141
166 befs_debug(sb, " size %Lu", 142 befs_debug(sb, " size %llu",
167 fs64_to_cpu(sb, inode->data.datastream.size)); 143 fs64_to_cpu(sb, inode->data.datastream.size));
168 } 144 }
169 145
@@ -191,8 +167,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
191 befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); 167 befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size));
192 befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); 168 befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift));
193 169
194 befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks)); 170 befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks));
195 befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks)); 171 befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks));
196 172
197 befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); 173 befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2));
198 befs_debug(sb, " blocks_per_ag %u", 174 befs_debug(sb, " blocks_per_ag %u",
@@ -206,8 +182,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
206 befs_debug(sb, " log_blocks %u, %hu, %hu", 182 befs_debug(sb, " log_blocks %u, %hu, %hu",
207 tmp_run.allocation_group, tmp_run.start, tmp_run.len); 183 tmp_run.allocation_group, tmp_run.start, tmp_run.len);
208 184
209 befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start)); 185 befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start));
210 befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end)); 186 befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end));
211 187
212 befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); 188 befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3));
213 189
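
debug.c drops the ERRBUFSIZE kmalloc()/vsnprintf()/kfree() round trip in favour of struct va_format and printk's %pV, which formats the caller's va_list in place and cannot fail on allocation. %pV is a printk-only extension, so a userspace analogue forwards the va_list to vfprintf() instead (the helper name is hypothetical):

#include <stdarg.h>
#include <stdio.h>

static void fs_error(const char *sb_id, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* prefix, like pr_err("(%s): %pV\n", sb->s_id, &vaf) */
	fprintf(stderr, "befs (%s): ", sb_id);
	/* forward the va_list directly: no ERRBUFSIZE buffer needed */
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

int main(void)
{
	fs_error("sda1", "failed to read block %lu", 42UL);
	return 0;
}
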
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index 94c17f9a9576..fa4b718de597 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -25,7 +25,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
25 /* check magic header. */ 25 /* check magic header. */
26 if (magic1 != BEFS_INODE_MAGIC1) { 26 if (magic1 != BEFS_INODE_MAGIC1) {
27 befs_error(sb, 27 befs_error(sb,
28 "Inode has a bad magic header - inode = %lu", inode); 28 "Inode has a bad magic header - inode = %lu",
29 (unsigned long)inode);
29 return BEFS_BAD_INODE; 30 return BEFS_BAD_INODE;
30 } 31 }
31 32
@@ -34,8 +35,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
34 */ 35 */
35 if (inode != iaddr2blockno(sb, &ino_num)) { 36 if (inode != iaddr2blockno(sb, &ino_num)) {
36 befs_error(sb, "inode blocknr field disagrees with vfs " 37 befs_error(sb, "inode blocknr field disagrees with vfs "
37 "VFS: %lu, Inode %lu", 38 "VFS: %lu, Inode %lu", (unsigned long)
38 inode, iaddr2blockno(sb, &ino_num)); 39 inode, (unsigned long)iaddr2blockno(sb, &ino_num));
39 return BEFS_BAD_INODE; 40 return BEFS_BAD_INODE;
40 } 41 }
41 42
@@ -44,7 +45,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
44 */ 45 */
45 46
46 if (!(flags & BEFS_INODE_IN_USE)) { 47 if (!(flags & BEFS_INODE_IN_USE)) {
47 befs_error(sb, "inode is not used - inode = %lu", inode); 48 befs_error(sb, "inode is not used - inode = %lu",
49 (unsigned long)inode);
48 return BEFS_BAD_INODE; 50 return BEFS_BAD_INODE;
49 } 51 }
50 52
diff --git a/fs/befs/io.c b/fs/befs/io.c
index ddef98aa255d..0408a3d601d0 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -30,9 +30,9 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
30 befs_blocknr_t block = 0; 30 befs_blocknr_t block = 0;
31 befs_sb_info *befs_sb = BEFS_SB(sb); 31 befs_sb_info *befs_sb = BEFS_SB(sb);
32 32
33 befs_debug(sb, "---> Enter befs_read_iaddr() " 33 befs_debug(sb, "---> Enter %s "
34 "[%u, %hu, %hu]", 34 "[%u, %hu, %hu]", __func__, iaddr.allocation_group,
35 iaddr.allocation_group, iaddr.start, iaddr.len); 35 iaddr.start, iaddr.len);
36 36
37 if (iaddr.allocation_group > befs_sb->num_ags) { 37 if (iaddr.allocation_group > befs_sb->num_ags) {
38 befs_error(sb, "BEFS: Invalid allocation group %u, max is %u", 38 befs_error(sb, "BEFS: Invalid allocation group %u, max is %u",
@@ -42,20 +42,21 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
42 42
43 block = iaddr2blockno(sb, &iaddr); 43 block = iaddr2blockno(sb, &iaddr);
44 44
45 befs_debug(sb, "befs_read_iaddr: offset = %lu", block); 45 befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block);
46 46
47 bh = sb_bread(sb, block); 47 bh = sb_bread(sb, block);
48 48
49 if (bh == NULL) { 49 if (bh == NULL) {
50 befs_error(sb, "Failed to read block %lu", block); 50 befs_error(sb, "Failed to read block %lu",
51 (unsigned long)block);
51 goto error; 52 goto error;
52 } 53 }
53 54
54 befs_debug(sb, "<--- befs_read_iaddr()"); 55 befs_debug(sb, "<--- %s", __func__);
55 return bh; 56 return bh;
56 57
57 error: 58 error:
58 befs_debug(sb, "<--- befs_read_iaddr() ERROR"); 59 befs_debug(sb, "<--- %s ERROR", __func__);
59 return NULL; 60 return NULL;
60} 61}
61 62
@@ -64,20 +65,21 @@ befs_bread(struct super_block *sb, befs_blocknr_t block)
64{ 65{
65 struct buffer_head *bh = NULL; 66 struct buffer_head *bh = NULL;
66 67
67 befs_debug(sb, "---> Enter befs_read() %Lu", block); 68 befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
68 69
69 bh = sb_bread(sb, block); 70 bh = sb_bread(sb, block);
70 71
71 if (bh == NULL) { 72 if (bh == NULL) {
72 befs_error(sb, "Failed to read block %lu", block); 73 befs_error(sb, "Failed to read block %lu",
74 (unsigned long)block);
73 goto error; 75 goto error;
74 } 76 }
75 77
76 befs_debug(sb, "<--- befs_read()"); 78 befs_debug(sb, "<--- %s", __func__);
77 79
78 return bh; 80 return bh;
79 81
80 error: 82 error:
81 befs_debug(sb, "<--- befs_read() ERROR"); 83 befs_debug(sb, "<--- %s ERROR", __func__);
82 return NULL; 84 return NULL;
83} 85}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 845d2d690ce2..d626756ff721 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -5,6 +5,8 @@
5 * 5 *
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/module.h> 10#include <linux/module.h>
9#include <linux/slab.h> 11#include <linux/slab.h>
10#include <linux/fs.h> 12#include <linux/fs.h>
@@ -39,7 +41,6 @@ static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int)
39static struct inode *befs_iget(struct super_block *, unsigned long); 41static struct inode *befs_iget(struct super_block *, unsigned long);
40static struct inode *befs_alloc_inode(struct super_block *sb); 42static struct inode *befs_alloc_inode(struct super_block *sb);
41static void befs_destroy_inode(struct inode *inode); 43static void befs_destroy_inode(struct inode *inode);
42static int befs_init_inodecache(void);
43static void befs_destroy_inodecache(void); 44static void befs_destroy_inodecache(void);
44static void *befs_follow_link(struct dentry *, struct nameidata *); 45static void *befs_follow_link(struct dentry *, struct nameidata *);
45static void *befs_fast_follow_link(struct dentry *, struct nameidata *); 46static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
@@ -131,26 +132,28 @@ befs_get_block(struct inode *inode, sector_t block,
131 ulong disk_off; 132 ulong disk_off;
132 133
133 befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", 134 befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld",
134 inode->i_ino, block); 135 (unsigned long)inode->i_ino, (long)block);
135 136
136 if (block < 0) { 137 if (block < 0) {
137 befs_error(sb, "befs_get_block() was asked for a block " 138 befs_error(sb, "befs_get_block() was asked for a block "
138 "number less than zero: block %ld in inode %lu", 139 "number less than zero: block %ld in inode %lu",
139 block, inode->i_ino); 140 (long)block, (unsigned long)inode->i_ino);
140 return -EIO; 141 return -EIO;
141 } 142 }
142 143
143 if (create) { 144 if (create) {
144 befs_error(sb, "befs_get_block() was asked to write to " 145 befs_error(sb, "befs_get_block() was asked to write to "
145 "block %ld in inode %lu", block, inode->i_ino); 146 "block %ld in inode %lu", (long)block,
147 (unsigned long)inode->i_ino);
146 return -EPERM; 148 return -EPERM;
147 } 149 }
148 150
149 res = befs_fblock2brun(sb, ds, block, &run); 151 res = befs_fblock2brun(sb, ds, block, &run);
150 if (res != BEFS_OK) { 152 if (res != BEFS_OK) {
151 befs_error(sb, 153 befs_error(sb,
152 "<--- befs_get_block() for inode %lu, block " 154 "<--- %s for inode %lu, block %ld ERROR",
153 "%ld ERROR", inode->i_ino, block); 155 __func__, (unsigned long)inode->i_ino,
156 (long)block);
154 return -EFBIG; 157 return -EFBIG;
155 } 158 }
156 159
@@ -158,8 +161,9 @@ befs_get_block(struct inode *inode, sector_t block,
158 161
159 map_bh(bh_result, inode->i_sb, disk_off); 162 map_bh(bh_result, inode->i_sb, disk_off);
160 163
161 befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, " 164 befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu",
162 "disk address %lu", inode->i_ino, block, disk_off); 165 __func__, (unsigned long)inode->i_ino, (long)block,
166 (unsigned long)disk_off);
163 167
164 return 0; 168 return 0;
165} 169}
@@ -176,15 +180,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
176 char *utfname; 180 char *utfname;
177 const char *name = dentry->d_name.name; 181 const char *name = dentry->d_name.name;
178 182
179 befs_debug(sb, "---> befs_lookup() " 183 befs_debug(sb, "---> %s name %s inode %ld", __func__,
180 "name %s inode %ld", dentry->d_name.name, dir->i_ino); 184 dentry->d_name.name, dir->i_ino);
181 185
182 /* Convert to UTF-8 */ 186 /* Convert to UTF-8 */
183 if (BEFS_SB(sb)->nls) { 187 if (BEFS_SB(sb)->nls) {
184 ret = 188 ret =
185 befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); 189 befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen);
186 if (ret < 0) { 190 if (ret < 0) {
187 befs_debug(sb, "<--- befs_lookup() ERROR"); 191 befs_debug(sb, "<--- %s ERROR", __func__);
188 return ERR_PTR(ret); 192 return ERR_PTR(ret);
189 } 193 }
190 ret = befs_btree_find(sb, ds, utfname, &offset); 194 ret = befs_btree_find(sb, ds, utfname, &offset);
@@ -195,12 +199,12 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
195 } 199 }
196 200
197 if (ret == BEFS_BT_NOT_FOUND) { 201 if (ret == BEFS_BT_NOT_FOUND) {
198 befs_debug(sb, "<--- befs_lookup() %s not found", 202 befs_debug(sb, "<--- %s %s not found", __func__,
199 dentry->d_name.name); 203 dentry->d_name.name);
200 return ERR_PTR(-ENOENT); 204 return ERR_PTR(-ENOENT);
201 205
202 } else if (ret != BEFS_OK || offset == 0) { 206 } else if (ret != BEFS_OK || offset == 0) {
203 befs_warning(sb, "<--- befs_lookup() Error"); 207 befs_warning(sb, "<--- %s Error", __func__);
204 return ERR_PTR(-ENODATA); 208 return ERR_PTR(-ENODATA);
205 } 209 }
206 210
@@ -210,7 +214,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
210 214
211 d_add(dentry, inode); 215 d_add(dentry, inode);
212 216
213 befs_debug(sb, "<--- befs_lookup()"); 217 befs_debug(sb, "<--- %s", __func__);
214 218
215 return NULL; 219 return NULL;
216} 220}
@@ -228,26 +232,25 @@ befs_readdir(struct file *file, struct dir_context *ctx)
228 char keybuf[BEFS_NAME_LEN + 1]; 232 char keybuf[BEFS_NAME_LEN + 1];
229 const char *dirname = file->f_path.dentry->d_name.name; 233 const char *dirname = file->f_path.dentry->d_name.name;
230 234
231 befs_debug(sb, "---> befs_readdir() " 235 befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld",
232 "name %s, inode %ld, ctx->pos %Ld", 236 __func__, dirname, inode->i_ino, ctx->pos);
233 dirname, inode->i_ino, ctx->pos);
234 237
235more: 238more:
236 result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, 239 result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
237 keybuf, &keysize, &value); 240 keybuf, &keysize, &value);
238 241
239 if (result == BEFS_ERR) { 242 if (result == BEFS_ERR) {
240 befs_debug(sb, "<--- befs_readdir() ERROR"); 243 befs_debug(sb, "<--- %s ERROR", __func__);
241 befs_error(sb, "IO error reading %s (inode %lu)", 244 befs_error(sb, "IO error reading %s (inode %lu)",
242 dirname, inode->i_ino); 245 dirname, inode->i_ino);
243 return -EIO; 246 return -EIO;
244 247
245 } else if (result == BEFS_BT_END) { 248 } else if (result == BEFS_BT_END) {
246 befs_debug(sb, "<--- befs_readdir() END"); 249 befs_debug(sb, "<--- %s END", __func__);
247 return 0; 250 return 0;
248 251
249 } else if (result == BEFS_BT_EMPTY) { 252 } else if (result == BEFS_BT_EMPTY) {
250 befs_debug(sb, "<--- befs_readdir() Empty directory"); 253 befs_debug(sb, "<--- %s Empty directory", __func__);
251 return 0; 254 return 0;
252 } 255 }
253 256
@@ -260,7 +263,7 @@ more:
260 result = 263 result =
261 befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); 264 befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
262 if (result < 0) { 265 if (result < 0) {
263 befs_debug(sb, "<--- befs_readdir() ERROR"); 266 befs_debug(sb, "<--- %s ERROR", __func__);
264 return result; 267 return result;
265 } 268 }
266 if (!dir_emit(ctx, nlsname, nlsnamelen, 269 if (!dir_emit(ctx, nlsname, nlsnamelen,
@@ -277,7 +280,7 @@ more:
277 ctx->pos++; 280 ctx->pos++;
278 goto more; 281 goto more;
279 282
280 befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); 283 befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos);
281 284
282 return 0; 285 return 0;
283} 286}
@@ -321,7 +324,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
321 struct inode *inode; 324 struct inode *inode;
322 long ret = -EIO; 325 long ret = -EIO;
323 326
324 befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); 327 befs_debug(sb, "---> %s inode = %lu", __func__, ino);
325 328
326 inode = iget_locked(sb, ino); 329 inode = iget_locked(sb, ino);
327 if (!inode) 330 if (!inode)
@@ -428,7 +431,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
428 } 431 }
429 432
430 brelse(bh); 433 brelse(bh);
431 befs_debug(sb, "<--- befs_read_inode()"); 434 befs_debug(sb, "<--- %s", __func__);
432 unlock_new_inode(inode); 435 unlock_new_inode(inode);
433 return inode; 436 return inode;
434 437
@@ -437,7 +440,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
437 440
438 unacquire_none: 441 unacquire_none:
439 iget_failed(inode); 442 iget_failed(inode);
440 befs_debug(sb, "<--- befs_read_inode() - Bad inode"); 443 befs_debug(sb, "<--- %s - Bad inode", __func__);
441 return ERR_PTR(ret); 444 return ERR_PTR(ret);
442} 445}
443 446
@@ -445,7 +448,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
445 * 448 *
446 * Taken from NFS implementation by Al Viro. 449 * Taken from NFS implementation by Al Viro.
447 */ 450 */
448static int 451static int __init
449befs_init_inodecache(void) 452befs_init_inodecache(void)
450{ 453{
451 befs_inode_cachep = kmem_cache_create("befs_inode_cache", 454 befs_inode_cachep = kmem_cache_create("befs_inode_cache",
@@ -454,11 +457,9 @@ befs_init_inodecache(void)
454 SLAB_MEM_SPREAD), 457 SLAB_MEM_SPREAD),
455 init_once); 458 init_once);
456 if (befs_inode_cachep == NULL) { 459 if (befs_inode_cachep == NULL) {
457 printk(KERN_ERR "befs_init_inodecache: " 460 pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
458 "Couldn't initialize inode slabcache\n");
459 return -ENOMEM; 461 return -ENOMEM;
460 } 462 }
461
462 return 0; 463 return 0;
463} 464}
464 465
@@ -544,16 +545,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
544 */ 545 */
545 int maxlen = in_len + 1; 546 int maxlen = in_len + 1;
546 547
547 befs_debug(sb, "---> utf2nls()"); 548 befs_debug(sb, "---> %s", __func__);
548 549
549 if (!nls) { 550 if (!nls) {
550 befs_error(sb, "befs_utf2nls called with no NLS table loaded"); 551 befs_error(sb, "%s called with no NLS table loaded", __func__);
551 return -EINVAL; 552 return -EINVAL;
552 } 553 }
553 554
554 *out = result = kmalloc(maxlen, GFP_NOFS); 555 *out = result = kmalloc(maxlen, GFP_NOFS);
555 if (!*out) { 556 if (!*out) {
556 befs_error(sb, "befs_utf2nls() cannot allocate memory"); 557 befs_error(sb, "%s cannot allocate memory", __func__);
557 *out_len = 0; 558 *out_len = 0;
558 return -ENOMEM; 559 return -ENOMEM;
559 } 560 }
@@ -575,14 +576,14 @@ befs_utf2nls(struct super_block *sb, const char *in,
575 result[o] = '\0'; 576 result[o] = '\0';
576 *out_len = o; 577 *out_len = o;
577 578
578 befs_debug(sb, "<--- utf2nls()"); 579 befs_debug(sb, "<--- %s", __func__);
579 580
580 return o; 581 return o;
581 582
582 conv_err: 583 conv_err:
583 befs_error(sb, "Name using character set %s contains a character that " 584 befs_error(sb, "Name using character set %s contains a character that "
584 "cannot be converted to unicode.", nls->charset); 585 "cannot be converted to unicode.", nls->charset);
585 befs_debug(sb, "<--- utf2nls()"); 586 befs_debug(sb, "<--- %s", __func__);
586 kfree(result); 587 kfree(result);
587 return -EILSEQ; 588 return -EILSEQ;
588} 589}
@@ -623,16 +624,17 @@ befs_nls2utf(struct super_block *sb, const char *in,
623 * in special cases */ 624 * in special cases */
624 int maxlen = (3 * in_len) + 1; 625 int maxlen = (3 * in_len) + 1;
625 626
626 befs_debug(sb, "---> nls2utf()\n"); 627 befs_debug(sb, "---> %s\n", __func__);
627 628
628 if (!nls) { 629 if (!nls) {
629 befs_error(sb, "befs_nls2utf called with no NLS table loaded."); 630 befs_error(sb, "%s called with no NLS table loaded.",
631 __func__);
630 return -EINVAL; 632 return -EINVAL;
631 } 633 }
632 634
633 *out = result = kmalloc(maxlen, GFP_NOFS); 635 *out = result = kmalloc(maxlen, GFP_NOFS);
634 if (!*out) { 636 if (!*out) {
635 befs_error(sb, "befs_nls2utf() cannot allocate memory"); 637 befs_error(sb, "%s cannot allocate memory", __func__);
636 *out_len = 0; 638 *out_len = 0;
637 return -ENOMEM; 639 return -ENOMEM;
638 } 640 }
@@ -653,14 +655,14 @@ befs_nls2utf(struct super_block *sb, const char *in,
653 result[o] = '\0'; 655 result[o] = '\0';
654 *out_len = o; 656 *out_len = o;
655 657
656 befs_debug(sb, "<--- nls2utf()"); 658 befs_debug(sb, "<--- %s", __func__);
657 659
658 return i; 660 return i;
659 661
660 conv_err: 662 conv_err:
661	befs_error(sb, "Name using character set %s contains a character that "	663	befs_error(sb, "Name using character set %s contains a character that "
662 "cannot be converted to unicode.", nls->charset); 664 "cannot be converted to unicode.", nls->charset);
663 befs_debug(sb, "<--- nls2utf()"); 665 befs_debug(sb, "<--- %s", __func__);
664 kfree(result); 666 kfree(result);
665 return -EILSEQ; 667 return -EILSEQ;
666} 668}
@@ -715,8 +717,8 @@ parse_options(char *options, befs_mount_options * opts)
715 if (option >= 0) 717 if (option >= 0)
716 uid = make_kuid(current_user_ns(), option); 718 uid = make_kuid(current_user_ns(), option);
717 if (!uid_valid(uid)) { 719 if (!uid_valid(uid)) {
718 printk(KERN_ERR "BeFS: Invalid uid %d, " 720 pr_err("Invalid uid %d, "
719 "using default\n", option); 721 "using default\n", option);
720 break; 722 break;
721 } 723 }
722 opts->uid = uid; 724 opts->uid = uid;
@@ -729,8 +731,8 @@ parse_options(char *options, befs_mount_options * opts)
729 if (option >= 0) 731 if (option >= 0)
730 gid = make_kgid(current_user_ns(), option); 732 gid = make_kgid(current_user_ns(), option);
731 if (!gid_valid(gid)) { 733 if (!gid_valid(gid)) {
732 printk(KERN_ERR "BeFS: Invalid gid %d, " 734 pr_err("Invalid gid %d, "
733 "using default\n", option); 735 "using default\n", option);
734 break; 736 break;
735 } 737 }
736 opts->gid = gid; 738 opts->gid = gid;
@@ -740,8 +742,8 @@ parse_options(char *options, befs_mount_options * opts)
740 kfree(opts->iocharset); 742 kfree(opts->iocharset);
741 opts->iocharset = match_strdup(&args[0]); 743 opts->iocharset = match_strdup(&args[0]);
742 if (!opts->iocharset) { 744 if (!opts->iocharset) {
743 printk(KERN_ERR "BeFS: allocation failure for " 745 pr_err("allocation failure for "
744 "iocharset string\n"); 746 "iocharset string\n");
745 return 0; 747 return 0;
746 } 748 }
747 break; 749 break;
@@ -749,8 +751,8 @@ parse_options(char *options, befs_mount_options * opts)
749 opts->debug = 1; 751 opts->debug = 1;
750 break; 752 break;
751 default: 753 default:
752 printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" " 754 pr_err("Unrecognized mount option \"%s\" "
753 "or missing value\n", p); 755 "or missing value\n", p);
754 return 0; 756 return 0;
755 } 757 }
756 } 758 }
@@ -791,22 +793,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
791 793
792 save_mount_options(sb, data); 794 save_mount_options(sb, data);
793 795
794 sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); 796 sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
795 if (sb->s_fs_info == NULL) { 797 if (sb->s_fs_info == NULL) {
796 printk(KERN_ERR 798 pr_err("(%s): Unable to allocate memory for private "
797 "BeFS(%s): Unable to allocate memory for private "
798 "portion of superblock. Bailing.\n", sb->s_id); 799 "portion of superblock. Bailing.\n", sb->s_id);
799 goto unacquire_none; 800 goto unacquire_none;
800 } 801 }
801 befs_sb = BEFS_SB(sb); 802 befs_sb = BEFS_SB(sb);
802 memset(befs_sb, 0, sizeof(befs_sb_info));
803 803
804 if (!parse_options((char *) data, &befs_sb->mount_opts)) { 804 if (!parse_options((char *) data, &befs_sb->mount_opts)) {
805 befs_error(sb, "cannot parse mount options"); 805 befs_error(sb, "cannot parse mount options");
806 goto unacquire_priv_sbp; 806 goto unacquire_priv_sbp;
807 } 807 }
808 808
809 befs_debug(sb, "---> befs_fill_super()"); 809 befs_debug(sb, "---> %s", __func__);
810 810
811#ifndef CONFIG_BEFS_RW 811#ifndef CONFIG_BEFS_RW
812 if (!(sb->s_flags & MS_RDONLY)) { 812 if (!(sb->s_flags & MS_RDONLY)) {
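
Two changes in the befs_fill_super() hunk travel together: the kmalloc() becomes a kzalloc(), and the now-redundant memset() two lines below is deleted. Besides saving a call, this removes a latent mismatch, since the memset sized the buffer as sizeof(befs_sb_info) while the allocation used sizeof(*befs_sb), two expressions that only stay equal by discipline. The equivalence, as a standalone sketch:

    #include <linux/slab.h>
    #include <linux/string.h>

    static void *alloc_private_old(size_t size)
    {
            void *p = kmalloc(size, GFP_KERNEL);    /* old: allocate ... */

            if (p)
                    memset(p, 0, size);             /* ... then zero by hand */
            return p;
    }

    static void *alloc_private_new(size_t size)
    {
            return kzalloc(size, GFP_KERNEL);       /* new: one zeroed allocation */
    }
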
@@ -854,7 +854,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
854 goto unacquire_priv_sbp; 854 goto unacquire_priv_sbp;
855 855
856 if( befs_sb->num_blocks > ~((sector_t)0) ) { 856 if( befs_sb->num_blocks > ~((sector_t)0) ) {
857 befs_error(sb, "blocks count: %Lu " 857 befs_error(sb, "blocks count: %llu "
858 "is larger than the host can use", 858 "is larger than the host can use",
859 befs_sb->num_blocks); 859 befs_sb->num_blocks);
860 goto unacquire_priv_sbp; 860 goto unacquire_priv_sbp;
@@ -913,6 +913,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
913static int 913static int
914befs_remount(struct super_block *sb, int *flags, char *data) 914befs_remount(struct super_block *sb, int *flags, char *data)
915{ 915{
916 sync_filesystem(sb);
916 if (!(*flags & MS_RDONLY)) 917 if (!(*flags & MS_RDONLY))
917 return -EINVAL; 918 return -EINVAL;
918 return 0; 919 return 0;
@@ -924,7 +925,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
924 struct super_block *sb = dentry->d_sb; 925 struct super_block *sb = dentry->d_sb;
925 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 926 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
926 927
927 befs_debug(sb, "---> befs_statfs()"); 928 befs_debug(sb, "---> %s", __func__);
928 929
929 buf->f_type = BEFS_SUPER_MAGIC; 930 buf->f_type = BEFS_SUPER_MAGIC;
930 buf->f_bsize = sb->s_blocksize; 931 buf->f_bsize = sb->s_blocksize;
@@ -937,7 +938,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
937 buf->f_fsid.val[1] = (u32)(id >> 32); 938 buf->f_fsid.val[1] = (u32)(id >> 32);
938 buf->f_namelen = BEFS_NAME_LEN; 939 buf->f_namelen = BEFS_NAME_LEN;
939 940
940 befs_debug(sb, "<--- befs_statfs()"); 941 befs_debug(sb, "<--- %s", __func__);
941 942
942 return 0; 943 return 0;
943} 944}
@@ -963,7 +964,7 @@ init_befs_fs(void)
963{ 964{
964 int err; 965 int err;
965 966
966 printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION); 967 pr_info("version: %s\n", BEFS_VERSION);
967 968
968 err = befs_init_inodecache(); 969 err = befs_init_inodecache();
969 if (err) 970 if (err)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8defc6b3f9a2..7041ac35ace8 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -172,7 +172,7 @@ static void bfs_evict_inode(struct inode *inode)
172 172
173 dprintf("ino=%08lx\n", ino); 173 dprintf("ino=%08lx\n", ino);
174 174
175 truncate_inode_pages(&inode->i_data, 0); 175 truncate_inode_pages_final(&inode->i_data);
176 invalidate_inode_buffers(inode); 176 invalidate_inode_buffers(inode);
177 clear_inode(inode); 177 clear_inode(inode);
178 178
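
This bfs hunk is one instance of a tree-wide v3.15 conversion that recurs throughout this merge: ->evict_inode implementations call truncate_inode_pages_final() rather than truncate_inode_pages(..., 0). The _final variant is reserved for eviction, where no new pages can appear, and it also reclaims the shadow entries the page cache now keeps. The resulting shape of a minimal eviction handler (illustrative, not any one filesystem's code):

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/buffer_head.h>

    static void example_evict_inode(struct inode *inode)
    {
            truncate_inode_pages_final(&inode->i_data); /* the last truncate, ever */
            invalidate_inode_buffers(inode);
            clear_inode(inode);
    }
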
@@ -266,7 +266,7 @@ static void init_once(void *foo)
266 inode_init_once(&bi->vfs_inode); 266 inode_init_once(&bi->vfs_inode);
267} 267}
268 268
269static int init_inodecache(void) 269static int __init init_inodecache(void)
270{ 270{
271 bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", 271 bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
272 sizeof(struct bfs_inode_info), 272 sizeof(struct bfs_inode_info),
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 67be2951b98a..aa3cb626671e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,10 +46,15 @@
46#endif 46#endif
47 47
48static int load_elf_binary(struct linux_binprm *bprm); 48static int load_elf_binary(struct linux_binprm *bprm);
49static int load_elf_library(struct file *);
50static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, 49static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
51 int, int, unsigned long); 50 int, int, unsigned long);
52 51
52#ifdef CONFIG_USELIB
53static int load_elf_library(struct file *);
54#else
55#define load_elf_library NULL
56#endif
57
53/* 58/*
54 * If we don't support core dumping, then supply a NULL so we 59 * If we don't support core dumping, then supply a NULL so we
55 * don't even try. 60 * don't even try.
@@ -579,7 +584,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
579 unsigned long start_code, end_code, start_data, end_data; 584 unsigned long start_code, end_code, start_data, end_data;
580 unsigned long reloc_func_desc __maybe_unused = 0; 585 unsigned long reloc_func_desc __maybe_unused = 0;
581 int executable_stack = EXSTACK_DEFAULT; 586 int executable_stack = EXSTACK_DEFAULT;
582 unsigned long def_flags = 0;
583 struct pt_regs *regs = current_pt_regs(); 587 struct pt_regs *regs = current_pt_regs();
584 struct { 588 struct {
585 struct elfhdr elf_ex; 589 struct elfhdr elf_ex;
@@ -719,9 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
719 if (retval) 723 if (retval)
720 goto out_free_dentry; 724 goto out_free_dentry;
721 725
722 /* OK, This is the point of no return */
723 current->mm->def_flags = def_flags;
724
725 /* Do this immediately, since STACK_TOP as used in setup_arg_pages 726 /* Do this immediately, since STACK_TOP as used in setup_arg_pages
726 may depend on the personality. */ 727 may depend on the personality. */
727 SET_PERSONALITY(loc->elf_ex); 728 SET_PERSONALITY(loc->elf_ex);
@@ -1005,6 +1006,7 @@ out_free_ph:
1005 goto out; 1006 goto out;
1006} 1007}
1007 1008
1009#ifdef CONFIG_USELIB
1008/* This is really simpleminded and specialized - we are loading an 1010/* This is really simpleminded and specialized - we are loading an
1009 a.out library that is given an ELF header. */ 1011 a.out library that is given an ELF header. */
1010static int load_elf_library(struct file *file) 1012static int load_elf_library(struct file *file)
@@ -1083,6 +1085,7 @@ out_free_ph:
1083out: 1085out:
1084 return error; 1086 return error;
1085} 1087}
1088#endif /* #ifdef CONFIG_USELIB */
1086 1089
1087#ifdef CONFIG_ELF_CORE 1090#ifdef CONFIG_ELF_CORE
1088/* 1091/*
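
Taken together, the binfmt_elf.c hunks let uselib(2) support be compiled out: load_elf_library() moves behind CONFIG_USELIB and its name collapses to NULL otherwise. A sketch of how that NULL plugs into the format's handler table; the table below is an assumption about the era's elf_format, abridged rather than quoted:

    #ifdef CONFIG_USELIB
    static int load_elf_library(struct file *);
    #else
    #define load_elf_library NULL   /* hook absent when uselib(2) is configured out */
    #endif

    static struct linux_binfmt elf_format = {
            .module      = THIS_MODULE,
            .load_binary = load_elf_binary,
            .load_shlib  = load_elf_library, /* NULL when !CONFIG_USELIB */
            .core_dump   = elf_core_dump,
    };
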
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1c740e152f38..b60500300dd7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -656,6 +656,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
656 656
657 mutex_unlock(&root->d_inode->i_mutex); 657 mutex_unlock(&root->d_inode->i_mutex);
658 dput(root); 658 dput(root);
659 break;
659 default: return res; 660 default: return res;
660 } 661 }
661 return count; 662 return count;
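
The single added line here is a missing-break fix: the successful case in bm_status_write()'s switch fell through into default: and returned res where the function should fall out and report count bytes consumed. A toy reduction of the bug, with hypothetical names:

    /* Without the marked break, the success case falls through into
     * "default:" and the caller sees 'res' rather than 'count'. */
    static long toy_status_write(int cmd, long res, long count)
    {
            switch (cmd) {
            case 1:
                    /* ... apply the requested state change ... */
                    break;          /* the fix: stop the fallthrough */
            default:
                    return res;
            }
            return count;           /* success: report bytes consumed */
    }
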
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4f70f383132c..1c2ce0c87711 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -182,6 +182,9 @@ static int bdev_integrity_enabled(struct block_device *bdev, int rw)
182 */ 182 */
183int bio_integrity_enabled(struct bio *bio) 183int bio_integrity_enabled(struct bio *bio)
184{ 184{
185 if (!bio_is_rw(bio))
186 return 0;
187
185 /* Already protected? */ 188 /* Already protected? */
186 if (bio_integrity(bio)) 189 if (bio_integrity(bio))
187 return 0; 190 return 0;
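
The new early return makes bio_integrity_enabled() answer "no" before any profile lookup for bios that carry no data payload, which is what bio_is_rw() filters out. Schematically (a simplified sketch of the function's new head, not its full body):

    static int integrity_enabled_sketch(struct bio *bio)
    {
            if (!bio_is_rw(bio))            /* no payload: nothing to protect */
                    return 0;
            if (bio_integrity(bio))         /* a bip is already attached */
                    return 0;
            /* ... continue to the device's blk_integrity profile checks ... */
            return 1;
    }
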
@@ -301,45 +304,65 @@ int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
301EXPORT_SYMBOL(bio_integrity_get_tag); 304EXPORT_SYMBOL(bio_integrity_get_tag);
302 305
303/** 306/**
304 * bio_integrity_generate - Generate integrity metadata for a bio 307 * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio
305 * @bio: bio to generate integrity metadata for 308 * @bio: bio to generate/verify integrity metadata for
306 * 309 * @operate: operation selector: 1 to generate, 0 to verify
307 * Description: Generates integrity metadata for a bio by calling the
308 * block device's generation callback function. The bio must have a
309 * bip attached with enough room to accommodate the generated
310 * integrity metadata.
311 */ 310 */
312static void bio_integrity_generate(struct bio *bio) 311static int bio_integrity_generate_verify(struct bio *bio, int operate)
313{ 312{
314 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 313 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
315 struct blk_integrity_exchg bix; 314 struct blk_integrity_exchg bix;
316 struct bio_vec bv; 315 struct bio_vec *bv;
317 struct bvec_iter iter; 316 sector_t sector;
318 sector_t sector = bio->bi_iter.bi_sector; 317 unsigned int sectors, ret = 0, i;
319 unsigned int sectors, total;
320 void *prot_buf = bio->bi_integrity->bip_buf; 318 void *prot_buf = bio->bi_integrity->bip_buf;
321 319
322 total = 0; 320 if (operate)
321 sector = bio->bi_iter.bi_sector;
322 else
323 sector = bio->bi_integrity->bip_iter.bi_sector;
324
323 bix.disk_name = bio->bi_bdev->bd_disk->disk_name; 325 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
324 bix.sector_size = bi->sector_size; 326 bix.sector_size = bi->sector_size;
325 327
326 bio_for_each_segment(bv, bio, iter) { 328 bio_for_each_segment_all(bv, bio, i) {
327 void *kaddr = kmap_atomic(bv.bv_page); 329 void *kaddr = kmap_atomic(bv->bv_page);
328 bix.data_buf = kaddr + bv.bv_offset; 330 bix.data_buf = kaddr + bv->bv_offset;
329 bix.data_size = bv.bv_len; 331 bix.data_size = bv->bv_len;
330 bix.prot_buf = prot_buf; 332 bix.prot_buf = prot_buf;
331 bix.sector = sector; 333 bix.sector = sector;
332 334
333 bi->generate_fn(&bix); 335 if (operate)
336 bi->generate_fn(&bix);
337 else {
338 ret = bi->verify_fn(&bix);
339 if (ret) {
340 kunmap_atomic(kaddr);
341 return ret;
342 }
343 }
334 344
335 sectors = bv.bv_len / bi->sector_size; 345 sectors = bv->bv_len / bi->sector_size;
336 sector += sectors; 346 sector += sectors;
337 prot_buf += sectors * bi->tuple_size; 347 prot_buf += sectors * bi->tuple_size;
338 total += sectors * bi->tuple_size;
339 BUG_ON(total > bio->bi_integrity->bip_iter.bi_size);
340 348
341 kunmap_atomic(kaddr); 349 kunmap_atomic(kaddr);
342 } 350 }
351 return ret;
352}
353
354/**
355 * bio_integrity_generate - Generate integrity metadata for a bio
356 * @bio: bio to generate integrity metadata for
357 *
358 * Description: Generates integrity metadata for a bio by calling the
359 * block device's generation callback function. The bio must have a
360 * bip attached with enough room to accommodate the generated
361 * integrity metadata.
362 */
363static void bio_integrity_generate(struct bio *bio)
364{
365 bio_integrity_generate_verify(bio, 1);
343} 366}
344 367
345static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) 368static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
@@ -454,40 +477,7 @@ EXPORT_SYMBOL(bio_integrity_prep);
454 */ 477 */
455static int bio_integrity_verify(struct bio *bio) 478static int bio_integrity_verify(struct bio *bio)
456{ 479{
457 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 480 return bio_integrity_generate_verify(bio, 0);
458 struct blk_integrity_exchg bix;
459 struct bio_vec *bv;
460 sector_t sector = bio->bi_integrity->bip_iter.bi_sector;
461 unsigned int sectors, ret = 0;
462 void *prot_buf = bio->bi_integrity->bip_buf;
463 int i;
464
465 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
466 bix.sector_size = bi->sector_size;
467
468 bio_for_each_segment_all(bv, bio, i) {
469 void *kaddr = kmap_atomic(bv->bv_page);
470
471 bix.data_buf = kaddr + bv->bv_offset;
472 bix.data_size = bv->bv_len;
473 bix.prot_buf = prot_buf;
474 bix.sector = sector;
475
476 ret = bi->verify_fn(&bix);
477
478 if (ret) {
479 kunmap_atomic(kaddr);
480 return ret;
481 }
482
483 sectors = bv->bv_len / bi->sector_size;
484 sector += sectors;
485 prot_buf += sectors * bi->tuple_size;
486
487 kunmap_atomic(kaddr);
488 }
489
490 return ret;
491} 481}
492 482
493/** 483/**
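
These two bio-integrity hunks are a de-duplication: the verify loop was a near copy of the generate loop, so both now delegate to bio_integrity_generate_verify(), with the operate flag selecting between generate_fn(), which cannot fail, and verify_fn(), whose first error is returned immediately. Note the merged loop walks segments with bio_for_each_segment_all() for both paths, where generate previously used the iterator-based bio_for_each_segment(). The shape of the refactor in miniature, every name illustrative:

    typedef int (*seg_op)(unsigned char *buf, unsigned int len);

    static int for_each_seg(unsigned char *buf, unsigned int seg_len,
                            unsigned int nsegs, seg_op gen, seg_op verify,
                            int do_generate)
    {
            unsigned int i;
            int ret = 0;

            for (i = 0; i < nsegs; i++, buf += seg_len) {
                    if (do_generate)
                            gen(buf, seg_len);              /* no failure path */
                    else if ((ret = verify(buf, seg_len)))
                            return ret;                     /* first mismatch wins */
            }
            return ret;
    }
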
diff --git a/fs/bio.c b/fs/bio.c
index 8754e7b6eb49..6f0362b77806 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -116,7 +116,6 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
116 if (!slab) 116 if (!slab)
117 goto out_unlock; 117 goto out_unlock;
118 118
119 printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
120 bslab->slab = slab; 119 bslab->slab = slab;
121 bslab->slab_ref = 1; 120 bslab->slab_ref = 1;
122 bslab->slab_size = sz; 121 bslab->slab_size = sz;
@@ -1003,7 +1002,7 @@ struct bio_map_data {
1003}; 1002};
1004 1003
1005static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, 1004static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
1006 struct sg_iovec *iov, int iov_count, 1005 const struct sg_iovec *iov, int iov_count,
1007 int is_our_pages) 1006 int is_our_pages)
1008{ 1007{
1009 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); 1008 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
@@ -1023,7 +1022,7 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs,
1023 sizeof(struct sg_iovec) * iov_count, gfp_mask); 1022 sizeof(struct sg_iovec) * iov_count, gfp_mask);
1024} 1023}
1025 1024
1026static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, 1025static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
1027 int to_user, int from_user, int do_free_page) 1026 int to_user, int from_user, int do_free_page)
1028{ 1027{
1029 int ret = 0, i; 1028 int ret = 0, i;
@@ -1121,7 +1120,7 @@ EXPORT_SYMBOL(bio_uncopy_user);
1121 */ 1120 */
1122struct bio *bio_copy_user_iov(struct request_queue *q, 1121struct bio *bio_copy_user_iov(struct request_queue *q,
1123 struct rq_map_data *map_data, 1122 struct rq_map_data *map_data,
1124 struct sg_iovec *iov, int iov_count, 1123 const struct sg_iovec *iov, int iov_count,
1125 int write_to_vm, gfp_t gfp_mask) 1124 int write_to_vm, gfp_t gfp_mask)
1126{ 1125{
1127 struct bio_map_data *bmd; 1126 struct bio_map_data *bmd;
@@ -1260,7 +1259,7 @@ EXPORT_SYMBOL(bio_copy_user);
1260 1259
1261static struct bio *__bio_map_user_iov(struct request_queue *q, 1260static struct bio *__bio_map_user_iov(struct request_queue *q,
1262 struct block_device *bdev, 1261 struct block_device *bdev,
1263 struct sg_iovec *iov, int iov_count, 1262 const struct sg_iovec *iov, int iov_count,
1264 int write_to_vm, gfp_t gfp_mask) 1263 int write_to_vm, gfp_t gfp_mask)
1265{ 1264{
1266 int i, j; 1265 int i, j;
@@ -1408,7 +1407,7 @@ EXPORT_SYMBOL(bio_map_user);
1408 * device. Returns an error pointer in case of error. 1407 * device. Returns an error pointer in case of error.
1409 */ 1408 */
1410struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, 1409struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
1411 struct sg_iovec *iov, int iov_count, 1410 const struct sg_iovec *iov, int iov_count,
1412 int write_to_vm, gfp_t gfp_mask) 1411 int write_to_vm, gfp_t gfp_mask)
1413{ 1412{
1414 struct bio *bio; 1413 struct bio *bio;
@@ -1970,7 +1969,7 @@ int bio_associate_current(struct bio *bio)
1970 1969
1971 /* associate blkcg if exists */ 1970 /* associate blkcg if exists */
1972 rcu_read_lock(); 1971 rcu_read_lock();
1973 css = task_css(current, blkio_subsys_id); 1972 css = task_css(current, blkio_cgrp_id);
1974 if (css && css_tryget(css)) 1973 if (css && css_tryget(css))
1975 bio->bi_css = css; 1974 bio->bi_css = css;
1976 rcu_read_unlock(); 1975 rcu_read_unlock();
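
Most of the fs/bio.c changes constify the sg_iovec parameters along the map and copy chains, promoting "these helpers only read the iovec array" from convention to a compiler-checked property; the last hunk is the mechanical blkio_subsys_id to blkio_cgrp_id rename that followed the cgroup core in the same release. The const-propagation pattern, reduced to a toy with made-up types:

    struct iov_toy {
            void *iov_base;
            unsigned long iov_len;
    };

    static unsigned long sum_lengths(const struct iov_toy *iov, int n)
    {
            unsigned long total = 0;
            int i;

            for (i = 0; i < n; i++)
                    total += iov[i].iov_len;        /* reads only, as const promises */
            return total;
    }

    unsigned long map_toy(const struct iov_toy *iov, int n)
    {
            return sum_lengths(iov, n);             /* const flows through, no casts */
    }
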
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e86823a9cbd..552a8d13bc32 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -83,7 +83,7 @@ void kill_bdev(struct block_device *bdev)
83{ 83{
84 struct address_space *mapping = bdev->bd_inode->i_mapping; 84 struct address_space *mapping = bdev->bd_inode->i_mapping;
85 85
86 if (mapping->nrpages == 0) 86 if (mapping->nrpages == 0 && mapping->nrshadows == 0)
87 return; 87 return;
88 88
89 invalidate_bh_lrus(); 89 invalidate_bh_lrus();
@@ -419,7 +419,7 @@ static void bdev_evict_inode(struct inode *inode)
419{ 419{
420 struct block_device *bdev = &BDEV_I(inode)->bdev; 420 struct block_device *bdev = &BDEV_I(inode)->bdev;
421 struct list_head *p; 421 struct list_head *p;
422 truncate_inode_pages(&inode->i_data, 0); 422 truncate_inode_pages_final(&inode->i_data);
423 invalidate_inode_buffers(inode); /* is it needed here? */ 423 invalidate_inode_buffers(inode); /* is it needed here? */
424 clear_inode(inode); 424 clear_inode(inode);
425 spin_lock(&bdev_lock); 425 spin_lock(&bdev_lock);
@@ -1518,12 +1518,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1518 BUG_ON(iocb->ki_pos != pos); 1518 BUG_ON(iocb->ki_pos != pos);
1519 1519
1520 blk_start_plug(&plug); 1520 blk_start_plug(&plug);
1521 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1521 ret = __generic_file_aio_write(iocb, iov, nr_segs);
1522 if (ret > 0) { 1522 if (ret > 0) {
1523 ssize_t err; 1523 ssize_t err;
1524 1524
1525 err = generic_write_sync(file, pos, ret); 1525 err = generic_write_sync(file, pos, ret);
1526 if (err < 0 && ret > 0) 1526 if (err < 0)
1527 ret = err; 1527 ret = err;
1528 } 1528 }
1529 blk_finish_plug(&plug); 1529 blk_finish_plug(&plug);
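
Three independent cleanups land in fs/block_dev.c: kill_bdev() now bails out only when the mapping holds neither pages nor shadow entries, eviction switches to truncate_inode_pages_final(), and blkdev_aio_write() is simplified twice, since __generic_file_aio_write() dropped its ppos argument (it always wrote at iocb->ki_pos) and the "&& ret > 0" half of the error test was already guaranteed by the enclosing branch. The sync tail after the cleanup, sketched with era-appropriate signatures:

    #include <linux/fs.h>
    #include <linux/aio.h>

    static ssize_t aio_write_tail(struct kiocb *iocb, const struct iovec *iov,
                                  unsigned long nr_segs, struct file *file,
                                  loff_t pos)
    {
            ssize_t ret = __generic_file_aio_write(iocb, iov, nr_segs);

            if (ret > 0) {
                    ssize_t err = generic_write_sync(file, pos, ret);

                    if (err < 0)    /* "&& ret > 0" is implied by this branch */
                            ret = err;
            }
            return ret;
    }
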
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9cc..5a201d81049c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * Copyright (C) 2014 Fujitsu. All rights reserved.
3 * 4 *
4 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 6 * modify it under the terms of the GNU General Public
@@ -21,708 +22,315 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
23#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/workqueue.h>
24#include "async-thread.h" 26#include "async-thread.h"
27#include "ctree.h"
28
29#define WORK_DONE_BIT 0
30#define WORK_ORDER_DONE_BIT 1
31#define WORK_HIGH_PRIO_BIT 2
32
33#define NO_THRESHOLD (-1)
34#define DFT_THRESHOLD (32)
35
36struct __btrfs_workqueue {
37 struct workqueue_struct *normal_wq;
38 /* List head pointing to ordered work list */
39 struct list_head ordered_list;
40
41 /* Spinlock for ordered_list */
42 spinlock_t list_lock;
43
44 /* Thresholding related variants */
45 atomic_t pending;
46 int max_active;
47 int current_max;
48 int thresh;
49 unsigned int count;
50 spinlock_t thres_lock;
51};
25 52
26#define WORK_QUEUED_BIT 0 53struct btrfs_workqueue {
27#define WORK_DONE_BIT 1 54 struct __btrfs_workqueue *normal;
28#define WORK_ORDER_DONE_BIT 2 55 struct __btrfs_workqueue *high;
29#define WORK_HIGH_PRIO_BIT 3 56};
30
31/*
32 * container for the kthread task pointer and the list of pending work
33 * One of these is allocated per thread.
34 */
35struct btrfs_worker_thread {
36 /* pool we belong to */
37 struct btrfs_workers *workers;
38
39 /* list of struct btrfs_work that are waiting for service */
40 struct list_head pending;
41 struct list_head prio_pending;
42
43 /* list of worker threads from struct btrfs_workers */
44 struct list_head worker_list;
45
46 /* kthread */
47 struct task_struct *task;
48 57
49 /* number of things on the pending list */ 58static inline struct __btrfs_workqueue
50 atomic_t num_pending; 59*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
60 int thresh)
61{
62 struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
51 63
52 /* reference counter for this struct */ 64 if (unlikely(!ret))
53 atomic_t refs; 65 return NULL;
54 66
55 unsigned long sequence; 67 ret->max_active = max_active;
68 atomic_set(&ret->pending, 0);
69 if (thresh == 0)
70 thresh = DFT_THRESHOLD;
71 /* For low threshold, disabling threshold is a better choice */
72 if (thresh < DFT_THRESHOLD) {
73 ret->current_max = max_active;
74 ret->thresh = NO_THRESHOLD;
75 } else {
76 ret->current_max = 1;
77 ret->thresh = thresh;
78 }
56 79
57 /* protects the pending list. */ 80 if (flags & WQ_HIGHPRI)
58 spinlock_t lock; 81 ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
82 ret->max_active,
83 "btrfs", name);
84 else
85 ret->normal_wq = alloc_workqueue("%s-%s", flags,
86 ret->max_active, "btrfs",
87 name);
88 if (unlikely(!ret->normal_wq)) {
89 kfree(ret);
90 return NULL;
91 }
59 92
60 /* set to non-zero when this thread is already awake and kicking */ 93 INIT_LIST_HEAD(&ret->ordered_list);
61 int working; 94 spin_lock_init(&ret->list_lock);
95 spin_lock_init(&ret->thres_lock);
96 trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
97 return ret;
98}
62 99
63 /* are we currently idle */ 100static inline void
64 int idle; 101__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
65};
66 102
67static int __btrfs_start_workers(struct btrfs_workers *workers); 103struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
104 int flags,
105 int max_active,
106 int thresh)
107{
108 struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
68 109
69/* 110 if (unlikely(!ret))
70 * btrfs_start_workers uses kthread_run, which can block waiting for memory 111 return NULL;
71 * for a very long time. It will actually throttle on page writeback,
72 * and so it may not make progress until after our btrfs worker threads
73 * process all of the pending work structs in their queue
74 *
75 * This means we can't use btrfs_start_workers from inside a btrfs worker
76 * thread that is used as part of cleaning dirty memory, which pretty much
77 * involves all of the worker threads.
78 *
79 * Instead we have a helper queue who never has more than one thread
80 * where we scheduler thread start operations. This worker_start struct
81 * is used to contain the work and hold a pointer to the queue that needs
82 * another worker.
83 */
84struct worker_start {
85 struct btrfs_work work;
86 struct btrfs_workers *queue;
87};
88 112
89static void start_new_worker_func(struct btrfs_work *work) 113 ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
90{ 114 max_active, thresh);
91 struct worker_start *start; 115 if (unlikely(!ret->normal)) {
92 start = container_of(work, struct worker_start, work); 116 kfree(ret);
93 __btrfs_start_workers(start->queue); 117 return NULL;
94 kfree(start); 118 }
95}
96 119
97/* 120 if (flags & WQ_HIGHPRI) {
98 * helper function to move a thread onto the idle list after it 121 ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
99 * has finished some requests. 122 thresh);
100 */ 123 if (unlikely(!ret->high)) {
101static void check_idle_worker(struct btrfs_worker_thread *worker) 124 __btrfs_destroy_workqueue(ret->normal);
102{ 125 kfree(ret);
103 if (!worker->idle && atomic_read(&worker->num_pending) < 126 return NULL;
104 worker->workers->idle_thresh / 2) {
105 unsigned long flags;
106 spin_lock_irqsave(&worker->workers->lock, flags);
107 worker->idle = 1;
108
109 /* the list may be empty if the worker is just starting */
110 if (!list_empty(&worker->worker_list) &&
111 !worker->workers->stopping) {
112 list_move(&worker->worker_list,
113 &worker->workers->idle_list);
114 } 127 }
115 spin_unlock_irqrestore(&worker->workers->lock, flags);
116 } 128 }
129 return ret;
117} 130}
118 131
119/* 132/*
120 * helper function to move a thread off the idle list after new 133 * Hook for threshold which will be called in btrfs_queue_work.
121 * pending work is added. 134 * This hook WILL be called in IRQ handler context,
135 * so workqueue_set_max_active MUST NOT be called in this hook
122 */ 136 */
123static void check_busy_worker(struct btrfs_worker_thread *worker) 137static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
124{ 138{
125 if (worker->idle && atomic_read(&worker->num_pending) >= 139 if (wq->thresh == NO_THRESHOLD)
126 worker->workers->idle_thresh) { 140 return;
127 unsigned long flags; 141 atomic_inc(&wq->pending);
128 spin_lock_irqsave(&worker->workers->lock, flags);
129 worker->idle = 0;
130
131 if (!list_empty(&worker->worker_list) &&
132 !worker->workers->stopping) {
133 list_move_tail(&worker->worker_list,
134 &worker->workers->worker_list);
135 }
136 spin_unlock_irqrestore(&worker->workers->lock, flags);
137 }
138} 142}
139 143
140static void check_pending_worker_creates(struct btrfs_worker_thread *worker) 144/*
145 * Hook for threshold which will be called before executing the work.
146 * This hook is called in kthread context.
147 * So workqueue_set_max_active is called here.
148 */
149static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
141{ 150{
142 struct btrfs_workers *workers = worker->workers; 151 int new_max_active;
143 struct worker_start *start; 152 long pending;
144 unsigned long flags; 153 int need_change = 0;
145
146 rmb();
147 if (!workers->atomic_start_pending)
148 return;
149 154
150 start = kzalloc(sizeof(*start), GFP_NOFS); 155 if (wq->thresh == NO_THRESHOLD)
151 if (!start)
152 return; 156 return;
153 157
154 start->work.func = start_new_worker_func; 158 atomic_dec(&wq->pending);
155 start->queue = workers; 159 spin_lock(&wq->thres_lock);
156 160 /*
157 spin_lock_irqsave(&workers->lock, flags); 161 * Use wq->count to limit the calling frequency of
158 if (!workers->atomic_start_pending) 162 * workqueue_set_max_active.
159 goto out; 163 */
160 164 wq->count++;
161 workers->atomic_start_pending = 0; 165 wq->count %= (wq->thresh / 4);
162 if (workers->num_workers + workers->num_workers_starting >= 166 if (!wq->count)
163 workers->max_workers) 167 goto out;
164 goto out; 168 new_max_active = wq->current_max;
165
166 workers->num_workers_starting += 1;
167 spin_unlock_irqrestore(&workers->lock, flags);
168 btrfs_queue_worker(workers->atomic_worker_start, &start->work);
169 return;
170 169
170 /*
171 * pending may be changed later, but it's OK since we really
172 * don't need it to be that accurate to calculate new_max_active.
173 */
174 pending = atomic_read(&wq->pending);
175 if (pending > wq->thresh)
176 new_max_active++;
177 if (pending < wq->thresh / 2)
178 new_max_active--;
179 new_max_active = clamp_val(new_max_active, 1, wq->max_active);
180 if (new_max_active != wq->current_max) {
181 need_change = 1;
182 wq->current_max = new_max_active;
183 }
171out: 184out:
172 kfree(start); 185 spin_unlock(&wq->thres_lock);
173 spin_unlock_irqrestore(&workers->lock, flags); 186
187 if (need_change) {
188 workqueue_set_max_active(wq->normal_wq, wq->current_max);
189 }
174} 190}
175 191
176static noinline void run_ordered_completions(struct btrfs_workers *workers, 192static void run_ordered_work(struct __btrfs_workqueue *wq)
177 struct btrfs_work *work)
178{ 193{
179 if (!workers->ordered) 194 struct list_head *list = &wq->ordered_list;
180 return; 195 struct btrfs_work *work;
181 196 spinlock_t *lock = &wq->list_lock;
182 set_bit(WORK_DONE_BIT, &work->flags); 197 unsigned long flags;
183
184 spin_lock(&workers->order_lock);
185 198
186 while (1) { 199 while (1) {
187 if (!list_empty(&workers->prio_order_list)) { 200 spin_lock_irqsave(lock, flags);
188 work = list_entry(workers->prio_order_list.next, 201 if (list_empty(list))
189 struct btrfs_work, order_list);
190 } else if (!list_empty(&workers->order_list)) {
191 work = list_entry(workers->order_list.next,
192 struct btrfs_work, order_list);
193 } else {
194 break; 202 break;
195 } 203 work = list_entry(list->next, struct btrfs_work,
204 ordered_list);
196 if (!test_bit(WORK_DONE_BIT, &work->flags)) 205 if (!test_bit(WORK_DONE_BIT, &work->flags))
197 break; 206 break;
198 207
199 /* we are going to call the ordered done function, but 208 /*
209 * we are going to call the ordered done function, but
200 * we leave the work item on the list as a barrier so 210 * we leave the work item on the list as a barrier so
201 * that later work items that are done don't have their 211 * that later work items that are done don't have their
202 * functions called before this one returns 212 * functions called before this one returns
203 */ 213 */
204 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) 214 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
205 break; 215 break;
206 216 trace_btrfs_ordered_sched(work);
207 spin_unlock(&workers->order_lock); 217 spin_unlock_irqrestore(lock, flags);
208
209 work->ordered_func(work); 218 work->ordered_func(work);
210 219
211 /* now take the lock again and drop our item from the list */ 220 /* now take the lock again and drop our item from the list */
212 spin_lock(&workers->order_lock); 221 spin_lock_irqsave(lock, flags);
213 list_del(&work->order_list); 222 list_del(&work->ordered_list);
214 spin_unlock(&workers->order_lock); 223 spin_unlock_irqrestore(lock, flags);
215 224
216 /* 225 /*
217 * we don't want to call the ordered free functions 226 * we don't want to call the ordered free functions
218 * with the lock held though 227 * with the lock held though
219 */ 228 */
220 work->ordered_free(work); 229 work->ordered_free(work);
221 spin_lock(&workers->order_lock); 230 trace_btrfs_all_work_done(work);
222 }
223
224 spin_unlock(&workers->order_lock);
225}
226
227static void put_worker(struct btrfs_worker_thread *worker)
228{
229 if (atomic_dec_and_test(&worker->refs))
230 kfree(worker);
231}
232
233static int try_worker_shutdown(struct btrfs_worker_thread *worker)
234{
235 int freeit = 0;
236
237 spin_lock_irq(&worker->lock);
238 spin_lock(&worker->workers->lock);
239 if (worker->workers->num_workers > 1 &&
240 worker->idle &&
241 !worker->working &&
242 !list_empty(&worker->worker_list) &&
243 list_empty(&worker->prio_pending) &&
244 list_empty(&worker->pending) &&
245 atomic_read(&worker->num_pending) == 0) {
246 freeit = 1;
247 list_del_init(&worker->worker_list);
248 worker->workers->num_workers--;
249 } 231 }
250 spin_unlock(&worker->workers->lock); 232 spin_unlock_irqrestore(lock, flags);
251 spin_unlock_irq(&worker->lock);
252
253 if (freeit)
254 put_worker(worker);
255 return freeit;
256} 233}
257 234
258static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, 235static void normal_work_helper(struct work_struct *arg)
259 struct list_head *prio_head,
260 struct list_head *head)
261{ 236{
262 struct btrfs_work *work = NULL;
263 struct list_head *cur = NULL;
264
265 if (!list_empty(prio_head))
266 cur = prio_head->next;
267
268 smp_mb();
269 if (!list_empty(&worker->prio_pending))
270 goto refill;
271
272 if (!list_empty(head))
273 cur = head->next;
274
275 if (cur)
276 goto out;
277
278refill:
279 spin_lock_irq(&worker->lock);
280 list_splice_tail_init(&worker->prio_pending, prio_head);
281 list_splice_tail_init(&worker->pending, head);
282
283 if (!list_empty(prio_head))
284 cur = prio_head->next;
285 else if (!list_empty(head))
286 cur = head->next;
287 spin_unlock_irq(&worker->lock);
288
289 if (!cur)
290 goto out_fail;
291
292out:
293 work = list_entry(cur, struct btrfs_work, list);
294
295out_fail:
296 return work;
297}
298
299/*
300 * main loop for servicing work items
301 */
302static int worker_loop(void *arg)
303{
304 struct btrfs_worker_thread *worker = arg;
305 struct list_head head;
306 struct list_head prio_head;
307 struct btrfs_work *work; 237 struct btrfs_work *work;
238 struct __btrfs_workqueue *wq;
239 int need_order = 0;
308 240
309 INIT_LIST_HEAD(&head); 241 work = container_of(arg, struct btrfs_work, normal_work);
310 INIT_LIST_HEAD(&prio_head); 242 /*
311 243 * We should not touch things inside work in the following cases:
312 do { 244 * 1) after work->func() if it has no ordered_free
313again: 245 * Since the struct is freed in work->func().
314 while (1) { 246 * 2) after setting WORK_DONE_BIT
315 247 * The work may be freed in other threads almost instantly.
316 248 * So we save the needed things here.
317 work = get_next_work(worker, &prio_head, &head); 249 */
318 if (!work) 250 if (work->ordered_func)
319 break; 251 need_order = 1;
320 252 wq = work->wq;
321 list_del(&work->list); 253
322 clear_bit(WORK_QUEUED_BIT, &work->flags); 254 trace_btrfs_work_sched(work);
323 255 thresh_exec_hook(wq);
324 work->worker = worker; 256 work->func(work);
325 257 if (need_order) {
326 work->func(work); 258 set_bit(WORK_DONE_BIT, &work->flags);
327 259 run_ordered_work(wq);
328 atomic_dec(&worker->num_pending);
329 /*
330 * unless this is an ordered work queue,
331 * 'work' was probably freed by func above.
332 */
333 run_ordered_completions(worker->workers, work);
334
335 check_pending_worker_creates(worker);
336 cond_resched();
337 }
338
339 spin_lock_irq(&worker->lock);
340 check_idle_worker(worker);
341
342 if (freezing(current)) {
343 worker->working = 0;
344 spin_unlock_irq(&worker->lock);
345 try_to_freeze();
346 } else {
347 spin_unlock_irq(&worker->lock);
348 if (!kthread_should_stop()) {
349 cpu_relax();
350 /*
351 * we've dropped the lock, did someone else
352 * jump_in?
353 */
354 smp_mb();
355 if (!list_empty(&worker->pending) ||
356 !list_empty(&worker->prio_pending))
357 continue;
358
359 /*
360 * this short schedule allows more work to
361 * come in without the queue functions
362 * needing to go through wake_up_process()
363 *
364 * worker->working is still 1, so nobody
365 * is going to try and wake us up
366 */
367 schedule_timeout(1);
368 smp_mb();
369 if (!list_empty(&worker->pending) ||
370 !list_empty(&worker->prio_pending))
371 continue;
372
373 if (kthread_should_stop())
374 break;
375
376 /* still no more work?, sleep for real */
377 spin_lock_irq(&worker->lock);
378 set_current_state(TASK_INTERRUPTIBLE);
379 if (!list_empty(&worker->pending) ||
380 !list_empty(&worker->prio_pending)) {
381 spin_unlock_irq(&worker->lock);
382 set_current_state(TASK_RUNNING);
383 goto again;
384 }
385
386 /*
387 * this makes sure we get a wakeup when someone
388 * adds something new to the queue
389 */
390 worker->working = 0;
391 spin_unlock_irq(&worker->lock);
392
393 if (!kthread_should_stop()) {
394 schedule_timeout(HZ * 120);
395 if (!worker->working &&
396 try_worker_shutdown(worker)) {
397 return 0;
398 }
399 }
400 }
401 __set_current_state(TASK_RUNNING);
402 }
403 } while (!kthread_should_stop());
404 return 0;
405}
406
407/*
408 * this will wait for all the worker threads to shutdown
409 */
410void btrfs_stop_workers(struct btrfs_workers *workers)
411{
412 struct list_head *cur;
413 struct btrfs_worker_thread *worker;
414 int can_stop;
415
416 spin_lock_irq(&workers->lock);
417 workers->stopping = 1;
418 list_splice_init(&workers->idle_list, &workers->worker_list);
419 while (!list_empty(&workers->worker_list)) {
420 cur = workers->worker_list.next;
421 worker = list_entry(cur, struct btrfs_worker_thread,
422 worker_list);
423
424 atomic_inc(&worker->refs);
425 workers->num_workers -= 1;
426 if (!list_empty(&worker->worker_list)) {
427 list_del_init(&worker->worker_list);
428 put_worker(worker);
429 can_stop = 1;
430 } else
431 can_stop = 0;
432 spin_unlock_irq(&workers->lock);
433 if (can_stop)
434 kthread_stop(worker->task);
435 spin_lock_irq(&workers->lock);
436 put_worker(worker);
437 } 260 }
438 spin_unlock_irq(&workers->lock); 261 if (!need_order)
262 trace_btrfs_all_work_done(work);
439} 263}
440 264
441/* 265void btrfs_init_work(struct btrfs_work *work,
442 * simple init on struct btrfs_workers 266 btrfs_func_t func,
443 */ 267 btrfs_func_t ordered_func,
444void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 268 btrfs_func_t ordered_free)
445 struct btrfs_workers *async_helper)
446{ 269{
447 workers->num_workers = 0; 270 work->func = func;
448 workers->num_workers_starting = 0; 271 work->ordered_func = ordered_func;
449 INIT_LIST_HEAD(&workers->worker_list); 272 work->ordered_free = ordered_free;
450 INIT_LIST_HEAD(&workers->idle_list); 273 INIT_WORK(&work->normal_work, normal_work_helper);
451 INIT_LIST_HEAD(&workers->order_list); 274 INIT_LIST_HEAD(&work->ordered_list);
452 INIT_LIST_HEAD(&workers->prio_order_list); 275 work->flags = 0;
453 spin_lock_init(&workers->lock);
454 spin_lock_init(&workers->order_lock);
455 workers->max_workers = max;
456 workers->idle_thresh = 32;
457 workers->name = name;
458 workers->ordered = 0;
459 workers->atomic_start_pending = 0;
460 workers->atomic_worker_start = async_helper;
461 workers->stopping = 0;
462} 276}
463 277
464/* 278static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
465 * starts new worker threads. This does not enforce the max worker 279 struct btrfs_work *work)
466 * count in case you need to temporarily go past it.
467 */
468static int __btrfs_start_workers(struct btrfs_workers *workers)
469{ 280{
470 struct btrfs_worker_thread *worker; 281 unsigned long flags;
471 int ret = 0;
472
473 worker = kzalloc(sizeof(*worker), GFP_NOFS);
474 if (!worker) {
475 ret = -ENOMEM;
476 goto fail;
477 }
478
479 INIT_LIST_HEAD(&worker->pending);
480 INIT_LIST_HEAD(&worker->prio_pending);
481 INIT_LIST_HEAD(&worker->worker_list);
482 spin_lock_init(&worker->lock);
483
484 atomic_set(&worker->num_pending, 0);
485 atomic_set(&worker->refs, 1);
486 worker->workers = workers;
487 worker->task = kthread_create(worker_loop, worker,
488 "btrfs-%s-%d", workers->name,
489 workers->num_workers + 1);
490 if (IS_ERR(worker->task)) {
491 ret = PTR_ERR(worker->task);
492 goto fail;
493 }
494 282
495 spin_lock_irq(&workers->lock); 283 work->wq = wq;
496 if (workers->stopping) { 284 thresh_queue_hook(wq);
497 spin_unlock_irq(&workers->lock); 285 if (work->ordered_func) {
498 ret = -EINVAL; 286 spin_lock_irqsave(&wq->list_lock, flags);
499 goto fail_kthread; 287 list_add_tail(&work->ordered_list, &wq->ordered_list);
288 spin_unlock_irqrestore(&wq->list_lock, flags);
500 } 289 }
501 list_add_tail(&worker->worker_list, &workers->idle_list); 290 queue_work(wq->normal_wq, &work->normal_work);
502 worker->idle = 1; 291 trace_btrfs_work_queued(work);
503 workers->num_workers++;
504 workers->num_workers_starting--;
505 WARN_ON(workers->num_workers_starting < 0);
506 spin_unlock_irq(&workers->lock);
507
508 wake_up_process(worker->task);
509 return 0;
510
511fail_kthread:
512 kthread_stop(worker->task);
513fail:
514 kfree(worker);
515 spin_lock_irq(&workers->lock);
516 workers->num_workers_starting--;
517 spin_unlock_irq(&workers->lock);
518 return ret;
519} 292}
520 293
521int btrfs_start_workers(struct btrfs_workers *workers) 294void btrfs_queue_work(struct btrfs_workqueue *wq,
522{ 295 struct btrfs_work *work)
523 spin_lock_irq(&workers->lock);
524 workers->num_workers_starting++;
525 spin_unlock_irq(&workers->lock);
526 return __btrfs_start_workers(workers);
527}
528
529/*
530 * run through the list and find a worker thread that doesn't have a lot
531 * to do right now. This can return null if we aren't yet at the thread
532 * count limit and all of the threads are busy.
533 */
534static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
535{ 296{
536 struct btrfs_worker_thread *worker; 297 struct __btrfs_workqueue *dest_wq;
537 struct list_head *next;
538 int enforce_min;
539 298
540 enforce_min = (workers->num_workers + workers->num_workers_starting) < 299 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
541 workers->max_workers; 300 dest_wq = wq->high;
542 301 else
543 /* 302 dest_wq = wq->normal;
544 * if we find an idle thread, don't move it to the end of the 303 __btrfs_queue_work(dest_wq, work);
545 * idle list. This improves the chance that the next submission
546 * will reuse the same thread, and maybe catch it while it is still
547 * working
548 */
549 if (!list_empty(&workers->idle_list)) {
550 next = workers->idle_list.next;
551 worker = list_entry(next, struct btrfs_worker_thread,
552 worker_list);
553 return worker;
554 }
555 if (enforce_min || list_empty(&workers->worker_list))
556 return NULL;
557
558 /*
559 * if we pick a busy task, move the task to the end of the list.
560 * hopefully this will keep things somewhat evenly balanced.
561 * Do the move in batches based on the sequence number. This groups
562 * requests submitted at roughly the same time onto the same worker.
563 */
564 next = workers->worker_list.next;
565 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
566 worker->sequence++;
567
568 if (worker->sequence % workers->idle_thresh == 0)
569 list_move_tail(next, &workers->worker_list);
570 return worker;
571} 304}
572 305
573/* 306static inline void
574 * selects a worker thread to take the next job. This will either find 307__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
575 * an idle worker, start a new worker up to the max count, or just return
576 * one of the existing busy workers.
577 */
578static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
579{ 308{
580 struct btrfs_worker_thread *worker; 309 destroy_workqueue(wq->normal_wq);
581 unsigned long flags; 310 trace_btrfs_workqueue_destroy(wq);
582 struct list_head *fallback; 311 kfree(wq);
583 int ret;
584
585 spin_lock_irqsave(&workers->lock, flags);
586again:
587 worker = next_worker(workers);
588
589 if (!worker) {
590 if (workers->num_workers + workers->num_workers_starting >=
591 workers->max_workers) {
592 goto fallback;
593 } else if (workers->atomic_worker_start) {
594 workers->atomic_start_pending = 1;
595 goto fallback;
596 } else {
597 workers->num_workers_starting++;
598 spin_unlock_irqrestore(&workers->lock, flags);
599 /* we're below the limit, start another worker */
600 ret = __btrfs_start_workers(workers);
601 spin_lock_irqsave(&workers->lock, flags);
602 if (ret)
603 goto fallback;
604 goto again;
605 }
606 }
607 goto found;
608
609fallback:
610 fallback = NULL;
611 /*
612 * we have failed to find any workers, just
613 * return the first one we can find.
614 */
615 if (!list_empty(&workers->worker_list))
616 fallback = workers->worker_list.next;
617 if (!list_empty(&workers->idle_list))
618 fallback = workers->idle_list.next;
619 BUG_ON(!fallback);
620 worker = list_entry(fallback,
621 struct btrfs_worker_thread, worker_list);
622found:
623 /*
624 * this makes sure the worker doesn't exit before it is placed
625 * onto a busy/idle list
626 */
627 atomic_inc(&worker->num_pending);
628 spin_unlock_irqrestore(&workers->lock, flags);
629 return worker;
630} 312}
631 313
632/* 314void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
633 * btrfs_requeue_work just puts the work item back on the tail of the list
634 * it was taken from. It is intended for use with long running work functions
635 * that make some progress and want to give the cpu up for others.
636 */
637void btrfs_requeue_work(struct btrfs_work *work)
638{ 315{
639 struct btrfs_worker_thread *worker = work->worker; 316 if (!wq)
640 unsigned long flags;
641 int wake = 0;
642
643 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
644 return; 317 return;
645 318 if (wq->high)
646 spin_lock_irqsave(&worker->lock, flags); 319 __btrfs_destroy_workqueue(wq->high);
647 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) 320 __btrfs_destroy_workqueue(wq->normal);
648 list_add_tail(&work->list, &worker->prio_pending); 321 kfree(wq);
649 else
650 list_add_tail(&work->list, &worker->pending);
651 atomic_inc(&worker->num_pending);
652
653 /* by definition we're busy, take ourselves off the idle
654 * list
655 */
656 if (worker->idle) {
657 spin_lock(&worker->workers->lock);
658 worker->idle = 0;
659 list_move_tail(&worker->worker_list,
660 &worker->workers->worker_list);
661 spin_unlock(&worker->workers->lock);
662 }
663 if (!worker->working) {
664 wake = 1;
665 worker->working = 1;
666 }
667
668 if (wake)
669 wake_up_process(worker->task);
670 spin_unlock_irqrestore(&worker->lock, flags);
671} 322}
672 323
673void btrfs_set_work_high_prio(struct btrfs_work *work) 324void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
674{ 325{
675 set_bit(WORK_HIGH_PRIO_BIT, &work->flags); 326 if (!wq)
327 return;
328 wq->normal->max_active = max;
329 if (wq->high)
330 wq->high->max_active = max;
676} 331}
677 332
678/* 333void btrfs_set_work_high_priority(struct btrfs_work *work)
679 * places a struct btrfs_work into the pending queue of one of the kthreads
680 */
681void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
682{ 334{
683 struct btrfs_worker_thread *worker; 335 set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
684 unsigned long flags;
685 int wake = 0;
686
687 /* don't requeue something already on a list */
688 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
689 return;
690
691 worker = find_worker(workers);
692 if (workers->ordered) {
693 /*
694 * you're not allowed to do ordered queues from an
695 * interrupt handler
696 */
697 spin_lock(&workers->order_lock);
698 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
699 list_add_tail(&work->order_list,
700 &workers->prio_order_list);
701 } else {
702 list_add_tail(&work->order_list, &workers->order_list);
703 }
704 spin_unlock(&workers->order_lock);
705 } else {
706 INIT_LIST_HEAD(&work->order_list);
707 }
708
709 spin_lock_irqsave(&worker->lock, flags);
710
711 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
712 list_add_tail(&work->list, &worker->prio_pending);
713 else
714 list_add_tail(&work->list, &worker->pending);
715 check_busy_worker(worker);
716
717 /*
718 * avoid calling into wake_up_process if this thread has already
719 * been kicked
720 */
721 if (!worker->working)
722 wake = 1;
723 worker->working = 1;
724
725 if (wake)
726 wake_up_process(worker->task);
727 spin_unlock_irqrestore(&worker->lock, flags);
728} 336}
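
The rewrite above retires btrfs's hand-rolled worker pools in favor of thin wrappers over the core kernel workqueue: each __btrfs_workqueue pairs an alloc_workqueue() queue with an ordered_list whose ordered_func/ordered_free callbacks complete strictly in submission order, and a btrfs_workqueue optionally adds a second high-priority queue selected via WORK_HIGH_PRIO_BIT. A hypothetical caller of the new API; the btrfs_* calls are the ones defined above, while the item type and callbacks are invented for illustration:

    #include <linux/slab.h>
    #include "async-thread.h"

    struct my_item {
            struct btrfs_work work;
            int payload;
    };

    static void my_func(struct btrfs_work *w)       /* may run concurrently */
    {
            struct my_item *it = container_of(w, struct my_item, work);

            it->payload *= 2;                       /* stand-in for real work */
    }

    static void my_ordered(struct btrfs_work *w)    /* runs in queue order */
    {
            /* publish results; all earlier items' my_ordered already ran */
    }

    static void my_free(struct btrfs_work *w)
    {
            kfree(container_of(w, struct my_item, work));
    }

    static int submit(struct btrfs_workqueue *wq, int payload)
    {
            struct my_item *it = kzalloc(sizeof(*it), GFP_NOFS);

            if (!it)
                    return -ENOMEM;
            it->payload = payload;
            btrfs_init_work(&it->work, my_func, my_ordered, my_free);
            btrfs_queue_work(wq, &it->work);
            return 0;
    }

The queue itself would come from btrfs_alloc_workqueue("example", 0, max_active, 0) and be torn down with btrfs_destroy_workqueue(); note that ordered_free, not the caller, owns the item once it has been queued.
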
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683ed..9c6b66d15fb0 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * Copyright (C) 2014 Fujitsu. All rights reserved.
3 * 4 *
4 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 6 * modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
19#ifndef __BTRFS_ASYNC_THREAD_ 20#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_ 21#define __BTRFS_ASYNC_THREAD_
21 22
22struct btrfs_worker_thread; 23struct btrfs_workqueue;
24/* Internal use only */
25struct __btrfs_workqueue;
26struct btrfs_work;
27typedef void (*btrfs_func_t)(struct btrfs_work *arg);
23 28
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work { 29struct btrfs_work {
39 /* 30 btrfs_func_t func;
40 * func should be set to the function you want called 31 btrfs_func_t ordered_func;
41 * your work struct is passed as the only arg 32 btrfs_func_t ordered_free;
42 * 33
43 * ordered_func must be set for work sent to an ordered work queue, 34 /* Don't touch things below */
44 * and it is called to complete a given work item in the same 35 struct work_struct normal_work;
45 * order they were sent to the queue. 36 struct list_head ordered_list;
46 */ 37 struct __btrfs_workqueue *wq;
47 void (*func)(struct btrfs_work *work);
48 void (*ordered_func)(struct btrfs_work *work);
49 void (*ordered_free)(struct btrfs_work *work);
50
51 /*
52 * flags should be set to zero. It is used to make sure the
53 * struct is only inserted once into the list.
54 */
55 unsigned long flags; 38 unsigned long flags;
56
57 /* don't touch these */
58 struct btrfs_worker_thread *worker;
59 struct list_head list;
60 struct list_head order_list;
61};
62
63struct btrfs_workers {
64 /* current number of running workers */
65 int num_workers;
66
67 int num_workers_starting;
68
69 /* max number of workers allowed. changed by btrfs_start_workers */
70 int max_workers;
71
72 /* once a worker has this many requests or fewer, it is idle */
73 int idle_thresh;
74
75 /* force completions in the order they were queued */
76 int ordered;
77
78 /* more workers required, but in an interrupt handler */
79 int atomic_start_pending;
80
81 /*
82 * are we allowed to sleep while starting workers or are we required
83 * to start them at a later time? If we can't sleep, this indicates
84 * which queue we need to use to schedule thread creation.
85 */
86 struct btrfs_workers *atomic_worker_start;
87
88 /* list with all the work threads. The workers on the idle thread
89 * may be actively servicing jobs, but they haven't yet hit the
90 * idle thresh limit above.
91 */
92 struct list_head worker_list;
93 struct list_head idle_list;
94
95 /*
96 * when operating in ordered mode, this maintains the list
97 * of work items waiting for completion
98 */
99 struct list_head order_list;
100 struct list_head prio_order_list;
101
102 /* lock for finding the next worker thread to queue on */
103 spinlock_t lock;
104
105 /* lock for the ordered lists */
106 spinlock_t order_lock;
107
108 /* extra name for this worker, used for current->name */
109 char *name;
110
111 int stopping;
112}; 39};
113 40
114void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); 41struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
115int btrfs_start_workers(struct btrfs_workers *workers); 42 int flags,
116void btrfs_stop_workers(struct btrfs_workers *workers); 43 int max_active,
117void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 44 int thresh);
118 struct btrfs_workers *async_starter); 45void btrfs_init_work(struct btrfs_work *work,
119void btrfs_requeue_work(struct btrfs_work *work); 46 btrfs_func_t func,
120void btrfs_set_work_high_prio(struct btrfs_work *work); 47 btrfs_func_t ordered_func,
48 btrfs_func_t ordered_free);
49void btrfs_queue_work(struct btrfs_workqueue *wq,
50 struct btrfs_work *work);
51void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
52void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
53void btrfs_set_work_high_priority(struct btrfs_work *work);
121#endif 54#endif
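
The thresh fields declared on the workqueue drive a simple feedback loop: thresh_queue_hook() counts each submission, and every thresh/4 executions thresh_exec_hook() steps current_max up when pending exceeds thresh, down when it falls below thresh/2, clamped to [1, max_active]. With the default thresh of 32 that means re-evaluating every 8 completions, growing while more than 32 items wait and shrinking once fewer than 16 remain. The rule restated as a pure function:

    /* Mirrors the adjustment in thresh_exec_hook(); the final line is
     * what clamp_val(cur, 1, cap) expands to. */
    static int next_max_active(int cur, int pending, int thresh, int cap)
    {
            if (pending > thresh)           /* backlog growing: one more worker */
                    cur++;
            if (pending < thresh / 2)       /* backlog draining: one fewer */
                    cur--;
            return cur < 1 ? 1 : (cur > cap ? cap : cur);
    }
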
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d4..10db21fa0926 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
220 220
221static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, 221static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
222 struct ulist *parents, struct __prelim_ref *ref, 222 struct ulist *parents, struct __prelim_ref *ref,
223 int level, u64 time_seq, const u64 *extent_item_pos) 223 int level, u64 time_seq, const u64 *extent_item_pos,
224 u64 total_refs)
224{ 225{
225 int ret = 0; 226 int ret = 0;
226 int slot; 227 int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
249 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 250 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
250 ret = btrfs_next_old_leaf(root, path, time_seq); 251 ret = btrfs_next_old_leaf(root, path, time_seq);
251 252
252 while (!ret && count < ref->count) { 253 while (!ret && count < total_refs) {
253 eb = path->nodes[0]; 254 eb = path->nodes[0];
254 slot = path->slots[0]; 255 slot = path->slots[0];
255 256
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
306 struct btrfs_path *path, u64 time_seq, 307 struct btrfs_path *path, u64 time_seq,
307 struct __prelim_ref *ref, 308 struct __prelim_ref *ref,
308 struct ulist *parents, 309 struct ulist *parents,
309 const u64 *extent_item_pos) 310 const u64 *extent_item_pos, u64 total_refs)
310{ 311{
311 struct btrfs_root *root; 312 struct btrfs_root *root;
312 struct btrfs_key root_key; 313 struct btrfs_key root_key;
@@ -329,7 +330,10 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
329 goto out; 330 goto out;
330 } 331 }
331 332
332 root_level = btrfs_old_root_level(root, time_seq); 333 if (path->search_commit_root)
334 root_level = btrfs_header_level(root->commit_root);
335 else
336 root_level = btrfs_old_root_level(root, time_seq);
333 337
334 if (root_level + 1 == level) { 338 if (root_level + 1 == level) {
335 srcu_read_unlock(&fs_info->subvol_srcu, index); 339 srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -361,7 +365,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
361 } 365 }
362 366
363 ret = add_all_parents(root, path, parents, ref, level, time_seq, 367 ret = add_all_parents(root, path, parents, ref, level, time_seq,
364 extent_item_pos); 368 extent_item_pos, total_refs);
365out: 369out:
366 path->lowest_level = 0; 370 path->lowest_level = 0;
367 btrfs_release_path(path); 371 btrfs_release_path(path);
@@ -374,7 +378,7 @@ out:
374static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, 378static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
375 struct btrfs_path *path, u64 time_seq, 379 struct btrfs_path *path, u64 time_seq,
376 struct list_head *head, 380 struct list_head *head,
377 const u64 *extent_item_pos) 381 const u64 *extent_item_pos, u64 total_refs)
378{ 382{
379 int err; 383 int err;
380 int ret = 0; 384 int ret = 0;
@@ -400,7 +404,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
400 if (ref->count == 0) 404 if (ref->count == 0)
401 continue; 405 continue;
402 err = __resolve_indirect_ref(fs_info, path, time_seq, ref, 406 err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
403 parents, extent_item_pos); 407 parents, extent_item_pos,
408 total_refs);
404 /* 409 /*
405 * we can only tolerate ENOENT,otherwise,we should catch error 410 * we can only tolerate ENOENT,otherwise,we should catch error
406 * and return directly. 411 * and return directly.
@@ -557,7 +562,7 @@ static void __merge_refs(struct list_head *head, int mode)
557 * smaller or equal that seq to the list 562 * smaller or equal that seq to the list
558 */ 563 */
559static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, 564static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
560 struct list_head *prefs) 565 struct list_head *prefs, u64 *total_refs)
561{ 566{
562 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 567 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
563 struct rb_node *n = &head->node.rb_node; 568 struct rb_node *n = &head->node.rb_node;
@@ -593,6 +598,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
593 default: 598 default:
594 BUG_ON(1); 599 BUG_ON(1);
595 } 600 }
601 *total_refs += (node->ref_mod * sgn);
596 switch (node->type) { 602 switch (node->type) {
597 case BTRFS_TREE_BLOCK_REF_KEY: { 603 case BTRFS_TREE_BLOCK_REF_KEY: {
598 struct btrfs_delayed_tree_ref *ref; 604 struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +659,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
653 */ 659 */
654static int __add_inline_refs(struct btrfs_fs_info *fs_info, 660static int __add_inline_refs(struct btrfs_fs_info *fs_info,
655 struct btrfs_path *path, u64 bytenr, 661 struct btrfs_path *path, u64 bytenr,
656 int *info_level, struct list_head *prefs) 662 int *info_level, struct list_head *prefs,
663 u64 *total_refs)
657{ 664{
658 int ret = 0; 665 int ret = 0;
659 int slot; 666 int slot;
@@ -677,6 +684,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
677 684
678 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 685 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
679 flags = btrfs_extent_flags(leaf, ei); 686 flags = btrfs_extent_flags(leaf, ei);
687 *total_refs += btrfs_extent_refs(leaf, ei);
680 btrfs_item_key_to_cpu(leaf, &found_key, slot); 688 btrfs_item_key_to_cpu(leaf, &found_key, slot);
681 689
682 ptr = (unsigned long)(ei + 1); 690 ptr = (unsigned long)(ei + 1);
@@ -859,6 +867,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
859 struct list_head prefs; 867 struct list_head prefs;
860 struct __prelim_ref *ref; 868 struct __prelim_ref *ref;
861 struct extent_inode_elem *eie = NULL; 869 struct extent_inode_elem *eie = NULL;
870 u64 total_refs = 0;
862 871
863 INIT_LIST_HEAD(&prefs); 872 INIT_LIST_HEAD(&prefs);
864 INIT_LIST_HEAD(&prefs_delayed); 873 INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +882,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
873 path = btrfs_alloc_path(); 882 path = btrfs_alloc_path();
874 if (!path) 883 if (!path)
875 return -ENOMEM; 884 return -ENOMEM;
876 if (!trans) 885 if (!trans) {
877 path->search_commit_root = 1; 886 path->search_commit_root = 1;
887 path->skip_locking = 1;
888 }
878 889
879 /* 890 /*
880 * grab both a lock on the path and a lock on the delayed ref head. 891 * grab both a lock on the path and a lock on the delayed ref head.
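
When find_parent_nodes() runs without a transaction handle it walks the commit root, and this hunk now also sets skip_locking: the last committed tree cannot change underneath the walker, so extent-buffer locks are pure overhead, and the earlier hunk in __resolve_indirect_ref() reads the level straight from root->commit_root for the same reason. The setup, captured as a hypothetical helper:

    static struct btrfs_path *alloc_backref_path(struct btrfs_trans_handle *trans)
    {
            struct btrfs_path *path = btrfs_alloc_path();

            if (!path)
                    return NULL;
            if (!trans) {
                    path->search_commit_root = 1;   /* walk the committed tree */
                    path->skip_locking = 1;         /* it is immutable; skip locks */
            }
            return path;
    }
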
@@ -915,7 +926,7 @@ again:
915 } 926 }
916 spin_unlock(&delayed_refs->lock); 927 spin_unlock(&delayed_refs->lock);
917 ret = __add_delayed_refs(head, time_seq, 928 ret = __add_delayed_refs(head, time_seq,
918 &prefs_delayed); 929 &prefs_delayed, &total_refs);
919 mutex_unlock(&head->mutex); 930 mutex_unlock(&head->mutex);
920 if (ret) 931 if (ret)
921 goto out; 932 goto out;
@@ -936,7 +947,8 @@ again:
936 (key.type == BTRFS_EXTENT_ITEM_KEY || 947 (key.type == BTRFS_EXTENT_ITEM_KEY ||
937 key.type == BTRFS_METADATA_ITEM_KEY)) { 948 key.type == BTRFS_METADATA_ITEM_KEY)) {
938 ret = __add_inline_refs(fs_info, path, bytenr, 949 ret = __add_inline_refs(fs_info, path, bytenr,
939 &info_level, &prefs); 950 &info_level, &prefs,
951 &total_refs);
940 if (ret) 952 if (ret)
941 goto out; 953 goto out;
942 ret = __add_keyed_refs(fs_info, path, bytenr, 954 ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +968,7 @@ again:
956 __merge_refs(&prefs, 1); 968 __merge_refs(&prefs, 1);
957 969
958 ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, 970 ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
959 extent_item_pos); 971 extent_item_pos, total_refs);
960 if (ret) 972 if (ret)
961 goto out; 973 goto out;
962 974
@@ -965,7 +977,7 @@ again:
965 while (!list_empty(&prefs)) { 977 while (!list_empty(&prefs)) {
966 ref = list_first_entry(&prefs, struct __prelim_ref, list); 978 ref = list_first_entry(&prefs, struct __prelim_ref, list);
967 WARN_ON(ref->count < 0); 979 WARN_ON(ref->count < 0);
968 if (ref->count && ref->root_id && ref->parent == 0) { 980 if (roots && ref->count && ref->root_id && ref->parent == 0) {
969 /* no parent == root of tree */ 981 /* no parent == root of tree */
970 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); 982 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
971 if (ret < 0) 983 if (ret < 0)
@@ -1061,22 +1073,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
1061 u64 time_seq, struct ulist **leafs, 1073 u64 time_seq, struct ulist **leafs,
1062 const u64 *extent_item_pos) 1074 const u64 *extent_item_pos)
1063{ 1075{
1064 struct ulist *tmp;
1065 int ret; 1076 int ret;
1066 1077
1067 tmp = ulist_alloc(GFP_NOFS);
1068 if (!tmp)
1069 return -ENOMEM;
1070 *leafs = ulist_alloc(GFP_NOFS); 1078 *leafs = ulist_alloc(GFP_NOFS);
1071 if (!*leafs) { 1079 if (!*leafs)
1072 ulist_free(tmp);
1073 return -ENOMEM; 1080 return -ENOMEM;
1074 }
1075 1081
1076 ret = find_parent_nodes(trans, fs_info, bytenr, 1082 ret = find_parent_nodes(trans, fs_info, bytenr,
1077 time_seq, *leafs, tmp, extent_item_pos); 1083 time_seq, *leafs, NULL, extent_item_pos);
1078 ulist_free(tmp);
1079
1080 if (ret < 0 && ret != -ENOENT) { 1084 if (ret < 0 && ret != -ENOENT) {
1081 free_leaf_list(*leafs); 1085 free_leaf_list(*leafs);
1082 return ret; 1086 return ret;
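
The hunk above drops the throwaway ulist: find_parent_nodes() now treats a NULL roots pointer as "the caller does not want root IDs", so btrfs_find_all_leafs() no longer allocates and frees a scratch list on every call. A minimal userspace sketch of that optional-output-parameter pattern (names are illustrative, not the btrfs API):

    #include <stdio.h>

    struct list { int vals[8]; int n; };

    /* Always fill 'leafs'; fill 'roots' only if the caller asked for it. */
    static int find_refs(struct list *leafs, struct list *roots)
    {
            leafs->vals[leafs->n++] = 42;
            if (roots)                      /* optional output: skip when NULL */
                    roots->vals[roots->n++] = 5;
            return 0;
    }

    int main(void)
    {
            struct list leafs = {0};

            find_refs(&leafs, NULL);        /* no scratch allocation needed */
            printf("%d leafs\n", leafs.n);
            return 0;
    }
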
@@ -1098,9 +1102,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
1098 * 1102 *
1099 * returns 0 on success, < 0 on error. 1103 * returns 0 on success, < 0 on error.
1100 */ 1104 */
1101int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 1105static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1102 struct btrfs_fs_info *fs_info, u64 bytenr, 1106 struct btrfs_fs_info *fs_info, u64 bytenr,
1103 u64 time_seq, struct ulist **roots) 1107 u64 time_seq, struct ulist **roots)
1104{ 1108{
1105 struct ulist *tmp; 1109 struct ulist *tmp;
1106 struct ulist_node *node = NULL; 1110 struct ulist_node *node = NULL;
@@ -1136,6 +1140,20 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1136 return 0; 1140 return 0;
1137} 1141}
1138 1142
1143int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1144 struct btrfs_fs_info *fs_info, u64 bytenr,
1145 u64 time_seq, struct ulist **roots)
1146{
1147 int ret;
1148
1149 if (!trans)
1150 down_read(&fs_info->commit_root_sem);
1151 ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
1152 if (!trans)
1153 up_read(&fs_info->commit_root_sem);
1154 return ret;
1155}
1156
1139/* 1157/*
1140 * this makes the path point to (inum INODE_ITEM ioff) 1158 * this makes the path point to (inum INODE_ITEM ioff)
1141 */ 1159 */
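
With the hunk above, btrfs_find_all_roots() becomes a thin wrapper that takes fs_info->commit_root_sem for reading whenever no transaction pins the commit roots, while the unlocked __btrfs_find_all_roots() stays available for callers that already hold the semaphore. A sketch of the locked/unlocked pairing, assuming a pthread rwlock stands in for the kernel rw_semaphore:

    #include <pthread.h>

    static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

    static int __find_all_roots(void *trans)
    {
            (void)trans;    /* walk commit roots; caller keeps them stable */
            return 0;
    }

    static int find_all_roots(void *trans)
    {
            int ret;

            if (!trans)             /* nothing pins the roots: lock them */
                    pthread_rwlock_rdlock(&commit_root_sem);
            ret = __find_all_roots(trans);
            if (!trans)
                    pthread_rwlock_unlock(&commit_root_sem);
            return ret;
    }
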
@@ -1333,38 +1351,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1333 if (ret < 0) 1351 if (ret < 0)
1334 return ret; 1352 return ret;
1335 1353
1336 while (1) { 1354 ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
1337 u32 nritems; 1355 if (ret) {
1338 if (path->slots[0] == 0) { 1356 if (ret > 0)
1339 btrfs_set_path_blocking(path); 1357 ret = -ENOENT;
1340 ret = btrfs_prev_leaf(fs_info->extent_root, path); 1358 return ret;
1341 if (ret != 0) {
1342 if (ret > 0) {
1343 pr_debug("logical %llu is not within "
1344 "any extent\n", logical);
1345 ret = -ENOENT;
1346 }
1347 return ret;
1348 }
1349 } else {
1350 path->slots[0]--;
1351 }
1352 nritems = btrfs_header_nritems(path->nodes[0]);
1353 if (nritems == 0) {
1354 pr_debug("logical %llu is not within any extent\n",
1355 logical);
1356 return -ENOENT;
1357 }
1358 if (path->slots[0] == nritems)
1359 path->slots[0]--;
1360
1361 btrfs_item_key_to_cpu(path->nodes[0], found_key,
1362 path->slots[0]);
1363 if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
1364 found_key->type == BTRFS_METADATA_ITEM_KEY)
1365 break;
1366 } 1359 }
1367 1360 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
1368 if (found_key->type == BTRFS_METADATA_ITEM_KEY) 1361 if (found_key->type == BTRFS_METADATA_ITEM_KEY)
1369 size = fs_info->extent_root->leafsize; 1362 size = fs_info->extent_root->leafsize;
1370 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) 1363 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
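
The open-coded backwards leaf walk in extent_from_logical() is replaced by btrfs_previous_extent_item(), which appears to follow the usual btree-helper convention: 0 when it lands on an extent or metadata item, greater than 0 when none exists before the slot, negative on error. The caller then only has to translate "not found" into -ENOENT. A hedged toy sketch of that return-code contract (the data model is invented for illustration):

    /* contract: 0 = found, >0 = not found, <0 = error */
    static int previous_extent_item(int *slot)
    {
            while (*slot > 0) {
                    (*slot)--;
                    if (*slot % 2 == 0)     /* pretend even slots hold extent items */
                            return 0;
            }
            return 1;                       /* walked off the front: not found */
    }

    static int extent_from_logical(int slot)
    {
            int ret = previous_extent_item(&slot);

            if (ret)
                    return ret > 0 ? -2 : ret;  /* map "not found" to -ENOENT (-2) */
            return 0;
    }
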
@@ -1540,6 +1533,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1540 if (IS_ERR(trans)) 1533 if (IS_ERR(trans))
1541 return PTR_ERR(trans); 1534 return PTR_ERR(trans);
1542 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1535 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1536 } else {
1537 down_read(&fs_info->commit_root_sem);
1543 } 1538 }
1544 1539
1545 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, 1540 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
@@ -1550,8 +1545,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1550 1545
1551 ULIST_ITER_INIT(&ref_uiter); 1546 ULIST_ITER_INIT(&ref_uiter);
1552 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { 1547 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
1553 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, 1548 ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
1554 tree_mod_seq_elem.seq, &roots); 1549 tree_mod_seq_elem.seq, &roots);
1555 if (ret) 1550 if (ret)
1556 break; 1551 break;
1557 ULIST_ITER_INIT(&root_uiter); 1552 ULIST_ITER_INIT(&root_uiter);
@@ -1573,6 +1568,8 @@ out:
1573 if (!search_commit_root) { 1568 if (!search_commit_root) {
1574 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1569 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1575 btrfs_end_transaction(trans, fs_info->extent_root); 1570 btrfs_end_transaction(trans, fs_info->extent_root);
1571 } else {
1572 up_read(&fs_info->commit_root_sem);
1576 } 1573 }
1577 1574
1578 return ret; 1575 return ret;
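
Note the pairing in iterate_extent_inodes(): when it runs without a transaction it now takes commit_root_sem itself, and therefore calls the unlocked __btrfs_find_all_roots() inside the loop, since calling the locking wrapper with the semaphore already held would acquire it recursively. The matching up_read() sits on the common exit path so every early bail-out still releases it. A sketch of that balanced acquire/release shape (stub names are illustrative):

    #include <pthread.h>

    static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

    static int step_one(void) { return 0; }
    static int step_two(void) { return 0; }

    static int iterate(int have_trans)
    {
            int ret;

            if (!have_trans)        /* no transaction: pin the commit roots */
                    pthread_rwlock_rdlock(&commit_root_sem);

            ret = step_one();
            if (ret)
                    goto out;       /* every early exit funnels through out: */
            ret = step_two();
    out:
            if (!have_trans)
                    pthread_rwlock_unlock(&commit_root_sem);
            return ret;
    }
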
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689e..c9a24444ec9a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
109 u64 last_trans; 109 u64 last_trans;
110 110
111 /* 111 /*
112 * log transid when this inode was last modified 112 * transid that last logged this inode
113 */ 113 */
114 u64 last_sub_trans; 114 u64 logged_trans;
115 115
116 /* 116 /*
117 * transid that last logged this inode 117 * log transid when this inode was last modified
118 */ 118 */
119 u64 logged_trans; 119 int last_sub_trans;
120
121 /* a local copy of root's last_log_commit */
122 int last_log_commit;
120 123
121 /* total number of bytes pending delalloc, used by stat to calc the 124 /* total number of bytes pending delalloc, used by stat to calc the
122 * real block usage of the file 125 * real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
155 /* flags field from the on disk inode */ 158 /* flags field from the on disk inode */
156 u32 flags; 159 u32 flags;
157 160
158 /* a local copy of root's last_log_commit */
159 unsigned long last_log_commit;
160
161 /* 161 /*
162 * Counters to keep track of the number of extent item's we may use due 162 * Counters to keep track of the number of extent item's we may use due
163 * to delalloc and such. outstanding_extents is the number of extent 163 * to delalloc and such. outstanding_extents is the number of extent
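
In the btrfs_inode hunk above, logged_trans keeps its u64 while last_sub_trans and an inode-local copy of the root's last_log_commit become plain ints and move next to each other: they only ever hold per-root log transids (which become int in the btrfs_root hunk further down) and are compared against each other on the fsync path. A hedged sketch of the check these two fields feed (field names mirror the struct; the helper name is illustrative):

    struct inode_log_state {
            int last_sub_trans;     /* log transid of the last modification */
            int last_log_commit;    /* local copy of the root's last_log_commit */
    };

    /* fsync can skip the log-tree work if the inode is already logged */
    static int inode_in_log(const struct inode_log_state *s)
    {
            return s->last_sub_trans <= s->last_log_commit;
    }
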
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b01fb6c527e3..d43c544d3b68 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
472 rcu_read_lock(); 472 rcu_read_lock();
473 page = radix_tree_lookup(&mapping->page_tree, pg_index); 473 page = radix_tree_lookup(&mapping->page_tree, pg_index);
474 rcu_read_unlock(); 474 rcu_read_unlock();
475 if (page) { 475 if (page && !radix_tree_exceptional_entry(page)) {
476 misses++; 476 misses++;
477 if (misses > 4) 477 if (misses > 4)
478 break; 478 break;
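
The compression fix above guards against shadow entries: radix_tree_lookup() on the page cache can return an "exceptional" entry that only records eviction history and is not a struct page, so counting it as a cached page would wrongly cut readahead short. Exceptional entries are distinguished by a tag in the low bits of the pointer. A minimal sketch of that tagged-pointer test (the constant mirrors the kernel's but is illustrative here):

    #include <stdint.h>
    #include <stdio.h>

    #define ENTRY_EXCEPTIONAL 0x2UL     /* low tag bit marks a non-page entry */

    static int is_exceptional(const void *entry)
    {
            return ((uintptr_t)entry & ENTRY_EXCEPTIONAL) != 0;
    }

    int main(void)
    {
            void *page = (void *)0x1000;    /* aligned pointer: tag bits clear */
            void *shadow = (void *)(0x2000 | ENTRY_EXCEPTIONAL);

            printf("%d %d\n", is_exceptional(page), is_exceptional(shadow));
            return 0;
    }
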
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..1bcfcdb23cf4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2769,9 +2769,13 @@ again:
2769 * the commit roots are read only 2769 * the commit roots are read only
2770 * so we always do read locks 2770 * so we always do read locks
2771 */ 2771 */
2772 if (p->need_commit_sem)
2773 down_read(&root->fs_info->commit_root_sem);
2772 b = root->commit_root; 2774 b = root->commit_root;
2773 extent_buffer_get(b); 2775 extent_buffer_get(b);
2774 level = btrfs_header_level(b); 2776 level = btrfs_header_level(b);
2777 if (p->need_commit_sem)
2778 up_read(&root->fs_info->commit_root_sem);
2775 if (!p->skip_locking) 2779 if (!p->skip_locking)
2776 btrfs_tree_read_lock(b); 2780 btrfs_tree_read_lock(b);
2777 } else { 2781 } else {
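
The new path->need_commit_sem bit makes btrfs_search_slot() hold commit_root_sem only long enough to read the commit-root pointer and take a reference on the buffer; the extra reference then keeps the buffer alive after the unlock even if a transaction commit swaps the root. The shape is "lock, grab a reference, unlock" rather than holding the lock across the whole search. A sketch under those assumptions:

    #include <pthread.h>

    struct buffer { int refs; };

    static struct buffer commit_root_buf = { .refs = 1 };
    static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

    static struct buffer *grab_commit_root(void)
    {
            struct buffer *b;

            pthread_rwlock_rdlock(&commit_root_sem);
            b = &commit_root_buf;   /* read the root pointer under the lock */
            b->refs++;              /* the ref keeps it valid after unlock */
            pthread_rwlock_unlock(&commit_root_sem);
            return b;
    }
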
@@ -5360,7 +5364,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5360{ 5364{
5361 int ret; 5365 int ret;
5362 int cmp; 5366 int cmp;
5363 struct btrfs_trans_handle *trans = NULL;
5364 struct btrfs_path *left_path = NULL; 5367 struct btrfs_path *left_path = NULL;
5365 struct btrfs_path *right_path = NULL; 5368 struct btrfs_path *right_path = NULL;
5366 struct btrfs_key left_key; 5369 struct btrfs_key left_key;
@@ -5376,9 +5379,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5376 int advance_right; 5379 int advance_right;
5377 u64 left_blockptr; 5380 u64 left_blockptr;
5378 u64 right_blockptr; 5381 u64 right_blockptr;
5379 u64 left_start_ctransid; 5382 u64 left_gen;
5380 u64 right_start_ctransid; 5383 u64 right_gen;
5381 u64 ctransid;
5382 5384
5383 left_path = btrfs_alloc_path(); 5385 left_path = btrfs_alloc_path();
5384 if (!left_path) { 5386 if (!left_path) {
@@ -5402,21 +5404,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5402 right_path->search_commit_root = 1; 5404 right_path->search_commit_root = 1;
5403 right_path->skip_locking = 1; 5405 right_path->skip_locking = 1;
5404 5406
5405 spin_lock(&left_root->root_item_lock);
5406 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5407 spin_unlock(&left_root->root_item_lock);
5408
5409 spin_lock(&right_root->root_item_lock);
5410 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5411 spin_unlock(&right_root->root_item_lock);
5412
5413 trans = btrfs_join_transaction(left_root);
5414 if (IS_ERR(trans)) {
5415 ret = PTR_ERR(trans);
5416 trans = NULL;
5417 goto out;
5418 }
5419
5420 /* 5407 /*
5421 * Strategy: Go to the first items of both trees. Then do 5408 * Strategy: Go to the first items of both trees. Then do
5422 * 5409 *
@@ -5453,6 +5440,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5453 * the right if possible or go up and right. 5440 * the right if possible or go up and right.
5454 */ 5441 */
5455 5442
5443 down_read(&left_root->fs_info->commit_root_sem);
5456 left_level = btrfs_header_level(left_root->commit_root); 5444 left_level = btrfs_header_level(left_root->commit_root);
5457 left_root_level = left_level; 5445 left_root_level = left_level;
5458 left_path->nodes[left_level] = left_root->commit_root; 5446 left_path->nodes[left_level] = left_root->commit_root;
@@ -5462,6 +5450,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5462 right_root_level = right_level; 5450 right_root_level = right_level;
5463 right_path->nodes[right_level] = right_root->commit_root; 5451 right_path->nodes[right_level] = right_root->commit_root;
5464 extent_buffer_get(right_path->nodes[right_level]); 5452 extent_buffer_get(right_path->nodes[right_level]);
5453 up_read(&left_root->fs_info->commit_root_sem);
5465 5454
5466 if (left_level == 0) 5455 if (left_level == 0)
5467 btrfs_item_key_to_cpu(left_path->nodes[left_level], 5456 btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -5480,67 +5469,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5480 advance_left = advance_right = 0; 5469 advance_left = advance_right = 0;
5481 5470
5482 while (1) { 5471 while (1) {
5483 /*
5484 * We need to make sure the transaction does not get committed
5485 * while we do anything on commit roots. This means, we need to
5486 * join and leave transactions for every item that we process.
5487 */
5488 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5489 btrfs_release_path(left_path);
5490 btrfs_release_path(right_path);
5491
5492 ret = btrfs_end_transaction(trans, left_root);
5493 trans = NULL;
5494 if (ret < 0)
5495 goto out;
5496 }
5497 /* now rejoin the transaction */
5498 if (!trans) {
5499 trans = btrfs_join_transaction(left_root);
5500 if (IS_ERR(trans)) {
5501 ret = PTR_ERR(trans);
5502 trans = NULL;
5503 goto out;
5504 }
5505
5506 spin_lock(&left_root->root_item_lock);
5507 ctransid = btrfs_root_ctransid(&left_root->root_item);
5508 spin_unlock(&left_root->root_item_lock);
5509 if (ctransid != left_start_ctransid)
5510 left_start_ctransid = 0;
5511
5512 spin_lock(&right_root->root_item_lock);
5513 ctransid = btrfs_root_ctransid(&right_root->root_item);
5514 spin_unlock(&right_root->root_item_lock);
5515 if (ctransid != right_start_ctransid)
5516 right_start_ctransid = 0;
5517
5518 if (!left_start_ctransid || !right_start_ctransid) {
5519 WARN(1, KERN_WARNING
5520 "BTRFS: btrfs_compare_tree detected "
5521 "a change in one of the trees while "
5522 "iterating. This is probably a "
5523 "bug.\n");
5524 ret = -EIO;
5525 goto out;
5526 }
5527
5528 /*
5529 * the commit root may have changed, so start again
5530 * where we stopped
5531 */
5532 left_path->lowest_level = left_level;
5533 right_path->lowest_level = right_level;
5534 ret = btrfs_search_slot(NULL, left_root,
5535 &left_key, left_path, 0, 0);
5536 if (ret < 0)
5537 goto out;
5538 ret = btrfs_search_slot(NULL, right_root,
5539 &right_key, right_path, 0, 0);
5540 if (ret < 0)
5541 goto out;
5542 }
5543
5544 if (advance_left && !left_end_reached) { 5472 if (advance_left && !left_end_reached) {
5545 ret = tree_advance(left_root, left_path, &left_level, 5473 ret = tree_advance(left_root, left_path, &left_level,
5546 left_root_level, 5474 left_root_level,
@@ -5640,7 +5568,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5640 right_blockptr = btrfs_node_blockptr( 5568 right_blockptr = btrfs_node_blockptr(
5641 right_path->nodes[right_level], 5569 right_path->nodes[right_level],
5642 right_path->slots[right_level]); 5570 right_path->slots[right_level]);
5643 if (left_blockptr == right_blockptr) { 5571 left_gen = btrfs_node_ptr_generation(
5572 left_path->nodes[left_level],
5573 left_path->slots[left_level]);
5574 right_gen = btrfs_node_ptr_generation(
5575 right_path->nodes[right_level],
5576 right_path->slots[right_level]);
5577 if (left_blockptr == right_blockptr &&
5578 left_gen == right_gen) {
5644 /* 5579 /*
5645 * As we're on a shared block, don't 5580 * As we're on a shared block, don't
5646 * allow to go deeper. 5581 * allow to go deeper.
@@ -5663,14 +5598,6 @@ out:
5663 btrfs_free_path(left_path); 5598 btrfs_free_path(left_path);
5664 btrfs_free_path(right_path); 5599 btrfs_free_path(right_path);
5665 kfree(tmp_buf); 5600 kfree(tmp_buf);
5666
5667 if (trans) {
5668 if (!ret)
5669 ret = btrfs_end_transaction(trans, left_root);
5670 else
5671 btrfs_end_transaction(trans, left_root);
5672 }
5673
5674 return ret; 5601 return ret;
5675} 5602}
5676 5603
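
Taken together, the ctree.c hunks above let btrfs_compare_trees() stop joining a transaction for every item just to keep the commit roots from changing underneath it: commit_root_sem is held while the two root nodes are grabbed, and a shared subtree is now detected by comparing both the block pointer and the pointer generation, since a freed block can be reallocated at the same bytenr with different contents. A sketch of the two-field identity check:

    struct node_ptr { unsigned long long blockptr, gen; };

    /* the same bytenr alone is not enough: the block may have been reallocated */
    static int same_shared_block(struct node_ptr a, struct node_ptr b)
    {
            return a.blockptr == b.blockptr && a.gen == b.gen;
    }
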
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519f..ba6b88528dc7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
351#define BTRFS_FS_STATE_ERROR 0 351#define BTRFS_FS_STATE_ERROR 0
352#define BTRFS_FS_STATE_REMOUNTING 1 352#define BTRFS_FS_STATE_REMOUNTING 1
353#define BTRFS_FS_STATE_TRANS_ABORTED 2 353#define BTRFS_FS_STATE_TRANS_ABORTED 2
354#define BTRFS_FS_STATE_DEV_REPLACING 3
354 355
355/* Super block flags */ 356/* Super block flags */
356/* Errors detected */ 357/* Errors detected */
@@ -608,6 +609,7 @@ struct btrfs_path {
608 unsigned int skip_locking:1; 609 unsigned int skip_locking:1;
609 unsigned int leave_spinning:1; 610 unsigned int leave_spinning:1;
610 unsigned int search_commit_root:1; 611 unsigned int search_commit_root:1;
612 unsigned int need_commit_sem:1;
611}; 613};
612 614
613/* 615/*
@@ -985,7 +987,8 @@ struct btrfs_dev_replace_item {
985#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 987#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
986#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) 988#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
987#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) 989#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
988#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 990#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
991 BTRFS_SPACE_INFO_GLOBAL_RSV)
989 992
990enum btrfs_raid_types { 993enum btrfs_raid_types {
991 BTRFS_RAID_RAID10, 994 BTRFS_RAID_RAID10,
@@ -1017,6 +1020,12 @@ enum btrfs_raid_types {
1017 */ 1020 */
1018#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) 1021#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
1019 1022
1023/*
1024 * A fake block group type that is used to communicate global block reserve
1025 * size to userspace via the SPACE_INFO ioctl.
1026 */
1027#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
1028
1020#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ 1029#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
1021 BTRFS_AVAIL_ALLOC_BIT_SINGLE) 1030 BTRFS_AVAIL_ALLOC_BIT_SINGLE)
1022 1031
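
BTRFS_SPACE_INFO_GLOBAL_RSV is a fake block-group flag (bit 49, above the real profile bits) that exists only so the SPACE_INFO ioctl can report the global block reserve as one more space-info row; folding it into BTRFS_BLOCK_GROUP_RESERVED keeps it out of on-disk flags. A hedged userspace sketch of testing the bit in an ioctl result (the struct layout here is abbreviated and illustrative):

    #include <stdio.h>

    #define SPACE_INFO_GLOBAL_RSV (1ULL << 49)  /* mirrors the kernel constant */

    struct space_row { unsigned long long flags, total, used; };

    static void print_row(const struct space_row *r)
    {
            if (r->flags & SPACE_INFO_GLOBAL_RSV)
                    printf("GlobalReserve: total=%llu used=%llu\n",
                           r->total, r->used);
    }

    int main(void)
    {
            struct space_row r = { SPACE_INFO_GLOBAL_RSV, 512 << 20, 0 };

            print_row(&r);
            return 0;
    }
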
@@ -1439,7 +1448,7 @@ struct btrfs_fs_info {
1439 */ 1448 */
1440 struct mutex ordered_extent_flush_mutex; 1449 struct mutex ordered_extent_flush_mutex;
1441 1450
1442 struct rw_semaphore extent_commit_sem; 1451 struct rw_semaphore commit_root_sem;
1443 1452
1444 struct rw_semaphore cleanup_work_sem; 1453 struct rw_semaphore cleanup_work_sem;
1445 1454
@@ -1489,6 +1498,7 @@ struct btrfs_fs_info {
1489 */ 1498 */
1490 struct list_head ordered_roots; 1499 struct list_head ordered_roots;
1491 1500
1501 struct mutex delalloc_root_mutex;
1492 spinlock_t delalloc_root_lock; 1502 spinlock_t delalloc_root_lock;
1493 /* all fs/file tree roots that have delalloc inodes. */ 1503 /* all fs/file tree roots that have delalloc inodes. */
1494 struct list_head delalloc_roots; 1504 struct list_head delalloc_roots;
@@ -1503,28 +1513,27 @@ struct btrfs_fs_info {
1503 * A third pool does submit_bio to avoid deadlocking with the other 1513 * A third pool does submit_bio to avoid deadlocking with the other
1504 * two 1514 * two
1505 */ 1515 */
1506 struct btrfs_workers generic_worker; 1516 struct btrfs_workqueue *workers;
1507 struct btrfs_workers workers; 1517 struct btrfs_workqueue *delalloc_workers;
1508 struct btrfs_workers delalloc_workers; 1518 struct btrfs_workqueue *flush_workers;
1509 struct btrfs_workers flush_workers; 1519 struct btrfs_workqueue *endio_workers;
1510 struct btrfs_workers endio_workers; 1520 struct btrfs_workqueue *endio_meta_workers;
1511 struct btrfs_workers endio_meta_workers; 1521 struct btrfs_workqueue *endio_raid56_workers;
1512 struct btrfs_workers endio_raid56_workers; 1522 struct btrfs_workqueue *rmw_workers;
1513 struct btrfs_workers rmw_workers; 1523 struct btrfs_workqueue *endio_meta_write_workers;
1514 struct btrfs_workers endio_meta_write_workers; 1524 struct btrfs_workqueue *endio_write_workers;
1515 struct btrfs_workers endio_write_workers; 1525 struct btrfs_workqueue *endio_freespace_worker;
1516 struct btrfs_workers endio_freespace_worker; 1526 struct btrfs_workqueue *submit_workers;
1517 struct btrfs_workers submit_workers; 1527 struct btrfs_workqueue *caching_workers;
1518 struct btrfs_workers caching_workers; 1528 struct btrfs_workqueue *readahead_workers;
1519 struct btrfs_workers readahead_workers;
1520 1529
1521 /* 1530 /*
1522 * fixup workers take dirty pages that didn't properly go through 1531 * fixup workers take dirty pages that didn't properly go through
1523 * the cow mechanism and make them safe to write. It happens 1532 * the cow mechanism and make them safe to write. It happens
1524 * for the sys_munmap function call path 1533 * for the sys_munmap function call path
1525 */ 1534 */
1526 struct btrfs_workers fixup_workers; 1535 struct btrfs_workqueue *fixup_workers;
1527 struct btrfs_workers delayed_workers; 1536 struct btrfs_workqueue *delayed_workers;
1528 struct task_struct *transaction_kthread; 1537 struct task_struct *transaction_kthread;
1529 struct task_struct *cleaner_kthread; 1538 struct task_struct *cleaner_kthread;
1530 int thread_pool_size; 1539 int thread_pool_size;
@@ -1604,9 +1613,9 @@ struct btrfs_fs_info {
1604 atomic_t scrub_cancel_req; 1613 atomic_t scrub_cancel_req;
1605 wait_queue_head_t scrub_pause_wait; 1614 wait_queue_head_t scrub_pause_wait;
1606 int scrub_workers_refcnt; 1615 int scrub_workers_refcnt;
1607 struct btrfs_workers scrub_workers; 1616 struct btrfs_workqueue *scrub_workers;
1608 struct btrfs_workers scrub_wr_completion_workers; 1617 struct btrfs_workqueue *scrub_wr_completion_workers;
1609 struct btrfs_workers scrub_nocow_workers; 1618 struct btrfs_workqueue *scrub_nocow_workers;
1610 1619
1611#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1620#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1612 u32 check_integrity_print_mask; 1621 u32 check_integrity_print_mask;
@@ -1647,7 +1656,7 @@ struct btrfs_fs_info {
1647 /* qgroup rescan items */ 1656 /* qgroup rescan items */
1648 struct mutex qgroup_rescan_lock; /* protects the progress item */ 1657 struct mutex qgroup_rescan_lock; /* protects the progress item */
1649 struct btrfs_key qgroup_rescan_progress; 1658 struct btrfs_key qgroup_rescan_progress;
1650 struct btrfs_workers qgroup_rescan_workers; 1659 struct btrfs_workqueue *qgroup_rescan_workers;
1651 struct completion qgroup_rescan_completion; 1660 struct completion qgroup_rescan_completion;
1652 struct btrfs_work qgroup_rescan_work; 1661 struct btrfs_work qgroup_rescan_work;
1653 1662
@@ -1674,10 +1683,18 @@ struct btrfs_fs_info {
1674 1683
1675 atomic_t mutually_exclusive_operation_running; 1684 atomic_t mutually_exclusive_operation_running;
1676 1685
1686 struct percpu_counter bio_counter;
1687 wait_queue_head_t replace_wait;
1688
1677 struct semaphore uuid_tree_rescan_sem; 1689 struct semaphore uuid_tree_rescan_sem;
1678 unsigned int update_uuid_tree_gen:1; 1690 unsigned int update_uuid_tree_gen:1;
1679}; 1691};
1680 1692
1693struct btrfs_subvolume_writers {
1694 struct percpu_counter counter;
1695 wait_queue_head_t wait;
1696};
1697
1681/* 1698/*
1682 * in ram representation of the tree. extent_root is used for all allocations 1699 * in ram representation of the tree. extent_root is used for all allocations
1683 * and for the extent tree extent_root root. 1700 * and for the extent tree extent_root root.
@@ -1702,7 +1719,6 @@ struct btrfs_root {
1702 struct btrfs_block_rsv *block_rsv; 1719 struct btrfs_block_rsv *block_rsv;
1703 1720
1704 /* free ino cache stuff */ 1721 /* free ino cache stuff */
1705 struct mutex fs_commit_mutex;
1706 struct btrfs_free_space_ctl *free_ino_ctl; 1722 struct btrfs_free_space_ctl *free_ino_ctl;
1707 enum btrfs_caching_type cached; 1723 enum btrfs_caching_type cached;
1708 spinlock_t cache_lock; 1724 spinlock_t cache_lock;
@@ -1714,11 +1730,15 @@ struct btrfs_root {
1714 struct mutex log_mutex; 1730 struct mutex log_mutex;
1715 wait_queue_head_t log_writer_wait; 1731 wait_queue_head_t log_writer_wait;
1716 wait_queue_head_t log_commit_wait[2]; 1732 wait_queue_head_t log_commit_wait[2];
1733 struct list_head log_ctxs[2];
1717 atomic_t log_writers; 1734 atomic_t log_writers;
1718 atomic_t log_commit[2]; 1735 atomic_t log_commit[2];
1719 atomic_t log_batch; 1736 atomic_t log_batch;
1720 unsigned long log_transid; 1737 int log_transid;
1721 unsigned long last_log_commit; 1738 /* No matter the commit succeeds or not*/
1739 int log_transid_committed;
1740 /* Just be updated when the commit succeeds. */
1741 int last_log_commit;
1722 pid_t log_start_pid; 1742 pid_t log_start_pid;
1723 bool log_multiple_pids; 1743 bool log_multiple_pids;
1724 1744
@@ -1793,6 +1813,7 @@ struct btrfs_root {
1793 spinlock_t root_item_lock; 1813 spinlock_t root_item_lock;
1794 atomic_t refs; 1814 atomic_t refs;
1795 1815
1816 struct mutex delalloc_mutex;
1796 spinlock_t delalloc_lock; 1817 spinlock_t delalloc_lock;
1797 /* 1818 /*
1798 * all of the inodes that have delalloc bytes. It is possible for 1819 * all of the inodes that have delalloc bytes. It is possible for
@@ -1802,6 +1823,8 @@ struct btrfs_root {
1802 struct list_head delalloc_inodes; 1823 struct list_head delalloc_inodes;
1803 struct list_head delalloc_root; 1824 struct list_head delalloc_root;
1804 u64 nr_delalloc_inodes; 1825 u64 nr_delalloc_inodes;
1826
1827 struct mutex ordered_extent_mutex;
1805 /* 1828 /*
1806 * this is used by the balancing code to wait for all the pending 1829 * this is used by the balancing code to wait for all the pending
1807 * ordered extents 1830 * ordered extents
@@ -1822,6 +1845,8 @@ struct btrfs_root {
1822 * manipulation with the read-only status via SUBVOL_SETFLAGS 1845 * manipulation with the read-only status via SUBVOL_SETFLAGS
1823 */ 1846 */
1824 int send_in_progress; 1847 int send_in_progress;
1848 struct btrfs_subvolume_writers *subv_writers;
1849 atomic_t will_be_snapshoted;
1825}; 1850};
1826 1851
1827struct btrfs_ioctl_defrag_range_args { 1852struct btrfs_ioctl_defrag_range_args {
@@ -2033,6 +2058,20 @@ struct btrfs_ioctl_defrag_range_args {
2033#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) 2058#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
2034#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ 2059#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
2035 BTRFS_MOUNT_##opt) 2060 BTRFS_MOUNT_##opt)
2061#define btrfs_set_and_info(root, opt, fmt, args...) \
2062{ \
2063 if (!btrfs_test_opt(root, opt)) \
2064 btrfs_info(root->fs_info, fmt, ##args); \
2065 btrfs_set_opt(root->fs_info->mount_opt, opt); \
2066}
2067
2068#define btrfs_clear_and_info(root, opt, fmt, args...) \
2069{ \
2070 if (btrfs_test_opt(root, opt)) \
2071 btrfs_info(root->fs_info, fmt, ##args); \
2072 btrfs_clear_opt(root->fs_info->mount_opt, opt); \
2073}
2074
2036/* 2075/*
2037 * Inode flags 2076 * Inode flags
2038 */ 2077 */
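
btrfs_set_and_info()/btrfs_clear_and_info() change a mount option and emit a btrfs_info() message only when the state actually flips, so repeated remounts do not spam the log. The bodies above are bare brace blocks; the conventional hardening for statement-like macros is a do { } while (0) wrapper so they compose safely with if/else. A sketch of that shape (names and the logging are illustrative):

    #include <stdio.h>

    static unsigned long mount_opt;

    #define OPT_SSD (1UL << 0)

    #define set_and_info(opt, msg)                          \
    do {                                                    \
            if (!(mount_opt & (opt)))                       \
                    printf("info: %s\n", (msg));            \
            mount_opt |= (opt);                             \
    } while (0)

    int main(void)
    {
            set_and_info(OPT_SSD, "enabling ssd optimizations");
            set_and_info(OPT_SSD, "enabling ssd optimizations"); /* silent */
            return 0;
    }
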
@@ -3346,6 +3385,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
3346int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3385int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
3347 struct btrfs_fs_info *fs_info); 3386 struct btrfs_fs_info *fs_info);
3348int __get_raid_index(u64 flags); 3387int __get_raid_index(u64 flags);
3388
3389int btrfs_start_nocow_write(struct btrfs_root *root);
3390void btrfs_end_nocow_write(struct btrfs_root *root);
3349/* ctree.c */ 3391/* ctree.c */
3350int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3392int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
3351 int level, int *slot); 3393 int level, int *slot);
@@ -3723,7 +3765,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3723 u32 min_type); 3765 u32 min_type);
3724 3766
3725int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 3767int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
3726int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); 3768int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
3769 int nr);
3727int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3770int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3728 struct extent_state **cached_state); 3771 struct extent_state **cached_state);
3729int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3772int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4048,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
4005int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 4048int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
4006 struct btrfs_scrub_progress *progress); 4049 struct btrfs_scrub_progress *progress);
4007 4050
4051/* dev-replace.c */
4052void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
4053void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
4054void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
4055
4008/* reada.c */ 4056/* reada.c */
4009struct reada_control { 4057struct reada_control {
4010 struct btrfs_root *root; /* tree to prefetch */ 4058 struct btrfs_root *root; /* tree to prefetch */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..33e561a84013 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1392 return -ENOMEM; 1392 return -ENOMEM;
1393 1393
1394 async_work->delayed_root = delayed_root; 1394 async_work->delayed_root = delayed_root;
1395 async_work->work.func = btrfs_async_run_delayed_root; 1395 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
1396 async_work->work.flags = 0; 1396 NULL, NULL);
1397 async_work->nr = nr; 1397 async_work->nr = nr;
1398 1398
1399 btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); 1399 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
1400 return 0; 1400 return 0;
1401} 1401}
1402 1402
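
This hunk is part of the conversion to kernel workqueues: instead of poking work.func and work.flags directly, callers go through btrfs_init_work(), which takes the main function plus the ordered-completion and free callbacks (NULL when unused), and then queue onto a btrfs_workqueue pointer with btrfs_queue_work(). A sketch of the initializer pattern, assuming a simplified work struct:

    typedef void (*work_fn)(void *);

    struct work {
            work_fn func;           /* main work */
            work_fn ordered_func;   /* runs in submission order, may be NULL */
            work_fn ordered_free;   /* frees the item, may be NULL */
    };

    static void init_work(struct work *w, work_fn func,
                          work_fn ordered_func, work_fn ordered_free)
    {
            w->func = func;
            w->ordered_func = ordered_func;
            w->ordered_free = ordered_free;
    }
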
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf0..31299646024d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
199 */ 199 */
200static struct btrfs_delayed_ref_head * 200static struct btrfs_delayed_ref_head *
201find_ref_head(struct rb_root *root, u64 bytenr, 201find_ref_head(struct rb_root *root, u64 bytenr,
202 struct btrfs_delayed_ref_head **last, int return_bigger) 202 int return_bigger)
203{ 203{
204 struct rb_node *n; 204 struct rb_node *n;
205 struct btrfs_delayed_ref_head *entry; 205 struct btrfs_delayed_ref_head *entry;
206 int cmp = 0;
207 206
208again:
209 n = root->rb_node; 207 n = root->rb_node;
210 entry = NULL; 208 entry = NULL;
211 while (n) { 209 while (n) {
212 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 210 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
213 if (last)
214 *last = entry;
215 211
216 if (bytenr < entry->node.bytenr) 212 if (bytenr < entry->node.bytenr)
217 cmp = -1;
218 else if (bytenr > entry->node.bytenr)
219 cmp = 1;
220 else
221 cmp = 0;
222
223 if (cmp < 0)
224 n = n->rb_left; 213 n = n->rb_left;
225 else if (cmp > 0) 214 else if (bytenr > entry->node.bytenr)
226 n = n->rb_right; 215 n = n->rb_right;
227 else 216 else
228 return entry; 217 return entry;
229 } 218 }
230 if (entry && return_bigger) { 219 if (entry && return_bigger) {
231 if (cmp > 0) { 220 if (bytenr > entry->node.bytenr) {
232 n = rb_next(&entry->href_node); 221 n = rb_next(&entry->href_node);
233 if (!n) 222 if (!n)
234 n = rb_first(root); 223 n = rb_first(root);
235 entry = rb_entry(n, struct btrfs_delayed_ref_head, 224 entry = rb_entry(n, struct btrfs_delayed_ref_head,
236 href_node); 225 href_node);
237 bytenr = entry->node.bytenr; 226 return entry;
238 return_bigger = 0;
239 goto again;
240 } 227 }
241 return entry; 228 return entry;
242 } 229 }
@@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
415 402
416again: 403again:
417 start = delayed_refs->run_delayed_start; 404 start = delayed_refs->run_delayed_start;
418 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 405 head = find_ref_head(&delayed_refs->href_root, start, 1);
419 if (!head && !loop) { 406 if (!head && !loop) {
420 delayed_refs->run_delayed_start = 0; 407 delayed_refs->run_delayed_start = 0;
421 start = 0; 408 start = 0;
422 loop = true; 409 loop = true;
423 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 410 head = find_ref_head(&delayed_refs->href_root, start, 1);
424 if (!head) 411 if (!head)
425 return NULL; 412 return NULL;
426 } else if (!head && loop) { 413 } else if (!head && loop) {
@@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
508 ref = btrfs_delayed_node_to_head(update); 495 ref = btrfs_delayed_node_to_head(update);
509 BUG_ON(existing_ref->is_data != ref->is_data); 496 BUG_ON(existing_ref->is_data != ref->is_data);
510 497
498 spin_lock(&existing_ref->lock);
511 if (ref->must_insert_reserved) { 499 if (ref->must_insert_reserved) {
512 /* if the extent was freed and then 500 /* if the extent was freed and then
513 * reallocated before the delayed ref 501 * reallocated before the delayed ref
@@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
549 * only need the lock for this case because we could be processing it 537 * only need the lock for this case because we could be processing it
550 * currently, for refs we just added we know we're a-ok. 538 * currently, for refs we just added we know we're a-ok.
551 */ 539 */
552 spin_lock(&existing_ref->lock);
553 existing->ref_mod += update->ref_mod; 540 existing->ref_mod += update->ref_mod;
554 spin_unlock(&existing_ref->lock); 541 spin_unlock(&existing_ref->lock);
555} 542}
@@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
898 struct btrfs_delayed_ref_root *delayed_refs; 885 struct btrfs_delayed_ref_root *delayed_refs;
899 886
900 delayed_refs = &trans->transaction->delayed_refs; 887 delayed_refs = &trans->transaction->delayed_refs;
901 return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); 888 return find_ref_head(&delayed_refs->href_root, bytenr, 0);
902} 889}
903 890
904void btrfs_delayed_ref_exit(void) 891void btrfs_delayed_ref_exit(void)
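
find_ref_head() loses its "last" out-parameter and its restart loop: with return_bigger set it now simply returns the next head after bytenr, wrapping to the first node when it falls off the end, instead of re-running the search at the bigger key. A small runnable analog of "find exact, else next bigger, else wrap" (a sorted array stands in for the rb-tree):

    #include <stdio.h>

    static int find_bigger(const int *a, int n, int key)
    {
            int i;

            for (i = 0; i < n; i++)
                    if (a[i] >= key)        /* exact match or next bigger */
                            return a[i];
            return a[0];                    /* off the end: wrap to the first */
    }

    int main(void)
    {
            int heads[] = { 10, 20, 30 };

            printf("%d %d %d\n",
                   find_bigger(heads, 3, 20),   /* 20: exact */
                   find_bigger(heads, 3, 21),   /* 30: next bigger */
                   find_bigger(heads, 3, 31));  /* 10: wrapped */
            return 0;
    }
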
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b20..9f2290509aca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
431 return ret; 431 return ret;
432} 432}
433 433
434/*
435 * block until all in-flight bios have finished.
436 */
437static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
438{
439 s64 writers;
440 DEFINE_WAIT(wait);
441
442 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
443 do {
444 prepare_to_wait(&fs_info->replace_wait, &wait,
445 TASK_UNINTERRUPTIBLE);
446 writers = percpu_counter_sum(&fs_info->bio_counter);
447 if (writers)
448 schedule();
449 finish_wait(&fs_info->replace_wait, &wait);
450 } while (writers);
451}
452
453/*
454 * the target device has been removed, so it is safe to allow new bio requests.
455 */
456static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
457{
458 clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
459 if (waitqueue_active(&fs_info->replace_wait))
460 wake_up(&fs_info->replace_wait);
461}
462
434static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 463static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
435 int scrub_ret) 464 int scrub_ret)
436{ 465{
@@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
458 src_device = dev_replace->srcdev; 487 src_device = dev_replace->srcdev;
459 btrfs_dev_replace_unlock(dev_replace); 488 btrfs_dev_replace_unlock(dev_replace);
460 489
461 /* replace old device with new one in mapping tree */
462 if (!scrub_ret)
463 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
464 src_device,
465 tgt_device);
466
467 /* 490 /*
468 * flush all outstanding I/O and inode extent mappings before the 491 * flush all outstanding I/O and inode extent mappings before the
469 * copy operation is declared as being finished 492 * copy operation is declared as being finished
470 */ 493 */
471 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 494 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
472 if (ret) { 495 if (ret) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 496 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return ret; 497 return ret;
@@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
484 WARN_ON(ret); 507 WARN_ON(ret);
485 508
486 /* keep away write_all_supers() during the finishing procedure */ 509 /* keep away write_all_supers() during the finishing procedure */
510 mutex_lock(&root->fs_info->chunk_mutex);
487 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 511 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
488 btrfs_dev_replace_lock(dev_replace); 512 btrfs_dev_replace_lock(dev_replace);
489 dev_replace->replace_state = 513 dev_replace->replace_state =
@@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
494 dev_replace->time_stopped = get_seconds(); 518 dev_replace->time_stopped = get_seconds();
495 dev_replace->item_needs_writeback = 1; 519 dev_replace->item_needs_writeback = 1;
496 520
497 if (scrub_ret) { 521 /* replace old device with new one in mapping tree */
522 if (!scrub_ret) {
523 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
524 src_device,
525 tgt_device);
526 } else {
498 printk_in_rcu(KERN_ERR 527 printk_in_rcu(KERN_ERR
499 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 528 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
500 src_device->missing ? "<missing disk>" : 529 src_device->missing ? "<missing disk>" :
@@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
503 rcu_str_deref(tgt_device->name), scrub_ret); 532 rcu_str_deref(tgt_device->name), scrub_ret);
504 btrfs_dev_replace_unlock(dev_replace); 533 btrfs_dev_replace_unlock(dev_replace);
505 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 534 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
535 mutex_unlock(&root->fs_info->chunk_mutex);
506 if (tgt_device) 536 if (tgt_device)
507 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 537 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
508 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 538 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
532 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 562 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
533 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 563 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
534 564
565 btrfs_rm_dev_replace_blocked(fs_info);
566
535 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 567 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
536 568
569 btrfs_rm_dev_replace_unblocked(fs_info);
570
537 /* 571 /*
538 * this is again a consistent state where no dev_replace procedure 572 * this is again a consistent state where no dev_replace procedure
539 * is running, the target device is part of the filesystem, the 573 * is running, the target device is part of the filesystem, the
@@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
543 */ 577 */
544 btrfs_dev_replace_unlock(dev_replace); 578 btrfs_dev_replace_unlock(dev_replace);
545 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 579 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
580 mutex_unlock(&root->fs_info->chunk_mutex);
546 581
547 /* write back the superblocks */ 582 /* write back the superblocks */
548 trans = btrfs_start_transaction(root, 0); 583 trans = btrfs_start_transaction(root, 0);
@@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
862 mutex_unlock(&dev_replace->lock_management_lock); 897 mutex_unlock(&dev_replace->lock_management_lock);
863 } 898 }
864} 899}
900
901void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
902{
903 percpu_counter_inc(&fs_info->bio_counter);
904}
905
906void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
907{
908 percpu_counter_dec(&fs_info->bio_counter);
909
910 if (waitqueue_active(&fs_info->replace_wait))
911 wake_up(&fs_info->replace_wait);
912}
913
914void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
915{
916 DEFINE_WAIT(wait);
917again:
918 percpu_counter_inc(&fs_info->bio_counter);
919 if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
920 btrfs_bio_counter_dec(fs_info);
921 wait_event(fs_info->replace_wait,
922 !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
923 &fs_info->fs_state));
924 goto again;
925 }
926
927}
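
The bio counter is a gate for device replace: regular I/O bumps a percpu counter around each bio, and btrfs_rm_dev_replace_blocked() sets the DEV_REPLACING bit and waits for the counter to drain before the target device is torn out, while blocked incrementers back off and re-wait whenever the bit is set. A compressed pthread analog of that handshake (a plain counter and condvar stand in for the percpu counter and wait queue; function names mirror the diff but this is a userspace sketch, not the kernel code):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t wait_q = PTHREAD_COND_INITIALIZER;
    static long bio_count;
    static bool replacing;

    static void bio_counter_inc_blocked(void)
    {
            pthread_mutex_lock(&lock);
            while (replacing)               /* back off while replace runs */
                    pthread_cond_wait(&wait_q, &lock);
            bio_count++;
            pthread_mutex_unlock(&lock);
    }

    static void bio_counter_dec(void)
    {
            pthread_mutex_lock(&lock);
            if (--bio_count == 0)
                    pthread_cond_broadcast(&wait_q);  /* wake the replacer */
            pthread_mutex_unlock(&lock);
    }

    static void rm_dev_replace_blocked(void)
    {
            pthread_mutex_lock(&lock);
            replacing = true;
            while (bio_count)               /* drain in-flight bios */
                    pthread_cond_wait(&wait_q, &lock);
            pthread_mutex_unlock(&lock);
    }

    static void rm_dev_replace_unblocked(void)
    {
            pthread_mutex_lock(&lock);
            replacing = false;
            pthread_cond_broadcast(&wait_q);
            pthread_mutex_unlock(&lock);
    }
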
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81ea55314b1f..983314932af3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -329,6 +329,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
329{ 329{
330 struct extent_state *cached_state = NULL; 330 struct extent_state *cached_state = NULL;
331 int ret; 331 int ret;
332 bool need_lock = (current->journal_info ==
333 (void *)BTRFS_SEND_TRANS_STUB);
332 334
333 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 335 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
334 return 0; 336 return 0;
@@ -336,6 +338,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
336 if (atomic) 338 if (atomic)
337 return -EAGAIN; 339 return -EAGAIN;
338 340
341 if (need_lock) {
342 btrfs_tree_read_lock(eb);
343 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
344 }
345
339 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 346 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
340 0, &cached_state); 347 0, &cached_state);
341 if (extent_buffer_uptodate(eb) && 348 if (extent_buffer_uptodate(eb) &&
@@ -347,10 +354,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
347 "found %llu\n", 354 "found %llu\n",
348 eb->start, parent_transid, btrfs_header_generation(eb)); 355 eb->start, parent_transid, btrfs_header_generation(eb));
349 ret = 1; 356 ret = 1;
350 clear_extent_buffer_uptodate(eb); 357
358 /*
359 * Things reading via commit roots that don't have normal protection,
360 * like send, can have a really old block in cache that may point at a
361 * block that has been freed and reallocated. So don't clear uptodate
362 * if we find an eb that is under IO (dirty/writeback) because we could
363 * end up reading in the stale data and then writing it back out and
364 * making everybody very sad.
365 */
366 if (!extent_buffer_under_io(eb))
367 clear_extent_buffer_uptodate(eb);
351out: 368out:
352 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, 369 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
353 &cached_state, GFP_NOFS); 370 &cached_state, GFP_NOFS);
371 btrfs_tree_read_unlock_blocking(eb);
354 return ret; 372 return ret;
355} 373}
356 374
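
Two fixes land in this hunk: readers that came in via a commit root (send marks itself with BTRFS_SEND_TRANS_STUB in journal_info) now take the tree read lock before touching the buffer, and a transid-mismatched buffer is only marked not-uptodate when it is not under I/O, because invalidating a dirty or writeback buffer could let stale data be read back in and rewritten. A sketch of the "don't invalidate while under I/O" guard (flag names are illustrative):

    #define EB_UPTODATE  (1U << 0)
    #define EB_DIRTY     (1U << 1)
    #define EB_WRITEBACK (1U << 2)

    static int under_io(unsigned flags)
    {
            return flags & (EB_DIRTY | EB_WRITEBACK);
    }

    static void invalidate_if_safe(unsigned *flags)
    {
            if (!under_io(*flags))          /* never yank uptodate mid-I/O */
                    *flags &= ~EB_UPTODATE;
    }
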
@@ -678,32 +696,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
678 696
679 fs_info = end_io_wq->info; 697 fs_info = end_io_wq->info;
680 end_io_wq->error = err; 698 end_io_wq->error = err;
681 end_io_wq->work.func = end_workqueue_fn; 699 btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
682 end_io_wq->work.flags = 0;
683 700
684 if (bio->bi_rw & REQ_WRITE) { 701 if (bio->bi_rw & REQ_WRITE) {
685 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 702 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
686 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 703 btrfs_queue_work(fs_info->endio_meta_write_workers,
687 &end_io_wq->work); 704 &end_io_wq->work);
688 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 705 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
689 btrfs_queue_worker(&fs_info->endio_freespace_worker, 706 btrfs_queue_work(fs_info->endio_freespace_worker,
690 &end_io_wq->work); 707 &end_io_wq->work);
691 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 708 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
692 btrfs_queue_worker(&fs_info->endio_raid56_workers, 709 btrfs_queue_work(fs_info->endio_raid56_workers,
693 &end_io_wq->work); 710 &end_io_wq->work);
694 else 711 else
695 btrfs_queue_worker(&fs_info->endio_write_workers, 712 btrfs_queue_work(fs_info->endio_write_workers,
696 &end_io_wq->work); 713 &end_io_wq->work);
697 } else { 714 } else {
698 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 715 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
699 btrfs_queue_worker(&fs_info->endio_raid56_workers, 716 btrfs_queue_work(fs_info->endio_raid56_workers,
700 &end_io_wq->work); 717 &end_io_wq->work);
701 else if (end_io_wq->metadata) 718 else if (end_io_wq->metadata)
702 btrfs_queue_worker(&fs_info->endio_meta_workers, 719 btrfs_queue_work(fs_info->endio_meta_workers,
703 &end_io_wq->work); 720 &end_io_wq->work);
704 else 721 else
705 btrfs_queue_worker(&fs_info->endio_workers, 722 btrfs_queue_work(fs_info->endio_workers,
706 &end_io_wq->work); 723 &end_io_wq->work);
707 } 724 }
708} 725}
709 726
@@ -738,7 +755,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
738unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) 755unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
739{ 756{
740 unsigned long limit = min_t(unsigned long, 757 unsigned long limit = min_t(unsigned long,
741 info->workers.max_workers, 758 info->thread_pool_size,
742 info->fs_devices->open_devices); 759 info->fs_devices->open_devices);
743 return 256 * limit; 760 return 256 * limit;
744} 761}
@@ -811,11 +828,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
811 async->submit_bio_start = submit_bio_start; 828 async->submit_bio_start = submit_bio_start;
812 async->submit_bio_done = submit_bio_done; 829 async->submit_bio_done = submit_bio_done;
813 830
814 async->work.func = run_one_async_start; 831 btrfs_init_work(&async->work, run_one_async_start,
815 async->work.ordered_func = run_one_async_done; 832 run_one_async_done, run_one_async_free);
816 async->work.ordered_free = run_one_async_free;
817 833
818 async->work.flags = 0;
819 async->bio_flags = bio_flags; 834 async->bio_flags = bio_flags;
820 async->bio_offset = bio_offset; 835 async->bio_offset = bio_offset;
821 836
@@ -824,9 +839,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
824 atomic_inc(&fs_info->nr_async_submits); 839 atomic_inc(&fs_info->nr_async_submits);
825 840
826 if (rw & REQ_SYNC) 841 if (rw & REQ_SYNC)
827 btrfs_set_work_high_prio(&async->work); 842 btrfs_set_work_high_priority(&async->work);
828 843
829 btrfs_queue_worker(&fs_info->workers, &async->work); 844 btrfs_queue_work(fs_info->workers, &async->work);
830 845
831 while (atomic_read(&fs_info->async_submit_draining) && 846 while (atomic_read(&fs_info->async_submit_draining) &&
832 atomic_read(&fs_info->nr_async_submits)) { 847 atomic_read(&fs_info->nr_async_submits)) {
@@ -1149,6 +1164,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1149 } 1164 }
1150} 1165}
1151 1166
1167static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1168{
1169 struct btrfs_subvolume_writers *writers;
1170 int ret;
1171
1172 writers = kmalloc(sizeof(*writers), GFP_NOFS);
1173 if (!writers)
1174 return ERR_PTR(-ENOMEM);
1175
1176 ret = percpu_counter_init(&writers->counter, 0);
1177 if (ret < 0) {
1178 kfree(writers);
1179 return ERR_PTR(ret);
1180 }
1181
1182 init_waitqueue_head(&writers->wait);
1183 return writers;
1184}
1185
1186static void
1187btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1188{
1189 percpu_counter_destroy(&writers->counter);
1190 kfree(writers);
1191}
1192
1152static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, 1193static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 u32 stripesize, struct btrfs_root *root, 1194 u32 stripesize, struct btrfs_root *root,
1154 struct btrfs_fs_info *fs_info, 1195 struct btrfs_fs_info *fs_info,
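
btrfs_alloc_subvolume_writers() above follows the usual kernel error-pointer convention: on failure it returns ERR_PTR(-ENOMEM) or the percpu_counter_init() error instead of NULL, and the caller tests with IS_ERR()/PTR_ERR(). A userspace sketch of that convention (the macros are simplified stand-ins for the kernel ones):

    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define ERR_PTR(err)  ((void *)(intptr_t)(err))
    #define PTR_ERR(ptr)  ((int)(intptr_t)(ptr))
    #define IS_ERR(ptr)   ((uintptr_t)(ptr) >= (uintptr_t)-4095)

    struct writers { long counter; };

    static struct writers *alloc_writers(void)
    {
            struct writers *w = malloc(sizeof(*w));

            if (!w)
                    return ERR_PTR(-ENOMEM);    /* error encoded in the pointer */
            w->counter = 0;
            return w;
    }
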
@@ -1194,16 +1235,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1194 spin_lock_init(&root->log_extents_lock[1]); 1235 spin_lock_init(&root->log_extents_lock[1]);
1195 mutex_init(&root->objectid_mutex); 1236 mutex_init(&root->objectid_mutex);
1196 mutex_init(&root->log_mutex); 1237 mutex_init(&root->log_mutex);
1238 mutex_init(&root->ordered_extent_mutex);
1239 mutex_init(&root->delalloc_mutex);
1197 init_waitqueue_head(&root->log_writer_wait); 1240 init_waitqueue_head(&root->log_writer_wait);
1198 init_waitqueue_head(&root->log_commit_wait[0]); 1241 init_waitqueue_head(&root->log_commit_wait[0]);
1199 init_waitqueue_head(&root->log_commit_wait[1]); 1242 init_waitqueue_head(&root->log_commit_wait[1]);
1243 INIT_LIST_HEAD(&root->log_ctxs[0]);
1244 INIT_LIST_HEAD(&root->log_ctxs[1]);
1200 atomic_set(&root->log_commit[0], 0); 1245 atomic_set(&root->log_commit[0], 0);
1201 atomic_set(&root->log_commit[1], 0); 1246 atomic_set(&root->log_commit[1], 0);
1202 atomic_set(&root->log_writers, 0); 1247 atomic_set(&root->log_writers, 0);
1203 atomic_set(&root->log_batch, 0); 1248 atomic_set(&root->log_batch, 0);
1204 atomic_set(&root->orphan_inodes, 0); 1249 atomic_set(&root->orphan_inodes, 0);
1205 atomic_set(&root->refs, 1); 1250 atomic_set(&root->refs, 1);
1251 atomic_set(&root->will_be_snapshoted, 0);
1206 root->log_transid = 0; 1252 root->log_transid = 0;
1253 root->log_transid_committed = -1;
1207 root->last_log_commit = 0; 1254 root->last_log_commit = 0;
1208 if (fs_info) 1255 if (fs_info)
1209 extent_io_tree_init(&root->dirty_log_pages, 1256 extent_io_tree_init(&root->dirty_log_pages,
@@ -1417,6 +1464,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1417 WARN_ON(root->log_root); 1464 WARN_ON(root->log_root);
1418 root->log_root = log_root; 1465 root->log_root = log_root;
1419 root->log_transid = 0; 1466 root->log_transid = 0;
1467 root->log_transid_committed = -1;
1420 root->last_log_commit = 0; 1468 root->last_log_commit = 0;
1421 return 0; 1469 return 0;
1422} 1470}
@@ -1498,6 +1546,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1498int btrfs_init_fs_root(struct btrfs_root *root) 1546int btrfs_init_fs_root(struct btrfs_root *root)
1499{ 1547{
1500 int ret; 1548 int ret;
1549 struct btrfs_subvolume_writers *writers;
1501 1550
1502 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1551 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1503 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1552 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,15 +1556,24 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1507 goto fail; 1556 goto fail;
1508 } 1557 }
1509 1558
1559 writers = btrfs_alloc_subvolume_writers();
1560 if (IS_ERR(writers)) {
1561 ret = PTR_ERR(writers);
1562 goto fail;
1563 }
1564 root->subv_writers = writers;
1565
1510 btrfs_init_free_ino_ctl(root); 1566 btrfs_init_free_ino_ctl(root);
1511 mutex_init(&root->fs_commit_mutex);
1512 spin_lock_init(&root->cache_lock); 1567 spin_lock_init(&root->cache_lock);
1513 init_waitqueue_head(&root->cache_wait); 1568 init_waitqueue_head(&root->cache_wait);
1514 1569
1515 ret = get_anon_bdev(&root->anon_dev); 1570 ret = get_anon_bdev(&root->anon_dev);
1516 if (ret) 1571 if (ret)
1517 goto fail; 1572 goto free_writers;
1518 return 0; 1573 return 0;
1574
1575free_writers:
1576 btrfs_free_subvolume_writers(root->subv_writers);
1519fail: 1577fail:
1520 kfree(root->free_ino_ctl); 1578 kfree(root->free_ino_ctl);
1521 kfree(root->free_ino_pinned); 1579 kfree(root->free_ino_pinned);
@@ -1990,23 +2048,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
1990/* helper to cleanup workers */ 2048/* helper to cleanup workers */
1991static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) 2049static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1992{ 2050{
1993 btrfs_stop_workers(&fs_info->generic_worker); 2051 btrfs_destroy_workqueue(fs_info->fixup_workers);
1994 btrfs_stop_workers(&fs_info->fixup_workers); 2052 btrfs_destroy_workqueue(fs_info->delalloc_workers);
1995 btrfs_stop_workers(&fs_info->delalloc_workers); 2053 btrfs_destroy_workqueue(fs_info->workers);
1996 btrfs_stop_workers(&fs_info->workers); 2054 btrfs_destroy_workqueue(fs_info->endio_workers);
1997 btrfs_stop_workers(&fs_info->endio_workers); 2055 btrfs_destroy_workqueue(fs_info->endio_meta_workers);
1998 btrfs_stop_workers(&fs_info->endio_meta_workers); 2056 btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
1999 btrfs_stop_workers(&fs_info->endio_raid56_workers); 2057 btrfs_destroy_workqueue(fs_info->rmw_workers);
2000 btrfs_stop_workers(&fs_info->rmw_workers); 2058 btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2001 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2059 btrfs_destroy_workqueue(fs_info->endio_write_workers);
2002 btrfs_stop_workers(&fs_info->endio_write_workers); 2060 btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2003 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2061 btrfs_destroy_workqueue(fs_info->submit_workers);
2004 btrfs_stop_workers(&fs_info->submit_workers); 2062 btrfs_destroy_workqueue(fs_info->delayed_workers);
2005 btrfs_stop_workers(&fs_info->delayed_workers); 2063 btrfs_destroy_workqueue(fs_info->caching_workers);
2006 btrfs_stop_workers(&fs_info->caching_workers); 2064 btrfs_destroy_workqueue(fs_info->readahead_workers);
2007 btrfs_stop_workers(&fs_info->readahead_workers); 2065 btrfs_destroy_workqueue(fs_info->flush_workers);
2008 btrfs_stop_workers(&fs_info->flush_workers); 2066 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2009 btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
2010} 2067}
2011 2068
2012static void free_root_extent_buffers(struct btrfs_root *root) 2069static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2097,6 +2154,8 @@ int open_ctree(struct super_block *sb,
2097 int err = -EINVAL; 2154 int err = -EINVAL;
2098 int num_backups_tried = 0; 2155 int num_backups_tried = 0;
2099 int backup_index = 0; 2156 int backup_index = 0;
2157 int max_active;
2158 int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2100 bool create_uuid_tree; 2159 bool create_uuid_tree;
2101 bool check_uuid_tree; 2160 bool check_uuid_tree;
2102 2161
@@ -2133,10 +2192,16 @@ int open_ctree(struct super_block *sb,
2133 goto fail_dirty_metadata_bytes; 2192 goto fail_dirty_metadata_bytes;
2134 } 2193 }
2135 2194
2195 ret = percpu_counter_init(&fs_info->bio_counter, 0);
2196 if (ret) {
2197 err = ret;
2198 goto fail_delalloc_bytes;
2199 }
2200
2136 fs_info->btree_inode = new_inode(sb); 2201 fs_info->btree_inode = new_inode(sb);
2137 if (!fs_info->btree_inode) { 2202 if (!fs_info->btree_inode) {
2138 err = -ENOMEM; 2203 err = -ENOMEM;
2139 goto fail_delalloc_bytes; 2204 goto fail_bio_counter;
2140 } 2205 }
2141 2206
2142 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2207 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2159,6 +2224,7 @@ int open_ctree(struct super_block *sb,
2159 spin_lock_init(&fs_info->buffer_lock); 2224 spin_lock_init(&fs_info->buffer_lock);
2160 rwlock_init(&fs_info->tree_mod_log_lock); 2225 rwlock_init(&fs_info->tree_mod_log_lock);
2161 mutex_init(&fs_info->reloc_mutex); 2226 mutex_init(&fs_info->reloc_mutex);
2227 mutex_init(&fs_info->delalloc_root_mutex);
2162 seqlock_init(&fs_info->profiles_lock); 2228 seqlock_init(&fs_info->profiles_lock);
2163 2229
2164 init_completion(&fs_info->kobj_unregister); 2230 init_completion(&fs_info->kobj_unregister);
@@ -2211,6 +2277,7 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->scrub_pause_req, 0);
 	atomic_set(&fs_info->scrubs_paused, 0);
 	atomic_set(&fs_info->scrub_cancel_req, 0);
+	init_waitqueue_head(&fs_info->replace_wait);
 	init_waitqueue_head(&fs_info->scrub_pause_wait);
 	fs_info->scrub_workers_refcnt = 0;
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2274,7 +2341,7 @@ int open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
-	init_rwsem(&fs_info->extent_commit_sem);
+	init_rwsem(&fs_info->commit_root_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
 	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
@@ -2458,104 +2525,68 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	btrfs_init_workers(&fs_info->generic_worker,
-			   "genwork", 1, NULL);
-
-	btrfs_init_workers(&fs_info->workers, "worker",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-
-	btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-			   fs_info->thread_pool_size, NULL);
-
-	btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
-			   fs_info->thread_pool_size, NULL);
-
-	btrfs_init_workers(&fs_info->submit_workers, "submit",
-			   min_t(u64, fs_devices->num_devices,
-			   fs_info->thread_pool_size), NULL);
-
-	btrfs_init_workers(&fs_info->caching_workers, "cache",
-			   fs_info->thread_pool_size, NULL);
-
-	/* a higher idle thresh on the submit workers makes it much more
+	max_active = fs_info->thread_pool_size;
+
+	fs_info->workers =
+		btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+				      max_active, 16);
+
+	fs_info->delalloc_workers =
+		btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+
+	fs_info->flush_workers =
+		btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+
+	fs_info->caching_workers =
+		btrfs_alloc_workqueue("cache", flags, max_active, 0);
+
+	/*
+	 * a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
 	 * devices
 	 */
-	fs_info->submit_workers.idle_thresh = 64;
-
-	fs_info->workers.idle_thresh = 16;
-	fs_info->workers.ordered = 1;
-
-	fs_info->delalloc_workers.idle_thresh = 2;
-	fs_info->delalloc_workers.ordered = 1;
-
-	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_workers, "endio",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_meta_write_workers,
-			   "endio-meta-write", fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_raid56_workers,
-			   "endio-raid56", fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->rmw_workers,
-			   "rmw", fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
-			   1, &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->readahead_workers, "readahead",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
-			   &fs_info->generic_worker);
+	fs_info->submit_workers =
+		btrfs_alloc_workqueue("submit", flags,
+				      min_t(u64, fs_devices->num_devices,
+					    max_active), 64);
+
+	fs_info->fixup_workers =
+		btrfs_alloc_workqueue("fixup", flags, 1, 0);
 
 	/*
 	 * endios are largely parallel and should have a very
 	 * low idle thresh
 	 */
-	fs_info->endio_workers.idle_thresh = 4;
-	fs_info->endio_meta_workers.idle_thresh = 4;
-	fs_info->endio_raid56_workers.idle_thresh = 4;
-	fs_info->rmw_workers.idle_thresh = 2;
-
-	fs_info->endio_write_workers.idle_thresh = 2;
-	fs_info->endio_meta_write_workers.idle_thresh = 2;
-	fs_info->readahead_workers.idle_thresh = 2;
-
-	/*
-	 * btrfs_start_workers can really only fail because of ENOMEM so just
-	 * return -ENOMEM if any of these fail.
-	 */
-	ret = btrfs_start_workers(&fs_info->workers);
-	ret |= btrfs_start_workers(&fs_info->generic_worker);
-	ret |= btrfs_start_workers(&fs_info->submit_workers);
-	ret |= btrfs_start_workers(&fs_info->delalloc_workers);
-	ret |= btrfs_start_workers(&fs_info->fixup_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
-	ret |= btrfs_start_workers(&fs_info->rmw_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
-	ret |= btrfs_start_workers(&fs_info->delayed_workers);
-	ret |= btrfs_start_workers(&fs_info->caching_workers);
-	ret |= btrfs_start_workers(&fs_info->readahead_workers);
-	ret |= btrfs_start_workers(&fs_info->flush_workers);
-	ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
-	if (ret) {
+	fs_info->endio_workers =
+		btrfs_alloc_workqueue("endio", flags, max_active, 4);
+	fs_info->endio_meta_workers =
+		btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+	fs_info->endio_meta_write_workers =
+		btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+	fs_info->endio_raid56_workers =
+		btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+	fs_info->rmw_workers =
+		btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+	fs_info->endio_write_workers =
+		btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+	fs_info->endio_freespace_worker =
+		btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+	fs_info->delayed_workers =
+		btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+	fs_info->readahead_workers =
+		btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+	fs_info->qgroup_rescan_workers =
+		btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+
+	if (!(fs_info->workers && fs_info->delalloc_workers &&
+	      fs_info->submit_workers && fs_info->flush_workers &&
+	      fs_info->endio_workers && fs_info->endio_meta_workers &&
+	      fs_info->endio_meta_write_workers &&
+	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+	      fs_info->caching_workers && fs_info->readahead_workers &&
+	      fs_info->fixup_workers && fs_info->delayed_workers &&
+	      fs_info->qgroup_rescan_workers)) {
 		err = -ENOMEM;
 		goto fail_sb_buffer;
 	}
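
The hunk above replaces the old btrfs_workers machinery with the btrfs_workqueue API: queues are now allocated dynamically, a NULL return signals allocation failure, and the per-queue tuning that used to be poked into fields (idle thresholds, ordering) is passed at allocation time. As an illustration only, not part of this patch, the lifecycle for one queue looks roughly like the sketch below; the signatures are inferred from the calls in this hunk, and "example_workers" is a hypothetical queue:

static struct btrfs_workqueue *example_workers;

static int example_alloc(struct btrfs_fs_info *fs_info)
{
	int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
	int max_active = fs_info->thread_pool_size;

	/* name, workqueue flags, concurrency limit, queue-depth threshold */
	example_workers = btrfs_alloc_workqueue("example", flags,
						max_active, 2);
	if (!example_workers)
		return -ENOMEM;	/* NULL return means allocation failed */
	return 0;
}

static void example_teardown(void)
{
	/* the counterpart used by btrfs_stop_all_workers() above */
	btrfs_destroy_workqueue(example_workers);
}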
@@ -2830,7 +2861,7 @@ retry_root_backup:
 			printk(KERN_ERR "BTRFS: failed to read log tree\n");
 			free_extent_buffer(log_tree_root->node);
 			kfree(log_tree_root);
-			goto fail_trans_kthread;
+			goto fail_qgroup;
 		}
 		/* returns with log_tree_root freed on success */
 		ret = btrfs_recover_log_trees(log_tree_root);
@@ -2839,24 +2870,24 @@ retry_root_backup:
2839 "Failed to recover log tree"); 2870 "Failed to recover log tree");
2840 free_extent_buffer(log_tree_root->node); 2871 free_extent_buffer(log_tree_root->node);
2841 kfree(log_tree_root); 2872 kfree(log_tree_root);
2842 goto fail_trans_kthread; 2873 goto fail_qgroup;
2843 } 2874 }
2844 2875
2845 if (sb->s_flags & MS_RDONLY) { 2876 if (sb->s_flags & MS_RDONLY) {
2846 ret = btrfs_commit_super(tree_root); 2877 ret = btrfs_commit_super(tree_root);
2847 if (ret) 2878 if (ret)
2848 goto fail_trans_kthread; 2879 goto fail_qgroup;
2849 } 2880 }
2850 } 2881 }
2851 2882
2852 ret = btrfs_find_orphan_roots(tree_root); 2883 ret = btrfs_find_orphan_roots(tree_root);
2853 if (ret) 2884 if (ret)
2854 goto fail_trans_kthread; 2885 goto fail_qgroup;
2855 2886
2856 if (!(sb->s_flags & MS_RDONLY)) { 2887 if (!(sb->s_flags & MS_RDONLY)) {
2857 ret = btrfs_cleanup_fs_roots(fs_info); 2888 ret = btrfs_cleanup_fs_roots(fs_info);
2858 if (ret) 2889 if (ret)
2859 goto fail_trans_kthread; 2890 goto fail_qgroup;
2860 2891
2861 ret = btrfs_recover_relocation(tree_root); 2892 ret = btrfs_recover_relocation(tree_root);
2862 if (ret < 0) { 2893 if (ret < 0) {
@@ -2963,6 +2994,8 @@ fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 	iput(fs_info->btree_inode);
+fail_bio_counter:
+	percpu_counter_destroy(&fs_info->bio_counter);
 fail_delalloc_bytes:
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
 fail_dirty_metadata_bytes:
@@ -3244,6 +3277,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
 	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (dev->missing)
+			continue;
 		if (!dev->bdev) {
 			errors_send++;
 			continue;
@@ -3258,6 +3293,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 	/* wait for all the barriers */
 	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (dev->missing)
+			continue;
 		if (!dev->bdev) {
 			errors_wait++;
 			continue;
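
Both loops now skip devices flagged ->missing: in a degraded mount such a device has nothing to flush, and counting it in errors_send/errors_wait could fail the barrier pass spuriously. The shared shape of the two hunks, as an illustration:

	list_for_each_entry_rcu(dev, head, dev_list) {
		if (dev->missing)	/* degraded mount: nothing to flush */
			continue;
		if (!dev->bdev) {	/* present but unusable: real error */
			errors_send++;
			continue;
		}
		/* ... send or wait for the barrier ... */
	}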
@@ -3477,6 +3514,8 @@ static void free_fs_root(struct btrfs_root *root)
 	root->orphan_block_rsv = NULL;
 	if (root->anon_dev)
 		free_anon_bdev(root->anon_dev);
+	if (root->subv_writers)
+		btrfs_free_subvolume_writers(root->subv_writers);
 	free_extent_buffer(root->node);
 	free_extent_buffer(root->commit_root);
 	kfree(root->free_ino_ctl);
@@ -3610,6 +3649,7 @@ int close_ctree(struct btrfs_root *root)
 
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
+	percpu_counter_destroy(&fs_info->bio_counter);
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
@@ -3791,9 +3831,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
 		list_move_tail(&root->ordered_root,
 			       &fs_info->ordered_roots);
 
+		spin_unlock(&fs_info->ordered_root_lock);
 		btrfs_destroy_ordered_extents(root);
 
-		cond_resched_lock(&fs_info->ordered_root_lock);
+		cond_resched();
+		spin_lock(&fs_info->ordered_root_lock);
 	}
 	spin_unlock(&fs_info->ordered_root_lock);
 }
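
This change stops calling btrfs_destroy_ordered_extents() with ordered_root_lock held: the helper can block, so the spinlock is dropped around it and re-taken, with a plain cond_resched() in between. A rough sketch of the resulting loop shape, an illustration rather than patch text, with the root selection abbreviated:

	spin_lock(&fs_info->ordered_root_lock);
	while (!list_empty(&fs_info->ordered_roots)) {
		/* move the root to the tail so the walk keeps its place */
		root = list_first_entry(&fs_info->ordered_roots,
					struct btrfs_root, ordered_root);
		list_move_tail(&root->ordered_root, &fs_info->ordered_roots);

		spin_unlock(&fs_info->ordered_root_lock);
		btrfs_destroy_ordered_extents(root);	/* may block */

		cond_resched();
		spin_lock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&fs_info->ordered_root_lock);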
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32312e09f0f5..5590af92094b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -419,7 +419,7 @@ static noinline void caching_thread(struct btrfs_work *work)
 again:
 	mutex_lock(&caching_ctl->mutex);
 	/* need to make sure the commit_root doesn't disappear */
-	down_read(&fs_info->extent_commit_sem);
+	down_read(&fs_info->commit_root_sem);
 
 next:
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -443,10 +443,10 @@ next:
 			break;
 
 		if (need_resched() ||
-		    rwsem_is_contended(&fs_info->extent_commit_sem)) {
+		    rwsem_is_contended(&fs_info->commit_root_sem)) {
 			caching_ctl->progress = last;
 			btrfs_release_path(path);
-			up_read(&fs_info->extent_commit_sem);
+			up_read(&fs_info->commit_root_sem);
 			mutex_unlock(&caching_ctl->mutex);
 			cond_resched();
 			goto again;
@@ -513,7 +513,7 @@ next:
 
 err:
 	btrfs_free_path(path);
-	up_read(&fs_info->extent_commit_sem);
+	up_read(&fs_info->commit_root_sem);
 
 	free_excluded_extents(extent_root, block_group);
 
@@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	caching_ctl->block_group = cache;
 	caching_ctl->progress = cache->key.objectid;
 	atomic_set(&caching_ctl->count, 1);
-	caching_ctl->work.func = caching_thread;
+	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 
 	spin_lock(&cache->lock);
 	/*
@@ -633,14 +633,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 		return 0;
 	}
 
-	down_write(&fs_info->extent_commit_sem);
+	down_write(&fs_info->commit_root_sem);
 	atomic_inc(&caching_ctl->count);
 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
-	up_write(&fs_info->extent_commit_sem);
+	up_write(&fs_info->commit_root_sem);
 
 	btrfs_get_block_group(cache);
 
-	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
+	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 
 	return ret;
 }
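
With the old API a work item set its callback by direct assignment (work.func = ...) and was queued on a btrfs_workers pool; the new API initializes the item with btrfs_init_work() and queues it with btrfs_queue_work() on a btrfs_workqueue pointer. A minimal sketch of the pattern, modeled on the caching conversion above and illustrative only; the two NULL arguments are the ordered-completion callbacks, which caching does not use:

static void example_caching_func(struct btrfs_work *work)
{
	struct btrfs_caching_control *ctl;

	/* recover the enclosing object, as caching_thread() does */
	ctl = container_of(work, struct btrfs_caching_control, work);
	/* ... fill in the free-space cache for ctl->block_group ... */
}

static void example_queue_caching(struct btrfs_fs_info *fs_info,
				  struct btrfs_caching_control *ctl)
{
	btrfs_init_work(&ctl->work, example_caching_func, NULL, NULL);
	btrfs_queue_work(fs_info->caching_workers, &ctl->work);
}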
@@ -1542,6 +1542,7 @@ again:
 		ret = 0;
 	}
 	if (ret) {
+		key.objectid = bytenr;
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = num_bytes;
 		btrfs_release_path(path);
@@ -2444,7 +2445,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			spin_unlock(&locked_ref->lock);
 			spin_lock(&delayed_refs->lock);
 			spin_lock(&locked_ref->lock);
-			if (rb_first(&locked_ref->ref_root)) {
+			if (rb_first(&locked_ref->ref_root) ||
+			    locked_ref->extent_op) {
 				spin_unlock(&locked_ref->lock);
 				spin_unlock(&delayed_refs->lock);
 				continue;
@@ -3541,11 +3543,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return extended_to_chunk(flags | tmp);
 }
 
-static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
 {
 	unsigned seq;
+	u64 flags;
 
 	do {
+		flags = orig_flags;
 		seq = read_seqbegin(&root->fs_info->profiles_lock);
 
 		if (flags & BTRFS_BLOCK_GROUP_DATA)
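
The get_alloc_profile() change fixes a subtle seqlock bug: the body of a read_seqbegin()/read_seqretry() loop can execute more than once, so any value the body modifies must be re-initialized on every pass. Previously the profile bits OR-ed in by a discarded pass leaked into the retry. Schematically, as an illustration rather than patch text:

	do {
		flags = orig_flags;	/* reset on every retry */
		seq = read_seqbegin(&root->fs_info->profiles_lock);
		/* ... OR the avail_*_alloc_bits into flags ... */
	} while (read_seqretry(&root->fs_info->profiles_lock, seq));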
@@ -3971,7 +3975,7 @@ static int can_overcommit(struct btrfs_root *root,
 }
 
 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
-					 unsigned long nr_pages)
+					 unsigned long nr_pages, int nr_items)
 {
 	struct super_block *sb = root->fs_info->sb;
 
@@ -3986,9 +3990,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 		 * the filesystem is readonly(all dirty pages are written to
 		 * the disk).
 		 */
-		btrfs_start_delalloc_roots(root->fs_info, 0);
+		btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
 		if (!current->journal_info)
-			btrfs_wait_ordered_roots(root->fs_info, -1);
+			btrfs_wait_ordered_roots(root->fs_info, nr_items);
 	}
 }
 
@@ -4045,7 +4049,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	while (delalloc_bytes && loops < 3) {
 		max_reclaim = min(delalloc_bytes, to_reclaim);
 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-		btrfs_writeback_inodes_sb_nr(root, nr_pages);
+		btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
 		/*
 		 * We need to wait for the async pages to actually start before
 		 * we do anything.
@@ -4112,13 +4116,9 @@ static int may_commit_transaction(struct btrfs_root *root,
 		goto commit;
 
 	/* See if there is enough pinned space to make this reservation */
-	spin_lock(&space_info->lock);
 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
-				   bytes) >= 0) {
-		spin_unlock(&space_info->lock);
+				   bytes) >= 0)
 		goto commit;
-	}
-	spin_unlock(&space_info->lock);
 
 	/*
 	 * See if there is some space in the delayed insertion reservation for
@@ -4127,16 +4127,13 @@ static int may_commit_transaction(struct btrfs_root *root,
 	if (space_info != delayed_rsv->space_info)
 		return -ENOSPC;
 
-	spin_lock(&space_info->lock);
 	spin_lock(&delayed_rsv->lock);
 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
 				   bytes - delayed_rsv->size) >= 0) {
 		spin_unlock(&delayed_rsv->lock);
-		spin_unlock(&space_info->lock);
 		return -ENOSPC;
 	}
 	spin_unlock(&delayed_rsv->lock);
-	spin_unlock(&space_info->lock);
 
 commit:
 	trans = btrfs_join_transaction(root);
@@ -4181,7 +4178,7 @@ static int flush_space(struct btrfs_root *root,
 		break;
 	case FLUSH_DELALLOC:
 	case FLUSH_DELALLOC_WAIT:
-		shrink_delalloc(root, num_bytes, orig_bytes,
+		shrink_delalloc(root, num_bytes * 2, orig_bytes,
 				state == FLUSH_DELALLOC_WAIT);
 		break;
 	case ALLOC_CHUNK:
@@ -5477,7 +5474,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_space_info *space_info;
 
-	down_write(&fs_info->extent_commit_sem);
+	down_write(&fs_info->commit_root_sem);
 
 	list_for_each_entry_safe(caching_ctl, next,
 				 &fs_info->caching_block_groups, list) {
@@ -5496,7 +5493,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 	else
 		fs_info->pinned_extents = &fs_info->freed_extents[0];
 
-	up_write(&fs_info->extent_commit_sem);
+	up_write(&fs_info->commit_root_sem);
 
 	list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
 		percpu_counter_set(&space_info->total_bytes_pinned, 0);
@@ -5725,6 +5722,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 
 	if (ret > 0 && skinny_metadata) {
 		skinny_metadata = false;
+		key.objectid = bytenr;
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = num_bytes;
 		btrfs_release_path(path);
@@ -5751,6 +5749,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5751 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5749 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
5752 bytenr, parent, root_objectid, owner_objectid, 5750 bytenr, parent, root_objectid, owner_objectid,
5753 owner_offset); 5751 owner_offset);
5752 btrfs_abort_transaction(trans, extent_root, ret);
5753 goto out;
5754 } else { 5754 } else {
5755 btrfs_abort_transaction(trans, extent_root, ret); 5755 btrfs_abort_transaction(trans, extent_root, ret);
5756 goto out; 5756 goto out;
@@ -8262,14 +8262,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	struct btrfs_caching_control *caching_ctl;
 	struct rb_node *n;
 
-	down_write(&info->extent_commit_sem);
+	down_write(&info->commit_root_sem);
 	while (!list_empty(&info->caching_block_groups)) {
 		caching_ctl = list_entry(info->caching_block_groups.next,
 					 struct btrfs_caching_control, list);
 		list_del(&caching_ctl->list);
 		put_caching_control(caching_ctl);
 	}
-	up_write(&info->extent_commit_sem);
+	up_write(&info->commit_root_sem);
 
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
@@ -8343,9 +8343,15 @@ static void __link_block_group(struct btrfs_space_info *space_info,
 				      struct btrfs_block_group_cache *cache)
 {
 	int index = get_block_group_index(cache);
+	bool first = false;
 
 	down_write(&space_info->groups_sem);
-	if (list_empty(&space_info->block_groups[index])) {
+	if (list_empty(&space_info->block_groups[index]))
+		first = true;
+	list_add_tail(&cache->list, &space_info->block_groups[index]);
+	up_write(&space_info->groups_sem);
+
+	if (first) {
 		struct kobject *kobj = &space_info->block_group_kobjs[index];
 		int ret;
 
@@ -8357,8 +8363,6 @@ static void __link_block_group(struct btrfs_space_info *space_info,
 			kobject_put(&space_info->kobj);
 		}
 	}
-	list_add_tail(&cache->list, &space_info->block_groups[index]);
-	up_write(&space_info->groups_sem);
 }
 
 static struct btrfs_block_group_cache *
@@ -8938,3 +8942,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 	range->len = trimmed;
 	return ret;
 }
+
+/*
+ * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
+ * they are used to prevent the some tasks writing data into the page cache
+ * by nocow before the subvolume is snapshoted, but flush the data into
+ * the disk after the snapshot creation.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
+{
+	percpu_counter_dec(&root->subv_writers->counter);
+	/*
+	 * Make sure counter is updated before we wake up
+	 * waiters.
+	 */
+	smp_mb();
+	if (waitqueue_active(&root->subv_writers->wait))
+		wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+	if (unlikely(atomic_read(&root->will_be_snapshoted)))
+		return 0;
+
+	percpu_counter_inc(&root->subv_writers->counter);
+	/*
+	 * Make sure counter is updated before we check for snapshot creation.
+	 */
+	smp_mb();
+	if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+		btrfs_end_nocow_write(root);
+		return 0;
+	}
+	return 1;
+}
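
A sketch of how a writer is expected to use this gate; this is illustrative only, though check_can_nocow() in fs/btrfs/file.c, changed later in this diff, follows exactly this shape:

static int example_nocow_writer(struct btrfs_root *root)
{
	if (!btrfs_start_nocow_write(root))
		return -ENOSPC;	/* snapshot pending: fall back to COW */

	/* ... nocow data into the page cache ... */

	btrfs_end_nocow_write(root);	/* unblock a waiting snapshot */
	return 0;
}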
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..3955e475ceec 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
 	}
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+static struct rb_node *tree_insert(struct rb_root *root,
+				   struct rb_node *search_start,
+				   u64 offset,
 				   struct rb_node *node,
 				   struct rb_node ***p_in,
 				   struct rb_node **parent_in)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct tree_entry *entry;
 
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 		goto do_insert;
 	}
 
+	p = search_start ? &search_start : &root->rb_node;
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
 
 	set_state_bits(tree, state, bits);
 
-	node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
+	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 	prealloc->state = orig->state;
 	orig->start = split;
 
-	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node,
-			   NULL, NULL);
+	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+			   &prealloc->rb_node, NULL, NULL);
 	if (node) {
 		free_extent_state(prealloc);
 		return -EEXIST;
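
The extra search_start parameter lets a caller that already holds a node adjacent to the insertion point start the rbtree descent there instead of at the root; `p = search_start ? &search_start : &root->rb_node` simply seeds the usual `while (*p)` walk. The two call sites, repeated here for contrast rather than as new code, show both modes:

	/* no hint: descend from the root, as before */
	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);

	/* split_state(): prealloc's slot is adjacent to orig, so start there */
	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);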
@@ -746,6 +749,7 @@ again:
 	 * our range starts
 	 */
 	node = tree_search(tree, start);
+process_node:
 	if (!node)
 		break;
 
@@ -766,7 +770,10 @@ again:
 		if (start > end)
 			break;
 
-		cond_resched_lock(&tree->lock);
+		if (!cond_resched_lock(&tree->lock)) {
+			node = rb_next(node);
+			goto process_node;
+		}
 	}
 out:
 	spin_unlock(&tree->lock);
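
cond_resched_lock() returns nonzero only when it actually dropped the lock to reschedule, and the hunk above exploits that: if the lock was held throughout, `node` is still valid and a cheap rb_next() continues the walk; only when the lock was dropped does control fall back to the tree_search() at the top, since the tree may have changed in the meantime. In outline, as an illustration:

	if (!cond_resched_lock(&tree->lock)) {
		node = rb_next(node);	/* lock never dropped: node valid */
		goto process_node;
	}
	/* lock was dropped: loop around and re-search from 'start' */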
@@ -2757,7 +2764,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
 
 	if (em_cached && *em_cached) {
 		em = *em_cached;
-		if (em->in_tree && start >= em->start &&
+		if (extent_map_in_tree(em) && start >= em->start &&
 		    start < extent_map_end(em)) {
 			atomic_inc(&em->refs);
 			return em;
@@ -4303,7 +4310,7 @@ static void __free_extent_buffer(struct extent_buffer *eb)
 	kmem_cache_free(extent_buffer_cache, eb);
 }
 
-static int extent_buffer_under_io(struct extent_buffer *eb)
+int extent_buffer_under_io(struct extent_buffer *eb)
 {
 	return (atomic_read(&eb->io_pages) ||
 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 58b27e5ab521..c488b45237bf 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -320,6 +320,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_buffer *eb);
 int extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_under_io(struct extent_buffer *eb);
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 			      unsigned long min_len, char **map,
 			      unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..1874aee69c86 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
 	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
 	if (!em)
 		return NULL;
-	em->in_tree = 0;
+	RB_CLEAR_NODE(&em->rb_node);
 	em->flags = 0;
 	em->compress_type = BTRFS_COMPRESS_NONE;
 	em->generation = 0;
@@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em)
 		return;
 	WARN_ON(atomic_read(&em->refs) == 0);
 	if (atomic_dec_and_test(&em->refs)) {
-		WARN_ON(em->in_tree);
+		WARN_ON(extent_map_in_tree(em));
 		WARN_ON(!list_empty(&em->list));
 		kmem_cache_free(extent_map_cache, em);
 	}
@@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
 		parent = *p;
 		entry = rb_entry(parent, struct extent_map, rb_node);
 
-		WARN_ON(!entry->in_tree);
-
 		if (em->start < entry->start)
 			p = &(*p)->rb_left;
 		else if (em->start >= extent_map_end(entry))
@@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
 	if (end > entry->start && em->start < extent_map_end(entry))
 		return -EEXIST;
 
-	em->in_tree = 1;
 	rb_link_node(&em->rb_node, orig_parent, p);
 	rb_insert_color(&em->rb_node, root);
 	return 0;
@@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 		prev = n;
 		prev_entry = entry;
 
-		WARN_ON(!entry->in_tree);
-
 		if (offset < entry->start)
 			n = n->rb_left;
 		else if (offset >= extent_map_end(entry))
@@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->len += merge->len;
 			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
-			merge->in_tree = 0;
 			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
 			em->mod_start = merge->mod_start;
 			em->generation = max(em->generation, merge->generation);
 
 			rb_erase(&merge->rb_node, &tree->map);
+			RB_CLEAR_NODE(&merge->rb_node);
 			free_extent_map(merge);
 		}
 	}
@@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		em->len += merge->len;
 		em->block_len += merge->block_len;
 		rb_erase(&merge->rb_node, &tree->map);
-		merge->in_tree = 0;
+		RB_CLEAR_NODE(&merge->rb_node);
 		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
 		em->generation = max(em->generation, merge->generation);
 		free_extent_map(merge);
@@ -319,7 +314,21 @@ out:
 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
 {
 	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
-	if (em->in_tree)
+	if (extent_map_in_tree(em))
+		try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+					struct extent_map *em,
+					int modified)
+{
+	atomic_inc(&em->refs);
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
+	if (modified)
+		list_move(&em->list, &tree->modified_extents);
+	else
 		try_merge_map(tree, em);
 }
 
@@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	if (ret)
 		goto out;
 
-	atomic_inc(&em->refs);
-
-	em->mod_start = em->start;
-	em->mod_len = em->len;
-
-	if (modified)
-		list_move(&em->list, &tree->modified_extents);
-	else
-		try_merge_map(tree, em);
+	setup_extent_mapping(tree, em, modified);
 out:
 	return ret;
 }
@@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	rb_erase(&em->rb_node, &tree->map);
 	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
 		list_del_init(&em->list);
-	em->in_tree = 0;
+	RB_CLEAR_NODE(&em->rb_node);
 	return ret;
 }
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+			    struct extent_map *cur,
+			    struct extent_map *new,
+			    int modified)
+{
+	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+	ASSERT(extent_map_in_tree(cur));
+	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+		list_del_init(&cur->list);
+	rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+	RB_CLEAR_NODE(&cur->rb_node);
+
+	setup_extent_mapping(tree, new, modified);
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f8..e7fd8a56a140 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -33,7 +33,6 @@ struct extent_map {
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
-	unsigned int in_tree;
 	unsigned int compress_type;
 	struct list_head list;
 };
@@ -44,6 +43,11 @@ struct extent_map_tree {
 	rwlock_t lock;
 };
 
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+	return !RB_EMPTY_NODE(&em->rb_node);
+}
+
 static inline u64 extent_map_end(struct extent_map *em)
 {
 	if (em->start + em->len < em->start)
@@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em, int modified);
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+			    struct extent_map *cur,
+			    struct extent_map *new,
+			    int modified);
 
 struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
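
extent_map_in_tree() replaces the hand-maintained in_tree flag with the rbtree's own bookkeeping: a node cleared with RB_CLEAR_NODE() reports true from RB_EMPTY_NODE() until rb_link_node()/rb_insert_color() links it in. A small usage sketch, illustrative only and assuming the caller write-holds tree->lock as the real callers do:

static void example(struct extent_map_tree *tree)
{
	struct extent_map *em = alloc_extent_map();

	if (!em)
		return;
	/* freshly allocated: rb_node was RB_CLEAR_NODE'd */
	ASSERT(!extent_map_in_tree(em));

	if (add_extent_mapping(tree, em, 0) == 0)
		ASSERT(extent_map_in_tree(em));	/* now linked */

	free_extent_map(em);	/* drop our reference */
}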
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b8672f09..ae6af072b635 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 		struct page *page = prepared_pages[pg];
 		/*
 		 * Copy data from userspace to the current page
-		 *
-		 * Disable pagefault to avoid recursive lock since
-		 * the pages are already locked
 		 */
-		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
-		pagefault_enable();
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
@@ -591,7 +586,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		clear_bit(EXTENT_FLAG_LOGGING, &flags);
 		modified = !list_empty(&em->list);
-		remove_extent_mapping(em_tree, em);
 		if (no_splits)
 			goto next;
 
@@ -622,8 +616,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->bdev = em->bdev;
 			split->flags = flags;
 			split->compress_type = em->compress_type;
-			ret = add_extent_mapping(em_tree, split, modified);
-			BUG_ON(ret); /* Logic error */
+			replace_extent_mapping(em_tree, em, split, modified);
 			free_extent_map(split);
 			split = split2;
 			split2 = NULL;
@@ -661,12 +654,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 				split->orig_block_len = 0;
 			}
 
-			ret = add_extent_mapping(em_tree, split, modified);
-			BUG_ON(ret); /* Logic error */
+			if (extent_map_in_tree(em)) {
+				replace_extent_mapping(em_tree, em, split,
+						       modified);
+			} else {
+				ret = add_extent_mapping(em_tree, split,
+							 modified);
+				ASSERT(ret == 0); /* Logic error */
+			}
 			free_extent_map(split);
 			split = NULL;
 		}
 next:
+		if (extent_map_in_tree(em))
+			remove_extent_mapping(em_tree, em);
 		write_unlock(&em_tree->lock);
 
 		/* once for us */
@@ -720,7 +721,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
-	if (start >= BTRFS_I(inode)->disk_i_size)
+	if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
 		modify_tree = 0;
 
 	while (1) {
@@ -798,7 +799,10 @@ next_slot:
 		 */
 		if (start > key.offset && end < extent_end) {
 			BUG_ON(del_nr > 0);
-			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.offset = start;
@@ -841,7 +845,10 @@ next_slot:
 		 * | -------- extent -------- |
 		 */
 		if (start <= key.offset && end < extent_end) {
-			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.offset = end;
@@ -864,7 +871,10 @@ next_slot:
 		 */
 		if (start > key.offset && end >= extent_end) {
 			BUG_ON(del_nr > 0);
-			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
 
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
@@ -938,34 +948,42 @@ next_slot:
 		 * Set path->slots[0] to first slot, so that after the delete
 		 * if items are move off from our leaf to its immediate left or
 		 * right neighbor leafs, we end up with a correct and adjusted
-		 * path->slots[0] for our insertion.
+		 * path->slots[0] for our insertion (if replace_extent != 0).
 		 */
 		path->slots[0] = del_slot;
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 		if (ret)
 			btrfs_abort_transaction(trans, root, ret);
+	}
 
 	leaf = path->nodes[0];
 	/*
-	 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
-	 * is, its contents got pushed to its neighbors), in which case
-	 * it means path->locks[0] == 0
+	 * If btrfs_del_items() was called, it might have deleted a leaf, in
+	 * which case it unlocked our path, so check path->locks[0] matches a
+	 * write lock.
 	 */
 	if (!ret && replace_extent && leafs_visited == 1 &&
-	    path->locks[0] &&
-	    btrfs_leaf_free_space(root, leaf) >=
-	    sizeof(struct btrfs_item) + extent_item_size) {
-
-		key.objectid = ino;
-		key.type = BTRFS_EXTENT_DATA_KEY;
-		key.offset = start;
-		setup_items_for_insert(root, path, &key,
-				       &extent_item_size,
-				       extent_item_size,
-				       sizeof(struct btrfs_item) +
-				       extent_item_size, 1);
-		*key_inserted = 1;
+	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+	     path->locks[0] == BTRFS_WRITE_LOCK) &&
+	    btrfs_leaf_free_space(root, leaf) >=
+	    sizeof(struct btrfs_item) + extent_item_size) {
+
+		key.objectid = ino;
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = start;
+		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+			struct btrfs_key slot_key;
+
+			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+				path->slots[0]++;
+		}
+		setup_items_for_insert(root, path, &key,
+				       &extent_item_size,
+				       extent_item_size,
+				       sizeof(struct btrfs_item) +
+				       extent_item_size, 1);
+		*key_inserted = 1;
 	}
 
 	if (!replace_extent || !(*key_inserted))
@@ -1346,11 +1364,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
 		struct btrfs_ordered_extent *ordered;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree,
 				 start_pos, last_pos, 0, cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
+		ordered = btrfs_lookup_ordered_range(inode, start_pos,
+						     last_pos - start_pos + 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset <= last_pos) {
-			btrfs_put_ordered_extent(ordered);
 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 					     start_pos, last_pos,
 					     cached_state, GFP_NOFS);
@@ -1358,12 +1376,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
 			}
-			ret = btrfs_wait_ordered_range(inode, start_pos,
-						       last_pos - start_pos + 1);
-			if (ret)
-				return ret;
-			else
-				return -EAGAIN;
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			return -EAGAIN;
 		}
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
@@ -1396,8 +1411,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
 	u64 num_bytes;
 	int ret;
 
+	ret = btrfs_start_nocow_write(root);
+	if (!ret)
+		return -ENOSPC;
+
 	lockstart = round_down(pos, root->sectorsize);
-	lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+	lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
 
 	while (1) {
 		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1434,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
 	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
 	if (ret <= 0) {
 		ret = 0;
+		btrfs_end_nocow_write(root);
 	} else {
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
-				 NULL, GFP_NOFS);
-		*write_bytes = min_t(size_t, *write_bytes, num_bytes);
+		*write_bytes = min_t(size_t, *write_bytes ,
+				     num_bytes - pos + lockstart);
 	}
 
 	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1510,6 +1527,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 			if (!only_release_metadata)
 				btrfs_free_reserved_data_space(inode,
 							       reserve_bytes);
+			else
+				btrfs_end_nocow_write(root);
 			break;
 		}
 
@@ -1598,6 +1617,9 @@ again:
 		}
 
 		release_bytes = 0;
+		if (only_release_metadata)
+			btrfs_end_nocow_write(root);
+
 		if (only_release_metadata && copied > 0) {
 			u64 lockstart = round_down(pos, root->sectorsize);
 			u64 lockend = lockstart +
@@ -1624,10 +1646,12 @@ again:
 	kfree(pages);
 
 	if (release_bytes) {
-		if (only_release_metadata)
+		if (only_release_metadata) {
+			btrfs_end_nocow_write(root);
 			btrfs_delalloc_release_metadata(inode, release_bytes);
-		else
+		} else {
 			btrfs_delalloc_release_space(inode, release_bytes);
+		}
 	}
 
 	return num_written ? num_written : ret;
@@ -1636,7 +1660,7 @@ again:
 static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs, loff_t pos,
-				    loff_t *ppos, size_t count, size_t ocount)
+				    size_t count, size_t ocount)
 {
 	struct file *file = iocb->ki_filp;
 	struct iov_iter i;
@@ -1645,7 +1669,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 	loff_t endbyte;
 	int err;
 
-	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
+	written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
 					    count, ocount);
 
 	if (written < 0 || written == count)
@@ -1664,7 +1688,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 	if (err)
 		goto out;
 	written += written_buffered;
-	*ppos = pos + written_buffered;
+	iocb->ki_pos = pos + written_buffered;
 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
 				 endbyte >> PAGE_CACHE_SHIFT);
 out:
@@ -1696,8 +1720,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	loff_t *ppos = &iocb->ki_pos;
 	u64 start_pos;
+	u64 end_pos;
 	ssize_t num_written = 0;
 	ssize_t err = 0;
 	size_t count, ocount;
@@ -1752,7 +1776,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 	start_pos = round_down(pos, root->sectorsize);
 	if (start_pos > i_size_read(inode)) {
-		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
+		/* Expand hole size to cover write data, preventing empty gap */
+		end_pos = round_up(pos + count, root->sectorsize);
+		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
 		if (err) {
 			mutex_unlock(&inode->i_mutex);
 			goto out;
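
To see what the rounded-up end buys, take a hypothetical file with sectorsize 4096 and i_size 4096, and a 100-byte write at offset 10000: start_pos = round_down(10000, 4096) = 8192 lies beyond i_size, so the hole must be expanded. Expanding only to start_pos left [8192, 10000) between the expanded hole and the first written byte as an empty gap, whereas end_pos = round_up(10000 + 100, 4096) = 12288 makes the expansion cover the whole write range before the data lands.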
@@ -1764,7 +1790,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 	if (unlikely(file->f_flags & O_DIRECT)) {
 		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
-						   pos, ppos, count, ocount);
+						   pos, count, ocount);
 	} else {
 		struct iov_iter i;
 
@@ -1772,7 +1798,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 		num_written = __btrfs_buffered_write(file, &i, pos);
 		if (num_written > 0)
-			*ppos = pos + num_written;
+			iocb->ki_pos = pos + num_written;
 	}
 
 	mutex_unlock(&inode->i_mutex);
@@ -1797,7 +1823,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	if (num_written > 0) {
 		err = generic_write_sync(file, pos, num_written);
-		if (err < 0 && num_written > 0)
+		if (err < 0)
 			num_written = err;
 	}
 
@@ -1856,8 +1882,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_log_ctx ctx;
+	int ret = 0;
 	bool full_sync = 0;
 
 	trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1978,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	}
 	trans->sync = true;
 
-	ret = btrfs_log_dentry_safe(trans, root, dentry);
+	btrfs_init_log_ctx(&ctx);
+
+	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
 	if (ret < 0) {
 		/* Fallthrough and commit/free transaction. */
 		ret = 1;
@@ -1971,7 +2000,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1971 2000
1972 if (ret != BTRFS_NO_LOG_SYNC) { 2001 if (ret != BTRFS_NO_LOG_SYNC) {
1973 if (!ret) { 2002 if (!ret) {
1974 ret = btrfs_sync_log(trans, root); 2003 ret = btrfs_sync_log(trans, root, &ctx);
1975 if (!ret) { 2004 if (!ret) {
1976 ret = btrfs_end_transaction(trans, root); 2005 ret = btrfs_end_transaction(trans, root);
1977 goto out; 2006 goto out;
@@ -1993,6 +2022,7 @@ out:
1993 2022
1994static const struct vm_operations_struct btrfs_file_vm_ops = { 2023static const struct vm_operations_struct btrfs_file_vm_ops = {
1995 .fault = filemap_fault, 2024 .fault = filemap_fault,
2025 .map_pages = filemap_map_pages,
1996 .page_mkwrite = btrfs_page_mkwrite, 2026 .page_mkwrite = btrfs_page_mkwrite,
1997 .remap_pages = generic_file_remap_pages, 2027 .remap_pages = generic_file_remap_pages,
1998}; 2028};
@@ -2157,6 +2187,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2157 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2187 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2158 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2188 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2159 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2189 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2190 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2160 2191
2161 ret = btrfs_wait_ordered_range(inode, offset, len); 2192 ret = btrfs_wait_ordered_range(inode, offset, len);
2162 if (ret) 2193 if (ret)
@@ -2172,14 +2203,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2172 * entire page. 2203 * entire page.
2173 */ 2204 */
2174 if (same_page && len < PAGE_CACHE_SIZE) { 2205 if (same_page && len < PAGE_CACHE_SIZE) {
2175 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) 2206 if (offset < ino_size)
2176 ret = btrfs_truncate_page(inode, offset, len, 0); 2207 ret = btrfs_truncate_page(inode, offset, len, 0);
2177 mutex_unlock(&inode->i_mutex); 2208 mutex_unlock(&inode->i_mutex);
2178 return ret; 2209 return ret;
2179 } 2210 }
2180 2211
2181 /* zero back part of the first page */ 2212 /* zero back part of the first page */
2182 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2213 if (offset < ino_size) {
2183 ret = btrfs_truncate_page(inode, offset, 0, 0); 2214 ret = btrfs_truncate_page(inode, offset, 0, 0);
2184 if (ret) { 2215 if (ret) {
2185 mutex_unlock(&inode->i_mutex); 2216 mutex_unlock(&inode->i_mutex);
@@ -2188,7 +2219,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2188 } 2219 }
2189 2220
2190 /* zero the front end of the last page */ 2221 /* zero the front end of the last page */
2191 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2222 if (offset + len < ino_size) {
2192 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2223 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2193 if (ret) { 2224 if (ret) {
2194 mutex_unlock(&inode->i_mutex); 2225 mutex_unlock(&inode->i_mutex);
@@ -2277,10 +2308,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2277 2308
2278 trans->block_rsv = &root->fs_info->trans_block_rsv; 2309 trans->block_rsv = &root->fs_info->trans_block_rsv;
2279 2310
2280 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2311 if (cur_offset < ino_size) {
2281 if (ret) { 2312 ret = fill_holes(trans, inode, path, cur_offset,
2282 err = ret; 2313 drop_end);
2283 break; 2314 if (ret) {
2315 err = ret;
2316 break;
2317 }
2284 } 2318 }
2285 2319
2286 cur_offset = drop_end; 2320 cur_offset = drop_end;
@@ -2313,10 +2347,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2313 } 2347 }
2314 2348
2315 trans->block_rsv = &root->fs_info->trans_block_rsv; 2349 trans->block_rsv = &root->fs_info->trans_block_rsv;
2316 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2350 if (cur_offset < ino_size) {
2317 if (ret) { 2351 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2318 err = ret; 2352 if (ret) {
2319 goto out_trans; 2353 err = ret;
2354 goto out_trans;
2355 }
2320 } 2356 }
2321 2357
2322out_trans: 2358out_trans:
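
In btrfs_punch_hole the repeated round_up(inode->i_size, PAGE_CACHE_SIZE) is
hoisted into ino_size, and the fill_holes calls are now skipped once
cur_offset passes it, since there is nothing to fill beyond the last page
backed by i_size. A small sketch of the partial-page boundary checks with
illustrative values (PAGE_SIZE standing in for PAGE_CACHE_SIZE):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL
#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1))

int main(void)
{
	uint64_t i_size = 10000, offset = 500, len = 2000;
	uint64_t ino_size = round_up(i_size, PAGE_SIZE);
	int same_page = (offset / PAGE_SIZE) == ((offset + len - 1) / PAGE_SIZE);

	/* Zeroing past the last page backed by i_size is pointless. */
	printf("same_page=%d\n", same_page);
	printf("zero head page: %s\n", offset < ino_size ? "yes" : "no");
	printf("zero tail page: %s\n", offset + len < ino_size ? "yes" : "no");
	return 0;
}
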
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ab485e57b6fe..86935f5ae291 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -55,7 +55,7 @@ static int caching_kthread(void *data)
55 key.type = BTRFS_INODE_ITEM_KEY; 55 key.type = BTRFS_INODE_ITEM_KEY;
56again: 56again:
57 /* need to make sure the commit_root doesn't disappear */ 57 /* need to make sure the commit_root doesn't disappear */
58 mutex_lock(&root->fs_commit_mutex); 58 down_read(&fs_info->commit_root_sem);
59 59
60 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 60 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
61 if (ret < 0) 61 if (ret < 0)
@@ -88,7 +88,7 @@ again:
88 btrfs_item_key_to_cpu(leaf, &key, 0); 88 btrfs_item_key_to_cpu(leaf, &key, 0);
89 btrfs_release_path(path); 89 btrfs_release_path(path);
90 root->cache_progress = last; 90 root->cache_progress = last;
91 mutex_unlock(&root->fs_commit_mutex); 91 up_read(&fs_info->commit_root_sem);
92 schedule_timeout(1); 92 schedule_timeout(1);
93 goto again; 93 goto again;
94 } else 94 } else
@@ -127,7 +127,7 @@ next:
127 btrfs_unpin_free_ino(root); 127 btrfs_unpin_free_ino(root);
128out: 128out:
129 wake_up(&root->cache_wait); 129 wake_up(&root->cache_wait);
130 mutex_unlock(&root->fs_commit_mutex); 130 up_read(&fs_info->commit_root_sem);
131 131
132 btrfs_free_path(path); 132 btrfs_free_path(path);
133 133
@@ -176,7 +176,11 @@ static void start_caching(struct btrfs_root *root)
176 176
177 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", 177 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
178 root->root_key.objectid); 178 root->root_key.objectid);
179 BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ 179 if (IS_ERR(tsk)) {
180 btrfs_warn(root->fs_info, "failed to start inode caching task");
181 btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
182 "disabling inode map caching");
183 }
180} 184}
181 185
182int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) 186int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -205,42 +209,28 @@ again:
205 209
206void btrfs_return_ino(struct btrfs_root *root, u64 objectid) 210void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
207{ 211{
208 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
209 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; 212 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
210 213
211 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 214 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
212 return; 215 return;
213
214again: 216again:
215 if (root->cached == BTRFS_CACHE_FINISHED) { 217 if (root->cached == BTRFS_CACHE_FINISHED) {
216 __btrfs_add_free_space(ctl, objectid, 1); 218 __btrfs_add_free_space(pinned, objectid, 1);
217 } else { 219 } else {
218 /* 220 down_write(&root->fs_info->commit_root_sem);
219 * If we are in the process of caching free ino chunks,
220 * to avoid adding the same inode number to the free_ino
221 * tree twice due to cross transaction, we'll leave it
222 * in the pinned tree until a transaction is committed
223 * or the caching work is done.
224 */
225
226 mutex_lock(&root->fs_commit_mutex);
227 spin_lock(&root->cache_lock); 221 spin_lock(&root->cache_lock);
228 if (root->cached == BTRFS_CACHE_FINISHED) { 222 if (root->cached == BTRFS_CACHE_FINISHED) {
229 spin_unlock(&root->cache_lock); 223 spin_unlock(&root->cache_lock);
230 mutex_unlock(&root->fs_commit_mutex); 224 up_write(&root->fs_info->commit_root_sem);
231 goto again; 225 goto again;
232 } 226 }
233 spin_unlock(&root->cache_lock); 227 spin_unlock(&root->cache_lock);
234 228
235 start_caching(root); 229 start_caching(root);
236 230
237 if (objectid <= root->cache_progress || 231 __btrfs_add_free_space(pinned, objectid, 1);
238 objectid >= root->highest_objectid)
239 __btrfs_add_free_space(ctl, objectid, 1);
240 else
241 __btrfs_add_free_space(pinned, objectid, 1);
242 232
243 mutex_unlock(&root->fs_commit_mutex); 233 up_write(&root->fs_info->commit_root_sem);
244 } 234 }
245} 235}
246 236
@@ -250,7 +240,7 @@ again:
250 * and others will just be dropped, because the commit root we were 240 * and others will just be dropped, because the commit root we were
251 * searching has changed. 241 * searching has changed.
252 * 242 *
253 * Must be called with root->fs_commit_mutex held 243 * Must be called with root->fs_info->commit_root_sem held
254 */ 244 */
255void btrfs_unpin_free_ino(struct btrfs_root *root) 245void btrfs_unpin_free_ino(struct btrfs_root *root)
256{ 246{
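
The inode-map changes retire the per-root fs_commit_mutex in favor of the
shared commit_root_sem: the caching kthread only needs the commit root held
stable, so it takes the semaphore for reading, while btrfs_return_ino excludes
the cachers by taking it for writing. A pthread sketch of that reader/writer
split (the names are illustrative, not the kernel API):

#include <pthread.h>

static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Several cachers may scan the commit root concurrently. */
static void *caching_thread(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&commit_root_sem);
	/* ... walk the commit root, as caching_kthread() does ... */
	pthread_rwlock_unlock(&commit_root_sem);
	return NULL;
}

/* Pinning a freed inode number excludes all readers. */
static void return_ino(void)
{
	pthread_rwlock_wrlock(&commit_root_sem);
	/* ... add the objectid to the pinned tree ... */
	pthread_rwlock_unlock(&commit_root_sem);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, caching_thread, NULL);
	return_ino();
	pthread_join(t, NULL);
	return 0;
}
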
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3d44486290b..5f805bc944fa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -394,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode,
394 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 394 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
395 btrfs_add_inode_defrag(NULL, inode); 395 btrfs_add_inode_defrag(NULL, inode);
396 396
397 /*
 398 * skip compression for a small file range (<= blocksize) that
 399 * isn't an inline extent, since it doesn't save disk space at all.
400 */
401 if ((end - start + 1) <= blocksize &&
402 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
403 goto cleanup_and_bail_uncompressed;
404
397 actual_end = min_t(u64, isize, end + 1); 405 actual_end = min_t(u64, isize, end + 1);
398again: 406again:
399 will_compress = 0; 407 will_compress = 0;
@@ -864,7 +872,8 @@ static noinline int cow_file_range(struct inode *inode,
864 872
865 if (btrfs_is_free_space_inode(inode)) { 873 if (btrfs_is_free_space_inode(inode)) {
866 WARN_ON_ONCE(1); 874 WARN_ON_ONCE(1);
867 return -EINVAL; 875 ret = -EINVAL;
876 goto out_unlock;
868 } 877 }
869 878
870 num_bytes = ALIGN(end - start + 1, blocksize); 879 num_bytes = ALIGN(end - start + 1, blocksize);
@@ -1075,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1075 async_cow->end = cur_end; 1084 async_cow->end = cur_end;
1076 INIT_LIST_HEAD(&async_cow->extents); 1085 INIT_LIST_HEAD(&async_cow->extents);
1077 1086
1078 async_cow->work.func = async_cow_start; 1087 btrfs_init_work(&async_cow->work, async_cow_start,
1079 async_cow->work.ordered_func = async_cow_submit; 1088 async_cow_submit, async_cow_free);
1080 async_cow->work.ordered_free = async_cow_free;
1081 async_cow->work.flags = 0;
1082 1089
1083 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1090 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1084 PAGE_CACHE_SHIFT; 1091 PAGE_CACHE_SHIFT;
1085 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1092 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1086 1093
1087 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1094 btrfs_queue_work(root->fs_info->delalloc_workers,
1088 &async_cow->work); 1095 &async_cow->work);
1089 1096
1090 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1097 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1091 wait_event(root->fs_info->async_submit_wait, 1098 wait_event(root->fs_info->async_submit_wait,
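
Several hunks in this series replace open-coded assignments to work.func,
work.ordered_func, work.ordered_free and work.flags with a single
btrfs_init_work() call, and btrfs_queue_worker() with btrfs_queue_work(). A
sketch of the helper's likely shape, inferred purely from these call sites
(the real definition lives in the reworked async-thread.{c,h}):

/* Sketch only; field names are inferred from the call sites above. */
struct btrfs_work;
typedef void (*btrfs_work_func_t)(struct btrfs_work *work);

struct btrfs_work {
	btrfs_work_func_t func;          /* the main work */
	btrfs_work_func_t ordered_func;  /* runs in queue order, may be NULL */
	btrfs_work_func_t ordered_free;  /* frees the item, may be NULL */
	unsigned long flags;
};

static inline void btrfs_init_work(struct btrfs_work *work,
				   btrfs_work_func_t func,
				   btrfs_work_func_t ordered_func,
				   btrfs_work_func_t ordered_free)
{
	work->func = func;
	work->ordered_func = ordered_func;
	work->ordered_free = ordered_free;
	work->flags = 0;
}
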
@@ -1272,6 +1279,15 @@ next_slot:
1272 disk_bytenr += cur_offset - found_key.offset; 1279 disk_bytenr += cur_offset - found_key.offset;
1273 num_bytes = min(end + 1, extent_end) - cur_offset; 1280 num_bytes = min(end + 1, extent_end) - cur_offset;
1274 /* 1281 /*
1282 * if there are pending snapshots for this root,
 1283 * we fall back to the common COW path.
1284 */
1285 if (!nolock) {
1286 err = btrfs_start_nocow_write(root);
1287 if (!err)
1288 goto out_check;
1289 }
1290 /*
1275 * force cow if csum exists in the range. 1291 * force cow if csum exists in the range.
 1276 * this ensures that csums for a given extent are 1292 * this ensures that csums for a given extent are
1277 * either valid or do not exist. 1293 * either valid or do not exist.
@@ -1290,6 +1306,8 @@ next_slot:
1290out_check: 1306out_check:
1291 if (extent_end <= start) { 1307 if (extent_end <= start) {
1292 path->slots[0]++; 1308 path->slots[0]++;
1309 if (!nolock && nocow)
1310 btrfs_end_nocow_write(root);
1293 goto next_slot; 1311 goto next_slot;
1294 } 1312 }
1295 if (!nocow) { 1313 if (!nocow) {
@@ -1307,8 +1325,11 @@ out_check:
1307 ret = cow_file_range(inode, locked_page, 1325 ret = cow_file_range(inode, locked_page,
1308 cow_start, found_key.offset - 1, 1326 cow_start, found_key.offset - 1,
1309 page_started, nr_written, 1); 1327 page_started, nr_written, 1);
1310 if (ret) 1328 if (ret) {
1329 if (!nolock && nocow)
1330 btrfs_end_nocow_write(root);
1311 goto error; 1331 goto error;
1332 }
1312 cow_start = (u64)-1; 1333 cow_start = (u64)-1;
1313 } 1334 }
1314 1335
@@ -1355,8 +1376,11 @@ out_check:
1355 BTRFS_DATA_RELOC_TREE_OBJECTID) { 1376 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1356 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1377 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1357 num_bytes); 1378 num_bytes);
1358 if (ret) 1379 if (ret) {
1380 if (!nolock && nocow)
1381 btrfs_end_nocow_write(root);
1359 goto error; 1382 goto error;
1383 }
1360 } 1384 }
1361 1385
1362 extent_clear_unlock_delalloc(inode, cur_offset, 1386 extent_clear_unlock_delalloc(inode, cur_offset,
@@ -1364,6 +1388,8 @@ out_check:
1364 locked_page, EXTENT_LOCKED | 1388 locked_page, EXTENT_LOCKED |
1365 EXTENT_DELALLOC, PAGE_UNLOCK | 1389 EXTENT_DELALLOC, PAGE_UNLOCK |
1366 PAGE_SET_PRIVATE2); 1390 PAGE_SET_PRIVATE2);
1391 if (!nolock && nocow)
1392 btrfs_end_nocow_write(root);
1367 cur_offset = extent_end; 1393 cur_offset = extent_end;
1368 if (cur_offset > end) 1394 if (cur_offset > end)
1369 break; 1395 break;
@@ -1843,9 +1869,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1843 1869
1844 SetPageChecked(page); 1870 SetPageChecked(page);
1845 page_cache_get(page); 1871 page_cache_get(page);
1846 fixup->work.func = btrfs_writepage_fixup_worker; 1872 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
1847 fixup->page = page; 1873 fixup->page = page;
1848 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1874 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1849 return -EBUSY; 1875 return -EBUSY;
1850} 1876}
1851 1877
@@ -2239,6 +2265,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2239 return PTR_ERR(root); 2265 return PTR_ERR(root);
2240 } 2266 }
2241 2267
2268 if (btrfs_root_readonly(root)) {
2269 srcu_read_unlock(&fs_info->subvol_srcu, index);
2270 return 0;
2271 }
2272
2242 /* step 2: get inode */ 2273 /* step 2: get inode */
2243 key.objectid = backref->inum; 2274 key.objectid = backref->inum;
2244 key.type = BTRFS_INODE_ITEM_KEY; 2275 key.type = BTRFS_INODE_ITEM_KEY;
@@ -2759,7 +2790,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2759 struct inode *inode = page->mapping->host; 2790 struct inode *inode = page->mapping->host;
2760 struct btrfs_root *root = BTRFS_I(inode)->root; 2791 struct btrfs_root *root = BTRFS_I(inode)->root;
2761 struct btrfs_ordered_extent *ordered_extent = NULL; 2792 struct btrfs_ordered_extent *ordered_extent = NULL;
2762 struct btrfs_workers *workers; 2793 struct btrfs_workqueue *workers;
2763 2794
2764 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2795 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2765 2796
@@ -2768,14 +2799,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2768 end - start + 1, uptodate)) 2799 end - start + 1, uptodate))
2769 return 0; 2800 return 0;
2770 2801
2771 ordered_extent->work.func = finish_ordered_fn; 2802 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
2772 ordered_extent->work.flags = 0;
2773 2803
2774 if (btrfs_is_free_space_inode(inode)) 2804 if (btrfs_is_free_space_inode(inode))
2775 workers = &root->fs_info->endio_freespace_worker; 2805 workers = root->fs_info->endio_freespace_worker;
2776 else 2806 else
2777 workers = &root->fs_info->endio_write_workers; 2807 workers = root->fs_info->endio_write_workers;
2778 btrfs_queue_worker(workers, &ordered_extent->work); 2808 btrfs_queue_work(workers, &ordered_extent->work);
2779 2809
2780 return 0; 2810 return 0;
2781} 2811}
@@ -4593,7 +4623,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
4593 struct rb_node *node; 4623 struct rb_node *node;
4594 4624
4595 ASSERT(inode->i_state & I_FREEING); 4625 ASSERT(inode->i_state & I_FREEING);
4596 truncate_inode_pages(&inode->i_data, 0); 4626 truncate_inode_pages_final(&inode->i_data);
4597 4627
4598 write_lock(&map_tree->lock); 4628 write_lock(&map_tree->lock);
4599 while (!RB_EMPTY_ROOT(&map_tree->map)) { 4629 while (!RB_EMPTY_ROOT(&map_tree->map)) {
@@ -4924,7 +4954,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
4924 struct inode *inode; 4954 struct inode *inode;
4925 u64 objectid = 0; 4955 u64 objectid = 0;
4926 4956
4927 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4957 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
4958 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4928 4959
4929 spin_lock(&root->inode_lock); 4960 spin_lock(&root->inode_lock);
4930again: 4961again:
@@ -5799,6 +5830,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5799 } 5830 }
5800out_unlock: 5831out_unlock:
5801 btrfs_end_transaction(trans, root); 5832 btrfs_end_transaction(trans, root);
5833 btrfs_balance_delayed_items(root);
5802 btrfs_btree_balance_dirty(root); 5834 btrfs_btree_balance_dirty(root);
5803 if (drop_inode) { 5835 if (drop_inode) {
5804 inode_dec_link_count(inode); 5836 inode_dec_link_count(inode);
@@ -5872,6 +5904,7 @@ out_unlock:
5872 inode_dec_link_count(inode); 5904 inode_dec_link_count(inode);
5873 iput(inode); 5905 iput(inode);
5874 } 5906 }
5907 btrfs_balance_delayed_items(root);
5875 btrfs_btree_balance_dirty(root); 5908 btrfs_btree_balance_dirty(root);
5876 return err; 5909 return err;
5877} 5910}
@@ -5930,6 +5963,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5930 } 5963 }
5931 5964
5932 btrfs_end_transaction(trans, root); 5965 btrfs_end_transaction(trans, root);
5966 btrfs_balance_delayed_items(root);
5933fail: 5967fail:
5934 if (drop_inode) { 5968 if (drop_inode) {
5935 inode_dec_link_count(inode); 5969 inode_dec_link_count(inode);
@@ -5996,6 +6030,7 @@ out_fail:
5996 btrfs_end_transaction(trans, root); 6030 btrfs_end_transaction(trans, root);
5997 if (drop_on_err) 6031 if (drop_on_err)
5998 iput(inode); 6032 iput(inode);
6033 btrfs_balance_delayed_items(root);
5999 btrfs_btree_balance_dirty(root); 6034 btrfs_btree_balance_dirty(root);
6000 return err; 6035 return err;
6001} 6036}
@@ -6550,6 +6585,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6550 int ret; 6585 int ret;
6551 struct extent_buffer *leaf; 6586 struct extent_buffer *leaf;
6552 struct btrfs_root *root = BTRFS_I(inode)->root; 6587 struct btrfs_root *root = BTRFS_I(inode)->root;
6588 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6553 struct btrfs_file_extent_item *fi; 6589 struct btrfs_file_extent_item *fi;
6554 struct btrfs_key key; 6590 struct btrfs_key key;
6555 u64 disk_bytenr; 6591 u64 disk_bytenr;
@@ -6626,6 +6662,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6626 6662
6627 if (btrfs_extent_readonly(root, disk_bytenr)) 6663 if (btrfs_extent_readonly(root, disk_bytenr))
6628 goto out; 6664 goto out;
6665
6666 num_bytes = min(offset + *len, extent_end) - offset;
6667 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6668 u64 range_end;
6669
6670 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6671 ret = test_range_bit(io_tree, offset, range_end,
6672 EXTENT_DELALLOC, 0, NULL);
6673 if (ret) {
6674 ret = -EAGAIN;
6675 goto out;
6676 }
6677 }
6678
6629 btrfs_release_path(path); 6679 btrfs_release_path(path);
6630 6680
6631 /* 6681 /*
@@ -6654,7 +6704,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6654 */ 6704 */
6655 disk_bytenr += backref_offset; 6705 disk_bytenr += backref_offset;
6656 disk_bytenr += offset - key.offset; 6706 disk_bytenr += offset - key.offset;
6657 num_bytes = min(offset + *len, extent_end) - offset;
6658 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 6707 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6659 goto out; 6708 goto out;
6660 /* 6709 /*
@@ -7024,10 +7073,9 @@ again:
7024 if (!ret) 7073 if (!ret)
7025 goto out_test; 7074 goto out_test;
7026 7075
7027 ordered->work.func = finish_ordered_fn; 7076 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
7028 ordered->work.flags = 0; 7077 btrfs_queue_work(root->fs_info->endio_write_workers,
7029 btrfs_queue_worker(&root->fs_info->endio_write_workers, 7078 &ordered->work);
7030 &ordered->work);
7031out_test: 7079out_test:
7032 /* 7080 /*
7033 * our bio might span multiple ordered extents. If we haven't 7081 * our bio might span multiple ordered extents. If we haven't
@@ -7404,15 +7452,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7404 smp_mb__after_atomic_inc(); 7452 smp_mb__after_atomic_inc();
7405 7453
7406 /* 7454 /*
7407 * The generic stuff only does filemap_write_and_wait_range, which isn't 7455 * The generic stuff only does filemap_write_and_wait_range, which
7408 * enough if we've written compressed pages to this area, so we need to 7456 * isn't enough if we've written compressed pages to this area, so
7409 * call btrfs_wait_ordered_range to make absolutely sure that any 7457 * we need to flush the dirty pages again to make absolutely sure
7410 * outstanding dirty pages are on disk. 7458 * that any outstanding dirty pages are on disk.
7411 */ 7459 */
7412 count = iov_length(iov, nr_segs); 7460 count = iov_length(iov, nr_segs);
7413 ret = btrfs_wait_ordered_range(inode, offset, count); 7461 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7414 if (ret) 7462 &BTRFS_I(inode)->runtime_flags))
7415 return ret; 7463 filemap_fdatawrite_range(inode->i_mapping, offset, count);
7416 7464
7417 if (rw & WRITE) { 7465 if (rw & WRITE) {
7418 /* 7466 /*
@@ -8404,7 +8452,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8404 work->inode = inode; 8452 work->inode = inode;
8405 work->wait = wait; 8453 work->wait = wait;
8406 work->delay_iput = delay_iput; 8454 work->delay_iput = delay_iput;
8407 work->work.func = btrfs_run_delalloc_work; 8455 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
8408 8456
8409 return work; 8457 return work;
8410} 8458}
@@ -8419,7 +8467,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8419 * some fairly slow code that needs optimization. This walks the list 8467 * some fairly slow code that needs optimization. This walks the list
8420 * of all the inodes with pending delalloc and forces them to disk. 8468 * of all the inodes with pending delalloc and forces them to disk.
8421 */ 8469 */
8422static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8470static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8471 int nr)
8423{ 8472{
8424 struct btrfs_inode *binode; 8473 struct btrfs_inode *binode;
8425 struct inode *inode; 8474 struct inode *inode;
@@ -8431,6 +8480,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8431 INIT_LIST_HEAD(&works); 8480 INIT_LIST_HEAD(&works);
8432 INIT_LIST_HEAD(&splice); 8481 INIT_LIST_HEAD(&splice);
8433 8482
8483 mutex_lock(&root->delalloc_mutex);
8434 spin_lock(&root->delalloc_lock); 8484 spin_lock(&root->delalloc_lock);
8435 list_splice_init(&root->delalloc_inodes, &splice); 8485 list_splice_init(&root->delalloc_inodes, &splice);
8436 while (!list_empty(&splice)) { 8486 while (!list_empty(&splice)) {
@@ -8456,19 +8506,16 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8456 goto out; 8506 goto out;
8457 } 8507 }
8458 list_add_tail(&work->list, &works); 8508 list_add_tail(&work->list, &works);
8459 btrfs_queue_worker(&root->fs_info->flush_workers, 8509 btrfs_queue_work(root->fs_info->flush_workers,
8460 &work->work); 8510 &work->work);
8461 8511 ret++;
8512 if (nr != -1 && ret >= nr)
8513 goto out;
8462 cond_resched(); 8514 cond_resched();
8463 spin_lock(&root->delalloc_lock); 8515 spin_lock(&root->delalloc_lock);
8464 } 8516 }
8465 spin_unlock(&root->delalloc_lock); 8517 spin_unlock(&root->delalloc_lock);
8466 8518
8467 list_for_each_entry_safe(work, next, &works, list) {
8468 list_del_init(&work->list);
8469 btrfs_wait_and_free_delalloc_work(work);
8470 }
8471 return 0;
8472out: 8519out:
8473 list_for_each_entry_safe(work, next, &works, list) { 8520 list_for_each_entry_safe(work, next, &works, list) {
8474 list_del_init(&work->list); 8521 list_del_init(&work->list);
@@ -8480,6 +8527,7 @@ out:
8480 list_splice_tail(&splice, &root->delalloc_inodes); 8527 list_splice_tail(&splice, &root->delalloc_inodes);
8481 spin_unlock(&root->delalloc_lock); 8528 spin_unlock(&root->delalloc_lock);
8482 } 8529 }
8530 mutex_unlock(&root->delalloc_mutex);
8483 return ret; 8531 return ret;
8484} 8532}
8485 8533
@@ -8490,7 +8538,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8490 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 8538 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8491 return -EROFS; 8539 return -EROFS;
8492 8540
8493 ret = __start_delalloc_inodes(root, delay_iput); 8541 ret = __start_delalloc_inodes(root, delay_iput, -1);
8542 if (ret > 0)
8543 ret = 0;
8494 /* 8544 /*
8495 * the filemap_flush will queue IO into the worker threads, but 8545 * the filemap_flush will queue IO into the worker threads, but
8496 * we have to make sure the IO is actually started and that 8546 * we have to make sure the IO is actually started and that
@@ -8507,7 +8557,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8507 return ret; 8557 return ret;
8508} 8558}
8509 8559
8510int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) 8560int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8561 int nr)
8511{ 8562{
8512 struct btrfs_root *root; 8563 struct btrfs_root *root;
8513 struct list_head splice; 8564 struct list_head splice;
@@ -8518,9 +8569,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8518 8569
8519 INIT_LIST_HEAD(&splice); 8570 INIT_LIST_HEAD(&splice);
8520 8571
8572 mutex_lock(&fs_info->delalloc_root_mutex);
8521 spin_lock(&fs_info->delalloc_root_lock); 8573 spin_lock(&fs_info->delalloc_root_lock);
8522 list_splice_init(&fs_info->delalloc_roots, &splice); 8574 list_splice_init(&fs_info->delalloc_roots, &splice);
8523 while (!list_empty(&splice)) { 8575 while (!list_empty(&splice) && nr) {
8524 root = list_first_entry(&splice, struct btrfs_root, 8576 root = list_first_entry(&splice, struct btrfs_root,
8525 delalloc_root); 8577 delalloc_root);
8526 root = btrfs_grab_fs_root(root); 8578 root = btrfs_grab_fs_root(root);
@@ -8529,15 +8581,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8529 &fs_info->delalloc_roots); 8581 &fs_info->delalloc_roots);
8530 spin_unlock(&fs_info->delalloc_root_lock); 8582 spin_unlock(&fs_info->delalloc_root_lock);
8531 8583
8532 ret = __start_delalloc_inodes(root, delay_iput); 8584 ret = __start_delalloc_inodes(root, delay_iput, nr);
8533 btrfs_put_fs_root(root); 8585 btrfs_put_fs_root(root);
8534 if (ret) 8586 if (ret < 0)
8535 goto out; 8587 goto out;
8536 8588
8589 if (nr != -1) {
8590 nr -= ret;
8591 WARN_ON(nr < 0);
8592 }
8537 spin_lock(&fs_info->delalloc_root_lock); 8593 spin_lock(&fs_info->delalloc_root_lock);
8538 } 8594 }
8539 spin_unlock(&fs_info->delalloc_root_lock); 8595 spin_unlock(&fs_info->delalloc_root_lock);
8540 8596
8597 ret = 0;
8541 atomic_inc(&fs_info->async_submit_draining); 8598 atomic_inc(&fs_info->async_submit_draining);
8542 while (atomic_read(&fs_info->nr_async_submits) || 8599 while (atomic_read(&fs_info->nr_async_submits) ||
8543 atomic_read(&fs_info->async_delalloc_pages)) { 8600 atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8603,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8546 atomic_read(&fs_info->async_delalloc_pages) == 0)); 8603 atomic_read(&fs_info->async_delalloc_pages) == 0));
8547 } 8604 }
8548 atomic_dec(&fs_info->async_submit_draining); 8605 atomic_dec(&fs_info->async_submit_draining);
8549 return 0;
8550out: 8606out:
8551 if (!list_empty_careful(&splice)) { 8607 if (!list_empty_careful(&splice)) {
8552 spin_lock(&fs_info->delalloc_root_lock); 8608 spin_lock(&fs_info->delalloc_root_lock);
8553 list_splice_tail(&splice, &fs_info->delalloc_roots); 8609 list_splice_tail(&splice, &fs_info->delalloc_roots);
8554 spin_unlock(&fs_info->delalloc_root_lock); 8610 spin_unlock(&fs_info->delalloc_root_lock);
8555 } 8611 }
8612 mutex_unlock(&fs_info->delalloc_root_mutex);
8556 return ret; 8613 return ret;
8557} 8614}
8558 8615
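
__start_delalloc_inodes now takes an nr budget (-1 meaning unlimited) and
returns how many flush works it queued instead of waiting for them, which
lets btrfs_start_delalloc_roots spread the budget across roots. A toy sketch
of the budgeted loop:

#include <stdio.h>

/* Queue up to nr items; nr == -1 means no limit. Returns items queued. */
static int start_some(int pending, int nr)
{
	int queued = 0;

	while (pending-- > 0) {
		/* ... allocate work, btrfs_queue_work(...) ... */
		queued++;
		if (nr != -1 && queued >= nr)
			break;
	}
	return queued;
}

int main(void)
{
	printf("%d\n", start_some(10, 3));  /* 3  */
	printf("%d\n", start_some(10, -1)); /* 10 */
	return 0;
}
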
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a6d8efa46bfe..2f6d7b13b5bd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,32 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61 61
62#ifdef CONFIG_64BIT
63/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
64 * structures are incorrect, as the timespec structure from userspace
65 * is 4 bytes too small. We define these alternatives here to teach
66 * the kernel about the 32-bit struct packing.
67 */
68struct btrfs_ioctl_timespec_32 {
69 __u64 sec;
70 __u32 nsec;
71} __attribute__ ((__packed__));
72
73struct btrfs_ioctl_received_subvol_args_32 {
74 char uuid[BTRFS_UUID_SIZE]; /* in */
75 __u64 stransid; /* in */
76 __u64 rtransid; /* out */
77 struct btrfs_ioctl_timespec_32 stime; /* in */
78 struct btrfs_ioctl_timespec_32 rtime; /* out */
79 __u64 flags; /* in */
80 __u64 reserved[16]; /* in */
81} __attribute__ ((__packed__));
82
83#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
84 struct btrfs_ioctl_received_subvol_args_32)
85#endif
86
87
62static int btrfs_clone(struct inode *src, struct inode *inode, 88static int btrfs_clone(struct inode *src, struct inode *inode,
63 u64 off, u64 olen, u64 olen_aligned, u64 destoff); 89 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
64 90
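
The compat structs above are needed because _IOWR() folds sizeof() of the
argument type into the ioctl command number, and a 32-bit userspace packs the
embedded timespec four bytes smaller than the 64-bit kernel's layout, so the
two ABIs compute different command numbers for the same ioctl. A minimal
userspace check of the size difference (the _64 variant here is a
hypothetical unpacked twin, shown only for comparison):

#include <stdio.h>
#include <stdint.h>

struct timespec_32 {
	uint64_t sec;
	uint32_t nsec;
} __attribute__((__packed__));

struct timespec_64 {
	uint64_t sec;
	uint32_t nsec;	/* padded to 16 bytes on a typical 64-bit ABI */
};

int main(void)
{
	/* 12 vs. 16: the 4-byte gap the comment above describes. */
	printf("packed 32-bit layout: %zu bytes\n", sizeof(struct timespec_32));
	printf("64-bit layout:        %zu bytes\n", sizeof(struct timespec_64));
	return 0;
}
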
@@ -585,6 +611,23 @@ fail:
585 return ret; 611 return ret;
586} 612}
587 613
614static void btrfs_wait_nocow_write(struct btrfs_root *root)
615{
616 s64 writers;
617 DEFINE_WAIT(wait);
618
619 do {
620 prepare_to_wait(&root->subv_writers->wait, &wait,
621 TASK_UNINTERRUPTIBLE);
622
623 writers = percpu_counter_sum(&root->subv_writers->counter);
624 if (writers)
625 schedule();
626
627 finish_wait(&root->subv_writers->wait, &wait);
628 } while (writers);
629}
630
588static int create_snapshot(struct btrfs_root *root, struct inode *dir, 631static int create_snapshot(struct btrfs_root *root, struct inode *dir,
589 struct dentry *dentry, char *name, int namelen, 632 struct dentry *dentry, char *name, int namelen,
590 u64 *async_transid, bool readonly, 633 u64 *async_transid, bool readonly,
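
btrfs_wait_nocow_write() parks snapshot creation until every in-flight nocow
writer has drained; the writers are counted by the
btrfs_start_nocow_write()/btrfs_end_nocow_write() pair visible in the inode.c
hunks above. A pthread sketch of the same drain pattern, with a condition
variable standing in for the kernel wait queue and a plain counter for the
per-cpu one:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static long writers;	/* stands in for the per-cpu counter sum */

static void start_nocow_write(void)
{
	pthread_mutex_lock(&lock);
	writers++;
	pthread_mutex_unlock(&lock);
}

static void end_nocow_write(void)
{
	pthread_mutex_lock(&lock);
	if (--writers == 0)
		pthread_cond_broadcast(&drained);
	pthread_mutex_unlock(&lock);
}

/* Snapshot side: block until no writer is in flight. */
static void wait_nocow_write(void)
{
	pthread_mutex_lock(&lock);
	while (writers)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	start_nocow_write();
	end_nocow_write();
	wait_nocow_write();	/* returns at once: no writers left */
	return 0;
}
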
@@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
598 if (!root->ref_cows) 641 if (!root->ref_cows)
599 return -EINVAL; 642 return -EINVAL;
600 643
644 atomic_inc(&root->will_be_snapshoted);
645 smp_mb__after_atomic_inc();
646 btrfs_wait_nocow_write(root);
647
601 ret = btrfs_start_delalloc_inodes(root, 0); 648 ret = btrfs_start_delalloc_inodes(root, 0);
602 if (ret) 649 if (ret)
603 return ret; 650 goto out;
604 651
605 btrfs_wait_ordered_extents(root, -1); 652 btrfs_wait_ordered_extents(root, -1);
606 653
607 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 654 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
608 if (!pending_snapshot) 655 if (!pending_snapshot) {
609 return -ENOMEM; 656 ret = -ENOMEM;
657 goto out;
658 }
610 659
611 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 660 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
612 BTRFS_BLOCK_RSV_TEMP); 661 BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
623 &pending_snapshot->qgroup_reserved, 672 &pending_snapshot->qgroup_reserved,
624 false); 673 false);
625 if (ret) 674 if (ret)
626 goto out; 675 goto free;
627 676
628 pending_snapshot->dentry = dentry; 677 pending_snapshot->dentry = dentry;
629 pending_snapshot->root = root; 678 pending_snapshot->root = root;
@@ -674,8 +723,10 @@ fail:
674 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, 723 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
675 &pending_snapshot->block_rsv, 724 &pending_snapshot->block_rsv,
676 pending_snapshot->qgroup_reserved); 725 pending_snapshot->qgroup_reserved);
677out: 726free:
678 kfree(pending_snapshot); 727 kfree(pending_snapshot);
728out:
729 atomic_dec(&root->will_be_snapshoted);
679 return ret; 730 return ret;
680} 731}
681 732
@@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root,
884 min_key.type = BTRFS_EXTENT_DATA_KEY; 935 min_key.type = BTRFS_EXTENT_DATA_KEY;
885 min_key.offset = *off; 936 min_key.offset = *off;
886 937
887 path->keep_locks = 1;
888
889 while (1) { 938 while (1) {
939 path->keep_locks = 1;
890 ret = btrfs_search_forward(root, &min_key, path, newer_than); 940 ret = btrfs_search_forward(root, &min_key, path, newer_than);
891 if (ret != 0) 941 if (ret != 0)
892 goto none; 942 goto none;
943 path->keep_locks = 0;
944 btrfs_unlock_up_safe(path, 1);
945process_slot:
893 if (min_key.objectid != ino) 946 if (min_key.objectid != ino)
894 goto none; 947 goto none;
895 if (min_key.type != BTRFS_EXTENT_DATA_KEY) 948 if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root,
908 return 0; 961 return 0;
909 } 962 }
910 963
964 path->slots[0]++;
965 if (path->slots[0] < btrfs_header_nritems(leaf)) {
966 btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
967 goto process_slot;
968 }
969
911 if (min_key.offset == (u64)-1) 970 if (min_key.offset == (u64)-1)
912 goto none; 971 goto none;
913 972
@@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
935 read_unlock(&em_tree->lock); 994 read_unlock(&em_tree->lock);
936 995
937 if (!em) { 996 if (!em) {
997 struct extent_state *cached = NULL;
998 u64 end = start + len - 1;
999
938 /* get the big lock and read metadata off disk */ 1000 /* get the big lock and read metadata off disk */
939 lock_extent(io_tree, start, start + len - 1); 1001 lock_extent_bits(io_tree, start, end, 0, &cached);
940 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 1002 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
941 unlock_extent(io_tree, start, start + len - 1); 1003 unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
942 1004
943 if (IS_ERR(em)) 1005 if (IS_ERR(em))
944 return NULL; 1006 return NULL;
@@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
957 return false; 1019 return false;
958 1020
959 next = defrag_lookup_extent(inode, em->start + em->len); 1021 next = defrag_lookup_extent(inode, em->start + em->len);
960 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) 1022 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
1023 (em->block_start + em->block_len == next->block_start))
961 ret = false; 1024 ret = false;
962 1025
963 free_extent_map(next); 1026 free_extent_map(next);
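
defrag_check_next_extent now also refuses to treat the next extent as a
defrag target when it is already physically adjacent on disk, since rewriting
contiguous extents gains nothing. The test is plain block arithmetic; a
sketch:

#include <stdio.h>
#include <stdint.h>

struct extent_map { uint64_t block_start, block_len; };

/* Already contiguous if one extent ends where the next one begins. */
static int already_contiguous(const struct extent_map *em,
			      const struct extent_map *next)
{
	return em->block_start + em->block_len == next->block_start;
}

int main(void)
{
	struct extent_map a = { 4096, 8192 }, b = { 12288, 4096 };

	printf("%d\n", already_contiguous(&a, &b));	/* 1: skip defrag */
	return 0;
}
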
@@ -1076,10 +1139,12 @@ again:
1076 page_start = page_offset(page); 1139 page_start = page_offset(page);
1077 page_end = page_start + PAGE_CACHE_SIZE - 1; 1140 page_end = page_start + PAGE_CACHE_SIZE - 1;
1078 while (1) { 1141 while (1) {
1079 lock_extent(tree, page_start, page_end); 1142 lock_extent_bits(tree, page_start, page_end,
1143 0, &cached_state);
1080 ordered = btrfs_lookup_ordered_extent(inode, 1144 ordered = btrfs_lookup_ordered_extent(inode,
1081 page_start); 1145 page_start);
1082 unlock_extent(tree, page_start, page_end); 1146 unlock_extent_cached(tree, page_start, page_end,
1147 &cached_state, GFP_NOFS);
1083 if (!ordered) 1148 if (!ordered)
1084 break; 1149 break;
1085 1150
@@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1356 } 1421 }
1357 } 1422 }
1358 1423
1359 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) 1424 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
1360 filemap_flush(inode->i_mapping); 1425 filemap_flush(inode->i_mapping);
1426 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1427 &BTRFS_I(inode)->runtime_flags))
1428 filemap_flush(inode->i_mapping);
1429 }
1361 1430
1362 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 1431 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1363 /* the filemap_flush will queue IO into the worker threads, but 1432 /* the filemap_flush will queue IO into the worker threads, but
@@ -1403,6 +1472,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1403 struct btrfs_trans_handle *trans; 1472 struct btrfs_trans_handle *trans;
1404 struct btrfs_device *device = NULL; 1473 struct btrfs_device *device = NULL;
1405 char *sizestr; 1474 char *sizestr;
1475 char *retptr;
1406 char *devstr = NULL; 1476 char *devstr = NULL;
1407 int ret = 0; 1477 int ret = 0;
1408 int mod = 0; 1478 int mod = 0;
@@ -1470,8 +1540,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1470 mod = 1; 1540 mod = 1;
1471 sizestr++; 1541 sizestr++;
1472 } 1542 }
1473 new_size = memparse(sizestr, NULL); 1543 new_size = memparse(sizestr, &retptr);
1474 if (new_size == 0) { 1544 if (*retptr != '\0' || new_size == 0) {
1475 ret = -EINVAL; 1545 ret = -EINVAL;
1476 goto out_free; 1546 goto out_free;
1477 } 1547 }
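
The resize fix passes memparse a retptr and rejects any input with trailing
garbage after the number. The same guard in portable C using strtoull
(memparse additionally accepts K/M/G-style suffixes, which this sketch
omits):

#include <stdio.h>
#include <stdlib.h>

static int parse_size(const char *s, unsigned long long *out)
{
	char *end;

	*out = strtoull(s, &end, 0);
	/* Reject "" and "4096x", as the ioctl now does; 0 stays invalid. */
	if (end == s || *end != '\0' || *out == 0)
		return -1;
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("%d\n", parse_size("4096", &v));		/*  0 */
	printf("%d\n", parse_size("4096x", &v));	/* -1 */
	return 0;
}
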
@@ -1573,7 +1643,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1573 if (src_inode->i_sb != file_inode(file)->i_sb) { 1643 if (src_inode->i_sb != file_inode(file)->i_sb) {
1574 btrfs_info(BTRFS_I(src_inode)->root->fs_info, 1644 btrfs_info(BTRFS_I(src_inode)->root->fs_info,
1575 "Snapshot src from another FS"); 1645 "Snapshot src from another FS");
1576 ret = -EINVAL; 1646 ret = -EXDEV;
1577 } else if (!inode_owner_or_capable(src_inode)) { 1647 } else if (!inode_owner_or_capable(src_inode)) {
1578 /* 1648 /*
1579 * Subvolume creation is not restricted, but snapshots 1649 * Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1867,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
1797 if (di && !IS_ERR(di)) { 1867 if (di && !IS_ERR(di)) {
1798 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1868 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1799 if (key.objectid == root->root_key.objectid) { 1869 if (key.objectid == root->root_key.objectid) {
1800 ret = -ENOTEMPTY; 1870 ret = -EPERM;
1871 btrfs_err(root->fs_info, "deleting default subvolume "
1872 "%llu is not allowed", key.objectid);
1801 goto out; 1873 goto out;
1802 } 1874 }
1803 btrfs_release_path(path); 1875 btrfs_release_path(path);
@@ -2994,8 +3066,9 @@ process_slot:
2994 new_key.offset + datal, 3066 new_key.offset + datal,
2995 1); 3067 1);
2996 if (ret) { 3068 if (ret) {
2997 btrfs_abort_transaction(trans, root, 3069 if (ret != -EOPNOTSUPP)
2998 ret); 3070 btrfs_abort_transaction(trans,
3071 root, ret);
2999 btrfs_end_transaction(trans, root); 3072 btrfs_end_transaction(trans, root);
3000 goto out; 3073 goto out;
3001 } 3074 }
@@ -3047,6 +3120,8 @@ process_slot:
3047 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 3120 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
3048 u64 skip = 0; 3121 u64 skip = 0;
3049 u64 trim = 0; 3122 u64 trim = 0;
3123 u64 aligned_end = 0;
3124
3050 if (off > key.offset) { 3125 if (off > key.offset) {
3051 skip = off - key.offset; 3126 skip = off - key.offset;
3052 new_key.offset += skip; 3127 new_key.offset += skip;
@@ -3063,13 +3138,16 @@ process_slot:
3063 size -= skip + trim; 3138 size -= skip + trim;
3064 datal -= skip + trim; 3139 datal -= skip + trim;
3065 3140
3141 aligned_end = ALIGN(new_key.offset + datal,
3142 root->sectorsize);
3066 ret = btrfs_drop_extents(trans, root, inode, 3143 ret = btrfs_drop_extents(trans, root, inode,
3067 new_key.offset, 3144 new_key.offset,
3068 new_key.offset + datal, 3145 aligned_end,
3069 1); 3146 1);
3070 if (ret) { 3147 if (ret) {
3071 btrfs_abort_transaction(trans, root, 3148 if (ret != -EOPNOTSUPP)
3072 ret); 3149 btrfs_abort_transaction(trans,
3150 root, ret);
3073 btrfs_end_transaction(trans, root); 3151 btrfs_end_transaction(trans, root);
3074 goto out; 3152 goto out;
3075 } 3153 }
@@ -3153,8 +3231,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3153 * decompress into destination's address_space (the file offset 3231 * decompress into destination's address_space (the file offset
3154 * may change, so source mapping won't do), then recompress (or 3232 * may change, so source mapping won't do), then recompress (or
3155 * otherwise reinsert) a subrange. 3233 * otherwise reinsert) a subrange.
3156 * - allow ranges within the same file to be cloned (provided 3234 *
3157 * they don't overlap)? 3235 * - split destination inode's inline extents. The inline extents can
3236 * be either compressed or non-compressed.
3158 */ 3237 */
3159 3238
3160 /* the destination must be opened for writing */ 3239 /* the destination must be opened for writing */
@@ -3465,6 +3544,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
3465 up_read(&info->groups_sem); 3544 up_read(&info->groups_sem);
3466 } 3545 }
3467 3546
3547 /*
3548 * Global block reserve, exported as a space_info
3549 */
3550 slot_count++;
3551
3468 /* space_slots == 0 means they are asking for a count */ 3552 /* space_slots == 0 means they are asking for a count */
3469 if (space_args.space_slots == 0) { 3553 if (space_args.space_slots == 0) {
3470 space_args.total_spaces = slot_count; 3554 space_args.total_spaces = slot_count;
@@ -3523,6 +3607,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
3523 up_read(&info->groups_sem); 3607 up_read(&info->groups_sem);
3524 } 3608 }
3525 3609
3610 /*
3611 * Add global block reserve
3612 */
3613 if (slot_count) {
3614 struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
3615
3616 spin_lock(&block_rsv->lock);
3617 space.total_bytes = block_rsv->size;
3618 space.used_bytes = block_rsv->size - block_rsv->reserved;
3619 spin_unlock(&block_rsv->lock);
3620 space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
3621 memcpy(dest, &space, sizeof(space));
3622 space_args.total_spaces++;
3623 }
3624
3526 user_dest = (struct btrfs_ioctl_space_info __user *) 3625 user_dest = (struct btrfs_ioctl_space_info __user *)
3527 (arg + sizeof(struct btrfs_ioctl_space_args)); 3626 (arg + sizeof(struct btrfs_ioctl_space_args));
3528 3627
@@ -4353,10 +4452,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
4353 return btrfs_qgroup_wait_for_completion(root->fs_info); 4452 return btrfs_qgroup_wait_for_completion(root->fs_info);
4354} 4453}
4355 4454
4356static long btrfs_ioctl_set_received_subvol(struct file *file, 4455static long _btrfs_ioctl_set_received_subvol(struct file *file,
4357 void __user *arg) 4456 struct btrfs_ioctl_received_subvol_args *sa)
4358{ 4457{
4359 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4360 struct inode *inode = file_inode(file); 4458 struct inode *inode = file_inode(file);
4361 struct btrfs_root *root = BTRFS_I(inode)->root; 4459 struct btrfs_root *root = BTRFS_I(inode)->root;
4362 struct btrfs_root_item *root_item = &root->root_item; 4460 struct btrfs_root_item *root_item = &root->root_item;
@@ -4384,13 +4482,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4384 goto out; 4482 goto out;
4385 } 4483 }
4386 4484
4387 sa = memdup_user(arg, sizeof(*sa));
4388 if (IS_ERR(sa)) {
4389 ret = PTR_ERR(sa);
4390 sa = NULL;
4391 goto out;
4392 }
4393
4394 /* 4485 /*
4395 * 1 - root item 4486 * 1 - root item
4396 * 2 - uuid items (received uuid + subvol uuid) 4487 * 2 - uuid items (received uuid + subvol uuid)
@@ -4444,14 +4535,90 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4444 goto out; 4535 goto out;
4445 } 4536 }
4446 4537
4538out:
4539 up_write(&root->fs_info->subvol_sem);
4540 mnt_drop_write_file(file);
4541 return ret;
4542}
4543
4544#ifdef CONFIG_64BIT
4545static long btrfs_ioctl_set_received_subvol_32(struct file *file,
4546 void __user *arg)
4547{
4548 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
4549 struct btrfs_ioctl_received_subvol_args *args64 = NULL;
4550 int ret = 0;
4551
4552 args32 = memdup_user(arg, sizeof(*args32));
4553 if (IS_ERR(args32)) {
4554 ret = PTR_ERR(args32);
4555 args32 = NULL;
4556 goto out;
4557 }
4558
4559 args64 = kmalloc(sizeof(*args64), GFP_NOFS);
4560 if (!args64) {
4561 ret = -ENOMEM;
4562 goto out;
4563 }
4564
4565 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
4566 args64->stransid = args32->stransid;
4567 args64->rtransid = args32->rtransid;
4568 args64->stime.sec = args32->stime.sec;
4569 args64->stime.nsec = args32->stime.nsec;
4570 args64->rtime.sec = args32->rtime.sec;
4571 args64->rtime.nsec = args32->rtime.nsec;
4572 args64->flags = args32->flags;
4573
4574 ret = _btrfs_ioctl_set_received_subvol(file, args64);
4575 if (ret)
4576 goto out;
4577
4578 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
4579 args32->stransid = args64->stransid;
4580 args32->rtransid = args64->rtransid;
4581 args32->stime.sec = args64->stime.sec;
4582 args32->stime.nsec = args64->stime.nsec;
4583 args32->rtime.sec = args64->rtime.sec;
4584 args32->rtime.nsec = args64->rtime.nsec;
4585 args32->flags = args64->flags;
4586
4587 ret = copy_to_user(arg, args32, sizeof(*args32));
4588 if (ret)
4589 ret = -EFAULT;
4590
4591out:
4592 kfree(args32);
4593 kfree(args64);
4594 return ret;
4595}
4596#endif
4597
4598static long btrfs_ioctl_set_received_subvol(struct file *file,
4599 void __user *arg)
4600{
4601 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4602 int ret = 0;
4603
4604 sa = memdup_user(arg, sizeof(*sa));
4605 if (IS_ERR(sa)) {
4606 ret = PTR_ERR(sa);
4607 sa = NULL;
4608 goto out;
4609 }
4610
4611 ret = _btrfs_ioctl_set_received_subvol(file, sa);
4612
4613 if (ret)
4614 goto out;
4615
4447 ret = copy_to_user(arg, sa, sizeof(*sa)); 4616 ret = copy_to_user(arg, sa, sizeof(*sa));
4448 if (ret) 4617 if (ret)
4449 ret = -EFAULT; 4618 ret = -EFAULT;
4450 4619
4451out: 4620out:
4452 kfree(sa); 4621 kfree(sa);
4453 up_write(&root->fs_info->subvol_sem);
4454 mnt_drop_write_file(file);
4455 return ret; 4622 return ret;
4456} 4623}
4457 4624
@@ -4746,7 +4913,7 @@ long btrfs_ioctl(struct file *file, unsigned int
4746 case BTRFS_IOC_SYNC: { 4913 case BTRFS_IOC_SYNC: {
4747 int ret; 4914 int ret;
4748 4915
4749 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 4916 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
4750 if (ret) 4917 if (ret)
4751 return ret; 4918 return ret;
4752 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 4919 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4770,6 +4937,10 @@ long btrfs_ioctl(struct file *file, unsigned int
4770 return btrfs_ioctl_balance_progress(root, argp); 4937 return btrfs_ioctl_balance_progress(root, argp);
4771 case BTRFS_IOC_SET_RECEIVED_SUBVOL: 4938 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
4772 return btrfs_ioctl_set_received_subvol(file, argp); 4939 return btrfs_ioctl_set_received_subvol(file, argp);
4940#ifdef CONFIG_64BIT
4941 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
4942 return btrfs_ioctl_set_received_subvol_32(file, argp);
4943#endif
4773 case BTRFS_IOC_SEND: 4944 case BTRFS_IOC_SEND:
4774 return btrfs_ioctl_send(file, argp); 4945 return btrfs_ioctl_send(file, argp);
4775 case BTRFS_IOC_GET_DEV_STATS: 4946 case BTRFS_IOC_GET_DEV_STATS:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e7..a94b05f72869 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
349 if (!uptodate) 349 if (!uptodate)
350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
351 351
352 if (entry->bytes_left == 0) 352 if (entry->bytes_left == 0) {
353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
354 else 354 if (waitqueue_active(&entry->wait))
355 wake_up(&entry->wait);
356 } else {
355 ret = 1; 357 ret = 1;
358 }
356out: 359out:
357 if (!ret && cached && entry) { 360 if (!ret && cached && entry) {
358 *cached = entry; 361 *cached = entry;
@@ -410,10 +413,13 @@ have_entry:
410 if (!uptodate) 413 if (!uptodate)
411 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 414 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
412 415
413 if (entry->bytes_left == 0) 416 if (entry->bytes_left == 0) {
414 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 417 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
415 else 418 if (waitqueue_active(&entry->wait))
419 wake_up(&entry->wait);
420 } else {
416 ret = 1; 421 ret = 1;
422 }
417out: 423out:
418 if (!ret && cached && entry) { 424 if (!ret && cached && entry) {
419 *cached = entry; 425 *cached = entry;
@@ -424,27 +430,48 @@ out:
424} 430}
425 431
426/* Needs to either be called under a log transaction or the log_mutex */ 432/* Needs to either be called under a log transaction or the log_mutex */
427void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) 433void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list)
428{ 435{
429 struct btrfs_ordered_inode_tree *tree; 436 struct btrfs_ordered_inode_tree *tree;
430 struct btrfs_ordered_extent *ordered; 437 struct btrfs_ordered_extent *ordered;
431 struct rb_node *n; 438 struct rb_node *n;
432 int index = log->log_transid % 2;
433 439
434 tree = &BTRFS_I(inode)->ordered_tree; 440 tree = &BTRFS_I(inode)->ordered_tree;
435 spin_lock_irq(&tree->lock); 441 spin_lock_irq(&tree->lock);
436 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
437 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
438 spin_lock(&log->log_extents_lock[index]); 444 if (!list_empty(&ordered->log_list))
439 if (list_empty(&ordered->log_list)) { 445 continue;
440 list_add_tail(&ordered->log_list, &log->logged_list[index]); 446 list_add_tail(&ordered->log_list, logged_list);
441 atomic_inc(&ordered->refs); 447 atomic_inc(&ordered->refs);
442 }
443 spin_unlock(&log->log_extents_lock[index]);
444 } 448 }
445 spin_unlock_irq(&tree->lock); 449 spin_unlock_irq(&tree->lock);
446} 450}
447 451
452void btrfs_put_logged_extents(struct list_head *logged_list)
453{
454 struct btrfs_ordered_extent *ordered;
455
456 while (!list_empty(logged_list)) {
457 ordered = list_first_entry(logged_list,
458 struct btrfs_ordered_extent,
459 log_list);
460 list_del_init(&ordered->log_list);
461 btrfs_put_ordered_extent(ordered);
462 }
463}
464
465void btrfs_submit_logged_extents(struct list_head *logged_list,
466 struct btrfs_root *log)
467{
468 int index = log->log_transid % 2;
469
470 spin_lock_irq(&log->log_extents_lock[index]);
471 list_splice_tail(logged_list, &log->logged_list[index]);
472 spin_unlock_irq(&log->log_extents_lock[index]);
473}
474
448void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
449{ 476{
450 struct btrfs_ordered_extent *ordered; 477 struct btrfs_ordered_extent *ordered;
@@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
577 INIT_LIST_HEAD(&splice); 604 INIT_LIST_HEAD(&splice);
578 INIT_LIST_HEAD(&works); 605 INIT_LIST_HEAD(&works);
579 606
580 mutex_lock(&root->fs_info->ordered_operations_mutex); 607 mutex_lock(&root->ordered_extent_mutex);
581 spin_lock(&root->ordered_extent_lock); 608 spin_lock(&root->ordered_extent_lock);
582 list_splice_init(&root->ordered_extents, &splice); 609 list_splice_init(&root->ordered_extents, &splice);
583 while (!list_empty(&splice) && nr) { 610 while (!list_empty(&splice) && nr) {
@@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
588 atomic_inc(&ordered->refs); 615 atomic_inc(&ordered->refs);
589 spin_unlock(&root->ordered_extent_lock); 616 spin_unlock(&root->ordered_extent_lock);
590 617
591 ordered->flush_work.func = btrfs_run_ordered_extent_work; 618 btrfs_init_work(&ordered->flush_work,
619 btrfs_run_ordered_extent_work, NULL, NULL);
592 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
593 btrfs_queue_worker(&root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
594 &ordered->flush_work); 622 &ordered->flush_work);
595 623
596 cond_resched(); 624 cond_resched();
597 spin_lock(&root->ordered_extent_lock); 625 spin_lock(&root->ordered_extent_lock);
@@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
608 btrfs_put_ordered_extent(ordered); 636 btrfs_put_ordered_extent(ordered);
609 cond_resched(); 637 cond_resched();
610 } 638 }
611 mutex_unlock(&root->fs_info->ordered_operations_mutex); 639 mutex_unlock(&root->ordered_extent_mutex);
612 640
613 return count; 641 return count;
614} 642}
@@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
621 649
622 INIT_LIST_HEAD(&splice); 650 INIT_LIST_HEAD(&splice);
623 651
652 mutex_lock(&fs_info->ordered_operations_mutex);
624 spin_lock(&fs_info->ordered_root_lock); 653 spin_lock(&fs_info->ordered_root_lock);
625 list_splice_init(&fs_info->ordered_roots, &splice); 654 list_splice_init(&fs_info->ordered_roots, &splice);
626 while (!list_empty(&splice) && nr) { 655 while (!list_empty(&splice) && nr) {
@@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
643 } 672 }
644 list_splice_tail(&splice, &fs_info->ordered_roots); 673 list_splice_tail(&splice, &fs_info->ordered_roots);
645 spin_unlock(&fs_info->ordered_root_lock); 674 spin_unlock(&fs_info->ordered_root_lock);
675 mutex_unlock(&fs_info->ordered_operations_mutex);
646} 676}
647 677
648/* 678/*
@@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
704 goto out; 734 goto out;
705 } 735 }
706 list_add_tail(&work->list, &works); 736 list_add_tail(&work->list, &works);
707 btrfs_queue_worker(&root->fs_info->flush_workers, 737 btrfs_queue_work(root->fs_info->flush_workers,
708 &work->work); 738 &work->work);
709 739
710 cond_resched(); 740 cond_resched();
711 spin_lock(&root->fs_info->ordered_root_lock); 741 spin_lock(&root->fs_info->ordered_root_lock);
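
btrfs_get_logged_extents now gathers ordered extents onto a caller-private
list while holding only the per-inode tree lock, and
btrfs_submit_logged_extents splices that private list into the log under
log_extents_lock afterwards, so the two locks never nest. A pthread sketch of
the collect-then-splice locking pattern (the kernel version takes references
rather than unlinking from the tree), with a plain singly linked list
standing in for list_head:

#include <pthread.h>
#include <stddef.h>

struct ordered { struct ordered *next; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ordered *tree_list, *log_list;

/* Phase 1: under the tree lock only, move entries to a private list. */
static struct ordered *get_logged(void)
{
	struct ordered *private;

	pthread_mutex_lock(&tree_lock);
	private = tree_list;
	tree_list = NULL;
	pthread_mutex_unlock(&tree_lock);
	return private;
}

/* Phase 2: splice the private list into the log under its own lock. */
static void submit_logged(struct ordered *private)
{
	struct ordered **tail;

	pthread_mutex_lock(&log_lock);
	for (tail = &log_list; *tail; tail = &(*tail)->next)
		;
	*tail = private;
	pthread_mutex_unlock(&log_lock);
}

int main(void)
{
	submit_logged(get_logged());
	return 0;
}
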
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac20..246897058efb 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
197 struct inode *inode); 197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 200void btrfs_get_logged_extents(struct inode *inode,
201 struct list_head *logged_list);
202void btrfs_put_logged_extents(struct list_head *logged_list);
203void btrfs_submit_logged_extents(struct list_head *logged_list,
204 struct btrfs_root *log);
201void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 205void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
202void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 206void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
203int __init ordered_data_init(void); 207int __init ordered_data_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d745..2cf905877aaf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1509 ret = qgroup_rescan_init(fs_info, 0, 1); 1509 ret = qgroup_rescan_init(fs_info, 0, 1);
1510 if (!ret) { 1510 if (!ret) {
1511 qgroup_rescan_zero_tracking(fs_info); 1511 qgroup_rescan_zero_tracking(fs_info);
1512 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 1512 btrfs_queue_work(fs_info->qgroup_rescan_workers,
1513 &fs_info->qgroup_rescan_work); 1513 &fs_info->qgroup_rescan_work);
1514 } 1514 }
1515 ret = 0; 1515 ret = 0;
1516 } 1516 }
@@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2095 2095
2096 memset(&fs_info->qgroup_rescan_work, 0, 2096 memset(&fs_info->qgroup_rescan_work, 0,
2097 sizeof(fs_info->qgroup_rescan_work)); 2097 sizeof(fs_info->qgroup_rescan_work));
2098 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; 2098 btrfs_init_work(&fs_info->qgroup_rescan_work,
2099 btrfs_qgroup_rescan_worker, NULL, NULL);
2099 2100
2100 if (ret) { 2101 if (ret) {
2101err: 2102err:
@@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2158 2159
2159 qgroup_rescan_zero_tracking(fs_info); 2160 qgroup_rescan_zero_tracking(fs_info);
2160 2161
2161 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2162 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2162 &fs_info->qgroup_rescan_work); 2163 &fs_info->qgroup_rescan_work);
2163 2164
2164 return 0; 2165 return 0;
2165} 2166}
@@ -2190,6 +2191,6 @@ void
2190btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 2191btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2191{ 2192{
2192 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2193 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2193 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2194 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2194 &fs_info->qgroup_rescan_work); 2195 &fs_info->qgroup_rescan_work);
2195} 2196}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991a..4055291a523e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 rbio->work.flags = 0; 1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
1420 rbio->work.func = rmw_work;
1421 1420
1422 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1421 btrfs_queue_work(rbio->fs_info->rmw_workers,
1423 &rbio->work); 1422 &rbio->work);
1424} 1423}
1425 1424
1426static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1425static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427{ 1426{
1428 rbio->work.flags = 0; 1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
1429 rbio->work.func = read_rebuild_work;
1430 1428
1431 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1429 btrfs_queue_work(rbio->fs_info->rmw_workers,
1432 &rbio->work); 1430 &rbio->work);
1433} 1431}
1434 1432
1435/* 1433/*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1667 plug = container_of(cb, struct btrfs_plug_cb, cb); 1665 plug = container_of(cb, struct btrfs_plug_cb, cb);
1668 1666
1669 if (from_schedule) { 1667 if (from_schedule) {
1670 plug->work.flags = 0; 1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1671 plug->work.func = unplug_work; 1669 btrfs_queue_work(plug->info->rmw_workers,
1672 btrfs_queue_worker(&plug->info->rmw_workers, 1670 &plug->work);
1673 &plug->work);
1674 return; 1671 return;
1675 } 1672 }
1676 run_plug(plug); 1673 run_plug(plug);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3e..30947f923620 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
793 /* FIXME we cannot handle this properly right now */ 793 /* FIXME we cannot handle this properly right now */
794 BUG(); 794 BUG();
795 } 795 }
796 rmw->work.func = reada_start_machine_worker; 796 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
797 rmw->fs_info = fs_info; 797 rmw->fs_info = fs_info;
798 798
799 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); 799 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
800} 800}
801 801
802#ifdef DEBUG 802#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..7f92ab1daa87 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2317,7 +2317,6 @@ void free_reloc_roots(struct list_head *list)
2317static noinline_for_stack 2317static noinline_for_stack
2318int merge_reloc_roots(struct reloc_control *rc) 2318int merge_reloc_roots(struct reloc_control *rc)
2319{ 2319{
2320 struct btrfs_trans_handle *trans;
2321 struct btrfs_root *root; 2320 struct btrfs_root *root;
2322 struct btrfs_root *reloc_root; 2321 struct btrfs_root *reloc_root;
2323 u64 last_snap; 2322 u64 last_snap;
@@ -2375,26 +2374,6 @@ again:
2375 list_add_tail(&reloc_root->root_list, 2374 list_add_tail(&reloc_root->root_list,
2376 &reloc_roots); 2375 &reloc_roots);
2377 goto out; 2376 goto out;
2378 } else if (!ret) {
2379 /*
2380 * recover the last snapshot tranid to avoid
2381 * the space balance break NOCOW.
2382 */
2383 root = read_fs_root(rc->extent_root->fs_info,
2384 objectid);
2385 if (IS_ERR(root))
2386 continue;
2387
2388 trans = btrfs_join_transaction(root);
2389 BUG_ON(IS_ERR(trans));
2390
2391 /* Check if the fs/file tree was snapshoted or not. */
2392 if (btrfs_root_last_snapshot(&root->root_item) ==
2393 otransid - 1)
2394 btrfs_set_root_last_snapshot(&root->root_item,
2395 last_snap);
2396
2397 btrfs_end_transaction(trans, root);
2398 } 2377 }
2399 } 2378 }
2400 2379
@@ -4248,7 +4227,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", 4227 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4249 rc->block_group->key.objectid, rc->block_group->flags); 4228 rc->block_group->key.objectid, rc->block_group->flags);
4250 4229
4251 ret = btrfs_start_delalloc_roots(fs_info, 0); 4230 ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
4252 if (ret < 0) { 4231 if (ret < 0) {
4253 err = ret; 4232 err = ret;
4254 goto out; 4233 goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059de..38bb47e7d6b1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/err.h>
19#include <linux/uuid.h> 20#include <linux/uuid.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
271 key.offset++; 272 key.offset++;
272 273
273 root = btrfs_read_fs_root(tree_root, &root_key); 274 root = btrfs_read_fs_root(tree_root, &root_key);
274 err = PTR_RET(root); 275 err = PTR_ERR_OR_ZERO(root);
275 if (err && err != -ENOENT) { 276 if (err && err != -ENOENT) {
276 break; 277 break;
277 } else if (err == -ENOENT) { 278 } else if (err == -ENOENT) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282ee..0be77993378e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
315 atomic_inc(&fs_info->scrubs_running); 315 atomic_inc(&fs_info->scrubs_running);
316 atomic_inc(&fs_info->scrubs_paused); 316 atomic_inc(&fs_info->scrubs_paused);
317 mutex_unlock(&fs_info->scrub_lock); 317 mutex_unlock(&fs_info->scrub_lock);
318
319 /*
 320	 * The @scrubs_running == @scrubs_paused check inside
 321	 * wait_event() is not an atomic operation, which means
 322	 * we may inc/dec @scrubs_running/@scrubs_paused at any
 323	 * time. Wake up @scrub_pause_wait as often as we can so
 324	 * that a blocked transaction commit waits less.
325 */
326 wake_up(&fs_info->scrub_pause_wait);
327
318 atomic_inc(&sctx->workers_pending); 328 atomic_inc(&sctx->workers_pending);
319} 329}
320 330
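
The comment above relies on the usual wait_event()/wake_up() contract: waiters re-check the condition after every wake-up, so waking too often is harmless, while a missed wake-up could leave a transaction commit blocked. A stripped-down sketch of that pairing, with hypothetical counters standing in for @scrubs_running/@scrubs_paused:

#include <linux/wait.h>
#include <linux/atomic.h>

static atomic_t running, paused;
static DECLARE_WAIT_QUEUE_HEAD(pause_wait);

static void worker_pause(void)
{
	atomic_inc(&paused);
	/* Eager wake-up: the waiter re-evaluates the condition, so a
	 * spurious wake costs little; a missed one would stall it. */
	wake_up(&pause_wait);
}

static void commit_wait_for_pause(void)
{
	wait_event(pause_wait,
		   atomic_read(&running) == atomic_read(&paused));
}
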
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
418 sbio->index = i; 428 sbio->index = i;
419 sbio->sctx = sctx; 429 sbio->sctx = sctx;
420 sbio->page_count = 0; 430 sbio->page_count = 0;
421 sbio->work.func = scrub_bio_end_io_worker; 431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
432 NULL, NULL);
422 433
423 if (i != SCRUB_BIOS_PER_SCTX - 1) 434 if (i != SCRUB_BIOS_PER_SCTX - 1)
424 sctx->bios[i]->next_free = i + 1; 435 sctx->bios[i]->next_free = i + 1;
@@ -987,9 +998,10 @@ nodatasum_case:
987 fixup_nodatasum->root = fs_info->extent_root; 998 fixup_nodatasum->root = fs_info->extent_root;
988 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 999 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
989 scrub_pending_trans_workers_inc(sctx); 1000 scrub_pending_trans_workers_inc(sctx);
990 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 1001 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
991 btrfs_queue_worker(&fs_info->scrub_workers, 1002 NULL, NULL);
992 &fixup_nodatasum->work); 1003 btrfs_queue_work(fs_info->scrub_workers,
1004 &fixup_nodatasum->work);
993 goto out; 1005 goto out;
994 } 1006 }
995 1007
@@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1603 sbio->err = err; 1615 sbio->err = err;
1604 sbio->bio = bio; 1616 sbio->bio = bio;
1605 1617
1606 sbio->work.func = scrub_wr_bio_end_io_worker; 1618 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1607 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); 1619 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1608} 1620}
1609 1621
1610static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1622static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
2072 sbio->err = err; 2084 sbio->err = err;
2073 sbio->bio = bio; 2085 sbio->bio = bio;
2074 2086
2075 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 2087 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2076} 2088}
2077 2089
2078static void scrub_bio_end_io_worker(struct btrfs_work *work) 2090static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2223,6 +2235,47 @@ behind_scrub_pages:
2223 return 0; 2235 return 0;
2224} 2236}
2225 2237
2238/*
 2239 * Given a physical address, this will calculate its
 2240 * logical offset. If this is a parity stripe, it will return
 2241 * the leftmost data stripe's logical offset.
 2242 *
 2243 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2244 */
2245static int get_raid56_logic_offset(u64 physical, int num,
2246 struct map_lookup *map, u64 *offset)
2247{
2248 int i;
2249 int j = 0;
2250 u64 stripe_nr;
2251 u64 last_offset;
2252 int stripe_index;
2253 int rot;
2254
2255 last_offset = (physical - map->stripes[num].physical) *
2256 nr_data_stripes(map);
2257 *offset = last_offset;
2258 for (i = 0; i < nr_data_stripes(map); i++) {
2259 *offset = last_offset + i * map->stripe_len;
2260
2261 stripe_nr = *offset;
2262 do_div(stripe_nr, map->stripe_len);
2263 do_div(stripe_nr, nr_data_stripes(map));
2264
2265 /* Work out the disk rotation on this stripe-set */
2266 rot = do_div(stripe_nr, map->num_stripes);
 2267 /* calculate which stripe this data is located on */
2268 rot += i;
2269 stripe_index = rot % map->num_stripes;
2270 if (stripe_index == num)
2271 return 0;
2272 if (stripe_index < num)
2273 j++;
2274 }
2275 *offset = last_offset + j * map->stripe_len;
2276 return 1;
2277}
2278
2226static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2279static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2227 struct map_lookup *map, 2280 struct map_lookup *map,
2228 struct btrfs_device *scrub_dev, 2281 struct btrfs_device *scrub_dev,
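
The rotation math in get_raid56_logic_offset() can be checked in userspace. The sketch below re-derives it with plain division in place of do_div() and a chunk-relative physical offset, assuming an illustrative 3-device RAID5 layout with a 64K stripe_len: for device 2, physical 0 lands on a parity stripe, while physical 64K resolves to logical 192K, the second data stripe of full stripe 1.

#include <stdio.h>
#include <stdint.h>

#define NUM_STRIPES	3		/* devices in the chunk */
#define NR_DATA		2		/* data stripes per full stripe (RAID5) */
#define STRIPE_LEN	(64 * 1024ULL)

static int raid56_logic_offset(uint64_t physical, int num, uint64_t *offset)
{
	/* the kernel code subtracts map->stripes[num].physical first */
	uint64_t last_offset = physical * NR_DATA;
	int i, j = 0;

	for (i = 0; i < NR_DATA; i++) {
		uint64_t stripe_nr, rot;

		*offset = last_offset + i * STRIPE_LEN;
		stripe_nr = *offset / STRIPE_LEN / NR_DATA;
		rot = (stripe_nr % NUM_STRIPES + i) % NUM_STRIPES;
		if (rot == (uint64_t)num)
			return 0;	/* data stripe on this device */
		if (rot < (uint64_t)num)
			j++;
	}
	*offset = last_offset + j * STRIPE_LEN;
	return 1;			/* parity stripe on this device */
}

int main(void)
{
	uint64_t off;
	int ret;

	ret = raid56_logic_offset(0, 2, &off);
	/* parity of full stripe 0: prints "ret 1 off 131072" */
	printf("phys 0   -> ret %d off %llu\n", ret, (unsigned long long)off);

	ret = raid56_logic_offset(STRIPE_LEN, 2, &off);
	/* data stripe: prints "ret 0 off 196608" (logical 192K) */
	printf("phys 64K -> ret %d off %llu\n", ret, (unsigned long long)off);
	return 0;
}
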
@@ -2244,6 +2297,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2244 u64 physical; 2297 u64 physical;
2245 u64 logical; 2298 u64 logical;
2246 u64 logic_end; 2299 u64 logic_end;
2300 u64 physical_end;
2247 u64 generation; 2301 u64 generation;
2248 int mirror_num; 2302 int mirror_num;
2249 struct reada_control *reada1; 2303 struct reada_control *reada1;
@@ -2257,16 +2311,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2257 u64 extent_len; 2311 u64 extent_len;
2258 struct btrfs_device *extent_dev; 2312 struct btrfs_device *extent_dev;
2259 int extent_mirror_num; 2313 int extent_mirror_num;
2260 int stop_loop; 2314 int stop_loop = 0;
2261
2262 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2263 BTRFS_BLOCK_GROUP_RAID6)) {
2264 if (num >= nr_data_stripes(map)) {
2265 return 0;
2266 }
2267 }
2268 2315
2269 nstripes = length; 2316 nstripes = length;
2317 physical = map->stripes[num].physical;
2270 offset = 0; 2318 offset = 0;
2271 do_div(nstripes, map->stripe_len); 2319 do_div(nstripes, map->stripe_len);
2272 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2320 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
@@ -2284,6 +2332,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2284 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2332 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2285 increment = map->stripe_len; 2333 increment = map->stripe_len;
2286 mirror_num = num % map->num_stripes + 1; 2334 mirror_num = num % map->num_stripes + 1;
2335 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2336 BTRFS_BLOCK_GROUP_RAID6)) {
2337 get_raid56_logic_offset(physical, num, map, &offset);
2338 increment = map->stripe_len * nr_data_stripes(map);
2339 mirror_num = 1;
2287 } else { 2340 } else {
2288 increment = map->stripe_len; 2341 increment = map->stripe_len;
2289 mirror_num = 1; 2342 mirror_num = 1;
@@ -2307,7 +2360,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2307 * to not hold off transaction commits 2360 * to not hold off transaction commits
2308 */ 2361 */
2309 logical = base + offset; 2362 logical = base + offset;
2310 2363 physical_end = physical + nstripes * map->stripe_len;
2364 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2365 BTRFS_BLOCK_GROUP_RAID6)) {
2366 get_raid56_logic_offset(physical_end, num,
2367 map, &logic_end);
2368 logic_end += base;
2369 } else {
2370 logic_end = logical + increment * nstripes;
2371 }
2311 wait_event(sctx->list_wait, 2372 wait_event(sctx->list_wait,
2312 atomic_read(&sctx->bios_in_flight) == 0); 2373 atomic_read(&sctx->bios_in_flight) == 0);
2313 scrub_blocked_if_needed(fs_info); 2374 scrub_blocked_if_needed(fs_info);
@@ -2316,7 +2377,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2316 key_start.objectid = logical; 2377 key_start.objectid = logical;
2317 key_start.type = BTRFS_EXTENT_ITEM_KEY; 2378 key_start.type = BTRFS_EXTENT_ITEM_KEY;
2318 key_start.offset = (u64)0; 2379 key_start.offset = (u64)0;
2319 key_end.objectid = base + offset + nstripes * increment; 2380 key_end.objectid = logic_end;
2320 key_end.type = BTRFS_METADATA_ITEM_KEY; 2381 key_end.type = BTRFS_METADATA_ITEM_KEY;
2321 key_end.offset = (u64)-1; 2382 key_end.offset = (u64)-1;
2322 reada1 = btrfs_reada_add(root, &key_start, &key_end); 2383 reada1 = btrfs_reada_add(root, &key_start, &key_end);
@@ -2326,7 +2387,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2326 key_start.offset = logical; 2387 key_start.offset = logical;
2327 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 2388 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2328 key_end.type = BTRFS_EXTENT_CSUM_KEY; 2389 key_end.type = BTRFS_EXTENT_CSUM_KEY;
2329 key_end.offset = base + offset + nstripes * increment; 2390 key_end.offset = logic_end;
2330 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); 2391 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2331 2392
2332 if (!IS_ERR(reada1)) 2393 if (!IS_ERR(reada1))
@@ -2344,11 +2405,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2344 /* 2405 /*
2345 * now find all extents for each stripe and scrub them 2406 * now find all extents for each stripe and scrub them
2346 */ 2407 */
2347 logical = base + offset;
2348 physical = map->stripes[num].physical;
2349 logic_end = logical + increment * nstripes;
2350 ret = 0; 2408 ret = 0;
2351 while (logical < logic_end) { 2409 while (physical < physical_end) {
 2410 /* for raid56, we skip the parity stripe */
2411 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2412 BTRFS_BLOCK_GROUP_RAID6)) {
2413 ret = get_raid56_logic_offset(physical, num,
2414 map, &logical);
2415 logical += base;
2416 if (ret)
2417 goto skip;
2418 }
2352 /* 2419 /*
2353 * canceled? 2420 * canceled?
2354 */ 2421 */
@@ -2492,15 +2559,29 @@ again:
2492 scrub_free_csums(sctx); 2559 scrub_free_csums(sctx);
2493 if (extent_logical + extent_len < 2560 if (extent_logical + extent_len <
2494 key.objectid + bytes) { 2561 key.objectid + bytes) {
2495 logical += increment; 2562 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2496 physical += map->stripe_len; 2563 BTRFS_BLOCK_GROUP_RAID6)) {
2497 2564 /*
 2565 * loop until we find the next data stripe
2566 * or we have finished all stripes.
2567 */
2568 do {
2569 physical += map->stripe_len;
2570 ret = get_raid56_logic_offset(
2571 physical, num,
2572 map, &logical);
2573 logical += base;
2574 } while (physical < physical_end && ret);
2575 } else {
2576 physical += map->stripe_len;
2577 logical += increment;
2578 }
2498 if (logical < key.objectid + bytes) { 2579 if (logical < key.objectid + bytes) {
2499 cond_resched(); 2580 cond_resched();
2500 goto again; 2581 goto again;
2501 } 2582 }
2502 2583
2503 if (logical >= logic_end) { 2584 if (physical >= physical_end) {
2504 stop_loop = 1; 2585 stop_loop = 1;
2505 break; 2586 break;
2506 } 2587 }
@@ -2509,6 +2590,7 @@ next:
2509 path->slots[0]++; 2590 path->slots[0]++;
2510 } 2591 }
2511 btrfs_release_path(path); 2592 btrfs_release_path(path);
2593skip:
2512 logical += increment; 2594 logical += increment;
2513 physical += map->stripe_len; 2595 physical += map->stripe_len;
2514 spin_lock(&sctx->stat_lock); 2596 spin_lock(&sctx->stat_lock);
@@ -2686,10 +2768,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2686 2768
2687 wait_event(sctx->list_wait, 2769 wait_event(sctx->list_wait,
2688 atomic_read(&sctx->bios_in_flight) == 0); 2770 atomic_read(&sctx->bios_in_flight) == 0);
2689 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2771 atomic_inc(&fs_info->scrubs_paused);
2772 wake_up(&fs_info->scrub_pause_wait);
2773
2774 /*
 2775 * This wait must happen before we decrease @scrub_paused, to
 2776 * make sure we don't block transaction commit while we are
 2777 * waiting for pending workers to finish.
2778 */
2690 wait_event(sctx->list_wait, 2779 wait_event(sctx->list_wait,
2691 atomic_read(&sctx->workers_pending) == 0); 2780 atomic_read(&sctx->workers_pending) == 0);
2692 scrub_blocked_if_needed(fs_info); 2781 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2782
2783 mutex_lock(&fs_info->scrub_lock);
2784 __scrub_blocked_if_needed(fs_info);
2785 atomic_dec(&fs_info->scrubs_paused);
2786 mutex_unlock(&fs_info->scrub_lock);
2787 wake_up(&fs_info->scrub_pause_wait);
2693 2788
2694 btrfs_put_block_group(cache); 2789 btrfs_put_block_group(cache);
2695 if (ret) 2790 if (ret)
@@ -2757,33 +2852,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2757 int is_dev_replace) 2852 int is_dev_replace)
2758{ 2853{
2759 int ret = 0; 2854 int ret = 0;
2855 int flags = WQ_FREEZABLE | WQ_UNBOUND;
2856 int max_active = fs_info->thread_pool_size;
2760 2857
2761 if (fs_info->scrub_workers_refcnt == 0) { 2858 if (fs_info->scrub_workers_refcnt == 0) {
2762 if (is_dev_replace) 2859 if (is_dev_replace)
2763 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, 2860 fs_info->scrub_workers =
2764 &fs_info->generic_worker); 2861 btrfs_alloc_workqueue("btrfs-scrub", flags,
2862 1, 4);
2765 else 2863 else
2766 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2864 fs_info->scrub_workers =
2767 fs_info->thread_pool_size, 2865 btrfs_alloc_workqueue("btrfs-scrub", flags,
2768 &fs_info->generic_worker); 2866 max_active, 4);
2769 fs_info->scrub_workers.idle_thresh = 4; 2867 if (!fs_info->scrub_workers) {
2770 ret = btrfs_start_workers(&fs_info->scrub_workers); 2868 ret = -ENOMEM;
2771 if (ret)
2772 goto out; 2869 goto out;
2773 btrfs_init_workers(&fs_info->scrub_wr_completion_workers, 2870 }
2774 "scrubwrc", 2871 fs_info->scrub_wr_completion_workers =
2775 fs_info->thread_pool_size, 2872 btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
2776 &fs_info->generic_worker); 2873 max_active, 2);
2777 fs_info->scrub_wr_completion_workers.idle_thresh = 2; 2874 if (!fs_info->scrub_wr_completion_workers) {
2778 ret = btrfs_start_workers( 2875 ret = -ENOMEM;
2779 &fs_info->scrub_wr_completion_workers);
2780 if (ret)
2781 goto out; 2876 goto out;
2782 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, 2877 }
2783 &fs_info->generic_worker); 2878 fs_info->scrub_nocow_workers =
2784 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); 2879 btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
2785 if (ret) 2880 if (!fs_info->scrub_nocow_workers) {
2881 ret = -ENOMEM;
2786 goto out; 2882 goto out;
2883 }
2787 } 2884 }
2788 ++fs_info->scrub_workers_refcnt; 2885 ++fs_info->scrub_workers_refcnt;
2789out: 2886out:
@@ -2793,9 +2890,9 @@ out:
2793static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 2890static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2794{ 2891{
2795 if (--fs_info->scrub_workers_refcnt == 0) { 2892 if (--fs_info->scrub_workers_refcnt == 0) {
2796 btrfs_stop_workers(&fs_info->scrub_workers); 2893 btrfs_destroy_workqueue(fs_info->scrub_workers);
2797 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); 2894 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
2798 btrfs_stop_workers(&fs_info->scrub_nocow_workers); 2895 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
2799 } 2896 }
2800 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2897 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2801} 2898}
@@ -3106,10 +3203,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3106 nocow_ctx->len = len; 3203 nocow_ctx->len = len;
3107 nocow_ctx->mirror_num = mirror_num; 3204 nocow_ctx->mirror_num = mirror_num;
3108 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3205 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3109 nocow_ctx->work.func = copy_nocow_pages_worker; 3206 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
3110 INIT_LIST_HEAD(&nocow_ctx->inodes); 3207 INIT_LIST_HEAD(&nocow_ctx->inodes);
3111 btrfs_queue_worker(&fs_info->scrub_nocow_workers, 3208 btrfs_queue_work(fs_info->scrub_nocow_workers,
3112 &nocow_ctx->work); 3209 &nocow_ctx->work);
3113 3210
3114 return 0; 3211 return 0;
3115} 3212}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9dde9717c1b9..484aacac2c89 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -51,15 +51,18 @@ struct fs_path {
51 struct { 51 struct {
52 char *start; 52 char *start;
53 char *end; 53 char *end;
54 char *prepared;
55 54
56 char *buf; 55 char *buf;
57 int buf_len; 56 unsigned short buf_len:15;
58 unsigned int reversed:1; 57 unsigned short reversed:1;
59 unsigned int virtual_mem:1;
60 char inline_buf[]; 58 char inline_buf[];
61 }; 59 };
62 char pad[PAGE_SIZE]; 60 /*
 61 * Average path length does not exceed 200 bytes, so we'll have
 62 * better packing in the slab and a higher chance of satisfying
 63 * an allocation later during send.
64 */
65 char pad[256];
63 }; 66 };
64}; 67};
65#define FS_PATH_INLINE_SIZE \ 68#define FS_PATH_INLINE_SIZE \
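
fs_path keeps short paths in inline_buf by overlaying a fixed-size pad on the header, so no second allocation happens until a path outgrows the pad; shrinking pad from PAGE_SIZE to 256 bytes is what buys the better slab packing described in the comment. A minimal userspace sketch of the same union layout (hypothetical names; the anonymous members and the flexible array inside a union follow the kernel's GNU C usage):

#include <stdio.h>
#include <stddef.h>

struct small_path {
	union {
		struct {
			char *start;
			char *end;
			char *buf;
			unsigned short buf_len:15;
			unsigned short reversed:1;
			char inline_buf[];
		};
		char pad[256];
	};
};

#define INLINE_SIZE \
	(sizeof(struct small_path) - offsetof(struct small_path, inline_buf))

int main(void)
{
	struct small_path p;

	p.buf = p.inline_buf;	/* short paths need no heap allocation */
	p.buf_len = INLINE_SIZE;
	/* on LP64 this prints: struct size 256, inline capacity 230 */
	printf("struct size %zu, inline capacity %zu\n",
	       sizeof(p), (size_t)INLINE_SIZE);
	return 0;
}
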
@@ -109,6 +112,7 @@ struct send_ctx {
109 int cur_inode_deleted; 112 int cur_inode_deleted;
110 u64 cur_inode_size; 113 u64 cur_inode_size;
111 u64 cur_inode_mode; 114 u64 cur_inode_mode;
115 u64 cur_inode_rdev;
112 u64 cur_inode_last_extent; 116 u64 cur_inode_last_extent;
113 117
114 u64 send_progress; 118 u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
120 struct list_head name_cache_list; 124 struct list_head name_cache_list;
121 int name_cache_size; 125 int name_cache_size;
122 126
127 struct file_ra_state ra;
128
123 char *read_buf; 129 char *read_buf;
124 130
125 /* 131 /*
@@ -175,6 +181,47 @@ struct send_ctx {
175 * own move/rename can be performed. 181 * own move/rename can be performed.
176 */ 182 */
177 struct rb_root waiting_dir_moves; 183 struct rb_root waiting_dir_moves;
184
185 /*
186 * A directory that is going to be rm'ed might have a child directory
187 * which is in the pending directory moves index above. In this case,
188 * the directory can only be removed after the move/rename of its child
189 * is performed. Example:
190 *
191 * Parent snapshot:
192 *
193 * . (ino 256)
194 * |-- a/ (ino 257)
195 * |-- b/ (ino 258)
196 * |-- c/ (ino 259)
197 * | |-- x/ (ino 260)
198 * |
199 * |-- y/ (ino 261)
200 *
201 * Send snapshot:
202 *
203 * . (ino 256)
204 * |-- a/ (ino 257)
205 * |-- b/ (ino 258)
206 * |-- YY/ (ino 261)
207 * |-- x/ (ino 260)
208 *
209 * Sequence of steps that lead to the send snapshot:
210 * rm -f /a/b/c/foo.txt
211 * mv /a/b/y /a/b/YY
212 * mv /a/b/c/x /a/b/YY
213 * rmdir /a/b/c
214 *
215 * When the child is processed, its move/rename is delayed until its
216 * parent is processed (as explained above), but all other operations
 217 * like updating utimes, chown, chgrp, etc., are performed and the paths
218 * that it uses for those operations must use the orphanized name of
219 * its parent (the directory we're going to rm later), so we need to
220 * memorize that name.
221 *
222 * Indexed by the inode number of the directory to be deleted.
223 */
224 struct rb_root orphan_dirs;
178}; 225};
179 226
180struct pending_dir_move { 227struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
189struct waiting_dir_move { 236struct waiting_dir_move {
190 struct rb_node node; 237 struct rb_node node;
191 u64 ino; 238 u64 ino;
239 /*
 240 * There might be a directory that could not be removed because it
 241 * was waiting for this directory inode to be moved first. Therefore,
 242 * after this directory is moved, we can try to rmdir the inode rmdir_ino.
243 */
244 u64 rmdir_ino;
245};
246
247struct orphan_dir_info {
248 struct rb_node node;
249 u64 ino;
250 u64 gen;
192}; 251};
193 252
194struct name_cache_entry { 253struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
214 273
215static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 274static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
216 275
276static struct waiting_dir_move *
277get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
278
279static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
280
217static int need_send_hole(struct send_ctx *sctx) 281static int need_send_hole(struct send_ctx *sctx)
218{ 282{
219 return (sctx->parent_root && !sctx->cur_inode_new && 283 return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
242 if (!p) 306 if (!p)
243 return NULL; 307 return NULL;
244 p->reversed = 0; 308 p->reversed = 0;
245 p->virtual_mem = 0;
246 p->buf = p->inline_buf; 309 p->buf = p->inline_buf;
247 p->buf_len = FS_PATH_INLINE_SIZE; 310 p->buf_len = FS_PATH_INLINE_SIZE;
248 fs_path_reset(p); 311 fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
265{ 328{
266 if (!p) 329 if (!p)
267 return; 330 return;
268 if (p->buf != p->inline_buf) { 331 if (p->buf != p->inline_buf)
269 if (p->virtual_mem) 332 kfree(p->buf);
270 vfree(p->buf);
271 else
272 kfree(p->buf);
273 }
274 kfree(p); 333 kfree(p);
275} 334}
276 335
@@ -290,42 +349,33 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
290 if (p->buf_len >= len) 349 if (p->buf_len >= len)
291 return 0; 350 return 0;
292 351
352 if (len > PATH_MAX) {
353 WARN_ON(1);
354 return -ENOMEM;
355 }
356
293 path_len = p->end - p->start; 357 path_len = p->end - p->start;
294 old_buf_len = p->buf_len; 358 old_buf_len = p->buf_len;
295 len = PAGE_ALIGN(len);
296 359
360 /*
 361	 * The first time the inline_buf does not suffice
362 */
297 if (p->buf == p->inline_buf) { 363 if (p->buf == p->inline_buf) {
298 tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); 364 tmp_buf = kmalloc(len, GFP_NOFS);
299 if (!tmp_buf) { 365 if (tmp_buf)
300 tmp_buf = vmalloc(len); 366 memcpy(tmp_buf, p->buf, old_buf_len);
301 if (!tmp_buf)
302 return -ENOMEM;
303 p->virtual_mem = 1;
304 }
305 memcpy(tmp_buf, p->buf, p->buf_len);
306 p->buf = tmp_buf;
307 p->buf_len = len;
308 } else { 367 } else {
309 if (p->virtual_mem) { 368 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
310 tmp_buf = vmalloc(len);
311 if (!tmp_buf)
312 return -ENOMEM;
313 memcpy(tmp_buf, p->buf, p->buf_len);
314 vfree(p->buf);
315 } else {
316 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
317 if (!tmp_buf) {
318 tmp_buf = vmalloc(len);
319 if (!tmp_buf)
320 return -ENOMEM;
321 memcpy(tmp_buf, p->buf, p->buf_len);
322 kfree(p->buf);
323 p->virtual_mem = 1;
324 }
325 }
326 p->buf = tmp_buf;
327 p->buf_len = len;
328 } 369 }
370 if (!tmp_buf)
371 return -ENOMEM;
372 p->buf = tmp_buf;
373 /*
 374	 * The real size of the buffer is bigger, which will let the fast
 375	 * path happen most of the time.
376 */
377 p->buf_len = ksize(p->buf);
378
329 if (p->reversed) { 379 if (p->reversed) {
330 tmp_buf = p->buf + old_buf_len - path_len - 1; 380 tmp_buf = p->buf + old_buf_len - path_len - 1;
331 p->end = p->buf + p->buf_len - 1; 381 p->end = p->buf + p->buf_len - 1;
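
The rewritten fs_path_ensure_buf() drops the vmalloc fallback, refuses anything above PATH_MAX, and records ksize(), the slab's real allocation size, so subsequent calls usually take the early-return fast path. A userspace analogue of that growth strategy, using glibc's malloc_usable_size() in place of ksize(); the helper name and error convention are illustrative:

#include <stdlib.h>
#include <malloc.h>	/* malloc_usable_size(), glibc-specific */

#define MY_PATH_MAX 4096

/* Grow *buf to at least len bytes and record the allocator's real
 * capacity. realloc(NULL, len) behaves like malloc, so this also
 * covers the first allocation. */
static int ensure_buf(char **buf, size_t *buf_len, size_t len)
{
	char *tmp;

	if (*buf_len >= len)
		return 0;	/* fast path */
	if (len > MY_PATH_MAX)
		return -1;	/* the kernel code returns -ENOMEM here */

	tmp = realloc(*buf, len);
	if (!tmp)
		return -1;
	*buf = tmp;
	*buf_len = malloc_usable_size(tmp);	/* real capacity >= len */
	return 0;
}

The kernel version additionally handles the first growth out of inline_buf with kmalloc() plus memcpy(), since an inline buffer cannot be krealloc()'d.
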
@@ -338,7 +388,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
338 return 0; 388 return 0;
339} 389}
340 390
341static int fs_path_prepare_for_add(struct fs_path *p, int name_len) 391static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
392 char **prepared)
342{ 393{
343 int ret; 394 int ret;
344 int new_len; 395 int new_len;
@@ -354,11 +405,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
354 if (p->start != p->end) 405 if (p->start != p->end)
355 *--p->start = '/'; 406 *--p->start = '/';
356 p->start -= name_len; 407 p->start -= name_len;
357 p->prepared = p->start; 408 *prepared = p->start;
358 } else { 409 } else {
359 if (p->start != p->end) 410 if (p->start != p->end)
360 *p->end++ = '/'; 411 *p->end++ = '/';
361 p->prepared = p->end; 412 *prepared = p->end;
362 p->end += name_len; 413 p->end += name_len;
363 *p->end = 0; 414 *p->end = 0;
364 } 415 }
@@ -370,12 +421,12 @@ out:
370static int fs_path_add(struct fs_path *p, const char *name, int name_len) 421static int fs_path_add(struct fs_path *p, const char *name, int name_len)
371{ 422{
372 int ret; 423 int ret;
424 char *prepared;
373 425
374 ret = fs_path_prepare_for_add(p, name_len); 426 ret = fs_path_prepare_for_add(p, name_len, &prepared);
375 if (ret < 0) 427 if (ret < 0)
376 goto out; 428 goto out;
377 memcpy(p->prepared, name, name_len); 429 memcpy(prepared, name, name_len);
378 p->prepared = NULL;
379 430
380out: 431out:
381 return ret; 432 return ret;
@@ -384,12 +435,12 @@ out:
384static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) 435static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
385{ 436{
386 int ret; 437 int ret;
438 char *prepared;
387 439
388 ret = fs_path_prepare_for_add(p, p2->end - p2->start); 440 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
389 if (ret < 0) 441 if (ret < 0)
390 goto out; 442 goto out;
391 memcpy(p->prepared, p2->start, p2->end - p2->start); 443 memcpy(prepared, p2->start, p2->end - p2->start);
392 p->prepared = NULL;
393 444
394out: 445out:
395 return ret; 446 return ret;
@@ -400,13 +451,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
400 unsigned long off, int len) 451 unsigned long off, int len)
401{ 452{
402 int ret; 453 int ret;
454 char *prepared;
403 455
404 ret = fs_path_prepare_for_add(p, len); 456 ret = fs_path_prepare_for_add(p, len, &prepared);
405 if (ret < 0) 457 if (ret < 0)
406 goto out; 458 goto out;
407 459
408 read_extent_buffer(eb, p->prepared, off, len); 460 read_extent_buffer(eb, prepared, off, len);
409 p->prepared = NULL;
410 461
411out: 462out:
412 return ret; 463 return ret;
@@ -450,6 +501,7 @@ static struct btrfs_path *alloc_path_for_send(void)
450 return NULL; 501 return NULL;
451 path->search_commit_root = 1; 502 path->search_commit_root = 1;
452 path->skip_locking = 1; 503 path->skip_locking = 1;
504 path->need_commit_sem = 1;
453 return path; 505 return path;
454} 506}
455 507
@@ -728,29 +780,22 @@ out:
728/* 780/*
729 * Helper function to retrieve some fields from an inode item. 781 * Helper function to retrieve some fields from an inode item.
730 */ 782 */
731static int get_inode_info(struct btrfs_root *root, 783static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
732 u64 ino, u64 *size, u64 *gen, 784 u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
733 u64 *mode, u64 *uid, u64 *gid, 785 u64 *gid, u64 *rdev)
734 u64 *rdev)
735{ 786{
736 int ret; 787 int ret;
737 struct btrfs_inode_item *ii; 788 struct btrfs_inode_item *ii;
738 struct btrfs_key key; 789 struct btrfs_key key;
739 struct btrfs_path *path;
740
741 path = alloc_path_for_send();
742 if (!path)
743 return -ENOMEM;
744 790
745 key.objectid = ino; 791 key.objectid = ino;
746 key.type = BTRFS_INODE_ITEM_KEY; 792 key.type = BTRFS_INODE_ITEM_KEY;
747 key.offset = 0; 793 key.offset = 0;
748 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 794 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
749 if (ret < 0)
750 goto out;
751 if (ret) { 795 if (ret) {
752 ret = -ENOENT; 796 if (ret > 0)
753 goto out; 797 ret = -ENOENT;
798 return ret;
754 } 799 }
755 800
756 ii = btrfs_item_ptr(path->nodes[0], path->slots[0], 801 ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -768,7 +813,22 @@ static int get_inode_info(struct btrfs_root *root,
768 if (rdev) 813 if (rdev)
769 *rdev = btrfs_inode_rdev(path->nodes[0], ii); 814 *rdev = btrfs_inode_rdev(path->nodes[0], ii);
770 815
771out: 816 return ret;
817}
818
819static int get_inode_info(struct btrfs_root *root,
820 u64 ino, u64 *size, u64 *gen,
821 u64 *mode, u64 *uid, u64 *gid,
822 u64 *rdev)
823{
824 struct btrfs_path *path;
825 int ret;
826
827 path = alloc_path_for_send();
828 if (!path)
829 return -ENOMEM;
830 ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
831 rdev);
772 btrfs_free_path(path); 832 btrfs_free_path(path);
773 return ret; 833 return ret;
774} 834}
@@ -915,9 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
915 struct btrfs_dir_item *di; 975 struct btrfs_dir_item *di;
916 struct btrfs_key di_key; 976 struct btrfs_key di_key;
917 char *buf = NULL; 977 char *buf = NULL;
918 char *buf2 = NULL; 978 const int buf_len = PATH_MAX;
919 int buf_len;
920 int buf_virtual = 0;
921 u32 name_len; 979 u32 name_len;
922 u32 data_len; 980 u32 data_len;
923 u32 cur; 981 u32 cur;
@@ -927,7 +985,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
927 int num; 985 int num;
928 u8 type; 986 u8 type;
929 987
930 buf_len = PAGE_SIZE;
931 buf = kmalloc(buf_len, GFP_NOFS); 988 buf = kmalloc(buf_len, GFP_NOFS);
932 if (!buf) { 989 if (!buf) {
933 ret = -ENOMEM; 990 ret = -ENOMEM;
@@ -949,30 +1006,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
949 type = btrfs_dir_type(eb, di); 1006 type = btrfs_dir_type(eb, di);
950 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1007 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
951 1008
1009 /*
1010 * Path too long
1011 */
952 if (name_len + data_len > buf_len) { 1012 if (name_len + data_len > buf_len) {
953 buf_len = PAGE_ALIGN(name_len + data_len); 1013 ret = -ENAMETOOLONG;
954 if (buf_virtual) { 1014 goto out;
955 buf2 = vmalloc(buf_len);
956 if (!buf2) {
957 ret = -ENOMEM;
958 goto out;
959 }
960 vfree(buf);
961 } else {
962 buf2 = krealloc(buf, buf_len, GFP_NOFS);
963 if (!buf2) {
964 buf2 = vmalloc(buf_len);
965 if (!buf2) {
966 ret = -ENOMEM;
967 goto out;
968 }
969 kfree(buf);
970 buf_virtual = 1;
971 }
972 }
973
974 buf = buf2;
975 buf2 = NULL;
976 } 1015 }
977 1016
978 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1017 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1034,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 } 1034 }
996 1035
997out: 1036out:
998 if (buf_virtual) 1037 kfree(buf);
999 vfree(buf);
1000 else
1001 kfree(buf);
1002 return ret; 1038 return ret;
1003} 1039}
1004 1040
@@ -1066,6 +1102,7 @@ out:
1066struct backref_ctx { 1102struct backref_ctx {
1067 struct send_ctx *sctx; 1103 struct send_ctx *sctx;
1068 1104
1105 struct btrfs_path *path;
1069 /* number of total found references */ 1106 /* number of total found references */
1070 u64 found; 1107 u64 found;
1071 1108
@@ -1136,8 +1173,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1136 * There are inodes that have extents that lie behind its i_size. Don't 1173 * There are inodes that have extents that lie behind its i_size. Don't
1137 * accept clones from these extents. 1174 * accept clones from these extents.
1138 */ 1175 */
1139 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, 1176 ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
1140 NULL); 1177 NULL, NULL, NULL);
1178 btrfs_release_path(bctx->path);
1141 if (ret < 0) 1179 if (ret < 0)
1142 return ret; 1180 return ret;
1143 1181
@@ -1216,12 +1254,17 @@ static int find_extent_clone(struct send_ctx *sctx,
1216 if (!tmp_path) 1254 if (!tmp_path)
1217 return -ENOMEM; 1255 return -ENOMEM;
1218 1256
1257 /* We only use this path under the commit sem */
1258 tmp_path->need_commit_sem = 0;
1259
1219 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); 1260 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
1220 if (!backref_ctx) { 1261 if (!backref_ctx) {
1221 ret = -ENOMEM; 1262 ret = -ENOMEM;
1222 goto out; 1263 goto out;
1223 } 1264 }
1224 1265
1266 backref_ctx->path = tmp_path;
1267
1225 if (data_offset >= ino_size) { 1268 if (data_offset >= ino_size) {
1226 /* 1269 /*
1227 * There may be extents that lie behind the file's size. 1270 * There may be extents that lie behind the file's size.
@@ -1249,8 +1292,10 @@ static int find_extent_clone(struct send_ctx *sctx,
1249 } 1292 }
1250 logical = disk_byte + btrfs_file_extent_offset(eb, fi); 1293 logical = disk_byte + btrfs_file_extent_offset(eb, fi);
1251 1294
1295 down_read(&sctx->send_root->fs_info->commit_root_sem);
1252 ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, 1296 ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
1253 &found_key, &flags); 1297 &found_key, &flags);
1298 up_read(&sctx->send_root->fs_info->commit_root_sem);
1254 btrfs_release_path(tmp_path); 1299 btrfs_release_path(tmp_path);
1255 1300
1256 if (ret < 0) 1301 if (ret < 0)
@@ -1292,8 +1337,6 @@ static int find_extent_clone(struct send_ctx *sctx,
1292 extent_item_pos = logical - found_key.objectid; 1337 extent_item_pos = logical - found_key.objectid;
1293 else 1338 else
1294 extent_item_pos = 0; 1339 extent_item_pos = 0;
1295
1296 extent_item_pos = logical - found_key.objectid;
1297 ret = iterate_extent_inodes(sctx->send_root->fs_info, 1340 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1298 found_key.objectid, extent_item_pos, 1, 1341 found_key.objectid, extent_item_pos, 1,
1299 __iterate_backrefs, backref_ctx); 1342 __iterate_backrefs, backref_ctx);
@@ -1418,11 +1461,7 @@ static int gen_unique_name(struct send_ctx *sctx,
1418 while (1) { 1461 while (1) {
1419 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 1462 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1420 ino, gen, idx); 1463 ino, gen, idx);
1421 if (len >= sizeof(tmp)) { 1464 ASSERT(len < sizeof(tmp));
1422 /* should really not happen */
1423 ret = -EOVERFLOW;
1424 goto out;
1425 }
1426 1465
1427 di = btrfs_lookup_dir_item(NULL, sctx->send_root, 1466 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1428 path, BTRFS_FIRST_FREE_OBJECTID, 1467 path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1632,7 +1671,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
1632 goto out; 1671 goto out;
1633 } 1672 }
1634 1673
1635 if (key.type == BTRFS_INODE_REF_KEY) { 1674 if (found_key.type == BTRFS_INODE_REF_KEY) {
1636 struct btrfs_inode_ref *iref; 1675 struct btrfs_inode_ref *iref;
1637 iref = btrfs_item_ptr(path->nodes[0], path->slots[0], 1676 iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1638 struct btrfs_inode_ref); 1677 struct btrfs_inode_ref);
@@ -1898,13 +1937,20 @@ static void name_cache_delete(struct send_ctx *sctx,
1898 1937
1899 nce_head = radix_tree_lookup(&sctx->name_cache, 1938 nce_head = radix_tree_lookup(&sctx->name_cache,
1900 (unsigned long)nce->ino); 1939 (unsigned long)nce->ino);
1901 BUG_ON(!nce_head); 1940 if (!nce_head) {
1941 btrfs_err(sctx->send_root->fs_info,
1942 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
1943 nce->ino, sctx->name_cache_size);
1944 }
1902 1945
1903 list_del(&nce->radix_list); 1946 list_del(&nce->radix_list);
1904 list_del(&nce->list); 1947 list_del(&nce->list);
1905 sctx->name_cache_size--; 1948 sctx->name_cache_size--;
1906 1949
1907 if (list_empty(nce_head)) { 1950 /*
1951 * We may not get to the final release of nce_head if the lookup fails
1952 */
1953 if (nce_head && list_empty(nce_head)) {
1908 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); 1954 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
1909 kfree(nce_head); 1955 kfree(nce_head);
1910 } 1956 }
@@ -1977,7 +2023,6 @@ static void name_cache_free(struct send_ctx *sctx)
1977 */ 2023 */
1978static int __get_cur_name_and_parent(struct send_ctx *sctx, 2024static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 u64 ino, u64 gen, 2025 u64 ino, u64 gen,
1980 int skip_name_cache,
1981 u64 *parent_ino, 2026 u64 *parent_ino,
1982 u64 *parent_gen, 2027 u64 *parent_gen,
1983 struct fs_path *dest) 2028 struct fs_path *dest)
@@ -1987,8 +2032,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1987 struct btrfs_path *path = NULL; 2032 struct btrfs_path *path = NULL;
1988 struct name_cache_entry *nce = NULL; 2033 struct name_cache_entry *nce = NULL;
1989 2034
1990 if (skip_name_cache)
1991 goto get_ref;
1992 /* 2035 /*
1993 * First check if we already did a call to this function with the same 2036 * First check if we already did a call to this function with the same
1994 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 2037 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2033,12 +2076,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2033 goto out_cache; 2076 goto out_cache;
2034 } 2077 }
2035 2078
2036get_ref:
2037 /* 2079 /*
2038 * Depending on whether the inode was already processed or not, use 2080 * Depending on whether the inode was already processed or not, use
2039 * send_root or parent_root for ref lookup. 2081 * send_root or parent_root for ref lookup.
2040 */ 2082 */
2041 if (ino < sctx->send_progress && !skip_name_cache) 2083 if (ino < sctx->send_progress)
2042 ret = get_first_ref(sctx->send_root, ino, 2084 ret = get_first_ref(sctx->send_root, ino,
2043 parent_ino, parent_gen, dest); 2085 parent_ino, parent_gen, dest);
2044 else 2086 else
@@ -2062,8 +2104,6 @@ get_ref:
2062 goto out; 2104 goto out;
2063 ret = 1; 2105 ret = 1;
2064 } 2106 }
2065 if (skip_name_cache)
2066 goto out;
2067 2107
2068out_cache: 2108out_cache:
2069 /* 2109 /*
@@ -2131,9 +2171,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2131 u64 parent_inode = 0; 2171 u64 parent_inode = 0;
2132 u64 parent_gen = 0; 2172 u64 parent_gen = 0;
2133 int stop = 0; 2173 int stop = 0;
2134 u64 start_ino = ino;
2135 u64 start_gen = gen;
2136 int skip_name_cache = 0;
2137 2174
2138 name = fs_path_alloc(); 2175 name = fs_path_alloc();
2139 if (!name) { 2176 if (!name) {
@@ -2141,31 +2178,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2141 goto out; 2178 goto out;
2142 } 2179 }
2143 2180
2144 if (is_waiting_for_move(sctx, ino))
2145 skip_name_cache = 1;
2146
2147again:
2148 dest->reversed = 1; 2181 dest->reversed = 1;
2149 fs_path_reset(dest); 2182 fs_path_reset(dest);
2150 2183
2151 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2184 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2152 fs_path_reset(name); 2185 fs_path_reset(name);
2153 2186
2154 ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, 2187 if (is_waiting_for_rm(sctx, ino)) {
2155 &parent_inode, &parent_gen, name); 2188 ret = gen_unique_name(sctx, ino, gen, name);
2189 if (ret < 0)
2190 goto out;
2191 ret = fs_path_add_path(dest, name);
2192 break;
2193 }
2194
2195 if (is_waiting_for_move(sctx, ino)) {
2196 ret = get_first_ref(sctx->parent_root, ino,
2197 &parent_inode, &parent_gen, name);
2198 } else {
2199 ret = __get_cur_name_and_parent(sctx, ino, gen,
2200 &parent_inode,
2201 &parent_gen, name);
2202 if (ret)
2203 stop = 1;
2204 }
2205
2156 if (ret < 0) 2206 if (ret < 0)
2157 goto out; 2207 goto out;
2158 if (ret)
2159 stop = 1;
2160
2161 if (!skip_name_cache &&
2162 is_waiting_for_move(sctx, parent_inode)) {
2163 ino = start_ino;
2164 gen = start_gen;
2165 stop = 0;
2166 skip_name_cache = 1;
2167 goto again;
2168 }
2169 2208
2170 ret = fs_path_add_path(dest, name); 2209 ret = fs_path_add_path(dest, name);
2171 if (ret < 0) 2210 if (ret < 0)
@@ -2429,10 +2468,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2429 if (!p) 2468 if (!p)
2430 return -ENOMEM; 2469 return -ENOMEM;
2431 2470
2432 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, 2471 if (ino != sctx->cur_ino) {
2433 NULL, &rdev); 2472 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
2434 if (ret < 0) 2473 NULL, NULL, &rdev);
2435 goto out; 2474 if (ret < 0)
2475 goto out;
2476 } else {
2477 gen = sctx->cur_inode_gen;
2478 mode = sctx->cur_inode_mode;
2479 rdev = sctx->cur_inode_rdev;
2480 }
2436 2481
2437 if (S_ISREG(mode)) { 2482 if (S_ISREG(mode)) {
2438 cmd = BTRFS_SEND_C_MKFILE; 2483 cmd = BTRFS_SEND_C_MKFILE;
@@ -2512,17 +2557,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2512 key.objectid = dir; 2557 key.objectid = dir;
2513 key.type = BTRFS_DIR_INDEX_KEY; 2558 key.type = BTRFS_DIR_INDEX_KEY;
2514 key.offset = 0; 2559 key.offset = 0;
2560 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2561 if (ret < 0)
2562 goto out;
2563
2515 while (1) { 2564 while (1) {
2516 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, 2565 eb = path->nodes[0];
2517 1, 0); 2566 slot = path->slots[0];
2518 if (ret < 0) 2567 if (slot >= btrfs_header_nritems(eb)) {
2519 goto out; 2568 ret = btrfs_next_leaf(sctx->send_root, path);
2520 if (!ret) { 2569 if (ret < 0) {
2521 eb = path->nodes[0]; 2570 goto out;
2522 slot = path->slots[0]; 2571 } else if (ret > 0) {
2523 btrfs_item_key_to_cpu(eb, &found_key, slot); 2572 ret = 0;
2573 break;
2574 }
2575 continue;
2524 } 2576 }
2525 if (ret || found_key.objectid != key.objectid || 2577
2578 btrfs_item_key_to_cpu(eb, &found_key, slot);
2579 if (found_key.objectid != key.objectid ||
2526 found_key.type != key.type) { 2580 found_key.type != key.type) {
2527 ret = 0; 2581 ret = 0;
2528 goto out; 2582 goto out;
@@ -2537,8 +2591,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2537 goto out; 2591 goto out;
2538 } 2592 }
2539 2593
2540 key.offset = found_key.offset + 1; 2594 path->slots[0]++;
2541 btrfs_release_path(path);
2542 } 2595 }
2543 2596
2544out: 2597out:
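
did_create_dir() above (and can_rmdir() further down) now issue a single btrfs_search_slot() and then walk items slot by slot, calling btrfs_next_leaf() only at leaf boundaries, instead of re-descending the tree with btrfs_search_slot_for_read() for every dir item. The idiom, condensed from the hunk (an in-tree sketch assuming root, path, key, and found_key are set up as in the function; not compilable outside fs/btrfs):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		struct extent_buffer *eb = path->nodes[0];
		int slot = path->slots[0];

		if (slot >= btrfs_header_nritems(eb)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;	/* error */
			else if (ret > 0)
				break;		/* no more leaves */
			continue;		/* re-read eb/slot */
		}

		btrfs_item_key_to_cpu(eb, &found_key, slot);
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type)
			break;			/* walked past the key range */

		/* ... examine the item at (eb, slot) ... */

		path->slots[0]++;		/* next item in the same leaf */
	}
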
@@ -2590,7 +2643,7 @@ struct recorded_ref {
2590 * everything mixed. So we first record all refs and later process them. 2643 * everything mixed. So we first record all refs and later process them.
2591 * This function is a helper to record one ref. 2644 * This function is a helper to record one ref.
2592 */ 2645 */
2593static int record_ref(struct list_head *head, u64 dir, 2646static int __record_ref(struct list_head *head, u64 dir,
2594 u64 dir_gen, struct fs_path *path) 2647 u64 dir_gen, struct fs_path *path)
2595{ 2648{
2596 struct recorded_ref *ref; 2649 struct recorded_ref *ref;
@@ -2676,12 +2729,78 @@ out:
2676 return ret; 2729 return ret;
2677} 2730}
2678 2731
2732static struct orphan_dir_info *
2733add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2734{
2735 struct rb_node **p = &sctx->orphan_dirs.rb_node;
2736 struct rb_node *parent = NULL;
2737 struct orphan_dir_info *entry, *odi;
2738
2739 odi = kmalloc(sizeof(*odi), GFP_NOFS);
2740 if (!odi)
2741 return ERR_PTR(-ENOMEM);
2742 odi->ino = dir_ino;
2743 odi->gen = 0;
2744
2745 while (*p) {
2746 parent = *p;
2747 entry = rb_entry(parent, struct orphan_dir_info, node);
2748 if (dir_ino < entry->ino) {
2749 p = &(*p)->rb_left;
2750 } else if (dir_ino > entry->ino) {
2751 p = &(*p)->rb_right;
2752 } else {
2753 kfree(odi);
2754 return entry;
2755 }
2756 }
2757
2758 rb_link_node(&odi->node, parent, p);
2759 rb_insert_color(&odi->node, &sctx->orphan_dirs);
2760 return odi;
2761}
2762
2763static struct orphan_dir_info *
2764get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2765{
2766 struct rb_node *n = sctx->orphan_dirs.rb_node;
2767 struct orphan_dir_info *entry;
2768
2769 while (n) {
2770 entry = rb_entry(n, struct orphan_dir_info, node);
2771 if (dir_ino < entry->ino)
2772 n = n->rb_left;
2773 else if (dir_ino > entry->ino)
2774 n = n->rb_right;
2775 else
2776 return entry;
2777 }
2778 return NULL;
2779}
2780
2781static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
2782{
2783 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
2784
2785 return odi != NULL;
2786}
2787
2788static void free_orphan_dir_info(struct send_ctx *sctx,
2789 struct orphan_dir_info *odi)
2790{
2791 if (!odi)
2792 return;
2793 rb_erase(&odi->node, &sctx->orphan_dirs);
2794 kfree(odi);
2795}
2796
2679/* 2797/*
2680 * Returns 1 if a directory can be removed at this point in time. 2798 * Returns 1 if a directory can be removed at this point in time.
2681 * We check this by iterating all dir items and checking if the inode behind 2799 * We check this by iterating all dir items and checking if the inode behind
2682 * the dir item was already processed. 2800 * the dir item was already processed.
2683 */ 2801 */
2684static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) 2802static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2803 u64 send_progress)
2685{ 2804{
2686 int ret = 0; 2805 int ret = 0;
2687 struct btrfs_root *root = sctx->parent_root; 2806 struct btrfs_root *root = sctx->parent_root;
@@ -2704,31 +2823,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2704 key.objectid = dir; 2823 key.objectid = dir;
2705 key.type = BTRFS_DIR_INDEX_KEY; 2824 key.type = BTRFS_DIR_INDEX_KEY;
2706 key.offset = 0; 2825 key.offset = 0;
2826 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2827 if (ret < 0)
2828 goto out;
2707 2829
2708 while (1) { 2830 while (1) {
2709 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 2831 struct waiting_dir_move *dm;
2710 if (ret < 0) 2832
2711 goto out; 2833 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2712 if (!ret) { 2834 ret = btrfs_next_leaf(root, path);
2713 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2835 if (ret < 0)
2714 path->slots[0]); 2836 goto out;
2837 else if (ret > 0)
2838 break;
2839 continue;
2715 } 2840 }
2716 if (ret || found_key.objectid != key.objectid || 2841 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2717 found_key.type != key.type) { 2842 path->slots[0]);
2843 if (found_key.objectid != key.objectid ||
2844 found_key.type != key.type)
2718 break; 2845 break;
2719 }
2720 2846
2721 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 2847 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2722 struct btrfs_dir_item); 2848 struct btrfs_dir_item);
2723 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 2849 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2724 2850
2851 dm = get_waiting_dir_move(sctx, loc.objectid);
2852 if (dm) {
2853 struct orphan_dir_info *odi;
2854
2855 odi = add_orphan_dir_info(sctx, dir);
2856 if (IS_ERR(odi)) {
2857 ret = PTR_ERR(odi);
2858 goto out;
2859 }
2860 odi->gen = dir_gen;
2861 dm->rmdir_ino = dir;
2862 ret = 0;
2863 goto out;
2864 }
2865
2725 if (loc.objectid > send_progress) { 2866 if (loc.objectid > send_progress) {
2726 ret = 0; 2867 ret = 0;
2727 goto out; 2868 goto out;
2728 } 2869 }
2729 2870
2730 btrfs_release_path(path); 2871 path->slots[0]++;
2731 key.offset = found_key.offset + 1;
2732 } 2872 }
2733 2873
2734 ret = 1; 2874 ret = 1;
@@ -2740,19 +2880,9 @@ out:
2740 2880
2741static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) 2881static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
2742{ 2882{
2743 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2883 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
2744 struct waiting_dir_move *entry;
2745 2884
2746 while (n) { 2885 return entry != NULL;
2747 entry = rb_entry(n, struct waiting_dir_move, node);
2748 if (ino < entry->ino)
2749 n = n->rb_left;
2750 else if (ino > entry->ino)
2751 n = n->rb_right;
2752 else
2753 return 1;
2754 }
2755 return 0;
2756} 2886}
2757 2887
2758static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2888static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2765,6 +2895,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2765 if (!dm) 2895 if (!dm)
2766 return -ENOMEM; 2896 return -ENOMEM;
2767 dm->ino = ino; 2897 dm->ino = ino;
2898 dm->rmdir_ino = 0;
2768 2899
2769 while (*p) { 2900 while (*p) {
2770 parent = *p; 2901 parent = *p;
@@ -2784,31 +2915,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2784 return 0; 2915 return 0;
2785} 2916}
2786 2917
2787static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2918static struct waiting_dir_move *
2919get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2788{ 2920{
2789 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2921 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2790 struct waiting_dir_move *entry; 2922 struct waiting_dir_move *entry;
2791 2923
2792 while (n) { 2924 while (n) {
2793 entry = rb_entry(n, struct waiting_dir_move, node); 2925 entry = rb_entry(n, struct waiting_dir_move, node);
2794 if (ino < entry->ino) { 2926 if (ino < entry->ino)
2795 n = n->rb_left; 2927 n = n->rb_left;
2796 } else if (ino > entry->ino) { 2928 else if (ino > entry->ino)
2797 n = n->rb_right; 2929 n = n->rb_right;
2798 } else { 2930 else
2799 rb_erase(&entry->node, &sctx->waiting_dir_moves); 2931 return entry;
2800 kfree(entry);
2801 return 0;
2802 }
2803 } 2932 }
2804 return -ENOENT; 2933 return NULL;
2934}
2935
2936static void free_waiting_dir_move(struct send_ctx *sctx,
2937 struct waiting_dir_move *dm)
2938{
2939 if (!dm)
2940 return;
2941 rb_erase(&dm->node, &sctx->waiting_dir_moves);
2942 kfree(dm);
2805} 2943}
2806 2944
2807static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) 2945static int add_pending_dir_move(struct send_ctx *sctx,
2946 u64 ino,
2947 u64 ino_gen,
2948 u64 parent_ino)
2808{ 2949{
2809 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 2950 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2810 struct rb_node *parent = NULL; 2951 struct rb_node *parent = NULL;
2811 struct pending_dir_move *entry, *pm; 2952 struct pending_dir_move *entry = NULL, *pm;
2812 struct recorded_ref *cur; 2953 struct recorded_ref *cur;
2813 int exists = 0; 2954 int exists = 0;
2814 int ret; 2955 int ret;
@@ -2817,8 +2958,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
2817 if (!pm) 2958 if (!pm)
2818 return -ENOMEM; 2959 return -ENOMEM;
2819 pm->parent_ino = parent_ino; 2960 pm->parent_ino = parent_ino;
2820 pm->ino = sctx->cur_ino; 2961 pm->ino = ino;
2821 pm->gen = sctx->cur_inode_gen; 2962 pm->gen = ino_gen;
2822 INIT_LIST_HEAD(&pm->list); 2963 INIT_LIST_HEAD(&pm->list);
2823 INIT_LIST_HEAD(&pm->update_refs); 2964 INIT_LIST_HEAD(&pm->update_refs);
2824 RB_CLEAR_NODE(&pm->node); 2965 RB_CLEAR_NODE(&pm->node);
@@ -2888,19 +3029,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2888{ 3029{
2889 struct fs_path *from_path = NULL; 3030 struct fs_path *from_path = NULL;
2890 struct fs_path *to_path = NULL; 3031 struct fs_path *to_path = NULL;
3032 struct fs_path *name = NULL;
2891 u64 orig_progress = sctx->send_progress; 3033 u64 orig_progress = sctx->send_progress;
2892 struct recorded_ref *cur; 3034 struct recorded_ref *cur;
3035 u64 parent_ino, parent_gen;
3036 struct waiting_dir_move *dm = NULL;
3037 u64 rmdir_ino = 0;
2893 int ret; 3038 int ret;
2894 3039
3040 name = fs_path_alloc();
2895 from_path = fs_path_alloc(); 3041 from_path = fs_path_alloc();
2896 if (!from_path) 3042 if (!name || !from_path) {
2897 return -ENOMEM; 3043 ret = -ENOMEM;
3044 goto out;
3045 }
2898 3046
2899 sctx->send_progress = pm->ino; 3047 dm = get_waiting_dir_move(sctx, pm->ino);
2900 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); 3048 ASSERT(dm);
3049 rmdir_ino = dm->rmdir_ino;
3050 free_waiting_dir_move(sctx, dm);
3051
3052 ret = get_first_ref(sctx->parent_root, pm->ino,
3053 &parent_ino, &parent_gen, name);
2901 if (ret < 0) 3054 if (ret < 0)
2902 goto out; 3055 goto out;
2903 3056
3057 if (parent_ino == sctx->cur_ino) {
3058 /* child only renamed, not moved */
3059 ASSERT(parent_gen == sctx->cur_inode_gen);
3060 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3061 from_path);
3062 if (ret < 0)
3063 goto out;
3064 ret = fs_path_add_path(from_path, name);
3065 if (ret < 0)
3066 goto out;
3067 } else {
3068 /* child moved and maybe renamed too */
3069 sctx->send_progress = pm->ino;
3070 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
3071 if (ret < 0)
3072 goto out;
3073 }
3074
3075 fs_path_free(name);
3076 name = NULL;
3077
2904 to_path = fs_path_alloc(); 3078 to_path = fs_path_alloc();
2905 if (!to_path) { 3079 if (!to_path) {
2906 ret = -ENOMEM; 3080 ret = -ENOMEM;
@@ -2908,9 +3082,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2908 } 3082 }
2909 3083
2910 sctx->send_progress = sctx->cur_ino + 1; 3084 sctx->send_progress = sctx->cur_ino + 1;
2911 ret = del_waiting_dir_move(sctx, pm->ino);
2912 ASSERT(ret == 0);
2913
2914 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3085 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
2915 if (ret < 0) 3086 if (ret < 0)
2916 goto out; 3087 goto out;
@@ -2919,6 +3090,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2919 if (ret < 0) 3090 if (ret < 0)
2920 goto out; 3091 goto out;
2921 3092
3093 if (rmdir_ino) {
3094 struct orphan_dir_info *odi;
3095
3096 odi = get_orphan_dir_info(sctx, rmdir_ino);
3097 if (!odi) {
3098 /* already deleted */
3099 goto finish;
3100 }
3101 ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
3102 if (ret < 0)
3103 goto out;
3104 if (!ret)
3105 goto finish;
3106
3107 name = fs_path_alloc();
3108 if (!name) {
3109 ret = -ENOMEM;
3110 goto out;
3111 }
3112 ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
3113 if (ret < 0)
3114 goto out;
3115 ret = send_rmdir(sctx, name);
3116 if (ret < 0)
3117 goto out;
3118 free_orphan_dir_info(sctx, odi);
3119 }
3120
3121finish:
2922 ret = send_utimes(sctx, pm->ino, pm->gen); 3122 ret = send_utimes(sctx, pm->ino, pm->gen);
2923 if (ret < 0) 3123 if (ret < 0)
2924 goto out; 3124 goto out;
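The rmdir_ino block above captures why deleting an old parent directory has to trail the move that empties it. The ordering constraint itself is plain POSIX, as this small runnable sketch shows (paths are illustrative): rmdir() of a non-empty directory fails with ENOTEMPTY and only succeeds once the last child has been renamed away, which is exactly the situation send resolves by recording rmdir_ino and retrying after the pending move.

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	mkdir("old_parent", 0755);
	mkdir("new_parent", 0755);
	close(open("old_parent/child", O_CREAT | O_WRONLY, 0644));

	if (rmdir("old_parent") != 0)
		perror("rmdir before move");	/* expected: ENOTEMPTY */

	rename("old_parent/child", "new_parent/child");

	if (rmdir("old_parent") != 0)		/* now empty: succeeds */
		perror("rmdir after move");
	return 0;
}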
@@ -2928,12 +3128,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2928 * and old parent(s). 3128 * and old parent(s).
2929 */ 3129 */
2930 list_for_each_entry(cur, &pm->update_refs, list) { 3130 list_for_each_entry(cur, &pm->update_refs, list) {
3131 if (cur->dir == rmdir_ino)
3132 continue;
2931 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3133 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
2932 if (ret < 0) 3134 if (ret < 0)
2933 goto out; 3135 goto out;
2934 } 3136 }
2935 3137
2936out: 3138out:
3139 fs_path_free(name);
2937 fs_path_free(from_path); 3140 fs_path_free(from_path);
2938 fs_path_free(to_path); 3141 fs_path_free(to_path);
2939 sctx->send_progress = orig_progress; 3142 sctx->send_progress = orig_progress;
@@ -3005,17 +3208,19 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3005 int ret; 3208 int ret;
3006 u64 ino = parent_ref->dir; 3209 u64 ino = parent_ref->dir;
3007 u64 parent_ino_before, parent_ino_after; 3210 u64 parent_ino_before, parent_ino_after;
3008 u64 new_gen, old_gen; 3211 u64 old_gen;
3009 struct fs_path *path_before = NULL; 3212 struct fs_path *path_before = NULL;
3010 struct fs_path *path_after = NULL; 3213 struct fs_path *path_after = NULL;
3011 int len1, len2; 3214 int len1, len2;
3012 3215 int register_upper_dirs;
3013 if (parent_ref->dir <= sctx->cur_ino) 3216 u64 gen;
3014 return 0;
3015 3217
3016 if (is_waiting_for_move(sctx, ino)) 3218 if (is_waiting_for_move(sctx, ino))
3017 return 1; 3219 return 1;
3018 3220
3221 if (parent_ref->dir <= sctx->cur_ino)
3222 return 0;
3223
3019 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, 3224 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3020 NULL, NULL, NULL, NULL); 3225 NULL, NULL, NULL, NULL);
3021 if (ret == -ENOENT) 3226 if (ret == -ENOENT)
@@ -3023,12 +3228,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3023 else if (ret < 0) 3228 else if (ret < 0)
3024 return ret; 3229 return ret;
3025 3230
3026 ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, 3231 if (parent_ref->dir_gen != old_gen)
3027 NULL, NULL, NULL, NULL);
3028 if (ret < 0)
3029 return ret;
3030
3031 if (new_gen != old_gen)
3032 return 0; 3232 return 0;
3033 3233
3034 path_before = fs_path_alloc(); 3234 path_before = fs_path_alloc();
@@ -3051,7 +3251,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3051 } 3251 }
3052 3252
3053 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3253 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3054 NULL, path_after); 3254 &gen, path_after);
3055 if (ret == -ENOENT) { 3255 if (ret == -ENOENT) {
3056 ret = 0; 3256 ret = 0;
3057 goto out; 3257 goto out;
@@ -3061,13 +3261,67 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3061 3261
3062 len1 = fs_path_len(path_before); 3262 len1 = fs_path_len(path_before);
3063 len2 = fs_path_len(path_after); 3263 len2 = fs_path_len(path_after);
3064 if ((parent_ino_before != parent_ino_after) && (len1 != len2 || 3264 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3065 memcmp(path_before->start, path_after->start, len1))) { 3265 memcmp(path_before->start, path_after->start, len1)) {
3066 ret = 1; 3266 ret = 1;
3067 goto out; 3267 goto out;
3068 } 3268 }
3069 ret = 0; 3269 ret = 0;
3070 3270
3271 /*
3272 * Ok, our new most direct ancestor has a higher inode number but
3273 * wasn't moved/renamed. So maybe some of the new ancestors higher in
 3274 * the hierarchy have a higher inode number too *and* were renamed
3275 * or moved - in this case we need to wait for the ancestor's rename
3276 * or move operation before we can do the move/rename for the current
3277 * inode.
3278 */
3279 register_upper_dirs = 0;
3280 ino = parent_ino_after;
3281again:
3282 while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
3283 u64 parent_gen;
3284
3285 fs_path_reset(path_before);
3286 fs_path_reset(path_after);
3287
3288 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3289 &parent_gen, path_after);
3290 if (ret < 0)
3291 goto out;
3292 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3293 NULL, path_before);
3294 if (ret == -ENOENT) {
3295 ret = 0;
3296 break;
3297 } else if (ret < 0) {
3298 goto out;
3299 }
3300
3301 len1 = fs_path_len(path_before);
3302 len2 = fs_path_len(path_after);
3303 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3304 memcmp(path_before->start, path_after->start, len1)) {
3305 ret = 1;
3306 if (register_upper_dirs) {
3307 break;
3308 } else {
3309 register_upper_dirs = 1;
3310 ino = parent_ref->dir;
3311 gen = parent_ref->dir_gen;
3312 goto again;
3313 }
3314 } else if (register_upper_dirs) {
3315 ret = add_pending_dir_move(sctx, ino, gen,
3316 parent_ino_after);
3317 if (ret < 0 && ret != -EEXIST)
3318 goto out;
3319 }
3320
3321 ino = parent_ino_after;
3322 gen = parent_gen;
3323 }
3324
3071out: 3325out:
3072 fs_path_free(path_before); 3326 fs_path_free(path_before);
3073 fs_path_free(path_after); 3327 fs_path_free(path_after);
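The loop above re-resolves the first ref of each ancestor in both trees and compares parent inode and name; a mismatch on an ancestor with a higher inode number means the current inode's rename must wait. Stripped of path handling and error cases, the walk reduces to something like this sketch (arrays indexed by inode number stand in for get_first_ref() on the two roots; all names are illustrative):

#include <stdint.h>
#include <string.h>

struct ref {
	uint64_t parent;	/* first ref's parent directory inode */
	const char *name;	/* first ref's name */
};

/* Walk up the new-tree ancestry while inode numbers stay above the
 * send progress point, comparing each ancestor's first ref between
 * the parent snapshot (before) and the send snapshot (after). */
static int ancestor_moved(const struct ref *before, const struct ref *after,
			  uint64_t ino, uint64_t cur_ino)
{
	while (ino > cur_ino) {
		if (after[ino].parent != before[ino].parent ||
		    strcmp(after[ino].name, before[ino].name) != 0)
			return 1;	/* this ancestor was moved/renamed */
		ino = after[ino].parent;
	}
	return 0;
}

The real function additionally treats ENOENT as "no conflict", compares path lengths before memcmp, and on a second pass registers pending dir moves for the whole chain, as the register_upper_dirs logic above shows.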
@@ -3089,6 +3343,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3089 u64 ow_gen; 3343 u64 ow_gen;
3090 int did_overwrite = 0; 3344 int did_overwrite = 0;
3091 int is_orphan = 0; 3345 int is_orphan = 0;
3346 u64 last_dir_ino_rm = 0;
3092 3347
3093verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 3348verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3094 3349
@@ -3227,9 +3482,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3227 * dirs, we always have one new and one deleted 3482 * dirs, we always have one new and one deleted
3228 * ref. The deleted ref is ignored later. 3483 * ref. The deleted ref is ignored later.
3229 */ 3484 */
3230 if (wait_for_parent_move(sctx, cur)) { 3485 ret = wait_for_parent_move(sctx, cur);
3486 if (ret < 0)
3487 goto out;
3488 if (ret) {
3231 ret = add_pending_dir_move(sctx, 3489 ret = add_pending_dir_move(sctx,
3232 cur->dir); 3490 sctx->cur_ino,
3491 sctx->cur_inode_gen,
3492 cur->dir);
3233 *pending_move = 1; 3493 *pending_move = 1;
3234 } else { 3494 } else {
3235 ret = send_rename(sctx, valid_path, 3495 ret = send_rename(sctx, valid_path,
@@ -3259,7 +3519,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3259 * later, we do this check again and rmdir it then if possible. 3519 * later, we do this check again and rmdir it then if possible.
3260 * See the use of check_dirs for more details. 3520 * See the use of check_dirs for more details.
3261 */ 3521 */
3262 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); 3522 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3523 sctx->cur_ino);
3263 if (ret < 0) 3524 if (ret < 0)
3264 goto out; 3525 goto out;
3265 if (ret) { 3526 if (ret) {
@@ -3350,8 +3611,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3350 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3611 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3351 if (ret < 0) 3612 if (ret < 0)
3352 goto out; 3613 goto out;
3353 } else if (ret == inode_state_did_delete) { 3614 } else if (ret == inode_state_did_delete &&
3354 ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); 3615 cur->dir != last_dir_ino_rm) {
3616 ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
3617 sctx->cur_ino);
3355 if (ret < 0) 3618 if (ret < 0)
3356 goto out; 3619 goto out;
3357 if (ret) { 3620 if (ret) {
@@ -3362,6 +3625,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3362 ret = send_rmdir(sctx, valid_path); 3625 ret = send_rmdir(sctx, valid_path);
3363 if (ret < 0) 3626 if (ret < 0)
3364 goto out; 3627 goto out;
3628 last_dir_ino_rm = cur->dir;
3365 } 3629 }
3366 } 3630 }
3367 } 3631 }
@@ -3375,9 +3639,8 @@ out:
3375 return ret; 3639 return ret;
3376} 3640}
3377 3641
3378static int __record_new_ref(int num, u64 dir, int index, 3642static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
3379 struct fs_path *name, 3643 struct fs_path *name, void *ctx, struct list_head *refs)
3380 void *ctx)
3381{ 3644{
3382 int ret = 0; 3645 int ret = 0;
3383 struct send_ctx *sctx = ctx; 3646 struct send_ctx *sctx = ctx;
@@ -3388,7 +3651,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3388 if (!p) 3651 if (!p)
3389 return -ENOMEM; 3652 return -ENOMEM;
3390 3653
3391 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3654 ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
3392 NULL, NULL); 3655 NULL, NULL);
3393 if (ret < 0) 3656 if (ret < 0)
3394 goto out; 3657 goto out;
@@ -3400,7 +3663,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3400 if (ret < 0) 3663 if (ret < 0)
3401 goto out; 3664 goto out;
3402 3665
3403 ret = record_ref(&sctx->new_refs, dir, gen, p); 3666 ret = __record_ref(refs, dir, gen, p);
3404 3667
3405out: 3668out:
3406 if (ret) 3669 if (ret)
@@ -3408,37 +3671,23 @@ out:
3408 return ret; 3671 return ret;
3409} 3672}
3410 3673
3674static int __record_new_ref(int num, u64 dir, int index,
3675 struct fs_path *name,
3676 void *ctx)
3677{
3678 struct send_ctx *sctx = ctx;
3679 return record_ref(sctx->send_root, num, dir, index, name,
3680 ctx, &sctx->new_refs);
3681}
3682
3683
3411static int __record_deleted_ref(int num, u64 dir, int index, 3684static int __record_deleted_ref(int num, u64 dir, int index,
3412 struct fs_path *name, 3685 struct fs_path *name,
3413 void *ctx) 3686 void *ctx)
3414{ 3687{
3415 int ret = 0;
3416 struct send_ctx *sctx = ctx; 3688 struct send_ctx *sctx = ctx;
3417 struct fs_path *p; 3689 return record_ref(sctx->parent_root, num, dir, index, name,
3418 u64 gen; 3690 ctx, &sctx->deleted_refs);
3419
3420 p = fs_path_alloc();
3421 if (!p)
3422 return -ENOMEM;
3423
3424 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3425 NULL, NULL);
3426 if (ret < 0)
3427 goto out;
3428
3429 ret = get_cur_path(sctx, dir, gen, p);
3430 if (ret < 0)
3431 goto out;
3432 ret = fs_path_add_path(p, name);
3433 if (ret < 0)
3434 goto out;
3435
3436 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3437
3438out:
3439 if (ret)
3440 fs_path_free(p);
3441 return ret;
3442} 3691}
3443 3692
3444static int record_new_ref(struct send_ctx *sctx) 3693static int record_new_ref(struct send_ctx *sctx)
@@ -3619,21 +3868,31 @@ static int process_all_refs(struct send_ctx *sctx,
3619 root = sctx->parent_root; 3868 root = sctx->parent_root;
3620 cb = __record_deleted_ref; 3869 cb = __record_deleted_ref;
3621 } else { 3870 } else {
3622 BUG(); 3871 btrfs_err(sctx->send_root->fs_info,
3872 "Wrong command %d in process_all_refs", cmd);
3873 ret = -EINVAL;
3874 goto out;
3623 } 3875 }
3624 3876
3625 key.objectid = sctx->cmp_key->objectid; 3877 key.objectid = sctx->cmp_key->objectid;
3626 key.type = BTRFS_INODE_REF_KEY; 3878 key.type = BTRFS_INODE_REF_KEY;
3627 key.offset = 0; 3879 key.offset = 0;
3628 while (1) { 3880 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3629 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3881 if (ret < 0)
3630 if (ret < 0) 3882 goto out;
3631 goto out;
3632 if (ret)
3633 break;
3634 3883
3884 while (1) {
3635 eb = path->nodes[0]; 3885 eb = path->nodes[0];
3636 slot = path->slots[0]; 3886 slot = path->slots[0];
3887 if (slot >= btrfs_header_nritems(eb)) {
3888 ret = btrfs_next_leaf(root, path);
3889 if (ret < 0)
3890 goto out;
3891 else if (ret > 0)
3892 break;
3893 continue;
3894 }
3895
3637 btrfs_item_key_to_cpu(eb, &found_key, slot); 3896 btrfs_item_key_to_cpu(eb, &found_key, slot);
3638 3897
3639 if (found_key.objectid != key.objectid || 3898 if (found_key.objectid != key.objectid ||
@@ -3642,11 +3901,10 @@ static int process_all_refs(struct send_ctx *sctx,
3642 break; 3901 break;
3643 3902
3644 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 3903 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3645 btrfs_release_path(path);
3646 if (ret < 0) 3904 if (ret < 0)
3647 goto out; 3905 goto out;
3648 3906
3649 key.offset = found_key.offset + 1; 3907 path->slots[0]++;
3650 } 3908 }
3651 btrfs_release_path(path); 3909 btrfs_release_path(path);
3652 3910
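process_all_refs() (and process_all_new_xattrs() below) now positions a path once with btrfs_search_slot() and walks forward, bumping path->slots[0] and calling btrfs_next_leaf() only when the slot runs off the current leaf, instead of issuing a fresh tree search per item. The shape of that iteration as a runnable toy, where fixed arrays stand in for leaves:

#include <stdio.h>

#define ITEMS_PER_LEAF 4
#define NR_LEAVES 3

/* Toy "leaves": each row is a leaf, values are item keys. */
static int leaves[NR_LEAVES][ITEMS_PER_LEAF] = {
	{ 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 },
};

int main(void)
{
	int leaf = 0, slot = 0;	/* one initial "search" positions us here */

	while (1) {
		if (slot >= ITEMS_PER_LEAF) {	/* ran off the leaf */
			if (++leaf >= NR_LEAVES)
				break;		/* btrfs_next_leaf() > 0 */
			slot = 0;
			continue;
		}
		printf("item %d\n", leaves[leaf][slot]);
		slot++;				/* path->slots[0]++ */
	}
	return 0;
}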
@@ -3927,19 +4185,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3927 key.objectid = sctx->cmp_key->objectid; 4185 key.objectid = sctx->cmp_key->objectid;
3928 key.type = BTRFS_XATTR_ITEM_KEY; 4186 key.type = BTRFS_XATTR_ITEM_KEY;
3929 key.offset = 0; 4187 key.offset = 0;
3930 while (1) { 4188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3931 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 4189 if (ret < 0)
3932 if (ret < 0) 4190 goto out;
3933 goto out;
3934 if (ret) {
3935 ret = 0;
3936 goto out;
3937 }
3938 4191
4192 while (1) {
3939 eb = path->nodes[0]; 4193 eb = path->nodes[0];
3940 slot = path->slots[0]; 4194 slot = path->slots[0];
3941 btrfs_item_key_to_cpu(eb, &found_key, slot); 4195 if (slot >= btrfs_header_nritems(eb)) {
4196 ret = btrfs_next_leaf(root, path);
4197 if (ret < 0) {
4198 goto out;
4199 } else if (ret > 0) {
4200 ret = 0;
4201 break;
4202 }
4203 continue;
4204 }
3942 4205
4206 btrfs_item_key_to_cpu(eb, &found_key, slot);
3943 if (found_key.objectid != key.objectid || 4207 if (found_key.objectid != key.objectid ||
3944 found_key.type != key.type) { 4208 found_key.type != key.type) {
3945 ret = 0; 4209 ret = 0;
@@ -3951,8 +4215,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3951 if (ret < 0) 4215 if (ret < 0)
3952 goto out; 4216 goto out;
3953 4217
3954 btrfs_release_path(path); 4218 path->slots[0]++;
3955 key.offset = found_key.offset + 1;
3956 } 4219 }
3957 4220
3958out: 4221out:
@@ -3991,6 +4254,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
3991 goto out; 4254 goto out;
3992 4255
3993 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; 4256 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
4257
4258 /* initial readahead */
4259 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4260 file_ra_state_init(&sctx->ra, inode->i_mapping);
4261 btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
4262 last_index - index + 1);
4263
3994 while (index <= last_index) { 4264 while (index <= last_index) {
3995 unsigned cur_len = min_t(unsigned, len, 4265 unsigned cur_len = min_t(unsigned, len,
3996 PAGE_CACHE_SIZE - pg_offset); 4266 PAGE_CACHE_SIZE - pg_offset);
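fill_read_buf() now primes readahead over the whole [index, last_index] range before the page loop, so disk I/O overlaps with the copy instead of faulting one page at a time. The userspace counterpart of that hint is posix_fadvise(POSIX_FADV_WILLNEED) issued ahead of a sequential read, as in this sketch (file name and range are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;
	off_t offset = 0;
	off_t len = 1 << 20;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Ask the kernel to start readahead for the range we are
	 * about to copy, before the first read() touches it. */
	posix_fadvise(fd, offset, len, POSIX_FADV_WILLNEED);
	/* ... sequential read()s of [offset, offset + len) follow ... */
	close(fd);
	return 0;
}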
@@ -4174,6 +4444,9 @@ static int send_hole(struct send_ctx *sctx, u64 end)
4174 p = fs_path_alloc(); 4444 p = fs_path_alloc();
4175 if (!p) 4445 if (!p)
4176 return -ENOMEM; 4446 return -ENOMEM;
4447 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4448 if (ret < 0)
4449 goto tlv_put_failure;
4177 memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); 4450 memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
4178 while (offset < end) { 4451 while (offset < end) {
4179 len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); 4452 len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
@@ -4181,9 +4454,6 @@ static int send_hole(struct send_ctx *sctx, u64 end)
4181 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 4454 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
4182 if (ret < 0) 4455 if (ret < 0)
4183 break; 4456 break;
4184 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4185 if (ret < 0)
4186 break;
4187 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 4457 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
4188 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 4458 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
4189 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); 4459 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
@@ -4724,7 +4994,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4724 4994
4725 if (S_ISREG(sctx->cur_inode_mode)) { 4995 if (S_ISREG(sctx->cur_inode_mode)) {
4726 if (need_send_hole(sctx)) { 4996 if (need_send_hole(sctx)) {
4727 if (sctx->cur_inode_last_extent == (u64)-1) { 4997 if (sctx->cur_inode_last_extent == (u64)-1 ||
4998 sctx->cur_inode_last_extent <
4999 sctx->cur_inode_size) {
4728 ret = get_last_extent(sctx, (u64)-1); 5000 ret = get_last_extent(sctx, (u64)-1);
4729 if (ret) 5001 if (ret)
4730 goto out; 5002 goto out;
@@ -4763,18 +5035,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4763 ret = apply_children_dir_moves(sctx); 5035 ret = apply_children_dir_moves(sctx);
4764 if (ret) 5036 if (ret)
4765 goto out; 5037 goto out;
5038 /*
 5039 * We need to send the utimes every time, no matter if anything
 5040 * changed between the two trees, as we have made changes to
5041 * the inode before. If our inode is a directory and it's
5042 * waiting to be moved/renamed, we will send its utimes when
5043 * it's moved/renamed, therefore we don't need to do it here.
5044 */
5045 sctx->send_progress = sctx->cur_ino + 1;
5046 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
5047 if (ret < 0)
5048 goto out;
4766 } 5049 }
4767 5050
4768 /*
4769 * Need to send that every time, no matter if it actually
4770 * changed between the two trees as we have done changes to
4771 * the inode before.
4772 */
4773 sctx->send_progress = sctx->cur_ino + 1;
4774 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4775 if (ret < 0)
4776 goto out;
4777
4778out: 5051out:
4779 return ret; 5052 return ret;
4780} 5053}
@@ -4840,6 +5113,8 @@ static int changed_inode(struct send_ctx *sctx,
4840 sctx->left_path->nodes[0], left_ii); 5113 sctx->left_path->nodes[0], left_ii);
4841 sctx->cur_inode_mode = btrfs_inode_mode( 5114 sctx->cur_inode_mode = btrfs_inode_mode(
4842 sctx->left_path->nodes[0], left_ii); 5115 sctx->left_path->nodes[0], left_ii);
5116 sctx->cur_inode_rdev = btrfs_inode_rdev(
5117 sctx->left_path->nodes[0], left_ii);
4843 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 5118 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4844 ret = send_create_inode_if_needed(sctx); 5119 ret = send_create_inode_if_needed(sctx);
4845 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 5120 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4884,6 +5159,8 @@ static int changed_inode(struct send_ctx *sctx,
4884 sctx->left_path->nodes[0], left_ii); 5159 sctx->left_path->nodes[0], left_ii);
4885 sctx->cur_inode_mode = btrfs_inode_mode( 5160 sctx->cur_inode_mode = btrfs_inode_mode(
4886 sctx->left_path->nodes[0], left_ii); 5161 sctx->left_path->nodes[0], left_ii);
5162 sctx->cur_inode_rdev = btrfs_inode_rdev(
5163 sctx->left_path->nodes[0], left_ii);
4887 ret = send_create_inode_if_needed(sctx); 5164 ret = send_create_inode_if_needed(sctx);
4888 if (ret < 0) 5165 if (ret < 0)
4889 goto out; 5166 goto out;
@@ -5124,37 +5401,15 @@ static int full_send_tree(struct send_ctx *sctx)
5124 struct btrfs_path *path; 5401 struct btrfs_path *path;
5125 struct extent_buffer *eb; 5402 struct extent_buffer *eb;
5126 int slot; 5403 int slot;
5127 u64 start_ctransid;
5128 u64 ctransid;
5129 5404
5130 path = alloc_path_for_send(); 5405 path = alloc_path_for_send();
5131 if (!path) 5406 if (!path)
5132 return -ENOMEM; 5407 return -ENOMEM;
5133 5408
5134 spin_lock(&send_root->root_item_lock);
5135 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
5136 spin_unlock(&send_root->root_item_lock);
5137
5138 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 5409 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
5139 key.type = BTRFS_INODE_ITEM_KEY; 5410 key.type = BTRFS_INODE_ITEM_KEY;
5140 key.offset = 0; 5411 key.offset = 0;
5141 5412
5142 /*
5143 * Make sure the tree has not changed after re-joining. We detect this
5144 * by comparing start_ctransid and ctransid. They should always match.
5145 */
5146 spin_lock(&send_root->root_item_lock);
5147 ctransid = btrfs_root_ctransid(&send_root->root_item);
5148 spin_unlock(&send_root->root_item_lock);
5149
5150 if (ctransid != start_ctransid) {
5151 WARN(1, KERN_WARNING "BTRFS: the root that you're trying to "
5152 "send was modified in between. This is "
5153 "probably a bug.\n");
5154 ret = -EIO;
5155 goto out;
5156 }
5157
5158 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); 5413 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
5159 if (ret < 0) 5414 if (ret < 0)
5160 goto out; 5415 goto out;
@@ -5340,6 +5595,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5340 5595
5341 sctx->pending_dir_moves = RB_ROOT; 5596 sctx->pending_dir_moves = RB_ROOT;
5342 sctx->waiting_dir_moves = RB_ROOT; 5597 sctx->waiting_dir_moves = RB_ROOT;
5598 sctx->orphan_dirs = RB_ROOT;
5343 5599
5344 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 5600 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
5345 (arg->clone_sources_count + 1)); 5601 (arg->clone_sources_count + 1));
@@ -5435,7 +5691,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5435 NULL); 5691 NULL);
5436 sort_clone_roots = 1; 5692 sort_clone_roots = 1;
5437 5693
5694 current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
5438 ret = send_subvol(sctx); 5695 ret = send_subvol(sctx);
5696 current->journal_info = NULL;
5439 if (ret < 0) 5697 if (ret < 0)
5440 goto out; 5698 goto out;
5441 5699
@@ -5477,6 +5735,16 @@ out:
5477 kfree(dm); 5735 kfree(dm);
5478 } 5736 }
5479 5737
5738 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
5739 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
5740 struct rb_node *n;
5741 struct orphan_dir_info *odi;
5742
5743 n = rb_first(&sctx->orphan_dirs);
5744 odi = rb_entry(n, struct orphan_dir_info, node);
5745 free_orphan_dir_info(sctx, odi);
5746 }
5747
5480 if (sort_clone_roots) { 5748 if (sort_clone_roots) {
5481 for (i = 0; i < sctx->clone_roots_cnt; i++) 5749 for (i = 0; i < sctx->clone_roots_cnt; i++)
5482 btrfs_root_dec_send_in_progress( 5750 btrfs_root_dec_send_in_progress(
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d04db817be5c..9601d25a4607 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,6 +66,8 @@
66static const struct super_operations btrfs_super_ops; 66static const struct super_operations btrfs_super_ops;
67static struct file_system_type btrfs_fs_type; 67static struct file_system_type btrfs_fs_type;
68 68
69static int btrfs_remount(struct super_block *sb, int *flags, char *data);
70
69static const char *btrfs_decode_error(int errno) 71static const char *btrfs_decode_error(int errno)
70{ 72{
71 char *errstr = "unknown"; 73 char *errstr = "unknown";
@@ -383,20 +385,6 @@ static match_table_t tokens = {
383 {Opt_err, NULL}, 385 {Opt_err, NULL},
384}; 386};
385 387
386#define btrfs_set_and_info(root, opt, fmt, args...) \
387{ \
388 if (!btrfs_test_opt(root, opt)) \
389 btrfs_info(root->fs_info, fmt, ##args); \
390 btrfs_set_opt(root->fs_info->mount_opt, opt); \
391}
392
393#define btrfs_clear_and_info(root, opt, fmt, args...) \
394{ \
395 if (btrfs_test_opt(root, opt)) \
396 btrfs_info(root->fs_info, fmt, ##args); \
397 btrfs_clear_opt(root->fs_info->mount_opt, opt); \
398}
399
400/* 388/*
401 * Regular mount options parser. Everything that is needed only when 389 * Regular mount options parser. Everything that is needed only when
402 * reading in a new superblock is parsed here. 390 * reading in a new superblock is parsed here.
@@ -1184,7 +1172,31 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
1184 return ERR_PTR(-ENOMEM); 1172 return ERR_PTR(-ENOMEM);
1185 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, 1173 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
1186 newargs); 1174 newargs);
1175
1176 if (PTR_RET(mnt) == -EBUSY) {
1177 if (flags & MS_RDONLY) {
1178 mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
1179 newargs);
1180 } else {
1181 int r;
1182 mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
1183 newargs);
1184 if (IS_ERR(mnt)) {
1185 kfree(newargs);
1186 return ERR_CAST(mnt);
1187 }
1188
1189 r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
1190 if (r < 0) {
1191 /* FIXME: release vfsmount mnt ??*/
1192 kfree(newargs);
1193 return ERR_PTR(r);
1194 }
1195 }
1196 }
1197
1187 kfree(newargs); 1198 kfree(newargs);
1199
1188 if (IS_ERR(mnt)) 1200 if (IS_ERR(mnt))
1189 return ERR_CAST(mnt); 1201 return ERR_CAST(mnt);
1190 1202
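The new -EBUSY branch lets a subvolume mount reuse an existing superblock whose read-only state differs from the requested flags: an rw request is satisfied by mounting read-only and then remounting rw. A loose userspace analogue of that ro-then-remount upgrade via mount(2) (device and mountpoint are illustrative; this needs CAP_SYS_ADMIN and glosses over the shared-superblock details that make the kernel version necessary):

#include <errno.h>
#include <sys/mount.h>

/* Try an rw mount; if that is refused, fall back to mounting
 * read-only and upgrading with MS_REMOUNT, mirroring the shape of
 * the mount_subvol() fallback above. */
static int mount_rw_with_fallback(const char *dev, const char *dir)
{
	if (mount(dev, dir, "btrfs", 0, NULL) == 0)
		return 0;
	if (errno != EBUSY)
		return -1;
	if (mount(dev, dir, "btrfs", MS_RDONLY, NULL) != 0)
		return -1;
	return mount(dev, dir, "btrfs", MS_REMOUNT, NULL);
}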
@@ -1305,13 +1317,6 @@ error_fs_info:
1305 return ERR_PTR(error); 1317 return ERR_PTR(error);
1306} 1318}
1307 1319
1308static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
1309{
1310 spin_lock_irq(&workers->lock);
1311 workers->max_workers = new_limit;
1312 spin_unlock_irq(&workers->lock);
1313}
1314
1315static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, 1320static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1316 int new_pool_size, int old_pool_size) 1321 int new_pool_size, int old_pool_size)
1317{ 1322{
@@ -1323,21 +1328,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1323 btrfs_info(fs_info, "resize thread pool %d -> %d", 1328 btrfs_info(fs_info, "resize thread pool %d -> %d",
1324 old_pool_size, new_pool_size); 1329 old_pool_size, new_pool_size);
1325 1330
1326 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1331 btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
1327 btrfs_set_max_workers(&fs_info->workers, new_pool_size); 1332 btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
1328 btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); 1333 btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
1329 btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); 1334 btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
1330 btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); 1335 btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
1331 btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); 1336 btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
1332 btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); 1337 btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
1333 btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); 1338 new_pool_size);
1334 btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); 1339 btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
1335 btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); 1340 btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
1336 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1341 btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
1337 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1342 btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
1338 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1343 btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
1339 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, 1344 new_pool_size);
1340 new_pool_size);
1341} 1345}
1342 1346
1343static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) 1347static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
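With the switch to the new btrfs_workqueue, resizing the thread pool becomes a per-workqueue max_active update rather than poking each btrfs_workers struct under its lock. The same runtime-resize idea with the stock kernel workqueue API, as a minimal module sketch (not from this patch; the names and numbers are arbitrary):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *wq;

static int __init example_init(void)
{
	wq = alloc_workqueue("example", WQ_UNBOUND, 4);
	if (!wq)
		return -ENOMEM;
	/* Later, e.g. when a remount passes thread_pool=N: */
	workqueue_set_max_active(wq, 16);
	return 0;
}

static void __exit example_exit(void)
{
	destroy_workqueue(wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");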
@@ -1388,6 +1392,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1388 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1392 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1389 int ret; 1393 int ret;
1390 1394
1395 sync_filesystem(sb);
1391 btrfs_remount_prepare(fs_info); 1396 btrfs_remount_prepare(fs_info);
1392 1397
1393 ret = btrfs_parse_options(root, data); 1398 ret = btrfs_parse_options(root, data);
@@ -1479,6 +1484,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1479 sb->s_flags &= ~MS_RDONLY; 1484 sb->s_flags &= ~MS_RDONLY;
1480 } 1485 }
1481out: 1486out:
1487 wake_up_process(fs_info->transaction_kthread);
1482 btrfs_remount_cleanup(fs_info, old_opts); 1488 btrfs_remount_cleanup(fs_info, old_opts);
1483 return 0; 1489 return 0;
1484 1490
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 865f4cf9a769..c5eb2143dc66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
24#include <linux/kobject.h> 24#include <linux/kobject.h>
25#include <linux/bug.h> 25#include <linux/bug.h>
26#include <linux/genhd.h> 26#include <linux/genhd.h>
27#include <linux/debugfs.h>
27 28
28#include "ctree.h" 29#include "ctree.h"
29#include "disk-io.h" 30#include "disk-io.h"
@@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
599/* /sys/fs/btrfs/ entry */ 600/* /sys/fs/btrfs/ entry */
600static struct kset *btrfs_kset; 601static struct kset *btrfs_kset;
601 602
603/* /sys/kernel/debug/btrfs */
604static struct dentry *btrfs_debugfs_root_dentry;
605
606/* Debugging tunables and exported data */
607u64 btrfs_debugfs_test;
608
602int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 609int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
603{ 610{
604 int error; 611 int error;
@@ -642,27 +649,41 @@ failure:
642 return error; 649 return error;
643} 650}
644 651
652static int btrfs_init_debugfs(void)
653{
654#ifdef CONFIG_DEBUG_FS
655 btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
656 if (!btrfs_debugfs_root_dentry)
657 return -ENOMEM;
658
659 debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
660 &btrfs_debugfs_test);
661#endif
662 return 0;
663}
664
645int btrfs_init_sysfs(void) 665int btrfs_init_sysfs(void)
646{ 666{
647 int ret; 667 int ret;
668
648 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 669 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
649 if (!btrfs_kset) 670 if (!btrfs_kset)
650 return -ENOMEM; 671 return -ENOMEM;
651 672
652 init_feature_attrs(); 673 ret = btrfs_init_debugfs();
674 if (ret)
675 return ret;
653 676
677 init_feature_attrs();
654 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 678 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
655 if (ret) {
656 kset_unregister(btrfs_kset);
657 return ret;
658 }
659 679
660 return 0; 680 return ret;
661} 681}
662 682
663void btrfs_exit_sysfs(void) 683void btrfs_exit_sysfs(void)
664{ 684{
665 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 685 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
666 kset_unregister(btrfs_kset); 686 kset_unregister(btrfs_kset);
687 debugfs_remove_recursive(btrfs_debugfs_root_dentry);
667} 688}
668 689
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d44..9ab576318a84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
1#ifndef _BTRFS_SYSFS_H_ 1#ifndef _BTRFS_SYSFS_H_
2#define _BTRFS_SYSFS_H_ 2#define _BTRFS_SYSFS_H_
3 3
4/*
5 * Data exported through sysfs
6 */
7extern u64 btrfs_debugfs_test;
8
4enum btrfs_feature_set { 9enum btrfs_feature_set {
5 FEAT_COMPAT, 10 FEAT_COMPAT,
6 FEAT_COMPAT_RO, 11 FEAT_COMPAT_RO,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4a..7579f6d0b854 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,10 +75,21 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
75 } 75 }
76} 76}
77 77
78static noinline void switch_commit_root(struct btrfs_root *root) 78static noinline void switch_commit_roots(struct btrfs_transaction *trans,
79 struct btrfs_fs_info *fs_info)
79{ 80{
80 free_extent_buffer(root->commit_root); 81 struct btrfs_root *root, *tmp;
81 root->commit_root = btrfs_root_node(root); 82
83 down_write(&fs_info->commit_root_sem);
84 list_for_each_entry_safe(root, tmp, &trans->switch_commits,
85 dirty_list) {
86 list_del_init(&root->dirty_list);
87 free_extent_buffer(root->commit_root);
88 root->commit_root = btrfs_root_node(root);
89 if (is_fstree(root->objectid))
90 btrfs_unpin_free_ino(root);
91 }
92 up_write(&fs_info->commit_root_sem);
82} 93}
83 94
84static inline void extwriter_counter_inc(struct btrfs_transaction *trans, 95static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
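switch_commit_roots() converts scattered per-root commit-root swaps into one batch: roots are queued on the transaction's switch_commits list while dirty, and all of them are swapped inside a single commit_root_sem write section. A userspace sketch of that gather-then-swap-under-one-lock structure (a singly linked list and a pthread rwlock stand in for the kernel list and rw semaphore; names illustrative):

#include <stddef.h>
#include <pthread.h>

struct root {
	void *commit_root, *node;
	struct root *next_dirty;	/* stands in for dirty_list */
};

static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Gather phase: just queue, under whatever lock the caller holds. */
static void queue_switch(struct root **list, struct root *r)
{
	r->next_dirty = *list;
	*list = r;
}

/* Swap every queued root in one critical section, like
 * switch_commit_roots() under commit_root_sem. */
static void switch_commit_roots(struct root **list)
{
	struct root *r, *next;

	pthread_rwlock_wrlock(&commit_root_sem);
	for (r = *list; r; r = next) {
		next = r->next_dirty;
		r->commit_root = r->node;
		r->next_dirty = NULL;
	}
	*list = NULL;
	pthread_rwlock_unlock(&commit_root_sem);
}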
@@ -208,6 +219,7 @@ loop:
208 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 219 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
209 INIT_LIST_HEAD(&cur_trans->ordered_operations); 220 INIT_LIST_HEAD(&cur_trans->ordered_operations);
210 INIT_LIST_HEAD(&cur_trans->pending_chunks); 221 INIT_LIST_HEAD(&cur_trans->pending_chunks);
222 INIT_LIST_HEAD(&cur_trans->switch_commits);
211 list_add_tail(&cur_trans->list, &fs_info->trans_list); 223 list_add_tail(&cur_trans->list, &fs_info->trans_list);
212 extent_io_tree_init(&cur_trans->dirty_pages, 224 extent_io_tree_init(&cur_trans->dirty_pages,
213 fs_info->btree_inode->i_mapping); 225 fs_info->btree_inode->i_mapping);
@@ -375,7 +387,8 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
375 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 387 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
376 return ERR_PTR(-EROFS); 388 return ERR_PTR(-EROFS);
377 389
378 if (current->journal_info) { 390 if (current->journal_info &&
391 current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) {
379 WARN_ON(type & TRANS_EXTWRITERS); 392 WARN_ON(type & TRANS_EXTWRITERS);
380 h = current->journal_info; 393 h = current->journal_info;
381 h->use_count++; 394 h->use_count++;
@@ -683,7 +696,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
683 int lock = (trans->type != TRANS_JOIN_NOLOCK); 696 int lock = (trans->type != TRANS_JOIN_NOLOCK);
684 int err = 0; 697 int err = 0;
685 698
686 if (--trans->use_count) { 699 if (trans->use_count > 1) {
700 trans->use_count--;
687 trans->block_rsv = trans->orig_rsv; 701 trans->block_rsv = trans->orig_rsv;
688 return 0; 702 return 0;
689 } 703 }
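The use_count rework above peeks before decrementing: a nested end just drops a reference, while the outermost end keeps its still-counted handle and can pass it straight to btrfs_commit_transaction(), retiring the old throttle path's "decrement, re-increment, then commit" dance. Reduced to a runnable skeleton (illustrative types, no locking):

#include <stdio.h>

struct trans { int use_count; };

static int commit_transaction(struct trans *t)
{
	printf("commit (use_count=%d)\n", t->use_count);
	t->use_count = 0;
	return 0;
}

static int end_transaction(struct trans *t)
{
	/* Peek first: only a nested end drops a reference here; the
	 * last user hands its counted handle straight to commit. */
	if (t->use_count > 1) {
		t->use_count--;
		return 0;
	}
	return commit_transaction(t);
}

int main(void)
{
	struct trans t = { .use_count = 2 };

	end_transaction(&t);	/* nested end: just drops a reference */
	end_transaction(&t);	/* outermost end: commits */
	return 0;
}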
@@ -731,17 +745,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
731 } 745 }
732 746
733 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { 747 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
734 if (throttle) { 748 if (throttle)
735 /*
736 * We may race with somebody else here so end up having
737 * to call end_transaction on ourselves again, so inc
738 * our use_count.
739 */
740 trans->use_count++;
741 return btrfs_commit_transaction(trans, root); 749 return btrfs_commit_transaction(trans, root);
742 } else { 750 else
743 wake_up_process(info->transaction_kthread); 751 wake_up_process(info->transaction_kthread);
744 }
745 } 752 }
746 753
747 if (trans->type & __TRANS_FREEZABLE) 754 if (trans->type & __TRANS_FREEZABLE)
@@ -925,9 +932,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
925 return ret; 932 return ret;
926 } 933 }
927 934
928 if (root != root->fs_info->extent_root)
929 switch_commit_root(root);
930
931 return 0; 935 return 0;
932} 936}
933 937
@@ -983,15 +987,16 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
983 list_del_init(next); 987 list_del_init(next);
984 root = list_entry(next, struct btrfs_root, dirty_list); 988 root = list_entry(next, struct btrfs_root, dirty_list);
985 989
990 if (root != fs_info->extent_root)
991 list_add_tail(&root->dirty_list,
992 &trans->transaction->switch_commits);
986 ret = update_cowonly_root(trans, root); 993 ret = update_cowonly_root(trans, root);
987 if (ret) 994 if (ret)
988 return ret; 995 return ret;
989 } 996 }
990 997
991 down_write(&fs_info->extent_commit_sem); 998 list_add_tail(&fs_info->extent_root->dirty_list,
992 switch_commit_root(fs_info->extent_root); 999 &trans->transaction->switch_commits);
993 up_write(&fs_info->extent_commit_sem);
994
995 btrfs_after_dev_replace_commit(fs_info); 1000 btrfs_after_dev_replace_commit(fs_info);
996 1001
997 return 0; 1002 return 0;
@@ -1048,11 +1053,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
1048 smp_wmb(); 1053 smp_wmb();
1049 1054
1050 if (root->commit_root != root->node) { 1055 if (root->commit_root != root->node) {
1051 mutex_lock(&root->fs_commit_mutex); 1056 list_add_tail(&root->dirty_list,
1052 switch_commit_root(root); 1057 &trans->transaction->switch_commits);
1053 btrfs_unpin_free_ino(root);
1054 mutex_unlock(&root->fs_commit_mutex);
1055
1056 btrfs_set_root_node(&root->root_item, 1058 btrfs_set_root_node(&root->root_item,
1057 root->node); 1059 root->node);
1058 } 1060 }
@@ -1578,10 +1580,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1578 1580
1579 trace_btrfs_transaction_commit(root); 1581 trace_btrfs_transaction_commit(root);
1580 1582
1581 btrfs_scrub_continue(root);
1582
1583 if (current->journal_info == trans) 1583 if (current->journal_info == trans)
1584 current->journal_info = NULL; 1584 current->journal_info = NULL;
1585 btrfs_scrub_cancel(root->fs_info);
1585 1586
1586 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1587 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1587} 1588}
@@ -1621,7 +1622,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1621static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1622static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1622{ 1623{
1623 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1624 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1624 return btrfs_start_delalloc_roots(fs_info, 1); 1625 return btrfs_start_delalloc_roots(fs_info, 1, -1);
1625 return 0; 1626 return 0;
1626} 1627}
1627 1628
@@ -1754,7 +1755,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 /* ->aborted might be set after the previous check, so check it */ 1755 /* ->aborted might be set after the previous check, so check it */
1755 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1756 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1756 ret = cur_trans->aborted; 1757 ret = cur_trans->aborted;
1757 goto cleanup_transaction; 1758 goto scrub_continue;
1758 } 1759 }
1759 /* 1760 /*
1760 * the reloc mutex makes sure that we stop 1761 * the reloc mutex makes sure that we stop
@@ -1771,7 +1772,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1771 ret = create_pending_snapshots(trans, root->fs_info); 1772 ret = create_pending_snapshots(trans, root->fs_info);
1772 if (ret) { 1773 if (ret) {
1773 mutex_unlock(&root->fs_info->reloc_mutex); 1774 mutex_unlock(&root->fs_info->reloc_mutex);
1774 goto cleanup_transaction; 1775 goto scrub_continue;
1775 } 1776 }
1776 1777
1777 /* 1778 /*
@@ -1787,13 +1788,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1787 ret = btrfs_run_delayed_items(trans, root); 1788 ret = btrfs_run_delayed_items(trans, root);
1788 if (ret) { 1789 if (ret) {
1789 mutex_unlock(&root->fs_info->reloc_mutex); 1790 mutex_unlock(&root->fs_info->reloc_mutex);
1790 goto cleanup_transaction; 1791 goto scrub_continue;
1791 } 1792 }
1792 1793
1793 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1794 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1794 if (ret) { 1795 if (ret) {
1795 mutex_unlock(&root->fs_info->reloc_mutex); 1796 mutex_unlock(&root->fs_info->reloc_mutex);
1796 goto cleanup_transaction; 1797 goto scrub_continue;
1797 } 1798 }
1798 1799
1799 /* 1800 /*
@@ -1823,7 +1824,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1823 if (ret) { 1824 if (ret) {
1824 mutex_unlock(&root->fs_info->tree_log_mutex); 1825 mutex_unlock(&root->fs_info->tree_log_mutex);
1825 mutex_unlock(&root->fs_info->reloc_mutex); 1826 mutex_unlock(&root->fs_info->reloc_mutex);
1826 goto cleanup_transaction; 1827 goto scrub_continue;
1827 } 1828 }
1828 1829
1829 /* 1830 /*
@@ -1844,7 +1845,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1844 if (ret) { 1845 if (ret) {
1845 mutex_unlock(&root->fs_info->tree_log_mutex); 1846 mutex_unlock(&root->fs_info->tree_log_mutex);
1846 mutex_unlock(&root->fs_info->reloc_mutex); 1847 mutex_unlock(&root->fs_info->reloc_mutex);
1847 goto cleanup_transaction; 1848 goto scrub_continue;
1848 } 1849 }
1849 1850
1850 /* 1851 /*
@@ -1855,7 +1856,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1855 ret = cur_trans->aborted; 1856 ret = cur_trans->aborted;
1856 mutex_unlock(&root->fs_info->tree_log_mutex); 1857 mutex_unlock(&root->fs_info->tree_log_mutex);
1857 mutex_unlock(&root->fs_info->reloc_mutex); 1858 mutex_unlock(&root->fs_info->reloc_mutex);
1858 goto cleanup_transaction; 1859 goto scrub_continue;
1859 } 1860 }
1860 1861
1861 btrfs_prepare_extent_commit(trans, root); 1862 btrfs_prepare_extent_commit(trans, root);
@@ -1864,11 +1865,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1864 1865
1865 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1866 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1866 root->fs_info->tree_root->node); 1867 root->fs_info->tree_root->node);
1867 switch_commit_root(root->fs_info->tree_root); 1868 list_add_tail(&root->fs_info->tree_root->dirty_list,
1869 &cur_trans->switch_commits);
1868 1870
1869 btrfs_set_root_node(&root->fs_info->chunk_root->root_item, 1871 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1870 root->fs_info->chunk_root->node); 1872 root->fs_info->chunk_root->node);
1871 switch_commit_root(root->fs_info->chunk_root); 1873 list_add_tail(&root->fs_info->chunk_root->dirty_list,
1874 &cur_trans->switch_commits);
1875
1876 switch_commit_roots(cur_trans, root->fs_info);
1872 1877
1873 assert_qgroups_uptodate(trans); 1878 assert_qgroups_uptodate(trans);
1874 update_super_roots(root); 1879 update_super_roots(root);
@@ -1891,13 +1896,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1891 btrfs_error(root->fs_info, ret, 1896 btrfs_error(root->fs_info, ret,
1892 "Error while writing out transaction"); 1897 "Error while writing out transaction");
1893 mutex_unlock(&root->fs_info->tree_log_mutex); 1898 mutex_unlock(&root->fs_info->tree_log_mutex);
1894 goto cleanup_transaction; 1899 goto scrub_continue;
1895 } 1900 }
1896 1901
1897 ret = write_ctree_super(trans, root, 0); 1902 ret = write_ctree_super(trans, root, 0);
1898 if (ret) { 1903 if (ret) {
1899 mutex_unlock(&root->fs_info->tree_log_mutex); 1904 mutex_unlock(&root->fs_info->tree_log_mutex);
1900 goto cleanup_transaction; 1905 goto scrub_continue;
1901 } 1906 }
1902 1907
1903 /* 1908 /*
@@ -1940,6 +1945,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1940 1945
1941 return ret; 1946 return ret;
1942 1947
1948scrub_continue:
1949 btrfs_scrub_continue(root);
1943cleanup_transaction: 1950cleanup_transaction:
1944 btrfs_trans_release_metadata(trans, root); 1951 btrfs_trans_release_metadata(trans, root);
1945 trans->block_rsv = NULL; 1952 trans->block_rsv = NULL;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6ac037e9f9f0..b57b924e8e03 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -57,6 +57,7 @@ struct btrfs_transaction {
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head ordered_operations; 58 struct list_head ordered_operations;
59 struct list_head pending_chunks; 59 struct list_head pending_chunks;
60 struct list_head switch_commits;
60 struct btrfs_delayed_ref_root delayed_refs; 61 struct btrfs_delayed_ref_root delayed_refs;
61 int aborted; 62 int aborted;
62}; 63};
@@ -78,6 +79,8 @@ struct btrfs_transaction {
78#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ 79#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
79 __TRANS_ATTACH) 80 __TRANS_ATTACH)
80 81
82#define BTRFS_SEND_TRANS_STUB 1
83
81struct btrfs_trans_handle { 84struct btrfs_trans_handle {
82 u64 transid; 85 u64 transid;
83 u64 bytes_reserved; 86 u64 bytes_reserved;
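BTRFS_SEND_TRANS_STUB is never a real handle: send parks it in current->journal_info so start_transaction() can tell "a send is running on this task" apart from "this task already holds a transaction handle", without dereferencing anything. The sentinel-pointer idiom in isolation (a thread-local variable stands in for current->journal_info; names illustrative):

#include <stdio.h>

#define SEND_TRANS_STUB ((void *)1)	/* never dereferenced */

static __thread void *journal_info;	/* stands in for current->journal_info */

static void *start_transaction(void)
{
	/* A reusable nested handle lives here only if journal_info
	 * holds something other than the sentinel. */
	if (journal_info && journal_info != SEND_TRANS_STUB)
		return journal_info;	/* reuse the outer handle */
	return NULL;			/* caller allocates a fresh one */
}

int main(void)
{
	journal_info = SEND_TRANS_STUB;	/* send ioctl running */
	printf("reuse outer handle: %s\n",
	       start_transaction() ? "yes" : "no");
	journal_info = NULL;
	return 0;
}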
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..e2f45fc02610 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
136 * syncing the tree wait for us to finish 136 * syncing the tree wait for us to finish
137 */ 137 */
138static int start_log_trans(struct btrfs_trans_handle *trans, 138static int start_log_trans(struct btrfs_trans_handle *trans,
139 struct btrfs_root *root) 139 struct btrfs_root *root,
140 struct btrfs_log_ctx *ctx)
140{ 141{
142 int index;
141 int ret; 143 int ret;
142 int err = 0;
143 144
144 mutex_lock(&root->log_mutex); 145 mutex_lock(&root->log_mutex);
145 if (root->log_root) { 146 if (root->log_root) {
147 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
148 trans->transid) {
149 ret = -EAGAIN;
150 goto out;
151 }
152
146 if (!root->log_start_pid) { 153 if (!root->log_start_pid) {
147 root->log_start_pid = current->pid; 154 root->log_start_pid = current->pid;
148 root->log_multiple_pids = false; 155 root->log_multiple_pids = false;
@@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
152 159
153 atomic_inc(&root->log_batch); 160 atomic_inc(&root->log_batch);
154 atomic_inc(&root->log_writers); 161 atomic_inc(&root->log_writers);
162 if (ctx) {
163 index = root->log_transid % 2;
164 list_add_tail(&ctx->list, &root->log_ctxs[index]);
165 ctx->log_transid = root->log_transid;
166 }
155 mutex_unlock(&root->log_mutex); 167 mutex_unlock(&root->log_mutex);
156 return 0; 168 return 0;
157 } 169 }
158 root->log_multiple_pids = false; 170
159 root->log_start_pid = current->pid; 171 ret = 0;
160 mutex_lock(&root->fs_info->tree_log_mutex); 172 mutex_lock(&root->fs_info->tree_log_mutex);
161 if (!root->fs_info->log_root_tree) { 173 if (!root->fs_info->log_root_tree)
162 ret = btrfs_init_log_root_tree(trans, root->fs_info); 174 ret = btrfs_init_log_root_tree(trans, root->fs_info);
163 if (ret) 175 mutex_unlock(&root->fs_info->tree_log_mutex);
164 err = ret; 176 if (ret)
165 } 177 goto out;
166 if (err == 0 && !root->log_root) { 178
179 if (!root->log_root) {
167 ret = btrfs_add_log_tree(trans, root); 180 ret = btrfs_add_log_tree(trans, root);
168 if (ret) 181 if (ret)
169 err = ret; 182 goto out;
170 } 183 }
171 mutex_unlock(&root->fs_info->tree_log_mutex); 184 root->log_multiple_pids = false;
185 root->log_start_pid = current->pid;
172 atomic_inc(&root->log_batch); 186 atomic_inc(&root->log_batch);
173 atomic_inc(&root->log_writers); 187 atomic_inc(&root->log_writers);
188 if (ctx) {
189 index = root->log_transid % 2;
190 list_add_tail(&ctx->list, &root->log_ctxs[index]);
191 ctx->log_transid = root->log_transid;
192 }
193out:
174 mutex_unlock(&root->log_mutex); 194 mutex_unlock(&root->log_mutex);
175 return err; 195 return ret;
176} 196}
177 197
178/* 198/*
@@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
2359 return ret; 2379 return ret;
2360} 2380}
2361 2381
2362static int wait_log_commit(struct btrfs_trans_handle *trans, 2382static void wait_log_commit(struct btrfs_trans_handle *trans,
2363 struct btrfs_root *root, unsigned long transid) 2383 struct btrfs_root *root, int transid)
2364{ 2384{
2365 DEFINE_WAIT(wait); 2385 DEFINE_WAIT(wait);
2366 int index = transid % 2; 2386 int index = transid % 2;
@@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
2375 &wait, TASK_UNINTERRUPTIBLE); 2395 &wait, TASK_UNINTERRUPTIBLE);
2376 mutex_unlock(&root->log_mutex); 2396 mutex_unlock(&root->log_mutex);
2377 2397
2378 if (root->fs_info->last_trans_log_full_commit != 2398 if (root->log_transid_committed < transid &&
2379 trans->transid && root->log_transid < transid + 2 &&
2380 atomic_read(&root->log_commit[index])) 2399 atomic_read(&root->log_commit[index]))
2381 schedule(); 2400 schedule();
2382 2401
2383 finish_wait(&root->log_commit_wait[index], &wait); 2402 finish_wait(&root->log_commit_wait[index], &wait);
2384 mutex_lock(&root->log_mutex); 2403 mutex_lock(&root->log_mutex);
2385 } while (root->fs_info->last_trans_log_full_commit != 2404 } while (root->log_transid_committed < transid &&
2386 trans->transid && root->log_transid < transid + 2 &&
2387 atomic_read(&root->log_commit[index])); 2405 atomic_read(&root->log_commit[index]));
2388 return 0;
2389} 2406}
2390 2407
2391static void wait_for_writer(struct btrfs_trans_handle *trans, 2408static void wait_for_writer(struct btrfs_trans_handle *trans,
2392 struct btrfs_root *root) 2409 struct btrfs_root *root)
2393{ 2410{
2394 DEFINE_WAIT(wait); 2411 DEFINE_WAIT(wait);
2395 while (root->fs_info->last_trans_log_full_commit != 2412
2396 trans->transid && atomic_read(&root->log_writers)) { 2413 while (atomic_read(&root->log_writers)) {
2397 prepare_to_wait(&root->log_writer_wait, 2414 prepare_to_wait(&root->log_writer_wait,
2398 &wait, TASK_UNINTERRUPTIBLE); 2415 &wait, TASK_UNINTERRUPTIBLE);
2399 mutex_unlock(&root->log_mutex); 2416 mutex_unlock(&root->log_mutex);
2400 if (root->fs_info->last_trans_log_full_commit != 2417 if (atomic_read(&root->log_writers))
2401 trans->transid && atomic_read(&root->log_writers))
2402 schedule(); 2418 schedule();
2403 mutex_lock(&root->log_mutex); 2419 mutex_lock(&root->log_mutex);
2404 finish_wait(&root->log_writer_wait, &wait); 2420 finish_wait(&root->log_writer_wait, &wait);
2405 } 2421 }
2406} 2422}
2407 2423
2424static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2425 struct btrfs_log_ctx *ctx)
2426{
2427 if (!ctx)
2428 return;
2429
2430 mutex_lock(&root->log_mutex);
2431 list_del_init(&ctx->list);
2432 mutex_unlock(&root->log_mutex);
2433}
2434
2435/*
 2436 * Invoked with the log mutex held, or from a context where no other
 2437 * task can access the list.
2438 */
2439static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2440 int index, int error)
2441{
2442 struct btrfs_log_ctx *ctx;
2443
2444 if (!error) {
2445 INIT_LIST_HEAD(&root->log_ctxs[index]);
2446 return;
2447 }
2448
2449 list_for_each_entry(ctx, &root->log_ctxs[index], list)
2450 ctx->log_ret = error;
2451
2452 INIT_LIST_HEAD(&root->log_ctxs[index]);
2453}
2454
2408/* 2455/*
 2409 * btrfs_sync_log sends a given tree log down to the disk and 2456 * btrfs_sync_log sends a given tree log down to the disk and
2410 * updates the super blocks to record it. When this call is done, 2457 * updates the super blocks to record it. When this call is done,
@@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
2418 * that has happened. 2465 * that has happened.
2419 */ 2466 */
2420int btrfs_sync_log(struct btrfs_trans_handle *trans, 2467int btrfs_sync_log(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root) 2468 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2422{ 2469{
2423 int index1; 2470 int index1;
2424 int index2; 2471 int index2;
@@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2426 int ret; 2473 int ret;
2427 struct btrfs_root *log = root->log_root; 2474 struct btrfs_root *log = root->log_root;
2428 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2475 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2429 unsigned long log_transid = 0; 2476 int log_transid = 0;
2477 struct btrfs_log_ctx root_log_ctx;
2430 struct blk_plug plug; 2478 struct blk_plug plug;
2431 2479
 	mutex_lock(&root->log_mutex);
-	log_transid = root->log_transid;
-	index1 = root->log_transid % 2;
+	log_transid = ctx->log_transid;
+	if (root->log_transid_committed >= log_transid) {
+		mutex_unlock(&root->log_mutex);
+		return ctx->log_ret;
+	}
+
+	index1 = log_transid % 2;
 	if (atomic_read(&root->log_commit[index1])) {
-		wait_log_commit(trans, root, root->log_transid);
+		wait_log_commit(trans, root, log_transid);
 		mutex_unlock(&root->log_mutex);
-		return 0;
+		return ctx->log_ret;
 	}
+	ASSERT(log_transid == root->log_transid);
 	atomic_set(&root->log_commit[index1], 1);
 
 	/* wait for previous tree log sync to complete */
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-		wait_log_commit(trans, root, root->log_transid - 1);
+		wait_log_commit(trans, root, log_transid - 1);
+
 	while (1) {
 		int batch = atomic_read(&root->log_batch);
 		/* when we're on an ssd, just kick the log commit out */
@@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 
 	/* bail out if we need to do a full commit */
-	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+	if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+	    trans->transid) {
 		ret = -EAGAIN;
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&root->log_mutex);
@@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		blk_finish_plug(&plug);
 		btrfs_abort_transaction(trans, root, ret);
 		btrfs_free_logged_extents(log, log_transid);
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		mutex_unlock(&root->log_mutex);
 		goto out;
 	}
@@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
-	smp_mb();
 	/*
 	 * IO has been started, blocks of the log tree have WRITTEN flag set
 	 * in their headers. new modifications of the log will be written to
@@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	mutex_unlock(&root->log_mutex);
 
+	btrfs_init_log_ctx(&root_log_ctx);
+
 	mutex_lock(&log_root_tree->log_mutex);
 	atomic_inc(&log_root_tree->log_batch);
 	atomic_inc(&log_root_tree->log_writers);
+
+	index2 = log_root_tree->log_transid % 2;
+	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+	root_log_ctx.log_transid = log_root_tree->log_transid;
+
 	mutex_unlock(&log_root_tree->log_mutex);
 
 	ret = update_log_root(trans, log);
@@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 
 	if (ret) {
+		if (!list_empty(&root_log_ctx.list))
+			list_del_init(&root_log_ctx.list);
+
 		blk_finish_plug(&plug);
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		if (ret != -ENOSPC) {
 			btrfs_abort_transaction(trans, root, ret);
 			mutex_unlock(&log_root_tree->log_mutex);
 			goto out;
 		}
-		root->fs_info->last_trans_log_full_commit = trans->transid;
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	index2 = log_root_tree->log_transid % 2;
+	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+		mutex_unlock(&log_root_tree->log_mutex);
+		ret = root_log_ctx.log_ret;
+		goto out;
+	}
+
+	index2 = root_log_ctx.log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		wait_log_commit(trans, log_root_tree,
-				log_root_tree->log_transid);
+				root_log_ctx.log_transid);
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
-		ret = 0;
+		ret = root_log_ctx.log_ret;
 		goto out;
 	}
+	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
 	atomic_set(&log_root_tree->log_commit[index2], 1);
 
 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
 		wait_log_commit(trans, log_root_tree,
-				log_root_tree->log_transid - 1);
+				root_log_ctx.log_transid - 1);
 	}
 
 	wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * now that we've moved on to the tree of log tree roots,
 	 * check the full commit flag again
 	 */
-	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+	if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+	    trans->transid) {
 		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			 EXTENT_DIRTY | EXTENT_NEW);
 	blk_finish_plug(&plug);
 	if (ret) {
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		btrfs_abort_transaction(trans, root, ret);
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			btrfs_header_level(log_root_tree->node));
 
 	log_root_tree->log_transid++;
-	smp_mb();
-
 	mutex_unlock(&log_root_tree->log_mutex);
 
 	/*
@@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
 	if (ret) {
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_wake_log_root;
 	}
@@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->log_mutex);
 
 out_wake_log_root:
+	/*
+	 * We needn't get log_mutex here because we are sure all
+	 * the other tasks are blocked.
+	 */
+	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	log_root_tree->log_transid_committed++;
 	atomic_set(&log_root_tree->log_commit[index2], 0);
-	smp_mb();
+	mutex_unlock(&log_root_tree->log_mutex);
+
 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
 		wake_up(&log_root_tree->log_commit_wait[index2]);
out:
+	/* See above. */
+	btrfs_remove_all_log_ctxs(root, index1, ret);
+
+	mutex_lock(&root->log_mutex);
+	root->log_transid_committed++;
 	atomic_set(&root->log_commit[index1], 0);
-	smp_mb();
+	mutex_unlock(&root->log_mutex);
+
 	if (waitqueue_active(&root->log_commit_wait[index1]))
 		wake_up(&root->log_commit_wait[index1]);
 	return ret;
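
The hunks above convert the bare reads and writes of last_trans_log_full_commit, which used to pair with smp_mb(), into ACCESS_ONCE() accesses. A minimal sketch of the pattern outside the btrfs code (the struct and helpers below are illustrative stand-ins, not from the patch; ACCESS_ONCE itself is the kernel's own macro from linux/compiler.h):

#include <linux/types.h>
#include <linux/compiler.h>	/* ACCESS_ONCE(x): (*(volatile typeof(x) *)&(x)) */

/* Illustrative stand-in for the relevant btrfs_fs_info field. */
struct example_fs_info {
	u64 last_trans_log_full_commit;
};

/* One marked load: the compiler may not tear, fuse, or cache the read. */
static int example_full_commit_requested(struct example_fs_info *fs, u64 transid)
{
	return ACCESS_ONCE(fs->last_trans_log_full_commit) == transid;
}

/* One marked store; ordering against other fields still comes from the
 * surrounding log_mutex sections, not from this macro. */
static void example_request_full_commit(struct example_fs_info *fs, u64 transid)
{
	ACCESS_ONCE(fs->last_trans_log_full_commit) = transid;
}
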
@@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
 
 static int log_one_extent(struct btrfs_trans_handle *trans,
			   struct inode *inode, struct btrfs_root *root,
-			   struct extent_map *em, struct btrfs_path *path)
+			   struct extent_map *em, struct btrfs_path *path,
+			   struct list_head *logged_list)
 {
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_file_extent_item *fi;
@@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	u64 extent_offset = em->start - em->orig_start;
 	u64 block_len;
 	int ret;
-	int index = log->log_transid % 2;
 	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 	int extent_inserted = 0;
 
@@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	 * First check and see if our csums are on our outstanding ordered
 	 * extents.
 	 */
-again:
-	spin_lock_irq(&log->log_extents_lock[index]);
-	list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+	list_for_each_entry(ordered, logged_list, log_list) {
 		struct btrfs_ordered_sum *sum;
 
 		if (!mod_len)
 			break;
 
-		if (ordered->inode != inode)
-			continue;
-
 		if (ordered->file_offset + ordered->len <= mod_start ||
 		    mod_start + mod_len <= ordered->file_offset)
 			continue;
@@ -3632,12 +3720,6 @@ again:
 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
 				     &ordered->flags))
 			continue;
-		atomic_inc(&ordered->refs);
-		spin_unlock_irq(&log->log_extents_lock[index]);
-		/*
-		 * we've dropped the lock, we must either break or
-		 * start over after this.
-		 */
 
 		if (ordered->csum_bytes_left) {
 			btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3729,11 @@
 
 		list_for_each_entry(sum, &ordered->list, list) {
 			ret = btrfs_csum_file_blocks(trans, log, sum);
-			if (ret) {
-				btrfs_put_ordered_extent(ordered);
+			if (ret)
 				goto unlocked;
-			}
 		}
-		btrfs_put_ordered_extent(ordered);
-		goto again;
 
 	}
-	spin_unlock_irq(&log->log_extents_lock[index]);
unlocked:
 
 	if (!mod_len || ret)
@@ -3694,7 +3771,8 @@ unlocked:
 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct inode *inode,
-				      struct btrfs_path *path)
+				      struct btrfs_path *path,
+				      struct list_head *logged_list)
 {
 	struct extent_map *em, *n;
 	struct list_head extents;
@@ -3752,7 +3830,7 @@ process:
 
 		write_unlock(&tree->lock);
 
-		ret = log_one_extent(trans, inode, root, em, path);
+		ret = log_one_extent(trans, inode, root, em, path, logged_list);
 		write_lock(&tree->lock);
 		clear_em_logging(tree, em);
 		free_extent_map(em);
@@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src = NULL;
+	LIST_HEAD(logged_list);
 	u64 last_extent = 0;
 	int err = 0;
 	int ret;
@@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
-	btrfs_get_logged_extents(log, inode);
+	btrfs_get_logged_extents(inode, &logged_list);
 
 	/*
 	 * a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4041,8 @@ log_extents:
 	btrfs_release_path(path);
 	btrfs_release_path(dst_path);
 	if (fast_search) {
-		ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
+		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+						&logged_list);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
@@ -3987,8 +4067,10 @@ log_extents:
 	BTRFS_I(inode)->logged_trans = trans->transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
-	if (err)
-		btrfs_free_logged_extents(log, log->log_transid);
+	if (unlikely(err))
+		btrfs_put_logged_extents(&logged_list);
+	else
+		btrfs_submit_logged_extents(&logged_list, log);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
 	btrfs_free_path(path);
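
Taken together, these btrfs_log_inode() hunks move ordered-extent tracking off the global per-log-transid lists and onto a per-call logged_list, which also removes the goto-again retry and the log_extents_lock round trips in log_one_extent(). The resulting lifecycle, summarized as a comment (all names are the ones introduced above):

/*
 * LIST_HEAD(logged_list);
 * btrfs_get_logged_extents(inode, &logged_list);       pin this inode's ordered extents
 *     ...
 * btrfs_log_changed_extents(..., &logged_list);        log_one_extent() walks the list
 *     ...                                              for any outstanding csums
 * out_unlock:
 * if (err)
 *         btrfs_put_logged_extents(&logged_list);      failure: just drop the references
 * else
 *         btrfs_submit_logged_extents(&logged_list, log);   success: hand them to the log
 */
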
@@ -4079,7 +4161,8 @@ out:
  */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, struct inode *inode,
-				   struct dentry *parent, int exists_only)
+				   struct dentry *parent, int exists_only,
+				   struct btrfs_log_ctx *ctx)
 {
 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
 	struct super_block *sb;
@@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_no_trans;
 	}
 
-	ret = start_log_trans(trans, root);
+	ret = start_log_trans(trans, root, ctx);
 	if (ret)
-		goto end_trans;
+		goto end_no_trans;
 
 	ret = btrfs_log_inode(trans, root, inode, inode_only);
 	if (ret)
@@ -4166,6 +4249,9 @@ end_trans:
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		ret = 1;
 	}
+
+	if (ret)
+		btrfs_remove_log_ctx(root, ctx);
 	btrfs_end_log_trans(root);
end_no_trans:
 	return ret;
@@ -4178,12 +4264,14 @@ end_no_trans:
  * data on disk.
  */
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct dentry *dentry)
+			  struct btrfs_root *root, struct dentry *dentry,
+			  struct btrfs_log_ctx *ctx)
 {
 	struct dentry *parent = dget_parent(dentry);
 	int ret;
 
-	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+				     0, ctx);
 	dput(parent);
 
 	return ret;
@@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
			root->fs_info->last_trans_committed))
 		return 0;
 
-	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
 }
 
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..91b145fce333 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,28 @@
 /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
 #define BTRFS_NO_LOG_SYNC 256
 
+struct btrfs_log_ctx {
+	int log_ret;
+	int log_transid;
+	struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+	ctx->log_ret = 0;
+	ctx->log_transid = 0;
+	INIT_LIST_HEAD(&ctx->list);
+}
+
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
-		   struct btrfs_root *root);
+		   struct btrfs_root *root, struct btrfs_log_ctx *ctx);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info);
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct dentry *dentry);
+			  struct btrfs_root *root, struct dentry *dentry,
+			  struct btrfs_log_ctx *ctx);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  const char *name, int name_len,
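
Under these new declarations, a caller that wants its fsync result is expected to keep a btrfs_log_ctx on its stack, thread it through the logging call, and collect the outcome from btrfs_sync_log(). A minimal sketch of that calling pattern; the real caller is btrfs_sync_file() in fs/btrfs/file.c (also touched by this merge), and the function below is illustrative only, with BTRFS_NO_LOG_SYNC handling elided:

static int example_sync_tail(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct dentry *dentry)
{
	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);	/* log_ret = 0, log_transid = 0, empty list node */

	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	if (ret)
		return ret;		/* the ctx is unlinked on the failure path */

	/*
	 * Whichever task ends up performing the commit walks the per-transid
	 * ctx lists and fans the result out; everyone else just waits on it.
	 */
	return btrfs_sync_log(trans, root, &ctx);
}
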
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f80..49d7fab73360 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,7 +415,8 @@ loop_lock:
			device->running_pending = 1;
 
			spin_unlock(&device->io_lock);
-			btrfs_requeue_work(&device->work);
+			btrfs_queue_work(fs_info->submit_workers,
+					 &device->work);
			goto done;
		}
		/* unplug every 64 requests just for good measure */
@@ -447,6 +448,14 @@ static void pending_bios_fn(struct btrfs_work *work)
	run_scheduled_bios(device);
 }
 
+/*
+ * Add new device to list of registered devices
+ *
+ * Returns:
+ * 1   - first time device is seen
+ * 0   - device already known
+ * < 0 - error
+ */
 static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
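
Per the comment just added, device_list_add() now reports a tri-state result rather than flat success/failure. A sketch of how a caller consumes it (the in-tree caller is btrfs_scan_one_device(), updated a few hunks below; this fragment is illustrative):

	ret = device_list_add(path, disk_super, devid, &fs_devices);
	if (ret < 0)
		goto error;	/* e.g. -ENOMEM from the allocation paths */
	if (ret > 0) {
		/* first sighting of this device: worth printing exactly once */
		ret = 0;
	}
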
@@ -454,6 +463,7 @@ static noinline int device_list_add(const char *path,
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
+	int ret = 0;
	u64 found_transid = btrfs_super_generation(disk_super);
 
	fs_devices = find_fsid(disk_super->fsid);
@@ -494,6 +504,7 @@ static noinline int device_list_add(const char *path,
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);
 
+		ret = 1;
		device->fs_devices = fs_devices;
	} else if (!device->name || strcmp(device->name->str, path)) {
		name = rcu_string_strdup(path, GFP_NOFS);
@@ -512,7 +523,8 @@ static noinline int device_list_add(const char *path,
		fs_devices->latest_trans = found_transid;
	}
	*fs_devices_ret = fs_devices;
-	return 0;
+
+	return ret;
 }
 
 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -909,17 +921,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
	transid = btrfs_super_generation(disk_super);
	total_devices = btrfs_super_num_devices(disk_super);
 
-	if (disk_super->label[0]) {
-		if (disk_super->label[BTRFS_LABEL_SIZE - 1])
-			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
-		printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
-	} else {
-		printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
-	}
-
-	printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
-
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+	if (ret > 0) {
+		if (disk_super->label[0]) {
+			if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+				disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
+			printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
+		} else {
+			printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
+		}
+
+		printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
+		ret = 0;
+	}
	if (!ret && fs_devices_ret)
		(*fs_devices_ret)->total_devices = total_devices;
 
@@ -5263,6 +5277,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 static void btrfs_end_bio(struct bio *bio, int err)
 {
	struct btrfs_bio *bbio = bio->bi_private;
+	struct btrfs_device *dev = bbio->stripes[0].dev;
	int is_orig_bio = 0;
 
	if (err) {
@@ -5270,7 +5285,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
		if (err == -EIO || err == -EREMOTEIO) {
			unsigned int stripe_index =
				btrfs_io_bio(bio)->stripe_index;
-			struct btrfs_device *dev;
 
			BUG_ON(stripe_index >= bbio->num_stripes);
			dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5306,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
	if (bio == bbio->orig_bio)
		is_orig_bio = 1;
 
+	btrfs_bio_counter_dec(bbio->fs_info);
+
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
@@ -5328,13 +5344,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
	}
 }
 
-struct async_sched {
-	struct bio *bio;
-	int rw;
-	struct btrfs_fs_info *info;
-	struct btrfs_work work;
-};
-
 /*
  * see run_scheduled_bios for a description of why bios are collected for
  * async submit.
@@ -5391,8 +5400,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
	spin_unlock(&device->io_lock);
 
	if (should_queue)
-		btrfs_queue_worker(&root->fs_info->submit_workers,
-				   &device->work);
+		btrfs_queue_work(root->fs_info->submit_workers,
+				 &device->work);
 }
 
 static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5456,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
	}
 #endif
	bio->bi_bdev = dev->bdev;
+
+	btrfs_bio_counter_inc_noblocked(root->fs_info);
+
	if (async)
		btrfs_schedule_bio(root, dev, rw, bio);
	else
@@ -5515,28 +5527,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
	length = bio->bi_iter.bi_size;
	map_length = length;
 
+	btrfs_bio_counter_inc_blocked(root->fs_info);
	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
			      mirror_num, &raid_map);
-	if (ret) /* -ENOMEM */
+	if (ret) {
+		btrfs_bio_counter_dec(root->fs_info);
		return ret;
+	}
 
	total_devs = bbio->num_stripes;
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
+	bbio->fs_info = root->fs_info;
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
	if (raid_map) {
		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
		if (rw & WRITE) {
-			return raid56_parity_write(root, bio, bbio,
-						   raid_map, map_length);
+			ret = raid56_parity_write(root, bio, bbio,
+						  raid_map, map_length);
		} else {
-			return raid56_parity_recover(root, bio, bbio,
-						     raid_map, map_length,
-						     mirror_num);
+			ret = raid56_parity_recover(root, bio, bbio,
+						    raid_map, map_length,
+						    mirror_num);
		}
+		/*
+		 * FIXME, replace doesn't support raid56 yet, please fix
+		 * it in the future.
+		 */
+		btrfs_bio_counter_dec(root->fs_info);
+		return ret;
	}
 
	if (map_length < length) {
@@ -5578,6 +5600,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
					    async_submit);
		dev_nr++;
	}
+	btrfs_bio_counter_dec(root->fs_info);
	return 0;
 }
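
The btrfs_bio_counter_* calls threaded through btrfs_map_bio(), submit_stripe_bio() and btrfs_end_bio() above pair up as follows (a comment summary of the hunks, not new code; the raid56 path drops its reference before returning, as the FIXME notes):

/*
 * btrfs_map_bio()
 *     btrfs_bio_counter_inc_blocked(fs_info);             take a ref for the mapping
 *     __btrfs_map_block(...);                             on failure: dec and bail out
 *     bbio->fs_info = root->fs_info;                      record who to dec at end_io
 *     for each stripe:
 *         submit_stripe_bio()
 *             btrfs_bio_counter_inc_noblocked(fs_info);   one ref per in-flight bio
 *     btrfs_bio_counter_dec(fs_info);                     drop the mapping ref
 *
 * btrfs_end_bio()
 *     btrfs_bio_counter_dec(bbio->fs_info);               drop the per-bio ref
 */
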
@@ -5666,7 +5689,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
	else
		generate_random_uuid(dev->uuid);
 
-	dev->work.func = pending_bios_fn;
+	btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
 
	return dev;
 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
 struct btrfs_bio {
	atomic_t stripes_pending;
+	struct btrfs_fs_info *fs_info;
	bio_end_io_t *end_io;
	struct bio *orig_bio;
	void *private;
diff --git a/fs/buffer.c b/fs/buffer.c
index 27265a8b43c1..9ddb9fc7d923 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2114,8 +2114,8 @@ EXPORT_SYMBOL(generic_write_end);
  * Returns true if all buffers which correspond to a file portion
  * we want to read are uptodate.
  */
-int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
-					unsigned long from)
+int block_is_partially_uptodate(struct page *page, unsigned long from,
+					unsigned long count)
 {
	unsigned block_start, block_end, blocksize;
	unsigned to;
@@ -2127,7 +2127,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 
	head = page_buffers(page);
	blocksize = head->b_size;
-	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+	to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
	to = from + to;
	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
		return 0;
@@ -3088,7 +3088,7 @@ EXPORT_SYMBOL(submit_bh);
  * until the buffer gets unlocked).
  *
  * ll_rw_block sets b_end_io to simple completion handler that marks
- * the buffer up-to-date (if approriate), unlocks the buffer and wakes
+ * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
  * any waiters.
  *
  * All of the buffers must be for the same device, and must also be a
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 622f4696e484..5b99bafc31d1 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
	/* check parameters */
	ret = -EOPNOTSUPP;
	if (!root->d_inode ||
-	    !root->d_inode->i_op ||
	    !root->d_inode->i_op->lookup ||
	    !root->d_inode->i_op->mkdir ||
	    !root->d_inode->i_op->setxattr ||
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ca65f39dc8dc..c0a681705104 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -391,12 +391,12 @@ try_again:
	path.dentry = dir;
	path_to_graveyard.mnt = cache->mnt;
	path_to_graveyard.dentry = cache->graveyard;
-	ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
+	ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
	if (ret < 0) {
		cachefiles_io_error(cache, "Rename security error %d", ret);
	} else {
		ret = vfs_rename(dir->d_inode, rep,
-				 cache->graveyard->d_inode, grave, NULL);
+				 cache->graveyard->d_inode, grave, NULL, 0);
		if (ret != 0 && ret != -ENOMEM)
			cachefiles_io_error(cache,
					    "Rename failed with error %d", ret);
@@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
	}
 
	ret = -EPERM;
-	if (!subdir->d_inode->i_op ||
-	    !subdir->d_inode->i_op->setxattr ||
+	if (!subdir->d_inode->i_op->setxattr ||
	    !subdir->d_inode->i_op->getxattr ||
	    !subdir->d_inode->i_op->lookup ||
	    !subdir->d_inode->i_op->mkdir ||
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index ebaff368120d..4b1fb5ca65b8 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
			goto nomem_monitor;
		}
 
-		ret = add_to_page_cache(newpage, bmapping,
-					netpage->index, cachefiles_gfp);
+		ret = add_to_page_cache_lru(newpage, bmapping,
+					    netpage->index, cachefiles_gfp);
		if (ret == 0)
			goto installed_new_backing_page;
		if (ret != -EEXIST)
			goto nomem_page;
	}
 
-	/* we've installed a new backing page, so now we need to add it
-	 * to the LRU list and start it reading */
+	/* we've installed a new backing page, so now we need to start
+	 * it reading */
installed_new_backing_page:
	_debug("- new %p", newpage);
 
	backpage = newpage;
	newpage = NULL;
 
-	lru_cache_add_file(backpage);
-
read_backing_page:
	ret = bmapping->a_ops->readpage(NULL, backpage);
	if (ret < 0)
@@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
				goto nomem;
			}
 
-			ret = add_to_page_cache(newpage, bmapping,
-						netpage->index, cachefiles_gfp);
+			ret = add_to_page_cache_lru(newpage, bmapping,
+						    netpage->index,
+						    cachefiles_gfp);
			if (ret == 0)
				goto installed_new_backing_page;
			if (ret != -EEXIST)
				goto nomem;
		}
 
-		/* we've installed a new backing page, so now we need to add it
-		 * to the LRU list and start it reading */
+		/* we've installed a new backing page, so now we need
+		 * to start it reading */
	installed_new_backing_page:
		_debug("- new %p", newpage);
 
		backpage = newpage;
		newpage = NULL;
 
-		lru_cache_add_file(backpage);
-
	reread_backing_page:
		ret = bmapping->a_ops->readpage(NULL, backpage);
		if (ret < 0)
@@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
	monitor_backing_page:
		_debug("- monitor add");
 
-		ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-					cachefiles_gfp);
+		ret = add_to_page_cache_lru(netpage, op->mapping,
+					    netpage->index, cachefiles_gfp);
		if (ret < 0) {
			if (ret == -EEXIST) {
				page_cache_release(netpage);
@@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
			goto nomem;
		}
 
-		lru_cache_add_file(netpage);
-
		/* install a monitor */
		page_cache_get(netpage);
		monitor->netfs_page = netpage;
@@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
	backing_page_already_uptodate:
		_debug("- uptodate");
 
-		ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-					cachefiles_gfp);
+		ret = add_to_page_cache_lru(netpage, op->mapping,
+					    netpage->index, cachefiles_gfp);
		if (ret < 0) {
			if (ret == -EEXIST) {
				page_cache_release(netpage);
@@ -631,8 +626,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
		fscache_mark_page_cached(op, netpage);
 
-		lru_cache_add_file(netpage);
-
		/* the netpage is unlocked and marked up to date here */
		fscache_end_io(op, netpage, 0);
		page_cache_release(netpage);
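
All six rdwr.c hunks make the same substitution: add_to_page_cache_lru() (from mm/filemap.c) folds the page-cache insertion and the LRU add into one call, so the separate lru_cache_add_file() step, easy to miss on error paths, goes away. The shape of the change, reduced to its essentials (illustrative fragment):

	/* before: two steps, LRU add deferred and easy to skip on error */
	ret = add_to_page_cache(page, mapping, index, gfp);
	if (ret == 0)
		lru_cache_add_file(page);

	/* after: one call that also places the page on the LRU */
	ret = add_to_page_cache_lru(page, mapping, index, gfp);
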
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 8c44fdd4e1c3..834f9f3723fb 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
	ci->fscache = fscache_acquire_cookie(fsc->fscache,
					     &ceph_fscache_inode_object_def,
					     ci, true);
+	fscache_check_consistency(ci->fscache);
done:
	mutex_unlock(&inode->i_mutex);
 
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index da95f61b7a09..5ac591bd012b 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
 void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
 void ceph_queue_revalidate(struct inode *inode);
 
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	fscache_attr_changed(ci->fscache);
+}
+
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
	fscache_invalidate(ceph_inode(inode)->fscache);
@@ -135,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
 {
 }
 
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 17543383545c..c561b628ebce 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -622,8 +622,10 @@ retry:
 
	if (flags & CEPH_CAP_FLAG_AUTH) {
		if (ci->i_auth_cap == NULL ||
-		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
			ci->i_auth_cap = cap;
+			cap->mds_wanted = wanted;
+		}
		ci->i_cap_exporting_issued = 0;
	} else {
		WARN_ON(ci->i_auth_cap == cap);
@@ -885,7 +887,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
-		mds_wanted |= cap->mds_wanted;
+		if (cap == ci->i_auth_cap)
+			mds_wanted |= cap->mds_wanted;
+		else
+			mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
	}
	return mds_wanted;
 }
@@ -3256,7 +3261,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
		rel->seq = cpu_to_le32(cap->seq);
		rel->issue_seq = cpu_to_le32(cap->issue_seq),
		rel->mseq = cpu_to_le32(cap->mseq);
-		rel->caps = cpu_to_le32(cap->issued);
+		rel->caps = cpu_to_le32(cap->implemented);
		rel->wanted = cpu_to_le32(cap->mds_wanted);
		rel->dname_len = 0;
		rel->dname_seq = 0;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6d59006bfa27..16b54aa31f08 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)
	} else if (req->r_path1) {
		seq_printf(s, " #%llx/%s", req->r_ino1.ino,
			   req->r_path1);
+	} else {
+		seq_printf(s, " #%llx", req->r_ino1.ino);
	}
 
	if (req->r_old_dentry) {
@@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)
			path = NULL;
		spin_lock(&req->r_old_dentry->d_lock);
		seq_printf(s, " #%llx/%.*s (%s)",
-			   ceph_ino(req->r_old_dentry_dir),
+			   req->r_old_dentry_dir ?
+			   ceph_ino(req->r_old_dentry_dir) : 0,
			   req->r_old_dentry->d_name.len,
			   req->r_old_dentry->d_name.name,
			   path ? path : "");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 45eda6d7a40c..c29d6ae68874 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -119,7 +119,8 @@ static int fpos_cmp(loff_t l, loff_t r)
  * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
  * the MDS if/when the directory is modified).
  */
-static int __dcache_readdir(struct file *file, struct dir_context *ctx)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx,
+			    u32 shared_gen)
 {
	struct ceph_file_info *fi = file->private_data;
	struct dentry *parent = file->f_dentry;
@@ -133,14 +134,14 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx)
	last = fi->dentry;
	fi->dentry = NULL;
 
-	dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
-	     last);
+	dout("__dcache_readdir %p v%u at %llu (last %p)\n",
+	     dir, shared_gen, ctx->pos, last);
 
	spin_lock(&parent->d_lock);
 
	/* start at beginning? */
	if (ctx->pos == 2 || last == NULL ||
-	    ctx->pos < ceph_dentry(last)->offset) {
+	    fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
		if (list_empty(&parent->d_subdirs))
			goto out_unlock;
		p = parent->d_subdirs.prev;
@@ -161,7 +162,8 @@ more:
		goto out_unlock;
	}
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	if (!d_unhashed(dentry) && dentry->d_inode &&
+	if (di->lease_shared_gen == shared_gen &&
+	    !d_unhashed(dentry) && dentry->d_inode &&
	    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
	    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
	    fpos_cmp(ctx->pos, di->offset) <= 0)
@@ -180,9 +182,16 @@ more:
	spin_unlock(&dentry->d_lock);
	spin_unlock(&parent->d_lock);
 
+	/* make sure a dentry wasn't dropped while we didn't have parent lock */
+	if (!ceph_dir_is_complete(dir)) {
+		dout(" lost dir complete on %p; falling back to mds\n", dir);
+		dput(dentry);
+		err = -EAGAIN;
+		goto out;
+	}
+
	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-	ctx->pos = di->offset;
	if (!dir_emit(ctx, dentry->d_name.name,
		      dentry->d_name.len,
		      ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
@@ -190,25 +199,18 @@ more:
		if (last) {
			/* remember our position */
			fi->dentry = last;
-			fi->next_offset = di->offset;
+			fi->next_offset = fpos_off(di->offset);
		}
		dput(dentry);
		return 0;
	}
 
+	ctx->pos = di->offset + 1;
+
	if (last)
		dput(last);
	last = dentry;
 
-	ctx->pos++;
-
-	/* make sure a dentry wasn't dropped while we didn't have parent lock */
-	if (!ceph_dir_is_complete(dir)) {
-		dout(" lost dir complete on %p; falling back to mds\n", dir);
-		err = -EAGAIN;
-		goto out;
-	}
-
	spin_lock(&parent->d_lock);
	p = p->prev;	/* advance to next dentry */
	goto more;
@@ -252,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
	int err;
	u32 ftype;
	struct ceph_mds_reply_info_parsed *rinfo;
-	const int max_entries = fsc->mount_options->max_readdir;
-	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
	dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
	if (fi->flags & CEPH_F_ATEND)
@@ -291,10 +291,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
	    ceph_snap(inode) != CEPH_SNAPDIR &&
	    __ceph_dir_is_complete(ci) &&
	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		u32 shared_gen = ci->i_shared_gen;
		spin_unlock(&ci->i_ceph_lock);
-		err = __dcache_readdir(file, ctx);
+		err = __dcache_readdir(file, ctx, shared_gen);
		if (err != -EAGAIN)
			return err;
+		frag = fpos_frag(ctx->pos);
+		off = fpos_off(ctx->pos);
	} else {
		spin_unlock(&ci->i_ceph_lock);
	}
@@ -322,14 +325,16 @@ more:
		fi->last_readdir = NULL;
	}
 
-	/* requery frag tree, as the frag topology may have changed */
-	frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-
	dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
	     ceph_vinop(inode), frag, fi->last_name);
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
+	err = ceph_alloc_readdir_reply_buffer(req, inode);
+	if (err) {
+		ceph_mdsc_put_request(req);
+		return err;
+	}
	req->r_inode = inode;
	ihold(inode);
	req->r_dentry = dget(file->f_dentry);
@@ -340,9 +345,6 @@ more:
	req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
	req->r_readdir_offset = fi->next_offset;
	req->r_args.readdir.frag = cpu_to_le32(frag);
-	req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
-	req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
-	req->r_num_caps = max_entries + 1;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err < 0) {
		ceph_mdsc_put_request(req);
@@ -369,9 +371,9 @@ more:
			fi->next_offset = 0;
		off = fi->next_offset;
	}
+	fi->frag = frag;
	fi->offset = fi->next_offset;
	fi->last_readdir = req;
-	fi->frag = frag;
 
	if (req->r_reply_info.dir_end) {
		kfree(fi->last_name);
@@ -446,7 +448,6 @@ more:
	if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
		dout(" marking %p complete\n", inode);
		__ceph_dir_set_complete(ci, fi->dir_release_count);
-		ci->i_max_offset = ctx->pos;
	}
	spin_unlock(&ci->i_ceph_lock);
 
@@ -454,7 +455,7 @@ more:
	return 0;
 }
 
-static void reset_readdir(struct ceph_file_info *fi)
+static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
 {
	if (fi->last_readdir) {
		ceph_mdsc_put_request(fi->last_readdir);
@@ -462,7 +463,10 @@ static void reset_readdir(struct ceph_file_info *fi)
	}
	kfree(fi->last_name);
	fi->last_name = NULL;
-	fi->next_offset = 2;  /* compensate for . and .. */
+	if (ceph_frag_is_leftmost(frag))
+		fi->next_offset = 2;  /* compensate for . and .. */
+	else
+		fi->next_offset = 0;
	if (fi->dentry) {
		dput(fi->dentry);
		fi->dentry = NULL;
@@ -474,7 +478,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 {
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file->f_mapping->host;
-	loff_t old_offset = offset;
+	loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
	loff_t retval;
 
	mutex_lock(&inode->i_mutex);
@@ -491,7 +495,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
		goto out;
	}
 
-	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+	if (offset >= 0) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
@@ -504,14 +508,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
		 * seek to new frag, or seek prior to current chunk.
		 */
		if (offset == 0 ||
-		    fpos_frag(offset) != fpos_frag(old_offset) ||
+		    fpos_frag(offset) != fi->frag ||
		    fpos_off(offset) < fi->offset) {
			dout("dir_llseek dropping %p content\n", file);
-			reset_readdir(fi);
+			reset_readdir(fi, fpos_frag(offset));
		}
 
		/* bump dir_release_count if we did a forward seek */
-		if (offset > old_offset)
+		if (fpos_cmp(offset, old_offset) > 0)
			fi->dir_release_count--;
	}
out:
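
The llseek and readdir changes above lean on the fpos helpers, which pack a directory position as (fragment, intra-frag offset). For reference, the encoding as defined in fs/ceph/super.h (reproduced here as of this merge; see the header for the authoritative copies):

static inline loff_t ceph_make_fpos(unsigned high, unsigned off)
{
	return ((loff_t)high << 32) | (loff_t)off;
}

static inline unsigned fpos_frag(loff_t p)
{
	return p >> 32;
}

static inline unsigned fpos_off(loff_t p)
{
	return p & 0xffffffff;
}
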
@@ -812,8 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
-	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
-	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+	req->r_old_dentry = dget(old_dentry);
	req->r_locked_dir = dir;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -911,10 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
+	ihold(old_dir);
	req->r_dentry = dget(new_dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
-	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+	req->r_old_dentry_dir = old_dir;
	req->r_locked_dir = new_dir;
	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -932,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
	 * to do it here.
	 */
 
-		/* d_move screws up d_subdirs order */
-		ceph_dir_clear_complete(new_dir);
-
		d_move(old_dentry, new_dentry);
 
		/* ensure target dentry is invalidated, despite
		   rehashing bug in vfs_rename_dir */
		ceph_invalidate_dentry_lease(new_dentry);
+
+		/* d_move screws up sibling dentries' offsets */
+		ceph_dir_clear_complete(old_dir);
+		ceph_dir_clear_complete(new_dir);
+
	}
	ceph_mdsc_put_request(req);
	return err;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 16796be53ca5..00d6af6a32ec 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -8,23 +8,6 @@
 #include "mds_client.h"
 
 /*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architectures doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to this more reliable, but in the non-connectable fh
- * case, we won't every work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
-
-/*
  * Basic fh
  */
 struct ceph_nfs_fh {
@@ -32,22 +15,12 @@ struct ceph_nfs_fh {
 } __attribute__ ((packed));
 
 /*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
+ * Larger fh that includes parent ino.
  */
 struct ceph_nfs_confh {
	u64 ino, parent_ino;
-	u32 parent_name_hash;
 } __attribute__ ((packed));
 
-/*
- * The presence of @parent_inode here tells us whether NFS wants a
- * connectable file handle. However, we want to make a connectionable
- * file handle unconditionally so that the MDS gets as much of a hint
- * as possible. That means we only use @parent_dentry to indicate
- * whether nfsd wants a connectable fh, and whether we should indicate
- * failure from a too-small @max_len.
- */
 static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
			   struct inode *parent_inode)
 {
@@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
	struct ceph_nfs_confh *cfh = (void *)rawfh;
	int connected_handle_length = sizeof(*cfh)/4;
	int handle_length = sizeof(*fh)/4;
-	struct dentry *dentry;
-	struct dentry *parent;
 
	/* don't re-export snaps */
	if (ceph_snap(inode) != CEPH_NOSNAP)
		return -EINVAL;
 
-	dentry = d_find_alias(inode);
+	if (parent_inode && (*max_len < connected_handle_length)) {
+		*max_len = connected_handle_length;
+		return FILEID_INVALID;
+	} else if (*max_len < handle_length) {
+		*max_len = handle_length;
+		return FILEID_INVALID;
+	}
 
-	/* if we found an alias, generate a connectable fh */
-	if (*max_len >= connected_handle_length && dentry) {
-		dout("encode_fh %p connectable\n", dentry);
-		spin_lock(&dentry->d_lock);
-		parent = dentry->d_parent;
+	if (parent_inode) {
+		dout("encode_fh %llx with parent %llx\n",
+		     ceph_ino(inode), ceph_ino(parent_inode));
		cfh->ino = ceph_ino(inode);
-		cfh->parent_ino = ceph_ino(parent->d_inode);
-		cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
-							 dentry);
+		cfh->parent_ino = ceph_ino(parent_inode);
		*max_len = connected_handle_length;
-		type = 2;
-		spin_unlock(&dentry->d_lock);
-	} else if (*max_len >= handle_length) {
-		if (parent_inode) {
-			/* nfsd wants connectable */
-			*max_len = connected_handle_length;
-			type = FILEID_INVALID;
-		} else {
-			dout("encode_fh %p\n", dentry);
-			fh->ino = ceph_ino(inode);
-			*max_len = handle_length;
-			type = 1;
-		}
+		type = FILEID_INO32_GEN_PARENT;
	} else {
+		dout("encode_fh %llx\n", ceph_ino(inode));
+		fh->ino = ceph_ino(inode);
		*max_len = handle_length;
-		type = FILEID_INVALID;
+		type = FILEID_INO32_GEN;
	}
-	if (dentry)
-		dput(dentry);
	return type;
 }
 
-/*
- * convert regular fh to dentry
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
-				     struct ceph_nfs_fh *fh, int fh_len)
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
 {
	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
	struct inode *inode;
@@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
111 struct ceph_vino vino; 66 struct ceph_vino vino;
112 int err; 67 int err;
113 68
114 if (fh_len < sizeof(*fh) / 4) 69 vino.ino = ino;
115 return ERR_PTR(-ESTALE);
116
117 dout("__fh_to_dentry %llx\n", fh->ino);
118 vino.ino = fh->ino;
119 vino.snap = CEPH_NOSNAP; 70 vino.snap = CEPH_NOSNAP;
120 inode = ceph_find_inode(sb, vino); 71 inode = ceph_find_inode(sb, vino);
121 if (!inode) { 72 if (!inode) {
@@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
139 90
140 dentry = d_obtain_alias(inode); 91 dentry = d_obtain_alias(inode);
141 if (IS_ERR(dentry)) { 92 if (IS_ERR(dentry)) {
142 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
143 fh->ino, inode);
144 iput(inode); 93 iput(inode);
145 return dentry; 94 return dentry;
146 } 95 }
147 err = ceph_init_dentry(dentry); 96 err = ceph_init_dentry(dentry);
148 if (err < 0) { 97 if (err < 0) {
149 iput(inode); 98 dput(dentry);
150 return ERR_PTR(err); 99 return ERR_PTR(err);
151 } 100 }
152 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); 101 dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
153 return dentry; 102 return dentry;
154} 103}
155 104
156/* 105/*
157 * convert connectable fh to dentry 106 * convert regular fh to dentry
158 */ 107 */
159static struct dentry *__cfh_to_dentry(struct super_block *sb, 108static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
160 struct ceph_nfs_confh *cfh, int fh_len) 109 struct fid *fid,
110 int fh_len, int fh_type)
111{
112 struct ceph_nfs_fh *fh = (void *)fid->raw;
113
114 if (fh_type != FILEID_INO32_GEN &&
115 fh_type != FILEID_INO32_GEN_PARENT)
116 return NULL;
117 if (fh_len < sizeof(*fh) / 4)
118 return NULL;
119
120 dout("fh_to_dentry %llx\n", fh->ino);
121 return __fh_to_dentry(sb, fh->ino);
122}
123
124static struct dentry *__get_parent(struct super_block *sb,
125 struct dentry *child, u64 ino)
161{ 126{
162 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 127 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
128 struct ceph_mds_request *req;
163 struct inode *inode; 129 struct inode *inode;
164 struct dentry *dentry; 130 struct dentry *dentry;
165 struct ceph_vino vino;
166 int err; 131 int err;
167 132
168 if (fh_len < sizeof(*cfh) / 4) 133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
169 return ERR_PTR(-ESTALE); 134 USE_ANY_MDS);
170 135 if (IS_ERR(req))
171 dout("__cfh_to_dentry %llx (%llx/%x)\n", 136 return ERR_CAST(req);
172 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
173
174 vino.ino = cfh->ino;
175 vino.snap = CEPH_NOSNAP;
176 inode = ceph_find_inode(sb, vino);
177 if (!inode) {
178 struct ceph_mds_request *req;
179
180 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
181 USE_ANY_MDS);
182 if (IS_ERR(req))
183 return ERR_CAST(req);
184 137
185 req->r_ino1 = vino; 138 if (child) {
186 req->r_ino2.ino = cfh->parent_ino; 139 req->r_inode = child->d_inode;
187 req->r_ino2.snap = CEPH_NOSNAP; 140 ihold(child->d_inode);
188 req->r_path2 = kmalloc(16, GFP_NOFS); 141 } else {
189 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); 142 req->r_ino1 = (struct ceph_vino) {
190 req->r_num_caps = 1; 143 .ino = ino,
191 err = ceph_mdsc_do_request(mdsc, NULL, req); 144 .snap = CEPH_NOSNAP,
192 inode = req->r_target_inode; 145 };
193 if (inode)
194 ihold(inode);
195 ceph_mdsc_put_request(req);
196 if (!inode)
197 return ERR_PTR(err ? err : -ESTALE);
198 } 146 }
147 req->r_num_caps = 1;
148 err = ceph_mdsc_do_request(mdsc, NULL, req);
149 inode = req->r_target_inode;
150 if (inode)
151 ihold(inode);
152 ceph_mdsc_put_request(req);
153 if (!inode)
154 return ERR_PTR(-ENOENT);
199 155
200 dentry = d_obtain_alias(inode); 156 dentry = d_obtain_alias(inode);
201 if (IS_ERR(dentry)) { 157 if (IS_ERR(dentry)) {
202 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
203 cfh->ino, inode);
204 iput(inode); 158 iput(inode);
205 return dentry; 159 return dentry;
206 } 160 }
207 err = ceph_init_dentry(dentry); 161 err = ceph_init_dentry(dentry);
208 if (err < 0) { 162 if (err < 0) {
209 iput(inode); 163 dput(dentry);
210 return ERR_PTR(err); 164 return ERR_PTR(err);
211 } 165 }
212 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); 166 dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
167 child ? ceph_ino(child->d_inode) : ino,
168 dentry, ceph_vinop(inode));
213 return dentry; 169 return dentry;
214} 170}
215 171
216static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, 172struct dentry *ceph_get_parent(struct dentry *child)
217 int fh_len, int fh_type)
218{ 173{
219 if (fh_type == 1) 174 /* don't re-export snaps */
220 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw, 175 if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
221 fh_len); 176 return ERR_PTR(-EINVAL);
222 else 177
223 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw, 178 dout("get_parent %p ino %llx.%llx\n",
224 fh_len); 179 child, ceph_vinop(child->d_inode));
180 return __get_parent(child->d_sb, child, 0);
225} 181}
226 182
227/* 183/*
228 * get parent, if possible. 184 * convert regular fh to parent
229 *
230 * FIXME: we could do better by querying the mds to discover the
231 * parent.
232 */ 185 */
233static struct dentry *ceph_fh_to_parent(struct super_block *sb, 186static struct dentry *ceph_fh_to_parent(struct super_block *sb,
234 struct fid *fid, 187 struct fid *fid,
235 int fh_len, int fh_type) 188 int fh_len, int fh_type)
236{ 189{
237 struct ceph_nfs_confh *cfh = (void *)fid->raw; 190 struct ceph_nfs_confh *cfh = (void *)fid->raw;
238 struct ceph_vino vino;
239 struct inode *inode;
240 struct dentry *dentry; 191 struct dentry *dentry;
241 int err;
242 192
243 if (fh_type == 1) 193 if (fh_type != FILEID_INO32_GEN_PARENT)
244 return ERR_PTR(-ESTALE); 194 return NULL;
245 if (fh_len < sizeof(*cfh) / 4) 195 if (fh_len < sizeof(*cfh) / 4)
246 return ERR_PTR(-ESTALE); 196 return NULL;
247 197
248 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, 198 dout("fh_to_parent %llx\n", cfh->parent_ino);
249 cfh->parent_name_hash); 199 dentry = __get_parent(sb, NULL, cfh->ino);
200 if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
201 dentry = __fh_to_dentry(sb, cfh->parent_ino);
202 return dentry;
203}
250 204
251 vino.ino = cfh->ino; 205static int ceph_get_name(struct dentry *parent, char *name,
252 vino.snap = CEPH_NOSNAP; 206 struct dentry *child)
253 inode = ceph_find_inode(sb, vino); 207{
254 if (!inode) 208 struct ceph_mds_client *mdsc;
255 return ERR_PTR(-ESTALE); 209 struct ceph_mds_request *req;
210 int err;
256 211
257 dentry = d_obtain_alias(inode); 212 mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
258 if (IS_ERR(dentry)) { 213 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
259 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 214 USE_ANY_MDS);
260 cfh->ino, inode); 215 if (IS_ERR(req))
261 iput(inode); 216 return PTR_ERR(req);
262 return dentry; 217
263 } 218 mutex_lock(&parent->d_inode->i_mutex);
264 err = ceph_init_dentry(dentry); 219
265 if (err < 0) { 220 req->r_inode = child->d_inode;
266 iput(inode); 221 ihold(child->d_inode);
267 return ERR_PTR(err); 222 req->r_ino2 = ceph_vino(parent->d_inode);
223 req->r_locked_dir = parent->d_inode;
224 req->r_num_caps = 2;
225 err = ceph_mdsc_do_request(mdsc, NULL, req);
226
227 mutex_unlock(&parent->d_inode->i_mutex);
228
229 if (!err) {
230 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
231 memcpy(name, rinfo->dname, rinfo->dname_len);
232 name[rinfo->dname_len] = 0;
233 dout("get_name %p ino %llx.%llx name %s\n",
234 child, ceph_vinop(child->d_inode), name);
235 } else {
236 dout("get_name %p ino %llx.%llx err %d\n",
237 child, ceph_vinop(child->d_inode), err);
268 } 238 }
269 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); 239
270 return dentry; 240 ceph_mdsc_put_request(req);
241 return err;
271} 242}
272 243
273const struct export_operations ceph_export_ops = { 244const struct export_operations ceph_export_ops = {
274 .encode_fh = ceph_encode_fh, 245 .encode_fh = ceph_encode_fh,
275 .fh_to_dentry = ceph_fh_to_dentry, 246 .fh_to_dentry = ceph_fh_to_dentry,
276 .fh_to_parent = ceph_fh_to_parent, 247 .fh_to_parent = ceph_fh_to_parent,
248 .get_parent = ceph_get_parent,
249 .get_name = ceph_get_name,
277}; 250};
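With the name hash dropped, both handle layouts are fixed-size, and the generic FILEID_INO32_GEN / FILEID_INO32_GEN_PARENT type codes (values 1 and 2 in include/linux/exportfs.h) replace the bare constants the old code returned. A stand-alone user-space sketch of how a consumer of these handles could decode them (struct layouts copied from the file above):

	#include <stdint.h>
	#include <stdio.h>

	struct ceph_nfs_fh { uint64_t ino; } __attribute__((packed));
	struct ceph_nfs_confh { uint64_t ino, parent_ino; } __attribute__((packed));

	#define FILEID_INO32_GEN        1   /* from include/linux/exportfs.h */
	#define FILEID_INO32_GEN_PARENT 2

	static void dump_fh(int fh_type, const void *raw, int fh_len)
	{
		/* fh_len is in 32-bit words, matching the kernel convention */
		if (fh_type == FILEID_INO32_GEN_PARENT &&
		    fh_len >= (int)(sizeof(struct ceph_nfs_confh) / 4)) {
			const struct ceph_nfs_confh *cfh = raw;
			printf("ino %llx parent %llx\n",
			       (unsigned long long)cfh->ino,
			       (unsigned long long)cfh->parent_ino);
		} else if (fh_type == FILEID_INO32_GEN &&
			   fh_len >= (int)(sizeof(struct ceph_nfs_fh) / 4)) {
			const struct ceph_nfs_fh *fh = raw;
			printf("ino %llx\n", (unsigned long long)fh->ino);
		}
	}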
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 09c7afe32e49..88a6df4cbe6d 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -210,7 +210,7 @@ int ceph_open(struct inode *inode, struct file *file)
210 ihold(inode); 210 ihold(inode);
211 211
212 req->r_num_caps = 1; 212 req->r_num_caps = 1;
213 if (flags & (O_CREAT|O_TRUNC)) 213 if (flags & O_CREAT)
214 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 214 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
215 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 215 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
216 iput(parent_inode); 216 iput(parent_inode);
@@ -291,8 +291,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
291 } 291 }
292 err = finish_open(file, dentry, ceph_open, opened); 292 err = finish_open(file, dentry, ceph_open, opened);
293 } 293 }
294
295out_err: 294out_err:
295 if (!req->r_err && req->r_target_inode)
296 ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
296 ceph_mdsc_put_request(req); 297 ceph_mdsc_put_request(req);
297 dout("atomic_open result=%d\n", err); 298 dout("atomic_open result=%d\n", err);
298 return err; 299 return err;
@@ -600,7 +601,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
600 false); 601 false);
601 if (IS_ERR(req)) { 602 if (IS_ERR(req)) {
602 ret = PTR_ERR(req); 603 ret = PTR_ERR(req);
603 goto out; 604 break;
604 } 605 }
605 606
606 num_pages = calc_pages_for(page_align, len); 607 num_pages = calc_pages_for(page_align, len);
@@ -718,7 +719,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
718 false); 719 false);
719 if (IS_ERR(req)) { 720 if (IS_ERR(req)) {
720 ret = PTR_ERR(req); 721 ret = PTR_ERR(req);
721 goto out; 722 break;
722 } 723 }
723 724
724 /* 725 /*
@@ -970,6 +971,8 @@ retry_snap:
970 goto retry_snap; 971 goto retry_snap;
971 } 972 }
972 } else { 973 } else {
974 loff_t old_size = inode->i_size;
975 struct iov_iter from;
973 /* 976 /*
974 * No need to acquire the i_truncate_mutex. Because 977 * No need to acquire the i_truncate_mutex. Because
975 * the MDS revokes Fwb caps before sending truncate 978 * the MDS revokes Fwb caps before sending truncate
@@ -977,9 +980,12 @@ retry_snap:
977 * are pending vmtruncate. So write and vmtruncate 980 * are pending vmtruncate. So write and vmtruncate
978 * can not run at the same time 981 * can not run at the same time
979 */ 982 */
980 written = generic_file_buffered_write(iocb, iov, nr_segs, 983 iov_iter_init(&from, iov, nr_segs, count, 0);
981 pos, &iocb->ki_pos, 984 written = generic_perform_write(file, &from, pos);
982 count, 0); 985 if (likely(written >= 0))
986 iocb->ki_pos = pos + written;
987 if (inode->i_size > old_size)
988 ceph_fscache_update_objectsize(inode);
983 mutex_unlock(&inode->i_mutex); 989 mutex_unlock(&inode->i_mutex);
984 } 990 }
985 991
@@ -1215,9 +1221,6 @@ static long ceph_fallocate(struct file *file, int mode,
1215 if (!S_ISREG(inode->i_mode)) 1221 if (!S_ISREG(inode->i_mode))
1216 return -EOPNOTSUPP; 1222 return -EOPNOTSUPP;
1217 1223
1218 if (IS_SWAPFILE(inode))
1219 return -ETXTBSY;
1220
1221 mutex_lock(&inode->i_mutex); 1224 mutex_lock(&inode->i_mutex);
1222 1225
1223 if (ceph_snap(inode) != CEPH_NOSNAP) { 1226 if (ceph_snap(inode) != CEPH_NOSNAP) {
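The buffered-write branch above switches from generic_file_buffered_write() to an explicit iov_iter plus generic_perform_write(), which no longer advances ki_pos on the caller's behalf. Reassembled from the hunk (a sketch of the new else-branch only):

	loff_t old_size = inode->i_size;
	struct iov_iter from;

	iov_iter_init(&from, iov, nr_segs, count, 0);       /* wrap the iovecs */
	written = generic_perform_write(file, &from, pos);  /* does not move ki_pos */
	if (likely(written >= 0))
		iocb->ki_pos = pos + written;               /* advance it ourselves */
	if (inode->i_size > old_size)                       /* file grew: tell fscache */
		ceph_fscache_update_objectsize(inode);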
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 32d519d8a2e2..233c6f96910a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -659,14 +659,6 @@ static int fill_inode(struct inode *inode,
659 le32_to_cpu(info->time_warp_seq), 659 le32_to_cpu(info->time_warp_seq),
660 &ctime, &mtime, &atime); 660 &ctime, &mtime, &atime);
661 661
662 /* only update max_size on auth cap */
663 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
664 ci->i_max_size != le64_to_cpu(info->max_size)) {
665 dout("max_size %lld -> %llu\n", ci->i_max_size,
666 le64_to_cpu(info->max_size));
667 ci->i_max_size = le64_to_cpu(info->max_size);
668 }
669
670 ci->i_layout = info->layout; 662 ci->i_layout = info->layout;
671 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 663 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
672 664
@@ -752,9 +744,16 @@ static int fill_inode(struct inode *inode,
752 !__ceph_dir_is_complete(ci)) { 744 !__ceph_dir_is_complete(ci)) {
753 dout(" marking %p complete (empty)\n", inode); 745 dout(" marking %p complete (empty)\n", inode);
754 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); 746 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
755 ci->i_max_offset = 2;
756 } 747 }
757no_change: 748no_change:
749 /* only update max_size on auth cap */
750 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
751 ci->i_max_size != le64_to_cpu(info->max_size)) {
752 dout("max_size %lld -> %llu\n", ci->i_max_size,
753 le64_to_cpu(info->max_size));
754 ci->i_max_size = le64_to_cpu(info->max_size);
755 }
756
758 spin_unlock(&ci->i_ceph_lock); 757 spin_unlock(&ci->i_ceph_lock);
759 758
760 /* queue truncate if we saw i_size decrease */ 759 /* queue truncate if we saw i_size decrease */
@@ -890,41 +889,6 @@ out_unlock:
890} 889}
891 890
892/* 891/*
893 * Set dentry's directory position based on the current dir's max, and
894 * order it in d_subdirs, so that dcache_readdir behaves.
895 *
896 * Always called under directory's i_mutex.
897 */
898static void ceph_set_dentry_offset(struct dentry *dn)
899{
900 struct dentry *dir = dn->d_parent;
901 struct inode *inode = dir->d_inode;
902 struct ceph_inode_info *ci;
903 struct ceph_dentry_info *di;
904
905 BUG_ON(!inode);
906
907 ci = ceph_inode(inode);
908 di = ceph_dentry(dn);
909
910 spin_lock(&ci->i_ceph_lock);
911 if (!__ceph_dir_is_complete(ci)) {
912 spin_unlock(&ci->i_ceph_lock);
913 return;
914 }
915 di->offset = ceph_inode(inode)->i_max_offset++;
916 spin_unlock(&ci->i_ceph_lock);
917
918 spin_lock(&dir->d_lock);
919 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
920 list_move(&dn->d_u.d_child, &dir->d_subdirs);
921 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
922 dn->d_u.d_child.prev, dn->d_u.d_child.next);
923 spin_unlock(&dn->d_lock);
924 spin_unlock(&dir->d_lock);
925}
926
927/*
928 * splice a dentry to an inode. 892 * splice a dentry to an inode.
929 * caller must hold directory i_mutex for this to be safe. 893 * caller must hold directory i_mutex for this to be safe.
930 * 894 *
@@ -933,7 +897,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
933 * the caller) if we fail. 897 * the caller) if we fail.
934 */ 898 */
935static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, 899static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
936 bool *prehash, bool set_offset) 900 bool *prehash)
937{ 901{
938 struct dentry *realdn; 902 struct dentry *realdn;
939 903
@@ -965,8 +929,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
965 } 929 }
966 if ((!prehash || *prehash) && d_unhashed(dn)) 930 if ((!prehash || *prehash) && d_unhashed(dn))
967 d_rehash(dn); 931 d_rehash(dn);
968 if (set_offset)
969 ceph_set_dentry_offset(dn);
970out: 932out:
971 return dn; 933 return dn;
972} 934}
@@ -987,7 +949,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
987{ 949{
988 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 950 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
989 struct inode *in = NULL; 951 struct inode *in = NULL;
990 struct ceph_mds_reply_inode *ininfo;
991 struct ceph_vino vino; 952 struct ceph_vino vino;
992 struct ceph_fs_client *fsc = ceph_sb_to_client(sb); 953 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
993 int err = 0; 954 int err = 0;
@@ -1044,10 +1005,59 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1044 session, req->r_request_started, -1, 1005 session, req->r_request_started, -1,
1045 &req->r_caps_reservation); 1006 &req->r_caps_reservation);
1046 if (err < 0) 1007 if (err < 0)
1047 return err; 1008 goto done;
1048 } else { 1009 } else {
1049 WARN_ON_ONCE(1); 1010 WARN_ON_ONCE(1);
1050 } 1011 }
1012
1013 if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
1014 struct qstr dname;
1015 struct dentry *dn, *parent;
1016
1017 BUG_ON(!rinfo->head->is_target);
1018 BUG_ON(req->r_dentry);
1019
1020 parent = d_find_any_alias(dir);
1021 BUG_ON(!parent);
1022
1023 dname.name = rinfo->dname;
1024 dname.len = rinfo->dname_len;
1025 dname.hash = full_name_hash(dname.name, dname.len);
1026 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1027 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1028retry_lookup:
1029 dn = d_lookup(parent, &dname);
1030 dout("d_lookup on parent=%p name=%.*s got %p\n",
1031 parent, dname.len, dname.name, dn);
1032
1033 if (!dn) {
1034 dn = d_alloc(parent, &dname);
1035 dout("d_alloc %p '%.*s' = %p\n", parent,
1036 dname.len, dname.name, dn);
1037 if (dn == NULL) {
1038 dput(parent);
1039 err = -ENOMEM;
1040 goto done;
1041 }
1042 err = ceph_init_dentry(dn);
1043 if (err < 0) {
1044 dput(dn);
1045 dput(parent);
1046 goto done;
1047 }
1048 } else if (dn->d_inode &&
1049 (ceph_ino(dn->d_inode) != vino.ino ||
1050 ceph_snap(dn->d_inode) != vino.snap)) {
1051 dout(" dn %p points to wrong inode %p\n",
1052 dn, dn->d_inode);
1053 d_delete(dn);
1054 dput(dn);
1055 goto retry_lookup;
1056 }
1057
1058 req->r_dentry = dn;
1059 dput(parent);
1060 }
1051 } 1061 }
1052 1062
1053 if (rinfo->head->is_target) { 1063 if (rinfo->head->is_target) {
@@ -1063,7 +1073,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1063 1073
1064 err = fill_inode(in, &rinfo->targeti, NULL, 1074 err = fill_inode(in, &rinfo->targeti, NULL,
1065 session, req->r_request_started, 1075 session, req->r_request_started,
1066 (le32_to_cpu(rinfo->head->result) == 0) ? 1076 (!req->r_aborted && rinfo->head->result == 0) ?
1067 req->r_fmode : -1, 1077 req->r_fmode : -1,
1068 &req->r_caps_reservation); 1078 &req->r_caps_reservation);
1069 if (err < 0) { 1079 if (err < 0) {
@@ -1112,6 +1122,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1112 1122
1113 /* rename? */ 1123 /* rename? */
1114 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { 1124 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
1125 struct inode *olddir = req->r_old_dentry_dir;
1126 BUG_ON(!olddir);
1127
1115 dout(" src %p '%.*s' dst %p '%.*s'\n", 1128 dout(" src %p '%.*s' dst %p '%.*s'\n",
1116 req->r_old_dentry, 1129 req->r_old_dentry,
1117 req->r_old_dentry->d_name.len, 1130 req->r_old_dentry->d_name.len,
@@ -1131,13 +1144,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1131 rehashing bug in vfs_rename_dir */ 1144 rehashing bug in vfs_rename_dir */
1132 ceph_invalidate_dentry_lease(dn); 1145 ceph_invalidate_dentry_lease(dn);
1133 1146
1134 /* 1147 /* d_move screws up sibling dentries' offsets */
1135 * d_move() puts the renamed dentry at the end of 1148 ceph_dir_clear_complete(dir);
1136 * d_subdirs. We need to assign it an appropriate 1149 ceph_dir_clear_complete(olddir);
1137 * directory offset so we can behave when dir is 1150
1138 * complete.
1139 */
1140 ceph_set_dentry_offset(req->r_old_dentry);
1141 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1151 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1142 ceph_dentry(req->r_old_dentry)->offset); 1152 ceph_dentry(req->r_old_dentry)->offset);
1143 1153
@@ -1164,8 +1174,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1164 1174
1165 /* attach proper inode */ 1175 /* attach proper inode */
1166 if (!dn->d_inode) { 1176 if (!dn->d_inode) {
1177 ceph_dir_clear_complete(dir);
1167 ihold(in); 1178 ihold(in);
1168 dn = splice_dentry(dn, in, &have_lease, true); 1179 dn = splice_dentry(dn, in, &have_lease);
1169 if (IS_ERR(dn)) { 1180 if (IS_ERR(dn)) {
1170 err = PTR_ERR(dn); 1181 err = PTR_ERR(dn);
1171 goto done; 1182 goto done;
@@ -1186,17 +1197,16 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1186 (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || 1197 (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1187 req->r_op == CEPH_MDS_OP_MKSNAP)) { 1198 req->r_op == CEPH_MDS_OP_MKSNAP)) {
1188 struct dentry *dn = req->r_dentry; 1199 struct dentry *dn = req->r_dentry;
1200 struct inode *dir = req->r_locked_dir;
1189 1201
1190 /* fill out a snapdir LOOKUPSNAP dentry */ 1202 /* fill out a snapdir LOOKUPSNAP dentry */
1191 BUG_ON(!dn); 1203 BUG_ON(!dn);
1192 BUG_ON(!req->r_locked_dir); 1204 BUG_ON(!dir);
1193 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); 1205 BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
1194 ininfo = rinfo->targeti.in;
1195 vino.ino = le64_to_cpu(ininfo->ino);
1196 vino.snap = le64_to_cpu(ininfo->snapid);
1197 dout(" linking snapped dir %p to dn %p\n", in, dn); 1206 dout(" linking snapped dir %p to dn %p\n", in, dn);
1207 ceph_dir_clear_complete(dir);
1198 ihold(in); 1208 ihold(in);
1199 dn = splice_dentry(dn, in, NULL, true); 1209 dn = splice_dentry(dn, in, NULL);
1200 if (IS_ERR(dn)) { 1210 if (IS_ERR(dn)) {
1201 err = PTR_ERR(dn); 1211 err = PTR_ERR(dn);
1202 goto done; 1212 goto done;
@@ -1358,7 +1368,7 @@ retry_lookup:
1358 } 1368 }
1359 1369
1360 if (!dn->d_inode) { 1370 if (!dn->d_inode) {
1361 dn = splice_dentry(dn, in, NULL, false); 1371 dn = splice_dentry(dn, in, NULL);
1362 if (IS_ERR(dn)) { 1372 if (IS_ERR(dn)) {
1363 err = PTR_ERR(dn); 1373 err = PTR_ERR(dn);
1364 dn = NULL; 1374 dn = NULL;
@@ -1616,8 +1626,6 @@ static const struct inode_operations ceph_symlink_iops = {
1616 .getxattr = ceph_getxattr, 1626 .getxattr = ceph_getxattr,
1617 .listxattr = ceph_listxattr, 1627 .listxattr = ceph_listxattr,
1618 .removexattr = ceph_removexattr, 1628 .removexattr = ceph_removexattr,
1619 .get_acl = ceph_get_acl,
1620 .set_acl = ceph_set_acl,
1621}; 1629};
1622 1630
1623/* 1631/*
@@ -1627,7 +1635,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1627{ 1635{
1628 struct inode *inode = dentry->d_inode; 1636 struct inode *inode = dentry->d_inode;
1629 struct ceph_inode_info *ci = ceph_inode(inode); 1637 struct ceph_inode_info *ci = ceph_inode(inode);
1630 struct inode *parent_inode;
1631 const unsigned int ia_valid = attr->ia_valid; 1638 const unsigned int ia_valid = attr->ia_valid;
1632 struct ceph_mds_request *req; 1639 struct ceph_mds_request *req;
1633 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; 1640 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1819,9 +1826,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1819 req->r_inode_drop = release; 1826 req->r_inode_drop = release;
1820 req->r_args.setattr.mask = cpu_to_le32(mask); 1827 req->r_args.setattr.mask = cpu_to_le32(mask);
1821 req->r_num_caps = 1; 1828 req->r_num_caps = 1;
1822 parent_inode = ceph_get_dentry_parent_inode(dentry); 1829 err = ceph_mdsc_do_request(mdsc, NULL, req);
1823 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1824 iput(parent_inode);
1825 } 1830 }
1826 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, 1831 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1827 ceph_cap_string(dirtied), mask); 1832 ceph_cap_string(dirtied), mask);
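With ceph_set_dentry_offset() removed, inode.c no longer tries to hand out a fresh readdir offset when a dentry gains an inode; instead the parent directory simply stops being "complete". The attach path after this patch follows this shape (a sketch; error handling as in the hunk):

	if (!dn->d_inode) {
		ceph_dir_clear_complete(dir);   /* new entry invalidates cached offsets */
		ihold(in);
		dn = splice_dentry(dn, in, &have_lease);  /* set_offset argument is gone */
		if (IS_ERR(dn)) {
			err = PTR_ERR(dn);
			goto done;
		}
	}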
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index dc66c9e023e4..a822a6e58290 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,9 +1,8 @@
1#include <linux/ceph/ceph_debug.h>
1#include <linux/in.h> 2#include <linux/in.h>
2 3
3#include "super.h" 4#include "super.h"
4#include "mds_client.h" 5#include "mds_client.h"
5#include <linux/ceph/ceph_debug.h>
6
7#include "ioctl.h" 6#include "ioctl.h"
8 7
9 8
@@ -64,7 +63,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
64static long ceph_ioctl_set_layout(struct file *file, void __user *arg) 63static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
65{ 64{
66 struct inode *inode = file_inode(file); 65 struct inode *inode = file_inode(file);
67 struct inode *parent_inode;
68 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 66 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
69 struct ceph_mds_request *req; 67 struct ceph_mds_request *req;
70 struct ceph_ioctl_layout l; 68 struct ceph_ioctl_layout l;
@@ -111,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
111 return PTR_ERR(req); 109 return PTR_ERR(req);
112 req->r_inode = inode; 110 req->r_inode = inode;
113 ihold(inode); 111 ihold(inode);
112 req->r_num_caps = 1;
113
114 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; 114 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
115 115
116 req->r_args.setlayout.layout.fl_stripe_unit = 116 req->r_args.setlayout.layout.fl_stripe_unit =
@@ -121,9 +121,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
121 cpu_to_le32(l.object_size); 121 cpu_to_le32(l.object_size);
122 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); 122 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
123 123
124 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 124 err = ceph_mdsc_do_request(mdsc, NULL, req);
125 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
126 iput(parent_inode);
127 ceph_mdsc_put_request(req); 125 ceph_mdsc_put_request(req);
128 return err; 126 return err;
129} 127}
@@ -157,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
157 return PTR_ERR(req); 155 return PTR_ERR(req);
158 req->r_inode = inode; 156 req->r_inode = inode;
159 ihold(inode); 157 ihold(inode);
158 req->r_num_caps = 1;
160 159
161 req->r_args.setlayout.layout.fl_stripe_unit = 160 req->r_args.setlayout.layout.fl_stripe_unit =
162 cpu_to_le32(l.stripe_unit); 161 cpu_to_le32(l.stripe_unit);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae6d14e82b0f..191398852a2e 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -2,11 +2,31 @@
2 2
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/namei.h> 4#include <linux/namei.h>
5#include <linux/random.h>
5 6
6#include "super.h" 7#include "super.h"
7#include "mds_client.h" 8#include "mds_client.h"
8#include <linux/ceph/pagelist.h> 9#include <linux/ceph/pagelist.h>
9 10
11static u64 lock_secret;
12
13static inline u64 secure_addr(void *addr)
14{
15 u64 v = lock_secret ^ (u64)(unsigned long)addr;
16 /*
17 * Set the most significant bit, so that MDS knows the 'owner'
18 * is sufficient to identify the owner of a lock. (old code uses
19 * both 'owner' and 'pid')
20 */
21 v |= (1ULL << 63);
22 return v;
23}
24
25void __init ceph_flock_init(void)
26{
27 get_random_bytes(&lock_secret, sizeof(lock_secret));
28}
29
10/** 30/**
11 * Implement fcntl and flock locking functions. 31 * Implement fcntl and flock locking functions.
12 */ 32 */
@@ -14,17 +34,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 int cmd, u8 wait, struct file_lock *fl) 34 int cmd, u8 wait, struct file_lock *fl)
15{ 35{
16 struct inode *inode = file_inode(file); 36 struct inode *inode = file_inode(file);
17 struct ceph_mds_client *mdsc = 37 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
18 ceph_sb_to_client(inode->i_sb)->mdsc;
19 struct ceph_mds_request *req; 38 struct ceph_mds_request *req;
20 int err; 39 int err;
21 u64 length = 0; 40 u64 length = 0;
41 u64 owner;
22 42
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 43 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req)) 44 if (IS_ERR(req))
25 return PTR_ERR(req); 45 return PTR_ERR(req);
26 req->r_inode = inode; 46 req->r_inode = inode;
27 ihold(inode); 47 ihold(inode);
48 req->r_num_caps = 1;
28 49
29 /* mds requires start and length rather than start and end */ 50 /* mds requires start and length rather than start and end */
30 if (LLONG_MAX == fl->fl_end) 51 if (LLONG_MAX == fl->fl_end)
@@ -32,25 +53,27 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
32 else 53 else
33 length = fl->fl_end - fl->fl_start + 1; 54 length = fl->fl_end - fl->fl_start + 1;
34 55
35 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 56 if (lock_type == CEPH_LOCK_FCNTL)
36 "length: %llu, wait: %d, type: %d", (int)lock_type, 57 owner = secure_addr(fl->fl_owner);
37 (int)operation, (u64)fl->fl_pid, fl->fl_start, 58 else
38 length, wait, fl->fl_type); 59 owner = secure_addr(fl->fl_file);
60
61 dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
62 "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
63 (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
64 wait, fl->fl_type);
39 65
40 req->r_args.filelock_change.rule = lock_type; 66 req->r_args.filelock_change.rule = lock_type;
41 req->r_args.filelock_change.type = cmd; 67 req->r_args.filelock_change.type = cmd;
68 req->r_args.filelock_change.owner = cpu_to_le64(owner);
42 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); 69 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
43 /* This should be adjusted, but I'm not sure if
44 namespaces actually get id numbers*/
45 req->r_args.filelock_change.pid_namespace =
46 cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
47 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); 70 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
48 req->r_args.filelock_change.length = cpu_to_le64(length); 71 req->r_args.filelock_change.length = cpu_to_le64(length);
49 req->r_args.filelock_change.wait = wait; 72 req->r_args.filelock_change.wait = wait;
50 73
51 err = ceph_mdsc_do_request(mdsc, inode, req); 74 err = ceph_mdsc_do_request(mdsc, inode, req);
52 75
53 if ( operation == CEPH_MDS_OP_GETFILELOCK){ 76 if (operation == CEPH_MDS_OP_GETFILELOCK) {
54 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); 77 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
55 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) 78 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
56 fl->fl_type = F_RDLCK; 79 fl->fl_type = F_RDLCK;
@@ -87,14 +110,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
87 u8 wait = 0; 110 u8 wait = 0;
88 u16 op = CEPH_MDS_OP_SETFILELOCK; 111 u16 op = CEPH_MDS_OP_SETFILELOCK;
89 112
90 fl->fl_nspid = get_pid(task_tgid(current)); 113 if (!(fl->fl_flags & FL_POSIX))
91 dout("ceph_lock, fl_pid:%d", fl->fl_pid); 114 return -ENOLCK;
115 /* No mandatory locks */
116 if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
117 return -ENOLCK;
118
119 dout("ceph_lock, fl_owner: %p", fl->fl_owner);
92 120
93 /* set wait bit as appropriate, then build the command as Ceph expects it */ 121 /* set wait bit as appropriate, then build the command as Ceph expects it */
94 if (F_SETLKW == cmd) 122 if (IS_GETLK(cmd))
95 wait = 1;
96 if (F_GETLK == cmd)
97 op = CEPH_MDS_OP_GETFILELOCK; 123 op = CEPH_MDS_OP_GETFILELOCK;
124 else if (IS_SETLKW(cmd))
125 wait = 1;
98 126
99 if (F_RDLCK == fl->fl_type) 127 if (F_RDLCK == fl->fl_type)
100 lock_cmd = CEPH_LOCK_SHARED; 128 lock_cmd = CEPH_LOCK_SHARED;
@@ -105,7 +133,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
105 133
106 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); 134 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
107 if (!err) { 135 if (!err) {
108 if ( op != CEPH_MDS_OP_GETFILELOCK ){ 136 if (op != CEPH_MDS_OP_GETFILELOCK) {
109 dout("mds locked, locking locally"); 137 dout("mds locked, locking locally");
110 err = posix_lock_file(file, fl, NULL); 138 err = posix_lock_file(file, fl, NULL);
111 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { 139 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
@@ -131,20 +159,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
131{ 159{
132 u8 lock_cmd; 160 u8 lock_cmd;
133 int err; 161 int err;
134 u8 wait = 1; 162 u8 wait = 0;
135 163
136 fl->fl_nspid = get_pid(task_tgid(current)); 164 if (!(fl->fl_flags & FL_FLOCK))
137 dout("ceph_flock, fl_pid:%d", fl->fl_pid); 165 return -ENOLCK;
138 166 /* No mandatory locks */
139 /* set wait bit, then clear it out of cmd*/ 167 if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
140 if (cmd & LOCK_NB) 168 return -ENOLCK;
141 wait = 0; 169
142 cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); 170 dout("ceph_flock, fl_file: %p", fl->fl_file);
143 /* set command sequence that Ceph wants to see: 171
144 shared lock, exclusive lock, or unlock */ 172 if (IS_SETLKW(cmd))
145 if (LOCK_SH == cmd) 173 wait = 1;
174
175 if (F_RDLCK == fl->fl_type)
146 lock_cmd = CEPH_LOCK_SHARED; 176 lock_cmd = CEPH_LOCK_SHARED;
147 else if (LOCK_EX == cmd) 177 else if (F_WRLCK == fl->fl_type)
148 lock_cmd = CEPH_LOCK_EXCL; 178 lock_cmd = CEPH_LOCK_EXCL;
149 else 179 else
150 lock_cmd = CEPH_LOCK_UNLOCK; 180 lock_cmd = CEPH_LOCK_UNLOCK;
@@ -280,13 +310,14 @@ int lock_to_ceph_filelock(struct file_lock *lock,
280 struct ceph_filelock *cephlock) 310 struct ceph_filelock *cephlock)
281{ 311{
282 int err = 0; 312 int err = 0;
283
284 cephlock->start = cpu_to_le64(lock->fl_start); 313 cephlock->start = cpu_to_le64(lock->fl_start);
285 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); 314 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
286 cephlock->client = cpu_to_le64(0); 315 cephlock->client = cpu_to_le64(0);
287 cephlock->pid = cpu_to_le64(lock->fl_pid); 316 cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
288 cephlock->pid_namespace = 317 if (lock->fl_flags & FL_POSIX)
289 cpu_to_le64((u64)(unsigned long)lock->fl_nspid); 318 cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
319 else
320 cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file));
290 321
291 switch (lock->fl_type) { 322 switch (lock->fl_type) {
292 case F_RDLCK: 323 case F_RDLCK:
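The owner token introduced above is the lock owner's kernel address XORed with a boot-time secret, with bit 63 forced on so the MDS can distinguish new-style owners from old owner-plus-pid clients. A user-space model of the scheme (in the kernel, lock_secret is filled by get_random_bytes() in ceph_flock_init(); here any 64-bit random source would do):

	#include <stdint.h>

	static uint64_t lock_secret;    /* filled once with random bytes at init */

	static uint64_t secure_addr(void *addr)
	{
		uint64_t v = lock_secret ^ (uint64_t)(uintptr_t)addr;
		/* bit 63 set => 'owner' alone identifies the lock holder */
		return v | (1ULL << 63);
	}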
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f4f050a69a48..2b4d093d0563 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/wait.h> 4#include <linux/wait.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/gfp.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
7#include <linux/debugfs.h> 8#include <linux/debugfs.h>
8#include <linux/seq_file.h> 9#include <linux/seq_file.h>
@@ -165,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end,
165 if (num == 0) 166 if (num == 0)
166 goto done; 167 goto done;
167 168
168 /* alloc large array */ 169 BUG_ON(!info->dir_in);
169 info->dir_nr = num;
170 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
171 sizeof(*info->dir_dname) +
172 sizeof(*info->dir_dname_len) +
173 sizeof(*info->dir_dlease),
174 GFP_NOFS);
175 if (info->dir_in == NULL) {
176 err = -ENOMEM;
177 goto out_bad;
178 }
179 info->dir_dname = (void *)(info->dir_in + num); 170 info->dir_dname = (void *)(info->dir_in + num);
180 info->dir_dname_len = (void *)(info->dir_dname + num); 171 info->dir_dname_len = (void *)(info->dir_dname + num);
181 info->dir_dlease = (void *)(info->dir_dname_len + num); 172 info->dir_dlease = (void *)(info->dir_dname_len + num);
173 if ((unsigned long)(info->dir_dlease + num) >
174 (unsigned long)info->dir_in + info->dir_buf_size) {
175 pr_err("dir contents are larger than expected\n");
176 WARN_ON(1);
177 goto bad;
178 }
182 179
180 info->dir_nr = num;
183 while (num) { 181 while (num) {
184 /* dentry */ 182 /* dentry */
185 ceph_decode_need(p, end, sizeof(u32)*2, bad); 183 ceph_decode_need(p, end, sizeof(u32)*2, bad);
@@ -327,7 +325,9 @@ out_bad:
327 325
328static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 326static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
329{ 327{
330 kfree(info->dir_in); 328 if (!info->dir_in)
329 return;
330 free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
331} 331}
332 332
333 333
@@ -512,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref)
512 struct ceph_mds_request *req = container_of(kref, 512 struct ceph_mds_request *req = container_of(kref,
513 struct ceph_mds_request, 513 struct ceph_mds_request,
514 r_kref); 514 r_kref);
515 destroy_reply_info(&req->r_reply_info);
515 if (req->r_request) 516 if (req->r_request)
516 ceph_msg_put(req->r_request); 517 ceph_msg_put(req->r_request);
517 if (req->r_reply) { 518 if (req->r_reply)
518 ceph_msg_put(req->r_reply); 519 ceph_msg_put(req->r_reply);
519 destroy_reply_info(&req->r_reply_info);
520 }
521 if (req->r_inode) { 520 if (req->r_inode) {
522 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 521 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
523 iput(req->r_inode); 522 iput(req->r_inode);
@@ -528,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref)
528 iput(req->r_target_inode); 527 iput(req->r_target_inode);
529 if (req->r_dentry) 528 if (req->r_dentry)
530 dput(req->r_dentry); 529 dput(req->r_dentry);
531 if (req->r_old_dentry) { 530 if (req->r_old_dentry)
531 dput(req->r_old_dentry);
532 if (req->r_old_dentry_dir) {
532 /* 533 /*
533 * track (and drop pins for) r_old_dentry_dir 534 * track (and drop pins for) r_old_dentry_dir
534 * separately, since r_old_dentry's d_parent may have 535 * separately, since r_old_dentry's d_parent may have
@@ -537,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref)
537 */ 538 */
538 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), 539 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
539 CEPH_CAP_PIN); 540 CEPH_CAP_PIN);
540 dput(req->r_old_dentry);
541 iput(req->r_old_dentry_dir); 541 iput(req->r_old_dentry_dir);
542 } 542 }
543 kfree(req->r_path1); 543 kfree(req->r_path1);
@@ -1311,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1311 trim_caps - session->s_trim_caps); 1311 trim_caps - session->s_trim_caps);
1312 session->s_trim_caps = 0; 1312 session->s_trim_caps = 0;
1313 } 1313 }
1314
1315 ceph_add_cap_releases(mdsc, session);
1316 ceph_send_cap_releases(mdsc, session);
1314 return 0; 1317 return 0;
1315} 1318}
1316 1319
@@ -1461,15 +1464,18 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1461 1464
1462 dout("discard_cap_releases mds%d\n", session->s_mds); 1465 dout("discard_cap_releases mds%d\n", session->s_mds);
1463 1466
1464 /* zero out the in-progress message */ 1467 if (!list_empty(&session->s_cap_releases)) {
1465 msg = list_first_entry(&session->s_cap_releases, 1468 /* zero out the in-progress message */
1466 struct ceph_msg, list_head); 1469 msg = list_first_entry(&session->s_cap_releases,
1467 head = msg->front.iov_base; 1470 struct ceph_msg, list_head);
1468 num = le32_to_cpu(head->num); 1471 head = msg->front.iov_base;
1469 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); 1472 num = le32_to_cpu(head->num);
1470 head->num = cpu_to_le32(0); 1473 dout("discard_cap_releases mds%d %p %u\n",
1471 msg->front.iov_len = sizeof(*head); 1474 session->s_mds, msg, num);
1472 session->s_num_cap_releases += num; 1475 head->num = cpu_to_le32(0);
1476 msg->front.iov_len = sizeof(*head);
1477 session->s_num_cap_releases += num;
1478 }
1473 1479
1474 /* requeue completed messages */ 1480 /* requeue completed messages */
1475 while (!list_empty(&session->s_cap_releases_done)) { 1481 while (!list_empty(&session->s_cap_releases_done)) {
@@ -1492,6 +1498,43 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1492 * requests 1498 * requests
1493 */ 1499 */
1494 1500
1501int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1502 struct inode *dir)
1503{
1504 struct ceph_inode_info *ci = ceph_inode(dir);
1505 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1506 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
1507 size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
1508 sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
1509 int order, num_entries;
1510
1511 spin_lock(&ci->i_ceph_lock);
1512 num_entries = ci->i_files + ci->i_subdirs;
1513 spin_unlock(&ci->i_ceph_lock);
1514 num_entries = max(num_entries, 1);
1515 num_entries = min(num_entries, opt->max_readdir);
1516
1517 order = get_order(size * num_entries);
1518 while (order >= 0) {
1519 rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
1520 order);
1521 if (rinfo->dir_in)
1522 break;
1523 order--;
1524 }
1525 if (!rinfo->dir_in)
1526 return -ENOMEM;
1527
1528 num_entries = (PAGE_SIZE << order) / size;
1529 num_entries = min(num_entries, opt->max_readdir);
1530
1531 rinfo->dir_buf_size = PAGE_SIZE << order;
1532 req->r_num_caps = num_entries + 1;
1533 req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
1534 req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
1535 return 0;
1536}
1537
1495/* 1538/*
1496 * Create an mds request. 1539 * Create an mds request.
1497 */ 1540 */
@@ -2053,7 +2096,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
2053 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 2096 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
2054 if (req->r_locked_dir) 2097 if (req->r_locked_dir)
2055 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 2098 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
2056 if (req->r_old_dentry) 2099 if (req->r_old_dentry_dir)
2057 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), 2100 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
2058 CEPH_CAP_PIN); 2101 CEPH_CAP_PIN);
2059 2102
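ceph_alloc_readdir_reply_buffer() above sizes one contiguous allocation for all four per-entry arrays and degrades gracefully under memory pressure: it walks down page orders until an allocation succeeds, then caps the readdir request to what actually fits. The core of that loop, reduced to its shape (names as in the hunk):

	order = get_order(size * num_entries);
	while (order >= 0) {
		rinfo->dir_in = (void *)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
							 order);
		if (rinfo->dir_in)
			break;          /* took whatever order we could get */
		order--;                /* halve the request and retry */
	}
	if (!rinfo->dir_in)
		return -ENOMEM;
	/* only ask the MDS for as many entries as the buffer can hold */
	num_entries = (PAGE_SIZE << order) / size;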
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 68288917c737..e90cfccf93bd 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -67,6 +67,7 @@ struct ceph_mds_reply_info_parsed {
67 /* for readdir results */ 67 /* for readdir results */
68 struct { 68 struct {
69 struct ceph_mds_reply_dirfrag *dir_dir; 69 struct ceph_mds_reply_dirfrag *dir_dir;
70 size_t dir_buf_size;
70 int dir_nr; 71 int dir_nr;
71 char **dir_dname; 72 char **dir_dname;
72 u32 *dir_dname_len; 73 u32 *dir_dname_len;
@@ -346,7 +347,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
346 struct dentry *dn); 347 struct dentry *dn);
347 348
348extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); 349extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
349 350extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
351 struct inode *dir);
350extern struct ceph_mds_request * 352extern struct ceph_mds_request *
351ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 353ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
352extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 354extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 4440f447fd3f..51cc23e48111 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -54,6 +54,7 @@ const char *ceph_mds_op_name(int op)
54 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; 54 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
55 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; 55 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
56 case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; 56 case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
57 case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
57 case CEPH_MDS_OP_GETATTR: return "getattr"; 58 case CEPH_MDS_OP_GETATTR: return "getattr";
58 case CEPH_MDS_OP_SETXATTR: return "setxattr"; 59 case CEPH_MDS_OP_SETXATTR: return "setxattr";
59 case CEPH_MDS_OP_SETATTR: return "setattr"; 60 case CEPH_MDS_OP_SETATTR: return "setattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 10a4ccbf38da..06150fd745ac 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1026,6 +1026,7 @@ static int __init init_ceph(void)
1026 if (ret) 1026 if (ret)
1027 goto out; 1027 goto out;
1028 1028
1029 ceph_flock_init();
1029 ceph_xattr_init(); 1030 ceph_xattr_init();
1030 ret = register_filesystem(&ceph_fs_type); 1031 ret = register_filesystem(&ceph_fs_type);
1031 if (ret) 1032 if (ret)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index d8801a95b685..ead05cc1f447 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -266,7 +266,6 @@ struct ceph_inode_info {
266 struct timespec i_rctime; 266 struct timespec i_rctime;
267 u64 i_rbytes, i_rfiles, i_rsubdirs; 267 u64 i_rbytes, i_rfiles, i_rsubdirs;
268 u64 i_files, i_subdirs; 268 u64 i_files, i_subdirs;
269 u64 i_max_offset; /* largest readdir offset, set with complete dir */
270 269
271 struct rb_root i_fragtree; 270 struct rb_root i_fragtree;
272 struct mutex i_fragtree_mutex; 271 struct mutex i_fragtree_mutex;
@@ -577,7 +576,7 @@ struct ceph_file_info {
577 576
578 /* readdir: position within a frag */ 577 /* readdir: position within a frag */
579 unsigned offset; /* offset of last chunk, adjusted for . and .. */ 578 unsigned offset; /* offset of last chunk, adjusted for . and .. */
580 u64 next_offset; /* offset of next chunk (last_name's + 1) */ 579 unsigned next_offset; /* offset of next chunk (last_name's + 1) */
581 char *last_name; /* last entry in previous chunk */ 580 char *last_name; /* last entry in previous chunk */
582 struct dentry *dentry; /* next dentry (for dcache readdir) */ 581 struct dentry *dentry; /* next dentry (for dcache readdir) */
583 int dir_release_count; 582 int dir_release_count;
@@ -871,6 +870,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
871extern const struct export_operations ceph_export_ops; 870extern const struct export_operations ceph_export_ops;
872 871
873/* locks.c */ 872/* locks.c */
873extern __init void ceph_flock_init(void);
874extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 874extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
875extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 875extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
876extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); 876extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a55ec37378c6..c9c2b887381e 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -64,32 +64,48 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
64} 64}
65 65
66static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, 66static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
67 size_t size) 67 size_t size)
68{ 68{
69 int ret; 69 int ret;
70 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); 70 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
71 struct ceph_osd_client *osdc = &fsc->client->osdc; 71 struct ceph_osd_client *osdc = &fsc->client->osdc;
72 s64 pool = ceph_file_layout_pg_pool(ci->i_layout); 72 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
73 const char *pool_name; 73 const char *pool_name;
74 char buf[128];
74 75
75 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); 76 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
76 down_read(&osdc->map_sem); 77 down_read(&osdc->map_sem);
77 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 78 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
78 if (pool_name) 79 if (pool_name) {
79 ret = snprintf(val, size, 80 size_t len = strlen(pool_name);
80 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", 81 ret = snprintf(buf, sizeof(buf),
82 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
81 (unsigned long long)ceph_file_layout_su(ci->i_layout), 83 (unsigned long long)ceph_file_layout_su(ci->i_layout),
82 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 84 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
83 (unsigned long long)ceph_file_layout_object_size(ci->i_layout), 85 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
84 pool_name); 86 if (!size) {
85 else 87 ret += len;
86 ret = snprintf(val, size, 88 } else if (ret + len > size) {
89 ret = -ERANGE;
90 } else {
91 memcpy(val, buf, ret);
92 memcpy(val + ret, pool_name, len);
93 ret += len;
94 }
95 } else {
96 ret = snprintf(buf, sizeof(buf),
87 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", 97 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
88 (unsigned long long)ceph_file_layout_su(ci->i_layout), 98 (unsigned long long)ceph_file_layout_su(ci->i_layout),
89 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 99 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
90 (unsigned long long)ceph_file_layout_object_size(ci->i_layout), 100 (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
91 (unsigned long long)pool); 101 (unsigned long long)pool);
92 102 if (size) {
103 if (ret <= size)
104 memcpy(val, buf, ret);
105 else
106 ret = -ERANGE;
107 }
108 }
93 up_read(&osdc->map_sem); 109 up_read(&osdc->map_sem);
94 return ret; 110 return ret;
95} 111}
@@ -215,7 +231,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
215 .name_size = sizeof("ceph.dir.layout"), 231 .name_size = sizeof("ceph.dir.layout"),
216 .getxattr_cb = ceph_vxattrcb_layout, 232 .getxattr_cb = ceph_vxattrcb_layout,
217 .readonly = false, 233 .readonly = false,
218 .hidden = false, 234 .hidden = true,
219 .exists_cb = ceph_vxattrcb_layout_exists, 235 .exists_cb = ceph_vxattrcb_layout_exists,
220 }, 236 },
221 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), 237 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
@@ -242,7 +258,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
242 .name_size = sizeof("ceph.file.layout"), 258 .name_size = sizeof("ceph.file.layout"),
243 .getxattr_cb = ceph_vxattrcb_layout, 259 .getxattr_cb = ceph_vxattrcb_layout,
244 .readonly = false, 260 .readonly = false,
245 .hidden = false, 261 .hidden = true,
246 .exists_cb = ceph_vxattrcb_layout_exists, 262 .exists_cb = ceph_vxattrcb_layout_exists,
247 }, 263 },
248 XATTR_LAYOUT_FIELD(file, layout, stripe_unit), 264 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
@@ -842,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
842 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 858 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
843 struct inode *inode = dentry->d_inode; 859 struct inode *inode = dentry->d_inode;
844 struct ceph_inode_info *ci = ceph_inode(inode); 860 struct ceph_inode_info *ci = ceph_inode(inode);
845 struct inode *parent_inode;
846 struct ceph_mds_request *req; 861 struct ceph_mds_request *req;
847 struct ceph_mds_client *mdsc = fsc->mdsc; 862 struct ceph_mds_client *mdsc = fsc->mdsc;
848 int err; 863 int err;
@@ -893,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
893 req->r_data_len = size; 908 req->r_data_len = size;
894 909
895 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 910 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
896 parent_inode = ceph_get_dentry_parent_inode(dentry); 911 err = ceph_mdsc_do_request(mdsc, NULL, req);
897 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
898 iput(parent_inode);
899 ceph_mdsc_put_request(req); 912 ceph_mdsc_put_request(req);
900 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); 913 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
901 914
@@ -1019,7 +1032,6 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
1019 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 1032 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
1020 struct ceph_mds_client *mdsc = fsc->mdsc; 1033 struct ceph_mds_client *mdsc = fsc->mdsc;
1021 struct inode *inode = dentry->d_inode; 1034 struct inode *inode = dentry->d_inode;
1022 struct inode *parent_inode;
1023 struct ceph_mds_request *req; 1035 struct ceph_mds_request *req;
1024 int err; 1036 int err;
1025 1037
@@ -1033,9 +1045,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
1033 req->r_num_caps = 1; 1045 req->r_num_caps = 1;
1034 req->r_path2 = kstrdup(name, GFP_NOFS); 1046 req->r_path2 = kstrdup(name, GFP_NOFS);
1035 1047
1036 parent_inode = ceph_get_dentry_parent_inode(dentry); 1048 err = ceph_mdsc_do_request(mdsc, NULL, req);
1037 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1038 iput(parent_inode);
1039 ceph_mdsc_put_request(req); 1049 ceph_mdsc_put_request(req);
1040 return err; 1050 return err;
1041} 1051}
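The rewritten ceph_vxattrcb_layout() follows the standard getxattr sizing contract: a zero size means "report the length required", and a buffer that is too small returns -ERANGE instead of silently truncating. The pool-name branch in outline (a sketch; the fixed fields are formatted into a stack buffer first, and the format string is abbreviated here):

	len = strlen(pool_name);
	ret = snprintf(buf, sizeof(buf), "stripe_unit=... pool=");  /* fixed part */
	if (!size)
		ret += len;             /* probe call: report required length */
	else if (ret + len > size)
		ret = -ERANGE;          /* caller's buffer too small */
	else {
		memcpy(val, buf, ret);              /* fixed fields... */
		memcpy(val + ret, pool_name, len);  /* ...then the pool name */
		ret += len;
	}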
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 849f6132b327..5be1f997ecde 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb)
 	cifs_set_oplock_level(cifs_inode, 0);
 	cifs_inode->delete_pending = false;
 	cifs_inode->invalid_mapping = false;
+	clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags);
+	clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags);
+	clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags);
+	spin_lock_init(&cifs_inode->writers_lock);
+	cifs_inode->writers = 0;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 	cifs_inode->server_eof = 0;
 	cifs_inode->uniqueid = 0;
@@ -286,7 +291,7 @@ cifs_destroy_inode(struct inode *inode)
 static void
 cifs_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	cifs_fscache_release_inode_cookie(inode);
 }
@@ -541,6 +546,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)

 static int cifs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return 0;
 }
@@ -731,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				   unsigned long nr_segs, loff_t pos)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	ssize_t written;
 	int rc;

+	written = cifs_get_writer(cinode);
+	if (written)
+		return written;
+
 	written = generic_file_aio_write(iocb, iov, nr_segs, pos);

 	if (CIFS_CACHE_WRITE(CIFS_I(inode)))
-		return written;
+		goto out;

 	rc = filemap_fdatawrite(inode->i_mapping);
 	if (rc)
 		cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
 			 rc, inode);

+out:
+	cifs_put_writer(cinode);
 	return written;
 }

@@ -849,7 +862,6 @@ const struct inode_operations cifs_file_inode_ops = {
 /* revalidate:cifs_revalidate, */
 	.setattr = cifs_setattr,
 	.getattr = cifs_getattr, /* do we need this anymore? */
-	.rename = cifs_rename,
 	.permission = cifs_permission,
 #ifdef CONFIG_CIFS_XATTR
 	.setxattr = cifs_setxattr,
@@ -1005,7 +1017,7 @@ cifs_init_once(void *inode)
 	init_rwsem(&cifsi->lock_sem);
 }

-static int
+static int __init
 cifs_init_inodecache(void)
 {
 	cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c0f3718b77a8..30f6e9251a4a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -228,6 +228,8 @@ struct smb_version_operations {
 	/* verify the message */
 	int (*check_message)(char *, unsigned int);
 	bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+	void (*downgrade_oplock)(struct TCP_Server_Info *,
+				 struct cifsInodeInfo *, bool);
 	/* process transaction2 response */
 	bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
 			     char *, int);
@@ -1113,6 +1115,12 @@ struct cifsInodeInfo {
 	unsigned int epoch;		/* used to track lease state changes */
 	bool delete_pending;		/* DELETE_ON_CLOSE is set */
 	bool invalid_mapping;		/* pagecache is invalid */
+	unsigned long flags;
+#define CIFS_INODE_PENDING_OPLOCK_BREAK   (0) /* oplock break in progress */
+#define CIFS_INODE_PENDING_WRITERS	  (1) /* Writes in progress */
+#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */
+	spinlock_t writers_lock;
+	unsigned int writers; /* Number of writers on this inode */
 	unsigned long time;		/* jiffies of last update of inode */
 	u64 server_eof;		/* current file size on server -- protected by i_lock */
 	u64 uniqueid;		/* server inode number */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index acc4ee8ed075..ca7980a1e303 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 				      int offset);
 extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
+extern int cifs_get_writer(struct cifsInodeInfo *cinode);
+extern void cifs_put_writer(struct cifsInodeInfo *cinode);
+extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode);
 extern int cifs_unlock_range(struct cifsFileInfo *cfile,
 			     struct file_lock *flock, const unsigned int xid);
 extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f3264bd7a83d..6ce4e0954b98 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -6197,6 +6197,9 @@ QAllEAsRetry:
 	cifs_dbg(FYI, "ea length %d\n", list_len);
 	if (list_len <= 8) {
 		cifs_dbg(FYI, "empty EA list returned from server\n");
+		/* didn't find the named attribute */
+		if (ea_name)
+			rc = -ENODATA;
 		goto QAllEAsOut;
 	}

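The hunk above makes an empty EA list from the server surface as -ENODATA when the caller asked for a specific attribute name, which is the errno userspace expects from getxattr(2) for a missing attribute. A minimal sketch of the caller-visible behaviour; the mount point and attribute name below are placeholders, not anything from this patch:

/* Build with: cc -o xa xa.c */
#include <errno.h>
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[256];
	/* path and name are hypothetical examples */
	ssize_t n = getxattr("/mnt/cifs/file.txt", "user.comment",
			     buf, sizeof(buf));
	if (n < 0 && errno == ENODATA)
		printf("attribute not set\n");	/* the case fixed above */
	else if (n >= 0)
		printf("value: %.*s\n", (int)n, buf);
	else
		perror("getxattr");
	return 0;
}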
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 834fce759d80..5ed03e0b8b40 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2579,19 +2579,32 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
 	ssize_t rc = -EACCES;
-	loff_t lock_pos = pos;
+	loff_t lock_pos = iocb->ki_pos;

-	if (file->f_flags & O_APPEND)
-		lock_pos = i_size_read(inode);
 	/*
 	 * We need to hold the sem to be sure nobody modifies lock list
 	 * with a brlock that prevents writing.
 	 */
 	down_read(&cinode->lock_sem);
+	mutex_lock(&inode->i_mutex);
+	if (file->f_flags & O_APPEND)
+		lock_pos = i_size_read(inode);
 	if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs),
 				     server->vals->exclusive_lock_type, NULL,
-				     CIFS_WRITE_OP))
-		rc = generic_file_aio_write(iocb, iov, nr_segs, pos);
+				     CIFS_WRITE_OP)) {
+		rc = __generic_file_aio_write(iocb, iov, nr_segs);
+		mutex_unlock(&inode->i_mutex);
+
+		if (rc > 0) {
+			ssize_t err;
+
+			err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+			if (err < 0)
+				rc = err;
+		}
+	} else {
+		mutex_unlock(&inode->i_mutex);
+	}
 	up_read(&cinode->lock_sem);
 	return rc;
 }
@@ -2608,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	ssize_t written;

+	written = cifs_get_writer(cinode);
+	if (written)
+		return written;
+
 	if (CIFS_CACHE_WRITE(cinode)) {
 		if (cap_unix(tcon->ses) &&
 		(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
-		  && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
-			return generic_file_aio_write(iocb, iov, nr_segs, pos);
-		return cifs_writev(iocb, iov, nr_segs, pos);
+		  && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+			written = generic_file_aio_write(
+					iocb, iov, nr_segs, pos);
+			goto out;
+		}
+		written = cifs_writev(iocb, iov, nr_segs, pos);
+		goto out;
 	}
 	/*
 	 * For non-oplocked files in strict cache mode we need to write the data
@@ -2633,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 			 inode);
 		cinode->oplock = 0;
 	}
+out:
+	cifs_put_writer(cinode);
 	return written;
 }

@@ -2727,56 +2750,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata)
 /**
  * cifs_readdata_to_iov - copy data from pages in response to an iovec
  * @rdata:	the readdata response with list of pages holding data
- * @iov:	vector in which we should copy the data
- * @nr_segs:	number of segments in vector
- * @offset:	offset into file of the first iovec
- * @copied:	used to return the amount of data copied to the iov
+ * @iter:	destination for our data
  *
  * This function copies data from a list of pages in a readdata response into
  * an array of iovecs. It will first calculate where the data should go
  * based on the info in the readdata and then copy the data into that spot.
  */
-static ssize_t
-cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov,
-		     unsigned long nr_segs, loff_t offset, ssize_t *copied)
+static int
+cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
 {
-	int rc = 0;
-	struct iov_iter ii;
-	size_t pos = rdata->offset - offset;
-	ssize_t remaining = rdata->bytes;
-	unsigned char *pdata;
+	size_t remaining = rdata->bytes;
 	unsigned int i;

-	/* set up iov_iter and advance to the correct offset */
-	iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0);
-	iov_iter_advance(&ii, pos);
-
-	*copied = 0;
 	for (i = 0; i < rdata->nr_pages; i++) {
-		ssize_t copy;
 		struct page *page = rdata->pages[i];
-
-		/* copy a whole page or whatever's left */
-		copy = min_t(ssize_t, remaining, PAGE_SIZE);
-
-		/* ...but limit it to whatever space is left in the iov */
-		copy = min_t(ssize_t, copy, iov_iter_count(&ii));
-
-		/* go while there's data to be copied and no errors */
-		if (copy && !rc) {
-			pdata = kmap(page);
-			rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset,
-						(int)copy);
-			kunmap(page);
-			if (!rc) {
-				*copied += copy;
-				remaining -= copy;
-				iov_iter_advance(&ii, copy);
-			}
-		}
+		size_t copy = min_t(size_t, remaining, PAGE_SIZE);
+		size_t written = copy_page_to_iter(page, 0, copy, iter);
+		remaining -= written;
+		if (written < copy && iov_iter_count(iter) > 0)
+			break;
 	}
-
-	return rc;
+	return remaining ? -EFAULT : 0;
 }

 static void
@@ -2837,20 +2831,21 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
 	return total_read > 0 ? total_read : result;
 }

-static ssize_t
-cifs_iovec_read(struct file *file, const struct iovec *iov,
-		 unsigned long nr_segs, loff_t *poffset)
+ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+			unsigned long nr_segs, loff_t pos)
 {
+	struct file *file = iocb->ki_filp;
 	ssize_t rc;
 	size_t len, cur_len;
 	ssize_t total_read = 0;
-	loff_t offset = *poffset;
+	loff_t offset = pos;
 	unsigned int npages;
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_tcon *tcon;
 	struct cifsFileInfo *open_file;
 	struct cifs_readdata *rdata, *tmp;
 	struct list_head rdata_list;
+	struct iov_iter to;
 	pid_t pid;

 	if (!nr_segs)
@@ -2860,6 +2855,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 	if (!len)
 		return 0;

+	iov_iter_init(&to, iov, nr_segs, len, 0);
+
 	INIT_LIST_HEAD(&rdata_list);
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	open_file = file->private_data;
@@ -2885,7 +2882,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 					cifs_uncached_readv_complete);
 		if (!rdata) {
 			rc = -ENOMEM;
-			goto error;
+			break;
 		}

 		rc = cifs_read_allocate_pages(rdata, npages);
@@ -2917,55 +2914,44 @@ error:
 	if (!list_empty(&rdata_list))
 		rc = 0;

+	len = iov_iter_count(&to);
 	/* the loop below should proceed in the order of increasing offsets */
-restart_loop:
 	list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+again:
 		if (!rc) {
-			ssize_t copied;
-
 			/* FIXME: freezable sleep too? */
 			rc = wait_for_completion_killable(&rdata->done);
 			if (rc)
 				rc = -EINTR;
-			else if (rdata->result)
+			else if (rdata->result) {
 				rc = rdata->result;
-			else {
-				rc = cifs_readdata_to_iov(rdata, iov,
-							nr_segs, *poffset,
-							&copied);
-				total_read += copied;
-			}
-
-			/* resend call if it's a retryable error */
-			if (rc == -EAGAIN) {
-				rc = cifs_retry_async_readv(rdata);
-				goto restart_loop;
+				/* resend call if it's a retryable error */
+				if (rc == -EAGAIN) {
+					rc = cifs_retry_async_readv(rdata);
+					goto again;
+				}
+			} else {
+				rc = cifs_readdata_to_iov(rdata, &to);
 			}
 		}
 		list_del_init(&rdata->list);
 		kref_put(&rdata->refcount, cifs_uncached_readdata_release);
 	}

+	total_read = len - iov_iter_count(&to);
+
 	cifs_stats_bytes_read(tcon, total_read);
-	*poffset += total_read;

 	/* mask nodata case */
 	if (rc == -ENODATA)
 		rc = 0;

-	return total_read ? total_read : rc;
-}
-
-ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	ssize_t read;
-
-	read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
-	if (read > 0)
-		iocb->ki_pos = pos;
-
-	return read;
+	if (total_read) {
+		iocb->ki_pos = pos + total_read;
+		return total_read;
+	}
+	return rc;
 }

 ssize_t
@@ -3113,6 +3099,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)

 static struct vm_operations_struct cifs_file_vm_ops = {
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = cifs_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
@@ -3644,6 +3631,13 @@ static int cifs_launder_page(struct page *page)
 	return rc;
 }

+static int
+cifs_pending_writers_wait(void *unused)
+{
+	schedule();
+	return 0;
+}
+
 void cifs_oplock_break(struct work_struct *work)
 {
 	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -3651,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work)
 	struct inode *inode = cfile->dentry->d_inode;
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
 	int rc = 0;

+	wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
+			cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE);
+
+	server->ops->downgrade_oplock(server, cinode,
+		test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
+
 	if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
 					cifs_has_mand_locks(cinode)) {
 		cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
@@ -3689,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work)
 			 cinode);
 		cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
 	}
+	cifs_done_oplock_break(cinode);
 }

 /*
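The cifs_readdata_to_iov() rewrite above replaces hand-rolled kmap()/memcpy_toiovecend() bookkeeping with copy_page_to_iter(), letting the iov_iter carry the destination cursor across pages. The userspace sketch below models the same idea with an invented cursor struct over an iovec array; it is an illustration of the pattern, not the kernel API:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

#define PAGE_SZ 8	/* tiny "page" so the demo copies page by page */

/* Toy cursor over an iovec array, standing in for struct iov_iter. */
struct iter {
	const struct iovec *iov;
	int nr_segs;
	size_t seg_off;
};

static size_t copy_to_iter(const char *src, size_t len, struct iter *it)
{
	size_t done = 0;
	while (len && it->nr_segs) {
		size_t space = it->iov->iov_len - it->seg_off;
		size_t n = len < space ? len : space;
		memcpy((char *)it->iov->iov_base + it->seg_off, src + done, n);
		done += n;
		len -= n;
		it->seg_off += n;
		if (it->seg_off == it->iov->iov_len) {	/* next segment */
			it->iov++;
			it->nr_segs--;
			it->seg_off = 0;
		}
	}
	return done;
}

int main(void)
{
	const char pages[3][PAGE_SZ] = { "aaaaaaa", "bbbbbbb", "ccccccc" };
	char b1[10], b2[16];
	struct iovec iov[2] = { { b1, sizeof(b1) }, { b2, sizeof(b2) } };
	struct iter it = { iov, 2, 0 };
	size_t remaining = 22;	/* bytes of payload across the pages */

	for (int i = 0; i < 3 && remaining; i++) {
		size_t want = remaining < PAGE_SZ ? remaining : PAGE_SZ;
		size_t got = copy_to_iter(pages[i], want, &it);
		remaining -= got;
		if (got < want)	/* destination ran out, like -EFAULT above */
			break;
	}
	printf("left uncopied: %zu\n", remaining);
	return 0;
}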
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index aadc2b68678b..a22d667f1069 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1737,6 +1737,9 @@ cifs_inode_needs_reval(struct inode *inode)
 	if (cifs_i->time == 0)
 		return true;

+	if (!cifs_sb->actimeo)
+		return true;
+
 	if (!time_in_range(jiffies, cifs_i->time,
 				cifs_i->time + cifs_sb->actimeo))
 		return true;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 2f9f3790679d..3b0c62e622da 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 				cifs_dbg(FYI, "file id match, oplock break\n");
 				pCifsInode = CIFS_I(netfile->dentry->d_inode);

-				cifs_set_oplock_level(pCifsInode,
-					pSMB->OplockLevel ? OPLOCK_READ : 0);
+				set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+					&pCifsInode->flags);
+
+				/*
+				 * Set flag if the server downgrades the oplock
+				 * to L2 else clear.
+				 */
+				if (pSMB->OplockLevel)
+					set_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &pCifsInode->flags);
+				else
+					clear_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &pCifsInode->flags);
+
 				queue_work(cifsiod_wq,
 					   &netfile->oplock_break);
 				netfile->oplock_break_cancelled = false;
@@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 		cinode->oplock = 0;
 }

+static int
+cifs_oplock_break_wait(void *unused)
+{
+	schedule();
+	return signal_pending(current) ? -ERESTARTSYS : 0;
+}
+
+/*
+ * We wait for oplock breaks to be processed before we attempt to perform
+ * writes.
+ */
+int cifs_get_writer(struct cifsInodeInfo *cinode)
+{
+	int rc;
+
+start:
+	rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
+				cifs_oplock_break_wait, TASK_KILLABLE);
+	if (rc)
+		return rc;
+
+	spin_lock(&cinode->writers_lock);
+	if (!cinode->writers)
+		set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+	cinode->writers++;
+	/* Check to see if we have started servicing an oplock break */
+	if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) {
+		cinode->writers--;
+		if (cinode->writers == 0) {
+			clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+			wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+		}
+		spin_unlock(&cinode->writers_lock);
+		goto start;
+	}
+	spin_unlock(&cinode->writers_lock);
+	return 0;
+}
+
+void cifs_put_writer(struct cifsInodeInfo *cinode)
+{
+	spin_lock(&cinode->writers_lock);
+	cinode->writers--;
+	if (cinode->writers == 0) {
+		clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+		wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+	}
+	spin_unlock(&cinode->writers_lock);
+}
+
+void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
+{
+	clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+	wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK);
+}
+
 bool
 backup_cred(struct cifs_sb_info *cifs_sb)
 {
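The helpers above serialize buffered writes against oplock-break processing: writers are counted under writers_lock, a pending break gates new writers, and the break handler waits for in-flight writers to drain. Below is a rough pthreads model of that protocol, with a mutex and condition variable standing in for wait_on_bit()/wake_up_bit(); all names are invented and this is a sketch of the idea, not the kernel implementation:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int writers;		/* models cinode->writers */
static bool break_pending;	/* models CIFS_INODE_PENDING_OPLOCK_BREAK */

static void get_writer(void)
{
	pthread_mutex_lock(&lk);
	while (break_pending)	/* new writers wait out the break */
		pthread_cond_wait(&cv, &lk);
	writers++;
	pthread_mutex_unlock(&lk);
}

static void put_writer(void)
{
	pthread_mutex_lock(&lk);
	if (--writers == 0)
		pthread_cond_broadcast(&cv);	/* wake the break handler */
	pthread_mutex_unlock(&lk);
}

static void *oplock_break(void *arg)
{
	pthread_mutex_lock(&lk);
	break_pending = true;		/* gate new writers */
	while (writers)			/* drain in-flight writers */
		pthread_cond_wait(&cv, &lk);
	puts("break processed with no writer in flight");
	break_pending = false;		/* models cifs_done_oplock_break() */
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lk);
	return arg;
}

static void *writer(void *arg)
{
	get_writer();
	puts("writing");
	put_writer();
	return arg;
}

int main(void)
{
	pthread_t w, b;
	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&b, NULL, oplock_break, NULL);
	pthread_join(w, NULL);
	pthread_join(b, NULL);
	return 0;
}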
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 526fb89f9230..d1fdfa848703 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
 	return 0;
 }

+static void
+cifs_downgrade_oplock(struct TCP_Server_Info *server,
+			struct cifsInodeInfo *cinode, bool set_level2)
+{
+	if (set_level2)
+		cifs_set_oplock_level(cinode, OPLOCK_READ);
+	else
+		cifs_set_oplock_level(cinode, 0);
+}
+
 static bool
 cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 		  char *buf, int malformed)
@@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = {
 	.clear_stats = cifs_clear_stats,
 	.print_stats = cifs_print_stats,
 	.is_oplock_break = is_valid_oplock_break,
+	.downgrade_oplock = cifs_downgrade_oplock,
 	.check_trans2 = cifs_check_trans2,
 	.need_neg = cifs_need_neg,
 	.negotiate = cifs_negotiate,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index fb3966265b6e..b8021fde987d 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 				else
 					cfile->oplock_break_cancelled = false;

-				server->ops->set_oplock_level(cinode,
-				  rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0,
-				  0, NULL);
+				set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+					&cinode->flags);
+
+				/*
+				 * Set flag if the server downgrades the oplock
+				 * to L2 else clear.
+				 */
+				if (rsp->OplockLevel)
+					set_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &cinode->flags);
+				else
+					clear_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &cinode->flags);

 				queue_work(cifsiod_wq, &cfile->oplock_break);

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 192f51a12cf1..35ddc3ed119d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 }

 static void
+smb2_downgrade_oplock(struct TCP_Server_Info *server,
+			struct cifsInodeInfo *cinode, bool set_level2)
+{
+	if (set_level2)
+		server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II,
+						0, NULL);
+	else
+		server->ops->set_oplock_level(cinode, 0, 0, NULL);
+}
+
+static void
 smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
 		      unsigned int epoch, bool *purge_cache)
 {
@@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = {
 	.clear_stats = smb2_clear_stats,
 	.print_stats = smb2_print_stats,
 	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
@@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = {
 	.clear_stats = smb2_clear_stats,
 	.print_stats = smb2_print_stats,
 	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
@@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = {
 	.print_stats = smb2_print_stats,
 	.dump_share_caps = smb2_dump_share_caps,
 	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
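With both dialects wired up, generic code such as cifs_oplock_break() downgrades through server->ops->downgrade_oplock() and never tests the protocol version itself. A toy model of that per-version dispatch table; the struct and function names below are illustrative only:

#include <stdbool.h>
#include <stdio.h>

struct inode_state { int level; };

struct version_ops {
	const char *name;
	void (*downgrade_oplock)(struct inode_state *, bool set_level2);
};

static void smb1_downgrade(struct inode_state *st, bool set_level2)
{
	st->level = set_level2 ? 1 : 0;	/* read oplock or none */
}

static void smb2_downgrade(struct inode_state *st, bool set_level2)
{
	st->level = set_level2 ? 2 : 0;	/* level II lease or none */
}

static const struct version_ops smb1 = { "smb1", smb1_downgrade };
static const struct version_ops smb2 = { "smb2", smb2_downgrade };

/* Dialect-neutral caller, like cifs_oplock_break() above. */
static void oplock_break(const struct version_ops *ops,
			 struct inode_state *st, bool to_level2)
{
	ops->downgrade_oplock(st, to_level2);
	printf("%s: level now %d\n", ops->name, st->level);
}

int main(void)
{
	struct inode_state st = { 3 };
	oplock_break(&smb1, &st, true);
	oplock_break(&smb2, &st, false);
	return 0;
}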
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 860344701067..3802f8c94acc 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
 		    u64 persistent_fid, u64 volatile_fid)
 {
 	int rc;
-	char *res_key = NULL;
 	struct compress_ioctl fsctl_input;
 	char *ret_data = NULL;

@@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
 			2 /* in data len */, &ret_data /* out data */, NULL);

 	cifs_dbg(FYI, "set compression rc %d\n", rc);
-	kfree(res_key);

 	return rc;
 }
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index b7143cf783ac..381c993b1427 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -10,7 +10,7 @@ extern int coda_hard;
 extern int coda_fake_statfs;

 void coda_destroy_inodecache(void);
-int coda_init_inodecache(void);
+int __init coda_init_inodecache(void);
 int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync);
 void coda_sysctl_init(void);
 void coda_sysctl_clean(void);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 506de34a4ef3..d9c7751f10ac 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -73,7 +73,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }

-int coda_init_inodecache(void)
+int __init coda_init_inodecache(void)
 {
 	coda_inode_cachep = kmem_cache_create("coda_inode_cache",
 				sizeof(struct coda_inode_info),
@@ -96,6 +96,7 @@ void coda_destroy_inodecache(void)

 static int coda_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NOATIME;
 	return 0;
 }
@@ -250,7 +251,7 @@ static void coda_put_super(struct super_block *sb)

 static void coda_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	coda_cache_clear_inode(inode);
 }
diff --git a/fs/compat.c b/fs/compat.c
index 6af20de2c1a3..66d3d3c6b4b2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -72,8 +72,8 @@ int compat_printk(const char *fmt, ...)
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
  */
-asmlinkage long compat_sys_utime(const char __user *filename,
-				 struct compat_utimbuf __user *t)
+COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
+		       struct compat_utimbuf __user *, t)
 {
 	struct timespec tv[2];

@@ -87,13 +87,13 @@ asmlinkage long compat_sys_utime(const char __user *filename,
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
 }

-asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
+COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
 {
 	struct timespec tv[2];

 	if (t) {
-		if (get_compat_timespec(&tv[0], &t[0]) ||
-		    get_compat_timespec(&tv[1], &t[1]))
+		if (compat_get_timespec(&tv[0], &t[0]) ||
+		    compat_get_timespec(&tv[1], &t[1]))
 			return -EFAULT;

 		if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
@@ -102,7 +102,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filena
 	return do_utimes(dfd, filename, t ? tv : NULL, flags);
 }

-asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
 {
 	struct timespec tv[2];

@@ -121,7 +121,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filena
 	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }

-asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
 {
 	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
@@ -159,8 +159,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 	return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
 }

-asmlinkage long compat_sys_newstat(const char __user * filename,
-		struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -171,8 +171,8 @@ asmlinkage long compat_sys_newstat(const char __user * filename,
 	return cp_compat_stat(&stat, statbuf);
 }

-asmlinkage long compat_sys_newlstat(const char __user * filename,
-		struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -184,9 +184,9 @@ asmlinkage long compat_sys_newlstat(const char __user * filename,
 }

 #ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd,
-		const char __user *filename,
-		struct compat_stat __user *statbuf, int flag)
+COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
+		       const char __user *, filename,
+		       struct compat_stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
 	int error;
@@ -198,8 +198,8 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd,
 }
 #endif

-asmlinkage long compat_sys_newfstat(unsigned int fd,
-		struct compat_stat __user * statbuf)
+COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
+		       struct compat_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
@@ -247,7 +247,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
  * The following statfs calls are copies of code from fs/statfs.c and
  * should be checked against those from time to time
  */
-asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
 {
 	struct kstatfs tmp;
 	int error = user_statfs(pathname, &tmp);
@@ -256,7 +256,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
 	return error;
 }

-asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
 {
 	struct kstatfs tmp;
 	int error = fd_statfs(fd, &tmp);
@@ -298,7 +298,7 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 	return 0;
 }

-asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
 {
 	struct kstatfs tmp;
 	int error;
@@ -312,7 +312,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
 	return error;
 }

-asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
 {
 	struct kstatfs tmp;
 	int error;
@@ -331,7 +331,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
  * Given how simple this syscall is that apporach is more maintainable
  * than the various conversion hacks.
  */
-asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
+COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
 {
 	struct compat_ustat tmp;
 	struct kstatfs sbuf;
@@ -399,12 +399,28 @@ static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *u
 }
 #endif

-asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
-		unsigned long arg)
+static unsigned int
+convert_fcntl_cmd(unsigned int cmd)
+{
+	switch (cmd) {
+	case F_GETLK64:
+		return F_GETLK;
+	case F_SETLK64:
+		return F_SETLK;
+	case F_SETLKW64:
+		return F_SETLKW;
+	}
+
+	return cmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
 {
 	mm_segment_t old_fs;
 	struct flock f;
 	long ret;
+	unsigned int conv_cmd;

 	switch (cmd) {
 	case F_GETLK:
@@ -441,16 +457,18 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
 	case F_GETLK64:
 	case F_SETLK64:
 	case F_SETLKW64:
+	case F_OFD_GETLK:
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
 		ret = get_compat_flock64(&f, compat_ptr(arg));
 		if (ret != 0)
 			break;
 		old_fs = get_fs();
 		set_fs(KERNEL_DS);
-		ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK :
-				((cmd == F_SETLK64) ? F_SETLK : F_SETLKW),
-				(unsigned long)&f);
+		conv_cmd = convert_fcntl_cmd(cmd);
+		ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
 		set_fs(old_fs);
-		if (cmd == F_GETLK64 && ret == 0) {
+		if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
 			/* need to return lock information - see above for commentary */
 			if (f.l_start > COMPAT_LOFF_T_MAX)
 				ret = -EOVERFLOW;
@@ -468,16 +486,22 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
 	return ret;
 }

-asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
-		unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
 {
-	if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64))
+	switch (cmd) {
+	case F_GETLK64:
+	case F_SETLK64:
+	case F_SETLKW64:
+	case F_OFD_GETLK:
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
 		return -EINVAL;
+	}
 	return compat_sys_fcntl64(fd, cmd, arg);
 }

-asmlinkage long
-compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
+COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p)
 {
 	long ret;
 	aio_context_t ctx64;
@@ -496,32 +520,24 @@ compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
 	return ret;
 }

-asmlinkage long
-compat_sys_io_getevents(aio_context_t ctx_id,
-				 unsigned long min_nr,
-				 unsigned long nr,
-				 struct io_event __user *events,
-				 struct compat_timespec __user *timeout)
+COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
+		       compat_long_t, min_nr,
+		       compat_long_t, nr,
+		       struct io_event __user *, events,
+		       struct compat_timespec __user *, timeout)
 {
-	long ret;
 	struct timespec t;
 	struct timespec __user *ut = NULL;

-	ret = -EFAULT;
-	if (unlikely(!access_ok(VERIFY_WRITE, events,
-				nr * sizeof(struct io_event))))
-		goto out;
 	if (timeout) {
-		if (get_compat_timespec(&t, timeout))
-			goto out;
+		if (compat_get_timespec(&t, timeout))
+			return -EFAULT;

 		ut = compat_alloc_user_space(sizeof(*ut));
 		if (copy_to_user(ut, &t, sizeof(t)) )
-			goto out;
+			return -EFAULT;
 	}
-	ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut);
-out:
-	return ret;
+	return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
 }

 /* A write operation does a read from user space and vice versa */
@@ -617,8 +633,8 @@ copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)

 #define MAX_AIO_SUBMITS 	(PAGE_SIZE/sizeof(struct iocb *))

-asmlinkage long
-compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+		       int, nr, u32 __user *, iocb)
 {
 	struct iocb __user * __user *iocb64;
 	long ret;
@@ -770,10 +786,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
 #define NCPFS_NAME	"ncpfs"
 #define NFS4_NAME	"nfs4"

-asmlinkage long compat_sys_mount(const char __user * dev_name,
-				 const char __user * dir_name,
-				 const char __user * type, unsigned long flags,
-				 const void __user * data)
+COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
+		       const char __user *, dir_name,
+		       const char __user *, type, compat_ulong_t, flags,
+		       const void __user *, data)
 {
 	char *kernel_type;
 	unsigned long data_page;
@@ -869,8 +885,8 @@ efault:
 	return -EFAULT;
 }

-asmlinkage long compat_sys_old_readdir(unsigned int fd,
-	struct compat_old_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+		struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
 	struct fd f = fdget(fd);
@@ -948,8 +964,8 @@ efault:
 	return -EFAULT;
 }

-asmlinkage long compat_sys_getdents(unsigned int fd,
-		struct compat_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
+		struct compat_linux_dirent __user *, dirent, unsigned int, count)
 {
 	struct fd f;
 	struct compat_linux_dirent __user * lastdirent;
@@ -981,7 +997,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 	return error;
 }

-#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
+#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64

 struct compat_getdents_callback64 {
 	struct dir_context ctx;
@@ -1033,8 +1049,8 @@ efault:
 	return -EFAULT;
 }

-asmlinkage long compat_sys_getdents64(unsigned int fd,
-		struct linux_dirent64 __user * dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+		struct linux_dirent64 __user *, dirent, unsigned int, count)
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
@@ -1066,7 +1082,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 	fdput(f);
 	return error;
 }
-#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
+#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */

 /*
  * Exactly like fs/open.c:sys_open(), except that it doesn't set the
@@ -1287,9 +1303,9 @@ out_nofds:
 	return ret;
 }

-asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct compat_timeval __user *tvp)
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timeval __user *, tvp)
 {
 	struct timespec end_time, *to = NULL;
 	struct compat_timeval tv;
@@ -1320,7 +1336,7 @@ struct compat_sel_arg_struct {
 	compat_uptr_t tvp;
 };

-asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
+COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
 {
 	struct compat_sel_arg_struct a;

@@ -1381,9 +1397,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	return ret;
 }

-asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct compat_timespec __user *tsp, void __user *sig)
+COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timespec __user *, tsp, void __user *, sig)
 {
 	compat_size_t sigsetsize = 0;
 	compat_uptr_t up = 0;
@@ -1400,9 +1416,9 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
 				 sigsetsize);
 }

-asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
-	unsigned int nfds, struct compat_timespec __user *tsp,
-	const compat_sigset_t __user *sigmask, compat_size_t sigsetsize)
+COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+	unsigned int, nfds, struct compat_timespec __user *, tsp,
+	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
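COMPAT_SYSCALL_DEFINEn takes an interleaved list of types and parameter names and expands to a long-returning entry point whose 32-bit arguments are properly extended, which is why the conversions above can also narrow types such as unsigned long to compat_ulong_t. Below is a rough userspace model of the name/type interleaving only; the real macro in include/linux/compat.h does considerably more (argument extension, symbol aliases), and everything here is an assumption-laden sketch, not the kernel definition:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for COMPAT_SYSCALL_DEFINE2(name, t1, a1, t2, a2):
 * interleaved types and names become an ordinary parameter list. */
#define MODEL_SYSCALL_DEFINE2(name, t1, a1, t2, a2)	\
	long model_sys_##name(t1 a1, t2 a2)

typedef uint32_t compat_ulong_t;	/* always 32-bit on the compat ABI */

MODEL_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, compat_ulong_t, bufptr)
{
	/* a real implementation would widen bufptr into a kernel pointer */
	printf("fstatfs(fd=%u, buf=%#x)\n", fd, (unsigned)bufptr);
	return 0;
}

int main(void)
{
	return (int)model_sys_fstatfs(3, 0xdeadbeef);
}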
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index a81147e2e4ef..4d24d17bcfc1 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -88,6 +88,11 @@ static void cputime_to_compat_timeval(const cputime_t cputime,
 #define ELF_HWCAP	COMPAT_ELF_HWCAP
 #endif

+#ifdef COMPAT_ELF_HWCAP2
+#undef ELF_HWCAP2
+#define ELF_HWCAP2	COMPAT_ELF_HWCAP2
+#endif
+
 #ifdef COMPAT_ARCH_DLINFO
 #undef ARCH_DLINFO
 #define ARCH_DLINFO	COMPAT_ARCH_DLINFO
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3881610b6438..e82289047272 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1538,9 +1538,10 @@ static int compat_ioctl_check_table(unsigned int xcmd)
 	return ioctl_pointer[i] == xcmd;
 }

-asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
-				unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg32)
 {
+	unsigned long arg = arg32;
 	struct fd f = fdget(fd);
 	int error = -EBADF;
 	if (!f.file)
diff --git a/fs/coredump.c b/fs/coredump.c
index e3ad709a4232..0b2528fb640e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size)
 static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
 {
 	int free, need;
+	va_list arg_copy;

 again:
 	free = cn->size - cn->used;
-	need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+
+	va_copy(arg_copy, arg);
+	need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
+	va_end(arg_copy);
+
 	if (need < free) {
 		cn->used += need;
 		return 0;
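The va_copy() fix above matters because cn_vprintf() can run vsnprintf() more than once on the same arguments when the buffer is grown: a va_list consumed by one vsnprintf() call is indeterminate afterwards, so each retry needs its own copy. A self-contained userspace demonstration of the same grow-and-retry pattern:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Grow-and-retry vsnprintf: every attempt formats from a va_copy of
 * the caller's va_list, which is exactly the bug fixed above. */
static char *vformat(const char *fmt, va_list args)
{
	size_t size = 8;
	char *buf = malloc(size);

	while (buf) {
		va_list args_copy;
		int need;

		va_copy(args_copy, args);
		need = vsnprintf(buf, size, fmt, args_copy);
		va_end(args_copy);

		if (need < 0) {
			free(buf);
			return NULL;
		}
		if ((size_t)need < size)
			return buf;		/* it fit */
		size = (size_t)need + 1;	/* retry with enough room */
		char *tmp = realloc(buf, size);
		if (!tmp)
			free(buf);
		buf = tmp;
	}
	return NULL;
}

static char *format(const char *fmt, ...)
{
	va_list args;
	char *s;

	va_start(args, fmt);
	s = vformat(fmt, args);
	va_end(args);
	return s;
}

int main(void)
{
	char *s = format("core.%s.%d", "a-fairly-long-name", 1234);
	if (s) {
		puts(s);
		free(s);
	}
	return 0;
}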
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 06610cf94d57..ddcfe590b8a8 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -195,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 			struct page *page = NULL;

 			if (blocknr + i < devsize) {
-				page = read_mapping_page_async(mapping, blocknr + i,
-					NULL);
+				page = read_mapping_page(mapping, blocknr + i, NULL);
 				/* synchronous error? */
 				if (IS_ERR(page))
 					page = NULL;
@@ -244,6 +243,7 @@ static void cramfs_kill_sb(struct super_block *sb)

 static int cramfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_RDONLY;
 	return 0;
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index ca02c13a84aa..be2bea834bf4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -246,16 +246,8 @@ static void __d_free(struct rcu_head *head)
246 kmem_cache_free(dentry_cache, dentry); 246 kmem_cache_free(dentry_cache, dentry);
247} 247}
248 248
249/* 249static void dentry_free(struct dentry *dentry)
250 * no locks, please.
251 */
252static void d_free(struct dentry *dentry)
253{ 250{
254 BUG_ON((int)dentry->d_lockref.count > 0);
255 this_cpu_dec(nr_dentry);
256 if (dentry->d_op && dentry->d_op->d_release)
257 dentry->d_op->d_release(dentry);
258
259 /* if dentry was never visible to RCU, immediate free is OK */ 251 /* if dentry was never visible to RCU, immediate free is OK */
260 if (!(dentry->d_flags & DCACHE_RCUACCESS)) 252 if (!(dentry->d_flags & DCACHE_RCUACCESS))
261 __d_free(&dentry->d_u.d_rcu); 253 __d_free(&dentry->d_u.d_rcu);
@@ -403,56 +395,6 @@ static void dentry_lru_add(struct dentry *dentry)
403 d_lru_add(dentry); 395 d_lru_add(dentry);
404} 396}
405 397
406/*
407 * Remove a dentry with references from the LRU.
408 *
409 * If we are on the shrink list, then we can get to try_prune_one_dentry() and
410 * lose our last reference through the parent walk. In this case, we need to
411 * remove ourselves from the shrink list, not the LRU.
412 */
413static void dentry_lru_del(struct dentry *dentry)
414{
415 if (dentry->d_flags & DCACHE_LRU_LIST) {
416 if (dentry->d_flags & DCACHE_SHRINK_LIST)
417 return d_shrink_del(dentry);
418 d_lru_del(dentry);
419 }
420}
421
422/**
423 * d_kill - kill dentry and return parent
424 * @dentry: dentry to kill
425 * @parent: parent dentry
426 *
427 * The dentry must already be unhashed and removed from the LRU.
428 *
429 * If this is the root of the dentry tree, return NULL.
430 *
431 * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
432 * d_kill.
433 */
434static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
435 __releases(dentry->d_lock)
436 __releases(parent->d_lock)
437 __releases(dentry->d_inode->i_lock)
438{
439 list_del(&dentry->d_u.d_child);
440 /*
441 * Inform d_walk() that we are no longer attached to the
442 * dentry tree
443 */
444 dentry->d_flags |= DCACHE_DENTRY_KILLED;
445 if (parent)
446 spin_unlock(&parent->d_lock);
447 dentry_iput(dentry);
448 /*
449 * dentry_iput drops the locks, at which point nobody (except
450 * transient RCU lookups) can reach this dentry.
451 */
452 d_free(dentry);
453 return parent;
454}
455
456/** 398/**
457 * d_drop - drop a dentry 399 * d_drop - drop a dentry
458 * @dentry: dentry to drop 400 * @dentry: dentry to drop
@@ -499,37 +441,12 @@ void d_drop(struct dentry *dentry)
499} 441}
500EXPORT_SYMBOL(d_drop); 442EXPORT_SYMBOL(d_drop);
501 443
502/* 444static void __dentry_kill(struct dentry *dentry)
503 * Finish off a dentry we've decided to kill.
504 * dentry->d_lock must be held, returns with it unlocked.
505 * If ref is non-zero, then decrement the refcount too.
506 * Returns dentry requiring refcount drop, or NULL if we're done.
507 */
508static struct dentry *
509dentry_kill(struct dentry *dentry, int unlock_on_failure)
510 __releases(dentry->d_lock)
511{ 445{
512 struct inode *inode; 446 struct dentry *parent = NULL;
513 struct dentry *parent; 447 bool can_free = true;
514 448 if (!IS_ROOT(dentry))
515 inode = dentry->d_inode;
516 if (inode && !spin_trylock(&inode->i_lock)) {
517relock:
518 if (unlock_on_failure) {
519 spin_unlock(&dentry->d_lock);
520 cpu_relax();
521 }
522 return dentry; /* try again with same dentry */
523 }
524 if (IS_ROOT(dentry))
525 parent = NULL;
526 else
527 parent = dentry->d_parent; 449 parent = dentry->d_parent;
528 if (parent && !spin_trylock(&parent->d_lock)) {
529 if (inode)
530 spin_unlock(&inode->i_lock);
531 goto relock;
532 }
533 450
534 /* 451 /*
535 * The dentry is now unrecoverably dead to the world. 452 * The dentry is now unrecoverably dead to the world.
@@ -543,10 +460,103 @@ relock:
543 if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) 460 if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
544 dentry->d_op->d_prune(dentry); 461 dentry->d_op->d_prune(dentry);
545 462
546 dentry_lru_del(dentry); 463 if (dentry->d_flags & DCACHE_LRU_LIST) {
464 if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
465 d_lru_del(dentry);
466 }
547 /* if it was on the hash then remove it */ 467 /* if it was on the hash then remove it */
548 __d_drop(dentry); 468 __d_drop(dentry);
549 return d_kill(dentry, parent); 469 list_del(&dentry->d_u.d_child);
470 /*
471 * Inform d_walk() that we are no longer attached to the
472 * dentry tree
473 */
474 dentry->d_flags |= DCACHE_DENTRY_KILLED;
475 if (parent)
476 spin_unlock(&parent->d_lock);
477 dentry_iput(dentry);
478 /*
479 * dentry_iput drops the locks, at which point nobody (except
480 * transient RCU lookups) can reach this dentry.
481 */
482 BUG_ON((int)dentry->d_lockref.count > 0);
483 this_cpu_dec(nr_dentry);
484 if (dentry->d_op && dentry->d_op->d_release)
485 dentry->d_op->d_release(dentry);
486
487 spin_lock(&dentry->d_lock);
488 if (dentry->d_flags & DCACHE_SHRINK_LIST) {
489 dentry->d_flags |= DCACHE_MAY_FREE;
490 can_free = false;
491 }
492 spin_unlock(&dentry->d_lock);
493 if (likely(can_free))
494 dentry_free(dentry);
495}
496
497/*
498 * Finish off a dentry we've decided to kill.
499 * dentry->d_lock must be held, returns with it unlocked.
500 * If ref is non-zero, then decrement the refcount too.
501 * Returns dentry requiring refcount drop, or NULL if we're done.
502 */
503static struct dentry *dentry_kill(struct dentry *dentry)
504 __releases(dentry->d_lock)
505{
506 struct inode *inode = dentry->d_inode;
507 struct dentry *parent = NULL;
508
509 if (inode && unlikely(!spin_trylock(&inode->i_lock)))
510 goto failed;
511
512 if (!IS_ROOT(dentry)) {
513 parent = dentry->d_parent;
514 if (unlikely(!spin_trylock(&parent->d_lock))) {
515 if (inode)
516 spin_unlock(&inode->i_lock);
517 goto failed;
518 }
519 }
520
521 __dentry_kill(dentry);
522 return parent;
523
524failed:
525 spin_unlock(&dentry->d_lock);
526 cpu_relax();
527 return dentry; /* try again with same dentry */
528}
529
530static inline struct dentry *lock_parent(struct dentry *dentry)
531{
532 struct dentry *parent = dentry->d_parent;
533 if (IS_ROOT(dentry))
534 return NULL;
535 if (likely(spin_trylock(&parent->d_lock)))
536 return parent;
537 spin_unlock(&dentry->d_lock);
538 rcu_read_lock();
539again:
540 parent = ACCESS_ONCE(dentry->d_parent);
541 spin_lock(&parent->d_lock);
542 /*
543 * We can't blindly lock dentry until we are sure
544 * that we won't violate the locking order.
545 * Any changes of dentry->d_parent must have
546 * been done with parent->d_lock held, so
547 * spin_lock() above is enough of a barrier
548 * for checking if it's still our child.
549 */
550 if (unlikely(parent != dentry->d_parent)) {
551 spin_unlock(&parent->d_lock);
552 goto again;
553 }
554 rcu_read_unlock();
555 if (parent != dentry)
556 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
557 else
558 parent = NULL;
559 return parent;
550} 560}
551 561
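Note: lock_parent() above is the classic snapshot/lock/re-check idiom. A minimal user-space analogue, assuming nodes are never freed (the kernel version relies on RCU and lock nesting for that part, and also handles the caller-holds-d_lock and IS_ROOT cases this sketch omits):

#include <pthread.h>

struct node {
        struct node *parent;
        pthread_mutex_t lock;
};

/* Lock n's parent without holding n->lock; retry if n was reparented
 * between the unlocked snapshot and acquiring the candidate's lock. */
static struct node *lock_parent_sketch(struct node *n)
{
        for (;;) {
                struct node *p = n->parent;     /* unlocked snapshot */
                pthread_mutex_lock(&p->lock);
                if (p == n->parent)             /* still our parent: done */
                        return p;
                pthread_mutex_unlock(&p->lock); /* lost a race: retry */
        }
}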
552/* 562/*
@@ -602,7 +612,7 @@ repeat:
602 return; 612 return;
603 613
604kill_it: 614kill_it:
605 dentry = dentry_kill(dentry, 1); 615 dentry = dentry_kill(dentry);
606 if (dentry) 616 if (dentry)
607 goto repeat; 617 goto repeat;
608} 618}
@@ -815,64 +825,15 @@ restart:
815} 825}
816EXPORT_SYMBOL(d_prune_aliases); 826EXPORT_SYMBOL(d_prune_aliases);
817 827
818/*
819 * Try to throw away a dentry - free the inode, dput the parent.
820 * Requires dentry->d_lock is held, and dentry->d_count == 0.
821 * Releases dentry->d_lock.
822 *
823 * This may fail if locks cannot be acquired; no problem, just try again.
824 */
825static struct dentry * try_prune_one_dentry(struct dentry *dentry)
826 __releases(dentry->d_lock)
827{
828 struct dentry *parent;
829
830 parent = dentry_kill(dentry, 0);
831 /*
832 * If dentry_kill returns NULL, we have nothing more to do.
833 * if it returns the same dentry, trylocks failed. In either
834 * case, just loop again.
835 *
836 * Otherwise, we need to prune ancestors too. This is necessary
837 * to prevent quadratic behavior of shrink_dcache_parent(), but
838 * is also expected to be beneficial in reducing dentry cache
839 * fragmentation.
840 */
841 if (!parent)
842 return NULL;
843 if (parent == dentry)
844 return dentry;
845
846 /* Prune ancestors. */
847 dentry = parent;
848 while (dentry) {
849 if (lockref_put_or_lock(&dentry->d_lockref))
850 return NULL;
851 dentry = dentry_kill(dentry, 1);
852 }
853 return NULL;
854}
855
856static void shrink_dentry_list(struct list_head *list) 828static void shrink_dentry_list(struct list_head *list)
857{ 829{
858 struct dentry *dentry; 830 struct dentry *dentry, *parent;
859 831
860 rcu_read_lock(); 832 while (!list_empty(list)) {
861 for (;;) { 833 struct inode *inode;
862 dentry = list_entry_rcu(list->prev, struct dentry, d_lru); 834 dentry = list_entry(list->prev, struct dentry, d_lru);
863 if (&dentry->d_lru == list)
864 break; /* empty */
865
866 /*
867 * Get the dentry lock, and re-verify that the dentry is
868 * still on the shrinking list. If it is, we know that
869 * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
870 */
871 spin_lock(&dentry->d_lock); 835 spin_lock(&dentry->d_lock);
872 if (dentry != list_entry(list->prev, struct dentry, d_lru)) { 836 parent = lock_parent(dentry);
873 spin_unlock(&dentry->d_lock);
874 continue;
875 }
876 837
877 /* 838 /*
878 * The dispose list is isolated and dentries are not accounted 839 * The dispose list is isolated and dentries are not accounted
@@ -885,30 +846,63 @@ static void shrink_dentry_list(struct list_head *list)
885 * We found an inuse dentry which was not removed from 846 * We found an inuse dentry which was not removed from
886 * the LRU because of laziness during lookup. Do not free it. 847 * the LRU because of laziness during lookup. Do not free it.
887 */ 848 */
888 if (dentry->d_lockref.count) { 849 if ((int)dentry->d_lockref.count > 0) {
889 spin_unlock(&dentry->d_lock); 850 spin_unlock(&dentry->d_lock);
851 if (parent)
852 spin_unlock(&parent->d_lock);
890 continue; 853 continue;
891 } 854 }
892 rcu_read_unlock();
893 855
894 /*
895 * If 'try_to_prune()' returns a dentry, it will
896 * be the same one we passed in, and d_lock will
897 * have been held the whole time, so it will not
898 * have been added to any other lists. We failed
899 * to get the inode lock.
900 *
901 * We just add it back to the shrink list.
902 */
903 dentry = try_prune_one_dentry(dentry);
904 856
905 rcu_read_lock(); 857 if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
906 if (dentry) { 858 bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
859 spin_unlock(&dentry->d_lock);
860 if (parent)
861 spin_unlock(&parent->d_lock);
862 if (can_free)
863 dentry_free(dentry);
864 continue;
865 }
866
867 inode = dentry->d_inode;
868 if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
907 d_shrink_add(dentry, list); 869 d_shrink_add(dentry, list);
908 spin_unlock(&dentry->d_lock); 870 spin_unlock(&dentry->d_lock);
871 if (parent)
872 spin_unlock(&parent->d_lock);
873 continue;
874 }
875
876 __dentry_kill(dentry);
877
878 /*
879 * We need to prune ancestors too. This is necessary to prevent
880 * quadratic behavior of shrink_dcache_parent(), but is also
881 * expected to be beneficial in reducing dentry cache
882 * fragmentation.
883 */
884 dentry = parent;
885 while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
886 parent = lock_parent(dentry);
887 if (dentry->d_lockref.count != 1) {
888 dentry->d_lockref.count--;
889 spin_unlock(&dentry->d_lock);
890 if (parent)
891 spin_unlock(&parent->d_lock);
892 break;
893 }
894 inode = dentry->d_inode; /* can't be NULL */
895 if (unlikely(!spin_trylock(&inode->i_lock))) {
896 spin_unlock(&dentry->d_lock);
897 if (parent)
898 spin_unlock(&parent->d_lock);
899 cpu_relax();
900 continue;
901 }
902 __dentry_kill(dentry);
903 dentry = parent;
909 } 904 }
910 } 905 }
911 rcu_read_unlock();
912} 906}
913 907
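Note (assumed contract, not in the patch): the ancestor loop above leans on lockref_put_or_lock() semantics — it returns non-zero after merely dropping one of several references, and returns 0 with d_lock held when the caller held the last reference, i.e. the dentry is now safe to kill.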
914static enum lru_status 908static enum lru_status
@@ -1261,34 +1255,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
1261 if (data->start == dentry) 1255 if (data->start == dentry)
1262 goto out; 1256 goto out;
1263 1257
1264 /* 1258 if (dentry->d_flags & DCACHE_SHRINK_LIST) {
1265 * move only zero ref count dentries to the dispose list.
1266 *
1267 * Those which are presently on the shrink list, being processed
1268 * by shrink_dentry_list(), shouldn't be moved. Otherwise the
1269 * loop in shrink_dcache_parent() might not make any progress
1270 * and loop forever.
1271 */
1272 if (dentry->d_lockref.count) {
1273 dentry_lru_del(dentry);
1274 } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
1275 /*
1276 * We can't use d_lru_shrink_move() because we
1277 * need to get the global LRU lock and do the
1278 * LRU accounting.
1279 */
1280 d_lru_del(dentry);
1281 d_shrink_add(dentry, &data->dispose);
1282 data->found++; 1259 data->found++;
1283 ret = D_WALK_NORETRY; 1260 } else {
1261 if (dentry->d_flags & DCACHE_LRU_LIST)
1262 d_lru_del(dentry);
1263 if (!dentry->d_lockref.count) {
1264 d_shrink_add(dentry, &data->dispose);
1265 data->found++;
1266 }
1284 } 1267 }
1285 /* 1268 /*
1286 * We can return to the caller if we have found some (this 1269 * We can return to the caller if we have found some (this
1287 * ensures forward progress). We'll be coming back to find 1270 * ensures forward progress). We'll be coming back to find
1288 * the rest. 1271 * the rest.
1289 */ 1272 */
1290 if (data->found && need_resched()) 1273 if (!list_empty(&data->dispose))
1291 ret = D_WALK_QUIT; 1274 ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
1292out: 1275out:
1293 return ret; 1276 return ret;
1294} 1277}
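Note (annotation): the rewritten tail of select_collect() encodes a simple forward-progress policy — keep walking while the dispose list is empty; once something has been collected, quit early if a reschedule is pending, otherwise finish the pass without rename_lock retries (D_WALK_NORETRY).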
@@ -1318,45 +1301,35 @@ void shrink_dcache_parent(struct dentry *parent)
1318} 1301}
1319EXPORT_SYMBOL(shrink_dcache_parent); 1302EXPORT_SYMBOL(shrink_dcache_parent);
1320 1303
1321static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) 1304static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
1322{ 1305{
1323 struct select_data *data = _data; 1306 /* it has busy descendants; complain about those instead */
1324 enum d_walk_ret ret = D_WALK_CONTINUE; 1307 if (!list_empty(&dentry->d_subdirs))
1308 return D_WALK_CONTINUE;
1325 1309
1326 if (dentry->d_lockref.count) { 1310 /* root with refcount 1 is fine */
1327 dentry_lru_del(dentry); 1311 if (dentry == _data && dentry->d_lockref.count == 1)
1328 if (likely(!list_empty(&dentry->d_subdirs))) 1312 return D_WALK_CONTINUE;
1329 goto out; 1313
1330 if (dentry == data->start && dentry->d_lockref.count == 1) 1314 printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
1331 goto out; 1315 " still in use (%d) [unmount of %s %s]\n",
1332 printk(KERN_ERR
1333 "BUG: Dentry %p{i=%lx,n=%s}"
1334 " still in use (%d)"
1335 " [unmount of %s %s]\n",
1336 dentry, 1316 dentry,
1337 dentry->d_inode ? 1317 dentry->d_inode ?
1338 dentry->d_inode->i_ino : 0UL, 1318 dentry->d_inode->i_ino : 0UL,
1339 dentry->d_name.name, 1319 dentry,
1340 dentry->d_lockref.count, 1320 dentry->d_lockref.count,
1341 dentry->d_sb->s_type->name, 1321 dentry->d_sb->s_type->name,
1342 dentry->d_sb->s_id); 1322 dentry->d_sb->s_id);
1343 BUG(); 1323 WARN_ON(1);
1344 } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { 1324 return D_WALK_CONTINUE;
1345 /* 1325}
1346 * We can't use d_lru_shrink_move() because we 1326
1347 * need to get the global LRU lock and do the 1327static void do_one_tree(struct dentry *dentry)
1348 * LRU accounting. 1328{
1349 */ 1329 shrink_dcache_parent(dentry);
1350 if (dentry->d_flags & DCACHE_LRU_LIST) 1330 d_walk(dentry, dentry, umount_check, NULL);
1351 d_lru_del(dentry); 1331 d_drop(dentry);
1352 d_shrink_add(dentry, &data->dispose); 1332 dput(dentry);
1353 data->found++;
1354 ret = D_WALK_NORETRY;
1355 }
1356out:
1357 if (data->found && need_resched())
1358 ret = D_WALK_QUIT;
1359 return ret;
1360} 1333}
1361 1334
1362/* 1335/*
@@ -1366,40 +1339,15 @@ void shrink_dcache_for_umount(struct super_block *sb)
1366{ 1339{
1367 struct dentry *dentry; 1340 struct dentry *dentry;
1368 1341
1369 if (down_read_trylock(&sb->s_umount)) 1342 WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
1370 BUG();
1371 1343
1372 dentry = sb->s_root; 1344 dentry = sb->s_root;
1373 sb->s_root = NULL; 1345 sb->s_root = NULL;
1374 for (;;) { 1346 do_one_tree(dentry);
1375 struct select_data data;
1376
1377 INIT_LIST_HEAD(&data.dispose);
1378 data.start = dentry;
1379 data.found = 0;
1380
1381 d_walk(dentry, &data, umount_collect, NULL);
1382 if (!data.found)
1383 break;
1384
1385 shrink_dentry_list(&data.dispose);
1386 cond_resched();
1387 }
1388 d_drop(dentry);
1389 dput(dentry);
1390 1347
1391 while (!hlist_bl_empty(&sb->s_anon)) { 1348 while (!hlist_bl_empty(&sb->s_anon)) {
1392 struct select_data data; 1349 dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash));
1393 dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); 1350 do_one_tree(dentry);
1394
1395 INIT_LIST_HEAD(&data.dispose);
1396 data.start = NULL;
1397 data.found = 0;
1398
1399 d_walk(dentry, &data, umount_collect, NULL);
1400 if (data.found)
1401 shrink_dentry_list(&data.dispose);
1402 cond_resched();
1403 } 1351 }
1404} 1352}
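Note (annotation): the dget() in the s_anon loop is there because do_one_tree() ends in d_drop()+dput() — each anchor dentry pulled off the list must carry an extra reference going in.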
1405 1353
@@ -1647,8 +1595,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1647 unsigned add_flags = d_flags_for_inode(inode); 1595 unsigned add_flags = d_flags_for_inode(inode);
1648 1596
1649 spin_lock(&dentry->d_lock); 1597 spin_lock(&dentry->d_lock);
1650 dentry->d_flags &= ~DCACHE_ENTRY_TYPE; 1598 __d_set_type(dentry, add_flags);
1651 dentry->d_flags |= add_flags;
1652 if (inode) 1599 if (inode)
1653 hlist_add_head(&dentry->d_alias, &inode->i_dentry); 1600 hlist_add_head(&dentry->d_alias, &inode->i_dentry);
1654 dentry->d_inode = inode; 1601 dentry->d_inode = inode;
@@ -2483,12 +2430,14 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
2483 dentry->d_name.name = dentry->d_iname; 2430 dentry->d_name.name = dentry->d_iname;
2484 } else { 2431 } else {
2485 /* 2432 /*
2486 * Both are internal. Just copy target to dentry 2433 * Both are internal.
2487 */ 2434 */
2488 memcpy(dentry->d_iname, target->d_name.name, 2435 unsigned int i;
2489 target->d_name.len + 1); 2436 BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
2490 dentry->d_name.len = target->d_name.len; 2437 for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
2491 return; 2438 swap(((long *) &dentry->d_iname)[i],
2439 ((long *) &target->d_iname)[i]);
2440 }
2492 } 2441 }
2493 } 2442 }
2494 swap(dentry->d_name.len, target->d_name.len); 2443 swap(dentry->d_name.len, target->d_name.len);
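Note: the new branch swaps the two inline names long-word at a time instead of copying one over the other, since an exchange must preserve both. A self-contained sketch of that swap, assuming both buffers are DNAME_INLINE_LEN bytes and long-aligned (the BUILD_BUG_ON above enforces the kernel's version of that assumption):

#include <stddef.h>

#define DNAME_INLINE_LEN 32u    /* illustrative size only */

static void swap_inline_names(char *a, char *b)
{
        size_t i;

        for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
                long tmp = ((long *)a)[i];

                ((long *)a)[i] = ((long *)b)[i];
                ((long *)b)[i] = tmp;
        }
}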
@@ -2545,13 +2494,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry,
2545 * __d_move - move a dentry 2494 * __d_move - move a dentry
2546 * @dentry: entry to move 2495 * @dentry: entry to move
2547 * @target: new dentry 2496 * @target: new dentry
2497 * @exchange: exchange the two dentries
2548 * 2498 *
2549 * Update the dcache to reflect the move of a file name. Negative 2499 * Update the dcache to reflect the move of a file name. Negative
2550 * dcache entries should not be moved in this way. Caller must hold 2500 * dcache entries should not be moved in this way. Caller must hold
2551 * rename_lock, the i_mutex of the source and target directories, 2501 * rename_lock, the i_mutex of the source and target directories,
2552 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). 2502 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
2553 */ 2503 */
2554static void __d_move(struct dentry * dentry, struct dentry * target) 2504static void __d_move(struct dentry *dentry, struct dentry *target,
2505 bool exchange)
2555{ 2506{
2556 if (!dentry->d_inode) 2507 if (!dentry->d_inode)
2557 printk(KERN_WARNING "VFS: moving negative dcache entry\n"); 2508 printk(KERN_WARNING "VFS: moving negative dcache entry\n");
@@ -2573,8 +2524,15 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2573 __d_drop(dentry); 2524 __d_drop(dentry);
2574 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); 2525 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
2575 2526
2576 /* Unhash the target: dput() will then get rid of it */ 2527 /*
2528 * Unhash the target (d_delete() is not usable here). If exchanging
2529 * the two dentries, then rehash onto the other's hash queue.
2530 */
2577 __d_drop(target); 2531 __d_drop(target);
2532 if (exchange) {
2533 __d_rehash(target,
2534 d_hash(dentry->d_parent, dentry->d_name.hash));
2535 }
2578 2536
2579 list_del(&dentry->d_u.d_child); 2537 list_del(&dentry->d_u.d_child);
2580 list_del(&target->d_u.d_child); 2538 list_del(&target->d_u.d_child);
@@ -2601,6 +2559,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2601 write_seqcount_end(&dentry->d_seq); 2559 write_seqcount_end(&dentry->d_seq);
2602 2560
2603 dentry_unlock_parents_for_move(dentry, target); 2561 dentry_unlock_parents_for_move(dentry, target);
2562 if (exchange)
2563 fsnotify_d_move(target);
2604 spin_unlock(&target->d_lock); 2564 spin_unlock(&target->d_lock);
2605 fsnotify_d_move(dentry); 2565 fsnotify_d_move(dentry);
2606 spin_unlock(&dentry->d_lock); 2566 spin_unlock(&dentry->d_lock);
@@ -2618,11 +2578,30 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2618void d_move(struct dentry *dentry, struct dentry *target) 2578void d_move(struct dentry *dentry, struct dentry *target)
2619{ 2579{
2620 write_seqlock(&rename_lock); 2580 write_seqlock(&rename_lock);
2621 __d_move(dentry, target); 2581 __d_move(dentry, target, false);
2622 write_sequnlock(&rename_lock); 2582 write_sequnlock(&rename_lock);
2623} 2583}
2624EXPORT_SYMBOL(d_move); 2584EXPORT_SYMBOL(d_move);
2625 2585
2586/*
2587 * d_exchange - exchange two dentries
2588 * @dentry1: first dentry
2589 * @dentry2: second dentry
2590 */
2591void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
2592{
2593 write_seqlock(&rename_lock);
2594
2595 WARN_ON(!dentry1->d_inode);
2596 WARN_ON(!dentry2->d_inode);
2597 WARN_ON(IS_ROOT(dentry1));
2598 WARN_ON(IS_ROOT(dentry2));
2599
2600 __d_move(dentry1, dentry2, true);
2601
2602 write_sequnlock(&rename_lock);
2603}
2604
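Note: d_exchange() is the dcache half of RENAME_EXCHANGE. From user space the feature is reached through renameat2(); a minimal sketch, assuming glibc 2.28+ declares it (older systems go through syscall(SYS_renameat2, ...)):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
        /* Atomically swap the two names; both paths must already exist. */
        if (renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_EXCHANGE) != 0)
                perror("renameat2");
        return 0;
}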
2626/** 2605/**
2627 * d_ancestor - search for an ancestor 2606 * d_ancestor - search for an ancestor
2628 * @p1: ancestor dentry 2607 * @p1: ancestor dentry
@@ -2670,7 +2649,7 @@ static struct dentry *__d_unalias(struct inode *inode,
2670 m2 = &alias->d_parent->d_inode->i_mutex; 2649 m2 = &alias->d_parent->d_inode->i_mutex;
2671out_unalias: 2650out_unalias:
2672 if (likely(!d_mountpoint(alias))) { 2651 if (likely(!d_mountpoint(alias))) {
2673 __d_move(alias, dentry); 2652 __d_move(alias, dentry, false);
2674 ret = alias; 2653 ret = alias;
2675 } 2654 }
2676out_err: 2655out_err:
@@ -3112,6 +3091,7 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
3112 end = ERR_PTR(-ENAMETOOLONG); 3091 end = ERR_PTR(-ENAMETOOLONG);
3113 return end; 3092 return end;
3114} 3093}
3094EXPORT_SYMBOL(simple_dname);
3115 3095
3116/* 3096/*
3117 * Write full pathname from the root of the filesystem into the buffer. 3097 * Write full pathname from the root of the filesystem into the buffer.
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 9c0444cccbe1..8c41b52da358 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
218 int err; 218 int err;
219 struct debugfs_fs_info *fsi = sb->s_fs_info; 219 struct debugfs_fs_info *fsi = sb->s_fs_info;
220 220
221 sync_filesystem(sb);
221 err = debugfs_parse_options(data, &fsi->mount_opts); 222 err = debugfs_parse_options(data, &fsi->mount_opts);
222 if (err) 223 if (err)
223 goto fail; 224 goto fail;
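Note (annotation): this is the first of several hunks in this merge adding the same line — each affected ->remount_fs instance (debugfs here; devpts, efs, and ext2 below) now begins with sync_filesystem(sb), so dirty data reaches the backing store before mount options change.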
@@ -358,7 +359,7 @@ exit:
358 * @name: a pointer to a string containing the name of the file to create. 359 * @name: a pointer to a string containing the name of the file to create.
359 * @mode: the permission that the file should have. 360 * @mode: the permission that the file should have.
360 * @parent: a pointer to the parent dentry for this file. This should be a 361 * @parent: a pointer to the parent dentry for this file. This should be a
361 * directory dentry if set. If this paramater is NULL, then the 362 * directory dentry if set. If this parameter is NULL, then the
362 * file will be created in the root of the debugfs filesystem. 363 * file will be created in the root of the debugfs filesystem.
363 * @data: a pointer to something that the caller will want to get to later 364 * @data: a pointer to something that the caller will want to get to later
364 * on. The inode.i_private pointer will point to this value on 365 * on. The inode.i_private pointer will point to this value on
@@ -400,7 +401,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
400 * @name: a pointer to a string containing the name of the directory to 401 * @name: a pointer to a string containing the name of the directory to
401 * create. 402 * create.
402 * @parent: a pointer to the parent dentry for this file. This should be a 403 * @parent: a pointer to the parent dentry for this file. This should be a
403 * directory dentry if set. If this paramater is NULL, then the 404 * directory dentry if set. If this parameter is NULL, then the
404 * directory will be created in the root of the debugfs filesystem. 405 * directory will be created in the root of the debugfs filesystem.
405 * 406 *
406 * This function creates a directory in debugfs with the given name. 407 * This function creates a directory in debugfs with the given name.
@@ -425,7 +426,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
425 * @name: a pointer to a string containing the name of the symbolic link to 426 * @name: a pointer to a string containing the name of the symbolic link to
426 * create. 427 * create.
427 * @parent: a pointer to the parent dentry for this symbolic link. This 428 * @parent: a pointer to the parent dentry for this symbolic link. This
428 * should be a directory dentry if set. If this paramater is NULL, 429 * should be a directory dentry if set. If this parameter is NULL,
429 * then the symbolic link will be created in the root of the debugfs 430 * then the symbolic link will be created in the root of the debugfs
430 * filesystem. 431 * filesystem.
431 * @target: a pointer to a string containing the path to the target of the 432 * @target: a pointer to a string containing the path to the target of the
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index a726b9f29cb7..c71038079b47 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -313,6 +313,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
313 struct pts_fs_info *fsi = DEVPTS_SB(sb); 313 struct pts_fs_info *fsi = DEVPTS_SB(sb);
314 struct pts_mount_opts *opts = &fsi->mount_opts; 314 struct pts_mount_opts *opts = &fsi->mount_opts;
315 315
316 sync_filesystem(sb);
316 err = parse_mount_options(data, PARSE_REMOUNT, opts); 317 err = parse_mount_options(data, PARSE_REMOUNT, opts);
317 318
318 /* 319 /*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 160a5489a939..31ba0935e32e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -664,7 +664,6 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
664 goto out; 664 goto out;
665 sector = start_sector << (sdio->blkbits - 9); 665 sector = start_sector << (sdio->blkbits - 9);
666 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); 666 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
667 nr_pages = min(nr_pages, BIO_MAX_PAGES);
668 BUG_ON(nr_pages <= 0); 667 BUG_ON(nr_pages <= 0);
669 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); 668 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
670 sdio->boundary = 0; 669 sdio->boundary = 0;
@@ -1194,13 +1193,19 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1194 } 1193 }
1195 1194
1196 /* 1195 /*
1197 * For file extending writes updating i_size before data 1196 * For file extending writes updating i_size before data writeouts
1198 * writeouts complete can expose uninitialized blocks. So 1197 * complete can expose uninitialized blocks in dumb filesystems.
1199 * even for AIO, we need to wait for i/o to complete before 1198 * In that case we need to wait for I/O completion even if asked
1200 * returning in this case. 1199 * for an asynchronous write.
1201 */ 1200 */
1202 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && 1201 if (is_sync_kiocb(iocb))
1203 (end > i_size_read(inode))); 1202 dio->is_async = false;
1203 else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
1204 (rw & WRITE) && end > i_size_read(inode))
1205 dio->is_async = false;
1206 else
1207 dio->is_async = true;
1208
1204 dio->inode = inode; 1209 dio->inode = inode;
1205 dio->rw = rw; 1210 dio->rw = rw;
1206 1211
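Note: the restructured is_async selection reduces to a small predicate. A sketch with the flag test passed in as a plain bool (DIO_ASYNC_EXTEND is the fs.h flag that lets a filesystem opt in to asynchronous size-extending writes):

#include <stdbool.h>

static bool dio_write_is_async(bool sync_kiocb, bool is_write,
                               bool extends_i_size, bool fs_handles_extend)
{
        if (sync_kiocb)
                return false;   /* caller waits anyway */
        if (is_write && extends_i_size && !fs_handles_extend)
                return false;   /* don't expose uninitialized blocks */
        return true;
}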
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 0e90f0c91b93..dcea1e37a1b7 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -14,6 +14,7 @@
14#include "dlm_internal.h" 14#include "dlm_internal.h"
15#include "lock.h" 15#include "lock.h"
16#include "user.h" 16#include "user.h"
17#include "ast.h"
17 18
18static uint64_t dlm_cb_seq; 19static uint64_t dlm_cb_seq;
19static DEFINE_SPINLOCK(dlm_cb_seq_spin); 20static DEFINE_SPINLOCK(dlm_cb_seq_spin);
@@ -308,6 +309,6 @@ void dlm_callback_resume(struct dlm_ls *ls)
308 mutex_unlock(&ls->ls_cb_mutex); 309 mutex_unlock(&ls->ls_cb_mutex);
309 310
310 if (count) 311 if (count)
311 log_debug(ls, "dlm_callback_resume %d", count); 312 log_rinfo(ls, "dlm_callback_resume %d", count);
312} 313}
313 314
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 278a75cda446..d975851a7e1e 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -68,7 +68,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
68 uint16_t namelen; 68 uint16_t namelen;
69 unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0; 69 unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0;
70 70
71 log_debug(ls, "dlm_recover_directory"); 71 log_rinfo(ls, "dlm_recover_directory");
72 72
73 if (dlm_no_directory(ls)) 73 if (dlm_no_directory(ls))
74 goto out_status; 74 goto out_status;
@@ -189,7 +189,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
189 error = 0; 189 error = 0;
190 dlm_set_recover_status(ls, DLM_RS_DIR); 190 dlm_set_recover_status(ls, DLM_RS_DIR);
191 191
192 log_debug(ls, "dlm_recover_directory %u in %u new", 192 log_rinfo(ls, "dlm_recover_directory %u in %u new",
193 count, count_add); 193 count, count_add);
194 out_free: 194 out_free:
195 kfree(last_name); 195 kfree(last_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index e7665c31f7b1..5eff6ea3e27f 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -65,6 +65,8 @@ struct dlm_mhandle;
65 printk(KERN_ERR "dlm: "fmt"\n" , ##args) 65 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
66#define log_error(ls, fmt, args...) \ 66#define log_error(ls, fmt, args...) \
67 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) 67 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
68#define log_rinfo(ls, fmt, args...) \
69 printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args);
68 70
69#define log_debug(ls, fmt, args...) \ 71#define log_debug(ls, fmt, args...) \
70do { \ 72do { \
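Note (annotation): the dlm hunks in this merge promote recovery-progress messages from log_debug() to the new log_rinfo(), so they land at KERN_INFO without debug logging enabled. As committed, the macro body ends in a stray semicolon, so "if (x) log_rinfo(...); else ..." expands to two statements and fails to compile. A more defensive form — log_rinfo_safe is a hypothetical name, not in the tree — would wrap the body:

#define log_rinfo_safe(ls, fmt, args...) \
do { \
        printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
} while (0)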
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e223a911a834..83f3d5520307 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -687,6 +687,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
687 log_error(ls, "find_rsb new from_other %d dir %d our %d %s", 687 log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
688 from_nodeid, dir_nodeid, our_nodeid, r->res_name); 688 from_nodeid, dir_nodeid, our_nodeid, r->res_name);
689 dlm_free_rsb(r); 689 dlm_free_rsb(r);
690 r = NULL;
690 error = -ENOTBLK; 691 error = -ENOTBLK;
691 goto out_unlock; 692 goto out_unlock;
692 } 693 }
@@ -5462,7 +5463,7 @@ void dlm_recover_purge(struct dlm_ls *ls)
5462 up_write(&ls->ls_root_sem); 5463 up_write(&ls->ls_root_sem);
5463 5464
5464 if (lkb_count) 5465 if (lkb_count)
5465 log_debug(ls, "dlm_recover_purge %u locks for %u nodes", 5466 log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes",
5466 lkb_count, nodes_count); 5467 lkb_count, nodes_count);
5467} 5468}
5468 5469
@@ -5536,7 +5537,7 @@ void dlm_recover_grant(struct dlm_ls *ls)
5536 } 5537 }
5537 5538
5538 if (lkb_count) 5539 if (lkb_count)
5539 log_debug(ls, "dlm_recover_grant %u locks on %u resources", 5540 log_rinfo(ls, "dlm_recover_grant %u locks on %u resources",
5540 lkb_count, rsb_count); 5541 lkb_count, rsb_count);
5541} 5542}
5542 5543
@@ -5695,7 +5696,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5695 put_rsb(r); 5696 put_rsb(r);
5696 out: 5697 out:
5697 if (error && error != -EEXIST) 5698 if (error && error != -EEXIST)
5698 log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", 5699 log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
5699 from_nodeid, remid, error); 5700 from_nodeid, remid, error);
5700 rl->rl_result = cpu_to_le32(error); 5701 rl->rl_result = cpu_to_le32(error);
5701 return error; 5702 return error;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d5abafd56a6d..04d6398c1f1c 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -190,7 +190,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
190 else 190 else
191 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); 191 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
192 192
193 log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving"); 193 log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving");
194 194
195 /* dlm_controld will see the uevent, do the necessary group management 195 /* dlm_controld will see the uevent, do the necessary group management
196 and then write to sysfs to wake us */ 196 and then write to sysfs to wake us */
@@ -198,7 +198,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
198 error = wait_event_interruptible(ls->ls_uevent_wait, 198 error = wait_event_interruptible(ls->ls_uevent_wait,
199 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); 199 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
200 200
201 log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result); 201 log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result);
202 202
203 if (error) 203 if (error)
204 goto out; 204 goto out;
@@ -640,7 +640,7 @@ static int new_lockspace(const char *name, const char *cluster,
640 640
641 dlm_create_debug_file(ls); 641 dlm_create_debug_file(ls);
642 642
643 log_debug(ls, "join complete"); 643 log_rinfo(ls, "join complete");
644 *lockspace = ls; 644 *lockspace = ls;
645 return 0; 645 return 0;
646 646
@@ -835,7 +835,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
835 dlm_clear_members(ls); 835 dlm_clear_members(ls);
836 dlm_clear_members_gone(ls); 836 dlm_clear_members_gone(ls);
837 kfree(ls->ls_node_array); 837 kfree(ls->ls_node_array);
838 log_debug(ls, "release_lockspace final free"); 838 log_rinfo(ls, "release_lockspace final free");
839 kobject_put(&ls->ls_kobj); 839 kobject_put(&ls->ls_kobj);
840 /* The ls structure will be freed when the kobject is done with */ 840 /* The ls structure will be freed when the kobject is done with */
841 841
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3190ca973dd6..1e5b45359509 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -424,7 +424,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
424} 424}
425 425
426/* Data available on socket or listen socket received a connect */ 426/* Data available on socket or listen socket received a connect */
427static void lowcomms_data_ready(struct sock *sk, int count_unused) 427static void lowcomms_data_ready(struct sock *sk)
428{ 428{
429 struct connection *con = sock2con(sk); 429 struct connection *con = sock2con(sk);
430 if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) 430 if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 476557b54921..9c47f1c14a8b 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -60,18 +60,15 @@ void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
60 60
61#define SLOT_DEBUG_LINE 128 61#define SLOT_DEBUG_LINE 128
62 62
63static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, 63static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
64 struct rcom_slot *ro0, struct dlm_slot *array, 64 struct rcom_slot *ro0, struct dlm_slot *array,
65 int array_size) 65 int array_size)
66{ 66{
67 char line[SLOT_DEBUG_LINE]; 67 char line[SLOT_DEBUG_LINE];
68 int len = SLOT_DEBUG_LINE - 1; 68 int len = SLOT_DEBUG_LINE - 1;
69 int pos = 0; 69 int pos = 0;
70 int ret, i; 70 int ret, i;
71 71
72 if (!dlm_config.ci_log_debug)
73 return;
74
75 memset(line, 0, sizeof(line)); 72 memset(line, 0, sizeof(line));
76 73
77 if (array) { 74 if (array) {
@@ -95,7 +92,7 @@ static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
95 } 92 }
96 } 93 }
97 94
98 log_debug(ls, "generation %u slots %d%s", gen, num_slots, line); 95 log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line);
99} 96}
100 97
101int dlm_slots_copy_in(struct dlm_ls *ls) 98int dlm_slots_copy_in(struct dlm_ls *ls)
@@ -129,7 +126,7 @@ int dlm_slots_copy_in(struct dlm_ls *ls)
129 ro->ro_slot = le16_to_cpu(ro->ro_slot); 126 ro->ro_slot = le16_to_cpu(ro->ro_slot);
130 } 127 }
131 128
132 log_debug_slots(ls, gen, num_slots, ro0, NULL, 0); 129 log_slots(ls, gen, num_slots, ro0, NULL, 0);
133 130
134 list_for_each_entry(memb, &ls->ls_nodes, list) { 131 list_for_each_entry(memb, &ls->ls_nodes, list) {
135 for (i = 0, ro = ro0; i < num_slots; i++, ro++) { 132 for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
@@ -274,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
274 271
275 gen++; 272 gen++;
276 273
277 log_debug_slots(ls, gen, num, NULL, array, array_size); 274 log_slots(ls, gen, num, NULL, array, array_size);
278 275
279 max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - 276 max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
280 sizeof(struct rcom_config)) / sizeof(struct rcom_slot); 277 sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
@@ -447,7 +444,7 @@ static int ping_members(struct dlm_ls *ls)
447 break; 444 break;
448 } 445 }
449 if (error) 446 if (error)
450 log_debug(ls, "ping_members aborted %d last nodeid %d", 447 log_rinfo(ls, "ping_members aborted %d last nodeid %d",
451 error, ls->ls_recover_nodeid); 448 error, ls->ls_recover_nodeid);
452 return error; 449 return error;
453} 450}
@@ -539,7 +536,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
539 count as a negative change so the "neg" recovery steps will happen */ 536 count as a negative change so the "neg" recovery steps will happen */
540 537
541 list_for_each_entry(memb, &ls->ls_nodes_gone, list) { 538 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
542 log_debug(ls, "prev removed member %d", memb->nodeid); 539 log_rinfo(ls, "prev removed member %d", memb->nodeid);
543 neg++; 540 neg++;
544 } 541 }
545 542
@@ -551,10 +548,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
551 continue; 548 continue;
552 549
553 if (!node) { 550 if (!node) {
554 log_debug(ls, "remove member %d", memb->nodeid); 551 log_rinfo(ls, "remove member %d", memb->nodeid);
555 } else { 552 } else {
556 /* removed and re-added */ 553 /* removed and re-added */
557 log_debug(ls, "remove member %d comm_seq %u %u", 554 log_rinfo(ls, "remove member %d comm_seq %u %u",
558 memb->nodeid, memb->comm_seq, node->comm_seq); 555 memb->nodeid, memb->comm_seq, node->comm_seq);
559 } 556 }
560 557
@@ -571,7 +568,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
571 if (dlm_is_member(ls, node->nodeid)) 568 if (dlm_is_member(ls, node->nodeid))
572 continue; 569 continue;
573 dlm_add_member(ls, node); 570 dlm_add_member(ls, node);
574 log_debug(ls, "add member %d", node->nodeid); 571 log_rinfo(ls, "add member %d", node->nodeid);
575 } 572 }
576 573
577 list_for_each_entry(memb, &ls->ls_nodes, list) { 574 list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -591,7 +588,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
591 complete(&ls->ls_members_done); 588 complete(&ls->ls_members_done);
592 } 589 }
593 590
594 log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); 591 log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
595 return error; 592 return error;
596} 593}
597 594
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index a6bc63f6e31b..eaea789bf97d 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -526,7 +526,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
526 int nodir = dlm_no_directory(ls); 526 int nodir = dlm_no_directory(ls);
527 int error; 527 int error;
528 528
529 log_debug(ls, "dlm_recover_masters"); 529 log_rinfo(ls, "dlm_recover_masters");
530 530
531 down_read(&ls->ls_root_sem); 531 down_read(&ls->ls_root_sem);
532 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 532 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
@@ -552,7 +552,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
552 } 552 }
553 up_read(&ls->ls_root_sem); 553 up_read(&ls->ls_root_sem);
554 554
555 log_debug(ls, "dlm_recover_masters %u of %u", count, total); 555 log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
556 556
557 error = dlm_wait_function(ls, &recover_idr_empty); 557 error = dlm_wait_function(ls, &recover_idr_empty);
558 out: 558 out:
@@ -685,7 +685,7 @@ int dlm_recover_locks(struct dlm_ls *ls)
685 } 685 }
686 up_read(&ls->ls_root_sem); 686 up_read(&ls->ls_root_sem);
687 687
688 log_debug(ls, "dlm_recover_locks %d out", count); 688 log_rinfo(ls, "dlm_recover_locks %d out", count);
689 689
690 error = dlm_wait_function(ls, &recover_list_empty); 690 error = dlm_wait_function(ls, &recover_list_empty);
691 out: 691 out:
@@ -883,7 +883,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
883 up_read(&ls->ls_root_sem); 883 up_read(&ls->ls_root_sem);
884 884
885 if (count) 885 if (count)
886 log_debug(ls, "dlm_recover_rsbs %d done", count); 886 log_rinfo(ls, "dlm_recover_rsbs %d done", count);
887} 887}
888 888
889/* Create a single list of all root rsb's to be used during recovery */ 889/* Create a single list of all root rsb's to be used during recovery */
@@ -950,6 +950,6 @@ void dlm_clear_toss(struct dlm_ls *ls)
950 } 950 }
951 951
952 if (count) 952 if (count)
953 log_debug(ls, "dlm_clear_toss %u done", count); 953 log_rinfo(ls, "dlm_clear_toss %u done", count);
954} 954}
955 955
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 32f9f8926ec3..6859b4bf971e 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -55,7 +55,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
55 unsigned long start; 55 unsigned long start;
56 int error, neg = 0; 56 int error, neg = 0;
57 57
58 log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq); 58 log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
59 59
60 mutex_lock(&ls->ls_recoverd_active); 60 mutex_lock(&ls->ls_recoverd_active);
61 61
@@ -76,7 +76,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
76 76
77 error = dlm_recover_members(ls, rv, &neg); 77 error = dlm_recover_members(ls, rv, &neg);
78 if (error) { 78 if (error) {
79 log_debug(ls, "dlm_recover_members error %d", error); 79 log_rinfo(ls, "dlm_recover_members error %d", error);
80 goto fail; 80 goto fail;
81 } 81 }
82 82
@@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
90 90
91 error = dlm_recover_members_wait(ls); 91 error = dlm_recover_members_wait(ls);
92 if (error) { 92 if (error) {
93 log_debug(ls, "dlm_recover_members_wait error %d", error); 93 log_rinfo(ls, "dlm_recover_members_wait error %d", error);
94 goto fail; 94 goto fail;
95 } 95 }
96 96
@@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
103 103
104 error = dlm_recover_directory(ls); 104 error = dlm_recover_directory(ls);
105 if (error) { 105 if (error) {
106 log_debug(ls, "dlm_recover_directory error %d", error); 106 log_rinfo(ls, "dlm_recover_directory error %d", error);
107 goto fail; 107 goto fail;
108 } 108 }
109 109
@@ -111,11 +111,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
111 111
112 error = dlm_recover_directory_wait(ls); 112 error = dlm_recover_directory_wait(ls);
113 if (error) { 113 if (error) {
114 log_debug(ls, "dlm_recover_directory_wait error %d", error); 114 log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
115 goto fail; 115 goto fail;
116 } 116 }
117 117
118 log_debug(ls, "dlm_recover_directory %u out %u messages", 118 log_rinfo(ls, "dlm_recover_directory %u out %u messages",
119 ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg); 119 ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
120 120
121 /* 121 /*
@@ -144,7 +144,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
144 144
145 error = dlm_recover_masters(ls); 145 error = dlm_recover_masters(ls);
146 if (error) { 146 if (error) {
147 log_debug(ls, "dlm_recover_masters error %d", error); 147 log_rinfo(ls, "dlm_recover_masters error %d", error);
148 goto fail; 148 goto fail;
149 } 149 }
150 150
@@ -154,7 +154,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
154 154
155 error = dlm_recover_locks(ls); 155 error = dlm_recover_locks(ls);
156 if (error) { 156 if (error) {
157 log_debug(ls, "dlm_recover_locks error %d", error); 157 log_rinfo(ls, "dlm_recover_locks error %d", error);
158 goto fail; 158 goto fail;
159 } 159 }
160 160
@@ -162,11 +162,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
162 162
163 error = dlm_recover_locks_wait(ls); 163 error = dlm_recover_locks_wait(ls);
164 if (error) { 164 if (error) {
165 log_debug(ls, "dlm_recover_locks_wait error %d", error); 165 log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
166 goto fail; 166 goto fail;
167 } 167 }
168 168
169 log_debug(ls, "dlm_recover_locks %u in", 169 log_rinfo(ls, "dlm_recover_locks %u in",
170 ls->ls_recover_locks_in); 170 ls->ls_recover_locks_in);
171 171
172 /* 172 /*
@@ -186,7 +186,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
186 186
187 error = dlm_recover_locks_wait(ls); 187 error = dlm_recover_locks_wait(ls);
188 if (error) { 188 if (error) {
189 log_debug(ls, "dlm_recover_locks_wait error %d", error); 189 log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
190 goto fail; 190 goto fail;
191 } 191 }
192 } 192 }
@@ -205,7 +205,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
205 205
206 error = dlm_recover_done_wait(ls); 206 error = dlm_recover_done_wait(ls);
207 if (error) { 207 if (error) {
208 log_debug(ls, "dlm_recover_done_wait error %d", error); 208 log_rinfo(ls, "dlm_recover_done_wait error %d", error);
209 goto fail; 209 goto fail;
210 } 210 }
211 211
@@ -217,25 +217,25 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
217 217
218 error = enable_locking(ls, rv->seq); 218 error = enable_locking(ls, rv->seq);
219 if (error) { 219 if (error) {
220 log_debug(ls, "enable_locking error %d", error); 220 log_rinfo(ls, "enable_locking error %d", error);
221 goto fail; 221 goto fail;
222 } 222 }
223 223
224 error = dlm_process_requestqueue(ls); 224 error = dlm_process_requestqueue(ls);
225 if (error) { 225 if (error) {
226 log_debug(ls, "dlm_process_requestqueue error %d", error); 226 log_rinfo(ls, "dlm_process_requestqueue error %d", error);
227 goto fail; 227 goto fail;
228 } 228 }
229 229
230 error = dlm_recover_waiters_post(ls); 230 error = dlm_recover_waiters_post(ls);
231 if (error) { 231 if (error) {
232 log_debug(ls, "dlm_recover_waiters_post error %d", error); 232 log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
233 goto fail; 233 goto fail;
234 } 234 }
235 235
236 dlm_recover_grant(ls); 236 dlm_recover_grant(ls);
237 237
238 log_debug(ls, "dlm_recover %llu generation %u done: %u ms", 238 log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
239 (unsigned long long)rv->seq, ls->ls_generation, 239 (unsigned long long)rv->seq, ls->ls_generation,
240 jiffies_to_msecs(jiffies - start)); 240 jiffies_to_msecs(jiffies - start));
241 mutex_unlock(&ls->ls_recoverd_active); 241 mutex_unlock(&ls->ls_recoverd_active);
@@ -245,7 +245,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
245 245
246 fail: 246 fail:
247 dlm_release_root_list(ls); 247 dlm_release_root_list(ls);
248 log_debug(ls, "dlm_recover %llu error %d", 248 log_rinfo(ls, "dlm_recover %llu error %d",
249 (unsigned long long)rv->seq, error); 249 (unsigned long long)rv->seq, error);
250 mutex_unlock(&ls->ls_recoverd_active); 250 mutex_unlock(&ls->ls_recoverd_active);
251 return error; 251 return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 9fd702f5bfb2..9280202e488c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -59,10 +59,22 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
59 if (ret) 59 if (ret)
60 return ret; 60 return ret;
61 if (write) { 61 if (write) {
62 if (sysctl_drop_caches & 1) 62 static int stfu;
63
64 if (sysctl_drop_caches & 1) {
63 iterate_supers(drop_pagecache_sb, NULL); 65 iterate_supers(drop_pagecache_sb, NULL);
64 if (sysctl_drop_caches & 2) 66 count_vm_event(DROP_PAGECACHE);
67 }
68 if (sysctl_drop_caches & 2) {
65 drop_slab(); 69 drop_slab();
70 count_vm_event(DROP_SLAB);
71 }
72 if (!stfu) {
73 pr_info("%s (%d): drop_caches: %d\n",
74 current->comm, task_pid_nr(current),
75 sysctl_drop_caches);
76 }
77 stfu |= sysctl_drop_caches & 4;
66 } 78 }
67 return 0; 79 return 0;
68} 80}
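Note (annotation): after this change the sysctl bits read — 1 drops the page cache, 2 drops slab objects, 3 does both, and OR'ing in 4 latches the stfu flag so the pr_info() line is suppressed on later writes (the first write still logs once). Each action now also bumps a vmstat counter (DROP_PAGECACHE / DROP_SLAB).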
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index b167ca48b8ee..d4a9431ec73c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -641,7 +641,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
641 } 641 }
642 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, 642 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
643 lower_new_dir_dentry->d_inode, lower_new_dentry, 643 lower_new_dir_dentry->d_inode, lower_new_dentry,
644 NULL); 644 NULL, 0);
645 if (rc) 645 if (rc)
646 goto out_lock; 646 goto out_lock;
647 if (target_inode) 647 if (target_inode)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index e879cf8ff0b1..afa1b81c3418 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -132,7 +132,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
132 */ 132 */
133static void ecryptfs_evict_inode(struct inode *inode) 133static void ecryptfs_evict_inode(struct inode *inode)
134{ 134{
135 truncate_inode_pages(&inode->i_data, 0); 135 truncate_inode_pages_final(&inode->i_data);
136 clear_inode(inode); 136 clear_inode(inode);
137 iput(ecryptfs_inode_to_lower(inode)); 137 iput(ecryptfs_inode_to_lower(inode));
138} 138}
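Note (annotation): truncate_inode_pages_final() is the new helper for the last truncate in ->evict_inode(); besides emptying the mapping, it marks the address_space as exiting so pages cannot sneak back in while the inode is torn down. The same substitution appears in the exofs and ext2 hunks below.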
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 8dd524f32284..cdb2971192a5 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -21,7 +21,7 @@ static ssize_t efivarfs_file_write(struct file *file,
21 u32 attributes; 21 u32 attributes;
22 struct inode *inode = file->f_mapping->host; 22 struct inode *inode = file->f_mapping->host;
23 unsigned long datasize = count - sizeof(attributes); 23 unsigned long datasize = count - sizeof(attributes);
24 ssize_t bytes = 0; 24 ssize_t bytes;
25 bool set = false; 25 bool set = false;
26 26
27 if (count < sizeof(attributes)) 27 if (count < sizeof(attributes))
@@ -33,14 +33,9 @@ static ssize_t efivarfs_file_write(struct file *file,
33 if (attributes & ~(EFI_VARIABLE_MASK)) 33 if (attributes & ~(EFI_VARIABLE_MASK))
34 return -EINVAL; 34 return -EINVAL;
35 35
36 data = kmalloc(datasize, GFP_KERNEL); 36 data = memdup_user(userbuf + sizeof(attributes), datasize);
37 if (!data) 37 if (IS_ERR(data))
38 return -ENOMEM; 38 return PTR_ERR(data);
39
40 if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) {
41 bytes = -EFAULT;
42 goto out;
43 }
44 39
45 bytes = efivar_entry_set_get_size(var, attributes, &datasize, 40 bytes = efivar_entry_set_get_size(var, attributes, &datasize,
46 data, &set); 41 data, &set);
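Note: memdup_user() collapses the kmalloc()+copy_from_user() pair this hunk removes; on failure it returns an ERR_PTR(), so callers test IS_ERR() rather than NULL. The general shape of the idiom (sketch, hypothetical caller):

static ssize_t consume_user_buf(const void __user *userbuf, size_t len)
{
        void *buf = memdup_user(userbuf, len);

        if (IS_ERR(buf))
                return PTR_ERR(buf);
        /* ... operate on buf ... */
        kfree(buf);
        return len;
}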
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 50215bbd6463..3befcc9f5d63 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -91,7 +91,7 @@ static void init_once(void *foo)
91 inode_init_once(&ei->vfs_inode); 91 inode_init_once(&ei->vfs_inode);
92} 92}
93 93
94static int init_inodecache(void) 94static int __init init_inodecache(void)
95{ 95{
96 efs_inode_cachep = kmem_cache_create("efs_inode_cache", 96 efs_inode_cachep = kmem_cache_create("efs_inode_cache",
97 sizeof(struct efs_inode_info), 97 sizeof(struct efs_inode_info),
@@ -114,6 +114,7 @@ static void destroy_inodecache(void)
114 114
115static int efs_remount(struct super_block *sb, int *flags, char *data) 115static int efs_remount(struct super_block *sb, int *flags, char *data)
116{ 116{
117 sync_filesystem(sb);
117 *flags |= MS_RDONLY; 118 *flags |= MS_RDONLY;
118 return 0; 119 return 0;
119} 120}
diff --git a/fs/exec.c b/fs/exec.c
index 3d78fccdd723..238b7aa26f68 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/fdtable.h> 27#include <linux/fdtable.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/vmacache.h>
29#include <linux/stat.h> 30#include <linux/stat.h>
30#include <linux/fcntl.h> 31#include <linux/fcntl.h>
31#include <linux/swap.h> 32#include <linux/swap.h>
@@ -97,6 +98,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
97 module_put(fmt->module); 98 module_put(fmt->module);
98} 99}
99 100
101#ifdef CONFIG_USELIB
100/* 102/*
101 * Note that a shared library must be both readable and executable due to 103 * Note that a shared library must be both readable and executable due to
102 * security reasons. 104 * security reasons.
@@ -156,6 +158,7 @@ exit:
156out: 158out:
157 return error; 159 return error;
158} 160}
161#endif /* #ifdef CONFIG_USELIB */
159 162
160#ifdef CONFIG_MMU 163#ifdef CONFIG_MMU
161/* 164/*
@@ -654,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm,
654 unsigned long rlim_stack; 657 unsigned long rlim_stack;
655 658
656#ifdef CONFIG_STACK_GROWSUP 659#ifdef CONFIG_STACK_GROWSUP
657 /* Limit stack size to 1GB */ 660 /* Limit stack size */
658 stack_base = rlimit_max(RLIMIT_STACK); 661 stack_base = rlimit_max(RLIMIT_STACK);
659 if (stack_base > (1 << 30)) 662 if (stack_base > STACK_SIZE_MAX)
660 stack_base = 1 << 30; 663 stack_base = STACK_SIZE_MAX;
661 664
662 /* Make sure we didn't let the argument array grow too large. */ 665 /* Make sure we didn't let the argument array grow too large. */
663 if (vma->vm_end - vma->vm_start > stack_base) 666 if (vma->vm_end - vma->vm_start > stack_base)
@@ -810,7 +813,7 @@ EXPORT_SYMBOL(kernel_read);
810 813
811ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) 814ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
812{ 815{
813 ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); 816 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
814 if (res > 0) 817 if (res > 0)
815 flush_icache_range(addr, addr + len); 818 flush_icache_range(addr, addr + len);
816 return res; 819 return res;
@@ -820,7 +823,7 @@ EXPORT_SYMBOL(read_code);
820static int exec_mmap(struct mm_struct *mm) 823static int exec_mmap(struct mm_struct *mm)
821{ 824{
822 struct task_struct *tsk; 825 struct task_struct *tsk;
823 struct mm_struct * old_mm, *active_mm; 826 struct mm_struct *old_mm, *active_mm;
824 827
825 /* Notify parent that we're no longer interested in the old VM */ 828 /* Notify parent that we're no longer interested in the old VM */
826 tsk = current; 829 tsk = current;
@@ -846,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm)
846 tsk->mm = mm; 849 tsk->mm = mm;
847 tsk->active_mm = mm; 850 tsk->active_mm = mm;
848 activate_mm(active_mm, mm); 851 activate_mm(active_mm, mm);
852 tsk->mm->vmacache_seqnum = 0;
853 vmacache_flush(tsk);
849 task_unlock(tsk); 854 task_unlock(tsk);
850 if (old_mm) { 855 if (old_mm) {
851 up_read(&old_mm->mmap_sem); 856 up_read(&old_mm->mmap_sem);
@@ -1041,7 +1046,7 @@ EXPORT_SYMBOL_GPL(get_task_comm);
1041 * so that a new one can be started 1046 * so that a new one can be started
1042 */ 1047 */
1043 1048
1044void set_task_comm(struct task_struct *tsk, char *buf) 1049void set_task_comm(struct task_struct *tsk, const char *buf)
1045{ 1050{
1046 task_lock(tsk); 1051 task_lock(tsk);
1047 trace_task_rename(tsk, buf); 1052 trace_task_rename(tsk, buf);
@@ -1050,21 +1055,6 @@ void set_task_comm(struct task_struct *tsk, char *buf)
1050 perf_event_comm(tsk); 1055 perf_event_comm(tsk);
1051} 1056}
1052 1057
1053static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
1054{
1055 int i, ch;
1056
1057 /* Copies the binary name from after last slash */
1058 for (i = 0; (ch = *(fn++)) != '\0';) {
1059 if (ch == '/')
1060 i = 0; /* overwrite what we wrote */
1061 else
1062 if (i < len - 1)
1063 tcomm[i++] = ch;
1064 }
1065 tcomm[i] = '\0';
1066}
1067
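Note: the removed filename_to_taskname() open-coded a bounded basename copy; setup_new_exec() now passes kbasename(bprm->filename) straight through (see the hunk below). kbasename() returns the component after the last '/', or the whole string when there is none — a user-space sketch of the same contract:

#include <string.h>

static const char *kbasename_sketch(const char *path)
{
        const char *tail = strrchr(path, '/');

        return tail ? tail + 1 : path;
}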
1068int flush_old_exec(struct linux_binprm * bprm) 1058int flush_old_exec(struct linux_binprm * bprm)
1069{ 1059{
1070 int retval; 1060 int retval;
@@ -1078,8 +1068,6 @@ int flush_old_exec(struct linux_binprm * bprm)
1078 goto out; 1068 goto out;
1079 1069
1080 set_mm_exe_file(bprm->mm, bprm->file); 1070 set_mm_exe_file(bprm->mm, bprm->file);
1081
1082 filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
1083 /* 1071 /*
1084 * Release all of the old mmap stuff 1072 * Release all of the old mmap stuff
1085 */ 1073 */
@@ -1122,7 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm)
1122 else 1110 else
1123 set_dumpable(current->mm, suid_dumpable); 1111 set_dumpable(current->mm, suid_dumpable);
1124 1112
1125 set_task_comm(current, bprm->tcomm); 1113 set_task_comm(current, kbasename(bprm->filename));
1126 1114
1127 /* Set the new mm task size. We have to do that late because it may 1115 /* Set the new mm task size. We have to do that late because it may
1128 * depend on TIF_32BIT which is only updated in flush_thread() on 1116 * depend on TIF_32BIT which is only updated in flush_thread() on
@@ -1619,9 +1607,9 @@ SYSCALL_DEFINE3(execve,
1619 return do_execve(getname(filename), argv, envp); 1607 return do_execve(getname(filename), argv, envp);
1620} 1608}
1621#ifdef CONFIG_COMPAT 1609#ifdef CONFIG_COMPAT
1622asmlinkage long compat_sys_execve(const char __user * filename, 1610COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1623 const compat_uptr_t __user * argv, 1611 const compat_uptr_t __user *, argv,
1624 const compat_uptr_t __user * envp) 1612 const compat_uptr_t __user *, envp)
1625{ 1613{
1626 return compat_do_execve(getname(filename), argv, envp); 1614 return compat_do_execve(getname(filename), argv, envp);
1627} 1615}
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ee4317faccb1..d1c244d67667 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1486,7 +1486,7 @@ void exofs_evict_inode(struct inode *inode)
1486 struct ore_io_state *ios; 1486 struct ore_io_state *ios;
1487 int ret; 1487 int ret;
1488 1488
1489 truncate_inode_pages(&inode->i_data, 0); 1489 truncate_inode_pages_final(&inode->i_data);
1490 1490
1491 /* TODO: should do better here */ 1491 /* TODO: should do better here */
1492 if (inode->i_nlink || is_bad_inode(inode)) 1492 if (inode->i_nlink || is_bad_inode(inode))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 7682b970d0f1..4e2c032ab8a1 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -21,12 +21,12 @@
21#undef ORE_DBGMSG2 21#undef ORE_DBGMSG2
22#define ORE_DBGMSG2 ORE_DBGMSG 22#define ORE_DBGMSG2 ORE_DBGMSG
23 23
24struct page *_raid_page_alloc(void) 24static struct page *_raid_page_alloc(void)
25{ 25{
26 return alloc_page(GFP_KERNEL); 26 return alloc_page(GFP_KERNEL);
27} 27}
28 28
29void _raid_page_free(struct page *p) 29static void _raid_page_free(struct page *p)
30{ 30{
31 __free_page(p); 31 __free_page(p);
32} 32}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9d9763328734..ed73ed8ebbee 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -543,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
543 return !(odi->systemid_len || odi->osdname_len); 543 return !(odi->systemid_len || odi->osdname_len);
544} 544}
545 545
546int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, 546static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
547 struct exofs_dev **peds) 547 struct exofs_dev **peds)
548{ 548{
549 struct __alloc_ore_devs_and_exofs_devs { 549 struct __alloc_ore_devs_and_exofs_devs {
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 1b8001bbe947..27695e6f4e46 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> 4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */ 5 */
6 6
7#include <linux/capability.h>
8#include <linux/init.h> 7#include <linux/init.h>
9#include <linux/sched.h> 8#include <linux/sched.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7cadd823bb31..7d66fb0e4cca 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -284,7 +284,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
284 int best_ndir = inodes_per_group; 284 int best_ndir = inodes_per_group;
285 int best_group = -1; 285 int best_group = -1;
286 286
287 get_random_bytes(&group, sizeof(group)); 287 group = prandom_u32();
288 parent_group = (unsigned)group % ngroups; 288 parent_group = (unsigned)group % ngroups;
289 for (i = 0; i < ngroups; i++) { 289 for (i = 0; i < ngroups; i++) {
290 group = (parent_group + i) % ngroups; 290 group = (parent_group + i) % ngroups;
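Note (annotation): picking the starting block group for Orlov allocation only needs load spreading, not cryptographic strength, so get_random_bytes() (which draws on the CSPRNG) is replaced by the fast prandom_u32(). The identical change lands in ext3's find_group_orlov() below.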
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 94ed36849b71..b1d2a4675d42 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -78,7 +78,7 @@ void ext2_evict_inode(struct inode * inode)
78 dquot_drop(inode); 78 dquot_drop(inode);
79 } 79 }
80 80
81 truncate_inode_pages(&inode->i_data, 0); 81 truncate_inode_pages_final(&inode->i_data);
82 82
83 if (want_delete) { 83 if (want_delete) {
84 sb_start_intwrite(inode->i_sb); 84 sb_start_intwrite(inode->i_sb);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 20d6697bd638..3750031cfa2f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -192,7 +192,7 @@ static void init_once(void *foo)
192 inode_init_once(&ei->vfs_inode); 192 inode_init_once(&ei->vfs_inode);
193} 193}
194 194
195static int init_inodecache(void) 195static int __init init_inodecache(void)
196{ 196{
197 ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", 197 ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
198 sizeof(struct ext2_inode_info), 198 sizeof(struct ext2_inode_info),
@@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1254 unsigned long old_sb_flags; 1254 unsigned long old_sb_flags;
1255 int err; 1255 int err;
1256 1256
1257 sync_filesystem(sb);
1257 spin_lock(&sbi->s_lock); 1258 spin_lock(&sbi->s_lock);
1258 1259
1259 /* Store the old options */ 1260 /* Store the old options */
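The sync_filesystem(sb) call added above means every ext2 remount now begins from a fully written-back filesystem before any options change. A minimal sketch of what drives that path from userspace, assuming /mnt is an ext2 mount the caller owns:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Remount read-only; the kernel now syncs the filesystem first. */
        if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL) != 0) {
                perror("mount");
                return 1;
        }
        return 0;
}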
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index cfedb2cb0d8c..c0ebc4db8849 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -42,8 +42,8 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
42 value, size, flags); 42 value, size, flags);
43} 43}
44 44
45int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, 45static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
46 void *fs_info) 46 void *fs_info)
47{ 47{
48 const struct xattr *xattr; 48 const struct xattr *xattr;
49 int err = 0; 49 int err = 0;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 22548f56197b..158b5d4ce067 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1727,10 +1727,7 @@ allocated:
1727 percpu_counter_sub(&sbi->s_freeblocks_counter, num); 1727 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1728 1728
1729 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); 1729 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1730 err = ext3_journal_dirty_metadata(handle, gdp_bh); 1730 fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
1731 if (!fatal)
1732 fatal = err;
1733
1734 if (fatal) 1731 if (fatal)
1735 goto out; 1732 goto out;
1736 1733
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e66e4808719f..17742eed2c16 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -275,7 +275,7 @@ static inline loff_t ext3_get_htree_eof(struct file *filp)
275 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) 275 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
276 * will be invalid once the directory was converted into a dx directory 276 * will be invalid once the directory was converted into a dx directory
277 */ 277 */
278loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) 278static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
279{ 279{
280 struct inode *inode = file->f_mapping->host; 280 struct inode *inode = file->f_mapping->host;
281 int dx_dir = is_dx_dir(inode); 281 int dx_dir = is_dx_dir(inode);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 082afd78b107..a1b810230cc5 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -215,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
215 int best_ndir = inodes_per_group; 215 int best_ndir = inodes_per_group;
216 int best_group = -1; 216 int best_group = -1;
217 217
218 get_random_bytes(&group, sizeof(group)); 218 group = prandom_u32();
219 parent_group = (unsigned)group % ngroups; 219 parent_group = (unsigned)group % ngroups;
220 for (i = 0; i < ngroups; i++) { 220 for (i = 0; i < ngroups; i++) {
221 group = (parent_group + i) % ngroups; 221 group = (parent_group + i) % ngroups;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 384b6ebb655f..f5157d0d1b43 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -228,7 +228,7 @@ void ext3_evict_inode (struct inode *inode)
228 log_wait_commit(journal, commit_tid); 228 log_wait_commit(journal, commit_tid);
229 filemap_write_and_wait(&inode->i_data); 229 filemap_write_and_wait(&inode->i_data);
230 } 230 }
231 truncate_inode_pages(&inode->i_data, 0); 231 truncate_inode_pages_final(&inode->i_data);
232 232
233 ext3_discard_reservation(inode); 233 ext3_discard_reservation(inode);
234 rsv = ei->i_block_alloc_info; 234 rsv = ei->i_block_alloc_info;
@@ -1559,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
1559} 1559}
1560 1560
1561/* 1561/*
1562 * Note that we always start a transaction even if we're not journalling 1562 * Note that whenever we need to map blocks we start a transaction even if
1563 * data. This is to preserve ordering: any hole instantiation within 1563 * we're not journalling data. This is to preserve ordering: any hole
1564 * __block_write_full_page -> ext3_get_block() should be journalled 1564 * instantiation within __block_write_full_page -> ext3_get_block() should be
1565 * along with the data so we don't crash and then get metadata which 1565 * journalled along with the data so we don't crash and then get metadata which
1566 * refers to old data. 1566 * refers to old data.
1567 * 1567 *
1568 * In all journalling modes block_write_full_page() will start the I/O. 1568 * In all journalling modes block_write_full_page() will start the I/O.
1569 * 1569 *
1570 * Problem:
1571 *
1572 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1573 * ext3_writepage()
1574 *
1575 * Similar for:
1576 *
1577 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1578 *
1579 * Same applies to ext3_get_block(). We will deadlock on various things like
1580 * lock_journal and i_truncate_mutex.
1581 *
1582 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1583 * allocations fail.
1584 *
1585 * 16May01: If we're reentered then journal_current_handle() will be
1586 * non-zero. We simply *return*.
1587 *
1588 * 1 July 2001: @@@ FIXME:
1589 * In journalled data mode, a data buffer may be metadata against the
1590 * current transaction. But the same file is part of a shared mapping
1591 * and someone does a writepage() on it.
1592 *
1593 * We will move the buffer onto the async_data list, but *after* it has
1594 * been dirtied. So there's a small window where we have dirty data on
1595 * BJ_Metadata.
1596 *
1597 * Note that this only applies to the last partial page in the file. The
1598 * bit which block_write_full_page() uses prepare/commit for. (That's
1599 * broken code anyway: it's wrong for msync()).
1600 *
1601 * It's a rare case: affects the final partial page, for journalled data
1602 * where the file is subject to bith write() and writepage() in the same
1603 * transction. To fix it we'll need a custom block_write_full_page().
1604 * We'll probably need that anyway for journalling writepage() output.
1605 *
1606 * We don't honour synchronous mounts for writepage(). That would be 1570 * We don't honour synchronous mounts for writepage(). That would be
1607 * disastrous. Any write() or metadata operation will sync the fs for 1571 * disastrous. Any write() or metadata operation will sync the fs for
1608 * us. 1572 * us.
1609 *
1610 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1611 * we don't need to open a transaction here.
1612 */ 1573 */
1613static int ext3_ordered_writepage(struct page *page, 1574static int ext3_ordered_writepage(struct page *page,
1614 struct writeback_control *wbc) 1575 struct writeback_control *wbc)
@@ -1673,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page,
1673 * block_write_full_page() succeeded. Otherwise they are unmapped, 1634 * block_write_full_page() succeeded. Otherwise they are unmapped,
1674 * and generally junk. 1635 * and generally junk.
1675 */ 1636 */
1676 if (ret == 0) { 1637 if (ret == 0)
1677 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, 1638 ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1678 NULL, journal_dirty_data_fn); 1639 NULL, journal_dirty_data_fn);
1679 if (!ret)
1680 ret = err;
1681 }
1682 walk_page_buffers(handle, page_bufs, 0, 1640 walk_page_buffers(handle, page_bufs, 0,
1683 PAGE_CACHE_SIZE, NULL, bput_one); 1641 PAGE_CACHE_SIZE, NULL, bput_one);
1684 err = ext3_journal_stop(handle); 1642 err = ext3_journal_stop(handle);
@@ -1925,6 +1883,8 @@ retry:
1925 * and pretend the write failed... */ 1883 * and pretend the write failed... */
1926 ext3_truncate_failed_direct_write(inode); 1884 ext3_truncate_failed_direct_write(inode);
1927 ret = PTR_ERR(handle); 1885 ret = PTR_ERR(handle);
1886 if (inode->i_nlink)
1887 ext3_orphan_del(NULL, inode);
1928 goto out; 1888 goto out;
1929 } 1889 }
1930 if (inode->i_nlink) 1890 if (inode->i_nlink)
@@ -3212,21 +3172,20 @@ out_brelse:
3212 * 3172 *
3213 * We are called from a few places: 3173 * We are called from a few places:
3214 * 3174 *
3215 * - Within generic_file_write() for O_SYNC files. 3175 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
3216 * Here, there will be no transaction running. We wait for any running 3176 * Here, there will be no transaction running. We wait for any running
3217 * transaction to commit. 3177 * transaction to commit.
3218 * 3178 *
3219 * - Within sys_sync(), kupdate and such. 3179 * - Within flush work (for sys_sync(), kupdate and such).
3220 * We wait on commit, if tol to. 3180 * We wait on commit, if told to.
3221 * 3181 *
3222 * - Within prune_icache() (PF_MEMALLOC == true) 3182 * - Within iput_final() -> write_inode_now()
3223 * Here we simply return. We can't afford to block kswapd on the 3183 * We wait on commit, if told to.
3224 * journal commit.
3225 * 3184 *
3226 * In all cases it is actually safe for us to return without doing anything, 3185 * In all cases it is actually safe for us to return without doing anything,
3227 * because the inode has been copied into a raw inode buffer in 3186 * because the inode has been copied into a raw inode buffer in
3228 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 3187 * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
3229 * knfsd. 3188 * writeback.
3230 * 3189 *
3231 * Note that we are absolutely dependent upon all inode dirtiers doing the 3190 * Note that we are absolutely dependent upon all inode dirtiers doing the
3232 * right thing: they *must* call mark_inode_dirty() after dirtying info in 3191 * right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3238,13 +3197,13 @@ out_brelse:
3238 * stuff(); 3197 * stuff();
3239 * inode->i_size = expr; 3198 * inode->i_size = expr;
3240 * 3199 *
3241 * is in error because a kswapd-driven write_inode() could occur while 3200 * is in error because write_inode() could occur while `stuff()' is running,
3242 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3201 * and the new i_size will be lost. Plus the inode will no longer be on the
3243 * will no longer be on the superblock's dirty inode list. 3202 * superblock's dirty inode list.
3244 */ 3203 */
3245int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) 3204int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3246{ 3205{
3247 if (current->flags & PF_MEMALLOC) 3206 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
3248 return 0; 3207 return 0;
3249 3208
3250 if (ext3_journal_current_handle()) { 3209 if (ext3_journal_current_handle()) {
@@ -3253,7 +3212,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3253 return -EIO; 3212 return -EIO;
3254 } 3213 }
3255 3214
3256 if (wbc->sync_mode != WB_SYNC_ALL) 3215 /*
3216 * No need to force transaction in WB_SYNC_NONE mode. Also
3217 * ext3_sync_fs() will force the commit after everything is
3218 * written.
3219 */
3220 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
3257 return 0; 3221 return 0;
3258 3222
3259 return ext3_force_commit(inode->i_sb); 3223 return ext3_force_commit(inode->i_sb);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 37fd31ed16e7..08cdfe5461e3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -527,7 +527,7 @@ static void init_once(void *foo)
527 inode_init_once(&ei->vfs_inode); 527 inode_init_once(&ei->vfs_inode);
528} 528}
529 529
530static int init_inodecache(void) 530static int __init init_inodecache(void)
531{ 531{
532 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", 532 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
533 sizeof(struct ext3_inode_info), 533 sizeof(struct ext3_inode_info),
@@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2649 int i; 2649 int i;
2650#endif 2650#endif
2651 2651
2652 sync_filesystem(sb);
2653
2652 /* Store the original options */ 2654 /* Store the original options */
2653 old_sb_flags = sb->s_flags; 2655 old_sb_flags = sb->s_flags;
2654 old_opts.s_mount_opt = sbi->s_mount_opt; 2656 old_opts.s_mount_opt = sbi->s_mount_opt;
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3387664ad70e..722c2bf9645d 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -43,8 +43,9 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
43 name, value, size, flags); 43 name, value, size, flags);
44} 44}
45 45
46int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, 46static int ext3_initxattrs(struct inode *inode,
47 void *fs_info) 47 const struct xattr *xattr_array,
48 void *fs_info)
48{ 49{
49 const struct xattr *xattr; 50 const struct xattr *xattr;
50 handle_t *handle = fs_info; 51 handle_t *handle = fs_info;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 6ea7b1436bbc..5c56785007e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -667,7 +667,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
667 continue; 667 continue;
668 668
669 x = ext4_count_free(bitmap_bh->b_data, 669 x = ext4_count_free(bitmap_bh->b_data,
670 EXT4_BLOCKS_PER_GROUP(sb) / 8); 670 EXT4_CLUSTERS_PER_GROUP(sb) / 8);
671 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", 671 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
672 i, ext4_free_group_clusters(sb, gdp), x); 672 i, ext4_free_group_clusters(sb, gdp), x);
673 bitmap_count += x; 673 bitmap_count += x;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d3a534fdc5ff..66946aa62127 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -31,6 +31,7 @@
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#include <linux/ratelimit.h> 32#include <linux/ratelimit.h>
33#include <crypto/hash.h> 33#include <crypto/hash.h>
34#include <linux/falloc.h>
34#ifdef __KERNEL__ 35#ifdef __KERNEL__
35#include <linux/compat.h> 36#include <linux/compat.h>
36#endif 37#endif
@@ -567,6 +568,8 @@ enum {
567#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 568#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
568 /* Do not put hole in extent cache */ 569 /* Do not put hole in extent cache */
569#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 570#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
571 /* Convert written extents to unwritten */
572#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
570 573
571/* 574/*
572 * The bit position of these flags must not overlap with any of the 575 * The bit position of these flags must not overlap with any of the
@@ -998,6 +1001,8 @@ struct ext4_inode_info {
998#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group 1001#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
999 size of blocksize * 8 1002 size of blocksize * 8
1000 blocks */ 1003 blocks */
1004#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
1005 file systems */
1001 1006
1002#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 1007#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
1003 ~EXT4_MOUNT_##opt 1008 ~EXT4_MOUNT_##opt
@@ -1326,6 +1331,7 @@ struct ext4_sb_info {
1326 struct list_head s_es_lru; 1331 struct list_head s_es_lru;
1327 unsigned long s_es_last_sorted; 1332 unsigned long s_es_last_sorted;
1328 struct percpu_counter s_extent_cache_cnt; 1333 struct percpu_counter s_extent_cache_cnt;
1334 struct mb_cache *s_mb_cache;
1329 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1335 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
1330 1336
1331 /* Ratelimit ext4 messages. */ 1337 /* Ratelimit ext4 messages. */
@@ -2133,8 +2139,6 @@ extern int ext4_writepage_trans_blocks(struct inode *);
2133extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 2139extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
2134extern int ext4_block_truncate_page(handle_t *handle, 2140extern int ext4_block_truncate_page(handle_t *handle,
2135 struct address_space *mapping, loff_t from); 2141 struct address_space *mapping, loff_t from);
2136extern int ext4_block_zero_page_range(handle_t *handle,
2137 struct address_space *mapping, loff_t from, loff_t length);
2138extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 2142extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
2139 loff_t lstart, loff_t lend); 2143 loff_t lstart, loff_t lend);
2140extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2144extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2462,23 +2466,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
2462 up_write(&EXT4_I(inode)->i_data_sem); 2466 up_write(&EXT4_I(inode)->i_data_sem);
2463} 2467}
2464 2468
2465/*
2466 * Update i_disksize after writeback has been started. Races with truncate
2467 * are avoided by checking i_size under i_data_sem.
2468 */
2469static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
2470{
2471 loff_t i_size;
2472
2473 down_write(&EXT4_I(inode)->i_data_sem);
2474 i_size = i_size_read(inode);
2475 if (newsize > i_size)
2476 newsize = i_size;
2477 if (newsize > EXT4_I(inode)->i_disksize)
2478 EXT4_I(inode)->i_disksize = newsize;
2479 up_write(&EXT4_I(inode)->i_data_sem);
2480}
2481
2482struct ext4_group_info { 2469struct ext4_group_info {
2483 unsigned long bb_state; 2470 unsigned long bb_state;
2484 struct rb_root bb_free_root; 2471 struct rb_root bb_free_root;
@@ -2757,6 +2744,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
2757extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2744extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2758 __u64 start, __u64 len); 2745 __u64 start, __u64 len);
2759extern int ext4_ext_precache(struct inode *inode); 2746extern int ext4_ext_precache(struct inode *inode);
2747extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
2760 2748
2761/* move_extent.c */ 2749/* move_extent.c */
2762extern void ext4_double_down_write_data_sem(struct inode *first, 2750extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2766,6 +2754,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
2766extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, 2754extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2767 __u64 start_orig, __u64 start_donor, 2755 __u64 start_orig, __u64 start_donor,
2768 __u64 len, __u64 *moved_len); 2756 __u64 len, __u64 *moved_len);
2757extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
2758 struct ext4_extent **extent);
2769 2759
2770/* page-io.c */ 2760/* page-io.c */
2771extern int __init ext4_init_pageio(void); 2761extern int __init ext4_init_pageio(void);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3fe29de832c8..c3fb607413ed 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
259 if (WARN_ON_ONCE(err)) { 259 if (WARN_ON_ONCE(err)) {
260 ext4_journal_abort_handle(where, line, __func__, bh, 260 ext4_journal_abort_handle(where, line, __func__, bh,
261 handle, err); 261 handle, err);
262 if (inode == NULL) {
263 pr_err("EXT4: jbd2_journal_dirty_metadata "
264 "failed: handle type %u started at "
265 "line %u, credits %u/%u, errcode %d",
266 handle->h_type,
267 handle->h_line_no,
268 handle->h_requested_credits,
269 handle->h_buffer_credits, err);
270 return err;
271 }
262 ext4_error_inode(inode, where, line, 272 ext4_error_inode(inode, where, line,
263 bh->b_blocknr, 273 bh->b_blocknr,
264 "journal_dirty_metadata failed: " 274 "journal_dirty_metadata failed: "
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74bc2d549c58..01b0c208f625 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -37,7 +37,6 @@
37#include <linux/quotaops.h> 37#include <linux/quotaops.h>
38#include <linux/string.h> 38#include <linux/string.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/falloc.h>
41#include <asm/uaccess.h> 40#include <asm/uaccess.h>
42#include <linux/fiemap.h> 41#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
@@ -1691,7 +1690,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1691 * the extent that was written properly split out and conversion to 1690 * the extent that was written properly split out and conversion to
1692 * initialized is trivial. 1691 * initialized is trivial.
1693 */ 1692 */
1694 if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) 1693 if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2))
1695 return 0; 1694 return 0;
1696 1695
1697 ext1_ee_len = ext4_ext_get_actual_len(ex1); 1696 ext1_ee_len = ext4_ext_get_actual_len(ex1);
@@ -1708,6 +1707,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1708 */ 1707 */
1709 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1708 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1710 return 0; 1709 return 0;
1710 if (ext4_ext_is_uninitialized(ex1) &&
1711 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1712 atomic_read(&EXT4_I(inode)->i_unwritten) ||
1713 (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN)))
1714 return 0;
1711#ifdef AGGRESSIVE_TEST 1715#ifdef AGGRESSIVE_TEST
1712 if (ext1_ee_len >= 4) 1716 if (ext1_ee_len >= 4)
1713 return 0; 1717 return 0;
@@ -1731,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
1731{ 1735{
1732 struct ext4_extent_header *eh; 1736 struct ext4_extent_header *eh;
1733 unsigned int depth, len; 1737 unsigned int depth, len;
1734 int merge_done = 0; 1738 int merge_done = 0, uninit;
1735 1739
1736 depth = ext_depth(inode); 1740 depth = ext_depth(inode);
1737 BUG_ON(path[depth].p_hdr == NULL); 1741 BUG_ON(path[depth].p_hdr == NULL);
@@ -1741,8 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
1741 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1745 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1742 break; 1746 break;
1743 /* merge with next extent! */ 1747 /* merge with next extent! */
1748 uninit = ext4_ext_is_uninitialized(ex);
1744 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1749 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1745 + ext4_ext_get_actual_len(ex + 1)); 1750 + ext4_ext_get_actual_len(ex + 1));
1751 if (uninit)
1752 ext4_ext_mark_uninitialized(ex);
1746 1753
1747 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1754 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1748 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1755 len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1896,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1896 struct ext4_ext_path *npath = NULL; 1903 struct ext4_ext_path *npath = NULL;
1897 int depth, len, err; 1904 int depth, len, err;
1898 ext4_lblk_t next; 1905 ext4_lblk_t next;
1899 int mb_flags = 0; 1906 int mb_flags = 0, uninit;
1900 1907
1901 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1908 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1902 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1909 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1946,9 +1953,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1946 path + depth); 1953 path + depth);
1947 if (err) 1954 if (err)
1948 return err; 1955 return err;
1949 1956 uninit = ext4_ext_is_uninitialized(ex);
1950 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1957 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1951 + ext4_ext_get_actual_len(newext)); 1958 + ext4_ext_get_actual_len(newext));
1959 if (uninit)
1960 ext4_ext_mark_uninitialized(ex);
1952 eh = path[depth].p_hdr; 1961 eh = path[depth].p_hdr;
1953 nearex = ex; 1962 nearex = ex;
1954 goto merge; 1963 goto merge;
@@ -1971,10 +1980,13 @@ prepend:
1971 if (err) 1980 if (err)
1972 return err; 1981 return err;
1973 1982
1983 uninit = ext4_ext_is_uninitialized(ex);
1974 ex->ee_block = newext->ee_block; 1984 ex->ee_block = newext->ee_block;
1975 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); 1985 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
1976 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1986 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1977 + ext4_ext_get_actual_len(newext)); 1987 + ext4_ext_get_actual_len(newext));
1988 if (uninit)
1989 ext4_ext_mark_uninitialized(ex);
1978 eh = path[depth].p_hdr; 1990 eh = path[depth].p_hdr;
1979 nearex = ex; 1991 nearex = ex;
1980 goto merge; 1992 goto merge;
@@ -2585,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2585 ex_ee_block = le32_to_cpu(ex->ee_block); 2597 ex_ee_block = le32_to_cpu(ex->ee_block);
2586 ex_ee_len = ext4_ext_get_actual_len(ex); 2598 ex_ee_len = ext4_ext_get_actual_len(ex);
2587 2599
2600 /*
2601 * If we're starting with an extent other than the last one in the
2602 * node, we need to see if it shares a cluster with the extent to
2603 * the right (towards the end of the file). If its leftmost cluster
2604 * is this extent's rightmost cluster and it is not cluster aligned,
2605 * we'll mark it as a partial that is not to be deallocated.
2606 */
2607
2608 if (ex != EXT_LAST_EXTENT(eh)) {
2609 ext4_fsblk_t current_pblk, right_pblk;
2610 long long current_cluster, right_cluster;
2611
2612 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2613 current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2614 right_pblk = ext4_ext_pblock(ex + 1);
2615 right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2616 if (current_cluster == right_cluster &&
2617 EXT4_PBLK_COFF(sbi, right_pblk))
2618 *partial_cluster = -right_cluster;
2619 }
2620
2588 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2621 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2589 2622
2590 while (ex >= EXT_FIRST_EXTENT(eh) && 2623 while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2710,10 +2743,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2710 err = ext4_ext_correct_indexes(handle, inode, path); 2743 err = ext4_ext_correct_indexes(handle, inode, path);
2711 2744
2712 /* 2745 /*
2713 * Free the partial cluster only if the current extent does not 2746 * If there's a partial cluster and at least one extent remains in
2714 * reference it. Otherwise we might free used cluster. 2747 * the leaf, free the partial cluster if it isn't shared with the
2748 * current extent. If there's a partial cluster and no extents
2749 * remain in the leaf, it can't be freed here. It can only be
2750 * freed when it's possible to determine if it's not shared with
2751 * any other extent - when the next leaf is processed or when space
2752 * removal is complete.
2715 */ 2753 */
2716 if (*partial_cluster > 0 && 2754 if (*partial_cluster > 0 && eh->eh_entries &&
2717 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2755 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2718 *partial_cluster)) { 2756 *partial_cluster)) {
2719 int flags = get_default_free_blocks_flags(inode); 2757 int flags = get_default_free_blocks_flags(inode);
@@ -3275,6 +3313,11 @@ static int ext4_split_extent(handle_t *handle,
3275 return PTR_ERR(path); 3313 return PTR_ERR(path);
3276 depth = ext_depth(inode); 3314 depth = ext_depth(inode);
3277 ex = path[depth].p_ext; 3315 ex = path[depth].p_ext;
3316 if (!ex) {
3317 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3318 (unsigned long) map->m_lblk);
3319 return -EIO;
3320 }
3278 uninitialized = ext4_ext_is_uninitialized(ex); 3321 uninitialized = ext4_ext_is_uninitialized(ex);
3279 split_flag1 = 0; 3322 split_flag1 = 0;
3280 3323
@@ -3569,6 +3612,8 @@ out:
3569 * b> Splits in two extents: Write is happening at either end of the extent 3612 * b> Splits in two extents: Write is happening at either end of the extent
3570 * c> Splits in three extents: Someone is writing in the middle of the extent 3613 * c> Splits in three extents: Someone is writing in the middle of the extent
3571 * 3614 *
3615 * This works the same way in the case of initialized -> unwritten conversion.
3616 *
3572 * One or more index blocks may be needed if the extent tree grows after 3617 * One or more index blocks may be needed if the extent tree grows after
3573 * the uninitialized extent split. To prevent ENOSPC occurring at IO 3618 * the uninitialized extent split. To prevent ENOSPC occurring at IO
3574 * completion, we need to split the uninitialized extent before DIO submits 3619 * completion, we need to split the uninitialized extent before DIO submits
@@ -3579,7 +3624,7 @@ out:
3579 * 3624 *
3580 * Returns the size of uninitialized extent to be written on success. 3625 * Returns the size of uninitialized extent to be written on success.
3581 */ 3626 */
3582static int ext4_split_unwritten_extents(handle_t *handle, 3627static int ext4_split_convert_extents(handle_t *handle,
3583 struct inode *inode, 3628 struct inode *inode,
3584 struct ext4_map_blocks *map, 3629 struct ext4_map_blocks *map,
3585 struct ext4_ext_path *path, 3630 struct ext4_ext_path *path,
@@ -3591,9 +3636,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3591 unsigned int ee_len; 3636 unsigned int ee_len;
3592 int split_flag = 0, depth; 3637 int split_flag = 0, depth;
3593 3638
3594 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3639 ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3595 "block %llu, max_blocks %u\n", inode->i_ino, 3640 __func__, inode->i_ino,
3596 (unsigned long long)map->m_lblk, map->m_len); 3641 (unsigned long long)map->m_lblk, map->m_len);
3597 3642
3598 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3643 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3599 inode->i_sb->s_blocksize_bits; 3644 inode->i_sb->s_blocksize_bits;
@@ -3608,14 +3653,79 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3608 ee_block = le32_to_cpu(ex->ee_block); 3653 ee_block = le32_to_cpu(ex->ee_block);
3609 ee_len = ext4_ext_get_actual_len(ex); 3654 ee_len = ext4_ext_get_actual_len(ex);
3610 3655
3611 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3656 /* Convert to unwritten */
3612 split_flag |= EXT4_EXT_MARK_UNINIT2; 3657 if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3613 if (flags & EXT4_GET_BLOCKS_CONVERT) 3658 split_flag |= EXT4_EXT_DATA_VALID1;
3614 split_flag |= EXT4_EXT_DATA_VALID2; 3659 /* Convert to initialized */
3660 } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3661 split_flag |= ee_block + ee_len <= eof_block ?
3662 EXT4_EXT_MAY_ZEROOUT : 0;
3663 split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
3664 }
3615 flags |= EXT4_GET_BLOCKS_PRE_IO; 3665 flags |= EXT4_GET_BLOCKS_PRE_IO;
3616 return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3666 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3617} 3667}
3618 3668
3669static int ext4_convert_initialized_extents(handle_t *handle,
3670 struct inode *inode,
3671 struct ext4_map_blocks *map,
3672 struct ext4_ext_path *path)
3673{
3674 struct ext4_extent *ex;
3675 ext4_lblk_t ee_block;
3676 unsigned int ee_len;
3677 int depth;
3678 int err = 0;
3679
3680 depth = ext_depth(inode);
3681 ex = path[depth].p_ext;
3682 ee_block = le32_to_cpu(ex->ee_block);
3683 ee_len = ext4_ext_get_actual_len(ex);
3684
3685 ext_debug("%s: inode %lu, logical "
3686 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3687 (unsigned long long)ee_block, ee_len);
3688
3689 if (ee_block != map->m_lblk || ee_len > map->m_len) {
3690 err = ext4_split_convert_extents(handle, inode, map, path,
3691 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3692 if (err < 0)
3693 goto out;
3694 ext4_ext_drop_refs(path);
3695 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3696 if (IS_ERR(path)) {
3697 err = PTR_ERR(path);
3698 goto out;
3699 }
3700 depth = ext_depth(inode);
3701 ex = path[depth].p_ext;
3702 if (!ex) {
3703 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3704 (unsigned long) map->m_lblk);
3705 err = -EIO;
3706 goto out;
3707 }
3708 }
3709
3710 err = ext4_ext_get_access(handle, inode, path + depth);
3711 if (err)
3712 goto out;
3713 /* first mark the extent as uninitialized */
3714 ext4_ext_mark_uninitialized(ex);
3715
3716 /* note: ext4_ext_correct_indexes() isn't needed here because
3717 * borders are not changed
3718 */
3719 ext4_ext_try_to_merge(handle, inode, path, ex);
3720
3721 /* Mark modified extent as dirty */
3722 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3723out:
3724 ext4_ext_show_leaf(inode, path);
3725 return err;
3726}
3727
3728
3619static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3729static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3620 struct inode *inode, 3730 struct inode *inode,
3621 struct ext4_map_blocks *map, 3731 struct ext4_map_blocks *map,
@@ -3649,8 +3759,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3649 inode->i_ino, (unsigned long long)ee_block, ee_len, 3759 inode->i_ino, (unsigned long long)ee_block, ee_len,
3650 (unsigned long long)map->m_lblk, map->m_len); 3760 (unsigned long long)map->m_lblk, map->m_len);
3651#endif 3761#endif
3652 err = ext4_split_unwritten_extents(handle, inode, map, path, 3762 err = ext4_split_convert_extents(handle, inode, map, path,
3653 EXT4_GET_BLOCKS_CONVERT); 3763 EXT4_GET_BLOCKS_CONVERT);
3654 if (err < 0) 3764 if (err < 0)
3655 goto out; 3765 goto out;
3656 ext4_ext_drop_refs(path); 3766 ext4_ext_drop_refs(path);
@@ -3851,6 +3961,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3851} 3961}
3852 3962
3853static int 3963static int
3964ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
3965 struct ext4_map_blocks *map,
3966 struct ext4_ext_path *path, int flags,
3967 unsigned int allocated, ext4_fsblk_t newblock)
3968{
3969 int ret = 0;
3970 int err = 0;
3971
3972 /*
3973 * Make sure that the extent is no bigger than we support with
3974 * an uninitialized extent.
3975 */
3976 if (map->m_len > EXT_UNINIT_MAX_LEN)
3977 map->m_len = EXT_UNINIT_MAX_LEN / 2;
3978
3979 ret = ext4_convert_initialized_extents(handle, inode, map,
3980 path);
3981 if (ret >= 0) {
3982 ext4_update_inode_fsync_trans(handle, inode, 1);
3983 err = check_eofblocks_fl(handle, inode, map->m_lblk,
3984 path, map->m_len);
3985 } else
3986 err = ret;
3987 map->m_flags |= EXT4_MAP_UNWRITTEN;
3988 if (allocated > map->m_len)
3989 allocated = map->m_len;
3990 map->m_len = allocated;
3991
3992 return err ? err : allocated;
3993}
3994
3995static int
3854ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3996ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3855 struct ext4_map_blocks *map, 3997 struct ext4_map_blocks *map,
3856 struct ext4_ext_path *path, int flags, 3998 struct ext4_ext_path *path, int flags,
@@ -3877,8 +4019,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3877 4019
3878 /* get_block() before submit the IO, split the extent */ 4020 /* get_block() before submit the IO, split the extent */
3879 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 4021 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3880 ret = ext4_split_unwritten_extents(handle, inode, map, 4022 ret = ext4_split_convert_extents(handle, inode, map,
3881 path, flags); 4023 path, flags | EXT4_GET_BLOCKS_CONVERT);
3882 if (ret <= 0) 4024 if (ret <= 0)
3883 goto out; 4025 goto out;
3884 /* 4026 /*
@@ -3993,10 +4135,6 @@ out1:
3993 map->m_pblk = newblock; 4135 map->m_pblk = newblock;
3994 map->m_len = allocated; 4136 map->m_len = allocated;
3995out2: 4137out2:
3996 if (path) {
3997 ext4_ext_drop_refs(path);
3998 kfree(path);
3999 }
4000 return err ? err : allocated; 4138 return err ? err : allocated;
4001} 4139}
4002 4140
@@ -4128,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4128 struct ext4_extent newex, *ex, *ex2; 4266 struct ext4_extent newex, *ex, *ex2;
4129 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4267 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4130 ext4_fsblk_t newblock = 0; 4268 ext4_fsblk_t newblock = 0;
4131 int free_on_err = 0, err = 0, depth; 4269 int free_on_err = 0, err = 0, depth, ret;
4132 unsigned int allocated = 0, offset = 0; 4270 unsigned int allocated = 0, offset = 0;
4133 unsigned int allocated_clusters = 0; 4271 unsigned int allocated_clusters = 0;
4134 struct ext4_allocation_request ar; 4272 struct ext4_allocation_request ar;
@@ -4170,6 +4308,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4170 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4308 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4171 unsigned short ee_len; 4309 unsigned short ee_len;
4172 4310
4311
4173 /* 4312 /*
4174 * Uninitialized extents are treated as holes, except that 4313 * Uninitialized extents are treated as holes, except that
4175 * we split out initialized portions during a write. 4314 * we split out initialized portions during a write.
@@ -4186,13 +4325,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4186 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 4325 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4187 ee_block, ee_len, newblock); 4326 ee_block, ee_len, newblock);
4188 4327
4189 if (!ext4_ext_is_uninitialized(ex)) 4328 /*
4329 * If the extent is initialized, check whether the
4330 * caller wants to convert it to unwritten.
4331 */
4332 if ((!ext4_ext_is_uninitialized(ex)) &&
4333 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4334 allocated = ext4_ext_convert_initialized_extent(
4335 handle, inode, map, path, flags,
4336 allocated, newblock);
4337 goto out2;
4338 } else if (!ext4_ext_is_uninitialized(ex))
4190 goto out; 4339 goto out;
4191 4340
4192 allocated = ext4_ext_handle_uninitialized_extents( 4341 ret = ext4_ext_handle_uninitialized_extents(
4193 handle, inode, map, path, flags, 4342 handle, inode, map, path, flags,
4194 allocated, newblock); 4343 allocated, newblock);
4195 goto out3; 4344 if (ret < 0)
4345 err = ret;
4346 else
4347 allocated = ret;
4348 goto out2;
4196 } 4349 }
4197 } 4350 }
4198 4351
@@ -4473,7 +4626,6 @@ out2:
4473 kfree(path); 4626 kfree(path);
4474 } 4627 }
4475 4628
4476out3:
4477 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4629 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4478 err ? err : allocated); 4630 err ? err : allocated);
4479 ext4_es_lru_add(inode); 4631 ext4_es_lru_add(inode);
@@ -4514,34 +4666,203 @@ retry:
4514 ext4_std_error(inode->i_sb, err); 4666 ext4_std_error(inode->i_sb, err);
4515} 4667}
4516 4668
4517static void ext4_falloc_update_inode(struct inode *inode, 4669static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4518 int mode, loff_t new_size, int update_ctime) 4670 ext4_lblk_t len, int flags, int mode)
4671{
4672 struct inode *inode = file_inode(file);
4673 handle_t *handle;
4674 int ret = 0;
4675 int ret2 = 0;
4676 int retries = 0;
4677 struct ext4_map_blocks map;
4678 unsigned int credits;
4679
4680 map.m_lblk = offset;
4681 /*
4682 * Don't normalize the request if it can fit in one extent so
4683 * that it doesn't get unnecessarily split into multiple
4684 * extents.
4685 */
4686 if (len <= EXT_UNINIT_MAX_LEN)
4687 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4688
4689 /*
4690 * credits to insert 1 extent into extent tree
4691 */
4692 credits = ext4_chunk_trans_blocks(inode, len);
4693
4694retry:
4695 while (ret >= 0 && ret < len) {
4696 map.m_lblk = map.m_lblk + ret;
4697 map.m_len = len = len - ret;
4698 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4699 credits);
4700 if (IS_ERR(handle)) {
4701 ret = PTR_ERR(handle);
4702 break;
4703 }
4704 ret = ext4_map_blocks(handle, inode, &map, flags);
4705 if (ret <= 0) {
4706 ext4_debug("inode #%lu: block %u: len %u: "
4707 "ext4_ext_map_blocks returned %d",
4708 inode->i_ino, map.m_lblk,
4709 map.m_len, ret);
4710 ext4_mark_inode_dirty(handle, inode);
4711 ret2 = ext4_journal_stop(handle);
4712 break;
4713 }
4714 ret2 = ext4_journal_stop(handle);
4715 if (ret2)
4716 break;
4717 }
4718 if (ret == -ENOSPC &&
4719 ext4_should_retry_alloc(inode->i_sb, &retries)) {
4720 ret = 0;
4721 goto retry;
4722 }
4723
4724 return ret > 0 ? ret2 : ret;
4725}
4726
4727static long ext4_zero_range(struct file *file, loff_t offset,
4728 loff_t len, int mode)
4519{ 4729{
4520 struct timespec now; 4730 struct inode *inode = file_inode(file);
4731 handle_t *handle = NULL;
4732 unsigned int max_blocks;
4733 loff_t new_size = 0;
4734 int ret = 0;
4735 int flags;
4736 int partial;
4737 loff_t start, end;
4738 ext4_lblk_t lblk;
4739 struct address_space *mapping = inode->i_mapping;
4740 unsigned int blkbits = inode->i_blkbits;
4741
4742 trace_ext4_zero_range(inode, offset, len, mode);
4521 4743
4522 if (update_ctime) { 4744 if (!S_ISREG(inode->i_mode))
4523 now = current_fs_time(inode->i_sb); 4745 return -EINVAL;
4524 if (!timespec_equal(&inode->i_ctime, &now)) 4746
4525 inode->i_ctime = now; 4747 /*
4748 * Write out all dirty pages to avoid race conditions.
4749 * Then release them.
4750 */
4751 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4752 ret = filemap_write_and_wait_range(mapping, offset,
4753 offset + len - 1);
4754 if (ret)
4755 return ret;
4526 } 4756 }
4757
4758 /*
4759 * Round up offset. This is not fallocate, we need to zero out
4760 * blocks, so convert interior block aligned part of the range to
4761 * unwritten and possibly manually zero out unaligned parts of the
4762 * range.
4763 */
4764 start = round_up(offset, 1 << blkbits);
4765 end = round_down((offset + len), 1 << blkbits);
4766
4767 if (start < offset || end > offset + len)
4768 return -EINVAL;
4769 partial = (offset + len) & ((1 << blkbits) - 1);
4770
4771 lblk = start >> blkbits;
4772 max_blocks = (end >> blkbits);
4773 if (max_blocks < lblk)
4774 max_blocks = 0;
4775 else
4776 max_blocks -= lblk;
4777
4778 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
4779 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
4780 if (mode & FALLOC_FL_KEEP_SIZE)
4781 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4782
4783 mutex_lock(&inode->i_mutex);
4784
4527 /* 4785 /*
4528 * Update only when preallocation was requested beyond 4786 * Indirect files do not support unwritten extents
4529 * the file size.
4530 */ 4787 */
4531 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 4788 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4789 ret = -EOPNOTSUPP;
4790 goto out_mutex;
4791 }
4792
4793 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4794 offset + len > i_size_read(inode)) {
4795 new_size = offset + len;
4796 ret = inode_newsize_ok(inode, new_size);
4797 if (ret)
4798 goto out_mutex;
4799 /*
4800 * If we have a partial block after EOF we have to allocate
4801 * the entire block.
4802 */
4803 if (partial)
4804 max_blocks += 1;
4805 }
4806
4807 if (max_blocks > 0) {
4808
4809 /* Now release the pages and zero block aligned part of pages */
4810 truncate_pagecache_range(inode, start, end - 1);
4811
4812 /* Wait for all existing dio workers; newcomers will block on i_mutex */
4813 ext4_inode_block_unlocked_dio(inode);
4814 inode_dio_wait(inode);
4815
4816 /*
4817 * Remove entire range from the extent status tree.
4818 */
4819 ret = ext4_es_remove_extent(inode, lblk, max_blocks);
4820 if (ret)
4821 goto out_dio;
4822
4823 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
4824 mode);
4825 if (ret)
4826 goto out_dio;
4827 }
4828
4829 handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
4830 if (IS_ERR(handle)) {
4831 ret = PTR_ERR(handle);
4832 ext4_std_error(inode->i_sb, ret);
4833 goto out_dio;
4834 }
4835
4836 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4837
4838 if (new_size) {
4532 if (new_size > i_size_read(inode)) 4839 if (new_size > i_size_read(inode))
4533 i_size_write(inode, new_size); 4840 i_size_write(inode, new_size);
4534 if (new_size > EXT4_I(inode)->i_disksize) 4841 if (new_size > EXT4_I(inode)->i_disksize)
4535 ext4_update_i_disksize(inode, new_size); 4842 ext4_update_i_disksize(inode, new_size);
4536 } else { 4843 } else {
4537 /* 4844 /*
4538 * Mark that we allocate beyond EOF so the subsequent truncate 4845 * Mark that we allocate beyond EOF so the subsequent truncate
4539 * can proceed even if the new size is the same as i_size. 4846 * can proceed even if the new size is the same as i_size.
4540 */ 4847 */
4541 if (new_size > i_size_read(inode)) 4848 if ((offset + len) > i_size_read(inode))
4542 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4849 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4543 } 4850 }
4544 4851
4852 ext4_mark_inode_dirty(handle, inode);
4853
4854 /* Zero out partial block at the edges of the range */
4855 ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4856
4857 if (file->f_flags & O_SYNC)
4858 ext4_handle_sync(handle);
4859
4860 ext4_journal_stop(handle);
4861out_dio:
4862 ext4_inode_resume_unlocked_dio(inode);
4863out_mutex:
4864 mutex_unlock(&inode->i_mutex);
4865 return ret;
4545} 4866}
4546 4867
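ext4_zero_range() above services the new FALLOC_FL_ZERO_RANGE mode: block-aligned parts of the range are converted to unwritten extents, while the unaligned edges are zeroed through the journal by ext4_zero_partial_blocks(). A minimal sketch of the userspace call, assuming a scratch file on an ext4 mount:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
        int fd = open("scratch", O_RDWR | O_CREAT, 0644);
        if (fd < 0) { perror("open"); return 1; }

        /* Zero 1 MiB starting at offset 4096 without changing i_size. */
        if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
                      4096, 1 << 20) != 0)
                perror("fallocate");

        close(fd);
        return 0;
}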
4547/* 4868/*
@@ -4555,17 +4876,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4555{ 4876{
4556 struct inode *inode = file_inode(file); 4877 struct inode *inode = file_inode(file);
4557 handle_t *handle; 4878 handle_t *handle;
4558 loff_t new_size; 4879 loff_t new_size = 0;
4559 unsigned int max_blocks; 4880 unsigned int max_blocks;
4560 int ret = 0; 4881 int ret = 0;
4561 int ret2 = 0;
4562 int retries = 0;
4563 int flags; 4882 int flags;
4564 struct ext4_map_blocks map; 4883 ext4_lblk_t lblk;
4565 unsigned int credits, blkbits = inode->i_blkbits; 4884 struct timespec tv;
4885 unsigned int blkbits = inode->i_blkbits;
4566 4886
4567 /* Return error if mode is not supported */ 4887 /* Return error if mode is not supported */
4568 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 4888 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4889 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
4569 return -EOPNOTSUPP; 4890 return -EOPNOTSUPP;
4570 4891
4571 if (mode & FALLOC_FL_PUNCH_HOLE) 4892 if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4582,83 +4903,69 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4582 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4903 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4583 return -EOPNOTSUPP; 4904 return -EOPNOTSUPP;
4584 4905
4906 if (mode & FALLOC_FL_COLLAPSE_RANGE)
4907 return ext4_collapse_range(inode, offset, len);
4908
4909 if (mode & FALLOC_FL_ZERO_RANGE)
4910 return ext4_zero_range(file, offset, len, mode);
4911
4585 trace_ext4_fallocate_enter(inode, offset, len, mode); 4912 trace_ext4_fallocate_enter(inode, offset, len, mode);
4586 map.m_lblk = offset >> blkbits; 4913 lblk = offset >> blkbits;
4587 /* 4914 /*
4588 * We can't just convert len to max_blocks because 4915 * We can't just convert len to max_blocks because
4589 * if blocksize = 4096, offset = 3072 and len = 2048 4916 * if blocksize = 4096, offset = 3072 and len = 2048
4590 */ 4917 */
4591 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 4918 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4592 - map.m_lblk; 4919 - lblk;
4593 /* 4920
4594 * credits to insert 1 extent into extent tree
4595 */
4596 credits = ext4_chunk_trans_blocks(inode, max_blocks);
4597 mutex_lock(&inode->i_mutex);
4598 ret = inode_newsize_ok(inode, (len + offset));
4599 if (ret) {
4600 mutex_unlock(&inode->i_mutex);
4601 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4602 return ret;
4603 }
4604 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; 4921 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4605 if (mode & FALLOC_FL_KEEP_SIZE) 4922 if (mode & FALLOC_FL_KEEP_SIZE)
4606 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4923 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4607 /*
4608 * Don't normalize the request if it can fit in one extent so
4609 * that it doesn't get unnecessarily split into multiple
4610 * extents.
4611 */
4612 if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4613 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4614 4924
4615retry: 4925 mutex_lock(&inode->i_mutex);
4616 while (ret >= 0 && ret < max_blocks) {
4617 map.m_lblk = map.m_lblk + ret;
4618 map.m_len = max_blocks = max_blocks - ret;
4619 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4620 credits);
4621 if (IS_ERR(handle)) {
4622 ret = PTR_ERR(handle);
4623 break;
4624 }
4625 ret = ext4_map_blocks(handle, inode, &map, flags);
4626 if (ret <= 0) {
4627#ifdef EXT4FS_DEBUG
4628 ext4_warning(inode->i_sb,
4629 "inode #%lu: block %u: len %u: "
4630 "ext4_ext_map_blocks returned %d",
4631 inode->i_ino, map.m_lblk,
4632 map.m_len, ret);
4633#endif
4634 ext4_mark_inode_dirty(handle, inode);
4635 ret2 = ext4_journal_stop(handle);
4636 break;
4637 }
4638 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4639 blkbits) >> blkbits))
4640 new_size = offset + len;
4641 else
4642 new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4643 4926
4644 ext4_falloc_update_inode(inode, mode, new_size, 4927 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4645 (map.m_flags & EXT4_MAP_NEW)); 4928 offset + len > i_size_read(inode)) {
4646 ext4_mark_inode_dirty(handle, inode); 4929 new_size = offset + len;
4647 if ((file->f_flags & O_SYNC) && ret >= max_blocks) 4930 ret = inode_newsize_ok(inode, new_size);
4648 ext4_handle_sync(handle); 4931 if (ret)
4649 ret2 = ext4_journal_stop(handle); 4932 goto out;
4650 if (ret2)
4651 break;
4652 } 4933 }
4653 if (ret == -ENOSPC && 4934
4654 ext4_should_retry_alloc(inode->i_sb, &retries)) { 4935 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
4655 ret = 0; 4936 if (ret)
4656 goto retry; 4937 goto out;
4938
4939 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4940 if (IS_ERR(handle))
4941 goto out;
4942
4943 tv = inode->i_ctime = ext4_current_time(inode);
4944
4945 if (new_size) {
4946 if (new_size > i_size_read(inode)) {
4947 i_size_write(inode, new_size);
4948 inode->i_mtime = tv;
4949 }
4950 if (new_size > EXT4_I(inode)->i_disksize)
4951 ext4_update_i_disksize(inode, new_size);
4952 } else {
4953 /*
4954 * Mark that we allocate beyond EOF so the subsequent truncate
4955 * can proceed even if the new size is the same as i_size.
4956 */
4957 if ((offset + len) > i_size_read(inode))
4958 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4657 } 4959 }
4960 ext4_mark_inode_dirty(handle, inode);
4961 if (file->f_flags & O_SYNC)
4962 ext4_handle_sync(handle);
4963
4964 ext4_journal_stop(handle);
4965out:
4658 mutex_unlock(&inode->i_mutex); 4966 mutex_unlock(&inode->i_mutex);
4659 trace_ext4_fallocate_exit(inode, offset, max_blocks, 4967 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4660 ret > 0 ? ret2 : ret); 4968 return ret;
4661 return ret > 0 ? ret2 : ret;
4662} 4969}
4663 4970
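The rewritten ext4_fallocate() above now delegates the per-chunk allocation loop to ext4_alloc_file_blocks() and only handles size and timestamp updates itself; the userspace contract is unchanged. A minimal sketch of a plain preallocation, with a hypothetical file name, showing the FALLOC_FL_KEEP_SIZE case that reserves blocks past EOF while leaving i_size alone:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
        struct stat st;
        int fd = open("data.bin", O_RDWR | O_CREAT, 0644);
        if (fd < 0) { perror("open"); return 1; }

        /* Reserve 16 MiB beyond EOF; st_size stays 0, st_blocks grows. */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) != 0)
                perror("fallocate");

        if (fstat(fd, &st) == 0)
                printf("size=%lld blocks=%lld\n",
                       (long long)st.st_size, (long long)st.st_blocks);

        close(fd);
        return 0;
}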
4664/* 4971/*
@@ -4869,3 +5176,333 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4869 ext4_es_lru_add(inode); 5176 ext4_es_lru_add(inode);
4870 return error; 5177 return error;
4871} 5178}
5179
5180/*
5181 * ext4_access_path:
5182 * Function to access the path buffer for marking it dirty.
5183 * It also checks if there are sufficient credits left in the journal handle
5184 * to update path.
5185 */
5186static int
5187ext4_access_path(handle_t *handle, struct inode *inode,
5188 struct ext4_ext_path *path)
5189{
5190 int credits, err;
5191
5192 if (!ext4_handle_valid(handle))
5193 return 0;
5194
5195 /*
5196 * Check if we need to extend journal credits:
5197 * 3 for leaf, sb, and inode plus 2 (bmap and group
5198 * descriptor) for each block group; assume two block
5199 * groups
5200 */
5201 if (handle->h_buffer_credits < 7) {
5202 credits = ext4_writepage_trans_blocks(inode);
5203 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5204 /* EAGAIN is success */
5205 if (err && err != -EAGAIN)
5206 return err;
5207 }
5208
5209 err = ext4_ext_get_access(handle, inode, path);
5210 return err;
5211}
5212
5213/*
5214 * ext4_ext_shift_path_extents:
5215 * Shift the extents of a path structure lying between path[depth].p_ext
5216 * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
5217 * from starting block for each extent.
5218 */
5219static int
5220ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5221 struct inode *inode, handle_t *handle,
5222 ext4_lblk_t *start)
5223{
5224 int depth, err = 0;
5225 struct ext4_extent *ex_start, *ex_last;
5226 bool update = 0;
5227 depth = path->p_depth;
5228
5229 while (depth >= 0) {
5230 if (depth == path->p_depth) {
5231 ex_start = path[depth].p_ext;
5232 if (!ex_start)
5233 return -EIO;
5234
5235 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5236 if (!ex_last)
5237 return -EIO;
5238
5239 err = ext4_access_path(handle, inode, path + depth);
5240 if (err)
5241 goto out;
5242
5243 if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5244 update = 1;
5245
5246 *start = le32_to_cpu(ex_last->ee_block) +
5247 ext4_ext_get_actual_len(ex_last);
5248
5249 while (ex_start <= ex_last) {
5250 le32_add_cpu(&ex_start->ee_block, -shift);
5251 /* Try to merge to the left. */
5252 if ((ex_start >
5253 EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
5254 ext4_ext_try_to_merge_right(inode,
5255 path, ex_start - 1))
5256 ex_last--;
5257 else
5258 ex_start++;
5259 }
5260 err = ext4_ext_dirty(handle, inode, path + depth);
5261 if (err)
5262 goto out;
5263
5264 if (--depth < 0 || !update)
5265 break;
5266 }
5267
5268 /* Update index too */
5269 err = ext4_access_path(handle, inode, path + depth);
5270 if (err)
5271 goto out;
5272
5273 le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5274 err = ext4_ext_dirty(handle, inode, path + depth);
5275 if (err)
5276 goto out;
5277
5278 /* we are done if current index is not a starting index */
5279 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5280 break;
5281
5282 depth--;
5283 }
5284
5285out:
5286 return err;
5287}
5288
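ext4_ext_shift_path_extents() above subtracts the shift from every extent in a leaf and opportunistically merges each shifted extent with its left neighbour. A self-contained sketch of that per-leaf pass over a flat in-memory array, with a hypothetical extent type rather than the on-disk struct ext4_extent:

#include <stdio.h>

struct ext { unsigned int block, len; };        /* logical start + length */

/* Shift every extent down by 'shift' blocks, merging neighbours that
 * become contiguous; returns the new extent count. */
static int shift_extents(struct ext *e, int n, unsigned int shift)
{
        int i, out = 0;

        for (i = 0; i < n; i++) {
                e[i].block -= shift;
                if (out > 0 &&
                    e[out - 1].block + e[out - 1].len == e[i].block)
                        e[out - 1].len += e[i].len;     /* merge left */
                else
                        e[out++] = e[i];
        }
        return out;
}

int main(void)
{
        struct ext e[] = { { 100, 8 }, { 108, 4 }, { 120, 2 } };
        int i, n = shift_extents(e, 3, 10);

        for (i = 0; i < n; i++)
                printf("extent: block %u, len %u\n", e[i].block, e[i].len);
        return 0;       /* prints (90,12) and (110,2) */
}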
5289/*
5290 * ext4_ext_shift_extents:
5291 * All the extents which lie in the range from start to the last allocated
5292 * block for the file are shifted downwards by shift blocks.
5293 * On success, 0 is returned; an error code otherwise.
5294 */
5295static int
5296ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5297 ext4_lblk_t start, ext4_lblk_t shift)
5298{
5299 struct ext4_ext_path *path;
5300 int ret = 0, depth;
5301 struct ext4_extent *extent;
5302 ext4_lblk_t stop_block, current_block;
5303 ext4_lblk_t ex_start, ex_end;
5304
5305 /* Let path point to the last extent */
5306 path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5307 if (IS_ERR(path))
5308 return PTR_ERR(path);
5309
5310 depth = path->p_depth;
5311 extent = path[depth].p_ext;
5312 if (!extent) {
5313 ext4_ext_drop_refs(path);
5314 kfree(path);
5315 return ret;
5316 }
5317
5318 stop_block = le32_to_cpu(extent->ee_block) +
5319 ext4_ext_get_actual_len(extent);
5320 ext4_ext_drop_refs(path);
5321 kfree(path);
5322
5323 /* Nothing to shift if the hole is at the end of the file */
5324 if (start >= stop_block)
5325 return ret;
5326
5327 /*
5328 * Don't start shifting extents until we make sure the hole is big
5329 * enough to accommodate the shift.
5330 */
5331 path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
5332 if (IS_ERR(path))
5333 return PTR_ERR(path);
5334 depth = path->p_depth;
5335 extent = path[depth].p_ext;
5336 if (extent) {
5337 ex_start = le32_to_cpu(extent->ee_block);
5338 ex_end = le32_to_cpu(extent->ee_block) +
5339 ext4_ext_get_actual_len(extent);
5340 } else {
5341 ex_start = 0;
5342 ex_end = 0;
5343 }
5344 ext4_ext_drop_refs(path);
5345 kfree(path);
5346
5347 if ((start == ex_start && shift > ex_start) ||
5348 (shift > start - ex_end))
5349 return -EINVAL;
5350
5351 /* It's safe to start updating extents */
5352 while (start < stop_block) {
5353 path = ext4_ext_find_extent(inode, start, NULL, 0);
5354 if (IS_ERR(path))
5355 return PTR_ERR(path);
5356 depth = path->p_depth;
5357 extent = path[depth].p_ext;
5358 if (!extent) {
5359 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5360 (unsigned long) start);
5361 return -EIO;
5362 }
5363
5364 current_block = le32_to_cpu(extent->ee_block);
5365 if (start > current_block) {
5366 /* Hole, move to the next extent */
5367 ret = mext_next_extent(inode, path, &extent);
5368 if (ret != 0) {
5369 ext4_ext_drop_refs(path);
5370 kfree(path);
5371 if (ret == 1)
5372 ret = 0;
5373 break;
5374 }
5375 }
5376 ret = ext4_ext_shift_path_extents(path, shift, inode,
5377 handle, &start);
5378 ext4_ext_drop_refs(path);
5379 kfree(path);
5380 if (ret)
5381 break;
5382 }
5383
5384 return ret;
5385}
5386
5387/*
5388 * ext4_collapse_range:
5389 * This implements fallocate's collapse-range functionality for ext4.
5390 * Returns 0 on success, a negative error code on failure.
5391 */
5392int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5393{
5394 struct super_block *sb = inode->i_sb;
5395 ext4_lblk_t punch_start, punch_stop;
5396 handle_t *handle;
5397 unsigned int credits;
5398 loff_t new_size, ioffset;
5399 int ret;
5400
5401 /* Collapse range works only on fs block size aligned offsets. */
5402 if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
5403 len & (EXT4_BLOCK_SIZE(sb) - 1))
5404 return -EINVAL;
5405
5406 if (!S_ISREG(inode->i_mode))
5407 return -EINVAL;
5408
5409 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
5410 return -EOPNOTSUPP;
5411
5412 trace_ext4_collapse_range(inode, offset, len);
5413
5414 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5415 punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5416
5417 /* Call ext4_force_commit to flush all data in case of data=journal. */
5418 if (ext4_should_journal_data(inode)) {
5419 ret = ext4_force_commit(inode->i_sb);
5420 if (ret)
5421 return ret;
5422 }
5423
5424 /*
5425 * Need to round down offset to be aligned with page size boundary
5426 * for page size > block size.
5427 */
5428 ioffset = round_down(offset, PAGE_SIZE);
5429
5430 /* Write out all dirty pages */
5431 ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5432 LLONG_MAX);
5433 if (ret)
5434 return ret;
5435
5436 /* Take mutex lock */
5437 mutex_lock(&inode->i_mutex);
5438
5439 /*
5440	 * The collapse range must not reach or cross EOF; collapsing up to
5441	 * or past EOF is effectively a truncate operation.
5442 */
5443 if (offset + len >= i_size_read(inode)) {
5444 ret = -EINVAL;
5445 goto out_mutex;
5446 }
5447
5448 /* Currently just for extent based files */
5449 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5450 ret = -EOPNOTSUPP;
5451 goto out_mutex;
5452 }
5453
5454 truncate_pagecache(inode, ioffset);
5455
5456 /* Wait for existing dio to complete */
5457 ext4_inode_block_unlocked_dio(inode);
5458 inode_dio_wait(inode);
5459
5460 credits = ext4_writepage_trans_blocks(inode);
5461 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5462 if (IS_ERR(handle)) {
5463 ret = PTR_ERR(handle);
5464 goto out_dio;
5465 }
5466
5467 down_write(&EXT4_I(inode)->i_data_sem);
5468 ext4_discard_preallocations(inode);
5469
5470 ret = ext4_es_remove_extent(inode, punch_start,
5471 EXT_MAX_BLOCKS - punch_start);
5472 if (ret) {
5473 up_write(&EXT4_I(inode)->i_data_sem);
5474 goto out_stop;
5475 }
5476
5477 ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5478 if (ret) {
5479 up_write(&EXT4_I(inode)->i_data_sem);
5480 goto out_stop;
5481 }
5482 ext4_discard_preallocations(inode);
5483
5484 ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5485 punch_stop - punch_start);
5486 if (ret) {
5487 up_write(&EXT4_I(inode)->i_data_sem);
5488 goto out_stop;
5489 }
5490
5491 new_size = i_size_read(inode) - len;
5492 i_size_write(inode, new_size);
5493 EXT4_I(inode)->i_disksize = new_size;
5494
5495 up_write(&EXT4_I(inode)->i_data_sem);
5496 if (IS_SYNC(inode))
5497 ext4_handle_sync(handle);
5498 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5499 ext4_mark_inode_dirty(handle, inode);
5500
5501out_stop:
5502 ext4_journal_stop(handle);
5503out_dio:
5504 ext4_inode_resume_unlocked_dio(inode);
5505out_mutex:
5506 mutex_unlock(&inode->i_mutex);
5507 return ret;
5508}
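
For reference, the path above is reached from userspace through fallocate(2). A minimal sketch of a caller, assuming a v3.15+ kernel, an extent-mapped ext4 file named "testfile" (a hypothetical name), and offset/length that are multiples of the filesystem block size:

/* Sketch: exercising the collapse path from userspace. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);

	if (fd < 0)
		return 1;
	/* Remove bytes [4096, 8192) and slide the tail of the file down. */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 4096))
		perror("fallocate");	/* EINVAL on misalignment or a range at/past EOF */
	close(fd);
	return 0;
}
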
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3981ff783950..0ebc21204b51 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode)
 	while (node) {
 		struct extent_status *es;
 		es = rb_entry(node, struct extent_status, rb_node);
-		printk(KERN_DEBUG " [%u/%u) %llu %llx",
+		printk(KERN_DEBUG " [%u/%u) %llu %x",
 		       es->es_lblk, es->es_len,
 		       ext4_es_pblock(es), ext4_es_status(es));
 		node = rb_next(node);
@@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 			pr_warn("ES insert assertion failed for "
 				"inode: %lu we can find an extent "
 				"at block [%d/%d/%llu/%c], but we "
-				"want to add an delayed/hole extent "
-				"[%d/%d/%llu/%llx]\n",
+				"want to add a delayed/hole extent "
+				"[%d/%d/%llu/%x]\n",
 				inode->i_ino, ee_block, ee_len,
 				ee_start, ee_status ? 'u' : 'w',
 				es->es_lblk, es->es_len,
@@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
 			pr_warn("ES insert assertion failed for inode: %lu "
 				"can't find an extent at block %d but we want "
-				"to add an written/unwritten extent "
-				"[%d/%d/%llu/%llx]\n", inode->i_ino,
+				"to add a written/unwritten extent "
+				"[%d/%d/%llu/%x]\n", inode->i_ino,
 				es->es_lblk, es->es_lblk, es->es_len,
 				ext4_es_pblock(es), ext4_es_status(es));
 		}
@@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 			 */
 			pr_warn("ES insert assertion failed for inode: %lu "
 				"We can find blocks but we want to add a "
-				"delayed/hole extent [%d/%d/%llu/%llx]\n",
+				"delayed/hole extent [%d/%d/%llu/%x]\n",
 				inode->i_ino, es->es_lblk, es->es_len,
 				ext4_es_pblock(es), ext4_es_status(es));
 			return;
@@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 		if (ext4_es_is_written(es)) {
 			pr_warn("ES insert assertion failed for inode: %lu "
 				"We can't find the block but we want to add "
-				"an written extent [%d/%d/%llu/%llx]\n",
+				"a written extent [%d/%d/%llu/%x]\n",
 				inode->i_ino, es->es_lblk, es->es_len,
 				ext4_es_pblock(es), ext4_es_status(es));
 			return;
@@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,

 	newes.es_lblk = lblk;
 	newes.es_len = len;
-	ext4_es_store_pblock(&newes, pblk);
-	ext4_es_store_status(&newes, status);
+	ext4_es_store_pblock_status(&newes, pblk, status);
 	trace_ext4_es_insert_extent(inode, &newes);

 	ext4_es_insert_extent_check(inode, &newes);
@@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,

 	newes.es_lblk = lblk;
 	newes.es_len = len;
-	ext4_es_store_pblock(&newes, pblk);
-	ext4_es_store_status(&newes, status);
+	ext4_es_store_pblock_status(&newes, pblk, status);
 	trace_ext4_es_cache_extent(inode, &newes);

 	if (!len)
@@ -812,13 +810,13 @@ retry:

 		newes.es_lblk = end + 1;
 		newes.es_len = len2;
+		block = 0x7FDEADBEEFULL;
 		if (ext4_es_is_written(&orig_es) ||
-		    ext4_es_is_unwritten(&orig_es)) {
+		    ext4_es_is_unwritten(&orig_es))
 			block = ext4_es_pblock(&orig_es) +
 				orig_es.es_len - len2;
-			ext4_es_store_pblock(&newes, block);
-		}
-		ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+		ext4_es_store_pblock_status(&newes, block,
+				ext4_es_status(&orig_es));
 		err = __es_insert_extent(inode, &newes);
 		if (err) {
 			es->es_lblk = orig_es.es_lblk;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 167f4ab8ecc3..f1b62a419920 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es,
 		       (es->es_pblk & ~ES_MASK));
 }

+static inline void ext4_es_store_pblock_status(struct extent_status *es,
+					       ext4_fsblk_t pb,
+					       unsigned int status)
+{
+	es->es_pblk = (((ext4_fsblk_t)
+			(status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+		       (pb & ~ES_MASK));
+}
+
 extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
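
The new helper writes the physical block and the status flags into es_pblk with a single store, so the two fields can never be observed out of sync between two separate assignments. A standalone sketch of the same packing idea, using illustrative TOY_* shift and mask values rather than the ones defined in this header:

/* Sketch: status flags in the top bits, physical block in the low bits. */
#include <assert.h>
#include <stdint.h>

#define TOY_ES_SHIFT	60
#define TOY_ES_MASK	(~((1ULL << TOY_ES_SHIFT) - 1))

static uint64_t pack(uint64_t pblk, unsigned status)
{
	/* the kernel additionally masks status with EXTENT_STATUS_FLAGS */
	return ((uint64_t)status << TOY_ES_SHIFT) | (pblk & ~TOY_ES_MASK);
}

int main(void)
{
	uint64_t v = pack(123456, 0x8);

	assert((v & ~TOY_ES_MASK) == 123456);	/* block number survives */
	assert((v >> TOY_ES_SHIFT) == 0x8);	/* status survives */
	return 0;
}
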
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1a5073959f32..063fc1538355 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -82,7 +82,7 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
 	size_t count = iov_length(iov, nr_segs);
 	loff_t final_size = pos + count;

-	if (pos >= inode->i_size)
+	if (pos >= i_size_read(inode))
 		return 0;

 	if ((pos & blockmask) || (final_size & blockmask))
@@ -146,14 +146,14 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
 		overwrite = 1;
 	}

-	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	ret = __generic_file_aio_write(iocb, iov, nr_segs);
 	mutex_unlock(&inode->i_mutex);

 	if (ret > 0) {
 		ssize_t err;

 		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
-		if (err < 0 && ret > 0)
+		if (err < 0)
 			ret = err;
 	}
 	blk_finish_plug(&plug);
@@ -200,6 +200,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,

 static const struct vm_operations_struct ext4_file_vm_ops = {
 	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
 	.page_mkwrite   = ext4_page_mkwrite,
 	.remap_pages    = generic_file_remap_pages,
 };
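
The i_size_read() change above touches ext4_unaligned_aio(), whose core test is a power-of-two alignment check on both ends of the write. A small standalone sketch of that predicate:

/* Sketch of the blockmask test: a write is "unaligned" if either
 * end falls inside a filesystem block. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool unaligned(uint64_t pos, uint64_t count, unsigned blocksize)
{
	uint64_t mask = blocksize - 1;	/* blocksize must be a power of two */

	return (pos & mask) || ((pos + count) & mask);
}

int main(void)
{
	printf("%d\n", unaligned(4096, 4096, 4096));	/* 0: fully aligned */
	printf("%d\n", unaligned(4096, 512, 4096));	/* 1: ends mid-block */
	return 0;
}
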
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 24bfd7ff3049..d7b7462a0e13 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -215,7 +215,7 @@ void ext4_evict_inode(struct inode *inode)
 			jbd2_complete_transaction(journal, commit_tid);
 			filemap_write_and_wait(&inode->i_data);
 		}
-		truncate_inode_pages(&inode->i_data, 0);
+		truncate_inode_pages_final(&inode->i_data);

 		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
@@ -226,7 +226,7 @@ void ext4_evict_inode(struct inode *inode)

 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);

 	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
@@ -504,6 +504,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 {
 	struct extent_status es;
 	int retval;
+	int ret = 0;
 #ifdef ES_AGGRESSIVE_TEST
 	struct ext4_map_blocks orig_map;

@@ -515,6 +516,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);

+	/*
+	 * ext4_map_blocks returns an int, and m_len is an unsigned int
+	 */
+	if (unlikely(map->m_len > INT_MAX))
+		map->m_len = INT_MAX;
+
+	/* We can handle the block number less than EXT_MAX_BLOCKS */
+	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+		return -EIO;
+
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
 		ext4_es_lru_add(inode);
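
The INT_MAX clamp added above matters because the function reports the mapped length through its int return value while m_len is unsigned; without the clamp an oversized request could come back looking like a negative error code. A standalone sketch of that hazard:

/* Sketch: why the clamp preserves the "non-negative means length"
 * contract of an int return value. */
#include <limits.h>
#include <stdio.h>

static int map_len(unsigned int requested)
{
	if (requested > INT_MAX)	/* mirrors the kernel's clamp */
		requested = INT_MAX;
	return (int)requested;		/* now always >= 0 */
}

int main(void)
{
	printf("%d\n", map_len(3000000000u));	/* 2147483647, not a negative */
	return 0;
}
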
@@ -553,7 +564,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 					    EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
 	if (retval > 0) {
-		int ret;
 		unsigned int status;

 		if (unlikely(retval != map->m_len)) {
@@ -580,7 +590,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,

 found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode, map);
+		ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -597,7 +607,13 @@ found:
 	 * with buffer head unmapped.
 	 */
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
-		return retval;
+		/*
+		 * If we need to convert extent to unwritten
+		 * we continue and do the actual work in
+		 * ext4_ext_map_blocks()
+		 */
+		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+			return retval;

 	/*
 	 * Here we clear m_flags because after allocating an new extent,
@@ -653,7 +669,6 @@ found:
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);

 	if (retval > 0) {
-		int ret;
 		unsigned int status;

 		if (unlikely(retval != map->m_len)) {
@@ -688,7 +703,7 @@ found:
 has_zeroout:
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode, map);
+		ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -2232,13 +2247,23 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 			return err;
 	} while (map->m_len);

-	/* Update on-disk size after IO is submitted */
+	/*
+	 * Update on-disk size after IO is submitted. Races with
+	 * truncate are avoided by checking i_size under i_data_sem.
+	 */
 	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
 	if (disksize > EXT4_I(inode)->i_disksize) {
 		int err2;
-
-		ext4_wb_update_i_disksize(inode, disksize);
+		loff_t i_size;
+
+		down_write(&EXT4_I(inode)->i_data_sem);
+		i_size = i_size_read(inode);
+		if (disksize > i_size)
+			disksize = i_size;
+		if (disksize > EXT4_I(inode)->i_disksize)
+			EXT4_I(inode)->i_disksize = disksize;
 		err2 = ext4_mark_inode_dirty(handle, inode);
+		up_write(&EXT4_I(inode)->i_data_sem);
 		if (err2)
 			ext4_error(inode->i_sb,
 				   "Failed to mark inode %lu dirty",
@@ -3313,33 +3338,13 @@ void ext4_set_aops(struct inode *inode)
 }

 /*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-int ext4_block_truncate_page(handle_t *handle,
-		struct address_space *mapping, loff_t from)
-{
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned length;
-	unsigned blocksize;
-	struct inode *inode = mapping->host;
-
-	blocksize = inode->i_sb->s_blocksize;
-	length = blocksize - (offset & (blocksize - 1));
-
-	return ext4_block_zero_page_range(handle, mapping, from, length);
-}
-
-/*
  * ext4_block_zero_page_range() zeros out a mapping of length 'length'
  * starting from file offset 'from'. The range to be zero'd must
  * be contained with in one block. If the specified range exceeds
  * the end of the block it will be shortened to end of the block
  * that cooresponds to 'from'
  */
-int ext4_block_zero_page_range(handle_t *handle,
+static int ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -3429,6 +3434,26 @@ unlock:
 	return err;
 }

+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from)
+{
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned length;
+	unsigned blocksize;
+	struct inode *inode = mapping->host;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+
+	return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
 int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t length)
 {
@@ -3502,7 +3527,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;

-	trace_ext4_punch_hole(inode, offset, length);
+	trace_ext4_punch_hole(inode, offset, length, 0);

 	/*
 	 * Write out all dirty pages to avoid race conditions
@@ -3516,15 +3541,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	}

 	mutex_lock(&inode->i_mutex);
-	/* It's not possible punch hole on append only file */
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
-		ret = -EPERM;
-		goto out_mutex;
-	}
-	if (IS_SWAPFILE(inode)) {
-		ret = -ETXTBSY;
-		goto out_mutex;
-	}

 	/* No need to punch hole beyond i_size */
 	if (offset >= inode->i_size)
@@ -3605,10 +3621,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 		ret = ext4_free_hole_blocks(handle, inode, first_block,
 					    stop_block);

-	ext4_discard_preallocations(inode);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
+
+	/* Now release the pages again to reduce race window */
+	if (last_block_offset > first_block_offset)
+		truncate_pagecache_range(inode, first_block_offset,
+					 last_block_offset);
+
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 out_stop:
@@ -3682,7 +3703,7 @@ void ext4_truncate(struct inode *inode)

 	/*
 	 * There is a possibility that we're either freeing the inode
-	 * or it completely new indode. In those cases we might not
+	 * or it's a completely new inode. In those cases we might not
 	 * have i_mutex locked because it's not necessary.
 	 */
 	if (!(inode->i_state & (I_NEW|I_FREEING)))
@@ -3934,8 +3955,8 @@ void ext4_set_inode_flags(struct inode *inode)
 		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
-	set_mask_bits(&inode->i_flags,
-		      S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
+	inode_set_flags(inode, new_fl,
+			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
 }

 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4154,11 +4175,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

-	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
-	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
-		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-			inode->i_version |=
-			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+		inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+		if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+				inode->i_version |=
+		(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+		}
 	}

 	ret = 0;
@@ -4328,8 +4351,7 @@ static int ext4_do_update_inode(handle_t *handle,
 		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
-	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-	    cpu_to_le32(EXT4_OS_HURD))
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
 		raw_inode->i_file_acl_high =
 			cpu_to_le16(ei->i_file_acl >> 32);
 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
@@ -4374,12 +4396,15 @@ static int ext4_do_update_inode(handle_t *handle,
 			raw_inode->i_block[block] = ei->i_data[block];
 	}

-	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
-	if (ei->i_extra_isize) {
-		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-			raw_inode->i_version_hi =
-				cpu_to_le32(inode->i_version >> 32);
-		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+		raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+		if (ei->i_extra_isize) {
+			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+				raw_inode->i_version_hi =
+					cpu_to_le32(inode->i_version >> 32);
+			raw_inode->i_extra_isize =
+				cpu_to_le16(ei->i_extra_isize);
+		}
 	}

 	ext4_inode_csum_set(inode, raw_inode, ei);
@@ -4402,21 +4427,20 @@ out_brelse:
 *
 * We are called from a few places:
 *
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
 *   Here, there will be no transaction running. We wait for any running
 *   transaction to commit.
 *
- * - Within sys_sync(), kupdate and such.
- *   We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ *   We wait on commit, if told to.
 *
- * - Within prune_icache() (PF_MEMALLOC == true)
- *   Here we simply return.  We can't afford to block kswapd on the
- *   journal commit.
+ * - Within iput_final() -> write_inode_now()
+ *   We wait on commit, if told to.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
+ * writeback.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -4428,15 +4452,15 @@ out_brelse:
 *	stuff();
 *	inode->i_size = expr;
 *
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost.  Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost.  Plus the inode will no longer be on the
+ * superblock's dirty inode list.
 */
 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err;

-	if (current->flags & PF_MEMALLOC)
+	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
 		return 0;

 	if (EXT4_SB(inode->i_sb)->s_journal) {
@@ -4446,7 +4470,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 			return -EIO;
 		}

-		if (wbc->sync_mode != WB_SYNC_ALL)
+		/*
+		 * No need to force transaction in WB_SYNC_NONE mode. Also
+		 * ext4_sync_fs() will force the commit after everything is
+		 * written.
+		 */
+		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
 			return 0;

 		err = ext4_force_commit(inode->i_sb);
@@ -4456,7 +4485,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 		err = __ext4_get_inode_loc(inode, &iloc, 0);
 		if (err)
 			return err;
-		if (wbc->sync_mode == WB_SYNC_ALL)
+		/*
+		 * sync(2) will flush the whole buffer cache. No need to do
+		 * it here separately for each inode.
+		 */
+		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
 			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
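
Taken together, the two wbc hunks mean ext4_write_inode() only forces a commit (or a per-inode buffer flush) for WB_SYNC_ALL writeback that is not part of sync(2), since ext4_sync_fs() commits once at the end for sync. The decision reduces to a two-input predicate, sketched here:

/* Sketch of the decision the hunks above encode. */
#include <stdbool.h>
#include <stdio.h>

static bool should_force_commit(bool sync_all, bool for_sync)
{
	return sync_all && !for_sync;
}

int main(void)
{
	printf("%d\n", should_force_commit(true, false));	/* 1: fsync-style writeback */
	printf("%d\n", should_force_commit(true, true));	/* 0: sync(2) batches at the end */
	printf("%d\n", should_force_commit(false, false));	/* 0: background writeback */
	return 0;
}
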
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a2a837f00407..0f2252ec274d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	struct ext4_inode_info *ei_bl;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);

-	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
-		err = -EINVAL;
-		goto swap_boot_out;
-	}
+	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
+		return -EINVAL;

-	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
-		err = -EPERM;
-		goto swap_boot_out;
-	}
+	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
+		return -EPERM;

 	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
-	if (IS_ERR(inode_bl)) {
-		err = PTR_ERR(inode_bl);
-		goto swap_boot_out;
-	}
+	if (IS_ERR(inode_bl))
+		return PTR_ERR(inode_bl);
 	ei_bl = EXT4_I(inode_bl);

 	filemap_flush(inode->i_mapping);
@@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
 			ext4_mark_inode_dirty(handle, inode);
 		}
 	}
-
 	ext4_journal_stop(handle);
-
 	ext4_double_up_write_data_sem(inode, inode_bl);

 journal_err_out:
 	ext4_inode_resume_unlocked_dio(inode);
 	ext4_inode_resume_unlocked_dio(inode_bl);
-
 	unlock_two_nondirectories(inode, inode_bl);
-
 	iput(inode_bl);
-
-swap_boot_out:
 	return err;
 }

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 04a5c7504be9..c8238a26818c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -989,7 +989,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 	poff = block % blocks_per_page;
 	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
 	if (!page)
-		return -EIO;
+		return -ENOMEM;
 	BUG_ON(page->mapping != inode->i_mapping);
 	e4b->bd_bitmap_page = page;
 	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
@@ -1003,7 +1003,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 	pnum = block / blocks_per_page;
 	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
 	if (!page)
-		return -EIO;
+		return -ENOMEM;
 	BUG_ON(page->mapping != inode->i_mapping);
 	e4b->bd_buddy_page = page;
 	return 0;
@@ -1168,7 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 			unlock_page(page);
 		}
 	}
-	if (page == NULL || !PageUptodate(page)) {
+	if (page == NULL) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	if (!PageUptodate(page)) {
 		ret = -EIO;
 		goto err;
 	}
@@ -1197,7 +1201,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 			unlock_page(page);
 		}
 	}
-	if (page == NULL || !PageUptodate(page)) {
+	if (page == NULL) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	if (!PageUptodate(page)) {
 		ret = -EIO;
 		goto err;
 	}
@@ -1808,6 +1816,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 	ext4_lock_group(ac->ac_sb, group);
 	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
 			     ac->ac_g_ex.fe_len, &ex);
+	ex.fe_logical = 0xDEADFA11; /* debug value */

 	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
 		ext4_fsblk_t start;
@@ -1936,7 +1945,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 			 */
 			break;
 		}
-
+		ex.fe_logical = 0xDEADC0DE; /* debug value */
 		ext4_mb_measure_extent(ac, &ex, e4b);

 		i += ex.fe_len;
@@ -1977,6 +1986,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 		max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
 		if (max >= sbi->s_stripe) {
 			ac->ac_found++;
+			ex.fe_logical = 0xDEADF00D; /* debug value */
 			ac->ac_b_ex = ex;
 			ext4_mb_use_best_found(ac, e4b);
 			break;
@@ -4006,8 +4016,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 			(unsigned long)ac->ac_b_ex.fe_len,
 			(unsigned long)ac->ac_b_ex.fe_logical,
 			(int)ac->ac_criteria);
-	ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
-		 ac->ac_ex_scanned, ac->ac_found);
+	ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
 	ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
 	ngroups = ext4_get_groups_count(sb);
 	for (i = 0; i < ngroups; i++) {
@@ -5007,6 +5016,8 @@ error_return:
 */
 static int ext4_trim_extent(struct super_block *sb, int start, int count,
 			     ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
 {
 	struct ext4_free_extent ex;
 	int ret = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 08481ee84cd5..d634e183b4d4 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug;
 	}								\
 } while (0)
 #else
-#define mb_debug(n, fmt, a...)
+#define mb_debug(n, fmt, a...)		no_printk(fmt, ## a)
 #endif

 #define EXT4_MB_HISTORY_ALLOC		1	/* allocation	*/
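
Defining the disabled mb_debug() as no_printk() keeps compile-time format checking of the arguments while generating no code or output. A userspace sketch of the same idiom (using a GNU C statement expression, as the kernel's no_printk does):

/* Sketch: the dead printf still type-checks its format arguments. */
#include <stdio.h>

#define no_printk(fmt, ...)				\
({							\
	if (0)						\
		printf(fmt, ##__VA_ARGS__);		\
	0;						\
})

#ifdef DEBUG
#define dbg(fmt, ...) printf(fmt, ##__VA_ARGS__)
#else
#define dbg(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif

int main(void)
{
	dbg("group %d\n", 42);	/* compiles (and warns on bad formats) either way */
	return 0;
}
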
@@ -175,8 +175,6 @@ struct ext4_allocation_context {
 	/* copy of the best found extent taken before preallocation efforts */
 	struct ext4_free_extent ac_f_ex;

-	/* number of iterations done. we have to track to limit searching */
-	unsigned long ac_ex_scanned;
 	__u16 ac_groups_scanned;
 	__u16 ac_found;
 	__u16 ac_tail;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 773b503bd18c..58ee7dc87669 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
 * ext4_ext_path structure refers to the last extent, or a negative error
 * value on failure.
 */
-static int
+int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 		 struct ext4_extent **extent)
 {
@@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
 			}
 			if (!buffer_mapped(bh)) {
 				zero_user(page, block_start, blocksize);
-				if (!err)
-					set_buffer_uptodate(bh);
+				set_buffer_uptodate(bh);
 				continue;
 			}
 		}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d050e043e884..1cb84f78909e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3000,6 +3000,154 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
 	return ext4_get_first_inline_block(inode, parent_de, retval);
 }

+struct ext4_renament {
+	struct inode *dir;
+	struct dentry *dentry;
+	struct inode *inode;
+	bool is_dir;
+	int dir_nlink_delta;
+
+	/* entry for "dentry" */
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+	int inlined;
+
+	/* entry for ".." in inode if it's a directory */
+	struct buffer_head *dir_bh;
+	struct ext4_dir_entry_2 *parent_de;
+	int dir_inlined;
+};
+
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+{
+	int retval;
+
+	ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
+					       &retval, &ent->parent_de,
+					       &ent->dir_inlined);
+	if (!ent->dir_bh)
+		return retval;
+	if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
+		return -EIO;
+	BUFFER_TRACE(ent->dir_bh, "get_write_access");
+	return ext4_journal_get_write_access(handle, ent->dir_bh);
+}
+
+static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
+				  unsigned dir_ino)
+{
+	int retval;
+
+	ent->parent_de->inode = cpu_to_le32(dir_ino);
+	BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
+	if (!ent->dir_inlined) {
+		if (is_dx(ent->inode)) {
+			retval = ext4_handle_dirty_dx_node(handle,
+							   ent->inode,
+							   ent->dir_bh);
+		} else {
+			retval = ext4_handle_dirty_dirent_node(handle,
+							       ent->inode,
+							       ent->dir_bh);
+		}
+	} else {
+		retval = ext4_mark_inode_dirty(handle, ent->inode);
+	}
+	if (retval) {
+		ext4_std_error(ent->dir->i_sb, retval);
+		return retval;
+	}
+	return 0;
+}
+
+static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
+		       unsigned ino, unsigned file_type)
+{
+	int retval;
+
+	BUFFER_TRACE(ent->bh, "get write access");
+	retval = ext4_journal_get_write_access(handle, ent->bh);
+	if (retval)
+		return retval;
+	ent->de->inode = cpu_to_le32(ino);
+	if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
+				      EXT4_FEATURE_INCOMPAT_FILETYPE))
+		ent->de->file_type = file_type;
+	ent->dir->i_version++;
+	ent->dir->i_ctime = ent->dir->i_mtime =
+		ext4_current_time(ent->dir);
+	ext4_mark_inode_dirty(handle, ent->dir);
+	BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
+	if (!ent->inlined) {
+		retval = ext4_handle_dirty_dirent_node(handle,
+						       ent->dir, ent->bh);
+		if (unlikely(retval)) {
+			ext4_std_error(ent->dir->i_sb, retval);
+			return retval;
+		}
+	}
+	brelse(ent->bh);
+	ent->bh = NULL;
+
+	return 0;
+}
+
+static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
+				  const struct qstr *d_name)
+{
+	int retval = -ENOENT;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+
+	bh = ext4_find_entry(dir, d_name, &de, NULL);
+	if (bh) {
+		retval = ext4_delete_entry(handle, dir, de, bh);
+		brelse(bh);
+	}
+	return retval;
+}
+
+static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
+{
+	int retval;
+	/*
+	 * ent->de could have moved from under us during htree split, so make
+	 * sure that we are deleting the right entry. We might also be pointing
+	 * to a stale entry in the unused part of ent->bh so just checking inum
+	 * and the name isn't enough.
+	 */
+	if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
+	    ent->de->name_len != ent->dentry->d_name.len ||
+	    strncmp(ent->de->name, ent->dentry->d_name.name,
+		    ent->de->name_len)) {
+		retval = ext4_find_delete_entry(handle, ent->dir,
+						&ent->dentry->d_name);
+	} else {
+		retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
+		if (retval == -ENOENT) {
+			retval = ext4_find_delete_entry(handle, ent->dir,
+							&ent->dentry->d_name);
+		}
+	}
+
+	if (retval) {
+		ext4_warning(ent->dir->i_sb,
+				"Deleting old file (%lu), %d, error=%d",
+				ent->dir->i_ino, ent->dir->i_nlink, retval);
+	}
+}
+
+static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
+{
+	if (ent->dir_nlink_delta) {
+		if (ent->dir_nlink_delta == -1)
+			ext4_dec_count(handle, ent->dir);
+		else
+			ext4_inc_count(handle, ent->dir);
+		ext4_mark_inode_dirty(handle, ent->dir);
+	}
+}
+
 /*
 * Anybody can rename anything with this: the permission checks are left to the
 * higher-level routines.
@@ -3012,198 +3160,267 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3012 struct inode *new_dir, struct dentry *new_dentry) 3160 struct inode *new_dir, struct dentry *new_dentry)
3013{ 3161{
3014 handle_t *handle = NULL; 3162 handle_t *handle = NULL;
3015 struct inode *old_inode, *new_inode; 3163 struct ext4_renament old = {
3016 struct buffer_head *old_bh, *new_bh, *dir_bh; 3164 .dir = old_dir,
3017 struct ext4_dir_entry_2 *old_de, *new_de; 3165 .dentry = old_dentry,
3166 .inode = old_dentry->d_inode,
3167 };
3168 struct ext4_renament new = {
3169 .dir = new_dir,
3170 .dentry = new_dentry,
3171 .inode = new_dentry->d_inode,
3172 };
3018 int retval; 3173 int retval;
3019 int inlined = 0, new_inlined = 0;
3020 struct ext4_dir_entry_2 *parent_de;
3021 3174
3022 dquot_initialize(old_dir); 3175 dquot_initialize(old.dir);
3023 dquot_initialize(new_dir); 3176 dquot_initialize(new.dir);
3024
3025 old_bh = new_bh = dir_bh = NULL;
3026 3177
3027 /* Initialize quotas before so that eventual writes go 3178 /* Initialize quotas before so that eventual writes go
3028 * in separate transaction */ 3179 * in separate transaction */
3029 if (new_dentry->d_inode) 3180 if (new.inode)
3030 dquot_initialize(new_dentry->d_inode); 3181 dquot_initialize(new.inode);
3031 3182
3032 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); 3183 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
3033 /* 3184 /*
3034 * Check for inode number is _not_ due to possible IO errors. 3185 * Check for inode number is _not_ due to possible IO errors.
3035 * We might rmdir the source, keep it as pwd of some process 3186 * We might rmdir the source, keep it as pwd of some process
3036 * and merrily kill the link to whatever was created under the 3187 * and merrily kill the link to whatever was created under the
3037 * same name. Goodbye sticky bit ;-< 3188 * same name. Goodbye sticky bit ;-<
3038 */ 3189 */
3039 old_inode = old_dentry->d_inode;
3040 retval = -ENOENT; 3190 retval = -ENOENT;
3041 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) 3191 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
3042 goto end_rename; 3192 goto end_rename;
3043 3193
3044 new_inode = new_dentry->d_inode; 3194 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3045 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, 3195 &new.de, &new.inlined);
3046 &new_de, &new_inlined); 3196 if (new.bh) {
3047 if (new_bh) { 3197 if (!new.inode) {
3048 if (!new_inode) { 3198 brelse(new.bh);
3049 brelse(new_bh); 3199 new.bh = NULL;
3050 new_bh = NULL;
3051 } 3200 }
3052 } 3201 }
3053 if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) 3202 if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
3054 ext4_alloc_da_blocks(old_inode); 3203 ext4_alloc_da_blocks(old.inode);
3055 3204
3056 handle = ext4_journal_start(old_dir, EXT4_HT_DIR, 3205 handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
3057 (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + 3206 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3058 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); 3207 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3059 if (IS_ERR(handle)) 3208 if (IS_ERR(handle))
3060 return PTR_ERR(handle); 3209 return PTR_ERR(handle);
3061 3210
3062 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 3211 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3063 ext4_handle_sync(handle); 3212 ext4_handle_sync(handle);
3064 3213
3065 if (S_ISDIR(old_inode->i_mode)) { 3214 if (S_ISDIR(old.inode->i_mode)) {
3066 if (new_inode) { 3215 if (new.inode) {
3067 retval = -ENOTEMPTY; 3216 retval = -ENOTEMPTY;
3068 if (!empty_dir(new_inode)) 3217 if (!empty_dir(new.inode))
3218 goto end_rename;
3219 } else {
3220 retval = -EMLINK;
3221 if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
3069 goto end_rename; 3222 goto end_rename;
3070 } 3223 }
3071 retval = -EIO; 3224 retval = ext4_rename_dir_prepare(handle, &old);
3072 dir_bh = ext4_get_first_dir_block(handle, old_inode,
3073 &retval, &parent_de,
3074 &inlined);
3075 if (!dir_bh)
3076 goto end_rename;
3077 if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
3078 goto end_rename;
3079 retval = -EMLINK;
3080 if (!new_inode && new_dir != old_dir &&
3081 EXT4_DIR_LINK_MAX(new_dir))
3082 goto end_rename;
3083 BUFFER_TRACE(dir_bh, "get_write_access");
3084 retval = ext4_journal_get_write_access(handle, dir_bh);
3085 if (retval) 3225 if (retval)
3086 goto end_rename; 3226 goto end_rename;
3087 } 3227 }
3088 if (!new_bh) { 3228 if (!new.bh) {
3089 retval = ext4_add_entry(handle, new_dentry, old_inode); 3229 retval = ext4_add_entry(handle, new.dentry, old.inode);
3090 if (retval) 3230 if (retval)
3091 goto end_rename; 3231 goto end_rename;
3092 } else { 3232 } else {
3093 BUFFER_TRACE(new_bh, "get write access"); 3233 retval = ext4_setent(handle, &new,
3094 retval = ext4_journal_get_write_access(handle, new_bh); 3234 old.inode->i_ino, old.de->file_type);
3095 if (retval) 3235 if (retval)
3096 goto end_rename; 3236 goto end_rename;
3097 new_de->inode = cpu_to_le32(old_inode->i_ino);
3098 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
3099 EXT4_FEATURE_INCOMPAT_FILETYPE))
3100 new_de->file_type = old_de->file_type;
3101 new_dir->i_version++;
3102 new_dir->i_ctime = new_dir->i_mtime =
3103 ext4_current_time(new_dir);
3104 ext4_mark_inode_dirty(handle, new_dir);
3105 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
3106 if (!new_inlined) {
3107 retval = ext4_handle_dirty_dirent_node(handle,
3108 new_dir, new_bh);
3109 if (unlikely(retval)) {
3110 ext4_std_error(new_dir->i_sb, retval);
3111 goto end_rename;
3112 }
3113 }
3114 brelse(new_bh);
3115 new_bh = NULL;
3116 } 3237 }
3117 3238
3118 /* 3239 /*
3119 * Like most other Unix systems, set the ctime for inodes on a 3240 * Like most other Unix systems, set the ctime for inodes on a
3120 * rename. 3241 * rename.
3121 */ 3242 */
3122 old_inode->i_ctime = ext4_current_time(old_inode); 3243 old.inode->i_ctime = ext4_current_time(old.inode);
3123 ext4_mark_inode_dirty(handle, old_inode); 3244 ext4_mark_inode_dirty(handle, old.inode);
3124 3245
3125 /* 3246 /*
3126 * ok, that's it 3247 * ok, that's it
3127 */ 3248 */
3128 if (le32_to_cpu(old_de->inode) != old_inode->i_ino || 3249 ext4_rename_delete(handle, &old);
3129 old_de->name_len != old_dentry->d_name.len || 3250
3130 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || 3251 if (new.inode) {
3131 (retval = ext4_delete_entry(handle, old_dir, 3252 ext4_dec_count(handle, new.inode);
3132 old_de, old_bh)) == -ENOENT) { 3253 new.inode->i_ctime = ext4_current_time(new.inode);
3133 /* old_de could have moved from under us during htree split, so
3134 * make sure that we are deleting the right entry. We might
3135 * also be pointing to a stale entry in the unused part of
3136 * old_bh so just checking inum and the name isn't enough. */
3137 struct buffer_head *old_bh2;
3138 struct ext4_dir_entry_2 *old_de2;
3139
3140 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
3141 &old_de2, NULL);
3142 if (old_bh2) {
3143 retval = ext4_delete_entry(handle, old_dir,
3144 old_de2, old_bh2);
3145 brelse(old_bh2);
3146 }
3147 } 3254 }
3148 if (retval) { 3255 old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
3149 ext4_warning(old_dir->i_sb, 3256 ext4_update_dx_flag(old.dir);
3150 "Deleting old file (%lu), %d, error=%d", 3257 if (old.dir_bh) {
3151 old_dir->i_ino, old_dir->i_nlink, retval); 3258 retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
3152 } 3259 if (retval)
3153
3154 if (new_inode) {
3155 ext4_dec_count(handle, new_inode);
3156 new_inode->i_ctime = ext4_current_time(new_inode);
3157 }
3158 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
3159 ext4_update_dx_flag(old_dir);
3160 if (dir_bh) {
3161 parent_de->inode = cpu_to_le32(new_dir->i_ino);
3162 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
3163 if (!inlined) {
3164 if (is_dx(old_inode)) {
3165 retval = ext4_handle_dirty_dx_node(handle,
3166 old_inode,
3167 dir_bh);
3168 } else {
3169 retval = ext4_handle_dirty_dirent_node(handle,
3170 old_inode, dir_bh);
3171 }
3172 } else {
3173 retval = ext4_mark_inode_dirty(handle, old_inode);
3174 }
3175 if (retval) {
3176 ext4_std_error(old_dir->i_sb, retval);
3177 goto end_rename; 3260 goto end_rename;
3178 } 3261
3179 ext4_dec_count(handle, old_dir); 3262 ext4_dec_count(handle, old.dir);
3180 if (new_inode) { 3263 if (new.inode) {
3181 /* checked empty_dir above, can't have another parent, 3264 /* checked empty_dir above, can't have another parent,
3182 * ext4_dec_count() won't work for many-linked dirs */ 3265 * ext4_dec_count() won't work for many-linked dirs */
3183 clear_nlink(new_inode); 3266 clear_nlink(new.inode);
3184 } else { 3267 } else {
3185 ext4_inc_count(handle, new_dir); 3268 ext4_inc_count(handle, new.dir);
3186 ext4_update_dx_flag(new_dir); 3269 ext4_update_dx_flag(new.dir);
3187 ext4_mark_inode_dirty(handle, new_dir); 3270 ext4_mark_inode_dirty(handle, new.dir);
3188 } 3271 }
3189 } 3272 }
3190 ext4_mark_inode_dirty(handle, old_dir); 3273 ext4_mark_inode_dirty(handle, old.dir);
3191 if (new_inode) { 3274 if (new.inode) {
3192 ext4_mark_inode_dirty(handle, new_inode); 3275 ext4_mark_inode_dirty(handle, new.inode);
3193 if (!new_inode->i_nlink) 3276 if (!new.inode->i_nlink)
3194 ext4_orphan_add(handle, new_inode); 3277 ext4_orphan_add(handle, new.inode);
3195 } 3278 }
3196 retval = 0; 3279 retval = 0;
3197 3280
3198end_rename: 3281end_rename:
3199 brelse(dir_bh); 3282 brelse(old.dir_bh);
3200 brelse(old_bh); 3283 brelse(old.bh);
3201 brelse(new_bh); 3284 brelse(new.bh);
3202 if (handle) 3285 if (handle)
3203 ext4_journal_stop(handle); 3286 ext4_journal_stop(handle);
3204 return retval; 3287 return retval;
3205} 3288}
3206 3289
3290static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3291 struct inode *new_dir, struct dentry *new_dentry)
3292{
3293 handle_t *handle = NULL;
3294 struct ext4_renament old = {
3295 .dir = old_dir,
3296 .dentry = old_dentry,
3297 .inode = old_dentry->d_inode,
3298 };
3299 struct ext4_renament new = {
3300 .dir = new_dir,
3301 .dentry = new_dentry,
3302 .inode = new_dentry->d_inode,
3303 };
3304 u8 new_file_type;
3305 int retval;
3306
3307 dquot_initialize(old.dir);
3308 dquot_initialize(new.dir);
3309
3310 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
3311 &old.de, &old.inlined);
3312 /*
3313 * Check for inode number is _not_ due to possible IO errors.
3314 * We might rmdir the source, keep it as pwd of some process
3315 * and merrily kill the link to whatever was created under the
3316 * same name. Goodbye sticky bit ;-<
3317 */
3318 retval = -ENOENT;
3319 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
3320 goto end_rename;
3321
3322 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3323 &new.de, &new.inlined);
3324
3325 /* RENAME_EXCHANGE case: old *and* new must both exist */
3326 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
3327 goto end_rename;
3328
3329 handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
3330 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3331 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3332 if (IS_ERR(handle))
3333 return PTR_ERR(handle);
3334
3335 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3336 ext4_handle_sync(handle);
3337
3338 if (S_ISDIR(old.inode->i_mode)) {
3339 old.is_dir = true;
3340 retval = ext4_rename_dir_prepare(handle, &old);
3341 if (retval)
3342 goto end_rename;
3343 }
3344 if (S_ISDIR(new.inode->i_mode)) {
3345 new.is_dir = true;
3346 retval = ext4_rename_dir_prepare(handle, &new);
3347 if (retval)
3348 goto end_rename;
3349 }
3350
3351 /*
3352 * Other than the special case of overwriting a directory, parents'
3353 * nlink only needs to be modified if this is a cross directory rename.
3354 */
3355 if (old.dir != new.dir && old.is_dir != new.is_dir) {
3356 old.dir_nlink_delta = old.is_dir ? -1 : 1;
3357 new.dir_nlink_delta = -old.dir_nlink_delta;
3358 retval = -EMLINK;
3359 if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
3360 (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
3361 goto end_rename;
3362 }
3363
3364 new_file_type = new.de->file_type;
3365 retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
3366 if (retval)
3367 goto end_rename;
3368
3369 retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
3370 if (retval)
3371 goto end_rename;
3372
3373 /*
3374 * Like most other Unix systems, set the ctime for inodes on a
3375 * rename.
3376 */
3377 old.inode->i_ctime = ext4_current_time(old.inode);
3378 new.inode->i_ctime = ext4_current_time(new.inode);
3379 ext4_mark_inode_dirty(handle, old.inode);
3380 ext4_mark_inode_dirty(handle, new.inode);
3381
3382 if (old.dir_bh) {
3383 retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
3384 if (retval)
3385 goto end_rename;
3386 }
3387 if (new.dir_bh) {
3388 retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
3389 if (retval)
3390 goto end_rename;
3391 }
3392 ext4_update_dir_count(handle, &old);
3393 ext4_update_dir_count(handle, &new);
3394 retval = 0;
3395
3396end_rename:
3397 brelse(old.dir_bh);
3398 brelse(new.dir_bh);
3399 brelse(old.bh);
3400 brelse(new.bh);
3401 if (handle)
3402 ext4_journal_stop(handle);
3403 return retval;
3404}
3405
3406static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
3407 struct inode *new_dir, struct dentry *new_dentry,
3408 unsigned int flags)
3409{
3410 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
3411 return -EINVAL;
3412
3413 if (flags & RENAME_EXCHANGE) {
3414 return ext4_cross_rename(old_dir, old_dentry,
3415 new_dir, new_dentry);
3416 }
3417 /*
3418	 * The VFS has already done the existence check for "RENAME_NOREPLACE",
3419	 * so here it is equivalent to a regular rename.
3420 */
3421 return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
3422}
3423
3207/* 3424/*
3208 * directories can handle most operations... 3425 * directories can handle most operations...
3209 */ 3426 */
@@ -3218,6 +3435,7 @@ const struct inode_operations ext4_dir_inode_operations = {
3218 .mknod = ext4_mknod, 3435 .mknod = ext4_mknod,
3219 .tmpfile = ext4_tmpfile, 3436 .tmpfile = ext4_tmpfile,
3220 .rename = ext4_rename, 3437 .rename = ext4_rename,
3438 .rename2 = ext4_rename2,
3221 .setattr = ext4_setattr, 3439 .setattr = ext4_setattr,
3222 .setxattr = generic_setxattr, 3440 .setxattr = generic_setxattr,
3223 .getxattr = generic_getxattr, 3441 .getxattr = generic_getxattr,
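
The ext4_cross_rename() path above is reached through the renameat2() system call with RENAME_EXCHANGE, which atomically swaps two existing directory entries. A minimal user-space sketch, assuming headers new enough to define SYS_renameat2 (the RENAME_* values below are the upstream UAPI ones, supplied only as a fallback; on a pre-3.15 kernel the call fails with ENOSYS):

/* swap two paths atomically via renameat2(RENAME_EXCHANGE) */
#include <fcntl.h>              /* AT_FDCWD */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)
#endif
#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1)
#endif

int main(int argc, char **argv)
{
        if (argc != 3) {
                fprintf(stderr, "usage: %s <path1> <path2>\n", argv[0]);
                return 1;
        }
        /* both names must already exist, matching the check above */
        if (syscall(SYS_renameat2, AT_FDCWD, argv[1],
                    AT_FDCWD, argv[2], RENAME_EXCHANGE) < 0) {
                perror("renameat2");
                return 1;
        }
        return 0;
}
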
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ab95508e3d40..c18d95b50540 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error)
308 if (error) { 308 if (error) {
309 struct inode *inode = io_end->inode; 309 struct inode *inode = io_end->inode;
310 310
311 ext4_warning(inode->i_sb, "I/O error writing to inode %lu " 311 ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
312 "(offset %llu size %ld starting block %llu)", 312 "(offset %llu size %ld starting block %llu)",
313 inode->i_ino, 313 error, inode->i_ino,
314 (unsigned long long) io_end->offset, 314 (unsigned long long) io_end->offset,
315 (long) io_end->size, 315 (long) io_end->size,
316 (unsigned long long) 316 (unsigned long long)
317 bi_sector >> (inode->i_blkbits - 9)); 317 bi_sector >> (inode->i_blkbits - 9));
318 mapping_set_error(inode->i_mapping, error);
318 } 319 }
319 320
320 if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 321 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
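
The mapping_set_error() call added here latches AS_EIO (or AS_ENOSPC) on the inode's address_space, so an asynchronous writeback failure is surfaced by a later fsync() instead of being lost once the warning scrolls by. A small user-space sketch of the visible effect; flush_and_check() is a hypothetical helper, not an ext4 interface:

#include <stdio.h>
#include <unistd.h>

/* the next fsync() after a failed async writeback reports the error */
int flush_and_check(int fd)
{
        if (fsync(fd) < 0) {
                perror("fsync");        /* e.g. EIO latched at end_bio time */
                return -1;
        }
        return 0;
}
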
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 710fed2377d4..6f9e6fadac04 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -59,6 +59,7 @@ static struct kset *ext4_kset;
59static struct ext4_lazy_init *ext4_li_info; 59static struct ext4_lazy_init *ext4_li_info;
60static struct mutex ext4_li_mtx; 60static struct mutex ext4_li_mtx;
61static struct ext4_features *ext4_feat; 61static struct ext4_features *ext4_feat;
62static int ext4_mballoc_ready;
62 63
63static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 64static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
64 unsigned long journal_devnum); 65 unsigned long journal_devnum);
@@ -845,6 +846,10 @@ static void ext4_put_super(struct super_block *sb)
845 invalidate_bdev(sbi->journal_bdev); 846 invalidate_bdev(sbi->journal_bdev);
846 ext4_blkdev_remove(sbi); 847 ext4_blkdev_remove(sbi);
847 } 848 }
849 if (sbi->s_mb_cache) {
850 ext4_xattr_destroy_cache(sbi->s_mb_cache);
851 sbi->s_mb_cache = NULL;
852 }
848 if (sbi->s_mmp_tsk) 853 if (sbi->s_mmp_tsk)
849 kthread_stop(sbi->s_mmp_tsk); 854 kthread_stop(sbi->s_mmp_tsk);
850 sb->s_fs_info = NULL; 855 sb->s_fs_info = NULL;
@@ -940,7 +945,7 @@ static void init_once(void *foo)
940 inode_init_once(&ei->vfs_inode); 945 inode_init_once(&ei->vfs_inode);
941} 946}
942 947
943static int init_inodecache(void) 948static int __init init_inodecache(void)
944{ 949{
945 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", 950 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
946 sizeof(struct ext4_inode_info), 951 sizeof(struct ext4_inode_info),
@@ -3575,6 +3580,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3575 "feature flags set on rev 0 fs, " 3580 "feature flags set on rev 0 fs, "
3576 "running e2fsck is recommended"); 3581 "running e2fsck is recommended");
3577 3582
3583 if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
3584 set_opt2(sb, HURD_COMPAT);
3585 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
3586 EXT4_FEATURE_INCOMPAT_64BIT)) {
3587 ext4_msg(sb, KERN_ERR,
3588 "The Hurd can't support 64-bit file systems");
3589 goto failed_mount;
3590 }
3591 }
3592
3578 if (IS_EXT2_SB(sb)) { 3593 if (IS_EXT2_SB(sb)) {
3579 if (ext2_feature_set_ok(sb)) 3594 if (ext2_feature_set_ok(sb))
3580 ext4_msg(sb, KERN_INFO, "mounting ext2 file system " 3595 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
@@ -3854,19 +3869,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3854 goto failed_mount2; 3869 goto failed_mount2;
3855 } 3870 }
3856 } 3871 }
3872
3873 /*
3874	 * set up enough so that it can read an inode,
3875	 * and create a new inode for the buddy allocator
3876 */
3877 sbi->s_gdb_count = db_count;
3878 if (!test_opt(sb, NOLOAD) &&
3879 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
3880 sb->s_op = &ext4_sops;
3881 else
3882 sb->s_op = &ext4_nojournal_sops;
3883
3884 ext4_ext_init(sb);
3885 err = ext4_mb_init(sb);
3886 if (err) {
3887 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
3888 err);
3889 goto failed_mount2;
3890 }
3891
3857 if (!ext4_check_descriptors(sb, &first_not_zeroed)) { 3892 if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
3858 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3893 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
3859 goto failed_mount2; 3894 goto failed_mount2a;
3860 } 3895 }
3861 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 3896 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
3862 if (!ext4_fill_flex_info(sb)) { 3897 if (!ext4_fill_flex_info(sb)) {
3863 ext4_msg(sb, KERN_ERR, 3898 ext4_msg(sb, KERN_ERR,
3864 "unable to initialize " 3899 "unable to initialize "
3865 "flex_bg meta info!"); 3900 "flex_bg meta info!");
3866 goto failed_mount2; 3901 goto failed_mount2a;
3867 } 3902 }
3868 3903
3869 sbi->s_gdb_count = db_count;
3870 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3904 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3871 spin_lock_init(&sbi->s_next_gen_lock); 3905 spin_lock_init(&sbi->s_next_gen_lock);
3872 3906
@@ -3901,14 +3935,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3901 sbi->s_stripe = ext4_get_stripe_size(sbi); 3935 sbi->s_stripe = ext4_get_stripe_size(sbi);
3902 sbi->s_extent_max_zeroout_kb = 32; 3936 sbi->s_extent_max_zeroout_kb = 32;
3903 3937
3904 /*
3905 * set up enough so that it can read an inode
3906 */
3907 if (!test_opt(sb, NOLOAD) &&
3908 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
3909 sb->s_op = &ext4_sops;
3910 else
3911 sb->s_op = &ext4_nojournal_sops;
3912 sb->s_export_op = &ext4_export_ops; 3938 sb->s_export_op = &ext4_export_ops;
3913 sb->s_xattr = ext4_xattr_handlers; 3939 sb->s_xattr = ext4_xattr_handlers;
3914#ifdef CONFIG_QUOTA 3940#ifdef CONFIG_QUOTA
@@ -4010,6 +4036,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4010 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); 4036 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
4011 4037
4012no_journal: 4038no_journal:
4039 if (ext4_mballoc_ready) {
4040 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
4041 if (!sbi->s_mb_cache) {
4042 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
4043 goto failed_mount_wq;
4044 }
4045 }
4046
4013 /* 4047 /*
4014 * Get the # of file system overhead blocks from the 4048 * Get the # of file system overhead blocks from the
4015 * superblock if present. 4049 * superblock if present.
@@ -4090,21 +4124,13 @@ no_journal:
4090 if (err) { 4124 if (err) {
4091 ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " 4125 ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
4092 "reserved pool", ext4_calculate_resv_clusters(sb)); 4126 "reserved pool", ext4_calculate_resv_clusters(sb));
4093 goto failed_mount4a; 4127 goto failed_mount5;
4094 } 4128 }
4095 4129
4096 err = ext4_setup_system_zone(sb); 4130 err = ext4_setup_system_zone(sb);
4097 if (err) { 4131 if (err) {
4098 ext4_msg(sb, KERN_ERR, "failed to initialize system " 4132 ext4_msg(sb, KERN_ERR, "failed to initialize system "
4099 "zone (%d)", err); 4133 "zone (%d)", err);
4100 goto failed_mount4a;
4101 }
4102
4103 ext4_ext_init(sb);
4104 err = ext4_mb_init(sb);
4105 if (err) {
4106 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
4107 err);
4108 goto failed_mount5; 4134 goto failed_mount5;
4109 } 4135 }
4110 4136
@@ -4181,11 +4207,8 @@ failed_mount8:
4181failed_mount7: 4207failed_mount7:
4182 ext4_unregister_li_request(sb); 4208 ext4_unregister_li_request(sb);
4183failed_mount6: 4209failed_mount6:
4184 ext4_mb_release(sb);
4185failed_mount5:
4186 ext4_ext_release(sb);
4187 ext4_release_system_zone(sb); 4210 ext4_release_system_zone(sb);
4188failed_mount4a: 4211failed_mount5:
4189 dput(sb->s_root); 4212 dput(sb->s_root);
4190 sb->s_root = NULL; 4213 sb->s_root = NULL;
4191failed_mount4: 4214failed_mount4:
@@ -4209,11 +4232,14 @@ failed_mount3:
4209 percpu_counter_destroy(&sbi->s_extent_cache_cnt); 4232 percpu_counter_destroy(&sbi->s_extent_cache_cnt);
4210 if (sbi->s_mmp_tsk) 4233 if (sbi->s_mmp_tsk)
4211 kthread_stop(sbi->s_mmp_tsk); 4234 kthread_stop(sbi->s_mmp_tsk);
4235failed_mount2a:
4236 ext4_mb_release(sb);
4212failed_mount2: 4237failed_mount2:
4213 for (i = 0; i < db_count; i++) 4238 for (i = 0; i < db_count; i++)
4214 brelse(sbi->s_group_desc[i]); 4239 brelse(sbi->s_group_desc[i]);
4215 ext4_kvfree(sbi->s_group_desc); 4240 ext4_kvfree(sbi->s_group_desc);
4216failed_mount: 4241failed_mount:
4242 ext4_ext_release(sb);
4217 if (sbi->s_chksum_driver) 4243 if (sbi->s_chksum_driver)
4218 crypto_free_shash(sbi->s_chksum_driver); 4244 crypto_free_shash(sbi->s_chksum_driver);
4219 if (sbi->s_proc) { 4245 if (sbi->s_proc) {
@@ -4835,6 +4861,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4835 } 4861 }
4836 4862
4837 if (*flags & MS_RDONLY) { 4863 if (*flags & MS_RDONLY) {
4864 err = sync_filesystem(sb);
4865 if (err < 0)
4866 goto restore_opts;
4838 err = dquot_suspend(sb, -1); 4867 err = dquot_suspend(sb, -1);
4839 if (err < 0) 4868 if (err < 0)
4840 goto restore_opts; 4869 goto restore_opts;
@@ -5516,11 +5545,9 @@ static int __init ext4_init_fs(void)
5516 5545
5517 err = ext4_init_mballoc(); 5546 err = ext4_init_mballoc();
5518 if (err) 5547 if (err)
5519 goto out3;
5520
5521 err = ext4_init_xattr();
5522 if (err)
5523 goto out2; 5548 goto out2;
5549 else
5550 ext4_mballoc_ready = 1;
5524 err = init_inodecache(); 5551 err = init_inodecache();
5525 if (err) 5552 if (err)
5526 goto out1; 5553 goto out1;
@@ -5536,10 +5563,9 @@ out:
5536 unregister_as_ext3(); 5563 unregister_as_ext3();
5537 destroy_inodecache(); 5564 destroy_inodecache();
5538out1: 5565out1:
5539 ext4_exit_xattr(); 5566 ext4_mballoc_ready = 0;
5540out2:
5541 ext4_exit_mballoc(); 5567 ext4_exit_mballoc();
5542out3: 5568out2:
5543 ext4_exit_feat_adverts(); 5569 ext4_exit_feat_adverts();
5544out4: 5570out4:
5545 if (ext4_proc_root) 5571 if (ext4_proc_root)
@@ -5562,7 +5588,6 @@ static void __exit ext4_exit_fs(void)
5562 unregister_as_ext3(); 5588 unregister_as_ext3();
5563 unregister_filesystem(&ext4_fs_type); 5589 unregister_filesystem(&ext4_fs_type);
5564 destroy_inodecache(); 5590 destroy_inodecache();
5565 ext4_exit_xattr();
5566 ext4_exit_mballoc(); 5591 ext4_exit_mballoc();
5567 ext4_exit_feat_adverts(); 5592 ext4_exit_feat_adverts();
5568 remove_proc_entry("fs/ext4", NULL); 5593 remove_proc_entry("fs/ext4", NULL);
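
Taken together, the super.c hunks pull mballoc and extent-tree setup ahead of the group-descriptor checks, rework the failed_mount labels to match, and give every mount its own xattr mb_cache in place of the old module-global one. A condensed sketch of the resulting per-superblock cache lifetime (error handling trimmed; the example_* wrappers are illustrative, the real code lives in ext4_fill_super() and ext4_put_super()):

static int example_fill_super(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* only once ext4_init_mballoc() succeeded at module init */
        if (ext4_mballoc_ready) {
                sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
                if (!sbi->s_mb_cache)
                        return -ENOMEM;
        }
        return 0;
}

static void example_put_super(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* torn down before the journal block device goes away */
        if (sbi->s_mb_cache) {
                ext4_xattr_destroy_cache(sbi->s_mb_cache);
                sbi->s_mb_cache = NULL;
        }
}
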
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e175e94116ac..4eec399ec807 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,7 +81,7 @@
81# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) 81# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
82#endif 82#endif
83 83
84static void ext4_xattr_cache_insert(struct buffer_head *); 84static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
85static struct buffer_head *ext4_xattr_cache_find(struct inode *, 85static struct buffer_head *ext4_xattr_cache_find(struct inode *,
86 struct ext4_xattr_header *, 86 struct ext4_xattr_header *,
87 struct mb_cache_entry **); 87 struct mb_cache_entry **);
@@ -90,8 +90,6 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
90static int ext4_xattr_list(struct dentry *dentry, char *buffer, 90static int ext4_xattr_list(struct dentry *dentry, char *buffer,
91 size_t buffer_size); 91 size_t buffer_size);
92 92
93static struct mb_cache *ext4_xattr_cache;
94
95static const struct xattr_handler *ext4_xattr_handler_map[] = { 93static const struct xattr_handler *ext4_xattr_handler_map[] = {
96 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 94 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
97#ifdef CONFIG_EXT4_FS_POSIX_ACL 95#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
117 NULL 115 NULL
118}; 116};
119 117
118#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \
119 inode->i_sb->s_fs_info)->s_mb_cache)
120
120static __le32 ext4_xattr_block_csum(struct inode *inode, 121static __le32 ext4_xattr_block_csum(struct inode *inode,
121 sector_t block_nr, 122 sector_t block_nr,
122 struct ext4_xattr_header *hdr) 123 struct ext4_xattr_header *hdr)
@@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
265 struct ext4_xattr_entry *entry; 266 struct ext4_xattr_entry *entry;
266 size_t size; 267 size_t size;
267 int error; 268 int error;
269 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
268 270
269 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", 271 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
270 name_index, name, buffer, (long)buffer_size); 272 name_index, name, buffer, (long)buffer_size);
@@ -286,7 +288,7 @@ bad_block:
286 error = -EIO; 288 error = -EIO;
287 goto cleanup; 289 goto cleanup;
288 } 290 }
289 ext4_xattr_cache_insert(bh); 291 ext4_xattr_cache_insert(ext4_mb_cache, bh);
290 entry = BFIRST(bh); 292 entry = BFIRST(bh);
291 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); 293 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
292 if (error == -EIO) 294 if (error == -EIO)
@@ -409,6 +411,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
409 struct inode *inode = dentry->d_inode; 411 struct inode *inode = dentry->d_inode;
410 struct buffer_head *bh = NULL; 412 struct buffer_head *bh = NULL;
411 int error; 413 int error;
414 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
412 415
413 ea_idebug(inode, "buffer=%p, buffer_size=%ld", 416 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
414 buffer, (long)buffer_size); 417 buffer, (long)buffer_size);
@@ -430,7 +433,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
430 error = -EIO; 433 error = -EIO;
431 goto cleanup; 434 goto cleanup;
432 } 435 }
433 ext4_xattr_cache_insert(bh); 436 ext4_xattr_cache_insert(ext4_mb_cache, bh);
434 error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); 437 error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
435 438
436cleanup: 439cleanup:
@@ -517,8 +520,8 @@ static void ext4_xattr_update_super_block(handle_t *handle,
517} 520}
518 521
519/* 522/*
520 * Release the xattr block BH: If the reference count is > 1, decrement 523 * Release the xattr block BH: If the reference count is > 1, decrement it;
521 * it; otherwise free the block. 524 * otherwise free the block.
522 */ 525 */
523static void 526static void
524ext4_xattr_release_block(handle_t *handle, struct inode *inode, 527ext4_xattr_release_block(handle_t *handle, struct inode *inode,
@@ -526,8 +529,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
526{ 529{
527 struct mb_cache_entry *ce = NULL; 530 struct mb_cache_entry *ce = NULL;
528 int error = 0; 531 int error = 0;
532 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
529 533
530 ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); 534 ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
531 error = ext4_journal_get_write_access(handle, bh); 535 error = ext4_journal_get_write_access(handle, bh);
532 if (error) 536 if (error)
533 goto out; 537 goto out;
@@ -538,16 +542,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
538 if (ce) 542 if (ce)
539 mb_cache_entry_free(ce); 543 mb_cache_entry_free(ce);
540 get_bh(bh); 544 get_bh(bh);
545 unlock_buffer(bh);
541 ext4_free_blocks(handle, inode, bh, 0, 1, 546 ext4_free_blocks(handle, inode, bh, 0, 1,
542 EXT4_FREE_BLOCKS_METADATA | 547 EXT4_FREE_BLOCKS_METADATA |
543 EXT4_FREE_BLOCKS_FORGET); 548 EXT4_FREE_BLOCKS_FORGET);
544 unlock_buffer(bh);
545 } else { 549 } else {
546 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 550 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
547 if (ce) 551 if (ce)
548 mb_cache_entry_release(ce); 552 mb_cache_entry_release(ce);
553 /*
554 * Beware of this ugliness: Releasing of xattr block references
555 * from different inodes can race and so we have to protect
556 * from a race where someone else frees the block (and releases
557 * its journal_head) before we are done dirtying the buffer. In
558 * nojournal mode this race is harmless and we actually cannot
559 * call ext4_handle_dirty_xattr_block() with locked buffer as
560 * that function can call sync_dirty_buffer() so for that case
561 * we handle the dirtying after unlocking the buffer.
562 */
563 if (ext4_handle_valid(handle))
564 error = ext4_handle_dirty_xattr_block(handle, inode,
565 bh);
549 unlock_buffer(bh); 566 unlock_buffer(bh);
550 error = ext4_handle_dirty_xattr_block(handle, inode, bh); 567 if (!ext4_handle_valid(handle))
568 error = ext4_handle_dirty_xattr_block(handle, inode,
569 bh);
551 if (IS_SYNC(inode)) 570 if (IS_SYNC(inode))
552 ext4_handle_sync(handle); 571 ext4_handle_sync(handle);
553 dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); 572 dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
@@ -567,12 +586,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
567 size_t *min_offs, void *base, int *total) 586 size_t *min_offs, void *base, int *total)
568{ 587{
569 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { 588 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
570 *total += EXT4_XATTR_LEN(last->e_name_len);
571 if (!last->e_value_block && last->e_value_size) { 589 if (!last->e_value_block && last->e_value_size) {
572 size_t offs = le16_to_cpu(last->e_value_offs); 590 size_t offs = le16_to_cpu(last->e_value_offs);
573 if (offs < *min_offs) 591 if (offs < *min_offs)
574 *min_offs = offs; 592 *min_offs = offs;
575 } 593 }
594 if (total)
595 *total += EXT4_XATTR_LEN(last->e_name_len);
576 } 596 }
577 return (*min_offs - ((void *)last - base) - sizeof(__u32)); 597 return (*min_offs - ((void *)last - base) - sizeof(__u32));
578} 598}
@@ -745,13 +765,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
745 struct ext4_xattr_search *s = &bs->s; 765 struct ext4_xattr_search *s = &bs->s;
746 struct mb_cache_entry *ce = NULL; 766 struct mb_cache_entry *ce = NULL;
747 int error = 0; 767 int error = 0;
768 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
748 769
749#define header(x) ((struct ext4_xattr_header *)(x)) 770#define header(x) ((struct ext4_xattr_header *)(x))
750 771
751 if (i->value && i->value_len > sb->s_blocksize) 772 if (i->value && i->value_len > sb->s_blocksize)
752 return -ENOSPC; 773 return -ENOSPC;
753 if (s->base) { 774 if (s->base) {
754 ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, 775 ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
755 bs->bh->b_blocknr); 776 bs->bh->b_blocknr);
756 error = ext4_journal_get_write_access(handle, bs->bh); 777 error = ext4_journal_get_write_access(handle, bs->bh);
757 if (error) 778 if (error)
@@ -769,7 +790,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
769 if (!IS_LAST_ENTRY(s->first)) 790 if (!IS_LAST_ENTRY(s->first))
770 ext4_xattr_rehash(header(s->base), 791 ext4_xattr_rehash(header(s->base),
771 s->here); 792 s->here);
772 ext4_xattr_cache_insert(bs->bh); 793 ext4_xattr_cache_insert(ext4_mb_cache,
794 bs->bh);
773 } 795 }
774 unlock_buffer(bs->bh); 796 unlock_buffer(bs->bh);
775 if (error == -EIO) 797 if (error == -EIO)
@@ -905,7 +927,7 @@ getblk_failed:
905 memcpy(new_bh->b_data, s->base, new_bh->b_size); 927 memcpy(new_bh->b_data, s->base, new_bh->b_size);
906 set_buffer_uptodate(new_bh); 928 set_buffer_uptodate(new_bh);
907 unlock_buffer(new_bh); 929 unlock_buffer(new_bh);
908 ext4_xattr_cache_insert(new_bh); 930 ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
909 error = ext4_handle_dirty_xattr_block(handle, 931 error = ext4_handle_dirty_xattr_block(handle,
910 inode, new_bh); 932 inode, new_bh);
911 if (error) 933 if (error)
@@ -1228,7 +1250,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
1228 struct ext4_xattr_block_find *bs = NULL; 1250 struct ext4_xattr_block_find *bs = NULL;
1229 char *buffer = NULL, *b_entry_name = NULL; 1251 char *buffer = NULL, *b_entry_name = NULL;
1230 size_t min_offs, free; 1252 size_t min_offs, free;
1231 int total_ino, total_blk; 1253 int total_ino;
1232 void *base, *start, *end; 1254 void *base, *start, *end;
1233 int extra_isize = 0, error = 0, tried_min_extra_isize = 0; 1255 int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
1234 int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); 1256 int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1286,8 +1308,7 @@ retry:
1286 first = BFIRST(bh); 1308 first = BFIRST(bh);
1287 end = bh->b_data + bh->b_size; 1309 end = bh->b_data + bh->b_size;
1288 min_offs = end - base; 1310 min_offs = end - base;
1289 free = ext4_xattr_free_space(first, &min_offs, base, 1311 free = ext4_xattr_free_space(first, &min_offs, base, NULL);
1290 &total_blk);
1291 if (free < new_extra_isize) { 1312 if (free < new_extra_isize) {
1292 if (!tried_min_extra_isize && s_min_extra_isize) { 1313 if (!tried_min_extra_isize && s_min_extra_isize) {
1293 tried_min_extra_isize++; 1314 tried_min_extra_isize++;
@@ -1495,13 +1516,13 @@ ext4_xattr_put_super(struct super_block *sb)
1495 * Returns 0, or a negative error number on failure. 1516 * Returns 0, or a negative error number on failure.
1496 */ 1517 */
1497static void 1518static void
1498ext4_xattr_cache_insert(struct buffer_head *bh) 1519ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
1499{ 1520{
1500 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); 1521 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1501 struct mb_cache_entry *ce; 1522 struct mb_cache_entry *ce;
1502 int error; 1523 int error;
1503 1524
1504 ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); 1525 ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
1505 if (!ce) { 1526 if (!ce) {
1506 ea_bdebug(bh, "out of memory"); 1527 ea_bdebug(bh, "out of memory");
1507 return; 1528 return;
@@ -1573,12 +1594,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1573{ 1594{
1574 __u32 hash = le32_to_cpu(header->h_hash); 1595 __u32 hash = le32_to_cpu(header->h_hash);
1575 struct mb_cache_entry *ce; 1596 struct mb_cache_entry *ce;
1597 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
1576 1598
1577 if (!header->h_hash) 1599 if (!header->h_hash)
1578 return NULL; /* never share */ 1600 return NULL; /* never share */
1579 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1601 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1580again: 1602again:
1581 ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, 1603 ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
1582 hash); 1604 hash);
1583 while (ce) { 1605 while (ce) {
1584 struct buffer_head *bh; 1606 struct buffer_head *bh;
@@ -1676,19 +1698,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1676 1698
1677#undef BLOCK_HASH_SHIFT 1699#undef BLOCK_HASH_SHIFT
1678 1700
1679int __init 1701#define HASH_BUCKET_BITS 10
1680ext4_init_xattr(void) 1702
1703struct mb_cache *
1704ext4_xattr_create_cache(char *name)
1681{ 1705{
1682 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); 1706 return mb_cache_create(name, HASH_BUCKET_BITS);
1683 if (!ext4_xattr_cache)
1684 return -ENOMEM;
1685 return 0;
1686} 1707}
1687 1708
1688void 1709void ext4_xattr_destroy_cache(struct mb_cache *cache)
1689ext4_exit_xattr(void)
1690{ 1710{
1691 if (ext4_xattr_cache) 1711 if (cache)
1692 mb_cache_destroy(ext4_xattr_cache); 1712 mb_cache_destroy(cache);
1693 ext4_xattr_cache = NULL;
1694} 1713}
1714
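
Every cache operation in this file now goes through the per-superblock cache returned by EXT4_GET_MB_CACHE() instead of the removed ext4_xattr_cache global. The insert path keeps the usual mb_cache shape; a hedged sketch with the signatures fs/mbcache.c exposes in this era (a failed insert only loses a sharing opportunity, it is never an error):

static void example_cache_block(struct mb_cache *cache,
                                struct block_device *bdev,
                                sector_t block, __u32 hash)
{
        struct mb_cache_entry *ce;

        ce = mb_cache_entry_alloc(cache, GFP_NOFS);
        if (!ce)
                return;                         /* cache miss is harmless */
        if (mb_cache_entry_insert(ce, bdev, block, hash))
                mb_cache_entry_free(ce);        /* e.g. -EBUSY: already cached */
        else
                mb_cache_entry_release(ce);
}
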
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 819d6398833f..29bedf5589f6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -110,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *);
110extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 110extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
111 struct ext4_inode *raw_inode, handle_t *handle); 111 struct ext4_inode *raw_inode, handle_t *handle);
112 112
113extern int __init ext4_init_xattr(void);
114extern void ext4_exit_xattr(void);
115
116extern const struct xattr_handler *ext4_xattr_handlers[]; 113extern const struct xattr_handler *ext4_xattr_handlers[];
117 114
118extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, 115extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
@@ -124,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
124 struct ext4_xattr_info *i, 121 struct ext4_xattr_info *i,
125 struct ext4_xattr_ibody_find *is); 122 struct ext4_xattr_ibody_find *is);
126 123
124extern struct mb_cache *ext4_xattr_create_cache(char *name);
125extern void ext4_xattr_destroy_cache(struct mb_cache *);
126
127#ifdef CONFIG_EXT4_FS_SECURITY 127#ifdef CONFIG_EXT4_FS_SECURITY
128extern int ext4_init_security(handle_t *handle, struct inode *inode, 128extern int ext4_init_security(handle_t *handle, struct inode *inode,
129 struct inode *dir, const struct qstr *qstr); 129 struct inode *dir, const struct qstr *qstr);
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index fa8da4cb8c4b..e93e4ec7d165 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -174,7 +174,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
174 174
175 retval = f2fs_getxattr(inode, name_index, "", NULL, 0); 175 retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
176 if (retval > 0) { 176 if (retval > 0) {
177 value = kmalloc(retval, GFP_KERNEL); 177 value = kmalloc(retval, GFP_F2FS_ZERO);
178 if (!value) 178 if (!value)
179 return ERR_PTR(-ENOMEM); 179 return ERR_PTR(-ENOMEM);
180 retval = f2fs_getxattr(inode, name_index, "", value, retval); 180 retval = f2fs_getxattr(inode, name_index, "", value, retval);
@@ -203,6 +203,12 @@ static int __f2fs_set_acl(struct inode *inode, int type,
203 size_t size = 0; 203 size_t size = 0;
204 int error; 204 int error;
205 205
206 if (acl) {
207 error = posix_acl_valid(acl);
208 if (error < 0)
209 return error;
210 }
211
206 switch (type) { 212 switch (type) {
207 case ACL_TYPE_ACCESS: 213 case ACL_TYPE_ACCESS:
208 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; 214 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
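
The added posix_acl_valid() call rejects malformed ACLs (bad tag ordering, duplicate entries, missing mask) before they are serialized into an f2fs xattr. A minimal sketch of the guard, using the 3.15-era single-argument signature; example_set_acl() is illustrative, not the driver entry point:

static int example_set_acl(struct inode *inode, struct posix_acl *acl)
{
        int error;

        if (acl) {
                error = posix_acl_valid(acl);
                if (error < 0)
                        return error;
        }
        /* ... translate the validated ACL into an on-disk xattr ... */
        return 0;
}
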
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 293d0486a40f..4aa521aa9bc3 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -33,14 +33,12 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
33 struct address_space *mapping = META_MAPPING(sbi); 33 struct address_space *mapping = META_MAPPING(sbi);
34 struct page *page = NULL; 34 struct page *page = NULL;
35repeat: 35repeat:
36 page = grab_cache_page(mapping, index); 36 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
37 if (!page) { 37 if (!page) {
38 cond_resched(); 38 cond_resched();
39 goto repeat; 39 goto repeat;
40 } 40 }
41 41
42 /* We wait writeback only inside grab_meta_page() */
43 wait_on_page_writeback(page);
44 SetPageUptodate(page); 42 SetPageUptodate(page);
45 return page; 43 return page;
46} 44}
@@ -75,23 +73,102 @@ out:
75 return page; 73 return page;
76} 74}
77 75
76inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
77{
78 switch (type) {
79 case META_NAT:
80 return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK;
81 case META_SIT:
82 return SIT_BLK_CNT(sbi);
83 case META_SSA:
84 case META_CP:
85 return 0;
86 default:
87 BUG();
88 }
89}
90
91/*
92 * Readahead CP/NAT/SIT/SSA pages
93 */
94int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
95{
96 block_t prev_blk_addr = 0;
97 struct page *page;
98 int blkno = start;
99 int max_blks = get_max_meta_blks(sbi, type);
100
101 struct f2fs_io_info fio = {
102 .type = META,
103 .rw = READ_SYNC | REQ_META | REQ_PRIO
104 };
105
106 for (; nrpages-- > 0; blkno++) {
107 block_t blk_addr;
108
109 switch (type) {
110 case META_NAT:
111 /* get nat block addr */
112 if (unlikely(blkno >= max_blks))
113 blkno = 0;
114 blk_addr = current_nat_addr(sbi,
115 blkno * NAT_ENTRY_PER_BLOCK);
116 break;
117 case META_SIT:
118 /* get sit block addr */
119 if (unlikely(blkno >= max_blks))
120 goto out;
121 blk_addr = current_sit_addr(sbi,
122 blkno * SIT_ENTRY_PER_BLOCK);
123 if (blkno != start && prev_blk_addr + 1 != blk_addr)
124 goto out;
125 prev_blk_addr = blk_addr;
126 break;
127 case META_SSA:
128 case META_CP:
129 /* get ssa/cp block addr */
130 blk_addr = blkno;
131 break;
132 default:
133 BUG();
134 }
135
136 page = grab_cache_page(META_MAPPING(sbi), blk_addr);
137 if (!page)
138 continue;
139 if (PageUptodate(page)) {
140 mark_page_accessed(page);
141 f2fs_put_page(page, 1);
142 continue;
143 }
144
145 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
146 mark_page_accessed(page);
147 f2fs_put_page(page, 0);
148 }
149out:
150 f2fs_submit_merged_bio(sbi, META, READ);
151 return blkno - start;
152}
153
78static int f2fs_write_meta_page(struct page *page, 154static int f2fs_write_meta_page(struct page *page,
79 struct writeback_control *wbc) 155 struct writeback_control *wbc)
80{ 156{
81 struct inode *inode = page->mapping->host; 157 struct inode *inode = page->mapping->host;
82 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 158 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
83 159
84 /* Should not write any meta pages, if any IO error was occurred */ 160 if (unlikely(sbi->por_doing))
85 if (unlikely(sbi->por_doing ||
86 is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
87 goto redirty_out; 161 goto redirty_out;
88
89 if (wbc->for_reclaim) 162 if (wbc->for_reclaim)
90 goto redirty_out; 163 goto redirty_out;
91 164
92	wait_on_page_writeback(page);	165	/* Should not write any meta pages if an IO error has occurred */
166 if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
167 goto no_write;
93 168
169 f2fs_wait_on_page_writeback(page, META);
94 write_meta_page(sbi, page); 170 write_meta_page(sbi, page);
171no_write:
95 dec_page_count(sbi, F2FS_DIRTY_META); 172 dec_page_count(sbi, F2FS_DIRTY_META);
96 unlock_page(page); 173 unlock_page(page);
97 return 0; 174 return 0;
@@ -99,6 +176,7 @@ static int f2fs_write_meta_page(struct page *page,
99redirty_out: 176redirty_out:
100 dec_page_count(sbi, F2FS_DIRTY_META); 177 dec_page_count(sbi, F2FS_DIRTY_META);
101 wbc->pages_skipped++; 178 wbc->pages_skipped++;
179 account_page_redirty(page);
102 set_page_dirty(page); 180 set_page_dirty(page);
103 return AOP_WRITEPAGE_ACTIVATE; 181 return AOP_WRITEPAGE_ACTIVATE;
104} 182}
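
ra_meta_pages() batches consecutive meta block reads into one merged bio and submits it once at the end, so callers can prefetch a whole on-disk region before walking it page by page. A usage sketch, mirroring the call recover_orphan_inodes() gains later in this patch:

        /* prefetch all orphan blocks, then read each from the page cache */
        ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
        for (i = 0; i < orphan_blkaddr; i++) {
                struct page *page = get_meta_page(sbi, start_blk + i);
                /* ... parse one f2fs_orphan_block ... */
                f2fs_put_page(page, 1);
        }
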
@@ -107,21 +185,23 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
107 struct writeback_control *wbc) 185 struct writeback_control *wbc)
108{ 186{
109 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 187 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
110 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 188 long diff, written;
111 long written;
112
113 if (wbc->for_kupdate)
114 return 0;
115 189
116 /* collect a number of dirty meta pages and write together */ 190 /* collect a number of dirty meta pages and write together */
117 if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) 191 if (wbc->for_kupdate ||
118 return 0; 192 get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
193 goto skip_write;
119 194
120	/* if mounting has failed, skip writing meta pages */	195	/* if mounting has failed, skip writing meta pages */
121 mutex_lock(&sbi->cp_mutex); 196 mutex_lock(&sbi->cp_mutex);
122 written = sync_meta_pages(sbi, META, nrpages); 197 diff = nr_pages_to_write(sbi, META, wbc);
198 written = sync_meta_pages(sbi, META, wbc->nr_to_write);
123 mutex_unlock(&sbi->cp_mutex); 199 mutex_unlock(&sbi->cp_mutex);
124 wbc->nr_to_write -= written; 200 wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
201 return 0;
202
203skip_write:
204 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
125 return 0; 205 return 0;
126} 206}
127 207
@@ -148,10 +228,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
148 228
149 for (i = 0; i < nr_pages; i++) { 229 for (i = 0; i < nr_pages; i++) {
150 struct page *page = pvec.pages[i]; 230 struct page *page = pvec.pages[i];
231
151 lock_page(page); 232 lock_page(page);
152 f2fs_bug_on(page->mapping != mapping); 233
153 f2fs_bug_on(!PageDirty(page)); 234 if (unlikely(page->mapping != mapping)) {
154 clear_page_dirty_for_io(page); 235continue_unlock:
236 unlock_page(page);
237 continue;
238 }
239 if (!PageDirty(page)) {
240 /* someone wrote it for us */
241 goto continue_unlock;
242 }
243
244 if (!clear_page_dirty_for_io(page))
245 goto continue_unlock;
246
155 if (f2fs_write_meta_page(page, &wbc)) { 247 if (f2fs_write_meta_page(page, &wbc)) {
156 unlock_page(page); 248 unlock_page(page);
157 break; 249 break;
@@ -216,16 +308,15 @@ void release_orphan_inode(struct f2fs_sb_info *sbi)
216 308
217void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 309void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
218{ 310{
219 struct list_head *head, *this; 311 struct list_head *head;
220 struct orphan_inode_entry *new = NULL, *orphan = NULL; 312 struct orphan_inode_entry *new, *orphan;
221 313
222 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); 314 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
223 new->ino = ino; 315 new->ino = ino;
224 316
225 spin_lock(&sbi->orphan_inode_lock); 317 spin_lock(&sbi->orphan_inode_lock);
226 head = &sbi->orphan_inode_list; 318 head = &sbi->orphan_inode_list;
227 list_for_each(this, head) { 319 list_for_each_entry(orphan, head, list) {
228 orphan = list_entry(this, struct orphan_inode_entry, list);
229 if (orphan->ino == ino) { 320 if (orphan->ino == ino) {
230 spin_unlock(&sbi->orphan_inode_lock); 321 spin_unlock(&sbi->orphan_inode_lock);
231 kmem_cache_free(orphan_entry_slab, new); 322 kmem_cache_free(orphan_entry_slab, new);
@@ -234,14 +325,10 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
234 325
235 if (orphan->ino > ino) 326 if (orphan->ino > ino)
236 break; 327 break;
237 orphan = NULL;
238 } 328 }
239 329
240	/* add new_oentry into list which is sorted by inode number */	330	/* add the new orphan entry into the list, which is sorted by inode number */
241 if (orphan) 331 list_add_tail(&new->list, &orphan->list);
242 list_add(&new->list, this->prev);
243 else
244 list_add_tail(&new->list, head);
245 spin_unlock(&sbi->orphan_inode_lock); 332 spin_unlock(&sbi->orphan_inode_lock);
246} 333}
247 334
@@ -255,10 +342,11 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
255 list_for_each_entry(orphan, head, list) { 342 list_for_each_entry(orphan, head, list) {
256 if (orphan->ino == ino) { 343 if (orphan->ino == ino) {
257 list_del(&orphan->list); 344 list_del(&orphan->list);
258 kmem_cache_free(orphan_entry_slab, orphan);
259 f2fs_bug_on(sbi->n_orphans == 0); 345 f2fs_bug_on(sbi->n_orphans == 0);
260 sbi->n_orphans--; 346 sbi->n_orphans--;
261 break; 347 spin_unlock(&sbi->orphan_inode_lock);
348 kmem_cache_free(orphan_entry_slab, orphan);
349 return;
262 } 350 }
263 } 351 }
264 spin_unlock(&sbi->orphan_inode_lock); 352 spin_unlock(&sbi->orphan_inode_lock);
@@ -285,6 +373,8 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
285 start_blk = __start_cp_addr(sbi) + 1; 373 start_blk = __start_cp_addr(sbi) + 1;
286 orphan_blkaddr = __start_sum_addr(sbi) - 1; 374 orphan_blkaddr = __start_sum_addr(sbi) - 1;
287 375
376 ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
377
288 for (i = 0; i < orphan_blkaddr; i++) { 378 for (i = 0; i < orphan_blkaddr; i++) {
289 struct page *page = get_meta_page(sbi, start_blk + i); 379 struct page *page = get_meta_page(sbi, start_blk + i);
290 struct f2fs_orphan_block *orphan_blk; 380 struct f2fs_orphan_block *orphan_blk;
@@ -466,14 +556,12 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
466{ 556{
467 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 557 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
468 struct list_head *head = &sbi->dir_inode_list; 558 struct list_head *head = &sbi->dir_inode_list;
469 struct list_head *this; 559 struct dir_inode_entry *entry;
470 560
471 list_for_each(this, head) { 561 list_for_each_entry(entry, head, list)
472 struct dir_inode_entry *entry;
473 entry = list_entry(this, struct dir_inode_entry, list);
474 if (unlikely(entry->inode == inode)) 562 if (unlikely(entry->inode == inode))
475 return -EEXIST; 563 return -EEXIST;
476 } 564
477 list_add_tail(&new->list, head); 565 list_add_tail(&new->list, head);
478 stat_inc_dirty_dir(sbi); 566 stat_inc_dirty_dir(sbi);
479 return 0; 567 return 0;
@@ -483,6 +571,7 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
483{ 571{
484 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 572 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
485 struct dir_inode_entry *new; 573 struct dir_inode_entry *new;
574 int ret = 0;
486 575
487 if (!S_ISDIR(inode->i_mode)) 576 if (!S_ISDIR(inode->i_mode))
488 return; 577 return;
@@ -492,13 +581,13 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
492 INIT_LIST_HEAD(&new->list); 581 INIT_LIST_HEAD(&new->list);
493 582
494 spin_lock(&sbi->dir_inode_lock); 583 spin_lock(&sbi->dir_inode_lock);
495 if (__add_dirty_inode(inode, new)) 584 ret = __add_dirty_inode(inode, new);
496 kmem_cache_free(inode_entry_slab, new);
497
498 inc_page_count(sbi, F2FS_DIRTY_DENTS);
499 inode_inc_dirty_dents(inode); 585 inode_inc_dirty_dents(inode);
500 SetPagePrivate(page); 586 SetPagePrivate(page);
501 spin_unlock(&sbi->dir_inode_lock); 587 spin_unlock(&sbi->dir_inode_lock);
588
589 if (ret)
590 kmem_cache_free(inode_entry_slab, new);
502} 591}
503 592
504void add_dirty_dir_inode(struct inode *inode) 593void add_dirty_dir_inode(struct inode *inode)
@@ -506,44 +595,47 @@ void add_dirty_dir_inode(struct inode *inode)
506 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 595 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
507 struct dir_inode_entry *new = 596 struct dir_inode_entry *new =
508 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 597 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
598 int ret = 0;
509 599
510 new->inode = inode; 600 new->inode = inode;
511 INIT_LIST_HEAD(&new->list); 601 INIT_LIST_HEAD(&new->list);
512 602
513 spin_lock(&sbi->dir_inode_lock); 603 spin_lock(&sbi->dir_inode_lock);
514 if (__add_dirty_inode(inode, new)) 604 ret = __add_dirty_inode(inode, new);
515 kmem_cache_free(inode_entry_slab, new);
516 spin_unlock(&sbi->dir_inode_lock); 605 spin_unlock(&sbi->dir_inode_lock);
606
607 if (ret)
608 kmem_cache_free(inode_entry_slab, new);
517} 609}
518 610
519void remove_dirty_dir_inode(struct inode *inode) 611void remove_dirty_dir_inode(struct inode *inode)
520{ 612{
521 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 613 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
522 614 struct list_head *head;
523 struct list_head *this, *head; 615 struct dir_inode_entry *entry;
524 616
525 if (!S_ISDIR(inode->i_mode)) 617 if (!S_ISDIR(inode->i_mode))
526 return; 618 return;
527 619
528 spin_lock(&sbi->dir_inode_lock); 620 spin_lock(&sbi->dir_inode_lock);
529 if (atomic_read(&F2FS_I(inode)->dirty_dents)) { 621 if (get_dirty_dents(inode)) {
530 spin_unlock(&sbi->dir_inode_lock); 622 spin_unlock(&sbi->dir_inode_lock);
531 return; 623 return;
532 } 624 }
533 625
534 head = &sbi->dir_inode_list; 626 head = &sbi->dir_inode_list;
535 list_for_each(this, head) { 627 list_for_each_entry(entry, head, list) {
536 struct dir_inode_entry *entry;
537 entry = list_entry(this, struct dir_inode_entry, list);
538 if (entry->inode == inode) { 628 if (entry->inode == inode) {
539 list_del(&entry->list); 629 list_del(&entry->list);
540 kmem_cache_free(inode_entry_slab, entry);
541 stat_dec_dirty_dir(sbi); 630 stat_dec_dirty_dir(sbi);
542 break; 631 spin_unlock(&sbi->dir_inode_lock);
632 kmem_cache_free(inode_entry_slab, entry);
633 goto done;
543 } 634 }
544 } 635 }
545 spin_unlock(&sbi->dir_inode_lock); 636 spin_unlock(&sbi->dir_inode_lock);
546 637
638done:
547 /* Only from the recovery routine */ 639 /* Only from the recovery routine */
548 if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { 640 if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
549 clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); 641 clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
@@ -554,15 +646,14 @@ void remove_dirty_dir_inode(struct inode *inode)
554struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) 646struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
555{ 647{
556 648
557 struct list_head *this, *head; 649 struct list_head *head;
558 struct inode *inode = NULL; 650 struct inode *inode = NULL;
651 struct dir_inode_entry *entry;
559 652
560 spin_lock(&sbi->dir_inode_lock); 653 spin_lock(&sbi->dir_inode_lock);
561 654
562 head = &sbi->dir_inode_list; 655 head = &sbi->dir_inode_list;
563 list_for_each(this, head) { 656 list_for_each_entry(entry, head, list) {
564 struct dir_inode_entry *entry;
565 entry = list_entry(this, struct dir_inode_entry, list);
566 if (entry->inode->i_ino == ino) { 657 if (entry->inode->i_ino == ino) {
567 inode = entry->inode; 658 inode = entry->inode;
568 break; 659 break;
@@ -589,7 +680,7 @@ retry:
589 inode = igrab(entry->inode); 680 inode = igrab(entry->inode);
590 spin_unlock(&sbi->dir_inode_lock); 681 spin_unlock(&sbi->dir_inode_lock);
591 if (inode) { 682 if (inode) {
592 filemap_flush(inode->i_mapping); 683 filemap_fdatawrite(inode->i_mapping);
593 iput(inode); 684 iput(inode);
594 } else { 685 } else {
595 /* 686 /*
@@ -824,6 +915,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
824 unblock_operations(sbi); 915 unblock_operations(sbi);
825 mutex_unlock(&sbi->cp_mutex); 916 mutex_unlock(&sbi->cp_mutex);
826 917
918 stat_inc_cp_count(sbi->stat_info);
827 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); 919 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
828} 920}
829 921
@@ -845,11 +937,11 @@ void init_orphan_info(struct f2fs_sb_info *sbi)
845int __init create_checkpoint_caches(void) 937int __init create_checkpoint_caches(void)
846{ 938{
847 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", 939 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
848 sizeof(struct orphan_inode_entry), NULL); 940 sizeof(struct orphan_inode_entry));
849 if (!orphan_entry_slab) 941 if (!orphan_entry_slab)
850 return -ENOMEM; 942 return -ENOMEM;
851 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", 943 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
852 sizeof(struct dir_inode_entry), NULL); 944 sizeof(struct dir_inode_entry));
853 if (!inode_entry_slab) { 945 if (!inode_entry_slab) {
854 kmem_cache_destroy(orphan_entry_slab); 946 kmem_cache_destroy(orphan_entry_slab);
855 return -ENOMEM; 947 return -ENOMEM;
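
Several of the checkpoint.c hunks share one shape: convert open-coded list_for_each() walks to list_for_each_entry(), unlink under the spinlock, and push kmem_cache_free() out of the critical section. A hedged sketch of that shape with the types used above (example_remove_orphan() is illustrative):

static void example_remove_orphan(spinlock_t *lock, struct list_head *head,
                                  nid_t ino, struct kmem_cache *slab)
{
        struct orphan_inode_entry *e;

        spin_lock(lock);
        list_for_each_entry(e, head, list) {
                if (e->ino == ino) {
                        list_del(&e->list);
                        spin_unlock(lock);
                        /* free outside the lock: shorter critical section */
                        kmem_cache_free(slab, e);
                        return;
                }
        }
        spin_unlock(lock);
}
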
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2261ccdd0b5f..45abd60e2bff 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err)
45 45
46static void f2fs_write_end_io(struct bio *bio, int err) 46static void f2fs_write_end_io(struct bio *bio, int err)
47{ 47{
48 struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); 48 struct f2fs_sb_info *sbi = bio->bi_private;
49 struct bio_vec *bvec; 49 struct bio_vec *bvec;
50 int i; 50 int i;
51 51
@@ -55,15 +55,16 @@ static void f2fs_write_end_io(struct bio *bio, int err)
55 if (unlikely(err)) { 55 if (unlikely(err)) {
56 SetPageError(page); 56 SetPageError(page);
57 set_bit(AS_EIO, &page->mapping->flags); 57 set_bit(AS_EIO, &page->mapping->flags);
58 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); 58 f2fs_stop_checkpoint(sbi);
59 sbi->sb->s_flags |= MS_RDONLY;
60 } 59 }
61 end_page_writeback(page); 60 end_page_writeback(page);
62 dec_page_count(sbi, F2FS_WRITEBACK); 61 dec_page_count(sbi, F2FS_WRITEBACK);
63 } 62 }
64 63
65 if (bio->bi_private) 64 if (sbi->wait_io) {
66 complete(bio->bi_private); 65 complete(sbi->wait_io);
66 sbi->wait_io = NULL;
67 }
67 68
68 if (!get_pages(sbi, F2FS_WRITEBACK) && 69 if (!get_pages(sbi, F2FS_WRITEBACK) &&
69 !list_empty(&sbi->cp_wait.task_list)) 70 !list_empty(&sbi->cp_wait.task_list))
@@ -86,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
86 bio->bi_bdev = sbi->sb->s_bdev; 87 bio->bi_bdev = sbi->sb->s_bdev;
87 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); 88 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
88 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; 89 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
90 bio->bi_private = sbi;
89 91
90 return bio; 92 return bio;
91} 93}
@@ -113,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
113 */ 115 */
114 if (fio->type == META_FLUSH) { 116 if (fio->type == META_FLUSH) {
115 DECLARE_COMPLETION_ONSTACK(wait); 117 DECLARE_COMPLETION_ONSTACK(wait);
116 io->bio->bi_private = &wait; 118 io->sbi->wait_io = &wait;
117 submit_bio(rw, io->bio); 119 submit_bio(rw, io->bio);
118 wait_for_completion(&wait); 120 wait_for_completion(&wait);
119 } else { 121 } else {
@@ -132,7 +134,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
132 134
133 io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; 135 io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
134 136
135 mutex_lock(&io->io_mutex); 137 down_write(&io->io_rwsem);
136 138
137 /* change META to META_FLUSH in the checkpoint procedure */ 139 /* change META to META_FLUSH in the checkpoint procedure */
138 if (type >= META_FLUSH) { 140 if (type >= META_FLUSH) {
@@ -140,7 +142,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
140 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; 142 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
141 } 143 }
142 __submit_merged_bio(io); 144 __submit_merged_bio(io);
143 mutex_unlock(&io->io_mutex); 145 up_write(&io->io_rwsem);
144} 146}
145 147
146/* 148/*
@@ -178,7 +180,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
178 180
179 verify_block_addr(sbi, blk_addr); 181 verify_block_addr(sbi, blk_addr);
180 182
181 mutex_lock(&io->io_mutex); 183 down_write(&io->io_rwsem);
182 184
183 if (!is_read) 185 if (!is_read)
184 inc_page_count(sbi, F2FS_WRITEBACK); 186 inc_page_count(sbi, F2FS_WRITEBACK);
@@ -202,7 +204,7 @@ alloc_new:
202 204
203 io->last_block_in_bio = blk_addr; 205 io->last_block_in_bio = blk_addr;
204 206
205 mutex_unlock(&io->io_mutex); 207 up_write(&io->io_rwsem);
206 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); 208 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
207} 209}
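
Converting io_mutex to io_rwsem keeps bio submission and merging exclusive (both paths take the semaphore for writing) while opening the door to read-side peeks at the in-flight bio without full serialization. A sketch of the writer side, assuming the f2fs_bio_info layout used above:

static void example_flush_merged(struct f2fs_bio_info *io)
{
        down_write(&io->io_rwsem);      /* exclusive: mutates io->bio */
        __submit_merged_bio(io);
        up_write(&io->io_rwsem);
}
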
208 210
@@ -797,48 +799,36 @@ static int f2fs_write_data_page(struct page *page,
797 */ 799 */
798 offset = i_size & (PAGE_CACHE_SIZE - 1); 800 offset = i_size & (PAGE_CACHE_SIZE - 1);
799 if ((page->index >= end_index + 1) || !offset) { 801 if ((page->index >= end_index + 1) || !offset) {
800 if (S_ISDIR(inode->i_mode)) { 802 inode_dec_dirty_dents(inode);
801 dec_page_count(sbi, F2FS_DIRTY_DENTS);
802 inode_dec_dirty_dents(inode);
803 }
804 goto out; 803 goto out;
805 } 804 }
806 805
807 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 806 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
808write: 807write:
809 if (unlikely(sbi->por_doing)) { 808 if (unlikely(sbi->por_doing))
810 err = AOP_WRITEPAGE_ACTIVATE;
811 goto redirty_out; 809 goto redirty_out;
812 }
813 810
814 /* Dentry blocks are controlled by checkpoint */ 811 /* Dentry blocks are controlled by checkpoint */
815 if (S_ISDIR(inode->i_mode)) { 812 if (S_ISDIR(inode->i_mode)) {
816 dec_page_count(sbi, F2FS_DIRTY_DENTS);
817 inode_dec_dirty_dents(inode); 813 inode_dec_dirty_dents(inode);
818 err = do_write_data_page(page, &fio); 814 err = do_write_data_page(page, &fio);
819 } else { 815 goto done;
820 f2fs_lock_op(sbi); 816 }
821
822 if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
823 err = f2fs_write_inline_data(inode, page, offset);
824 f2fs_unlock_op(sbi);
825 goto out;
826 } else {
827 err = do_write_data_page(page, &fio);
828 }
829 817
830 f2fs_unlock_op(sbi); 818 if (!wbc->for_reclaim)
831 need_balance_fs = true; 819 need_balance_fs = true;
832 } 820 else if (has_not_enough_free_secs(sbi, 0))
833 if (err == -ENOENT)
834 goto out;
835 else if (err)
836 goto redirty_out; 821 goto redirty_out;
837 822
838 if (wbc->for_reclaim) { 823 f2fs_lock_op(sbi);
839 f2fs_submit_merged_bio(sbi, DATA, WRITE); 824 if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode))
840 need_balance_fs = false; 825 err = f2fs_write_inline_data(inode, page, offset);
841 } 826 else
827 err = do_write_data_page(page, &fio);
828 f2fs_unlock_op(sbi);
829done:
830 if (err && err != -ENOENT)
831 goto redirty_out;
842 832
843 clear_cold_data(page); 833 clear_cold_data(page);
844out: 834out:
@@ -849,12 +839,11 @@ out:
849 839
850redirty_out: 840redirty_out:
851 wbc->pages_skipped++; 841 wbc->pages_skipped++;
842 account_page_redirty(page);
852 set_page_dirty(page); 843 set_page_dirty(page);
853 return err; 844 return AOP_WRITEPAGE_ACTIVATE;
854} 845}
855 846
856#define MAX_DESIRED_PAGES_WP 4096
857
858static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, 847static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
859 void *data) 848 void *data)
860{ 849{
@@ -871,17 +860,17 @@ static int f2fs_write_data_pages(struct address_space *mapping,
871 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 860 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
872 bool locked = false; 861 bool locked = false;
873 int ret; 862 int ret;
874 long excess_nrtw = 0, desired_nrtw; 863 long diff;
875 864
876 /* deal with chardevs and other special file */ 865 /* deal with chardevs and other special file */
877 if (!mapping->a_ops->writepage) 866 if (!mapping->a_ops->writepage)
878 return 0; 867 return 0;
879 868
880 if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { 869 if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
881 desired_nrtw = MAX_DESIRED_PAGES_WP; 870 get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA))
882 excess_nrtw = desired_nrtw - wbc->nr_to_write; 871 goto skip_write;
883 wbc->nr_to_write = desired_nrtw; 872
884 } 873 diff = nr_pages_to_write(sbi, DATA, wbc);
885 874
886 if (!S_ISDIR(inode->i_mode)) { 875 if (!S_ISDIR(inode->i_mode)) {
887 mutex_lock(&sbi->writepages); 876 mutex_lock(&sbi->writepages);
@@ -895,8 +884,12 @@ static int f2fs_write_data_pages(struct address_space *mapping,
895 884
896 remove_dirty_dir_inode(inode); 885 remove_dirty_dir_inode(inode);
897 886
898 wbc->nr_to_write -= excess_nrtw; 887 wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
899 return ret; 888 return ret;
889
890skip_write:
891 wbc->pages_skipped += get_dirty_dents(inode);
892 return 0;
900} 893}
901 894
902static int f2fs_write_begin(struct file *file, struct address_space *mapping, 895static int f2fs_write_begin(struct file *file, struct address_space *mapping,
@@ -949,13 +942,19 @@ inline_data:
949 if (dn.data_blkaddr == NEW_ADDR) { 942 if (dn.data_blkaddr == NEW_ADDR) {
950 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 943 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
951 } else { 944 } else {
952 if (f2fs_has_inline_data(inode)) 945 if (f2fs_has_inline_data(inode)) {
953 err = f2fs_read_inline_data(inode, page); 946 err = f2fs_read_inline_data(inode, page);
954 else 947 if (err) {
948 page_cache_release(page);
949 return err;
950 }
951 } else {
955 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 952 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
956 READ_SYNC); 953 READ_SYNC);
957 if (err) 954 if (err)
958 return err; 955 return err;
956 }
957
959 lock_page(page); 958 lock_page(page);
960 if (unlikely(!PageUptodate(page))) { 959 if (unlikely(!PageUptodate(page))) {
961 f2fs_put_page(page, 1); 960 f2fs_put_page(page, 1);
@@ -1031,11 +1030,8 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
1031 unsigned int length) 1030 unsigned int length)
1032{ 1031{
1033 struct inode *inode = page->mapping->host; 1032 struct inode *inode = page->mapping->host;
1034 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1033 if (PageDirty(page))
1035 if (S_ISDIR(inode->i_mode) && PageDirty(page)) {
1036 dec_page_count(sbi, F2FS_DIRTY_DENTS);
1037 inode_dec_dirty_dents(inode); 1034 inode_dec_dirty_dents(inode);
1038 }
1039 ClearPagePrivate(page); 1035 ClearPagePrivate(page);
1040} 1036}
1041 1037
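
f2fs_write_data_pages() now shares the throttling idiom introduced above for meta pages: nr_pages_to_write() bumps wbc->nr_to_write up to a batch size and returns the excess, which is charged back once writeback finishes. A condensed sketch, assuming the segment.h helpers used in this patch:

        long diff = nr_pages_to_write(sbi, DATA, wbc);  /* raise to batch size */
        ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
        /* hand back the borrowed quota so global accounting stays correct */
        wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
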
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 3de9d20d0c14..b52c12cf5873 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -86,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_stat_info *si = F2FS_STAT(sbi);
 	unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
-	struct sit_info *sit_i = SIT_I(sbi);
 	unsigned int segno, vblocks;
 	int ndirty = 0;
 
@@ -94,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 	total_vblocks = 0;
 	blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
 	hblks_per_sec = blks_per_sec / 2;
-	mutex_lock(&sit_i->sentry_lock);
 	for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
 		vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
 		dist = abs(vblocks - hblks_per_sec);
@@ -105,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 			ndirty++;
 		}
 	}
-	mutex_unlock(&sit_i->sentry_lock);
 	dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
 	si->bimodal = bimodal / dist;
 	if (si->dirty_count)
@@ -236,6 +233,7 @@ static int stat_show(struct seq_file *s, void *v)
 			   si->dirty_count);
 	seq_printf(s, "  - Prefree: %d\n  - Free: %d (%d)\n\n",
 		   si->prefree_count, si->free_segs, si->free_secs);
+	seq_printf(s, "CP calls: %d\n", si->cp_count);
 	seq_printf(s, "GC calls: %d (BG: %d)\n",
 		   si->call_count, si->bg_gc);
 	seq_printf(s, "  - data segments : %d\n", si->data_segs);
@@ -252,10 +250,10 @@ static int stat_show(struct seq_file *s, void *v)
 		   si->ndirty_dent, si->ndirty_dirs);
 	seq_printf(s, "  - meta: %4d in %4d\n",
 		   si->ndirty_meta, si->meta_pages);
-	seq_printf(s, "  - NATs: %5d > %lu\n",
-		   si->nats, NM_WOUT_THRESHOLD);
-	seq_printf(s, "  - SITs: %5d\n  - free_nids: %5d\n",
-		   si->sits, si->fnids);
+	seq_printf(s, "  - NATs: %9d\n  - SITs: %9d\n",
+		   si->nats, si->sits);
+	seq_printf(s, "  - free_nids: %9d\n",
+		   si->fnids);
 	seq_puts(s, "\nDistribution of User Blocks:");
 	seq_puts(s, " [ valid | invalid | free ]\n");
 	seq_puts(s, "  [");
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 2b7c255bcbdf..972fd0ef230f 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode)
 			>> PAGE_CACHE_SHIFT;
 }
 
-static unsigned int dir_buckets(unsigned int level)
+static unsigned int dir_buckets(unsigned int level, int dir_level)
 {
 	if (level < MAX_DIR_HASH_DEPTH / 2)
-		return 1 << level;
+		return 1 << (level + dir_level);
 	else
-		return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
+		return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1);
 }
 
 static unsigned int bucket_blocks(unsigned int level)
@@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
 	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 }
 
-static unsigned long dir_block_index(unsigned int level, unsigned int idx)
+static unsigned long dir_block_index(unsigned int level,
+		int dir_level, unsigned int idx)
 {
 	unsigned long i;
 	unsigned long bidx = 0;
 
 	for (i = 0; i < level; i++)
-		bidx += dir_buckets(i) * bucket_blocks(i);
+		bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
 	bidx += idx * bucket_blocks(level);
 	return bidx;
 }
@@ -93,16 +94,21 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 			f2fs_hash_t namehash, struct page **res_page)
 {
 	struct f2fs_dir_entry *de;
-	unsigned long bit_pos, end_pos, next_pos;
+	unsigned long bit_pos = 0;
 	struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
-	int slots;
+	const void *dentry_bits = &dentry_blk->dentry_bitmap;
+	int max_len = 0;
 
-	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
-					NR_DENTRY_IN_BLOCK, 0);
 	while (bit_pos < NR_DENTRY_IN_BLOCK) {
+		if (!test_bit_le(bit_pos, dentry_bits)) {
+			if (bit_pos == 0)
+				max_len = 1;
+			else if (!test_bit_le(bit_pos - 1, dentry_bits))
+				max_len++;
+			bit_pos++;
+			continue;
+		}
 		de = &dentry_blk->dentry[bit_pos];
-		slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
-
 		if (early_match_name(name, namelen, namehash, de)) {
 			if (!memcmp(dentry_blk->filename[bit_pos],
 							name, namelen)) {
@@ -110,20 +116,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 				goto found;
 			}
 		}
-		next_pos = bit_pos + slots;
-		bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
-				NR_DENTRY_IN_BLOCK, next_pos);
-		if (bit_pos >= NR_DENTRY_IN_BLOCK)
-			end_pos = NR_DENTRY_IN_BLOCK;
-		else
-			end_pos = bit_pos;
-		if (*max_slots < end_pos - next_pos)
-			*max_slots = end_pos - next_pos;
+		if (max_len > *max_slots) {
+			*max_slots = max_len;
+			max_len = 0;
+		}
+		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
 	}
 
 	de = NULL;
 	kunmap(dentry_page);
 found:
+	if (max_len > *max_slots)
+		*max_slots = max_len;
 	return de;
 }
 
@@ -141,10 +145,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 
 	f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
 
-	nbucket = dir_buckets(level);
+	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
 	nblock = bucket_blocks(level);
 
-	bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket);
+	bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+					le32_to_cpu(namehash) % nbucket);
 	end_block = bidx + nblock;
 
 	for (; bidx < end_block; bidx++) {
@@ -248,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 		struct page *page, struct inode *inode)
 {
 	lock_page(page);
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 	de->ino = cpu_to_le32(inode->i_ino);
 	set_de_type(de, inode);
 	kunmap(page);
@@ -347,14 +352,11 @@ static struct page *init_inode_metadata(struct inode *inode,
 		err = f2fs_init_security(inode, dir, name, page);
 		if (err)
 			goto put_error;
-
-		wait_on_page_writeback(page);
 	} else {
 		page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
 		if (IS_ERR(page))
 			return page;
 
-		wait_on_page_writeback(page);
 		set_cold_node(inode, page);
 	}
 
@@ -372,6 +374,10 @@ static struct page *init_inode_metadata(struct inode *inode,
 
 put_error:
 	f2fs_put_page(page, 1);
+	/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
+	truncate_inode_pages(&inode->i_data, 0);
+	truncate_blocks(inode, 0);
+	remove_dirty_dir_inode(inode);
 error:
 	remove_inode_page(inode);
 	return ERR_PTR(err);
@@ -395,9 +401,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
 		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	}
 
-	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
-		update_inode_page(dir);
-
 	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
 		clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 }
@@ -464,10 +467,11 @@ start:
 	if (level == current_depth)
 		++current_depth;
 
-	nbucket = dir_buckets(level);
+	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
 	nblock = bucket_blocks(level);
 
-	bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
+	bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+				(le32_to_cpu(dentry_hash) % nbucket));
 
 	for (block = bidx; block <= (bidx + nblock - 1); block++) {
 		dentry_page = get_new_data_page(dir, NULL, block, true);
@@ -487,8 +491,9 @@ start:
 	++level;
 	goto start;
add_dentry:
-	wait_on_page_writeback(dentry_page);
+	f2fs_wait_on_page_writeback(dentry_page, DATA);
 
+	down_write(&F2FS_I(inode)->i_sem);
 	page = init_inode_metadata(inode, dir, name);
 	if (IS_ERR(page)) {
 		err = PTR_ERR(page);
@@ -511,7 +516,12 @@ add_dentry:
 
 	update_parent_metadata(dir, inode, current_depth);
fail:
-	clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	up_write(&F2FS_I(inode)->i_sem);
+
+	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
+		update_inode_page(dir);
+		clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	}
 	kunmap(dentry_page);
 	f2fs_put_page(dentry_page, 1);
 	return err;
@@ -528,13 +538,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	unsigned int bit_pos;
 	struct address_space *mapping = page->mapping;
 	struct inode *dir = mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
 	void *kaddr = page_address(page);
 	int i;
 
 	lock_page(page);
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 
 	dentry_blk = (struct f2fs_dentry_block *)kaddr;
 	bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
@@ -551,6 +560,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 
 	if (inode) {
+		struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+
+		down_write(&F2FS_I(inode)->i_sem);
+
 		if (S_ISDIR(inode->i_mode)) {
 			drop_nlink(dir);
 			update_inode_page(dir);
@@ -561,6 +574,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 			drop_nlink(inode);
 			i_size_write(inode, 0);
 		}
+		up_write(&F2FS_I(inode)->i_sem);
 		update_inode_page(inode);
 
 		if (inode->i_nlink == 0)
@@ -573,7 +587,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 		truncate_hole(dir, page->index, page->index + 1);
 		clear_page_dirty_for_io(page);
 		ClearPageUptodate(page);
-		dec_page_count(sbi, F2FS_DIRTY_DENTS);
 		inode_dec_dirty_dents(dir);
 	}
 	f2fs_put_page(page, 1);
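The dir.c changes above thread a per-inode i_dir_level into the multi-level hash geometry, so level 0 of a "large" directory can start with 1 << dir_level buckets instead of one. The following self-contained sketch reproduces the bucket arithmetic from dir_buckets() and dir_block_index(); the MAX_DIR_HASH_DEPTH value of 63 and the 2-or-4 blocks-per-bucket shape of bucket_blocks() are assumptions taken from the f2fs headers of this era, not from this diff:

	#include <stdio.h>

	#define MAX_DIR_HASH_DEPTH 63	/* assumed, from include/linux/f2fs_fs.h */

	static unsigned int dir_buckets(unsigned int level, int dir_level)
	{
		if (level < MAX_DIR_HASH_DEPTH / 2)
			return 1 << (level + dir_level);
		else
			return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1);
	}

	static unsigned int bucket_blocks(unsigned int level)
	{
		/* assumed shape: 2 blocks per bucket below the midpoint, 4 above */
		return level < MAX_DIR_HASH_DEPTH / 2 ? 2 : 4;
	}

	static unsigned long dir_block_index(unsigned int level,
				int dir_level, unsigned int idx)
	{
		unsigned long i, bidx = 0;

		for (i = 0; i < level; i++)
			bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
		return bidx + idx * bucket_blocks(level);
	}

	int main(void)
	{
		unsigned int hash = 0xdeadbeef, level = 2, dir_level = 2;
		unsigned int nbucket = dir_buckets(level, dir_level);

		printf("buckets=%u first block=%lu\n", nbucket,
		       dir_block_index(level, dir_level, hash % nbucket));
		return 0;
	}

With dir_level = 2 the sketch reports 16 buckets already at level 2, which is how a large directory spreads dentry blocks wider before it has to grow deeper.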
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index fc3c558cb4f3..2ecac8312359 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -40,6 +40,7 @@
 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY	0x00000040
 #define F2FS_MOUNT_INLINE_XATTR		0x00000080
 #define F2FS_MOUNT_INLINE_DATA		0x00000100
+#define F2FS_MOUNT_FLUSH_MERGE		0x00000200
 
 #define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -88,6 +89,16 @@ enum {
 	SIT_BITMAP
 };
 
+/*
+ * For CP/NAT/SIT/SSA readahead
+ */
+enum {
+	META_CP,
+	META_NAT,
+	META_SIT,
+	META_SSA
+};
+
 /* for the list of orphan inodes */
 struct orphan_inode_entry {
 	struct list_head list;	/* list head */
@@ -187,16 +198,20 @@ struct extent_info {
 #define FADVISE_COLD_BIT	0x01
 #define FADVISE_LOST_PINO_BIT	0x02
 
+#define DEF_DIR_LEVEL		0
+
 struct f2fs_inode_info {
 	struct inode vfs_inode;		/* serve a vfs inode */
 	unsigned long i_flags;		/* keep an inode flags for ioctl */
 	unsigned char i_advise;		/* use to give file attribute hints */
+	unsigned char i_dir_level;	/* use for dentry level for large dir */
 	unsigned int i_current_depth;	/* use only in directory structure */
 	unsigned int i_pino;		/* parent inode number */
 	umode_t i_acl_mode;		/* keep file acl mode temporarily */
 
 	/* Use below internally in f2fs*/
 	unsigned long flags;		/* use to pass per-file flags */
+	struct rw_semaphore i_sem;	/* protect fi info */
 	atomic_t dirty_dents;		/* # of dirty dentry pages */
 	f2fs_hash_t chash;		/* hash value of given file name */
 	unsigned int clevel;		/* maximum level of given file name */
@@ -229,6 +244,7 @@ struct f2fs_nm_info {
 	block_t nat_blkaddr;		/* base disk address of NAT */
 	nid_t max_nid;			/* maximum possible node ids */
 	nid_t next_scan_nid;		/* the next nid to be scanned */
+	unsigned int ram_thresh;	/* control the memory footprint */
 
 	/* NAT cache management */
 	struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -238,6 +254,7 @@ struct f2fs_nm_info {
 	struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
 
 	/* free node ids management */
+	struct radix_tree_root free_nid_root;/* root of the free_nid cache */
 	struct list_head free_nid_list;	/* a list for free nids */
 	spinlock_t free_nid_list_lock;	/* protect free nid list */
 	unsigned int fcnt;		/* the number of free node id */
@@ -300,6 +317,12 @@ enum {
 	NO_CHECK_TYPE
 };
 
+struct flush_cmd {
+	struct flush_cmd *next;
+	struct completion wait;
+	int ret;
+};
+
 struct f2fs_sm_info {
 	struct sit_info *sit_info;		/* whole segment information */
 	struct free_segmap_info *free_info;	/* free segment information */
@@ -328,6 +351,14 @@ struct f2fs_sm_info {
 
 	unsigned int ipu_policy;	/* in-place-update policy */
 	unsigned int min_ipu_util;	/* in-place-update threshold */
+
+	/* for flush command control */
+	struct task_struct *f2fs_issue_flush;	/* flush thread */
+	wait_queue_head_t flush_wait_queue;	/* waiting queue for wake-up */
+	struct flush_cmd *issue_list;		/* list for command issue */
+	struct flush_cmd *dispatch_list;	/* list for command dispatch */
+	spinlock_t issue_lock;			/* for issue list lock */
+	struct flush_cmd *issue_tail;		/* list tail of issue list */
 };
 
 /*
@@ -378,7 +409,7 @@ struct f2fs_bio_info {
 	struct bio *bio;		/* bios to merge */
 	sector_t last_block_in_bio;	/* last block number */
 	struct f2fs_io_info fio;	/* store buffered io info. */
-	struct mutex io_mutex;		/* mutex for bio */
+	struct rw_semaphore io_rwsem;	/* blocking op for bio */
 };
 
 struct f2fs_sb_info {
@@ -398,6 +429,7 @@ struct f2fs_sb_info {
 	/* for bio operations */
 	struct f2fs_bio_info read_io;			/* for read bios */
 	struct f2fs_bio_info write_io[NR_PAGE_TYPE];	/* for write bios */
+	struct completion *wait_io;		/* for completion bios */
 
 	/* for checkpoint */
 	struct f2fs_checkpoint *ckpt;		/* raw checkpoint pointer */
@@ -407,7 +439,6 @@ struct f2fs_sb_info {
 	struct mutex node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
 	bool por_doing;				/* recovery is doing or not */
-	bool on_build_free_nids;		/* build_free_nids is doing */
 	wait_queue_head_t cp_wait;
 
 	/* for orphan inode management */
@@ -436,6 +467,7 @@ struct f2fs_sb_info {
 	unsigned int total_valid_node_count;	/* valid node block count */
 	unsigned int total_valid_inode_count;	/* valid inode count */
 	int active_logs;			/* # of active logs */
+	int dir_level;				/* directory level */
 
 	block_t user_block_count;		/* # of user blocks */
 	block_t total_valid_block_count;	/* # of valid blocks */
@@ -622,6 +654,11 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode)
 	return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
 }
 
+static inline bool f2fs_has_xattr_block(unsigned int ofs)
+{
+	return ofs == XATTR_NODE_OFFSET;
+}
+
 static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
 				 struct inode *inode, blkcnt_t count)
 {
@@ -661,6 +698,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 
 static inline void inode_inc_dirty_dents(struct inode *inode)
 {
+	inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
 	atomic_inc(&F2FS_I(inode)->dirty_dents);
 }
 
@@ -671,6 +709,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
 
 static inline void inode_dec_dirty_dents(struct inode *inode)
 {
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
 	atomic_dec(&F2FS_I(inode)->dirty_dents);
 }
 
@@ -679,6 +721,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
 	return atomic_read(&sbi->nr_pages[count_type]);
 }
 
+static inline int get_dirty_dents(struct inode *inode)
+{
+	return atomic_read(&F2FS_I(inode)->dirty_dents);
+}
+
 static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
 {
 	unsigned int pages_per_sec = sbi->segs_per_sec *
@@ -689,11 +736,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
 
 static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
 {
-	block_t ret;
-	spin_lock(&sbi->stat_lock);
-	ret = sbi->total_valid_block_count;
-	spin_unlock(&sbi->stat_lock);
-	return ret;
+	return sbi->total_valid_block_count;
 }
 
 static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
@@ -789,11 +832,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
 
 static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
 {
-	unsigned int ret;
-	spin_lock(&sbi->stat_lock);
-	ret = sbi->total_valid_node_count;
-	spin_unlock(&sbi->stat_lock);
-	return ret;
+	return sbi->total_valid_node_count;
 }
 
 static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
@@ -814,11 +853,7 @@ static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
 
 static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
 {
-	unsigned int ret;
-	spin_lock(&sbi->stat_lock);
-	ret = sbi->total_valid_inode_count;
-	spin_unlock(&sbi->stat_lock);
-	return ret;
+	return sbi->total_valid_inode_count;
 }
 
 static inline void f2fs_put_page(struct page *page, int unlock)
@@ -844,9 +879,9 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn)
 }
 
 static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
-					size_t size, void (*ctor)(void *))
+					size_t size)
 {
-	return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
+	return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL);
 }
 
 static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
@@ -983,24 +1018,28 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
 		ri->i_inline |= F2FS_INLINE_DATA;
 }
 
+static inline int f2fs_has_inline_xattr(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
+}
+
 static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
 {
-	if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+	if (f2fs_has_inline_xattr(&fi->vfs_inode))
 		return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
 	return DEF_ADDRS_PER_INODE;
 }
 
 static inline void *inline_xattr_addr(struct page *page)
 {
-	struct f2fs_inode *ri;
-	ri = (struct f2fs_inode *)page_address(page);
+	struct f2fs_inode *ri = F2FS_INODE(page);
 	return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
 					F2FS_INLINE_XATTR_ADDRS]);
 }
 
 static inline int inline_xattr_size(struct inode *inode)
 {
-	if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR))
+	if (f2fs_has_inline_xattr(inode))
 		return F2FS_INLINE_XATTR_ADDRS << 2;
 	else
 		return 0;
@@ -1013,8 +1052,7 @@ static inline int f2fs_has_inline_data(struct inode *inode)
 
 static inline void *inline_data_addr(struct page *page)
 {
-	struct f2fs_inode *ri;
-	ri = (struct f2fs_inode *)page_address(page);
+	struct f2fs_inode *ri = F2FS_INODE(page);
 	return (void *)&(ri->i_addr[1]);
 }
 
@@ -1023,6 +1061,12 @@ static inline int f2fs_readonly(struct super_block *sb)
 	return sb->s_flags & MS_RDONLY;
 }
 
+static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
+{
+	set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+	sbi->sb->s_flags |= MS_RDONLY;
+}
+
 #define get_inode_mode(i) \
 	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
 	 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -1048,7 +1092,7 @@ void f2fs_set_inode_flags(struct inode *);
 struct inode *f2fs_iget(struct super_block *, unsigned long);
 int try_to_free_nats(struct f2fs_sb_info *, int);
 void update_inode(struct inode *, struct page *);
-int update_inode_page(struct inode *);
+void update_inode_page(struct inode *);
 int f2fs_write_inode(struct inode *, struct writeback_control *);
 void f2fs_evict_inode(struct inode *);
 
@@ -1097,6 +1141,7 @@ struct dnode_of_data;
 struct node_info;
 
 int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
 void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
 int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1115,6 +1160,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t);
 void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
 void recover_node_page(struct f2fs_sb_info *, struct page *,
 		struct f2fs_summary *, struct node_info *, block_t);
+bool recover_xattr_data(struct inode *, struct page *, block_t);
 int recover_inode_page(struct f2fs_sb_info *, struct page *);
 int restore_node_summary(struct f2fs_sb_info *, unsigned int,
 			struct f2fs_summary_block *);
@@ -1129,7 +1175,9 @@ void destroy_node_manager_caches(void);
  */
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
+int f2fs_issue_flush(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
+void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *);
 int npages_for_summary_flush(struct f2fs_sb_info *);
 void allocate_new_segments(struct f2fs_sb_info *);
@@ -1162,6 +1210,7 @@ void destroy_segment_manager_caches(void);
  */
 struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
 long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
 int acquire_orphan_inode(struct f2fs_sb_info *);
 void release_orphan_inode(struct f2fs_sb_info *);
@@ -1231,7 +1280,7 @@ struct f2fs_stat_info {
 	int util_free, util_valid, util_invalid;
 	int rsvd_segs, overp_segs;
 	int dirty_count, node_pages, meta_pages;
-	int prefree_count, call_count;
+	int prefree_count, call_count, cp_count;
 	int tot_segs, node_segs, data_segs, free_segs, free_secs;
 	int tot_blks, data_blks, node_blks;
 	int curseg[NR_CURSEG_TYPE];
@@ -1248,6 +1297,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 	return (struct f2fs_stat_info *)sbi->stat_info;
 }
 
+#define stat_inc_cp_count(si)		((si)->cp_count++)
 #define stat_inc_call_count(si)		((si)->call_count++)
 #define stat_inc_bggc_count(sbi)	((sbi)->bg_gc++)
 #define stat_inc_dirty_dir(sbi)		((sbi)->n_dirty_dirs++)
@@ -1302,6 +1352,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *);
 void __init f2fs_create_root_stats(void);
 void f2fs_destroy_root_stats(void);
 #else
+#define stat_inc_cp_count(si)
 #define stat_inc_call_count(si)
 #define stat_inc_bggc_count(si)
 #define stat_inc_dirty_dir(sbi)
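The new FLUSH_MERGE fields in f2fs_sm_info above describe a small producer/consumer engine: fsync callers append a flush_cmd to issue_list under issue_lock and sleep on its completion, while a dedicated thread snapshots the list into dispatch_list, issues one cache flush for the whole batch, and completes every waiter with the shared result. A userspace analogue of the pattern using pthreads; the kernel version (in segment.c, not shown in this section) uses a spinlock, a waitqueue, and struct completion instead:

	#include <pthread.h>
	#include <stdio.h>

	struct flush_cmd {			/* mirrors the struct above */
		struct flush_cmd *next;
		pthread_cond_t wait;		/* stands in for struct completion */
		int done, ret;
	};

	static pthread_mutex_t issue_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t flush_wait_queue = PTHREAD_COND_INITIALIZER;
	static struct flush_cmd *issue_list, *issue_tail;

	static void *issue_flush_thread(void *arg)
	{
		for (;;) {
			struct flush_cmd *dispatch_list;
			int ret;

			pthread_mutex_lock(&issue_lock);
			while (!issue_list)
				pthread_cond_wait(&flush_wait_queue, &issue_lock);
			dispatch_list = issue_list;	/* take the whole batch */
			issue_list = issue_tail = NULL;
			pthread_mutex_unlock(&issue_lock);

			ret = 0;	/* one device flush would serve everyone here */

			while (dispatch_list) {		/* complete each waiter */
				struct flush_cmd *next = dispatch_list->next;

				pthread_mutex_lock(&issue_lock);
				dispatch_list->ret = ret;
				dispatch_list->done = 1;
				pthread_cond_signal(&dispatch_list->wait);
				pthread_mutex_unlock(&issue_lock);
				dispatch_list = next;
			}
		}
		return arg;
	}

	static int issue_flush(void)	/* what f2fs_issue_flush() boils down to */
	{
		struct flush_cmd cmd = { .wait = PTHREAD_COND_INITIALIZER };

		pthread_mutex_lock(&issue_lock);
		if (issue_tail)
			issue_tail->next = &cmd;
		else
			issue_list = &cmd;
		issue_tail = &cmd;
		pthread_cond_signal(&flush_wait_queue);
		while (!cmd.done)
			pthread_cond_wait(&cmd.wait, &issue_lock);
		pthread_mutex_unlock(&issue_lock);
		return cmd.ret;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, issue_flush_thread, NULL);
		printf("flush ret=%d\n", issue_flush());
		return 0;
	}

The payoff of the merge is that N concurrent fsync() callers cost one device flush instead of N.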
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0dfcef53a6ed..60e7d5448a1d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -76,7 +76,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
 	/* fill the page */
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(err);
@@ -84,6 +84,7 @@ out:
 
 static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= f2fs_vm_page_mkwrite,
 	.remap_pages	= generic_file_remap_pages,
 };
@@ -111,11 +112,12 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	int ret = 0;
 	bool need_cp = false;
 	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_NONE,
+		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
 		.for_reclaim = 0,
 	};
@@ -133,7 +135,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	/* guarantee free sections for fsync */
 	f2fs_balance_fs(sbi);
 
-	mutex_lock(&inode->i_mutex);
+	down_read(&fi->i_sem);
 
 	/*
 	 * Both of fdatasync() and fsync() are able to be recovered from
@@ -150,25 +152,33 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
 		need_cp = true;
 
+	up_read(&fi->i_sem);
+
 	if (need_cp) {
 		nid_t pino;
 
-		F2FS_I(inode)->xattr_ver = 0;
-
 		/* all the dirty node pages should be flushed for POR */
 		ret = f2fs_sync_fs(inode->i_sb, 1);
+
+		down_write(&fi->i_sem);
+		F2FS_I(inode)->xattr_ver = 0;
 		if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
 					get_parent_ino(inode, &pino)) {
 			F2FS_I(inode)->i_pino = pino;
 			file_got_pino(inode);
+			up_write(&fi->i_sem);
 			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
 				goto out;
+		} else {
+			up_write(&fi->i_sem);
 		}
 	} else {
 		/* if there is no written node page, write its inode page */
 		while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+			if (fsync_mark_done(sbi, inode->i_ino))
+				goto out;
 			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
@@ -177,10 +187,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
 		if (ret)
 			goto out;
-		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+		ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
 	}
out:
-	mutex_unlock(&inode->i_mutex);
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
 	return ret;
 }
@@ -245,7 +254,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
 		f2fs_put_page(page, 1);
 		return;
 	}
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
 	set_page_dirty(page);
 	f2fs_put_page(page, 1);
@@ -422,7 +431,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
 	f2fs_unlock_op(sbi);
 
 	if (!IS_ERR(page)) {
-		wait_on_page_writeback(page);
+		f2fs_wait_on_page_writeback(page, DATA);
 		zero_user(page, start, len);
 		set_page_dirty(page);
 		f2fs_put_page(page, 1);
@@ -560,6 +569,8 @@ static long f2fs_fallocate(struct file *file, int mode,
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
+	mutex_lock(&inode->i_mutex);
+
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		ret = punch_hole(inode, offset, len);
 	else
@@ -569,6 +580,9 @@ static long f2fs_fallocate(struct file *file, int mode,
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
 	}
+
+	mutex_unlock(&inode->i_mutex);
+
 	trace_f2fs_fallocate(inode, mode, offset, len, ret);
 	return ret;
 }
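The fsync rework above also changes the locking discipline: recovery-relevant inode state is sampled under a shared i_sem, the semaphore is dropped across the heavyweight f2fs_sync_fs() call, and it is re-taken exclusively only to publish xattr_ver and i_pino. A distilled userspace analogue of that sample/drop/republish pattern (not the kernel code itself):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t i_sem = PTHREAD_RWLOCK_INITIALIZER;
	static int xattr_ver = 1, pino;

	int main(void)
	{
		int need_cp;

		pthread_rwlock_rdlock(&i_sem);	/* sample cheaply, shared */
		need_cp = (xattr_ver != 0);
		pthread_rwlock_unlock(&i_sem);

		if (need_cp) {
			/* the expensive sync runs with no lock held */
			pthread_rwlock_wrlock(&i_sem);	/* publish updates */
			xattr_ver = 0;
			pino = 42;	/* hypothetical parent ino */
			pthread_rwlock_unlock(&i_sem);
		}
		printf("need_cp=%d pino=%d\n", need_cp, pino);
		return 0;
	}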
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ea0371e854b4..b90dbe55403a 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -531,15 +531,10 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
 		set_page_dirty(page);
 		set_cold_data(page);
 	} else {
-		struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
 		f2fs_wait_on_page_writeback(page, DATA);
 
-		if (clear_page_dirty_for_io(page) &&
-			S_ISDIR(inode->i_mode)) {
-			dec_page_count(sbi, F2FS_DIRTY_DENTS);
+		if (clear_page_dirty_for_io(page))
 			inode_dec_dirty_dents(inode);
-		}
 		set_cold_data(page);
 		do_write_data_page(page, &fio);
 		clear_cold_data(page);
@@ -701,6 +696,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
gc_more:
 	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
 		goto stop;
+	if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+		goto stop;
 
 	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
 		gc_type = FG_GC;
@@ -711,6 +708,11 @@ gc_more:
 		goto stop;
 	ret = 0;
 
+	/* readahead multi ssa blocks those have contiguous address */
+	if (sbi->segs_per_sec > 1)
+		ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
+								META_SSA);
+
 	for (i = 0; i < sbi->segs_per_sec; i++)
 		do_garbage_collect(sbi, segno + i, &ilist, gc_type);
 
@@ -740,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
 int __init create_gc_caches(void)
 {
 	winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
-			sizeof(struct inode_entry), NULL);
+			sizeof(struct inode_entry));
 	if (!winode_slab)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 31ee5b164ff9..383db1fabcf4 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -45,8 +45,10 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
 	}
 
 	ipage = get_node_page(sbi, inode->i_ino);
-	if (IS_ERR(ipage))
+	if (IS_ERR(ipage)) {
+		unlock_page(page);
 		return PTR_ERR(ipage);
+	}
 
 	zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
 
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 4d67ed736dca..ee829d360468 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -107,6 +107,7 @@ static int do_read_inode(struct inode *inode)
 	fi->flags = 0;
 	fi->i_advise = ri->i_advise;
 	fi->i_pino = le32_to_cpu(ri->i_pino);
+	fi->i_dir_level = ri->i_dir_level;
 
 	get_extent_info(&fi->ext, ri->i_ext);
 	get_inline_info(fi, ri);
@@ -204,6 +205,7 @@ void update_inode(struct inode *inode, struct page *node_page)
 	ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
 	ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
 	ri->i_generation = cpu_to_le32(inode->i_generation);
+	ri->i_dir_level = F2FS_I(inode)->i_dir_level;
 
 	__set_inode_rdev(inode, ri);
 	set_cold_node(inode, node_page);
@@ -212,24 +214,29 @@ void update_inode(struct inode *inode, struct page *node_page)
 	clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
 }
 
-int update_inode_page(struct inode *inode)
+void update_inode_page(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *node_page;
-
+retry:
 	node_page = get_node_page(sbi, inode->i_ino);
-	if (IS_ERR(node_page))
-		return PTR_ERR(node_page);
-
+	if (IS_ERR(node_page)) {
+		int err = PTR_ERR(node_page);
+		if (err == -ENOMEM) {
+			cond_resched();
+			goto retry;
+		} else if (err != -ENOENT) {
+			f2fs_stop_checkpoint(sbi);
+		}
+		return;
+	}
 	update_inode(inode, node_page);
 	f2fs_put_page(node_page, 1);
-	return 0;
 }
 
 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	int ret;
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
 			inode->i_ino == F2FS_META_INO(sbi))
@@ -243,13 +250,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	 * during the urgent cleaning time when runing out of free sections.
 	 */
 	f2fs_lock_op(sbi);
-	ret = update_inode_page(inode);
+	update_inode_page(inode);
 	f2fs_unlock_op(sbi);
 
 	if (wbc)
 		f2fs_balance_fs(sbi);
 
-	return ret;
+	return 0;
 }
 
 /*
@@ -260,13 +267,13 @@ void f2fs_evict_inode(struct inode *inode)
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	trace_f2fs_evict_inode(inode);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
 			inode->i_ino == F2FS_META_INO(sbi))
 		goto no_delete;
 
-	f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents));
+	f2fs_bug_on(get_dirty_dents(inode));
 	remove_dirty_dir_inode(inode);
 
 	if (inode->i_nlink || is_bad_inode(inode))
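update_inode_page() above now treats -ENOMEM from get_node_page() as transient, retrying after yielding the CPU, while any other failure except -ENOENT stops checkpointing and forces the filesystem read-only via f2fs_stop_checkpoint(). A small self-contained sketch of that retry discipline, with a stand-in allocator in place of get_node_page():

	#include <errno.h>
	#include <sched.h>
	#include <stdio.h>

	static int attempts;

	static int get_resource(void)	/* stand-in for get_node_page() */
	{
		return ++attempts < 3 ? -ENOMEM : 0;
	}

	int main(void)
	{
		int err;
	retry:
		err = get_resource();
		if (err == -ENOMEM) {
			sched_yield();	/* cond_resched() analogue */
			goto retry;
		}
		if (err && err != -ENOENT)
			fprintf(stderr, "hard error %d: stop checkpointing\n", err);
		printf("succeeded after %d attempts\n", attempts);
		return 0;
	}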
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 397d459e97bf..a9409d19dfd4 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -207,6 +207,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = f2fs_iget(dir->i_sb, ino);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
+
+		stat_inc_inline_inode(inode);
 	}
 
 	return d_splice_alias(inode, dentry);
@@ -424,12 +426,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 
 		f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+		down_write(&F2FS_I(old_inode)->i_sem);
 		F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+		up_write(&F2FS_I(old_inode)->i_sem);
 
 		new_inode->i_ctime = CURRENT_TIME;
+		down_write(&F2FS_I(new_inode)->i_sem);
 		if (old_dir_entry)
 			drop_nlink(new_inode);
 		drop_nlink(new_inode);
+		up_write(&F2FS_I(new_inode)->i_sem);
+
 		mark_inode_dirty(new_inode);
 
 		if (!new_inode->i_nlink)
@@ -459,7 +466,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_dir != new_dir) {
 		f2fs_set_link(old_inode, old_dir_entry,
 						old_dir_page, new_dir);
+		down_write(&F2FS_I(old_inode)->i_sem);
 		F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+		up_write(&F2FS_I(old_inode)->i_sem);
 		update_inode_page(old_inode);
 	} else {
 		kunmap(old_dir_page);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b0649b76eb4f..a161e955c4c8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -21,9 +21,27 @@
21#include "segment.h" 21#include "segment.h"
22#include <trace/events/f2fs.h> 22#include <trace/events/f2fs.h>
23 23
24#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
25
24static struct kmem_cache *nat_entry_slab; 26static struct kmem_cache *nat_entry_slab;
25static struct kmem_cache *free_nid_slab; 27static struct kmem_cache *free_nid_slab;
26 28
29static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type)
30{
31 struct sysinfo val;
32 unsigned long mem_size = 0;
33
34 si_meminfo(&val);
35 if (type == FREE_NIDS)
36 mem_size = nm_i->fcnt * sizeof(struct free_nid);
37 else if (type == NAT_ENTRIES)
38 mem_size += nm_i->nat_cnt * sizeof(struct nat_entry);
39 mem_size >>= 12;
40
41 /* give 50:50 memory for free nids and nat caches respectively */
42 return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11));
43}
44
27static void clear_node_page_dirty(struct page *page) 45static void clear_node_page_dirty(struct page *page)
28{ 46{
29 struct address_space *mapping = page->mapping; 47 struct address_space *mapping = page->mapping;
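In available_free_memory() above, mem_size >>= 12 converts the cache footprint from bytes to 4 KiB pages, and the bound (val.totalram * nm_i->ram_thresh) >> 11 equals totalram * ram_thresh / 2048 pages for each of the two caches, which is what the 50:50 comment refers to. Assuming the default ram_thresh of 10 that the series sets elsewhere (not visible in this hunk), a 4 GiB machine allows roughly 20 MiB per cache; a quick check:

	#include <stdio.h>

	int main(void)
	{
		unsigned long totalram = 1UL << 20;	/* 4 GiB in 4 KiB pages */
		unsigned long ram_thresh = 10;		/* assumed default */
		unsigned long limit_pages = (totalram * ram_thresh) >> 11;

		/* 5120 pages == 20 MiB for each of the two caches */
		printf("%lu pages (%lu MiB) per cache\n",
		       limit_pages, limit_pages >> 8);
		return 0;
	}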
@@ -82,42 +100,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
82 return dst_page; 100 return dst_page;
83} 101}
84 102
85/*
86 * Readahead NAT pages
87 */
88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
89{
90 struct address_space *mapping = META_MAPPING(sbi);
91 struct f2fs_nm_info *nm_i = NM_I(sbi);
92 struct page *page;
93 pgoff_t index;
94 int i;
95 struct f2fs_io_info fio = {
96 .type = META,
97 .rw = READ_SYNC | REQ_META | REQ_PRIO
98 };
99
100
101 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
102 if (unlikely(nid >= nm_i->max_nid))
103 nid = 0;
104 index = current_nat_addr(sbi, nid);
105
106 page = grab_cache_page(mapping, index);
107 if (!page)
108 continue;
109 if (PageUptodate(page)) {
110 mark_page_accessed(page);
111 f2fs_put_page(page, 1);
112 continue;
113 }
114 f2fs_submit_page_mbio(sbi, page, index, &fio);
115 mark_page_accessed(page);
116 f2fs_put_page(page, 0);
117 }
118 f2fs_submit_merged_bio(sbi, META, READ);
119}
120
121static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) 103static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
122{ 104{
123 return radix_tree_lookup(&nm_i->nat_root, n); 105 return radix_tree_lookup(&nm_i->nat_root, n);
@@ -151,6 +133,20 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
151 return is_cp; 133 return is_cp;
152} 134}
153 135
136bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
137{
138 struct f2fs_nm_info *nm_i = NM_I(sbi);
139 struct nat_entry *e;
140 bool fsync_done = false;
141
142 read_lock(&nm_i->nat_tree_lock);
143 e = __lookup_nat_cache(nm_i, nid);
144 if (e)
145 fsync_done = e->fsync_done;
146 read_unlock(&nm_i->nat_tree_lock);
147 return fsync_done;
148}
149
154static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) 150static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
155{ 151{
156 struct nat_entry *new; 152 struct nat_entry *new;
@@ -164,6 +160,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
164 } 160 }
165 memset(new, 0, sizeof(struct nat_entry)); 161 memset(new, 0, sizeof(struct nat_entry));
166 nat_set_nid(new, nid); 162 nat_set_nid(new, nid);
163 new->checkpointed = true;
167 list_add_tail(&new->list, &nm_i->nat_entries); 164 list_add_tail(&new->list, &nm_i->nat_entries);
168 nm_i->nat_cnt++; 165 nm_i->nat_cnt++;
169 return new; 166 return new;
@@ -185,13 +182,12 @@ retry:
185 nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); 182 nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
186 nat_set_ino(e, le32_to_cpu(ne->ino)); 183 nat_set_ino(e, le32_to_cpu(ne->ino));
187 nat_set_version(e, ne->version); 184 nat_set_version(e, ne->version);
188 e->checkpointed = true;
189 } 185 }
190 write_unlock(&nm_i->nat_tree_lock); 186 write_unlock(&nm_i->nat_tree_lock);
191} 187}
192 188
193static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, 189static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
194 block_t new_blkaddr) 190 block_t new_blkaddr, bool fsync_done)
195{ 191{
196 struct f2fs_nm_info *nm_i = NM_I(sbi); 192 struct f2fs_nm_info *nm_i = NM_I(sbi);
197 struct nat_entry *e; 193 struct nat_entry *e;
@@ -205,7 +201,6 @@ retry:
205 goto retry; 201 goto retry;
206 } 202 }
207 e->ni = *ni; 203 e->ni = *ni;
208 e->checkpointed = true;
209 f2fs_bug_on(ni->blk_addr == NEW_ADDR); 204 f2fs_bug_on(ni->blk_addr == NEW_ADDR);
210 } else if (new_blkaddr == NEW_ADDR) { 205 } else if (new_blkaddr == NEW_ADDR) {
211 /* 206 /*
@@ -217,9 +212,6 @@ retry:
217 f2fs_bug_on(ni->blk_addr != NULL_ADDR); 212 f2fs_bug_on(ni->blk_addr != NULL_ADDR);
218 } 213 }
219 214
220 if (new_blkaddr == NEW_ADDR)
221 e->checkpointed = false;
222
223 /* sanity check */ 215 /* sanity check */
224 f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); 216 f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
225 f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && 217 f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
@@ -239,6 +231,11 @@ retry:
239 /* change address */ 231 /* change address */
240 nat_set_blkaddr(e, new_blkaddr); 232 nat_set_blkaddr(e, new_blkaddr);
241 __set_nat_cache_dirty(nm_i, e); 233 __set_nat_cache_dirty(nm_i, e);
234
235 /* update fsync_mark if its inode nat entry is still alive */
236 e = __lookup_nat_cache(nm_i, ni->ino);
237 if (e)
238 e->fsync_done = fsync_done;
242 write_unlock(&nm_i->nat_tree_lock); 239 write_unlock(&nm_i->nat_tree_lock);
243} 240}
244 241
@@ -246,7 +243,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
246{ 243{
247 struct f2fs_nm_info *nm_i = NM_I(sbi); 244 struct f2fs_nm_info *nm_i = NM_I(sbi);
248 245
249 if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) 246 if (available_free_memory(nm_i, NAT_ENTRIES))
250 return 0; 247 return 0;
251 248
252 write_lock(&nm_i->nat_tree_lock); 249 write_lock(&nm_i->nat_tree_lock);
@@ -505,7 +502,7 @@ static void truncate_node(struct dnode_of_data *dn)
505 /* Deallocate node address */ 502 /* Deallocate node address */
506 invalidate_blocks(sbi, ni.blk_addr); 503 invalidate_blocks(sbi, ni.blk_addr);
507 dec_valid_node_count(sbi, dn->inode); 504 dec_valid_node_count(sbi, dn->inode);
508 set_node_addr(sbi, &ni, NULL_ADDR); 505 set_node_addr(sbi, &ni, NULL_ADDR, false);
509 506
510 if (dn->nid == dn->inode->i_ino) { 507 if (dn->nid == dn->inode->i_ino) {
511 remove_orphan_inode(sbi, dn->nid); 508 remove_orphan_inode(sbi, dn->nid);
@@ -763,7 +760,7 @@ skip_partial:
763 f2fs_put_page(page, 1); 760 f2fs_put_page(page, 1);
764 goto restart; 761 goto restart;
765 } 762 }
766 wait_on_page_writeback(page); 763 f2fs_wait_on_page_writeback(page, NODE);
767 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; 764 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
768 set_page_dirty(page); 765 set_page_dirty(page);
769 unlock_page(page); 766 unlock_page(page);
@@ -852,7 +849,8 @@ struct page *new_node_page(struct dnode_of_data *dn,
852 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) 849 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
853 return ERR_PTR(-EPERM); 850 return ERR_PTR(-EPERM);
854 851
855 page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); 852 page = grab_cache_page_write_begin(NODE_MAPPING(sbi),
853 dn->nid, AOP_FLAG_NOFS);
856 if (!page) 854 if (!page)
857 return ERR_PTR(-ENOMEM); 855 return ERR_PTR(-ENOMEM);
858 856
@@ -867,14 +865,14 @@ struct page *new_node_page(struct dnode_of_data *dn,
867 f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); 865 f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
868 new_ni = old_ni; 866 new_ni = old_ni;
869 new_ni.ino = dn->inode->i_ino; 867 new_ni.ino = dn->inode->i_ino;
870 set_node_addr(sbi, &new_ni, NEW_ADDR); 868 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
871 869
872 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); 870 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
873 set_cold_node(dn->inode, page); 871 set_cold_node(dn->inode, page);
874 SetPageUptodate(page); 872 SetPageUptodate(page);
875 set_page_dirty(page); 873 set_page_dirty(page);
876 874
877 if (ofs == XATTR_NODE_OFFSET) 875 if (f2fs_has_xattr_block(ofs))
878 F2FS_I(dn->inode)->i_xattr_nid = dn->nid; 876 F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
879 877
880 dn->node_page = page; 878 dn->node_page = page;
@@ -948,7 +946,8 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
948 struct page *page; 946 struct page *page;
949 int err; 947 int err;
950repeat: 948repeat:
951 page = grab_cache_page(NODE_MAPPING(sbi), nid); 949 page = grab_cache_page_write_begin(NODE_MAPPING(sbi),
950 nid, AOP_FLAG_NOFS);
952 if (!page) 951 if (!page)
953 return ERR_PTR(-ENOMEM); 952 return ERR_PTR(-ENOMEM);
954 953
@@ -959,7 +958,7 @@ repeat:
959 goto got_it; 958 goto got_it;
960 959
961 lock_page(page); 960 lock_page(page);
962 if (unlikely(!PageUptodate(page))) { 961 if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
963 f2fs_put_page(page, 1); 962 f2fs_put_page(page, 1);
964 return ERR_PTR(-EIO); 963 return ERR_PTR(-EIO);
965 } 964 }
@@ -968,7 +967,6 @@ repeat:
968 goto repeat; 967 goto repeat;
969 } 968 }
970got_it: 969got_it:
971 f2fs_bug_on(nid != nid_of_node(page));
972 mark_page_accessed(page); 970 mark_page_accessed(page);
973 return page; 971 return page;
974} 972}
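
The two hunks above fold the nid sanity check into the I/O error path: a page whose footer nid does not match the requested nid is now treated like a read failure (-EIO) instead of tripping f2fs_bug_on() after the lookup. A minimal userspace sketch of the same defensive pattern; all names here are illustrative, not f2fs APIs:

    #include <errno.h>
    #include <stdio.h>

    struct cached_node {
        unsigned int nid;   /* identity stamped into the object */
        int uptodate;
    };

    static int get_cached_node(struct cached_node *n, unsigned int want)
    {
        /* corruption or a stale read shows up as a mismatched id */
        if (!n->uptodate || n->nid != want)
            return -EIO;    /* report an error, don't crash the caller */
        return 0;
    }

    int main(void)
    {
        struct cached_node n = { .nid = 7, .uptodate = 1 };

        printf("lookup 7 -> %d\n", get_cached_node(&n, 7));  /* 0 */
        printf("lookup 9 -> %d\n", get_cached_node(&n, 9));  /* -5 (-EIO) */
        return 0;
    }
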
@@ -1168,7 +1166,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1168 continue; 1166 continue;
1169 1167
1170 if (ino && ino_of_node(page) == ino) { 1168 if (ino && ino_of_node(page) == ino) {
1171 wait_on_page_writeback(page); 1169 f2fs_wait_on_page_writeback(page, NODE);
1172 if (TestClearPageError(page)) 1170 if (TestClearPageError(page))
1173 ret = -EIO; 1171 ret = -EIO;
1174 } 1172 }
@@ -1201,7 +1199,7 @@ static int f2fs_write_node_page(struct page *page,
1201 if (unlikely(sbi->por_doing)) 1199 if (unlikely(sbi->por_doing))
1202 goto redirty_out; 1200 goto redirty_out;
1203 1201
1204 wait_on_page_writeback(page); 1202 f2fs_wait_on_page_writeback(page, NODE);
1205 1203
1206 /* get old block addr of this node page */ 1204 /* get old block addr of this node page */
1207 nid = nid_of_node(page); 1205 nid = nid_of_node(page);
@@ -1222,7 +1220,7 @@ static int f2fs_write_node_page(struct page *page,
1222 mutex_lock(&sbi->node_write); 1220 mutex_lock(&sbi->node_write);
1223 set_page_writeback(page); 1221 set_page_writeback(page);
1224 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); 1222 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
1225 set_node_addr(sbi, &ni, new_addr); 1223 set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
1226 dec_page_count(sbi, F2FS_DIRTY_NODES); 1224 dec_page_count(sbi, F2FS_DIRTY_NODES);
1227 mutex_unlock(&sbi->node_write); 1225 mutex_unlock(&sbi->node_write);
1228 unlock_page(page); 1226 unlock_page(page);
@@ -1231,35 +1229,32 @@ static int f2fs_write_node_page(struct page *page,
1231redirty_out: 1229redirty_out:
1232 dec_page_count(sbi, F2FS_DIRTY_NODES); 1230 dec_page_count(sbi, F2FS_DIRTY_NODES);
1233 wbc->pages_skipped++; 1231 wbc->pages_skipped++;
1232 account_page_redirty(page);
1234 set_page_dirty(page); 1233 set_page_dirty(page);
1235 return AOP_WRITEPAGE_ACTIVATE; 1234 return AOP_WRITEPAGE_ACTIVATE;
1236} 1235}
1237 1236
1238/*
1239 * It is very important to gather dirty pages and write at once, so that we can
1240 * submit a big bio without interfering other data writes.
1241 * Be default, 512 pages (2MB) * 3 node types, is more reasonable.
1242 */
1243#define COLLECT_DIRTY_NODES 1536
1244static int f2fs_write_node_pages(struct address_space *mapping, 1237static int f2fs_write_node_pages(struct address_space *mapping,
1245 struct writeback_control *wbc) 1238 struct writeback_control *wbc)
1246{ 1239{
1247 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 1240 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1248 long nr_to_write = wbc->nr_to_write; 1241 long diff;
1249 1242
1250 /* balancing f2fs's metadata in background */ 1243 /* balancing f2fs's metadata in background */
1251 f2fs_balance_fs_bg(sbi); 1244 f2fs_balance_fs_bg(sbi);
1252 1245
1253 /* collect a number of dirty node pages and write together */ 1246 /* collect a number of dirty node pages and write together */
1254 if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) 1247 if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
1255 return 0; 1248 goto skip_write;
1256 1249
1257 /* if mounting is failed, skip writing node pages */ 1250 diff = nr_pages_to_write(sbi, NODE, wbc);
1258 wbc->nr_to_write = 3 * max_hw_blocks(sbi);
1259 wbc->sync_mode = WB_SYNC_NONE; 1251 wbc->sync_mode = WB_SYNC_NONE;
1260 sync_node_pages(sbi, 0, wbc); 1252 sync_node_pages(sbi, 0, wbc);
1261 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - 1253 wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
1262 wbc->nr_to_write); 1254 return 0;
1255
1256skip_write:
1257 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
1263 return 0; 1258 return 0;
1264} 1259}
1265 1260
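
The rewritten f2fs_write_node_pages() drops the hard-coded COLLECT_DIRTY_NODES threshold in favor of the nr_pages_to_skip()/nr_pages_to_write() helpers added to segment.h later in this patch: skip writeback entirely while too few node pages are dirty, otherwise run it with a segment-aligned budget and repay the borrowed difference afterwards. A toy model of the budget arithmetic, with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        long nr_to_write = 1000;            /* caller's original budget */
        const long desired = 3 * 512;       /* segment-aligned node batch */

        long diff = desired - nr_to_write;  /* borrowed budget: 536 */
        nr_to_write = desired;              /* writeback sees the full batch */

        nr_to_write -= 1500;                /* pretend 1500 pages got written */

        nr_to_write -= diff;                /* repay what was borrowed */
        if (nr_to_write < 0)                /* max((long)0, ...) in the hunk */
            nr_to_write = 0;

        printf("budget handed back: %ld\n", nr_to_write);  /* 0 */
        return 0;
    }
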
@@ -1307,22 +1302,17 @@ const struct address_space_operations f2fs_node_aops = {
1307 .releasepage = f2fs_release_node_page, 1302 .releasepage = f2fs_release_node_page,
1308}; 1303};
1309 1304
1310static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) 1305static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
1306 nid_t n)
1311{ 1307{
1312 struct list_head *this; 1308 return radix_tree_lookup(&nm_i->free_nid_root, n);
1313 struct free_nid *i;
1314 list_for_each(this, head) {
1315 i = list_entry(this, struct free_nid, list);
1316 if (i->nid == n)
1317 return i;
1318 }
1319 return NULL;
1320} 1309}
1321 1310
1322static void __del_from_free_nid_list(struct free_nid *i) 1311static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
1312 struct free_nid *i)
1323{ 1313{
1324 list_del(&i->list); 1314 list_del(&i->list);
1325 kmem_cache_free(free_nid_slab, i); 1315 radix_tree_delete(&nm_i->free_nid_root, i->nid);
1326} 1316}
1327 1317
1328static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) 1318static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
@@ -1331,7 +1321,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1331 struct nat_entry *ne; 1321 struct nat_entry *ne;
1332 bool allocated = false; 1322 bool allocated = false;
1333 1323
1334 if (nm_i->fcnt > 2 * MAX_FREE_NIDS) 1324 if (!available_free_memory(nm_i, FREE_NIDS))
1335 return -1; 1325 return -1;
1336 1326
1337 /* 0 nid should not be used */ 1327 /* 0 nid should not be used */
@@ -1342,7 +1332,8 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1342 /* do not add allocated nids */ 1332 /* do not add allocated nids */
1343 read_lock(&nm_i->nat_tree_lock); 1333 read_lock(&nm_i->nat_tree_lock);
1344 ne = __lookup_nat_cache(nm_i, nid); 1334 ne = __lookup_nat_cache(nm_i, nid);
1345 if (ne && nat_get_blkaddr(ne) != NULL_ADDR) 1335 if (ne &&
1336 (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR))
1346 allocated = true; 1337 allocated = true;
1347 read_unlock(&nm_i->nat_tree_lock); 1338 read_unlock(&nm_i->nat_tree_lock);
1348 if (allocated) 1339 if (allocated)
@@ -1354,7 +1345,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1354 i->state = NID_NEW; 1345 i->state = NID_NEW;
1355 1346
1356 spin_lock(&nm_i->free_nid_list_lock); 1347 spin_lock(&nm_i->free_nid_list_lock);
1357 if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { 1348 if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
1358 spin_unlock(&nm_i->free_nid_list_lock); 1349 spin_unlock(&nm_i->free_nid_list_lock);
1359 kmem_cache_free(free_nid_slab, i); 1350 kmem_cache_free(free_nid_slab, i);
1360 return 0; 1351 return 0;
@@ -1368,13 +1359,19 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1368static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) 1359static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
1369{ 1360{
1370 struct free_nid *i; 1361 struct free_nid *i;
1362 bool need_free = false;
1363
1371 spin_lock(&nm_i->free_nid_list_lock); 1364 spin_lock(&nm_i->free_nid_list_lock);
1372 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1365 i = __lookup_free_nid_list(nm_i, nid);
1373 if (i && i->state == NID_NEW) { 1366 if (i && i->state == NID_NEW) {
1374 __del_from_free_nid_list(i); 1367 __del_from_free_nid_list(nm_i, i);
1375 nm_i->fcnt--; 1368 nm_i->fcnt--;
1369 need_free = true;
1376 } 1370 }
1377 spin_unlock(&nm_i->free_nid_list_lock); 1371 spin_unlock(&nm_i->free_nid_list_lock);
1372
1373 if (need_free)
1374 kmem_cache_free(free_nid_slab, i);
1378} 1375}
1379 1376
1380static void scan_nat_page(struct f2fs_nm_info *nm_i, 1377static void scan_nat_page(struct f2fs_nm_info *nm_i,
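
remove_free_nid() (and alloc_nid_done()/alloc_nid_failed() below) now unlink the entry under free_nid_list_lock but defer kmem_cache_free() until the lock is dropped; calling into the allocator while holding a spinlock is best avoided, and the radix tree delete already lengthens the critical section. A compilable userspace sketch of the defer-free pattern, with pthread spinlocks and malloc/free standing in for the kernel primitives:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_spinlock_t lock;

    struct free_nid { unsigned int nid; int state; };

    static struct free_nid *table[16];       /* toy lookup structure */

    static void remove_free_nid(unsigned int nid)
    {
        struct free_nid *i;
        int need_free = 0;

        pthread_spin_lock(&lock);
        i = table[nid % 16];
        if (i && i->nid == nid) {
            table[nid % 16] = NULL;          /* unlink under the lock */
            need_free = 1;                   /* ...but free later */
        }
        pthread_spin_unlock(&lock);

        if (need_free)                       /* allocator call happens */
            free(i);                         /* outside the spinlock */
    }

    int main(void)
    {
        pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
        struct free_nid *n = malloc(sizeof(*n));
        n->nid = 3;
        n->state = 0;
        table[3] = n;
        remove_free_nid(3);
        printf("slot empty: %s\n", table[3] ? "no" : "yes");
        return 0;
    }
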
@@ -1413,7 +1410,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1413 return; 1410 return;
1414 1411
1415 /* readahead nat pages to be scanned */ 1412 /* readahead nat pages to be scanned */
1416 ra_nat_pages(sbi, nid); 1413 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
1417 1414
1418 while (1) { 1415 while (1) {
1419 struct page *page = get_current_nat_page(sbi, nid); 1416 struct page *page = get_current_nat_page(sbi, nid);
@@ -1454,7 +1451,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1454{ 1451{
1455 struct f2fs_nm_info *nm_i = NM_I(sbi); 1452 struct f2fs_nm_info *nm_i = NM_I(sbi);
1456 struct free_nid *i = NULL; 1453 struct free_nid *i = NULL;
1457 struct list_head *this;
1458retry: 1454retry:
1459 if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) 1455 if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid))
1460 return false; 1456 return false;
@@ -1462,13 +1458,11 @@ retry:
1462 spin_lock(&nm_i->free_nid_list_lock); 1458 spin_lock(&nm_i->free_nid_list_lock);
1463 1459
1464 /* We should not use stale free nids created by build_free_nids */ 1460 /* We should not use stale free nids created by build_free_nids */
1465 if (nm_i->fcnt && !sbi->on_build_free_nids) { 1461 if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
1466 f2fs_bug_on(list_empty(&nm_i->free_nid_list)); 1462 f2fs_bug_on(list_empty(&nm_i->free_nid_list));
1467 list_for_each(this, &nm_i->free_nid_list) { 1463 list_for_each_entry(i, &nm_i->free_nid_list, list)
1468 i = list_entry(this, struct free_nid, list);
1469 if (i->state == NID_NEW) 1464 if (i->state == NID_NEW)
1470 break; 1465 break;
1471 }
1472 1466
1473 f2fs_bug_on(i->state != NID_NEW); 1467 f2fs_bug_on(i->state != NID_NEW);
1474 *nid = i->nid; 1468 *nid = i->nid;
@@ -1481,9 +1475,7 @@ retry:
1481 1475
1482 /* Let's scan nat pages and its caches to get free nids */ 1476 /* Let's scan nat pages and its caches to get free nids */
1483 mutex_lock(&nm_i->build_lock); 1477 mutex_lock(&nm_i->build_lock);
1484 sbi->on_build_free_nids = true;
1485 build_free_nids(sbi); 1478 build_free_nids(sbi);
1486 sbi->on_build_free_nids = false;
1487 mutex_unlock(&nm_i->build_lock); 1479 mutex_unlock(&nm_i->build_lock);
1488 goto retry; 1480 goto retry;
1489} 1481}
@@ -1497,10 +1489,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1497 struct free_nid *i; 1489 struct free_nid *i;
1498 1490
1499 spin_lock(&nm_i->free_nid_list_lock); 1491 spin_lock(&nm_i->free_nid_list_lock);
1500 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1492 i = __lookup_free_nid_list(nm_i, nid);
1501 f2fs_bug_on(!i || i->state != NID_ALLOC); 1493 f2fs_bug_on(!i || i->state != NID_ALLOC);
1502 __del_from_free_nid_list(i); 1494 __del_from_free_nid_list(nm_i, i);
1503 spin_unlock(&nm_i->free_nid_list_lock); 1495 spin_unlock(&nm_i->free_nid_list_lock);
1496
1497 kmem_cache_free(free_nid_slab, i);
1504} 1498}
1505 1499
1506/* 1500/*
@@ -1510,20 +1504,25 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1510{ 1504{
1511 struct f2fs_nm_info *nm_i = NM_I(sbi); 1505 struct f2fs_nm_info *nm_i = NM_I(sbi);
1512 struct free_nid *i; 1506 struct free_nid *i;
1507 bool need_free = false;
1513 1508
1514 if (!nid) 1509 if (!nid)
1515 return; 1510 return;
1516 1511
1517 spin_lock(&nm_i->free_nid_list_lock); 1512 spin_lock(&nm_i->free_nid_list_lock);
1518 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1513 i = __lookup_free_nid_list(nm_i, nid);
1519 f2fs_bug_on(!i || i->state != NID_ALLOC); 1514 f2fs_bug_on(!i || i->state != NID_ALLOC);
1520 if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { 1515 if (!available_free_memory(nm_i, FREE_NIDS)) {
1521 __del_from_free_nid_list(i); 1516 __del_from_free_nid_list(nm_i, i);
1517 need_free = true;
1522 } else { 1518 } else {
1523 i->state = NID_NEW; 1519 i->state = NID_NEW;
1524 nm_i->fcnt++; 1520 nm_i->fcnt++;
1525 } 1521 }
1526 spin_unlock(&nm_i->free_nid_list_lock); 1522 spin_unlock(&nm_i->free_nid_list_lock);
1523
1524 if (need_free)
1525 kmem_cache_free(free_nid_slab, i);
1527} 1526}
1528 1527
1529void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, 1528void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -1531,10 +1530,83 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1531 block_t new_blkaddr) 1530 block_t new_blkaddr)
1532{ 1531{
1533 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); 1532 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
1534 set_node_addr(sbi, ni, new_blkaddr); 1533 set_node_addr(sbi, ni, new_blkaddr, false);
1535 clear_node_page_dirty(page); 1534 clear_node_page_dirty(page);
1536} 1535}
1537 1536
1537void recover_inline_xattr(struct inode *inode, struct page *page)
1538{
1539 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1540 void *src_addr, *dst_addr;
1541 size_t inline_size;
1542 struct page *ipage;
1543 struct f2fs_inode *ri;
1544
1545 if (!f2fs_has_inline_xattr(inode))
1546 return;
1547
1548 if (!IS_INODE(page))
1549 return;
1550
1551 ri = F2FS_INODE(page);
1552 if (!(ri->i_inline & F2FS_INLINE_XATTR))
1553 return;
1554
1555 ipage = get_node_page(sbi, inode->i_ino);
1556 f2fs_bug_on(IS_ERR(ipage));
1557
1558 dst_addr = inline_xattr_addr(ipage);
1559 src_addr = inline_xattr_addr(page);
1560 inline_size = inline_xattr_size(inode);
1561
1562 memcpy(dst_addr, src_addr, inline_size);
1563
1564 update_inode(inode, ipage);
1565 f2fs_put_page(ipage, 1);
1566}
1567
1568bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1569{
1570 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1571 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
1572 nid_t new_xnid = nid_of_node(page);
1573 struct node_info ni;
1574
1575 recover_inline_xattr(inode, page);
1576
1577 if (!f2fs_has_xattr_block(ofs_of_node(page)))
1578 return false;
1579
1580 /* 1: invalidate the previous xattr nid */
1581 if (!prev_xnid)
1582 goto recover_xnid;
1583
1584 /* Deallocate node address */
1585 get_node_info(sbi, prev_xnid, &ni);
1586 f2fs_bug_on(ni.blk_addr == NULL_ADDR);
1587 invalidate_blocks(sbi, ni.blk_addr);
1588 dec_valid_node_count(sbi, inode);
1589 set_node_addr(sbi, &ni, NULL_ADDR, false);
1590
1591recover_xnid:
1592 /* 2: allocate new xattr nid */
1593 if (unlikely(!inc_valid_node_count(sbi, inode)))
1594 f2fs_bug_on(1);
1595
1596 remove_free_nid(NM_I(sbi), new_xnid);
1597 get_node_info(sbi, new_xnid, &ni);
1598 ni.ino = inode->i_ino;
1599 set_node_addr(sbi, &ni, NEW_ADDR, false);
1600 F2FS_I(inode)->i_xattr_nid = new_xnid;
1601
1602 /* 3: update xattr blkaddr */
1603 refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
1604 set_node_addr(sbi, &ni, blkaddr, false);
1605
1606 update_inode_page(inode);
1607 return true;
1608}
1609
1538int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) 1610int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1539{ 1611{
1540 struct f2fs_inode *src, *dst; 1612 struct f2fs_inode *src, *dst;
@@ -1567,7 +1639,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1567 1639
1568 if (unlikely(!inc_valid_node_count(sbi, NULL))) 1640 if (unlikely(!inc_valid_node_count(sbi, NULL)))
1569 WARN_ON(1); 1641 WARN_ON(1);
1570 set_node_addr(sbi, &new_ni, NEW_ADDR); 1642 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1571 inc_valid_inode_count(sbi); 1643 inc_valid_inode_count(sbi);
1572 f2fs_put_page(ipage, 1); 1644 f2fs_put_page(ipage, 1);
1573 return 0; 1645 return 0;
@@ -1590,15 +1662,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1590 for (; page_idx < start + nrpages; page_idx++) { 1662 for (; page_idx < start + nrpages; page_idx++) {
1591 /* alloc temporal page for read node summary info*/ 1663 /* alloc temporal page for read node summary info*/
1592 page = alloc_page(GFP_F2FS_ZERO); 1664 page = alloc_page(GFP_F2FS_ZERO);
1593 if (!page) { 1665 if (!page)
1594 struct page *tmp; 1666 break;
1595 list_for_each_entry_safe(page, tmp, pages, lru) {
1596 list_del(&page->lru);
1597 unlock_page(page);
1598 __free_pages(page, 0);
1599 }
1600 return -ENOMEM;
1601 }
1602 1667
1603 lock_page(page); 1668 lock_page(page);
1604 page->index = page_idx; 1669 page->index = page_idx;
@@ -1609,7 +1674,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1609 f2fs_submit_page_mbio(sbi, page, page->index, &fio); 1674 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
1610 1675
1611 f2fs_submit_merged_bio(sbi, META, READ); 1676 f2fs_submit_merged_bio(sbi, META, READ);
1612 return 0; 1677
1678 return page_idx - start;
1613} 1679}
1614 1680
1615int restore_node_summary(struct f2fs_sb_info *sbi, 1681int restore_node_summary(struct f2fs_sb_info *sbi,
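
ra_sum_pages() changes contract here: instead of unwinding everything and returning -ENOMEM on the first failed alloc_page(), it stops early and returns the number of pages actually queued, letting restore_node_summary() consume a partial batch. A toy version of the new contract:

    #include <stdio.h>
    #include <stdlib.h>

    /* allocate up to nrpages buffers, stop at the first failure, and
     * report how many were set up so the caller can use a partial batch */
    static int alloc_batch(void **bufs, int nrpages)
    {
        int i;

        for (i = 0; i < nrpages; i++) {
            bufs[i] = malloc(4096);
            if (!bufs[i])
                break;          /* partial success, not an error */
        }
        return i;               /* 0 is when the caller returns -ENOMEM */
    }

    int main(void)
    {
        void *bufs[8];
        int got = alloc_batch(bufs, 8);

        printf("got %d of 8 pages\n", got);
        while (got--)
            free(bufs[got]);
        return 0;
    }
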
@@ -1628,15 +1694,17 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1628 addr = START_BLOCK(sbi, segno); 1694 addr = START_BLOCK(sbi, segno);
1629 sum_entry = &sum->entries[0]; 1695 sum_entry = &sum->entries[0];
1630 1696
1631 for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { 1697 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
1632 nrpages = min(last_offset - i, bio_blocks); 1698 nrpages = min(last_offset - i, bio_blocks);
1633 1699
1634 /* read ahead node pages */ 1700 /* read ahead node pages */
1635 err = ra_sum_pages(sbi, &page_list, addr, nrpages); 1701 nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages);
1636 if (err) 1702 if (!nrpages)
1637 return err; 1703 return -ENOMEM;
1638 1704
1639 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1705 list_for_each_entry_safe(page, tmp, &page_list, lru) {
1706 if (err)
1707 goto skip;
1640 1708
1641 lock_page(page); 1709 lock_page(page);
1642 if (unlikely(!PageUptodate(page))) { 1710 if (unlikely(!PageUptodate(page))) {
@@ -1648,9 +1716,9 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1648 sum_entry->ofs_in_node = 0; 1716 sum_entry->ofs_in_node = 0;
1649 sum_entry++; 1717 sum_entry++;
1650 } 1718 }
1651
1652 list_del(&page->lru);
1653 unlock_page(page); 1719 unlock_page(page);
1720skip:
1721 list_del(&page->lru);
1654 __free_pages(page, 0); 1722 __free_pages(page, 0);
1655 } 1723 }
1656 } 1724 }
@@ -1709,7 +1777,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1709 struct f2fs_nm_info *nm_i = NM_I(sbi); 1777 struct f2fs_nm_info *nm_i = NM_I(sbi);
1710 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1778 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1711 struct f2fs_summary_block *sum = curseg->sum_blk; 1779 struct f2fs_summary_block *sum = curseg->sum_blk;
1712 struct list_head *cur, *n; 1780 struct nat_entry *ne, *cur;
1713 struct page *page = NULL; 1781 struct page *page = NULL;
1714 struct f2fs_nat_block *nat_blk = NULL; 1782 struct f2fs_nat_block *nat_blk = NULL;
1715 nid_t start_nid = 0, end_nid = 0; 1783 nid_t start_nid = 0, end_nid = 0;
@@ -1721,18 +1789,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1721 mutex_lock(&curseg->curseg_mutex); 1789 mutex_lock(&curseg->curseg_mutex);
1722 1790
1723 /* 1) flush dirty nat caches */ 1791 /* 1) flush dirty nat caches */
1724 list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { 1792 list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) {
1725 struct nat_entry *ne;
1726 nid_t nid; 1793 nid_t nid;
1727 struct f2fs_nat_entry raw_ne; 1794 struct f2fs_nat_entry raw_ne;
1728 int offset = -1; 1795 int offset = -1;
1729 block_t new_blkaddr; 1796 block_t new_blkaddr;
1730 1797
1731 ne = list_entry(cur, struct nat_entry, list);
1732 nid = nat_get_nid(ne);
1733
1734 if (nat_get_blkaddr(ne) == NEW_ADDR) 1798 if (nat_get_blkaddr(ne) == NEW_ADDR)
1735 continue; 1799 continue;
1800
1801 nid = nat_get_nid(ne);
1802
1736 if (flushed) 1803 if (flushed)
1737 goto to_nat_page; 1804 goto to_nat_page;
1738 1805
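
This hunk is one of several in the patch converting open-coded list_for_each()+list_entry() loops to list_for_each_entry_safe(), which fetches the next node before the body runs and so tolerates deleting the current entry. A freestanding miniature of the macros involved (kernel names, toy implementation; it relies on the GCC typeof extension, as the kernel does):

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
    #define list_entry(ptr, type, member) container_of(ptr, type, member)
    #define list_for_each_entry_safe(pos, n, head, member)               \
        for (pos = list_entry((head)->next, typeof(*pos), member),       \
             n = list_entry(pos->member.next, typeof(*pos), member);     \
             &pos->member != (head);                                     \
             pos = n, n = list_entry(n->member.next, typeof(*n), member))

    static void list_add_tail(struct list_head *e, struct list_head *h)
    {
        e->prev = h->prev;
        e->next = h;
        h->prev->next = e;
        h->prev = e;
    }

    static void list_del(struct list_head *e)
    {
        e->prev->next = e->next;
        e->next->prev = e->prev;
    }

    struct nat_entry { struct list_head list; int nid; };

    int main(void)
    {
        struct list_head head = { &head, &head };
        struct nat_entry *ne, *cur;
        int i;

        for (i = 0; i < 3; i++) {
            ne = malloc(sizeof(*ne));
            ne->nid = i;
            list_add_tail(&ne->list, &head);
        }
        /* deleting 'ne' is safe because 'cur' was fetched beforehand */
        list_for_each_entry_safe(ne, cur, &head, list) {
            printf("flushing nid %d\n", ne->nid);
            list_del(&ne->list);
            free(ne);
        }
        return 0;
    }
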
@@ -1783,16 +1850,12 @@ flush_now:
1783 } else { 1850 } else {
1784 write_lock(&nm_i->nat_tree_lock); 1851 write_lock(&nm_i->nat_tree_lock);
1785 __clear_nat_cache_dirty(nm_i, ne); 1852 __clear_nat_cache_dirty(nm_i, ne);
1786 ne->checkpointed = true;
1787 write_unlock(&nm_i->nat_tree_lock); 1853 write_unlock(&nm_i->nat_tree_lock);
1788 } 1854 }
1789 } 1855 }
1790 if (!flushed) 1856 if (!flushed)
1791 mutex_unlock(&curseg->curseg_mutex); 1857 mutex_unlock(&curseg->curseg_mutex);
1792 f2fs_put_page(page, 1); 1858 f2fs_put_page(page, 1);
1793
1794 /* 2) shrink nat caches if necessary */
1795 try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
1796} 1859}
1797 1860
1798static int init_node_manager(struct f2fs_sb_info *sbi) 1861static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1807,10 +1870,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1807 /* segment_count_nat includes pair segment so divide to 2. */ 1870 /* segment_count_nat includes pair segment so divide to 2. */
1808 nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; 1871 nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
1809 nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); 1872 nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
1810 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; 1873
1874 /* not used nids: 0, node, meta, (and root counted as valid node) */
1875 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3;
1811 nm_i->fcnt = 0; 1876 nm_i->fcnt = 0;
1812 nm_i->nat_cnt = 0; 1877 nm_i->nat_cnt = 0;
1878 nm_i->ram_thresh = DEF_RAM_THRESHOLD;
1813 1879
1880 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
1814 INIT_LIST_HEAD(&nm_i->free_nid_list); 1881 INIT_LIST_HEAD(&nm_i->free_nid_list);
1815 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); 1882 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1816 INIT_LIST_HEAD(&nm_i->nat_entries); 1883 INIT_LIST_HEAD(&nm_i->nat_entries);
@@ -1864,8 +1931,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1864 spin_lock(&nm_i->free_nid_list_lock); 1931 spin_lock(&nm_i->free_nid_list_lock);
1865 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { 1932 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
1866 f2fs_bug_on(i->state == NID_ALLOC); 1933 f2fs_bug_on(i->state == NID_ALLOC);
1867 __del_from_free_nid_list(i); 1934 __del_from_free_nid_list(nm_i, i);
1868 nm_i->fcnt--; 1935 nm_i->fcnt--;
1936 spin_unlock(&nm_i->free_nid_list_lock);
1937 kmem_cache_free(free_nid_slab, i);
1938 spin_lock(&nm_i->free_nid_list_lock);
1869 } 1939 }
1870 f2fs_bug_on(nm_i->fcnt); 1940 f2fs_bug_on(nm_i->fcnt);
1871 spin_unlock(&nm_i->free_nid_list_lock); 1941 spin_unlock(&nm_i->free_nid_list_lock);
@@ -1875,11 +1945,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1875 while ((found = __gang_lookup_nat_cache(nm_i, 1945 while ((found = __gang_lookup_nat_cache(nm_i,
1876 nid, NATVEC_SIZE, natvec))) { 1946 nid, NATVEC_SIZE, natvec))) {
1877 unsigned idx; 1947 unsigned idx;
1878 for (idx = 0; idx < found; idx++) { 1948 nid = nat_get_nid(natvec[found - 1]) + 1;
1879 struct nat_entry *e = natvec[idx]; 1949 for (idx = 0; idx < found; idx++)
1880 nid = nat_get_nid(e) + 1; 1950 __del_from_nat_cache(nm_i, natvec[idx]);
1881 __del_from_nat_cache(nm_i, e);
1882 }
1883 } 1951 }
1884 f2fs_bug_on(nm_i->nat_cnt); 1952 f2fs_bug_on(nm_i->nat_cnt);
1885 write_unlock(&nm_i->nat_tree_lock); 1953 write_unlock(&nm_i->nat_tree_lock);
@@ -1892,12 +1960,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1892int __init create_node_manager_caches(void) 1960int __init create_node_manager_caches(void)
1893{ 1961{
1894 nat_entry_slab = f2fs_kmem_cache_create("nat_entry", 1962 nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
1895 sizeof(struct nat_entry), NULL); 1963 sizeof(struct nat_entry));
1896 if (!nat_entry_slab) 1964 if (!nat_entry_slab)
1897 return -ENOMEM; 1965 return -ENOMEM;
1898 1966
1899 free_nid_slab = f2fs_kmem_cache_create("free_nid", 1967 free_nid_slab = f2fs_kmem_cache_create("free_nid",
1900 sizeof(struct free_nid), NULL); 1968 sizeof(struct free_nid));
1901 if (!free_nid_slab) { 1969 if (!free_nid_slab) {
1902 kmem_cache_destroy(nat_entry_slab); 1970 kmem_cache_destroy(nat_entry_slab);
1903 return -ENOMEM; 1971 return -ENOMEM;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index c4c79885c993..5decc1a375f0 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -17,14 +17,11 @@
17/* # of pages to perform readahead before building free nids */ 17/* # of pages to perform readahead before building free nids */
18#define FREE_NID_PAGES 4 18#define FREE_NID_PAGES 4
19 19
20/* maximum # of free node ids to produce during build_free_nids */
21#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
22
23/* maximum readahead size for node during getting data blocks */ 20/* maximum readahead size for node during getting data blocks */
24#define MAX_RA_NODE 128 21#define MAX_RA_NODE 128
25 22
26/* maximum cached nat entries to manage memory footprint */ 23/* control the memory footprint threshold (10MB per 1GB ram) */
27#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) 24#define DEF_RAM_THRESHOLD 10
28 25
29/* vector size for gang look-up from nat cache that consists of radix tree */ 26/* vector size for gang look-up from nat cache that consists of radix tree */
30#define NATVEC_SIZE 64 27#define NATVEC_SIZE 64
@@ -45,6 +42,7 @@ struct node_info {
45struct nat_entry { 42struct nat_entry {
46 struct list_head list; /* for clean or dirty nat list */ 43 struct list_head list; /* for clean or dirty nat list */
47 bool checkpointed; /* whether it is checkpointed or not */ 44 bool checkpointed; /* whether it is checkpointed or not */
45 bool fsync_done; /* whether the latest node has fsync mark */
48 struct node_info ni; /* in-memory node information */ 46 struct node_info ni; /* in-memory node information */
49}; 47};
50 48
@@ -58,9 +56,15 @@ struct nat_entry {
58#define nat_set_version(nat, v) (nat->ni.version = v) 56#define nat_set_version(nat, v) (nat->ni.version = v)
59 57
60#define __set_nat_cache_dirty(nm_i, ne) \ 58#define __set_nat_cache_dirty(nm_i, ne) \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); 59 do { \
60 ne->checkpointed = false; \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
62 } while (0);
62#define __clear_nat_cache_dirty(nm_i, ne) \ 63#define __clear_nat_cache_dirty(nm_i, ne) \
63 list_move_tail(&ne->list, &nm_i->nat_entries); 64 do { \
65 ne->checkpointed = true; \
66 list_move_tail(&ne->list, &nm_i->nat_entries); \
67 } while (0);
64#define inc_node_version(version) (++version) 68#define inc_node_version(version) (++version)
65 69
66static inline void node_info_from_raw_nat(struct node_info *ni, 70static inline void node_info_from_raw_nat(struct node_info *ni,
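
__set_nat_cache_dirty()/__clear_nat_cache_dirty() grow a second statement, so their bodies get wrapped in do { ... } while (0) to remain a single statement at the call site. (Incidentally, the hunk keeps a semicolon after while (0), which would still misparse under an unbraced if/else; the call sites here are plain statements, so it is harmless.) A small demo of why the wrapper matters:

    #include <stdio.h>

    static int dirty_count;

    /* two statements with no wrapper: under an unbraced if, only the
     * first statement would be conditional */
    #define MARK_DIRTY_BAD(x)  (x)->flag = 1; dirty_count++
    /* the do/while(0) form consumes the caller's semicolon and stays
     * one statement, so if/else works as expected */
    #define MARK_DIRTY_GOOD(x) do { (x)->flag = 1; dirty_count++; } while (0)

    struct obj { int flag; };

    int main(void)
    {
        struct obj o = { 0 };

        if (o.flag == 0)
            MARK_DIRTY_GOOD(&o);
        else
            printf("already dirty\n");

        printf("flag=%d count=%d\n", o.flag, dirty_count);  /* 1 1 */
        return 0;
    }
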
@@ -71,6 +75,11 @@ static inline void node_info_from_raw_nat(struct node_info *ni,
71 ni->version = raw_ne->version; 75 ni->version = raw_ne->version;
72} 76}
73 77
78enum nid_type {
79 FREE_NIDS, /* indicates the free nid list */
80 NAT_ENTRIES /* indicates the cached nat entry */
81};
82
74/* 83/*
75 * For free nid mangement 84 * For free nid mangement
76 */ 85 */
@@ -236,7 +245,7 @@ static inline bool IS_DNODE(struct page *node_page)
236{ 245{
237 unsigned int ofs = ofs_of_node(node_page); 246 unsigned int ofs = ofs_of_node(node_page);
238 247
239 if (ofs == XATTR_NODE_OFFSET) 248 if (f2fs_has_xattr_block(ofs))
240 return false; 249 return false;
241 250
242 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || 251 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 976a7a934db5..b1ae89f0f44e 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -27,14 +27,12 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi)
27static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, 27static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
28 nid_t ino) 28 nid_t ino)
29{ 29{
30 struct list_head *this;
31 struct fsync_inode_entry *entry; 30 struct fsync_inode_entry *entry;
32 31
33 list_for_each(this, head) { 32 list_for_each_entry(entry, head, list)
34 entry = list_entry(this, struct fsync_inode_entry, list);
35 if (entry->inode->i_ino == ino) 33 if (entry->inode->i_ino == ino)
36 return entry; 34 return entry;
37 } 35
38 return NULL; 36 return NULL;
39} 37}
40 38
@@ -136,7 +134,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
136 134
137 /* get node pages in the current segment */ 135 /* get node pages in the current segment */
138 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 136 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
139 blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; 137 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
140 138
141 /* read node page */ 139 /* read node page */
142 page = alloc_page(GFP_F2FS_ZERO); 140 page = alloc_page(GFP_F2FS_ZERO);
@@ -218,13 +216,12 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
218{ 216{
219 struct seg_entry *sentry; 217 struct seg_entry *sentry;
220 unsigned int segno = GET_SEGNO(sbi, blkaddr); 218 unsigned int segno = GET_SEGNO(sbi, blkaddr);
221 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & 219 unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
222 (sbi->blocks_per_seg - 1); 220 struct f2fs_summary_block *sum_node;
223 struct f2fs_summary sum; 221 struct f2fs_summary sum;
222 struct page *sum_page, *node_page;
224 nid_t ino, nid; 223 nid_t ino, nid;
225 void *kaddr;
226 struct inode *inode; 224 struct inode *inode;
227 struct page *node_page;
228 unsigned int offset; 225 unsigned int offset;
229 block_t bidx; 226 block_t bidx;
230 int i; 227 int i;
@@ -238,18 +235,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
238 struct curseg_info *curseg = CURSEG_I(sbi, i); 235 struct curseg_info *curseg = CURSEG_I(sbi, i);
239 if (curseg->segno == segno) { 236 if (curseg->segno == segno) {
240 sum = curseg->sum_blk->entries[blkoff]; 237 sum = curseg->sum_blk->entries[blkoff];
241 break; 238 goto got_it;
242 } 239 }
243 } 240 }
244 if (i > CURSEG_COLD_DATA) {
245 struct page *sum_page = get_sum_page(sbi, segno);
246 struct f2fs_summary_block *sum_node;
247 kaddr = page_address(sum_page);
248 sum_node = (struct f2fs_summary_block *)kaddr;
249 sum = sum_node->entries[blkoff];
250 f2fs_put_page(sum_page, 1);
251 }
252 241
242 sum_page = get_sum_page(sbi, segno);
243 sum_node = (struct f2fs_summary_block *)page_address(sum_page);
244 sum = sum_node->entries[blkoff];
245 f2fs_put_page(sum_page, 1);
246got_it:
253 /* Use the locked dnode page and inode */ 247 /* Use the locked dnode page and inode */
254 nid = le32_to_cpu(sum.nid); 248 nid = le32_to_cpu(sum.nid);
255 if (dn->inode->i_ino == nid) { 249 if (dn->inode->i_ino == nid) {
@@ -301,6 +295,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
301 if (recover_inline_data(inode, page)) 295 if (recover_inline_data(inode, page))
302 goto out; 296 goto out;
303 297
298 if (recover_xattr_data(inode, page, blkaddr))
299 goto out;
300
304 start = start_bidx_of_node(ofs_of_node(page), fi); 301 start = start_bidx_of_node(ofs_of_node(page), fi);
305 if (IS_INODE(page)) 302 if (IS_INODE(page))
306 end = start + ADDRS_PER_INODE(fi); 303 end = start + ADDRS_PER_INODE(fi);
@@ -317,7 +314,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
317 goto out; 314 goto out;
318 } 315 }
319 316
320 wait_on_page_writeback(dn.node_page); 317 f2fs_wait_on_page_writeback(dn.node_page, NODE);
321 318
322 get_node_info(sbi, dn.nid, &ni); 319 get_node_info(sbi, dn.nid, &ni);
323 f2fs_bug_on(ni.ino != ino_of_node(page)); 320 f2fs_bug_on(ni.ino != ino_of_node(page));
@@ -437,7 +434,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
437 bool need_writecp = false; 434 bool need_writecp = false;
438 435
439 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", 436 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
440 sizeof(struct fsync_inode_entry), NULL); 437 sizeof(struct fsync_inode_entry));
441 if (!fsync_entry_slab) 438 if (!fsync_entry_slab)
442 return -ENOMEM; 439 return -ENOMEM;
443 440
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7caac5f2ca9e..085f548be7a3 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -13,6 +13,7 @@
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/prefetch.h> 15#include <linux/prefetch.h>
16#include <linux/kthread.h>
16#include <linux/vmalloc.h> 17#include <linux/vmalloc.h>
17#include <linux/swap.h> 18#include <linux/swap.h>
18 19
@@ -24,6 +25,7 @@
24#define __reverse_ffz(x) __reverse_ffs(~(x)) 25#define __reverse_ffz(x) __reverse_ffs(~(x))
25 26
26static struct kmem_cache *discard_entry_slab; 27static struct kmem_cache *discard_entry_slab;
28static struct kmem_cache *flush_cmd_slab;
27 29
28/* 30/*
29 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since 31 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -195,6 +197,73 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
195 f2fs_sync_fs(sbi->sb, true); 197 f2fs_sync_fs(sbi->sb, true);
196} 198}
197 199
200static int issue_flush_thread(void *data)
201{
202 struct f2fs_sb_info *sbi = data;
203 struct f2fs_sm_info *sm_i = SM_I(sbi);
204 wait_queue_head_t *q = &sm_i->flush_wait_queue;
205repeat:
206 if (kthread_should_stop())
207 return 0;
208
209 spin_lock(&sm_i->issue_lock);
210 if (sm_i->issue_list) {
211 sm_i->dispatch_list = sm_i->issue_list;
212 sm_i->issue_list = sm_i->issue_tail = NULL;
213 }
214 spin_unlock(&sm_i->issue_lock);
215
216 if (sm_i->dispatch_list) {
217 struct bio *bio = bio_alloc(GFP_NOIO, 0);
218 struct flush_cmd *cmd, *next;
219 int ret;
220
221 bio->bi_bdev = sbi->sb->s_bdev;
222 ret = submit_bio_wait(WRITE_FLUSH, bio);
223
224 for (cmd = sm_i->dispatch_list; cmd; cmd = next) {
225 cmd->ret = ret;
226 next = cmd->next;
227 complete(&cmd->wait);
228 }
229 sm_i->dispatch_list = NULL;
230 }
231
232 wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list);
233 goto repeat;
234}
235
236int f2fs_issue_flush(struct f2fs_sb_info *sbi)
237{
238 struct f2fs_sm_info *sm_i = SM_I(sbi);
239 struct flush_cmd *cmd;
240 int ret;
241
242 if (!test_opt(sbi, FLUSH_MERGE))
243 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
244
245 cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC);
246 cmd->next = NULL;
247 cmd->ret = 0;
248 init_completion(&cmd->wait);
249
250 spin_lock(&sm_i->issue_lock);
251 if (sm_i->issue_list)
252 sm_i->issue_tail->next = cmd;
253 else
254 sm_i->issue_list = cmd;
255 sm_i->issue_tail = cmd;
256 spin_unlock(&sm_i->issue_lock);
257
258 if (!sm_i->dispatch_list)
259 wake_up(&sm_i->flush_wait_queue);
260
261 wait_for_completion(&cmd->wait);
262 ret = cmd->ret;
263 kmem_cache_free(flush_cmd_slab, cmd);
264 return ret;
265}
266
198static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, 267static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
199 enum dirty_type dirty_type) 268 enum dirty_type dirty_type)
200{ 269{
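
issue_flush_thread()/f2fs_issue_flush() above implement the new flush_merge mount option: concurrent fsync callers queue a flush_cmd and sleep on a per-command completion, while a single kernel thread drains the whole queue with one WRITE_FLUSH bio and fans the result back out. A userspace analogue of the same producer/worker/completion shape, with pthreads standing in for kthread, wait queues, and completions (illustrative only):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    struct flush_cmd {
        struct flush_cmd *next;
        int ret;
        int done;
        pthread_mutex_t mu;
        pthread_cond_t cv;
    };

    static pthread_mutex_t issue_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t issue_cv = PTHREAD_COND_INITIALIZER;
    static struct flush_cmd *issue_list, *issue_tail;
    static int stop;

    static void *issue_flush_thread(void *arg)
    {
        (void)arg;
        for (;;) {
            struct flush_cmd *list, *cmd, *next;

            pthread_mutex_lock(&issue_lock);
            while (!issue_list && !stop)
                pthread_cond_wait(&issue_cv, &issue_lock);
            list = issue_list;
            issue_list = issue_tail = NULL;      /* grab the whole batch */
            pthread_mutex_unlock(&issue_lock);
            if (!list && stop)
                return NULL;

            usleep(1000);                        /* one "flush" per batch */
            for (cmd = list; cmd; cmd = next) {
                next = cmd->next;
                pthread_mutex_lock(&cmd->mu);
                cmd->ret = 0;
                cmd->done = 1;
                pthread_cond_signal(&cmd->cv);   /* complete(&cmd->wait) */
                pthread_mutex_unlock(&cmd->mu);
            }
        }
    }

    static int f2fs_issue_flush(void)
    {
        struct flush_cmd cmd = { .mu = PTHREAD_MUTEX_INITIALIZER,
                                 .cv = PTHREAD_COND_INITIALIZER };

        pthread_mutex_lock(&issue_lock);
        if (issue_list)
            issue_tail->next = &cmd;
        else
            issue_list = &cmd;
        issue_tail = &cmd;
        pthread_cond_signal(&issue_cv);
        pthread_mutex_unlock(&issue_lock);

        pthread_mutex_lock(&cmd.mu);
        while (!cmd.done)
            pthread_cond_wait(&cmd.cv, &cmd.mu); /* wait_for_completion() */
        pthread_mutex_unlock(&cmd.mu);
        return cmd.ret;
    }

    int main(void)
    {
        pthread_t worker;

        pthread_create(&worker, NULL, issue_flush_thread, NULL);
        printf("flush ret = %d\n", f2fs_issue_flush());
        pthread_mutex_lock(&issue_lock);
        stop = 1;
        pthread_cond_signal(&issue_cv);
        pthread_mutex_unlock(&issue_lock);
        pthread_join(worker, NULL);
        return 0;
    }
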
@@ -340,8 +409,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
340void clear_prefree_segments(struct f2fs_sb_info *sbi) 409void clear_prefree_segments(struct f2fs_sb_info *sbi)
341{ 410{
342 struct list_head *head = &(SM_I(sbi)->discard_list); 411 struct list_head *head = &(SM_I(sbi)->discard_list);
343 struct list_head *this, *next; 412 struct discard_entry *entry, *this;
344 struct discard_entry *entry;
345 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 413 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
346 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 414 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
347 unsigned int total_segs = TOTAL_SEGS(sbi); 415 unsigned int total_segs = TOTAL_SEGS(sbi);
@@ -370,8 +438,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
370 mutex_unlock(&dirty_i->seglist_lock); 438 mutex_unlock(&dirty_i->seglist_lock);
371 439
372 /* send small discards */ 440 /* send small discards */
373 list_for_each_safe(this, next, head) { 441 list_for_each_entry_safe(entry, this, head, list) {
374 entry = list_entry(this, struct discard_entry, list);
375 f2fs_issue_discard(sbi, entry->blkaddr, entry->len); 442 f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
376 list_del(&entry->list); 443 list_del(&entry->list);
377 SM_I(sbi)->nr_discards -= entry->len; 444 SM_I(sbi)->nr_discards -= entry->len;
@@ -405,7 +472,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
405 472
406 se = get_seg_entry(sbi, segno); 473 se = get_seg_entry(sbi, segno);
407 new_vblocks = se->valid_blocks + del; 474 new_vblocks = se->valid_blocks + del;
408 offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); 475 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
409 476
410 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || 477 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
411 (new_vblocks > sbi->blocks_per_seg))); 478 (new_vblocks > sbi->blocks_per_seg)));
@@ -434,12 +501,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
434 get_sec_entry(sbi, segno)->valid_blocks += del; 501 get_sec_entry(sbi, segno)->valid_blocks += del;
435} 502}
436 503
437static void refresh_sit_entry(struct f2fs_sb_info *sbi, 504void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new)
438 block_t old_blkaddr, block_t new_blkaddr)
439{ 505{
440 update_sit_entry(sbi, new_blkaddr, 1); 506 update_sit_entry(sbi, new, 1);
441 if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) 507 if (GET_SEGNO(sbi, old) != NULL_SEGNO)
442 update_sit_entry(sbi, old_blkaddr, -1); 508 update_sit_entry(sbi, old, -1);
509
510 locate_dirty_segment(sbi, GET_SEGNO(sbi, old));
511 locate_dirty_segment(sbi, GET_SEGNO(sbi, new));
443} 512}
444 513
445void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) 514void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
@@ -881,17 +950,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
881 950
882 stat_inc_block_count(sbi, curseg); 951 stat_inc_block_count(sbi, curseg);
883 952
953 if (!__has_curseg_space(sbi, type))
954 sit_i->s_ops->allocate_segment(sbi, type, false);
884 /* 955 /*
885 * SIT information should be updated before segment allocation, 956 * SIT information should be updated before segment allocation,
886 * since SSR needs latest valid block information. 957 * since SSR needs latest valid block information.
887 */ 958 */
888 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); 959 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
889
890 if (!__has_curseg_space(sbi, type))
891 sit_i->s_ops->allocate_segment(sbi, type, false);
892
893 locate_dirty_segment(sbi, old_cursegno); 960 locate_dirty_segment(sbi, old_cursegno);
894 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 961
895 mutex_unlock(&sit_i->sentry_lock); 962 mutex_unlock(&sit_i->sentry_lock);
896 963
897 if (page && IS_NODESEG(type)) 964 if (page && IS_NODESEG(type))
@@ -987,14 +1054,11 @@ void recover_data_page(struct f2fs_sb_info *sbi,
987 change_curseg(sbi, type, true); 1054 change_curseg(sbi, type, true);
988 } 1055 }
989 1056
990 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 1057 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
991 (sbi->blocks_per_seg - 1);
992 __add_sum_entry(sbi, type, sum); 1058 __add_sum_entry(sbi, type, sum);
993 1059
994 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1060 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
995
996 locate_dirty_segment(sbi, old_cursegno); 1061 locate_dirty_segment(sbi, old_cursegno);
997 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
998 1062
999 mutex_unlock(&sit_i->sentry_lock); 1063 mutex_unlock(&sit_i->sentry_lock);
1000 mutex_unlock(&curseg->curseg_mutex); 1064 mutex_unlock(&curseg->curseg_mutex);
@@ -1028,8 +1092,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
1028 curseg->next_segno = segno; 1092 curseg->next_segno = segno;
1029 change_curseg(sbi, type, true); 1093 change_curseg(sbi, type, true);
1030 } 1094 }
1031 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 1095 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
1032 (sbi->blocks_per_seg - 1);
1033 __add_sum_entry(sbi, type, sum); 1096 __add_sum_entry(sbi, type, sum);
1034 1097
1035 /* change the current log to the next block addr in advance */ 1098 /* change the current log to the next block addr in advance */
@@ -1037,28 +1100,50 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
1037 curseg->next_segno = next_segno; 1100 curseg->next_segno = next_segno;
1038 change_curseg(sbi, type, true); 1101 change_curseg(sbi, type, true);
1039 } 1102 }
1040 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & 1103 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
1041 (sbi->blocks_per_seg - 1);
1042 1104
1043 /* rewrite node page */ 1105 /* rewrite node page */
1044 set_page_writeback(page); 1106 set_page_writeback(page);
1045 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); 1107 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
1046 f2fs_submit_merged_bio(sbi, NODE, WRITE); 1108 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1047 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1109 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
1048
1049 locate_dirty_segment(sbi, old_cursegno); 1110 locate_dirty_segment(sbi, old_cursegno);
1050 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
1051 1111
1052 mutex_unlock(&sit_i->sentry_lock); 1112 mutex_unlock(&sit_i->sentry_lock);
1053 mutex_unlock(&curseg->curseg_mutex); 1113 mutex_unlock(&curseg->curseg_mutex);
1054} 1114}
1055 1115
1116static inline bool is_merged_page(struct f2fs_sb_info *sbi,
1117 struct page *page, enum page_type type)
1118{
1119 enum page_type btype = PAGE_TYPE_OF_BIO(type);
1120 struct f2fs_bio_info *io = &sbi->write_io[btype];
1121 struct bio_vec *bvec;
1122 int i;
1123
1124 down_read(&io->io_rwsem);
1125 if (!io->bio)
1126 goto out;
1127
1128 bio_for_each_segment_all(bvec, io->bio, i) {
1129 if (page == bvec->bv_page) {
1130 up_read(&io->io_rwsem);
1131 return true;
1132 }
1133 }
1134
1135out:
1136 up_read(&io->io_rwsem);
1137 return false;
1138}
1139
1056void f2fs_wait_on_page_writeback(struct page *page, 1140void f2fs_wait_on_page_writeback(struct page *page,
1057 enum page_type type) 1141 enum page_type type)
1058{ 1142{
1059 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1143 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1060 if (PageWriteback(page)) { 1144 if (PageWriteback(page)) {
1061 f2fs_submit_merged_bio(sbi, type, WRITE); 1145 if (is_merged_page(sbi, page, type))
1146 f2fs_submit_merged_bio(sbi, type, WRITE);
1062 wait_on_page_writeback(page); 1147 wait_on_page_writeback(page);
1063 } 1148 }
1064} 1149}
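
f2fs_wait_on_page_writeback() previously forced the shared write bio out unconditionally; with is_merged_page() it first checks whether the page being waited on is actually held in the pending bio, and skips the submit otherwise. A sketch of that membership test, with an array standing in for struct bio (names illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    #define BATCH 8

    static const void *pending[BATCH];   /* pages sitting in the open bio */
    static int npending;

    static bool is_merged_page(const void *page)
    {
        for (int i = 0; i < npending; i++)
            if (pending[i] == page)
                return true;
        return false;
    }

    static void wait_on_page(const void *page)
    {
        if (is_merged_page(page))
            printf("submit merged bio, then wait\n");   /* flush needed */
        else
            printf("just wait; a flush would be wasted\n");
    }

    int main(void)
    {
        int a, b;

        pending[npending++] = &a;
        wait_on_page(&a);
        wait_on_page(&b);
        return 0;
    }
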
@@ -1167,9 +1252,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1167 ns->ofs_in_node = 0; 1252 ns->ofs_in_node = 0;
1168 } 1253 }
1169 } else { 1254 } else {
1170 if (restore_node_summary(sbi, segno, sum)) { 1255 int err;
1256
1257 err = restore_node_summary(sbi, segno, sum);
1258 if (err) {
1171 f2fs_put_page(new, 1); 1259 f2fs_put_page(new, 1);
1172 return -EINVAL; 1260 return err;
1173 } 1261 }
1174 } 1262 }
1175 } 1263 }
@@ -1190,6 +1278,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1190static int restore_curseg_summaries(struct f2fs_sb_info *sbi) 1278static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1191{ 1279{
1192 int type = CURSEG_HOT_DATA; 1280 int type = CURSEG_HOT_DATA;
1281 int err;
1193 1282
1194 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { 1283 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
1195 /* restore for compacted data summary */ 1284 /* restore for compacted data summary */
@@ -1198,9 +1287,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1198 type = CURSEG_HOT_NODE; 1287 type = CURSEG_HOT_NODE;
1199 } 1288 }
1200 1289
1201 for (; type <= CURSEG_COLD_NODE; type++) 1290 for (; type <= CURSEG_COLD_NODE; type++) {
1202 if (read_normal_summaries(sbi, type)) 1291 err = read_normal_summaries(sbi, type);
1203 return -EINVAL; 1292 if (err)
1293 return err;
1294 }
1295
1204 return 0; 1296 return 0;
1205} 1297}
1206 1298
@@ -1583,47 +1675,6 @@ static int build_curseg(struct f2fs_sb_info *sbi)
1583 return restore_curseg_summaries(sbi); 1675 return restore_curseg_summaries(sbi);
1584} 1676}
1585 1677
1586static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages)
1587{
1588 struct address_space *mapping = META_MAPPING(sbi);
1589 struct page *page;
1590 block_t blk_addr, prev_blk_addr = 0;
1591 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1592 int blkno = start;
1593 struct f2fs_io_info fio = {
1594 .type = META,
1595 .rw = READ_SYNC | REQ_META | REQ_PRIO
1596 };
1597
1598 for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) {
1599
1600 blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK);
1601
1602 if (blkno != start && prev_blk_addr + 1 != blk_addr)
1603 break;
1604 prev_blk_addr = blk_addr;
1605repeat:
1606 page = grab_cache_page(mapping, blk_addr);
1607 if (!page) {
1608 cond_resched();
1609 goto repeat;
1610 }
1611 if (PageUptodate(page)) {
1612 mark_page_accessed(page);
1613 f2fs_put_page(page, 1);
1614 continue;
1615 }
1616
1617 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
1618
1619 mark_page_accessed(page);
1620 f2fs_put_page(page, 0);
1621 }
1622
1623 f2fs_submit_merged_bio(sbi, META, READ);
1624 return blkno - start;
1625}
1626
1627static void build_sit_entries(struct f2fs_sb_info *sbi) 1678static void build_sit_entries(struct f2fs_sb_info *sbi)
1628{ 1679{
1629 struct sit_info *sit_i = SIT_I(sbi); 1680 struct sit_info *sit_i = SIT_I(sbi);
@@ -1635,7 +1686,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1635 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1686 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1636 1687
1637 do { 1688 do {
1638 readed = ra_sit_pages(sbi, start_blk, nrpages); 1689 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
1639 1690
1640 start = start_blk * sit_i->sents_per_block; 1691 start = start_blk * sit_i->sents_per_block;
1641 end = (start_blk + readed) * sit_i->sents_per_block; 1692 end = (start_blk + readed) * sit_i->sents_per_block;
@@ -1781,6 +1832,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1781{ 1832{
1782 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); 1833 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
1783 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 1834 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1835 dev_t dev = sbi->sb->s_bdev->bd_dev;
1784 struct f2fs_sm_info *sm_info; 1836 struct f2fs_sm_info *sm_info;
1785 int err; 1837 int err;
1786 1838
@@ -1799,7 +1851,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1799 sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); 1851 sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
1800 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); 1852 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
1801 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 1853 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1802 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; 1854 sm_info->rec_prefree_segments = sm_info->main_segments *
1855 DEF_RECLAIM_PREFREE_SEGMENTS / 100;
1803 sm_info->ipu_policy = F2FS_IPU_DISABLE; 1856 sm_info->ipu_policy = F2FS_IPU_DISABLE;
1804 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; 1857 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
1805 1858
@@ -1807,6 +1860,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1807 sm_info->nr_discards = 0; 1860 sm_info->nr_discards = 0;
1808 sm_info->max_discards = 0; 1861 sm_info->max_discards = 0;
1809 1862
1863 if (test_opt(sbi, FLUSH_MERGE)) {
1864 spin_lock_init(&sm_info->issue_lock);
1865 init_waitqueue_head(&sm_info->flush_wait_queue);
1866
1867 sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
1868 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
1869 if (IS_ERR(sm_info->f2fs_issue_flush))
1870 return PTR_ERR(sm_info->f2fs_issue_flush);
1871 }
1872
1810 err = build_sit_info(sbi); 1873 err = build_sit_info(sbi);
1811 if (err) 1874 if (err)
1812 return err; 1875 return err;
@@ -1915,6 +1978,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1915 struct f2fs_sm_info *sm_info = SM_I(sbi); 1978 struct f2fs_sm_info *sm_info = SM_I(sbi);
1916 if (!sm_info) 1979 if (!sm_info)
1917 return; 1980 return;
1981 if (sm_info->f2fs_issue_flush)
1982 kthread_stop(sm_info->f2fs_issue_flush);
1918 destroy_dirty_segmap(sbi); 1983 destroy_dirty_segmap(sbi);
1919 destroy_curseg(sbi); 1984 destroy_curseg(sbi);
1920 destroy_free_segmap(sbi); 1985 destroy_free_segmap(sbi);
@@ -1926,13 +1991,20 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1926int __init create_segment_manager_caches(void) 1991int __init create_segment_manager_caches(void)
1927{ 1992{
1928 discard_entry_slab = f2fs_kmem_cache_create("discard_entry", 1993 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
1929 sizeof(struct discard_entry), NULL); 1994 sizeof(struct discard_entry));
1930 if (!discard_entry_slab) 1995 if (!discard_entry_slab)
1931 return -ENOMEM; 1996 return -ENOMEM;
1997 flush_cmd_slab = f2fs_kmem_cache_create("flush_command",
1998 sizeof(struct flush_cmd));
1999 if (!flush_cmd_slab) {
2000 kmem_cache_destroy(discard_entry_slab);
2001 return -ENOMEM;
2002 }
1932 return 0; 2003 return 0;
1933} 2004}
1934 2005
1935void destroy_segment_manager_caches(void) 2006void destroy_segment_manager_caches(void)
1936{ 2007{
1937 kmem_cache_destroy(discard_entry_slab); 2008 kmem_cache_destroy(discard_entry_slab);
2009 kmem_cache_destroy(flush_cmd_slab);
1938} 2010}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5731682d7516..7091204680f4 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -14,7 +14,7 @@
14#define NULL_SEGNO ((unsigned int)(~0)) 14#define NULL_SEGNO ((unsigned int)(~0))
15#define NULL_SECNO ((unsigned int)(~0)) 15#define NULL_SECNO ((unsigned int)(~0))
16 16
17#define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */ 17#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
18 18
19/* L: Logical segment # in volume, R: Relative segment # in main area */ 19/* L: Logical segment # in volume, R: Relative segment # in main area */
20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) 20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
@@ -57,6 +57,9 @@
57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr) 57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ 58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) 59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
60#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
61 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
62
60#define GET_SEGNO(sbi, blk_addr) \ 63#define GET_SEGNO(sbi, blk_addr) \
61 (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ 64 (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
62 NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ 65 NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
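
GET_BLKOFF_FROM_SEG0() centralizes the "& (blocks_per_seg - 1)" expression that segment.c previously open-coded in several places. The mask is a power-of-two modulo; a quick check of the identity it relies on:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int blocks_per_seg = 512;   /* 2^9, as in f2fs */
        unsigned int blk_addr = 5000;

        assert((blocks_per_seg & (blocks_per_seg - 1)) == 0); /* power of 2 */

        unsigned int off_mask = blk_addr & (blocks_per_seg - 1);
        unsigned int off_mod  = blk_addr % blocks_per_seg;

        printf("mask=%u mod=%u\n", off_mask, off_mod);        /* both 392 */
        assert(off_mask == off_mod);
        return 0;
    }
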
@@ -377,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
377 380
378static inline block_t written_block_count(struct f2fs_sb_info *sbi) 381static inline block_t written_block_count(struct f2fs_sb_info *sbi)
379{ 382{
380 struct sit_info *sit_i = SIT_I(sbi); 383 return SIT_I(sbi)->written_valid_blocks;
381 block_t vblocks;
382
383 mutex_lock(&sit_i->sentry_lock);
384 vblocks = sit_i->written_valid_blocks;
385 mutex_unlock(&sit_i->sentry_lock);
386
387 return vblocks;
388} 384}
389 385
390static inline unsigned int free_segments(struct f2fs_sb_info *sbi) 386static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
391{ 387{
392 struct free_segmap_info *free_i = FREE_I(sbi); 388 return FREE_I(sbi)->free_segments;
393 unsigned int free_segs;
394
395 read_lock(&free_i->segmap_lock);
396 free_segs = free_i->free_segments;
397 read_unlock(&free_i->segmap_lock);
398
399 return free_segs;
400} 389}
401 390
402static inline int reserved_segments(struct f2fs_sb_info *sbi) 391static inline int reserved_segments(struct f2fs_sb_info *sbi)
@@ -406,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi)
406 395
407static inline unsigned int free_sections(struct f2fs_sb_info *sbi) 396static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
408{ 397{
409 struct free_segmap_info *free_i = FREE_I(sbi); 398 return FREE_I(sbi)->free_sections;
410 unsigned int free_secs;
411
412 read_lock(&free_i->segmap_lock);
413 free_secs = free_i->free_sections;
414 read_unlock(&free_i->segmap_lock);
415
416 return free_secs;
417} 399}
418 400
419static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) 401static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
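
written_block_count(), free_segments() and free_sections() drop their locks and return a single counter directly: for these allocation heuristics a momentarily stale snapshot is acceptable, and the kernel relies on aligned word reads being atomic. In portable userspace code the closest analogue is a relaxed atomic load, sketched here:

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned int free_segments;

    static unsigned int read_free_segments(void)
    {
        /* no lock: a momentarily stale snapshot is fine here */
        return atomic_load_explicit(&free_segments, memory_order_relaxed);
    }

    int main(void)
    {
        atomic_store(&free_segments, 123);
        printf("free segments: %u\n", read_free_segments());
        return 0;
    }
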
@@ -682,3 +664,46 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
682 struct request_queue *q = bdev_get_queue(bdev); 664 struct request_queue *q = bdev_get_queue(bdev);
683 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); 665 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
684} 666}
667
668/*
669 * It is very important to gather dirty pages and write at once, so that we can
670 * submit a big bio without interfering other data writes.
671 * By default, 512 pages for directory data,
672 * 512 pages (2MB) * 3 for three types of nodes, and
673 * max_bio_blocks for meta are set.
674 */
675static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
676{
677 if (type == DATA)
678 return sbi->blocks_per_seg;
679 else if (type == NODE)
680 return 3 * sbi->blocks_per_seg;
681 else if (type == META)
682 return MAX_BIO_BLOCKS(max_hw_blocks(sbi));
683 else
684 return 0;
685}
686
687/*
688 * When writing pages, it'd better align nr_to_write for segment size.
689 */
690static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
691 struct writeback_control *wbc)
692{
693 long nr_to_write, desired;
694
695 if (wbc->sync_mode != WB_SYNC_NONE)
696 return 0;
697
698 nr_to_write = wbc->nr_to_write;
699
700 if (type == DATA)
701 desired = 4096;
702 else if (type == NODE)
703 desired = 3 * max_hw_blocks(sbi);
704 else
705 desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
706
707 wbc->nr_to_write = desired;
708 return desired - nr_to_write;
709}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1a85f83abd53..c756923a7302 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -51,6 +51,7 @@ enum {
51 Opt_disable_ext_identify, 51 Opt_disable_ext_identify,
52 Opt_inline_xattr, 52 Opt_inline_xattr,
53 Opt_inline_data, 53 Opt_inline_data,
54 Opt_flush_merge,
54 Opt_err, 55 Opt_err,
55}; 56};
56 57
@@ -67,6 +68,7 @@ static match_table_t f2fs_tokens = {
67 {Opt_disable_ext_identify, "disable_ext_identify"}, 68 {Opt_disable_ext_identify, "disable_ext_identify"},
68 {Opt_inline_xattr, "inline_xattr"}, 69 {Opt_inline_xattr, "inline_xattr"},
69 {Opt_inline_data, "inline_data"}, 70 {Opt_inline_data, "inline_data"},
71 {Opt_flush_merge, "flush_merge"},
70 {Opt_err, NULL}, 72 {Opt_err, NULL},
71}; 73};
72 74
@@ -74,6 +76,7 @@ static match_table_t f2fs_tokens = {
74enum { 76enum {
75 GC_THREAD, /* struct f2fs_gc_thread */ 77 GC_THREAD, /* struct f2fs_gc_thread */
76 SM_INFO, /* struct f2fs_sm_info */ 78 SM_INFO, /* struct f2fs_sm_info */
79 NM_INFO, /* struct f2fs_nm_info */
77 F2FS_SBI, /* struct f2fs_sb_info */ 80 F2FS_SBI, /* struct f2fs_sb_info */
78}; 81};
79 82
@@ -92,6 +95,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
92 return (unsigned char *)sbi->gc_thread; 95 return (unsigned char *)sbi->gc_thread;
93 else if (struct_type == SM_INFO) 96 else if (struct_type == SM_INFO)
94 return (unsigned char *)SM_I(sbi); 97 return (unsigned char *)SM_I(sbi);
98 else if (struct_type == NM_INFO)
99 return (unsigned char *)NM_I(sbi);
95 else if (struct_type == F2FS_SBI) 100 else if (struct_type == F2FS_SBI)
96 return (unsigned char *)sbi; 101 return (unsigned char *)sbi;
97 return NULL; 102 return NULL;
@@ -183,7 +188,9 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
183F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 188F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
184F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 189F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
185F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
191F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
186F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 192F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
193F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
187 194
188#define ATTR_LIST(name) (&f2fs_attr_##name.attr) 195#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
189static struct attribute *f2fs_attrs[] = { 196static struct attribute *f2fs_attrs[] = {
@@ -196,6 +203,8 @@ static struct attribute *f2fs_attrs[] = {
196 ATTR_LIST(ipu_policy), 203 ATTR_LIST(ipu_policy),
197 ATTR_LIST(min_ipu_util), 204 ATTR_LIST(min_ipu_util),
198 ATTR_LIST(max_victim_search), 205 ATTR_LIST(max_victim_search),
206 ATTR_LIST(dir_level),
207 ATTR_LIST(ram_thresh),
199 NULL, 208 NULL,
200}; 209};
201 210
@@ -256,9 +265,9 @@ static int parse_options(struct super_block *sb, char *options)
256 265
257 if (!name) 266 if (!name)
258 return -ENOMEM; 267 return -ENOMEM;
259 if (!strncmp(name, "on", 2)) 268 if (strlen(name) == 2 && !strncmp(name, "on", 2))
260 set_opt(sbi, BG_GC); 269 set_opt(sbi, BG_GC);
261 else if (!strncmp(name, "off", 3)) 270 else if (strlen(name) == 3 && !strncmp(name, "off", 3))
262 clear_opt(sbi, BG_GC); 271 clear_opt(sbi, BG_GC);
263 else { 272 else {
264 kfree(name); 273 kfree(name);
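
The added strlen() checks matter because strncmp() alone matches prefixes: a bogus option such as background_gc=online would previously have enabled BG_GC. A standalone illustration (userspace, values invented):

        #include <assert.h>
        #include <string.h>

        int main(void)
        {
                /* prefix match that the old code accepted */
                assert(strncmp("online", "on", 2) == 0);
                /* the new length guard rejects it */
                assert(!(strlen("online") == 2 && strncmp("online", "on", 2) == 0));
                return 0;
        }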
@@ -327,6 +336,9 @@ static int parse_options(struct super_block *sb, char *options)
327 case Opt_inline_data: 336 case Opt_inline_data:
328 set_opt(sbi, INLINE_DATA); 337 set_opt(sbi, INLINE_DATA);
329 break; 338 break;
339 case Opt_flush_merge:
340 set_opt(sbi, FLUSH_MERGE);
341 break;
330 default: 342 default:
331 f2fs_msg(sb, KERN_ERR, 343 f2fs_msg(sb, KERN_ERR,
332 "Unrecognized mount option \"%s\" or missing value", 344 "Unrecognized mount option \"%s\" or missing value",
@@ -353,12 +365,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
353 fi->i_current_depth = 1; 365 fi->i_current_depth = 1;
354 fi->i_advise = 0; 366 fi->i_advise = 0;
355 rwlock_init(&fi->ext.ext_lock); 367 rwlock_init(&fi->ext.ext_lock);
368 init_rwsem(&fi->i_sem);
356 369
357 set_inode_flag(fi, FI_NEW_INODE); 370 set_inode_flag(fi, FI_NEW_INODE);
358 371
359 if (test_opt(F2FS_SB(sb), INLINE_XATTR)) 372 if (test_opt(F2FS_SB(sb), INLINE_XATTR))
360 set_inode_flag(fi, FI_INLINE_XATTR); 373 set_inode_flag(fi, FI_INLINE_XATTR);
361 374
 375 /* Used only by directories */
376 fi->i_dir_level = F2FS_SB(sb)->dir_level;
377
362 return &fi->vfs_inode; 378 return &fi->vfs_inode;
363} 379}
364 380
@@ -526,6 +542,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
526 seq_puts(seq, ",disable_ext_identify"); 542 seq_puts(seq, ",disable_ext_identify");
527 if (test_opt(sbi, INLINE_DATA)) 543 if (test_opt(sbi, INLINE_DATA))
528 seq_puts(seq, ",inline_data"); 544 seq_puts(seq, ",inline_data");
545 if (test_opt(sbi, FLUSH_MERGE))
546 seq_puts(seq, ",flush_merge");
529 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 547 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
530 548
531 return 0; 549 return 0;
@@ -539,13 +557,22 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
539 le32_to_cpu(sbi->raw_super->segment_count_main); 557 le32_to_cpu(sbi->raw_super->segment_count_main);
540 int i; 558 int i;
541 559
560 seq_puts(seq, "format: segment_type|valid_blocks\n"
561 "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
562
542 for (i = 0; i < total_segs; i++) { 563 for (i = 0; i < total_segs; i++) {
543 seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); 564 struct seg_entry *se = get_seg_entry(sbi, i);
544 if (i != 0 && (i % 10) == 0) 565
545 seq_puts(seq, "\n"); 566 if ((i % 10) == 0)
567 seq_printf(seq, "%-5d", i);
568 seq_printf(seq, "%d|%-3u", se->type,
569 get_valid_blocks(sbi, i, 1));
570 if ((i % 10) == 9 || i == (total_segs - 1))
571 seq_putc(seq, '\n');
546 else 572 else
547 seq_puts(seq, " "); 573 seq_putc(seq, ' ');
548 } 574 }
575
549 return 0; 576 return 0;
550} 577}
551 578
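
With the header in place, the segment_info proc file renders ten segments per row, each as segment_type|valid_blocks, with the row's starting index left-justified in five columns. A hypothetical excerpt (all values invented):

        format: segment_type|valid_blocks
        segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)
        0    0|512 0|512 0|448 2|0   2|17  5|12  5|3   0|512 1|256 1|0
        10   3|128 3|0   4|64  0|512 0|512 2|300 2|1   5|0   5|9   0|512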
@@ -568,6 +595,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
568 struct f2fs_mount_info org_mount_opt; 595 struct f2fs_mount_info org_mount_opt;
569 int err, active_logs; 596 int err, active_logs;
570 597
598 sync_filesystem(sb);
599
571 /* 600 /*
572 * Save the old mount options in case we 601 * Save the old mount options in case we
573 * need to restore them. 602 * need to restore them.
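
The sync_filesystem(sb) calls appearing at the top of the ->remount_fs() implementations in this merge (here, and in fat and freevxfs below) come from the series that pushed the sync out of the generic do_remount_sb() and into each filesystem, so that a filesystem can decide for itself whether dirty data must be flushed before the mount flags change.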
@@ -638,6 +667,8 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
638 667
639 if (unlikely(ino < F2FS_ROOT_INO(sbi))) 668 if (unlikely(ino < F2FS_ROOT_INO(sbi)))
640 return ERR_PTR(-ESTALE); 669 return ERR_PTR(-ESTALE);
670 if (unlikely(ino >= NM_I(sbi)->max_nid))
671 return ERR_PTR(-ESTALE);
641 672
642 /* 673 /*
643 * f2fs_iget isn't quite right if the inode is currently unallocated! 674 * f2fs_iget isn't quite right if the inode is currently unallocated!
@@ -785,6 +816,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
785 816
786 for (i = 0; i < NR_COUNT_TYPE; i++) 817 for (i = 0; i < NR_COUNT_TYPE; i++)
787 atomic_set(&sbi->nr_pages[i], 0); 818 atomic_set(&sbi->nr_pages[i], 0);
819
820 sbi->dir_level = DEF_DIR_LEVEL;
788} 821}
789 822
790/* 823/*
@@ -896,11 +929,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
896 sbi->por_doing = false; 929 sbi->por_doing = false;
897 spin_lock_init(&sbi->stat_lock); 930 spin_lock_init(&sbi->stat_lock);
898 931
899 mutex_init(&sbi->read_io.io_mutex); 932 init_rwsem(&sbi->read_io.io_rwsem);
900 sbi->read_io.sbi = sbi; 933 sbi->read_io.sbi = sbi;
901 sbi->read_io.bio = NULL; 934 sbi->read_io.bio = NULL;
902 for (i = 0; i < NR_PAGE_TYPE; i++) { 935 for (i = 0; i < NR_PAGE_TYPE; i++) {
903 mutex_init(&sbi->write_io[i].io_mutex); 936 init_rwsem(&sbi->write_io[i].io_rwsem);
904 sbi->write_io[i].sbi = sbi; 937 sbi->write_io[i].sbi = sbi;
905 sbi->write_io[i].bio = NULL; 938 sbi->write_io[i].bio = NULL;
906 } 939 }
@@ -989,28 +1022,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
989 goto free_root_inode; 1022 goto free_root_inode;
990 } 1023 }
991 1024
992 /* recover fsynced data */
993 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
994 err = recover_fsync_data(sbi);
995 if (err)
996 f2fs_msg(sb, KERN_ERR,
997 "Cannot recover all fsync data errno=%ld", err);
998 }
999
1000 /*
1001 * If the filesystem is not mounted read-only, then
1002 * start the gc_thread.
1003 */
1004 if (!(sb->s_flags & MS_RDONLY)) {
1005 /* After POR, we can run the background GC thread. */
1006 err = start_gc_thread(sbi);
1007 if (err)
1008 goto free_gc;
1009 }
1010
1011 err = f2fs_build_stats(sbi); 1025 err = f2fs_build_stats(sbi);
1012 if (err) 1026 if (err)
1013 goto free_gc; 1027 goto free_root_inode;
1014 1028
1015 if (f2fs_proc_root) 1029 if (f2fs_proc_root)
1016 sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); 1030 sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -1032,17 +1046,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1032 err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, 1046 err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
1033 "%s", sb->s_id); 1047 "%s", sb->s_id);
1034 if (err) 1048 if (err)
1035 goto fail; 1049 goto free_proc;
1050
1051 /* recover fsynced data */
1052 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1053 err = recover_fsync_data(sbi);
1054 if (err)
1055 f2fs_msg(sb, KERN_ERR,
1056 "Cannot recover all fsync data errno=%ld", err);
1057 }
1036 1058
1059 /*
 1060 * If the filesystem is not mounted read-only, then
 1061 * start the gc_thread.
1062 */
1063 if (!(sb->s_flags & MS_RDONLY)) {
 1064 /* After POR, we can run the background GC thread. */
1065 err = start_gc_thread(sbi);
1066 if (err)
1067 goto free_kobj;
1068 }
1037 return 0; 1069 return 0;
1038fail: 1070
1071free_kobj:
1072 kobject_del(&sbi->s_kobj);
1073free_proc:
1039 if (sbi->s_proc) { 1074 if (sbi->s_proc) {
1040 remove_proc_entry("segment_info", sbi->s_proc); 1075 remove_proc_entry("segment_info", sbi->s_proc);
1041 remove_proc_entry(sb->s_id, f2fs_proc_root); 1076 remove_proc_entry(sb->s_id, f2fs_proc_root);
1042 } 1077 }
1043 f2fs_destroy_stats(sbi); 1078 f2fs_destroy_stats(sbi);
1044free_gc:
1045 stop_gc_thread(sbi);
1046free_root_inode: 1079free_root_inode:
1047 dput(sb->s_root); 1080 dput(sb->s_root);
1048 sb->s_root = NULL; 1081 sb->s_root = NULL;
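
The reshuffled error labels keep the usual kernel unwind idiom intact: each failing step jumps to a label that tears down exactly what was set up before it, in reverse order. Schematically (generic names, not from this file):

        err = setup_a();
        if (err)
                goto out;
        err = setup_b();
        if (err)
                goto undo_a;
        err = setup_c();
        if (err)
                goto undo_b;
        return 0;

        undo_b:
                teardown_b();
        undo_a:
                teardown_a();
        out:
                return err;

Moving recover_fsync_data() and start_gc_thread() after the kobject setup is what lets the old free_gc label disappear: the GC thread is now the last thing started, so a failure there only needs free_kobj.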
@@ -1082,7 +1115,7 @@ MODULE_ALIAS_FS("f2fs");
1082static int __init init_inodecache(void) 1115static int __init init_inodecache(void)
1083{ 1116{
1084 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", 1117 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
1085 sizeof(struct f2fs_inode_info), NULL); 1118 sizeof(struct f2fs_inode_info));
1086 if (!f2fs_inode_cachep) 1119 if (!f2fs_inode_cachep)
1087 return -ENOMEM; 1120 return -ENOMEM;
1088 return 0; 1121 return 0;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 89d0422a91a8..503c2451131e 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -275,7 +275,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
275 275
276 inline_size = inline_xattr_size(inode); 276 inline_size = inline_xattr_size(inode);
277 277
278 txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); 278 txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
279 if (!txattr_addr) 279 if (!txattr_addr)
280 return NULL; 280 return NULL;
281 281
@@ -407,6 +407,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
407 if (name == NULL) 407 if (name == NULL)
408 return -EINVAL; 408 return -EINVAL;
409 name_len = strlen(name); 409 name_len = strlen(name);
410 if (name_len > F2FS_NAME_LEN)
411 return -ERANGE;
410 412
411 base_addr = read_all_xattrs(inode, NULL); 413 base_addr = read_all_xattrs(inode, NULL);
412 if (!base_addr) 414 if (!base_addr)
@@ -590,7 +592,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
590 f2fs_balance_fs(sbi); 592 f2fs_balance_fs(sbi);
591 593
592 f2fs_lock_op(sbi); 594 f2fs_lock_op(sbi);
595 /* protect xattr_ver */
596 down_write(&F2FS_I(inode)->i_sem);
593 err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); 597 err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage);
598 up_write(&F2FS_I(inode)->i_sem);
594 f2fs_unlock_op(sbi); 599 f2fs_unlock_op(sbi);
595 600
596 return err; 601 return err;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 854b578f6695..b3361fe2bcb5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(fat_build_inode);
490 490
491static void fat_evict_inode(struct inode *inode) 491static void fat_evict_inode(struct inode *inode)
492{ 492{
493 truncate_inode_pages(&inode->i_data, 0); 493 truncate_inode_pages_final(&inode->i_data);
494 if (!inode->i_nlink) { 494 if (!inode->i_nlink) {
495 inode->i_size = 0; 495 inode->i_size = 0;
496 fat_truncate_blocks(inode, 0); 496 fat_truncate_blocks(inode, 0);
@@ -635,6 +635,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
635 struct msdos_sb_info *sbi = MSDOS_SB(sb); 635 struct msdos_sb_info *sbi = MSDOS_SB(sb);
636 *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); 636 *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
637 637
638 sync_filesystem(sb);
639
638 /* make sure we update state on remount. */ 640 /* make sure we update state on remount. */
639 new_rdonly = *flags & MS_RDONLY; 641 new_rdonly = *flags & MS_RDONLY;
640 if (new_rdonly != (sb->s_flags & MS_RDONLY)) { 642 if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ef6866592a0f..72c82f69b01b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -272,9 +272,19 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
272 case F_SETFL: 272 case F_SETFL:
273 err = setfl(fd, filp, arg); 273 err = setfl(fd, filp, arg);
274 break; 274 break;
275#if BITS_PER_LONG != 32
276 /* 32-bit arches must use fcntl64() */
277 case F_OFD_GETLK:
278#endif
275 case F_GETLK: 279 case F_GETLK:
276 err = fcntl_getlk(filp, (struct flock __user *) arg); 280 err = fcntl_getlk(filp, cmd, (struct flock __user *) arg);
277 break; 281 break;
282#if BITS_PER_LONG != 32
283 /* 32-bit arches must use fcntl64() */
284 case F_OFD_SETLK:
285 case F_OFD_SETLKW:
286#endif
287 /* Fallthrough */
278 case F_SETLK: 288 case F_SETLK:
279 case F_SETLKW: 289 case F_SETLKW:
280 err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); 290 err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
@@ -388,17 +398,20 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
388 goto out1; 398 goto out1;
389 399
390 switch (cmd) { 400 switch (cmd) {
391 case F_GETLK64: 401 case F_GETLK64:
392 err = fcntl_getlk64(f.file, (struct flock64 __user *) arg); 402 case F_OFD_GETLK:
393 break; 403 err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg);
394 case F_SETLK64: 404 break;
395 case F_SETLKW64: 405 case F_SETLK64:
396 err = fcntl_setlk64(fd, f.file, cmd, 406 case F_SETLKW64:
397 (struct flock64 __user *) arg); 407 case F_OFD_SETLK:
398 break; 408 case F_OFD_SETLKW:
399 default: 409 err = fcntl_setlk64(fd, f.file, cmd,
400 err = do_fcntl(fd, cmd, arg, f.file); 410 (struct flock64 __user *) arg);
401 break; 411 break;
412 default:
413 err = do_fcntl(fd, cmd, arg, f.file);
414 break;
402 } 415 }
403out1: 416out1:
404 fdput(f); 417 fdput(f);
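
For reference, a minimal userspace sketch of the open file description (OFD) locks these hunks route: the lock belongs to the open file description rather than the process, and l_pid must be zero on request (glibc exposes F_OFD_* under _GNU_SOURCE; the path is illustrative):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        int main(void)
        {
                struct flock fl;
                int fd = open("/tmp/ofd-demo", O_RDWR | O_CREAT, 0644);

                if (fd < 0)
                        return 1;

                memset(&fl, 0, sizeof(fl));
                fl.l_type   = F_WRLCK;
                fl.l_whence = SEEK_SET;
                fl.l_start  = 0;
                fl.l_len    = 0;        /* whole file */
                fl.l_pid    = 0;        /* mandatory for F_OFD_* */

                /* Survives fork() with the inherited descriptor; released
                 * only when the last descriptor for this open file
                 * description is closed. */
                if (fcntl(fd, F_OFD_SETLK, &fl) == -1)
                        perror("F_OFD_SETLK");

                close(fd);
                return 0;
        }

The #if BITS_PER_LONG != 32 guards above exist because struct flock uses long offsets: a 32-bit task must go through fcntl64() and struct flock64 to express 64-bit ranges.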
diff --git a/fs/file.c b/fs/file.c
index eb56a13dab3e..8f294cfac697 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -25,7 +25,10 @@
25 25
26int sysctl_nr_open __read_mostly = 1024*1024; 26int sysctl_nr_open __read_mostly = 1024*1024;
27int sysctl_nr_open_min = BITS_PER_LONG; 27int sysctl_nr_open_min = BITS_PER_LONG;
28int sysctl_nr_open_max = 1024 * 1024; /* raised later */ 28/* our max() is unusable in constant expressions ;-/ */
29#define __const_max(x, y) ((x) < (y) ? (x) : (y))
30int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
31 -BITS_PER_LONG;
29 32
30static void *alloc_fdmem(size_t size) 33static void *alloc_fdmem(size_t size)
31{ 34{
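
The __const_max() comment is accurate about the why: the kernel's max() is a statement expression with typeof temporaries, which is not a C constant expression, so it cannot initialize a file-scope variable. Note also that, despite its name, the macro as written ((x) < (y) ? (x) : (y)) yields the smaller argument — which is what this initializer wants (mainline later renamed it __const_min()). A standalone illustration (GCC extensions assumed):

        #include <limits.h>
        #include <stddef.h>

        #define stmt_expr_max(x, y) ({ typeof(x) _x = (x); \
                                       typeof(y) _y = (y); \
                                       _x > _y ? _x : _y; })
        #define const_min(x, y) ((x) < (y) ? (x) : (y))

        /* OK: a plain ternary is a constant expression. */
        static long a = const_min(INT_MAX, (long)(~(size_t)0 / sizeof(void *)));

        /* Does not compile: statement expressions are not constant
         * expressions, so they cannot appear in a static initializer.
         *
         * static long b = stmt_expr_max(1L, 2L);
         */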
@@ -429,12 +432,6 @@ void exit_files(struct task_struct *tsk)
429 } 432 }
430} 433}
431 434
432void __init files_defer_init(void)
433{
434 sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
435 -BITS_PER_LONG;
436}
437
438struct files_struct init_files = { 435struct files_struct init_files = {
439 .count = ATOMIC_INIT(1), 436 .count = ATOMIC_INIT(1),
440 .fdt = &init_files.fdtab, 437 .fdt = &init_files.fdtab,
@@ -497,7 +494,7 @@ repeat:
497 error = fd; 494 error = fd;
498#if 1 495#if 1
499 /* Sanity check */ 496 /* Sanity check */
500 if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { 497 if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
501 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); 498 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
502 rcu_assign_pointer(fdt->fd[fd], NULL); 499 rcu_assign_pointer(fdt->fd[fd], NULL);
503 } 500 }
diff --git a/fs/file_table.c b/fs/file_table.c
index 5b24008ea4f6..a374f5033e97 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -52,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head)
52static inline void file_free(struct file *f) 52static inline void file_free(struct file *f)
53{ 53{
54 percpu_counter_dec(&nr_files); 54 percpu_counter_dec(&nr_files);
55 file_check_state(f);
56 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); 55 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
57} 56}
58 57
@@ -178,47 +177,12 @@ struct file *alloc_file(struct path *path, fmode_t mode,
178 file->f_mapping = path->dentry->d_inode->i_mapping; 177 file->f_mapping = path->dentry->d_inode->i_mapping;
179 file->f_mode = mode; 178 file->f_mode = mode;
180 file->f_op = fop; 179 file->f_op = fop;
181
182 /*
183 * These mounts don't really matter in practice
184 * for r/o bind mounts. They aren't userspace-
185 * visible. We do this for consistency, and so
186 * that we can do debugging checks at __fput()
187 */
188 if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
189 file_take_write(file);
190 WARN_ON(mnt_clone_write(path->mnt));
191 }
192 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) 180 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
193 i_readcount_inc(path->dentry->d_inode); 181 i_readcount_inc(path->dentry->d_inode);
194 return file; 182 return file;
195} 183}
196EXPORT_SYMBOL(alloc_file); 184EXPORT_SYMBOL(alloc_file);
197 185
198/**
199 * drop_file_write_access - give up ability to write to a file
200 * @file: the file to which we will stop writing
201 *
202 * This is a central place which will give up the ability
203 * to write to @file, along with access to write through
204 * its vfsmount.
205 */
206static void drop_file_write_access(struct file *file)
207{
208 struct vfsmount *mnt = file->f_path.mnt;
209 struct dentry *dentry = file->f_path.dentry;
210 struct inode *inode = dentry->d_inode;
211
212 put_write_access(inode);
213
214 if (special_file(inode->i_mode))
215 return;
216 if (file_check_writeable(file) != 0)
217 return;
218 __mnt_drop_write(mnt);
219 file_release_write(file);
220}
221
222/* the real guts of fput() - releasing the last reference to file 186/* the real guts of fput() - releasing the last reference to file
223 */ 187 */
224static void __fput(struct file *file) 188static void __fput(struct file *file)
@@ -235,7 +199,7 @@ static void __fput(struct file *file)
235 * in the file cleanup chain. 199 * in the file cleanup chain.
236 */ 200 */
237 eventpoll_release(file); 201 eventpoll_release(file);
238 locks_remove_flock(file); 202 locks_remove_file(file);
239 203
240 if (unlikely(file->f_flags & FASYNC)) { 204 if (unlikely(file->f_flags & FASYNC)) {
241 if (file->f_op->fasync) 205 if (file->f_op->fasync)
@@ -253,8 +217,10 @@ static void __fput(struct file *file)
253 put_pid(file->f_owner.pid); 217 put_pid(file->f_owner.pid);
254 if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) 218 if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
255 i_readcount_dec(inode); 219 i_readcount_dec(inode);
256 if (file->f_mode & FMODE_WRITE) 220 if (file->f_mode & FMODE_WRITER) {
257 drop_file_write_access(file); 221 put_write_access(inode);
222 __mnt_drop_write(mnt);
223 }
258 file->f_path.dentry = NULL; 224 file->f_path.dentry = NULL;
259 file->f_path.mnt = NULL; 225 file->f_path.mnt = NULL;
260 file->f_inode = NULL; 226 file->f_inode = NULL;
@@ -359,6 +325,5 @@ void __init files_init(unsigned long mempages)
359 325
360 n = (mempages * (PAGE_SIZE / 1024)) / 10; 326 n = (mempages * (PAGE_SIZE / 1024)) / 10;
361 files_stat.max_files = max_t(unsigned long, n, NR_FILE); 327 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
362 files_defer_init();
363 percpu_counter_init(&nr_files, 0); 328 percpu_counter_init(&nr_files, 0);
364} 329}
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 92567d95ba6a..5797d45a78cb 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -121,6 +121,7 @@ int unregister_filesystem(struct file_system_type * fs)
121 121
122EXPORT_SYMBOL(unregister_filesystem); 122EXPORT_SYMBOL(unregister_filesystem);
123 123
124#ifdef CONFIG_SYSFS_SYSCALL
124static int fs_index(const char __user * __name) 125static int fs_index(const char __user * __name)
125{ 126{
126 struct file_system_type * tmp; 127 struct file_system_type * tmp;
@@ -199,6 +200,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
199 } 200 }
200 return retval; 201 return retval;
201} 202}
203#endif
202 204
203int __init get_filesystem_list(char *buf) 205int __init get_filesystem_list(char *buf)
204{ 206{
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f47df72cef17..363e3ae25f6b 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -354,7 +354,7 @@ static void vxfs_i_callback(struct rcu_head *head)
354void 354void
355vxfs_evict_inode(struct inode *ip) 355vxfs_evict_inode(struct inode *ip)
356{ 356{
357 truncate_inode_pages(&ip->i_data, 0); 357 truncate_inode_pages_final(&ip->i_data);
358 clear_inode(ip); 358 clear_inode(ip);
359 call_rcu(&ip->i_rcu, vxfs_i_callback); 359 call_rcu(&ip->i_rcu, vxfs_i_callback);
360} 360}
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 25d4099a4aea..99c7f0a37af4 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
192 * vxfs_lookup - lookup pathname component 192 * vxfs_lookup - lookup pathname component
193 * @dip: dir in which we lookup 193 * @dip: dir in which we lookup
194 * @dp: dentry we lookup 194 * @dp: dentry we lookup
195 * @nd: lookup nameidata 195 * @flags: lookup flags
196 * 196 *
197 * Description: 197 * Description:
198 * vxfs_lookup tries to lookup the pathname component described 198 * vxfs_lookup tries to lookup the pathname component described
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index e37eb274e492..7ca8c75d50d3 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
124 124
125static int vxfs_remount(struct super_block *sb, int *flags, char *data) 125static int vxfs_remount(struct super_block *sb, int *flags, char *data)
126{ 126{
127 sync_filesystem(sb);
127 *flags |= MS_RDONLY; 128 *flags |= MS_RDONLY;
128 return 0; 129 return 0;
129} 130}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d754e3cf99a8..be568b7311d6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -89,16 +89,31 @@ static inline struct inode *wb_inode(struct list_head *head)
89#define CREATE_TRACE_POINTS 89#define CREATE_TRACE_POINTS
90#include <trace/events/writeback.h> 90#include <trace/events/writeback.h>
91 91
92EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
93
94static void bdi_wakeup_thread(struct backing_dev_info *bdi)
95{
96 spin_lock_bh(&bdi->wb_lock);
97 if (test_bit(BDI_registered, &bdi->state))
98 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
99 spin_unlock_bh(&bdi->wb_lock);
100}
101
92static void bdi_queue_work(struct backing_dev_info *bdi, 102static void bdi_queue_work(struct backing_dev_info *bdi,
93 struct wb_writeback_work *work) 103 struct wb_writeback_work *work)
94{ 104{
95 trace_writeback_queue(bdi, work); 105 trace_writeback_queue(bdi, work);
96 106
97 spin_lock_bh(&bdi->wb_lock); 107 spin_lock_bh(&bdi->wb_lock);
108 if (!test_bit(BDI_registered, &bdi->state)) {
109 if (work->done)
110 complete(work->done);
111 goto out_unlock;
112 }
98 list_add_tail(&work->list, &bdi->work_list); 113 list_add_tail(&work->list, &bdi->work_list);
99 spin_unlock_bh(&bdi->wb_lock);
100
101 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 114 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
115out_unlock:
116 spin_unlock_bh(&bdi->wb_lock);
102} 117}
103 118
104static void 119static void
@@ -114,7 +129,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
114 work = kzalloc(sizeof(*work), GFP_ATOMIC); 129 work = kzalloc(sizeof(*work), GFP_ATOMIC);
115 if (!work) { 130 if (!work) {
116 trace_writeback_nowork(bdi); 131 trace_writeback_nowork(bdi);
117 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 132 bdi_wakeup_thread(bdi);
118 return; 133 return;
119 } 134 }
120 135
@@ -161,7 +176,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
161 * writeback as soon as there is no other work to do. 176 * writeback as soon as there is no other work to do.
162 */ 177 */
163 trace_writeback_wake_background(bdi); 178 trace_writeback_wake_background(bdi);
164 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 179 bdi_wakeup_thread(bdi);
165} 180}
166 181
167/* 182/*
@@ -1017,7 +1032,7 @@ void bdi_writeback_workfn(struct work_struct *work)
1017 current->flags |= PF_SWAPWRITE; 1032 current->flags |= PF_SWAPWRITE;
1018 1033
1019 if (likely(!current_is_workqueue_rescuer() || 1034 if (likely(!current_is_workqueue_rescuer() ||
1020 list_empty(&bdi->bdi_list))) { 1035 !test_bit(BDI_registered, &bdi->state))) {
1021 /* 1036 /*
1022 * The normal path. Keep writing back @bdi until its 1037 * The normal path. Keep writing back @bdi until its
1023 * work_list is empty. Note that this path is also taken 1038 * work_list is empty. Note that this path is also taken
@@ -1039,10 +1054,10 @@ void bdi_writeback_workfn(struct work_struct *work)
1039 trace_writeback_pages_written(pages_written); 1054 trace_writeback_pages_written(pages_written);
1040 } 1055 }
1041 1056
1042 if (!list_empty(&bdi->work_list) || 1057 if (!list_empty(&bdi->work_list))
1043 (wb_has_dirty_io(wb) && dirty_writeback_interval)) 1058 mod_delayed_work(bdi_wq, &wb->dwork, 0);
1044 queue_delayed_work(bdi_wq, &wb->dwork, 1059 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
1045 msecs_to_jiffies(dirty_writeback_interval * 10)); 1060 bdi_wakeup_thread_delayed(bdi);
1046 1061
1047 current->flags &= ~PF_SWAPWRITE; 1062 current->flags &= ~PF_SWAPWRITE;
1048} 1063}
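
bdi_wakeup_thread_delayed() — the branch now taken for periodic kupdate-style writeback — lives in mm/backing-dev.c and, as of the same series, performs the mirror-image dance: take wb_lock and re-check BDI_registered before queueing delayed work. Roughly (paraphrased from memory, not part of this hunk):

        void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
        {
                unsigned long timeout;

                timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
                spin_lock_bh(&bdi->wb_lock);
                if (test_bit(BDI_registered, &bdi->state))
                        queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
                spin_unlock_bh(&bdi->wb_lock);
        }

Together with the bdi_queue_work() change above, every path that wakes the flusher now checks BDI_registered under wb_lock, closing the race with bdi_unregister() on device removal.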
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a0b0855d00a9..205e0d5d5307 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -348,7 +348,7 @@ int __init fuse_ctl_init(void)
348 return register_filesystem(&fuse_ctl_fs_type); 348 return register_filesystem(&fuse_ctl_fs_type);
349} 349}
350 350
351void fuse_ctl_cleanup(void) 351void __exit fuse_ctl_cleanup(void)
352{ 352{
353 unregister_filesystem(&fuse_ctl_fs_type); 353 unregister_filesystem(&fuse_ctl_fs_type);
354} 354}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b96a49b37d66..13b691a8a7d2 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
95 struct iovec iov = { .iov_base = buf, .iov_len = count }; 95 struct iovec iov = { .iov_base = buf, .iov_len = count };
96 struct fuse_io_priv io = { .async = 0, .file = file }; 96 struct fuse_io_priv io = { .async = 0, .file = file };
97 97
98 return fuse_direct_io(&io, &iov, 1, count, &pos, 0); 98 return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
99} 99}
100 100
101static ssize_t cuse_write(struct file *file, const char __user *buf, 101static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
109 * No locking or generic_write_checks(), the server is 109 * No locking or generic_write_checks(), the server is
110 * responsible for locking and sanity checks. 110 * responsible for locking and sanity checks.
111 */ 111 */
112 return fuse_direct_io(&io, &iov, 1, count, &pos, 1); 112 return fuse_direct_io(&io, &iov, 1, count, &pos,
113 FUSE_DIO_WRITE | FUSE_DIO_CUSE);
113} 114}
114 115
115static int cuse_open(struct inode *inode, struct file *file) 116static int cuse_open(struct inode *inode, struct file *file)
@@ -568,7 +569,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
568 569
569 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); 570 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
570} 571}
571static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL); 572static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL);
572 573
573static ssize_t cuse_class_abort_store(struct device *dev, 574static ssize_t cuse_class_abort_store(struct device *dev,
574 struct device_attribute *attr, 575 struct device_attribute *attr,
@@ -579,7 +580,7 @@ static ssize_t cuse_class_abort_store(struct device *dev,
579 fuse_abort_conn(&cc->fc); 580 fuse_abort_conn(&cc->fc);
580 return count; 581 return count;
581} 582}
582static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store); 583static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
583 584
584static struct attribute *cuse_class_dev_attrs[] = { 585static struct attribute *cuse_class_dev_attrs[] = {
585 &dev_attr_waiting.attr, 586 &dev_attr_waiting.attr,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0a648bb455ae..aac71ce373e4 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -667,15 +667,15 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
667 struct pipe_buffer *buf = cs->currbuf; 667 struct pipe_buffer *buf = cs->currbuf;
668 668
669 if (!cs->write) { 669 if (!cs->write) {
670 buf->ops->unmap(cs->pipe, buf, cs->mapaddr); 670 kunmap_atomic(cs->mapaddr);
671 } else { 671 } else {
672 kunmap(buf->page); 672 kunmap_atomic(cs->mapaddr);
673 buf->len = PAGE_SIZE - cs->len; 673 buf->len = PAGE_SIZE - cs->len;
674 } 674 }
675 cs->currbuf = NULL; 675 cs->currbuf = NULL;
676 cs->mapaddr = NULL; 676 cs->mapaddr = NULL;
677 } else if (cs->mapaddr) { 677 } else if (cs->mapaddr) {
678 kunmap(cs->pg); 678 kunmap_atomic(cs->mapaddr);
679 if (cs->write) { 679 if (cs->write) {
680 flush_dcache_page(cs->pg); 680 flush_dcache_page(cs->pg);
681 set_page_dirty_lock(cs->pg); 681 set_page_dirty_lock(cs->pg);
@@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
706 706
707 BUG_ON(!cs->nr_segs); 707 BUG_ON(!cs->nr_segs);
708 cs->currbuf = buf; 708 cs->currbuf = buf;
709 cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); 709 cs->mapaddr = kmap_atomic(buf->page);
710 cs->len = buf->len; 710 cs->len = buf->len;
711 cs->buf = cs->mapaddr + buf->offset; 711 cs->buf = cs->mapaddr + buf->offset;
712 cs->pipebufs++; 712 cs->pipebufs++;
@@ -726,7 +726,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
726 buf->len = 0; 726 buf->len = 0;
727 727
728 cs->currbuf = buf; 728 cs->currbuf = buf;
729 cs->mapaddr = kmap(page); 729 cs->mapaddr = kmap_atomic(page);
730 cs->buf = cs->mapaddr; 730 cs->buf = cs->mapaddr;
731 cs->len = PAGE_SIZE; 731 cs->len = PAGE_SIZE;
732 cs->pipebufs++; 732 cs->pipebufs++;
@@ -745,7 +745,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
745 return err; 745 return err;
746 BUG_ON(err != 1); 746 BUG_ON(err != 1);
747 offset = cs->addr % PAGE_SIZE; 747 offset = cs->addr % PAGE_SIZE;
748 cs->mapaddr = kmap(cs->pg); 748 cs->mapaddr = kmap_atomic(cs->pg);
749 cs->buf = cs->mapaddr + offset; 749 cs->buf = cs->mapaddr + offset;
750 cs->len = min(PAGE_SIZE - offset, cs->seglen); 750 cs->len = min(PAGE_SIZE - offset, cs->seglen);
751 cs->seglen -= cs->len; 751 cs->seglen -= cs->len;
@@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
874out_fallback_unlock: 874out_fallback_unlock:
875 unlock_page(newpage); 875 unlock_page(newpage);
876out_fallback: 876out_fallback:
877 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); 877 cs->mapaddr = kmap_atomic(buf->page);
878 cs->buf = cs->mapaddr + buf->offset; 878 cs->buf = cs->mapaddr + buf->offset;
879 879
880 err = lock_request(cs->fc, cs->req); 880 err = lock_request(cs->fc, cs->req);
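
All four hunks trade the sleeping kmap()/pipe map() pair for the atomic variants. The contract worth remembering: kunmap_atomic() takes the address returned by kmap_atomic(), not the page, and nothing between the two may sleep. A generic sketch of the pairing (hypothetical helper, needs <linux/highmem.h>):

        static void copy_into_page(struct page *page, const void *src,
                                   size_t off, size_t len)
        {
                char *addr = kmap_atomic(page);

                memcpy(addr + off, src, len);   /* atomic context: no sleeping */
                kunmap_atomic(addr);
                flush_dcache_page(page);
        }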
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 1d1292c581c3..42198359fa1b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -679,6 +679,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
679 return create_new_entry(fc, req, dir, entry, S_IFLNK); 679 return create_new_entry(fc, req, dir, entry, S_IFLNK);
680} 680}
681 681
682static inline void fuse_update_ctime(struct inode *inode)
683{
684 if (!IS_NOCMTIME(inode)) {
685 inode->i_ctime = current_fs_time(inode->i_sb);
686 mark_inode_dirty_sync(inode);
687 }
688}
689
682static int fuse_unlink(struct inode *dir, struct dentry *entry) 690static int fuse_unlink(struct inode *dir, struct dentry *entry)
683{ 691{
684 int err; 692 int err;
@@ -713,6 +721,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
713 fuse_invalidate_attr(inode); 721 fuse_invalidate_attr(inode);
714 fuse_invalidate_attr(dir); 722 fuse_invalidate_attr(dir);
715 fuse_invalidate_entry_cache(entry); 723 fuse_invalidate_entry_cache(entry);
724 fuse_update_ctime(inode);
716 } else if (err == -EINTR) 725 } else if (err == -EINTR)
717 fuse_invalidate_entry(entry); 726 fuse_invalidate_entry(entry);
718 return err; 727 return err;
@@ -743,23 +752,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
743 return err; 752 return err;
744} 753}
745 754
746static int fuse_rename(struct inode *olddir, struct dentry *oldent, 755static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
747 struct inode *newdir, struct dentry *newent) 756 struct inode *newdir, struct dentry *newent,
757 unsigned int flags, int opcode, size_t argsize)
748{ 758{
749 int err; 759 int err;
750 struct fuse_rename_in inarg; 760 struct fuse_rename2_in inarg;
751 struct fuse_conn *fc = get_fuse_conn(olddir); 761 struct fuse_conn *fc = get_fuse_conn(olddir);
752 struct fuse_req *req = fuse_get_req_nopages(fc); 762 struct fuse_req *req;
753 763
764 req = fuse_get_req_nopages(fc);
754 if (IS_ERR(req)) 765 if (IS_ERR(req))
755 return PTR_ERR(req); 766 return PTR_ERR(req);
756 767
757 memset(&inarg, 0, sizeof(inarg)); 768 memset(&inarg, 0, argsize);
758 inarg.newdir = get_node_id(newdir); 769 inarg.newdir = get_node_id(newdir);
759 req->in.h.opcode = FUSE_RENAME; 770 inarg.flags = flags;
771 req->in.h.opcode = opcode;
760 req->in.h.nodeid = get_node_id(olddir); 772 req->in.h.nodeid = get_node_id(olddir);
761 req->in.numargs = 3; 773 req->in.numargs = 3;
762 req->in.args[0].size = sizeof(inarg); 774 req->in.args[0].size = argsize;
763 req->in.args[0].value = &inarg; 775 req->in.args[0].value = &inarg;
764 req->in.args[1].size = oldent->d_name.len + 1; 776 req->in.args[1].size = oldent->d_name.len + 1;
765 req->in.args[1].value = oldent->d_name.name; 777 req->in.args[1].value = oldent->d_name.name;
@@ -771,15 +783,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
771 if (!err) { 783 if (!err) {
772 /* ctime changes */ 784 /* ctime changes */
773 fuse_invalidate_attr(oldent->d_inode); 785 fuse_invalidate_attr(oldent->d_inode);
786 fuse_update_ctime(oldent->d_inode);
787
788 if (flags & RENAME_EXCHANGE) {
789 fuse_invalidate_attr(newent->d_inode);
790 fuse_update_ctime(newent->d_inode);
791 }
774 792
775 fuse_invalidate_attr(olddir); 793 fuse_invalidate_attr(olddir);
776 if (olddir != newdir) 794 if (olddir != newdir)
777 fuse_invalidate_attr(newdir); 795 fuse_invalidate_attr(newdir);
778 796
779 /* newent will end up negative */ 797 /* newent will end up negative */
780 if (newent->d_inode) { 798 if (!(flags & RENAME_EXCHANGE) && newent->d_inode) {
781 fuse_invalidate_attr(newent->d_inode); 799 fuse_invalidate_attr(newent->d_inode);
782 fuse_invalidate_entry_cache(newent); 800 fuse_invalidate_entry_cache(newent);
801 fuse_update_ctime(newent->d_inode);
783 } 802 }
784 } else if (err == -EINTR) { 803 } else if (err == -EINTR) {
785 /* If request was interrupted, DEITY only knows if the 804 /* If request was interrupted, DEITY only knows if the
@@ -795,6 +814,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
795 return err; 814 return err;
796} 815}
797 816
817static int fuse_rename(struct inode *olddir, struct dentry *oldent,
818 struct inode *newdir, struct dentry *newent)
819{
820 return fuse_rename_common(olddir, oldent, newdir, newent, 0,
821 FUSE_RENAME, sizeof(struct fuse_rename_in));
822}
823
824static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
825 struct inode *newdir, struct dentry *newent,
826 unsigned int flags)
827{
828 struct fuse_conn *fc = get_fuse_conn(olddir);
829 int err;
830
831 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
832 return -EINVAL;
833
834 if (fc->no_rename2 || fc->minor < 23)
835 return -EINVAL;
836
837 err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
838 FUSE_RENAME2, sizeof(struct fuse_rename2_in));
839 if (err == -ENOSYS) {
840 fc->no_rename2 = 1;
841 err = -EINVAL;
842 }
843 return err;
844
845}
846
798static int fuse_link(struct dentry *entry, struct inode *newdir, 847static int fuse_link(struct dentry *entry, struct inode *newdir,
799 struct dentry *newent) 848 struct dentry *newent)
800{ 849{
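
What fuse_rename2() ultimately serves is the renameat2() syscall; a userspace sketch (raw syscall, since glibc had no wrapper at the time — the SYS_renameat2 macro and the flag fallbacks assume reasonably current headers; paths are illustrative):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        #ifndef RENAME_NOREPLACE
        #define RENAME_NOREPLACE (1 << 0)       /* fail instead of clobbering */
        #endif
        #ifndef RENAME_EXCHANGE
        #define RENAME_EXCHANGE  (1 << 1)       /* atomically swap two paths  */
        #endif

        int main(void)
        {
                if (syscall(SYS_renameat2, AT_FDCWD, "old",
                            AT_FDCWD, "new", RENAME_NOREPLACE) == -1)
                        perror("renameat2");    /* EEXIST if "new" exists */
                return 0;
        }

Returning -EINVAL (rather than passing -ENOSYS through) when the server lacks FUSE_RENAME2 makes the failure read as "flags unsupported" instead of "syscall unsupported", and fc->no_rename2 caches the probe so later calls skip the round trip.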
@@ -829,6 +878,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
829 inc_nlink(inode); 878 inc_nlink(inode);
830 spin_unlock(&fc->lock); 879 spin_unlock(&fc->lock);
831 fuse_invalidate_attr(inode); 880 fuse_invalidate_attr(inode);
881 fuse_update_ctime(inode);
832 } else if (err == -EINTR) { 882 } else if (err == -EINTR) {
833 fuse_invalidate_attr(inode); 883 fuse_invalidate_attr(inode);
834 } 884 }
@@ -839,6 +889,16 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
839 struct kstat *stat) 889 struct kstat *stat)
840{ 890{
841 unsigned int blkbits; 891 unsigned int blkbits;
892 struct fuse_conn *fc = get_fuse_conn(inode);
893
894 /* see the comment in fuse_change_attributes() */
895 if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
896 attr->size = i_size_read(inode);
897 attr->mtime = inode->i_mtime.tv_sec;
898 attr->mtimensec = inode->i_mtime.tv_nsec;
899 attr->ctime = inode->i_ctime.tv_sec;
900 attr->ctimensec = inode->i_ctime.tv_nsec;
901 }
842 902
843 stat->dev = inode->i_sb->s_dev; 903 stat->dev = inode->i_sb->s_dev;
844 stat->ino = attr->ino; 904 stat->ino = attr->ino;
@@ -1477,12 +1537,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
1477 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); 1537 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
1478} 1538}
1479 1539
1480static bool update_mtime(unsigned ivalid) 1540static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
1481{ 1541{
1482 /* Always update if mtime is explicitly set */ 1542 /* Always update if mtime is explicitly set */
1483 if (ivalid & ATTR_MTIME_SET) 1543 if (ivalid & ATTR_MTIME_SET)
1484 return true; 1544 return true;
1485 1545
1546 /* Or if kernel i_mtime is the official one */
1547 if (trust_local_mtime)
1548 return true;
1549
1486 /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ 1550 /* If it's an open(O_TRUNC) or an ftruncate(), don't update */
1487 if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) 1551 if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
1488 return false; 1552 return false;
@@ -1491,7 +1555,8 @@ static bool update_mtime(unsigned ivalid)
1491 return true; 1555 return true;
1492} 1556}
1493 1557
1494static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) 1558static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
1559 bool trust_local_cmtime)
1495{ 1560{
1496 unsigned ivalid = iattr->ia_valid; 1561 unsigned ivalid = iattr->ia_valid;
1497 1562
@@ -1510,13 +1575,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
1510 if (!(ivalid & ATTR_ATIME_SET)) 1575 if (!(ivalid & ATTR_ATIME_SET))
1511 arg->valid |= FATTR_ATIME_NOW; 1576 arg->valid |= FATTR_ATIME_NOW;
1512 } 1577 }
1513 if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { 1578 if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
1514 arg->valid |= FATTR_MTIME; 1579 arg->valid |= FATTR_MTIME;
1515 arg->mtime = iattr->ia_mtime.tv_sec; 1580 arg->mtime = iattr->ia_mtime.tv_sec;
1516 arg->mtimensec = iattr->ia_mtime.tv_nsec; 1581 arg->mtimensec = iattr->ia_mtime.tv_nsec;
1517 if (!(ivalid & ATTR_MTIME_SET)) 1582 if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
1518 arg->valid |= FATTR_MTIME_NOW; 1583 arg->valid |= FATTR_MTIME_NOW;
1519 } 1584 }
1585 if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
1586 arg->valid |= FATTR_CTIME;
1587 arg->ctime = iattr->ia_ctime.tv_sec;
1588 arg->ctimensec = iattr->ia_ctime.tv_nsec;
1589 }
1520} 1590}
1521 1591
1522/* 1592/*
@@ -1563,6 +1633,62 @@ void fuse_release_nowrite(struct inode *inode)
1563 spin_unlock(&fc->lock); 1633 spin_unlock(&fc->lock);
1564} 1634}
1565 1635
1636static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
1637 struct inode *inode,
1638 struct fuse_setattr_in *inarg_p,
1639 struct fuse_attr_out *outarg_p)
1640{
1641 req->in.h.opcode = FUSE_SETATTR;
1642 req->in.h.nodeid = get_node_id(inode);
1643 req->in.numargs = 1;
1644 req->in.args[0].size = sizeof(*inarg_p);
1645 req->in.args[0].value = inarg_p;
1646 req->out.numargs = 1;
1647 if (fc->minor < 9)
1648 req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
1649 else
1650 req->out.args[0].size = sizeof(*outarg_p);
1651 req->out.args[0].value = outarg_p;
1652}
1653
1654/*
1655 * Flush inode->i_mtime to the server
1656 */
1657int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
1658{
1659 struct fuse_conn *fc = get_fuse_conn(inode);
1660 struct fuse_req *req;
1661 struct fuse_setattr_in inarg;
1662 struct fuse_attr_out outarg;
1663 int err;
1664
1665 req = fuse_get_req_nopages(fc);
1666 if (IS_ERR(req))
1667 return PTR_ERR(req);
1668
1669 memset(&inarg, 0, sizeof(inarg));
1670 memset(&outarg, 0, sizeof(outarg));
1671
1672 inarg.valid = FATTR_MTIME;
1673 inarg.mtime = inode->i_mtime.tv_sec;
1674 inarg.mtimensec = inode->i_mtime.tv_nsec;
1675 if (fc->minor >= 23) {
1676 inarg.valid |= FATTR_CTIME;
1677 inarg.ctime = inode->i_ctime.tv_sec;
1678 inarg.ctimensec = inode->i_ctime.tv_nsec;
1679 }
1680 if (ff) {
1681 inarg.valid |= FATTR_FH;
1682 inarg.fh = ff->fh;
1683 }
1684 fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
1685 fuse_request_send(fc, req);
1686 err = req->out.h.error;
1687 fuse_put_request(fc, req);
1688
1689 return err;
1690}
1691
1566/* 1692/*
1567 * Set attributes, and at the same time refresh them. 1693 * Set attributes, and at the same time refresh them.
1568 * 1694 *
@@ -1580,8 +1706,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1580 struct fuse_setattr_in inarg; 1706 struct fuse_setattr_in inarg;
1581 struct fuse_attr_out outarg; 1707 struct fuse_attr_out outarg;
1582 bool is_truncate = false; 1708 bool is_truncate = false;
1709 bool is_wb = fc->writeback_cache;
1583 loff_t oldsize; 1710 loff_t oldsize;
1584 int err; 1711 int err;
1712 bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
1585 1713
1586 if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) 1714 if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
1587 attr->ia_valid |= ATTR_FORCE; 1715 attr->ia_valid |= ATTR_FORCE;
@@ -1606,11 +1734,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1606 if (is_truncate) { 1734 if (is_truncate) {
1607 fuse_set_nowrite(inode); 1735 fuse_set_nowrite(inode);
1608 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 1736 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1737 if (trust_local_cmtime && attr->ia_size != inode->i_size)
1738 attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
1609 } 1739 }
1610 1740
1611 memset(&inarg, 0, sizeof(inarg)); 1741 memset(&inarg, 0, sizeof(inarg));
1612 memset(&outarg, 0, sizeof(outarg)); 1742 memset(&outarg, 0, sizeof(outarg));
1613 iattr_to_fattr(attr, &inarg); 1743 iattr_to_fattr(attr, &inarg, trust_local_cmtime);
1614 if (file) { 1744 if (file) {
1615 struct fuse_file *ff = file->private_data; 1745 struct fuse_file *ff = file->private_data;
1616 inarg.valid |= FATTR_FH; 1746 inarg.valid |= FATTR_FH;
@@ -1621,17 +1751,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1621 inarg.valid |= FATTR_LOCKOWNER; 1751 inarg.valid |= FATTR_LOCKOWNER;
1622 inarg.lock_owner = fuse_lock_owner_id(fc, current->files); 1752 inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
1623 } 1753 }
1624 req->in.h.opcode = FUSE_SETATTR; 1754 fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
1625 req->in.h.nodeid = get_node_id(inode);
1626 req->in.numargs = 1;
1627 req->in.args[0].size = sizeof(inarg);
1628 req->in.args[0].value = &inarg;
1629 req->out.numargs = 1;
1630 if (fc->minor < 9)
1631 req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
1632 else
1633 req->out.args[0].size = sizeof(outarg);
1634 req->out.args[0].value = &outarg;
1635 fuse_request_send(fc, req); 1755 fuse_request_send(fc, req);
1636 err = req->out.h.error; 1756 err = req->out.h.error;
1637 fuse_put_request(fc, req); 1757 fuse_put_request(fc, req);
@@ -1648,10 +1768,21 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1648 } 1768 }
1649 1769
1650 spin_lock(&fc->lock); 1770 spin_lock(&fc->lock);
1771 /* the kernel maintains i_mtime locally */
1772 if (trust_local_cmtime) {
1773 if (attr->ia_valid & ATTR_MTIME)
1774 inode->i_mtime = attr->ia_mtime;
1775 if (attr->ia_valid & ATTR_CTIME)
1776 inode->i_ctime = attr->ia_ctime;
1777 /* FIXME: clear I_DIRTY_SYNC? */
1778 }
1779
1651 fuse_change_attributes_common(inode, &outarg.attr, 1780 fuse_change_attributes_common(inode, &outarg.attr,
1652 attr_timeout(&outarg)); 1781 attr_timeout(&outarg));
1653 oldsize = inode->i_size; 1782 oldsize = inode->i_size;
1654 i_size_write(inode, outarg.attr.size); 1783 /* see the comment in fuse_change_attributes() */
1784 if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
1785 i_size_write(inode, outarg.attr.size);
1655 1786
1656 if (is_truncate) { 1787 if (is_truncate) {
1657 /* NOTE: this may release/reacquire fc->lock */ 1788 /* NOTE: this may release/reacquire fc->lock */
@@ -1663,7 +1794,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1663 * Only call invalidate_inode_pages2() after removing 1794 * Only call invalidate_inode_pages2() after removing
1664 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 1795 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
1665 */ 1796 */
1666 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { 1797 if ((is_truncate || !is_wb) &&
1798 S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
1667 truncate_pagecache(inode, outarg.attr.size); 1799 truncate_pagecache(inode, outarg.attr.size);
1668 invalidate_inode_pages2(inode->i_mapping); 1800 invalidate_inode_pages2(inode->i_mapping);
1669 } 1801 }
@@ -1739,8 +1871,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
1739 fc->no_setxattr = 1; 1871 fc->no_setxattr = 1;
1740 err = -EOPNOTSUPP; 1872 err = -EOPNOTSUPP;
1741 } 1873 }
1742 if (!err) 1874 if (!err) {
1743 fuse_invalidate_attr(inode); 1875 fuse_invalidate_attr(inode);
1876 fuse_update_ctime(inode);
1877 }
1744 return err; 1878 return err;
1745} 1879}
1746 1880
@@ -1870,8 +2004,10 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1870 fc->no_removexattr = 1; 2004 fc->no_removexattr = 1;
1871 err = -EOPNOTSUPP; 2005 err = -EOPNOTSUPP;
1872 } 2006 }
1873 if (!err) 2007 if (!err) {
1874 fuse_invalidate_attr(inode); 2008 fuse_invalidate_attr(inode);
2009 fuse_update_ctime(inode);
2010 }
1875 return err; 2011 return err;
1876} 2012}
1877 2013
@@ -1882,6 +2018,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
1882 .unlink = fuse_unlink, 2018 .unlink = fuse_unlink,
1883 .rmdir = fuse_rmdir, 2019 .rmdir = fuse_rmdir,
1884 .rename = fuse_rename, 2020 .rename = fuse_rename,
2021 .rename2 = fuse_rename2,
1885 .link = fuse_link, 2022 .link = fuse_link,
1886 .setattr = fuse_setattr, 2023 .setattr = fuse_setattr,
1887 .create = fuse_create, 2024 .create = fuse_create,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 77bcc303c3ae..96d513e01a5d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -188,6 +188,22 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
188} 188}
189EXPORT_SYMBOL_GPL(fuse_do_open); 189EXPORT_SYMBOL_GPL(fuse_do_open);
190 190
191static void fuse_link_write_file(struct file *file)
192{
193 struct inode *inode = file_inode(file);
194 struct fuse_conn *fc = get_fuse_conn(inode);
195 struct fuse_inode *fi = get_fuse_inode(inode);
196 struct fuse_file *ff = file->private_data;
197 /*
 198 * The file may be written through mmap, so chain it onto the
 199 * inode's write_files list.
200 */
201 spin_lock(&fc->lock);
202 if (list_empty(&ff->write_entry))
203 list_add(&ff->write_entry, &fi->write_files);
204 spin_unlock(&fc->lock);
205}
206
191void fuse_finish_open(struct inode *inode, struct file *file) 207void fuse_finish_open(struct inode *inode, struct file *file)
192{ 208{
193 struct fuse_file *ff = file->private_data; 209 struct fuse_file *ff = file->private_data;
@@ -207,25 +223,37 @@ void fuse_finish_open(struct inode *inode, struct file *file)
207 i_size_write(inode, 0); 223 i_size_write(inode, 0);
208 spin_unlock(&fc->lock); 224 spin_unlock(&fc->lock);
209 fuse_invalidate_attr(inode); 225 fuse_invalidate_attr(inode);
226 if (fc->writeback_cache)
227 file_update_time(file);
210 } 228 }
229 if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
230 fuse_link_write_file(file);
211} 231}
212 232
213int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 233int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
214{ 234{
215 struct fuse_conn *fc = get_fuse_conn(inode); 235 struct fuse_conn *fc = get_fuse_conn(inode);
216 int err; 236 int err;
237 bool lock_inode = (file->f_flags & O_TRUNC) &&
238 fc->atomic_o_trunc &&
239 fc->writeback_cache;
217 240
218 err = generic_file_open(inode, file); 241 err = generic_file_open(inode, file);
219 if (err) 242 if (err)
220 return err; 243 return err;
221 244
245 if (lock_inode)
246 mutex_lock(&inode->i_mutex);
247
222 err = fuse_do_open(fc, get_node_id(inode), file, isdir); 248 err = fuse_do_open(fc, get_node_id(inode), file, isdir);
223 if (err)
224 return err;
225 249
226 fuse_finish_open(inode, file); 250 if (!err)
251 fuse_finish_open(inode, file);
227 252
228 return 0; 253 if (lock_inode)
254 mutex_unlock(&inode->i_mutex);
255
256 return err;
229} 257}
230 258
231static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) 259static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
@@ -292,6 +320,12 @@ static int fuse_open(struct inode *inode, struct file *file)
292 320
293static int fuse_release(struct inode *inode, struct file *file) 321static int fuse_release(struct inode *inode, struct file *file)
294{ 322{
323 struct fuse_conn *fc = get_fuse_conn(inode);
324
325 /* see fuse_vma_close() for !writeback_cache case */
326 if (fc->writeback_cache)
327 write_inode_now(inode, 1);
328
295 fuse_release_common(file, FUSE_RELEASE); 329 fuse_release_common(file, FUSE_RELEASE);
296 330
297 /* return value is ignored by VFS */ 331 /* return value is ignored by VFS */
@@ -333,12 +367,13 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
333} 367}
334 368
335/* 369/*
336 * Check if page is under writeback 370 * Check if any page in a range is under writeback
337 * 371 *
338 * This is currently done by walking the list of writepage requests 372 * This is currently done by walking the list of writepage requests
339 * for the inode, which can be pretty inefficient. 373 * for the inode, which can be pretty inefficient.
340 */ 374 */
341static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) 375static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
376 pgoff_t idx_to)
342{ 377{
343 struct fuse_conn *fc = get_fuse_conn(inode); 378 struct fuse_conn *fc = get_fuse_conn(inode);
344 struct fuse_inode *fi = get_fuse_inode(inode); 379 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -351,8 +386,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
351 386
352 BUG_ON(req->inode != inode); 387 BUG_ON(req->inode != inode);
353 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; 388 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
354 if (curr_index <= index && 389 if (idx_from < curr_index + req->num_pages &&
355 index < curr_index + req->num_pages) { 390 curr_index <= idx_to) {
356 found = true; 391 found = true;
357 break; 392 break;
358 } 393 }
@@ -362,6 +397,11 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
362 return found; 397 return found;
363} 398}
364 399
400static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
401{
402 return fuse_range_is_writeback(inode, index, index);
403}
404
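
The new predicate is the standard interval-overlap test: a request covering pages [curr_index, curr_index + num_pages) intersects the query [idx_from, idx_to] exactly when idx_from < curr_index + num_pages and curr_index <= idx_to. A quick standalone check (invented values):

        #include <assert.h>
        #include <stdbool.h>

        static bool overlaps(unsigned long start, unsigned long len,
                             unsigned long from, unsigned long to)
        {
                return from < start + len && start <= to;
        }

        int main(void)
        {
                assert(overlaps(10, 4, 13, 13));        /* last page of the request */
                assert(!overlaps(10, 4, 14, 20));       /* begins just past it      */
                assert(overlaps(10, 4, 0, 10));         /* touches the first page   */
                return 0;
        }

With idx_from == idx_to this degenerates to the old single-page test, which is what the fuse_page_is_writeback() wrapper relies on.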
365/* 405/*
366 * Wait for page writeback to be completed. 406 * Wait for page writeback to be completed.
367 * 407 *
@@ -376,6 +416,21 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
376 return 0; 416 return 0;
377} 417}
378 418
419/*
420 * Wait for all pending writepages on the inode to finish.
421 *
422 * This is currently done by blocking further writes with FUSE_NOWRITE
423 * and waiting for all sent writes to complete.
424 *
425 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
426 * could conflict with truncation.
427 */
428static void fuse_sync_writes(struct inode *inode)
429{
430 fuse_set_nowrite(inode);
431 fuse_release_nowrite(inode);
432}
433
379static int fuse_flush(struct file *file, fl_owner_t id) 434static int fuse_flush(struct file *file, fl_owner_t id)
380{ 435{
381 struct inode *inode = file_inode(file); 436 struct inode *inode = file_inode(file);
@@ -391,6 +446,14 @@ static int fuse_flush(struct file *file, fl_owner_t id)
391 if (fc->no_flush) 446 if (fc->no_flush)
392 return 0; 447 return 0;
393 448
449 err = write_inode_now(inode, 1);
450 if (err)
451 return err;
452
453 mutex_lock(&inode->i_mutex);
454 fuse_sync_writes(inode);
455 mutex_unlock(&inode->i_mutex);
456
394 req = fuse_get_req_nofail_nopages(fc, file); 457 req = fuse_get_req_nofail_nopages(fc, file);
395 memset(&inarg, 0, sizeof(inarg)); 458 memset(&inarg, 0, sizeof(inarg));
396 inarg.fh = ff->fh; 459 inarg.fh = ff->fh;
@@ -411,21 +474,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
411 return err; 474 return err;
412} 475}
413 476
414/*
415 * Wait for all pending writepages on the inode to finish.
416 *
417 * This is currently done by blocking further writes with FUSE_NOWRITE
418 * and waiting for all sent writes to complete.
419 *
420 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
421 * could conflict with truncation.
422 */
423static void fuse_sync_writes(struct inode *inode)
424{
425 fuse_set_nowrite(inode);
426 fuse_release_nowrite(inode);
427}
428
429int fuse_fsync_common(struct file *file, loff_t start, loff_t end, 477int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
430 int datasync, int isdir) 478 int datasync, int isdir)
431{ 479{
@@ -439,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
439 if (is_bad_inode(inode)) 487 if (is_bad_inode(inode))
440 return -EIO; 488 return -EIO;
441 489
442 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
443 if (err)
444 return err;
445
446 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
447 return 0;
448
449 mutex_lock(&inode->i_mutex); 490 mutex_lock(&inode->i_mutex);
450 491
451 /* 492 /*
@@ -453,11 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
453 * wait for all outstanding writes, before sending the FSYNC 494 * wait for all outstanding writes, before sending the FSYNC
454 * request. 495 * request.
455 */ 496 */
456 err = write_inode_now(inode, 0); 497 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
457 if (err) 498 if (err)
458 goto out; 499 goto out;
459 500
460 fuse_sync_writes(inode); 501 fuse_sync_writes(inode);
502 err = sync_inode_metadata(inode, 1);
503 if (err)
504 goto out;
505
506 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
507 goto out;
461 508
462 req = fuse_get_req_nopages(fc); 509 req = fuse_get_req_nopages(fc);
463 if (IS_ERR(req)) { 510 if (IS_ERR(req)) {
@@ -655,7 +702,33 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
655 spin_unlock(&fc->lock); 702 spin_unlock(&fc->lock);
656} 703}
657 704
658static int fuse_readpage(struct file *file, struct page *page) 705static void fuse_short_read(struct fuse_req *req, struct inode *inode,
706 u64 attr_ver)
707{
708 size_t num_read = req->out.args[0].size;
709 struct fuse_conn *fc = get_fuse_conn(inode);
710
711 if (fc->writeback_cache) {
712 /*
 713 * A hole in the file. Some data after the hole is already in the
 714 * page cache but has not reached the client filesystem yet, so the
 715 * hole is not present there.
716 */
717 int i;
718 int start_idx = num_read >> PAGE_CACHE_SHIFT;
719 size_t off = num_read & (PAGE_CACHE_SIZE - 1);
720
721 for (i = start_idx; i < req->num_pages; i++) {
722 zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
723 off = 0;
724 }
725 } else {
726 loff_t pos = page_offset(req->pages[0]) + num_read;
727 fuse_read_update_size(inode, pos, attr_ver);
728 }
729}
730
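A worked example of the writeback_cache branch, assuming 4 KiB pages: a three-page read that comes back with num_read = 5000 gives start_idx = 5000 >> 12 = 1 and off = 5000 & 4095 = 904, so page 1 is zeroed from byte 904 onward and page 2 in full. The cache then shows the hole as zeroes instead of stale data, while the !writeback_cache case keeps the old behaviour of treating the short read as EOF and shrinking i_size.
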
731static int fuse_do_readpage(struct file *file, struct page *page)
659{ 732{
660 struct fuse_io_priv io = { .async = 0, .file = file }; 733 struct fuse_io_priv io = { .async = 0, .file = file };
661 struct inode *inode = page->mapping->host; 734 struct inode *inode = page->mapping->host;
@@ -667,10 +740,6 @@ static int fuse_readpage(struct file *file, struct page *page)
667 u64 attr_ver; 740 u64 attr_ver;
668 int err; 741 int err;
669 742
670 err = -EIO;
671 if (is_bad_inode(inode))
672 goto out;
673
674 /* 743 /*
675 * Page writeback can extend beyond the lifetime of the 744 * Page writeback can extend beyond the lifetime of the
676 * page-cache page, so make sure we read a properly synced 745 * page-cache page, so make sure we read a properly synced
@@ -679,9 +748,8 @@ static int fuse_readpage(struct file *file, struct page *page)
679 fuse_wait_on_page_writeback(inode, page->index); 748 fuse_wait_on_page_writeback(inode, page->index);
680 749
681 req = fuse_get_req(fc, 1); 750 req = fuse_get_req(fc, 1);
682 err = PTR_ERR(req);
683 if (IS_ERR(req)) 751 if (IS_ERR(req))
684 goto out; 752 return PTR_ERR(req);
685 753
686 attr_ver = fuse_get_attr_version(fc); 754 attr_ver = fuse_get_attr_version(fc);
687 755
@@ -692,18 +760,32 @@ static int fuse_readpage(struct file *file, struct page *page)
692 req->page_descs[0].length = count; 760 req->page_descs[0].length = count;
693 num_read = fuse_send_read(req, &io, pos, count, NULL); 761 num_read = fuse_send_read(req, &io, pos, count, NULL);
694 err = req->out.h.error; 762 err = req->out.h.error;
695 fuse_put_request(fc, req);
696 763
697 if (!err) { 764 if (!err) {
698 /* 765 /*
699 * Short read means EOF. If file size is larger, truncate it 766 * Short read means EOF. If file size is larger, truncate it
700 */ 767 */
701 if (num_read < count) 768 if (num_read < count)
702 fuse_read_update_size(inode, pos + num_read, attr_ver); 769 fuse_short_read(req, inode, attr_ver);
703 770
704 SetPageUptodate(page); 771 SetPageUptodate(page);
705 } 772 }
706 773
774 fuse_put_request(fc, req);
775
776 return err;
777}
778
779static int fuse_readpage(struct file *file, struct page *page)
780{
781 struct inode *inode = page->mapping->host;
782 int err;
783
784 err = -EIO;
785 if (is_bad_inode(inode))
786 goto out;
787
788 err = fuse_do_readpage(file, page);
707 fuse_invalidate_atime(inode); 789 fuse_invalidate_atime(inode);
708 out: 790 out:
709 unlock_page(page); 791 unlock_page(page);
@@ -726,13 +808,9 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
726 /* 808 /*
727 * Short read means EOF. If file size is larger, truncate it 809 * Short read means EOF. If file size is larger, truncate it
728 */ 810 */
729 if (!req->out.h.error && num_read < count) { 811 if (!req->out.h.error && num_read < count)
730 loff_t pos; 812 fuse_short_read(req, inode, req->misc.read.attr_ver);
731 813
732 pos = page_offset(req->pages[0]) + num_read;
733 fuse_read_update_size(inode, pos,
734 req->misc.read.attr_ver);
735 }
736 fuse_invalidate_atime(inode); 814 fuse_invalidate_atime(inode);
737 } 815 }
738 816
@@ -922,16 +1000,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
922 return req->misc.write.out.size; 1000 return req->misc.write.out.size;
923} 1001}
924 1002
925void fuse_write_update_size(struct inode *inode, loff_t pos) 1003bool fuse_write_update_size(struct inode *inode, loff_t pos)
926{ 1004{
927 struct fuse_conn *fc = get_fuse_conn(inode); 1005 struct fuse_conn *fc = get_fuse_conn(inode);
928 struct fuse_inode *fi = get_fuse_inode(inode); 1006 struct fuse_inode *fi = get_fuse_inode(inode);
1007 bool ret = false;
929 1008
930 spin_lock(&fc->lock); 1009 spin_lock(&fc->lock);
931 fi->attr_version = ++fc->attr_version; 1010 fi->attr_version = ++fc->attr_version;
932 if (pos > inode->i_size) 1011 if (pos > inode->i_size) {
933 i_size_write(inode, pos); 1012 i_size_write(inode, pos);
1013 ret = true;
1014 }
934 spin_unlock(&fc->lock); 1015 spin_unlock(&fc->lock);
1016
1017 return ret;
935} 1018}
936 1019
937static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, 1020static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
@@ -1003,9 +1086,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
1003 if (mapping_writably_mapped(mapping)) 1086 if (mapping_writably_mapped(mapping))
1004 flush_dcache_page(page); 1087 flush_dcache_page(page);
1005 1088
1006 pagefault_disable();
1007 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); 1089 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
1008 pagefault_enable();
1009 flush_dcache_page(page); 1090 flush_dcache_page(page);
1010 1091
1011 mark_page_accessed(page); 1092 mark_page_accessed(page);
@@ -1116,6 +1197,15 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1116 struct iov_iter i; 1197 struct iov_iter i;
1117 loff_t endbyte = 0; 1198 loff_t endbyte = 0;
1118 1199
1200 if (get_fuse_conn(inode)->writeback_cache) {
1201 /* Update size (EOF optimization) and mode (SUID clearing) */
1202 err = fuse_update_attributes(mapping->host, NULL, file, NULL);
1203 if (err)
1204 return err;
1205
1206 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1207 }
1208
1119 WARN_ON(iocb->ki_pos != pos); 1209 WARN_ON(iocb->ki_pos != pos);
1120 1210
1121 ocount = 0; 1211 ocount = 0;
@@ -1145,8 +1235,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1145 goto out; 1235 goto out;
1146 1236
1147 if (file->f_flags & O_DIRECT) { 1237 if (file->f_flags & O_DIRECT) {
1148 written = generic_file_direct_write(iocb, iov, &nr_segs, 1238 written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
1149 pos, &iocb->ki_pos,
1150 count, ocount); 1239 count, ocount);
1151 if (written < 0 || written == count) 1240 if (written < 0 || written == count)
1152 goto out; 1241 goto out;
@@ -1289,13 +1378,18 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
1289 1378
1290ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, 1379ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1291 unsigned long nr_segs, size_t count, loff_t *ppos, 1380 unsigned long nr_segs, size_t count, loff_t *ppos,
1292 int write) 1381 int flags)
1293{ 1382{
1383 int write = flags & FUSE_DIO_WRITE;
1384 int cuse = flags & FUSE_DIO_CUSE;
1294 struct file *file = io->file; 1385 struct file *file = io->file;
1386 struct inode *inode = file->f_mapping->host;
1295 struct fuse_file *ff = file->private_data; 1387 struct fuse_file *ff = file->private_data;
1296 struct fuse_conn *fc = ff->fc; 1388 struct fuse_conn *fc = ff->fc;
1297 size_t nmax = write ? fc->max_write : fc->max_read; 1389 size_t nmax = write ? fc->max_write : fc->max_read;
1298 loff_t pos = *ppos; 1390 loff_t pos = *ppos;
1391 pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
1392 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
1299 ssize_t res = 0; 1393 ssize_t res = 0;
1300 struct fuse_req *req; 1394 struct fuse_req *req;
1301 struct iov_iter ii; 1395 struct iov_iter ii;
@@ -1309,6 +1403,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1309 if (IS_ERR(req)) 1403 if (IS_ERR(req))
1310 return PTR_ERR(req); 1404 return PTR_ERR(req);
1311 1405
1406 if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
1407 if (!write)
1408 mutex_lock(&inode->i_mutex);
1409 fuse_sync_writes(inode);
1410 if (!write)
1411 mutex_unlock(&inode->i_mutex);
1412 }
1413
1312 while (count) { 1414 while (count) {
1313 size_t nres; 1415 size_t nres;
1314 fl_owner_t owner = current->files; 1416 fl_owner_t owner = current->files;
@@ -1397,7 +1499,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
1397 1499
1398 res = generic_write_checks(file, ppos, &count, 0); 1500 res = generic_write_checks(file, ppos, &count, 0);
1399 if (!res) 1501 if (!res)
1400 res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1); 1502 res = fuse_direct_io(io, iov, nr_segs, count, ppos,
1503 FUSE_DIO_WRITE);
1401 1504
1402 fuse_invalidate_attr(inode); 1505 fuse_invalidate_attr(inode);
1403 1506
@@ -1556,13 +1659,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
1556 fuse_writepage_free(fc, req); 1659 fuse_writepage_free(fc, req);
1557} 1660}
1558 1661
1559static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, 1662static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
1560 struct fuse_inode *fi) 1663 struct fuse_inode *fi)
1561{ 1664{
1562 struct fuse_file *ff = NULL; 1665 struct fuse_file *ff = NULL;
1563 1666
1564 spin_lock(&fc->lock); 1667 spin_lock(&fc->lock);
1565 if (!WARN_ON(list_empty(&fi->write_files))) { 1668 if (!list_empty(&fi->write_files)) {
1566 ff = list_entry(fi->write_files.next, struct fuse_file, 1669 ff = list_entry(fi->write_files.next, struct fuse_file,
1567 write_entry); 1670 write_entry);
1568 fuse_file_get(ff); 1671 fuse_file_get(ff);
@@ -1572,6 +1675,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
1572 return ff; 1675 return ff;
1573} 1676}
1574 1677
1678static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
1679 struct fuse_inode *fi)
1680{
1681 struct fuse_file *ff = __fuse_write_file_get(fc, fi);
1682 WARN_ON(!ff);
1683 return ff;
1684}
1685
1686int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
1687{
1688 struct fuse_conn *fc = get_fuse_conn(inode);
1689 struct fuse_inode *fi = get_fuse_inode(inode);
1690 struct fuse_file *ff;
1691 int err;
1692
1693 ff = __fuse_write_file_get(fc, fi);
1694 err = fuse_flush_times(inode, ff);
1695 if (ff)
1696 fuse_file_put(ff, 0);
1697
1698 return err;
1699}
1700
1575static int fuse_writepage_locked(struct page *page) 1701static int fuse_writepage_locked(struct page *page)
1576{ 1702{
1577 struct address_space *mapping = page->mapping; 1703 struct address_space *mapping = page->mapping;
@@ -1885,6 +2011,77 @@ out:
1885 return err; 2011 return err;
1886} 2012}
1887 2013
+/*
+ * It would be worthwhile to make sure that space is reserved on disk for
+ * the write, but how to implement it without killing performance needs
+ * more thinking.
+ */
+static int fuse_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode);
+	struct page *page;
+	loff_t fsize;
+	int err = -ENOMEM;
+
+	WARN_ON(!fc->writeback_cache);
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		goto error;
+
+	fuse_wait_on_page_writeback(mapping->host, page->index);
+
+	if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
+		goto success;
+	/*
+	 * Check if the start of this page comes after the end of file,
+	 * in which case the readpage can be optimized away.
+	 */
+	fsize = i_size_read(mapping->host);
+	if (fsize <= (pos & PAGE_CACHE_MASK)) {
+		size_t off = pos & ~PAGE_CACHE_MASK;
+		if (off)
+			zero_user_segment(page, 0, off);
+		goto success;
+	}
+	err = fuse_do_readpage(file, page);
+	if (err)
+		goto cleanup;
+success:
+	*pagep = page;
+	return 0;
+
+cleanup:
+	unlock_page(page);
+	page_cache_release(page);
+error:
+	return err;
+}
+
+static int fuse_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied,
+		struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+
+	if (!PageUptodate(page)) {
+		/* Zero any unwritten bytes at the end of the page */
+		size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
+		if (endoff)
+			zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+	}
+
+	fuse_write_update_size(inode, pos + copied);
+	set_page_dirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
 static int fuse_launder_page(struct page *page)
 {
 	int err = 0;
@@ -1940,26 +2137,16 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct fuse_file_vm_ops = {
 	.close		= fuse_vma_close,
 	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= fuse_page_mkwrite,
 	.remap_pages	= generic_file_remap_pages,
 };
 
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
-		struct inode *inode = file_inode(file);
-		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		struct fuse_file *ff = file->private_data;
-		/*
-		 * file may be written through mmap, so chain it onto the
-		 * inodes's write_file list
-		 */
-		spin_lock(&fc->lock);
-		if (list_empty(&ff->write_entry))
-			list_add(&ff->write_entry, &fi->write_files);
-		spin_unlock(&fc->lock);
-	}
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+		fuse_link_write_file(file);
+
 	file_accessed(file);
 	vma->vm_ops = &fuse_file_vm_ops;
 	return 0;
@@ -2606,7 +2793,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
 {
 	spin_lock(&fc->lock);
 	if (RB_EMPTY_NODE(&ff->polled_node)) {
-		struct rb_node **link, *parent;
+		struct rb_node **link, *uninitialized_var(parent);
 
 		link = fuse_find_polled_node(fc, ff->kh, &parent);
 		BUG_ON(*link);
@@ -2808,6 +2995,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
 			   (mode & FALLOC_FL_PUNCH_HOLE);
 
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
 	if (fc->no_fallocate)
 		return -EOPNOTSUPP;
 
@@ -2850,8 +3040,12 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 		goto out;
 
 	/* we could have extended the file */
-	if (!(mode & FALLOC_FL_KEEP_SIZE))
-		fuse_write_update_size(inode, offset + length);
+	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+		bool changed = fuse_write_update_size(inode, offset + length);
+
+		if (changed && fc->writeback_cache)
+			file_update_time(file);
+	}
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		truncate_pagecache_range(inode, offset, offset + length - 1);
@@ -2915,6 +3109,8 @@ static const struct address_space_operations fuse_file_aops = {
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.bmap		= fuse_bmap,
 	.direct_IO	= fuse_direct_IO,
+	.write_begin	= fuse_write_begin,
+	.write_end	= fuse_write_end,
 };
 
 void fuse_init_file_inode(struct inode *inode)
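
The fuse_short_read() helper introduced above is mostly index arithmetic: everything past the bytes that actually arrived is zeroed, starting with a partial page and continuing with whole pages. A minimal userspace model of that arithmetic (not kernel code; it assumes 4 KiB pages, and zero_tail() is a hypothetical stand-in for the zero_user_segment() loop):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096u

/* Zero everything past num_read in an array of page-sized buffers,
 * the way fuse_short_read() does: partial page first, whole pages after. */
static void zero_tail(unsigned char pages[][PAGE_SIZE], size_t num_pages,
		      size_t num_read)
{
	size_t start_idx = num_read / PAGE_SIZE; /* num_read >> PAGE_CACHE_SHIFT */
	size_t off = num_read % PAGE_SIZE;       /* num_read & (PAGE_CACHE_SIZE - 1) */
	size_t i;

	for (i = start_idx; i < num_pages; i++) {
		memset(pages[i] + off, 0, PAGE_SIZE - off);
		off = 0;	/* only the first affected page is partial */
	}
}

int main(void)
{
	static unsigned char pages[3][PAGE_SIZE];

	memset(pages, 0xff, sizeof(pages));
	zero_tail(pages, 3, 5000);	/* short read: 5000 of 12288 bytes arrived */
	/* page 1 is zeroed from offset 904 (5000 % 4096), page 2 entirely */
	printf("pages[1][903]=%u pages[1][904]=%u pages[2][0]=%u\n",
	       pages[1][903], pages[1][904], pages[2][0]);
	return 0;
}
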
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2da5db2c8bdb..7aa5c75e0de1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -480,6 +480,9 @@ struct fuse_conn {
 	/** Set if bdi is valid */
 	unsigned bdi_initialized:1;
 
+	/** write-back cache policy (default is write-through) */
+	unsigned writeback_cache:1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
@@ -539,6 +542,9 @@ struct fuse_conn {
 	/** Is fallocate not implemented by fs? */
 	unsigned no_fallocate:1;
 
+	/** Is rename with flags implemented by fs? */
+	unsigned no_rename2:1;
+
 	/** Use enhanced/automatic page cache invalidation. */
 	unsigned auto_inval_data:1;
 
@@ -720,7 +726,7 @@ int fuse_dev_init(void);
 void fuse_dev_cleanup(void);
 
 int fuse_ctl_init(void);
-void fuse_ctl_cleanup(void);
+void __exit fuse_ctl_cleanup(void);
 
 /**
  * Allocate a request
@@ -863,9 +869,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE passes fuse_direct_io() a file whose f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE  (1 << 1)
+
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write);
+		       int flags);
 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		   unsigned int flags);
 long fuse_ioctl_common(struct file *file, unsigned int cmd,
@@ -873,7 +890,10 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
 
-void fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_size(struct inode *inode, loff_t pos);
+
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
 
 int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 		    struct file *file);
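
The FUSE_DIO_* constants above replace the old bare int write parameter of fuse_direct_io() with independent bits. A small standalone sketch of the convention (the decode mirrors what fuse_direct_io() now does; the function body here is purely illustrative):

#include <stdio.h>

#define FUSE_DIO_WRITE (1 << 0)	/* carry out a write, otherwise a read */
#define FUSE_DIO_CUSE  (1 << 1)	/* file's f_mapping->host is not from FUSE */

static void direct_io(int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;

	printf("write=%d cuse=%d\n", !!write, !!cuse);
}

int main(void)
{
	direct_io(0);				   /* plain read */
	direct_io(FUSE_DIO_WRITE);		   /* plain write */
	direct_io(FUSE_DIO_WRITE | FUSE_DIO_CUSE); /* write on a CUSE file */
	return 0;
}
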
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d468643a68b2..754dcf23de8a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -123,7 +123,7 @@ static void fuse_destroy_inode(struct inode *inode)
 
 static void fuse_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	if (inode->i_sb->s_flags & MS_ACTIVE) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
@@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode)
 
 static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	if (*flags & MS_MANDLOCK)
 		return -EINVAL;
 
@@ -170,10 +171,13 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	inode->i_blocks  = attr->blocks;
 	inode->i_atime.tv_sec   = attr->atime;
 	inode->i_atime.tv_nsec  = attr->atimensec;
-	inode->i_mtime.tv_sec   = attr->mtime;
-	inode->i_mtime.tv_nsec  = attr->mtimensec;
-	inode->i_ctime.tv_sec   = attr->ctime;
-	inode->i_ctime.tv_nsec  = attr->ctimensec;
+	/* mtime from server may be stale due to local buffered write */
+	if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+		inode->i_mtime.tv_sec   = attr->mtime;
+		inode->i_mtime.tv_nsec  = attr->mtimensec;
+		inode->i_ctime.tv_sec   = attr->ctime;
+		inode->i_ctime.tv_nsec  = attr->ctimensec;
+	}
 
 	if (attr->blksize != 0)
 		inode->i_blkbits = ilog2(attr->blksize);
@@ -197,6 +201,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool is_wb = fc->writeback_cache;
 	loff_t oldsize;
 	struct timespec old_mtime;
 
@@ -211,10 +216,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 	fuse_change_attributes_common(inode, attr, attr_valid);
 
 	oldsize = inode->i_size;
-	i_size_write(inode, attr->size);
+	/*
+	 * With writeback_cache enabled, cached writes beyond EOF extend the
+	 * local i_size without keeping the userspace server in sync, so
+	 * attr->size coming from the server can be stale. We cannot trust it.
+	 */
+	if (!is_wb || !S_ISREG(inode->i_mode))
+		i_size_write(inode, attr->size);
 	spin_unlock(&fc->lock);
 
-	if (S_ISREG(inode->i_mode)) {
+	if (!is_wb && S_ISREG(inode->i_mode)) {
 		bool inval = false;
 
 		if (oldsize != attr->size) {
@@ -243,6 +254,10 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 {
 	inode->i_mode = attr->mode & S_IFMT;
 	inode->i_size = attr->size;
+	inode->i_mtime.tv_sec  = attr->mtime;
+	inode->i_mtime.tv_nsec = attr->mtimensec;
+	inode->i_ctime.tv_sec  = attr->ctime;
+	inode->i_ctime.tv_nsec = attr->ctimensec;
 	if (S_ISREG(inode->i_mode)) {
 		fuse_init_common(inode);
 		fuse_init_file_inode(inode);
@@ -289,7 +304,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 		return NULL;
 
 	if ((inode->i_state & I_NEW)) {
-		inode->i_flags |= S_NOATIME|S_NOCMTIME;
+		inode->i_flags |= S_NOATIME;
+		if (!fc->writeback_cache || !S_ISREG(attr->mode))
+			inode->i_flags |= S_NOCMTIME;
 		inode->i_generation = generation;
 		inode->i_data.backing_dev_info = &fc->bdi;
 		fuse_init_inode(inode, attr);
@@ -773,6 +790,7 @@ static const struct super_operations fuse_super_operations = {
 	.alloc_inode	= fuse_alloc_inode,
 	.destroy_inode	= fuse_destroy_inode,
 	.evict_inode	= fuse_evict_inode,
+	.write_inode	= fuse_write_inode,
 	.drop_inode	= generic_delete_inode,
 	.remount_fs	= fuse_remount_fs,
 	.put_super	= fuse_put_super,
@@ -873,6 +891,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		}
 		if (arg->flags & FUSE_ASYNC_DIO)
 			fc->async_dio = 1;
+		if (arg->flags & FUSE_WRITEBACK_CACHE)
+			fc->writeback_cache = 1;
+		if (arg->time_gran && arg->time_gran <= 1000000000)
+			fc->sb->s_time_gran = arg->time_gran;
+		else
+			fc->sb->s_time_gran = 1000000000;
+
 	} else {
 		ra_pages = fc->max_read / PAGE_CACHE_SIZE;
 		fc->no_lock = 1;
@@ -900,7 +925,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
 		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
 		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
-		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO;
+		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
+		FUSE_WRITEBACK_CACHE;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -978,7 +1004,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (sb->s_flags & MS_MANDLOCK)
 		goto err;
 
-	sb->s_flags &= ~MS_NOSEC;
+	sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);
 
 	if (!parse_fuse_opt((char *) data, &d, is_bdev))
 		goto err;
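
One detail of the process_init_reply() changes above: the server-supplied time granularity is only honored when it lies in (0, 1s]; anything else falls back to one-second timestamps. A standalone sketch of that clamp (values in nanoseconds; sanitize_time_gran() is a hypothetical name, not a kernel function):

#include <stdio.h>

static unsigned int sanitize_time_gran(unsigned int time_gran)
{
	if (time_gran && time_gran <= 1000000000)
		return time_gran;
	return 1000000000;	/* fall back to whole-second timestamps */
}

int main(void)
{
	printf("%u\n", sanitize_time_gran(1));		/* 1 ns, accepted */
	printf("%u\n", sanitize_time_gran(0));		/* invalid -> 1 s */
	printf("%u\n", sanitize_time_gran(2000000000));	/* too coarse -> 1 s */
	return 0;
}
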
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index ba9456685f47..3088e2a38e30 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -64,18 +64,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
 	return acl;
 }
 
-static int gfs2_set_mode(struct inode *inode, umode_t mode)
-{
-	int error = 0;
-
-	if (mode != inode->i_mode) {
-		inode->i_mode = mode;
-		mark_inode_dirty(inode);
-	}
-
-	return error;
-}
-
 int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	int error;
@@ -85,8 +73,8 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 
 	BUG_ON(name == NULL);
 
-	if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
-		return -EINVAL;
+	if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
+		return -E2BIG;
 
 	if (type == ACL_TYPE_ACCESS) {
 		umode_t mode = inode->i_mode;
@@ -98,9 +86,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		if (error == 0)
 			acl = NULL;
 
-		error = gfs2_set_mode(inode, mode);
-		if (error)
-			return error;
+		if (mode != inode->i_mode) {
+			inode->i_mode = mode;
+			mark_inode_dirty(inode);
+		}
 	}
 
 	if (acl) {
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 301260c999ba..2d65ec4cd4be 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -14,7 +14,7 @@
 
 #define GFS2_POSIX_ACL_ACCESS		"posix_acl_access"
 #define GFS2_POSIX_ACL_DEFAULT		"posix_acl_default"
-#define GFS2_ACL_MAX_ENTRIES		25
+#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
 
 extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
 extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
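
The reworked GFS2_ACL_MAX_ENTRIES() scales the old fixed limit of 25 with the filesystem block size: (300 << sb_bsize_shift) >> 12 works out to 300 entries per 4 KiB of block size. A quick standalone check of the arithmetic (sb_bsize_shift is log2 of the block size):

#include <stdio.h>

#define ACL_MAX_ENTRIES(bsize_shift) ((300 << (bsize_shift)) >> 12)

int main(void)
{
	printf("512 B blocks -> %d entries\n", ACL_MAX_ENTRIES(9));	/* 37 */
	printf("1 KiB blocks -> %d entries\n", ACL_MAX_ENTRIES(10));	/* 75 */
	printf("4 KiB blocks -> %d entries\n", ACL_MAX_ENTRIES(12));	/* 300 */
	return 0;
}
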
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 49436fa7cd4f..ce62dcac90b6 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -21,6 +21,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/backing-dev.h>
 #include <linux/aio.h>
+#include <trace/events/writeback.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -230,13 +231,11 @@ static int gfs2_writepages(struct address_space *mapping,
 static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 				    struct writeback_control *wbc,
 				    struct pagevec *pvec,
-				    int nr_pages, pgoff_t end)
+				    int nr_pages, pgoff_t end,
+				    pgoff_t *done_index)
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-	unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
 	unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
 	int i;
 	int ret;
@@ -248,40 +247,83 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 	for(i = 0; i < nr_pages; i++) {
 		struct page *page = pvec->pages[i];
 
+		/*
+		 * At this point, the page may be truncated or
+		 * invalidated (changing page->mapping to NULL), or
+		 * even swizzled back from swapper_space to tmpfs file
+		 * mapping. However, page->index will not change
+		 * because we have a reference on the page.
+		 */
+		if (page->index > end) {
+			/*
+			 * can't be range_cyclic (1st pass) because
+			 * end == -1 in that case.
+			 */
+			ret = 1;
+			break;
+		}
+
+		*done_index = page->index;
+
 		lock_page(page);
 
 		if (unlikely(page->mapping != mapping)) {
+continue_unlock:
 			unlock_page(page);
 			continue;
 		}
 
-		if (!wbc->range_cyclic && page->index > end) {
-			ret = 1;
-			unlock_page(page);
-			continue;
+		if (!PageDirty(page)) {
+			/* someone wrote it for us */
+			goto continue_unlock;
 		}
 
-		if (wbc->sync_mode != WB_SYNC_NONE)
-			wait_on_page_writeback(page);
-
-		if (PageWriteback(page) ||
-		    !clear_page_dirty_for_io(page)) {
-			unlock_page(page);
-			continue;
+		if (PageWriteback(page)) {
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+			else
+				goto continue_unlock;
 		}
 
-		/* Is the page fully outside i_size? (truncate in progress) */
-		if (page->index > end_index || (page->index == end_index && !offset)) {
-			page->mapping->a_ops->invalidatepage(page, 0,
-							     PAGE_CACHE_SIZE);
-			unlock_page(page);
-			continue;
-		}
+		BUG_ON(PageWriteback(page));
+		if (!clear_page_dirty_for_io(page))
+			goto continue_unlock;
+
+		trace_wbc_writepage(wbc, mapping->backing_dev_info);
 
 		ret = __gfs2_jdata_writepage(page, wbc);
+		if (unlikely(ret)) {
+			if (ret == AOP_WRITEPAGE_ACTIVATE) {
+				unlock_page(page);
+				ret = 0;
+			} else {
+
+				/*
+				 * done_index is set past this page,
+				 * so media errors will not choke
+				 * background writeout for the entire
+				 * file. This has consequences for
+				 * range_cyclic semantics (ie. it may
+				 * not be suitable for data integrity
+				 * writeout).
+				 */
+				*done_index = page->index + 1;
+				ret = 1;
+				break;
+			}
+		}
 
-		if (ret || (--(wbc->nr_to_write) <= 0))
+		/*
+		 * We stop writing back only if we are not doing
+		 * integrity sync. In case of integrity sync we have to
+		 * keep going until we have written all the pages
+		 * we tagged for writeback prior to entering this loop.
+		 */
+		if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) {
 			ret = 1;
+			break;
+		}
+
 	}
 	gfs2_trans_end(sdp);
 	return ret;
@@ -306,51 +348,69 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
 	int done = 0;
 	struct pagevec pvec;
 	int nr_pages;
+	pgoff_t uninitialized_var(writeback_index);
 	pgoff_t index;
 	pgoff_t end;
-	int scanned = 0;
+	pgoff_t done_index;
+	int cycled;
 	int range_whole = 0;
+	int tag;
 
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
+		writeback_index = mapping->writeback_index; /* prev offset */
+		index = writeback_index;
+		if (index == 0)
+			cycled = 1;
+		else
+			cycled = 0;
 		end = -1;
 	} else {
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
-		scanned = 1;
+		cycled = 1; /* ignore range_cyclic tests */
 	}
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
 
retry:
-	while (!done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
-		scanned = 1;
-		ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag_pages_for_writeback(mapping, index, end);
+	done_index = index;
+	while (!done && (index <= end)) {
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index);
 		if (ret)
 			done = 1;
 		if (ret > 0)
 			ret = 0;
-
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 
-	if (!scanned && !done) {
+	if (!cycled && !done) {
 		/*
+		 * range_cyclic:
 		 * We hit the last page and there is more work to be done: wrap
 		 * back to the start of the file
 		 */
-		scanned = 1;
+		cycled = 1;
 		index = 0;
+		end = writeback_index - 1;
 		goto retry;
 	}
 
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-		mapping->writeback_index = index;
+		mapping->writeback_index = done_index;
+
 	return ret;
 }
 
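
The rewritten gfs2_write_cache_jdata() above adopts the two-pass range_cyclic scheme of write_cache_pages(): scan from the previous writeback_index to the end of the file, then wrap once and finish [0, writeback_index - 1]. A minimal model of just that control flow (indices stand in for page indices; no real pages are involved):

#include <stdio.h>

static void scan(unsigned long from, unsigned long to)
{
	printf("scan [%lu, %lu]\n", from, to);
}

static void cyclic_writeback(unsigned long writeback_index,
			     unsigned long last_index)
{
	int cycled = (writeback_index == 0);
	unsigned long index = writeback_index;
	unsigned long end = last_index;

retry:
	scan(index, end);
	if (!cycled) {
		/* hit the end with work left: wrap to the file's start */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
}

int main(void)
{
	cyclic_writeback(42, 99);	/* scans [42,99], then [0,41] */
	return 0;
}
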
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index fe0500c0af7a..c62d4b9f51dc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1328,6 +1328,121 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
 }
 
 /**
+ * gfs2_free_journal_extents - Free cached journal bmap info
+ * @jd: The journal
+ *
+ */
+
+void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
+{
+	struct gfs2_journal_extent *jext;
+
+	while(!list_empty(&jd->extent_list)) {
+		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
+		list_del(&jext->list);
+		kfree(jext);
+	}
+}
+
+/**
+ * gfs2_add_jextent - Add or merge a new extent to extent cache
+ * @jd: The journal descriptor
+ * @lblock: The logical block at start of new extent
+ * @dblock: The disk (physical) block at start of new extent
+ * @blocks: Size of extent in fs blocks
+ *
+ * Returns: 0 on success or -ENOMEM
+ */
+
+static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
+{
+	struct gfs2_journal_extent *jext;
+
+	if (!list_empty(&jd->extent_list)) {
+		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
+		if ((jext->dblock + jext->blocks) == dblock) {
+			jext->blocks += blocks;
+			return 0;
+		}
+	}
+
+	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
+	if (jext == NULL)
+		return -ENOMEM;
+	jext->dblock = dblock;
+	jext->lblock = lblock;
+	jext->blocks = blocks;
+	list_add_tail(&jext->list, &jd->extent_list);
+	jd->nr_extents++;
+	return 0;
+}
+
+/**
+ * gfs2_map_journal_extents - Cache journal bmap info
+ * @sdp: The super block
+ * @jd: The journal to map
+ *
+ * Create a reusable "extent" mapping from all logical
+ * blocks to all physical blocks for the given journal. This will save
+ * us time when writing journal blocks. Most journals will have only one
+ * extent that maps all their logical blocks. That's because gfs2.mkfs
+ * arranges the journal blocks sequentially to maximize performance.
+ * So the extent would map the first block for the entire file length.
+ * However, gfs2_jadd can happen while file activity is happening, so
+ * those journals may not be sequential. Less likely is the case where
+ * the users created their own journals by mounting the metafs and
+ * laying it out. But it's still possible. These journals might have
+ * several extents.
+ *
+ * Returns: 0 on success, or error on failure
+ */
+
+int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
+{
+	u64 lblock = 0;
+	u64 lblock_stop;
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct buffer_head bh;
+	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+	u64 size;
+	int rc;
+
+	lblock_stop = i_size_read(jd->jd_inode) >> shift;
+	size = (lblock_stop - lblock) << shift;
+	jd->nr_extents = 0;
+	WARN_ON(!list_empty(&jd->extent_list));
+
+	do {
+		bh.b_state = 0;
+		bh.b_blocknr = 0;
+		bh.b_size = size;
+		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
+		if (rc || !buffer_mapped(&bh))
+			goto fail;
+		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
+		if (rc)
+			goto fail;
+		size -= bh.b_size;
+		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
+	} while(size > 0);
+
+	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
+		jd->nr_extents);
+	return 0;
+
+fail:
+	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
+		rc, jd->jd_jid,
+		(unsigned long long)(i_size_read(jd->jd_inode) - size),
+		jd->nr_extents);
+	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
+		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
+		bh.b_state, (unsigned long long)bh.b_size);
+	gfs2_free_journal_extents(jd);
+	return rc;
+}
+
+/**
  * gfs2_write_alloc_required - figure out if a write will require an allocation
  * @ip: the file being written to
  * @offset: the offset to write to
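
The extent cache built above stays small because gfs2_add_jextent() folds a new run of blocks into the tail extent whenever it is physically contiguous with it. A standalone model of that merge rule (simplified: one extent instead of a list, no allocation or error paths):

#include <stdio.h>

struct extent {
	unsigned long long dblock;	/* first disk block */
	unsigned long long blocks;	/* length in fs blocks */
};

/* Returns 1 if the new run was merged into *last, 0 if a fresh extent
 * would have to be allocated. */
static int try_merge(struct extent *last, unsigned long long dblock,
		     unsigned long long blocks)
{
	if (last->dblock + last->blocks == dblock) {
		last->blocks += blocks;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct extent e = { .dblock = 1000, .blocks = 8 };

	printf("contiguous: merged=%d len=%llu\n", try_merge(&e, 1008, 4), e.blocks);
	printf("gap:        merged=%d len=%llu\n", try_merge(&e, 2000, 4), e.blocks);
	return 0;
}
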
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 42fea03e2bd9..81ded5e2aaa2 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -55,5 +55,7 @@ extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
 extern int gfs2_file_dealloc(struct gfs2_inode *ip);
 extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 				     unsigned int len);
+extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
+extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
 
 #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index fa32655449c8..1a349f9a9685 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -53,6 +53,8 @@
  * but never before the maximum hash table size has been reached.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/buffer_head.h>
@@ -507,8 +509,8 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
 		goto error;
 	return 0;
 error:
-	printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
-	       first ? "first in block" : "not first in block");
+	pr_warn("%s: %s (%s)\n",
+		__func__, msg, first ? "first in block" : "not first in block");
 	return -EIO;
 }
 
@@ -531,8 +533,7 @@ static int gfs2_dirent_offset(const void *buf)
 	}
 	return offset;
wrong_type:
-	printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
-	       be32_to_cpu(h->mh_type));
+	pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type));
 	return -1;
 }
 
@@ -728,7 +729,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
 
 	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
 	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
-		/* printk(KERN_INFO "block num=%llu\n", leaf_no); */
+		/* pr_info("block num=%llu\n", leaf_no); */
 		error = -EIO;
 	}
 
@@ -1006,7 +1007,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
 	half_len = len >> 1;
 	if (!half_len) {
-		printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
+		pr_warn("i_depth %u lf_depth %u index %u\n",
+			dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
 		gfs2_consist_inode(dip);
 		error = -EIO;
 		goto fail_brelse;
@@ -1684,6 +1686,14 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 	return 0;
 }
 
+static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip)
+{
+	u64 where = ip->i_no_addr + 1;
+	if (ip->i_eattr == where)
+		return 1;
+	return 0;
+}
+
 /**
  * gfs2_dir_add - Add new filename into directory
  * @inode: The directory inode
@@ -1721,6 +1731,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			dent = gfs2_init_dirent(inode, dent, name, bh);
 			gfs2_inum_out(nip, dent);
 			dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
+			dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip));
 			tv = CURRENT_TIME;
 			if (ip->i_diskflags & GFS2_DIF_EXHASH) {
 				leaf = (struct gfs2_leaf *)bh->b_data;
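
The pr_fmt() define added at the top of dir.c (and glock.c below) is what lets these patches drop the hand-written "gfs2_xxx:" prefixes from each message: the printing macros paste the module name in front of every format string at compile time. A userspace sketch of the mechanism (fprintf stands in for the kernel's printk plumbing):

#include <stdio.h>

#define KBUILD_MODNAME "gfs2"
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define pr_warn(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	/* prints "gfs2: wrong block type 7" */
	pr_warn("wrong block type %u\n", 7u);
	return 0;
}
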
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index efc078f0ee4e..80d67253623c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -494,6 +494,7 @@ out:
 
 static const struct vm_operations_struct gfs2_vm_ops = {
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = gfs2_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
@@ -811,6 +812,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
 	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
 	loff_t max_chunk_size = UINT_MAX & bsize_mask;
+	struct gfs2_holder gh;
+
 	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
 
 	/* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -831,8 +834,10 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	if (error)
 		return error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
-	error = gfs2_glock_nq(&ip->i_gh);
+	mutex_lock(&inode->i_mutex);
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	error = gfs2_glock_nq(&gh);
 	if (unlikely(error))
 		goto out_uninit;
 
@@ -900,9 +905,10 @@ out_trans_fail:
out_qunlock:
 	gfs2_quota_unlock(ip);
out_unlock:
-	gfs2_glock_dq(&ip->i_gh);
+	gfs2_glock_dq(&gh);
out_uninit:
-	gfs2_holder_uninit(&ip->i_gh);
+	gfs2_holder_uninit(&gh);
+	mutex_unlock(&inode->i_mutex);
 	return error;
 }
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ca0be6c69a26..aec7f73832f0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -7,6 +7,8 @@
  * of the GNU General Public License version 2.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
@@ -468,7 +470,7 @@ retry:
 		do_xmote(gl, gh, LM_ST_UNLOCKED);
 		break;
 	default: /* Everything else */
-		printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
+		pr_err("wanted %u got %u\n", gl->gl_target, state);
 		GLOCK_BUG_ON(gl, 1);
 	}
 	spin_unlock(&gl->gl_spin);
@@ -542,7 +544,7 @@ __acquires(&gl->gl_spin)
 		/* lock_dlm */
 		ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
 		if (ret) {
-			printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret);
+			pr_err("lm_lock ret %d\n", ret);
 			GLOCK_BUG_ON(gl, 1);
 		}
 	} else { /* lock_nolock */
@@ -935,7 +937,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
 		vaf.fmt = fmt;
 		vaf.va = &args;
 
-		printk(KERN_ERR " %pV", &vaf);
+		pr_err("%pV", &vaf);
 	}
 
 	va_end(args);
@@ -1010,13 +1012,13 @@ do_cancel:
 	return;
 
trap_recursive:
-	printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip);
-	printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
-	printk(KERN_ERR "lock type: %d req lock state : %d\n",
+	pr_err("original: %pSR\n", (void *)gh2->gh_ip);
+	pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid));
+	pr_err("lock type: %d req lock state : %d\n",
 	       gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
-	printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip);
-	printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
-	printk(KERN_ERR "lock type: %d req lock state : %d\n",
+	pr_err("new: %pSR\n", (void *)gh->gh_ip);
+	pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid));
+	pr_err("lock type: %d req lock state : %d\n",
 	       gh->gh_gl->gl_name.ln_type, gh->gh_state);
 	gfs2_dump_glock(NULL, gl);
 	BUG();
@@ -1045,9 +1047,13 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
 
 	spin_lock(&gl->gl_spin);
 	add_to_queue(gh);
-	if ((LM_FLAG_NOEXP & gh->gh_flags) &&
-	    test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+	if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
+		     test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
 		set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+		gl->gl_lockref.count++;
+		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+			gl->gl_lockref.count--;
+	}
 	run_queue(gl, 1);
 	spin_unlock(&gl->gl_spin);
 
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 3bf0631b5d56..54b66809e818 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -82,6 +82,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	struct gfs2_trans tr;
 
 	memset(&tr, 0, sizeof(tr));
+	INIT_LIST_HEAD(&tr.tr_buf);
+	INIT_LIST_HEAD(&tr.tr_databuf);
 	tr.tr_revokes = atomic_read(&gl->gl_ail_count);
 
 	if (!tr.tr_revokes)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index cf0e34400f71..bdf70c18610c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -52,7 +52,7 @@ struct gfs2_log_header_host {
  */
 
 struct gfs2_log_operations {
-	void (*lo_before_commit) (struct gfs2_sbd *sdp);
+	void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
 	void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
 	void (*lo_before_scan) (struct gfs2_jdesc *jd,
 				struct gfs2_log_header_host *head, int pass);
@@ -371,6 +371,7 @@ enum {
 	GIF_ALLOC_FAILED	= 2,
 	GIF_SW_PAGED		= 3,
 	GIF_ORDERED		= 4,
+	GIF_FREE_VFS_INODE	= 5,
 };
 
 struct gfs2_inode {
@@ -462,11 +463,11 @@ struct gfs2_trans {
 	unsigned int tr_blocks;
 	unsigned int tr_revokes;
 	unsigned int tr_reserved;
+	unsigned int tr_touched:1;
+	unsigned int tr_attached:1;
 
 	struct gfs2_holder tr_t_gh;
 
-	int tr_touched;
-	int tr_attached;
 
 	unsigned int tr_num_buf_new;
 	unsigned int tr_num_databuf_new;
@@ -476,6 +477,8 @@ struct gfs2_trans {
 	unsigned int tr_num_revoke_rm;
 
 	struct list_head tr_list;
+	struct list_head tr_databuf;
+	struct list_head tr_buf;
 
 	unsigned int tr_first;
 	struct list_head tr_ail1_list;
@@ -483,7 +486,7 @@ struct gfs2_trans {
 };
 
 struct gfs2_journal_extent {
-	struct list_head extent_list;
+	struct list_head list;
 
 	unsigned int lblock; /* First logical block */
 	u64 dblock; /* First disk block */
@@ -493,6 +496,7 @@ struct gfs2_journal_extent {
 struct gfs2_jdesc {
 	struct list_head jd_list;
 	struct list_head extent_list;
+	unsigned int nr_extents;
 	struct work_struct jd_work;
 	struct inode *jd_inode;
 	unsigned long jd_flags;
@@ -500,6 +504,15 @@ struct gfs2_jdesc {
 	unsigned int jd_jid;
 	unsigned int jd_blocks;
 	int jd_recover_error;
+	/* Replay stuff */
+
+	unsigned int jd_found_blocks;
+	unsigned int jd_found_revokes;
+	unsigned int jd_replayed_blocks;
+
+	struct list_head jd_revoke_list;
+	unsigned int jd_replay_tail;
+
 };
 
 struct gfs2_statfs_change_host {
@@ -746,19 +759,12 @@ struct gfs2_sbd {
 
 	struct gfs2_trans *sd_log_tr;
 	unsigned int sd_log_blks_reserved;
-	unsigned int sd_log_commited_buf;
-	unsigned int sd_log_commited_databuf;
 	int sd_log_commited_revoke;
 
 	atomic_t sd_log_pinned;
-	unsigned int sd_log_num_buf;
 	unsigned int sd_log_num_revoke;
-	unsigned int sd_log_num_rg;
-	unsigned int sd_log_num_databuf;
 
-	struct list_head sd_log_le_buf;
 	struct list_head sd_log_le_revoke;
-	struct list_head sd_log_le_databuf;
 	struct list_head sd_log_le_ordered;
 	spinlock_t sd_ordered_lock;
 
@@ -786,15 +792,6 @@ struct gfs2_sbd {
 	struct list_head sd_ail1_list;
 	struct list_head sd_ail2_list;
 
-	/* Replay stuff */
-
-	struct list_head sd_revoke_list;
-	unsigned int sd_replay_tail;
-
-	unsigned int sd_found_blocks;
-	unsigned int sd_found_revokes;
-	unsigned int sd_replayed_blocks;
-
 	/* For quiescing the filesystem */
 	struct gfs2_holder sd_freeze_gh;
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5c524180c98e..28cc7bf6575a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -376,12 +376,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
376 inode->i_gid = current_fsgid(); 376 inode->i_gid = current_fsgid();
377} 377}
378 378
379static int alloc_dinode(struct gfs2_inode *ip, u32 flags) 379static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
380{ 380{
381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
382 struct gfs2_alloc_parms ap = { .target = RES_DINODE, .aflags = flags, }; 382 struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
383 int error; 383 int error;
384 int dblocks = 1;
385 384
386 error = gfs2_quota_lock_check(ip); 385 error = gfs2_quota_lock_check(ip);
387 if (error) 386 if (error)
@@ -391,11 +390,11 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
391 if (error) 390 if (error)
392 goto out_quota; 391 goto out_quota;
393 392
394 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0); 393 error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0);
395 if (error) 394 if (error)
396 goto out_ipreserv; 395 goto out_ipreserv;
397 396
398 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation); 397 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
399 ip->i_no_formal_ino = ip->i_generation; 398 ip->i_no_formal_ino = ip->i_generation;
400 ip->i_inode.i_ino = ip->i_no_addr; 399 ip->i_inode.i_ino = ip->i_no_addr;
401 ip->i_goal = ip->i_no_addr; 400 ip->i_goal = ip->i_no_addr;
@@ -428,6 +427,33 @@ static void gfs2_init_dir(struct buffer_head *dibh,
428} 427}
429 428
430/** 429/**
430 * gfs2_init_xattr - Initialise an xattr block for a new inode
431 * @ip: The inode in question
432 *
433 * This sets up an empty xattr block for a new inode, ready to
434 * take any ACLs, LSM xattrs, etc.
435 */
436
437static void gfs2_init_xattr(struct gfs2_inode *ip)
438{
439 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
440 struct buffer_head *bh;
441 struct gfs2_ea_header *ea;
442
443 bh = gfs2_meta_new(ip->i_gl, ip->i_eattr);
444 gfs2_trans_add_meta(ip->i_gl, bh);
445 gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
446 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
447
448 ea = GFS2_EA_BH2FIRST(bh);
449 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
450 ea->ea_type = GFS2_EATYPE_UNUSED;
451 ea->ea_flags = GFS2_EAFLAG_LAST;
452
453 brelse(bh);
454}
455
456/**
431 * init_dinode - Fill in a new dinode structure 457 * init_dinode - Fill in a new dinode structure
432 * @dip: The directory this inode is being created in 458 * @dip: The directory this inode is being created in
433 * @ip: The inode 459 * @ip: The inode
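The added gfs2_init_xattr() pre-formats the inode's extended-attribute block as a single unused record that spans the whole usable area of the block (sd_jbsize) and carries GFS2_EAFLAG_LAST, which is the on-disk representation of "no xattrs yet". Below is a minimal userspace model of that layout; the struct fields, sizes and names here are illustrative stand-ins, not the real on-disk ABI:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EATYPE_UNUSED 0      /* stand-in for GFS2_EATYPE_UNUSED */
#define EAFLAG_LAST   0x01   /* stand-in for GFS2_EAFLAG_LAST */

struct ea_header {           /* simplified gfs2_ea_header */
	uint32_t ea_rec_len;     /* record length (big-endian on disk) */
	uint8_t  ea_type;
	uint8_t  ea_flags;
};

int main(void)
{
	unsigned char block[4096];
	size_t mh = 64;          /* pretend metadata-header size */
	struct ea_header ea = {0};

	memset(block, 0, sizeof(block));
	/* One UNUSED record covering everything after the meta header,
	 * flagged LAST: a reader stops immediately, i.e. "empty". */
	ea.ea_rec_len = (uint32_t)(sizeof(block) - mh);
	ea.ea_type = EATYPE_UNUSED;
	ea.ea_flags = EAFLAG_LAST;
	memcpy(block + mh, &ea, sizeof(ea));
	printf("first record: len=%u last=%u\n", ea.ea_rec_len,
	       ea.ea_flags & EAFLAG_LAST);
	return 0;
}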
@@ -545,13 +571,6 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
545 return err; 571 return err;
546} 572}
547 573
548static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
549 const struct qstr *qstr)
550{
551 return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
552 &gfs2_initxattrs, NULL);
553}
554
555/** 574/**
556 * gfs2_create_inode - Create a new inode 575 * gfs2_create_inode - Create a new inode
557 * @dir: The parent directory 576 * @dir: The parent directory
@@ -578,8 +597,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
578 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 597 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
579 struct gfs2_glock *io_gl; 598 struct gfs2_glock *io_gl;
580 struct dentry *d; 599 struct dentry *d;
581 int error; 600 int error, free_vfs_inode = 0;
582 u32 aflags = 0; 601 u32 aflags = 0;
602 unsigned blocks = 1;
583 struct gfs2_diradd da = { .bh = NULL, }; 603 struct gfs2_diradd da = { .bh = NULL, };
584 604
585 if (!name->len || name->len > GFS2_FNAMESIZE) 605 if (!name->len || name->len > GFS2_FNAMESIZE)
@@ -676,10 +696,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
676 (dip->i_diskflags & GFS2_DIF_TOPDIR)) 696 (dip->i_diskflags & GFS2_DIF_TOPDIR))
677 aflags |= GFS2_AF_ORLOV; 697 aflags |= GFS2_AF_ORLOV;
678 698
679 error = alloc_dinode(ip, aflags); 699 if (default_acl || acl)
700 blocks++;
701
702 error = alloc_dinode(ip, aflags, &blocks);
680 if (error) 703 if (error)
681 goto fail_free_inode; 704 goto fail_free_inode;
682 705
706 gfs2_set_inode_blocks(inode, blocks);
707
683 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 708 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
684 if (error) 709 if (error)
685 goto fail_free_inode; 710 goto fail_free_inode;
@@ -689,10 +714,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
689 if (error) 714 if (error)
690 goto fail_free_inode; 715 goto fail_free_inode;
691 716
692 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 717 error = gfs2_trans_begin(sdp, blocks, 0);
693 if (error) 718 if (error)
694 goto fail_gunlock2; 719 goto fail_gunlock2;
695 720
721 if (blocks > 1) {
722 ip->i_eattr = ip->i_no_addr + 1;
723 gfs2_init_xattr(ip);
724 }
696 init_dinode(dip, ip, symname); 725 init_dinode(dip, ip, symname);
697 gfs2_trans_end(sdp); 726 gfs2_trans_end(sdp);
698 727
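Placing the xattr block at ip->i_no_addr + 1 works because gfs2_alloc_blocks() returned the requested blocks as one contiguous extent, so with blocks == 2 the dinode and its xattr block are adjacent, and gfs2_set_inode_blocks() earlier accounts for both. A trivial sketch of the addressing, with a hypothetical block number:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t no_addr = 123456;	/* hypothetical extent start from the allocator */
	unsigned blocks = 2;		/* dinode plus pre-created xattr block */

	if (blocks > 1)
		printf("dinode=%llu xattr=%llu\n",
		       (unsigned long long)no_addr,
		       (unsigned long long)(no_addr + 1));
	return 0;
}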
@@ -722,7 +751,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
722 if (error) 751 if (error)
723 goto fail_gunlock3; 752 goto fail_gunlock3;
724 753
725 error = gfs2_security_init(dip, ip, name); 754 error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
755 &gfs2_initxattrs, NULL);
726 if (error) 756 if (error)
727 goto fail_gunlock3; 757 goto fail_gunlock3;
728 758
@@ -758,15 +788,16 @@ fail_free_acls:
758 if (acl) 788 if (acl)
759 posix_acl_release(acl); 789 posix_acl_release(acl);
760fail_free_vfs_inode: 790fail_free_vfs_inode:
761 free_inode_nonrcu(inode); 791 free_vfs_inode = 1;
762 inode = NULL;
763fail_gunlock: 792fail_gunlock:
764 gfs2_dir_no_add(&da); 793 gfs2_dir_no_add(&da);
765 gfs2_glock_dq_uninit(ghs); 794 gfs2_glock_dq_uninit(ghs);
766 if (inode && !IS_ERR(inode)) { 795 if (inode && !IS_ERR(inode)) {
767 clear_nlink(inode); 796 clear_nlink(inode);
768 mark_inode_dirty(inode); 797 if (!free_vfs_inode)
769 set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); 798 mark_inode_dirty(inode);
799 set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
800 &GFS2_I(inode)->i_flags);
770 iput(inode); 801 iput(inode);
771 } 802 }
772fail: 803fail:
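The reworked failure path no longer frees the VFS inode by hand; it tags the inode and lets iput() drive the teardown. GIF_FREE_VFS_INODE means nothing was allocated on disk, so eviction only needs clear_inode(), while GIF_ALLOC_FAILED keeps the full on-disk deallocation (see the matching gfs2_evict_inode() change in fs/gfs2/super.c below). A condensed userspace model of the two flavours; the enum and helper are stand-ins:

#include <stdio.h>

enum fail_kind { ALLOC_FAILED, FREE_VFS_INODE };	/* model of the GIF_ flags */

static void evict(enum fail_kind kind)
{
	if (kind == FREE_VFS_INODE) {
		/* No disk blocks were allocated: tear down the VFS object only. */
		printf("clear_inode()\n");
		return;
	}
	/* Blocks exist on disk: run the full deallocation path first. */
	printf("deallocate dinode, then clear_inode()\n");
}

int main(void)
{
	evict(FREE_VFS_INODE);	/* failed before alloc_dinode() succeeded */
	evict(ALLOC_FAILED);	/* failed after blocks were allocated */
	return 0;
}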
@@ -1263,6 +1294,10 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1263 } 1294 }
1264 1295
1265 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); 1296 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
1297 if (!tmp) {
1298 error = -ENOENT;
1299 break;
1300 }
1266 if (IS_ERR(tmp)) { 1301 if (IS_ERR(tmp)) {
1267 error = PTR_ERR(tmp); 1302 error = PTR_ERR(tmp);
1268 break; 1303 break;
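gfs2_lookupi() has a three-way result: a valid inode, an ERR_PTR()-encoded error, or NULL when the entry is simply absent; the added check maps the previously unhandled NULL case to -ENOENT. The encoding convention, reproduced as a runnable model with the macros as defined in include/linux/err.h:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *lookup(int outcome)
{
	static int object;

	if (outcome < 0)
		return ERR_PTR(outcome);	/* hard error, e.g. -EIO */
	if (outcome == 0)
		return NULL;			/* entry not found */
	return &object;				/* success */
}

int main(void)
{
	void *p = lookup(0);

	if (p == NULL)				/* the case the patch adds */
		printf("error = %d (-ENOENT)\n", -ENOENT);
	else if (IS_ERR(p))
		printf("error = %ld\n", PTR_ERR(p));
	else
		printf("found\n");
	return 0;
}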
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 2a6ba06bee6f..c1eb555dc588 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/fs.h> 12#include <linux/fs.h>
11#include <linux/dlm.h> 13#include <linux/dlm.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
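The pr_fmt definition, placed before any include, gives every pr_err()/pr_warn()/pr_info() in the file an automatic KBUILD_MODNAME prefix; that is why the literal "GFS2: " strings disappear from the messages converted in the hunks below. A userspace model of the macro mechanics, with printf standing in for printk:

#include <stdio.h>

#define KBUILD_MODNAME "gfs2"			/* normally defined by Kbuild */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

/* Stand-ins for the kernel's pr_* helpers, which wrap printk(). */
#define pr_err(fmt, ...)  printf(pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_err("unknown bast mode %d\n", 42);	/* "gfs2: unknown bast mode 42" */
	pr_warn("not a GFS2 filesystem\n");	/* "gfs2: not a GFS2 filesystem" */
	return 0;
}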
@@ -176,7 +178,7 @@ static void gdlm_bast(void *arg, int mode)
176 gfs2_glock_cb(gl, LM_ST_SHARED); 178 gfs2_glock_cb(gl, LM_ST_SHARED);
177 break; 179 break;
178 default: 180 default:
179 printk(KERN_ERR "unknown bast mode %d", mode); 181 pr_err("unknown bast mode %d\n", mode);
180 BUG(); 182 BUG();
181 } 183 }
182} 184}
@@ -195,7 +197,7 @@ static int make_mode(const unsigned int lmstate)
195 case LM_ST_SHARED: 197 case LM_ST_SHARED:
196 return DLM_LOCK_PR; 198 return DLM_LOCK_PR;
197 } 199 }
198 printk(KERN_ERR "unknown LM state %d", lmstate); 200 pr_err("unknown LM state %d\n", lmstate);
199 BUG(); 201 BUG();
200 return -1; 202 return -1;
201} 203}
@@ -308,7 +310,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
308 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, 310 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
309 NULL, gl); 311 NULL, gl);
310 if (error) { 312 if (error) {
311 printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", 313 pr_err("gdlm_unlock %x,%llx err=%d\n",
312 gl->gl_name.ln_type, 314 gl->gl_name.ln_type,
313 (unsigned long long)gl->gl_name.ln_number, error); 315 (unsigned long long)gl->gl_name.ln_number, error);
314 return; 316 return;
@@ -1102,7 +1104,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
1102 } 1104 }
1103 1105
1104 if (ls->ls_recover_submit[jid]) { 1106 if (ls->ls_recover_submit[jid]) {
1105 fs_info(sdp, "recover_slot jid %d gen %u prev %u", 1107 fs_info(sdp, "recover_slot jid %d gen %u prev %u\n",
1106 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); 1108 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
1107 } 1109 }
1108 ls->ls_recover_submit[jid] = ls->ls_recover_block; 1110 ls->ls_recover_submit[jid] = ls->ls_recover_block;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9dcb9777a5f8..4a14d504ef83 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/freezer.h> 19#include <linux/freezer.h>
20#include <linux/bio.h> 20#include <linux/bio.h>
21#include <linux/blkdev.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/list_sort.h> 23#include <linux/list_sort.h>
23 24
@@ -145,8 +146,10 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
145{ 146{
146 struct list_head *head = &sdp->sd_ail1_list; 147 struct list_head *head = &sdp->sd_ail1_list;
147 struct gfs2_trans *tr; 148 struct gfs2_trans *tr;
149 struct blk_plug plug;
148 150
149 trace_gfs2_ail_flush(sdp, wbc, 1); 151 trace_gfs2_ail_flush(sdp, wbc, 1);
152 blk_start_plug(&plug);
150 spin_lock(&sdp->sd_ail_lock); 153 spin_lock(&sdp->sd_ail_lock);
151restart: 154restart:
152 list_for_each_entry_reverse(tr, head, tr_list) { 155 list_for_each_entry_reverse(tr, head, tr_list) {
@@ -156,6 +159,7 @@ restart:
156 goto restart; 159 goto restart;
157 } 160 }
158 spin_unlock(&sdp->sd_ail_lock); 161 spin_unlock(&sdp->sd_ail_lock);
162 blk_finish_plug(&plug);
159 trace_gfs2_ail_flush(sdp, wbc, 0); 163 trace_gfs2_ail_flush(sdp, wbc, 0);
160} 164}
161 165
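blk_start_plug()/blk_finish_plug() gather the I/O submitted between them on a per-task plug list, so the many small writes issued while walking sd_ail1_list reach the block layer in one batch and can be merged. The usual shape of the pattern, as a kernel-context sketch rather than runnable code:

#include <linux/blkdev.h>	/* struct blk_plug */

static void flush_batched(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);	/* queue I/O on this task's plug list */
	/* ... submit many small writes, e.g. via ->writepage() ... */
	blk_finish_plug(&plug);	/* unplug: release merged requests to the device */
}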
@@ -410,24 +414,22 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer
410static unsigned int calc_reserved(struct gfs2_sbd *sdp) 414static unsigned int calc_reserved(struct gfs2_sbd *sdp)
411{ 415{
412 unsigned int reserved = 0; 416 unsigned int reserved = 0;
413 unsigned int mbuf_limit, metabufhdrs_needed; 417 unsigned int mbuf;
414 unsigned int dbuf_limit, databufhdrs_needed; 418 unsigned int dbuf;
415 unsigned int revokes = 0; 419 struct gfs2_trans *tr = sdp->sd_log_tr;
416 420
417 mbuf_limit = buf_limit(sdp); 421 if (tr) {
418 metabufhdrs_needed = (sdp->sd_log_commited_buf + 422 mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
419 (mbuf_limit - 1)) / mbuf_limit; 423 dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
420 dbuf_limit = databuf_limit(sdp); 424 reserved = mbuf + dbuf;
421 databufhdrs_needed = (sdp->sd_log_commited_databuf + 425 /* Account for header blocks */
422 (dbuf_limit - 1)) / dbuf_limit; 426 reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp));
427 reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp));
428 }
423 429
424 if (sdp->sd_log_commited_revoke > 0) 430 if (sdp->sd_log_commited_revoke > 0)
425 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 431 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
426 sizeof(u64)); 432 sizeof(u64));
427
428 reserved = sdp->sd_log_commited_buf + metabufhdrs_needed +
429 sdp->sd_log_commited_databuf + databufhdrs_needed +
430 revokes;
431 /* One for the overall header */ 433 /* One for the overall header */
432 if (reserved) 434 if (reserved)
433 reserved++; 435 reserved++;
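calc_reserved() now derives the reservation from the pending transaction directly: the net new metadata and data blocks, plus one log-header block per buf_limit() or databuf_limit() worth of each. Using the 503-blocks-per-header figure quoted for 4k blocks in fs/gfs2/lops.c, 1000 dirty metadata blocks need 1000 + ceil(1000/503) = 1002 blocks, plus the one overall header. A runnable check of the arithmetic:

#include <stdio.h>

/* Same definition as the kernel's DIV_ROUND_UP. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned mbuf = 1000;			/* net new metadata blocks */
	unsigned limit = 503;			/* buf_limit() for 4k blocks */
	unsigned reserved = mbuf + DIV_ROUND_UP(mbuf, limit);

	reserved++;				/* one for the overall header */
	printf("reserved = %u\n", reserved);	/* 1003 */
	return 0;
}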
@@ -682,36 +684,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
682 } 684 }
683 trace_gfs2_log_flush(sdp, 1); 685 trace_gfs2_log_flush(sdp, 1);
684 686
687 sdp->sd_log_flush_head = sdp->sd_log_head;
688 sdp->sd_log_flush_wrapped = 0;
685 tr = sdp->sd_log_tr; 689 tr = sdp->sd_log_tr;
686 if (tr) { 690 if (tr) {
687 sdp->sd_log_tr = NULL; 691 sdp->sd_log_tr = NULL;
688 INIT_LIST_HEAD(&tr->tr_ail1_list); 692 INIT_LIST_HEAD(&tr->tr_ail1_list);
689 INIT_LIST_HEAD(&tr->tr_ail2_list); 693 INIT_LIST_HEAD(&tr->tr_ail2_list);
694 tr->tr_first = sdp->sd_log_flush_head;
690 } 695 }
691 696
692 if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
693 printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
694 sdp->sd_log_commited_buf);
695 gfs2_assert_withdraw(sdp, 0);
696 }
697 if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) {
698 printk(KERN_INFO "GFS2: log databuf %u %u\n",
699 sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf);
700 gfs2_assert_withdraw(sdp, 0);
701 }
702 gfs2_assert_withdraw(sdp, 697 gfs2_assert_withdraw(sdp,
703 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); 698 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
704 699
705 sdp->sd_log_flush_head = sdp->sd_log_head;
706 sdp->sd_log_flush_wrapped = 0;
707 if (tr)
708 tr->tr_first = sdp->sd_log_flush_head;
709
710 gfs2_ordered_write(sdp); 700 gfs2_ordered_write(sdp);
711 lops_before_commit(sdp); 701 lops_before_commit(sdp, tr);
712 gfs2_log_flush_bio(sdp, WRITE); 702 gfs2_log_flush_bio(sdp, WRITE);
713 703
714 if (sdp->sd_log_head != sdp->sd_log_flush_head) { 704 if (sdp->sd_log_head != sdp->sd_log_flush_head) {
705 log_flush_wait(sdp);
715 log_write_header(sdp, 0); 706 log_write_header(sdp, 0);
716 } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 707 } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
717 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ 708 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
@@ -723,8 +714,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
723 gfs2_log_lock(sdp); 714 gfs2_log_lock(sdp);
724 sdp->sd_log_head = sdp->sd_log_flush_head; 715 sdp->sd_log_head = sdp->sd_log_flush_head;
725 sdp->sd_log_blks_reserved = 0; 716 sdp->sd_log_blks_reserved = 0;
726 sdp->sd_log_commited_buf = 0;
727 sdp->sd_log_commited_databuf = 0;
728 sdp->sd_log_commited_revoke = 0; 717 sdp->sd_log_commited_revoke = 0;
729 718
730 spin_lock(&sdp->sd_ail_lock); 719 spin_lock(&sdp->sd_ail_lock);
@@ -740,34 +729,54 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
740 kfree(tr); 729 kfree(tr);
741} 730}
742 731
732/**
733 * gfs2_merge_trans - Merge a new transaction into a cached transaction
734 * @old: Original transaction to be expanded
735 * @new: New transaction to be merged
736 */
737
738static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
739{
740 WARN_ON_ONCE(old->tr_attached != 1);
741
742 old->tr_num_buf_new += new->tr_num_buf_new;
743 old->tr_num_databuf_new += new->tr_num_databuf_new;
744 old->tr_num_buf_rm += new->tr_num_buf_rm;
745 old->tr_num_databuf_rm += new->tr_num_databuf_rm;
746 old->tr_num_revoke += new->tr_num_revoke;
747 old->tr_num_revoke_rm += new->tr_num_revoke_rm;
748
749 list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
750 list_splice_tail_init(&new->tr_buf, &old->tr_buf);
751}
752
743static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 753static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
744{ 754{
745 unsigned int reserved; 755 unsigned int reserved;
746 unsigned int unused; 756 unsigned int unused;
757 unsigned int maxres;
747 758
748 gfs2_log_lock(sdp); 759 gfs2_log_lock(sdp);
749 760
750 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; 761 if (sdp->sd_log_tr) {
751 sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - 762 gfs2_merge_trans(sdp->sd_log_tr, tr);
752 tr->tr_num_databuf_rm; 763 } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
753 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 764 gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
754 (((int)sdp->sd_log_commited_databuf) >= 0)); 765 sdp->sd_log_tr = tr;
766 tr->tr_attached = 1;
767 }
768
755 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 769 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
756 reserved = calc_reserved(sdp); 770 reserved = calc_reserved(sdp);
757 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 771 maxres = sdp->sd_log_blks_reserved + tr->tr_reserved;
758 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 772 gfs2_assert_withdraw(sdp, maxres >= reserved);
773 unused = maxres - reserved;
759 atomic_add(unused, &sdp->sd_log_blks_free); 774 atomic_add(unused, &sdp->sd_log_blks_free);
760 trace_gfs2_log_blocks(sdp, unused); 775 trace_gfs2_log_blocks(sdp, unused);
761 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= 776 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
762 sdp->sd_jdesc->jd_blocks); 777 sdp->sd_jdesc->jd_blocks);
763 sdp->sd_log_blks_reserved = reserved; 778 sdp->sd_log_blks_reserved = reserved;
764 779
765 if (sdp->sd_log_tr == NULL &&
766 (tr->tr_num_buf_new || tr->tr_num_databuf_new)) {
767 gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
768 sdp->sd_log_tr = tr;
769 tr->tr_attached = 1;
770 }
771 gfs2_log_unlock(sdp); 780 gfs2_log_unlock(sdp);
772} 781}
773 782
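log_refund() now either attaches the committing transaction as sd_log_tr or folds it into the one already cached: gfs2_merge_trans() sums the new/removed counters and list_splice_tail_init() moves the incoming tr_buf/tr_databuf entries onto the tail of the cached lists, leaving the source lists empty. A userspace model of the counter half; the list splice itself needs the kernel list API:

#include <stdio.h>

struct trans {
	unsigned buf_new, buf_rm;
	unsigned databuf_new, databuf_rm;
	unsigned revoke, revoke_rm;
};

/* Model of gfs2_merge_trans(): fold 'new' into the cached 'old'. */
static void merge_trans(struct trans *old, const struct trans *new)
{
	old->buf_new     += new->buf_new;
	old->buf_rm      += new->buf_rm;
	old->databuf_new += new->databuf_new;
	old->databuf_rm  += new->databuf_rm;
	old->revoke      += new->revoke;
	old->revoke_rm   += new->revoke_rm;
	/* in the kernel, tr_buf/tr_databuf are then spliced onto 'old' */
}

int main(void)
{
	struct trans cached = { .buf_new = 5 };
	struct trans incoming = { .buf_new = 3, .buf_rm = 1 };

	merge_trans(&cached, &incoming);
	printf("net new bufs = %u\n", cached.buf_new - cached.buf_rm);	/* 7 */
	return 0;
}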
@@ -807,10 +816,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
807 down_write(&sdp->sd_log_flush_lock); 816 down_write(&sdp->sd_log_flush_lock);
808 817
809 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); 818 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
810 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
811 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 819 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
812 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
813 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
814 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); 820 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
815 821
816 sdp->sd_log_flush_head = sdp->sd_log_head; 822 sdp->sd_log_flush_head = sdp->sd_log_head;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 76693793cedd..a294d8d8bcd4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -146,8 +146,8 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
146 struct gfs2_journal_extent *je; 146 struct gfs2_journal_extent *je;
147 u64 block; 147 u64 block;
148 148
149 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { 149 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) {
150 if (lbn >= je->lblock && lbn < je->lblock + je->blocks) { 150 if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) {
151 block = je->dblock + lbn - je->lblock; 151 block = je->dblock + lbn - je->lblock;
152 gfs2_log_incr_head(sdp); 152 gfs2_log_incr_head(sdp);
153 return block; 153 return block;
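gfs2_log_bmap() resolves a logical journal block through the extent map built at mount time: each extent covers [lblock, lblock + blocks) in the journal file and maps linearly to disk starting at dblock. A runnable reduction of the lookup; the extent values are hypothetical:

#include <stdio.h>
#include <stdint.h>

struct journal_extent { uint64_t lblock, dblock, blocks; };

/* Translate a logical journal block to its disk address, or 0 if unmapped. */
static uint64_t log_bmap(const struct journal_extent *ext, int n, uint64_t lbn)
{
	for (int i = 0; i < n; i++)
		if (lbn >= ext[i].lblock && lbn < ext[i].lblock + ext[i].blocks)
			return ext[i].dblock + (lbn - ext[i].lblock);
	return 0;
}

int main(void)
{
	/* A journal laid out as two extents. */
	struct journal_extent map[] = {
		{ .lblock = 0,   .dblock = 5000, .blocks = 100 },
		{ .lblock = 100, .dblock = 9000, .blocks = 28  },
	};

	printf("lbn 110 -> dblock %llu\n",
	       (unsigned long long)log_bmap(map, 2, 110));	/* 9010 */
	return 0;
}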
@@ -491,44 +491,40 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
491 gfs2_log_unlock(sdp); 491 gfs2_log_unlock(sdp);
492} 492}
493 493
494static void buf_lo_before_commit(struct gfs2_sbd *sdp) 494static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
495{ 495{
496 unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ 496 unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */
497 497 unsigned int nbuf;
498 gfs2_before_commit(sdp, limit, sdp->sd_log_num_buf, 498 if (tr == NULL)
499 &sdp->sd_log_le_buf, 0); 499 return;
500 nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
501 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0);
500} 502}
501 503
502static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 504static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
503{ 505{
504 struct list_head *head = &sdp->sd_log_le_buf; 506 struct list_head *head;
505 struct gfs2_bufdata *bd; 507 struct gfs2_bufdata *bd;
506 508
507 if (tr == NULL) { 509 if (tr == NULL)
508 gfs2_assert(sdp, list_empty(head));
509 return; 510 return;
510 }
511 511
512 head = &tr->tr_buf;
512 while (!list_empty(head)) { 513 while (!list_empty(head)) {
513 bd = list_entry(head->next, struct gfs2_bufdata, bd_list); 514 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
514 list_del_init(&bd->bd_list); 515 list_del_init(&bd->bd_list);
515 sdp->sd_log_num_buf--;
516
517 gfs2_unpin(sdp, bd->bd_bh, tr); 516 gfs2_unpin(sdp, bd->bd_bh, tr);
518 } 517 }
519 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
520} 518}
521 519
522static void buf_lo_before_scan(struct gfs2_jdesc *jd, 520static void buf_lo_before_scan(struct gfs2_jdesc *jd,
523 struct gfs2_log_header_host *head, int pass) 521 struct gfs2_log_header_host *head, int pass)
524{ 522{
525 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
526
527 if (pass != 0) 523 if (pass != 0)
528 return; 524 return;
529 525
530 sdp->sd_found_blocks = 0; 526 jd->jd_found_blocks = 0;
531 sdp->sd_replayed_blocks = 0; 527 jd->jd_replayed_blocks = 0;
532} 528}
533 529
534static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 530static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -551,9 +547,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
551 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { 547 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
552 blkno = be64_to_cpu(*ptr++); 548 blkno = be64_to_cpu(*ptr++);
553 549
554 sdp->sd_found_blocks++; 550 jd->jd_found_blocks++;
555 551
556 if (gfs2_revoke_check(sdp, blkno, start)) 552 if (gfs2_revoke_check(jd, blkno, start))
557 continue; 553 continue;
558 554
559 error = gfs2_replay_read_block(jd, start, &bh_log); 555 error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -574,7 +570,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
574 if (error) 570 if (error)
575 break; 571 break;
576 572
577 sdp->sd_replayed_blocks++; 573 jd->jd_replayed_blocks++;
578 } 574 }
579 575
580 return error; 576 return error;
@@ -617,10 +613,10 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
617 gfs2_meta_sync(ip->i_gl); 613 gfs2_meta_sync(ip->i_gl);
618 614
619 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", 615 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
620 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); 616 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
621} 617}
622 618
623static void revoke_lo_before_commit(struct gfs2_sbd *sdp) 619static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
624{ 620{
625 struct gfs2_meta_header *mh; 621 struct gfs2_meta_header *mh;
626 unsigned int offset; 622 unsigned int offset;
@@ -679,13 +675,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
679static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 675static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
680 struct gfs2_log_header_host *head, int pass) 676 struct gfs2_log_header_host *head, int pass)
681{ 677{
682 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
683
684 if (pass != 0) 678 if (pass != 0)
685 return; 679 return;
686 680
687 sdp->sd_found_revokes = 0; 681 jd->jd_found_revokes = 0;
688 sdp->sd_replay_tail = head->lh_tail; 682 jd->jd_replay_tail = head->lh_tail;
689} 683}
690 684
691static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 685static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -717,13 +711,13 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
717 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { 711 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
718 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); 712 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
719 713
720 error = gfs2_revoke_add(sdp, blkno, start); 714 error = gfs2_revoke_add(jd, blkno, start);
721 if (error < 0) { 715 if (error < 0) {
722 brelse(bh); 716 brelse(bh);
723 return error; 717 return error;
724 } 718 }
725 else if (error) 719 else if (error)
726 sdp->sd_found_revokes++; 720 jd->jd_found_revokes++;
727 721
728 if (!--revokes) 722 if (!--revokes)
729 break; 723 break;
@@ -743,16 +737,16 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
743 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 737 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
744 738
745 if (error) { 739 if (error) {
746 gfs2_revoke_clean(sdp); 740 gfs2_revoke_clean(jd);
747 return; 741 return;
748 } 742 }
749 if (pass != 1) 743 if (pass != 1)
750 return; 744 return;
751 745
752 fs_info(sdp, "jid=%u: Found %u revoke tags\n", 746 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
753 jd->jd_jid, sdp->sd_found_revokes); 747 jd->jd_jid, jd->jd_found_revokes);
754 748
755 gfs2_revoke_clean(sdp); 749 gfs2_revoke_clean(jd);
756} 750}
757 751
758/** 752/**
@@ -760,12 +754,14 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
760 * 754 *
761 */ 755 */
762 756
763static void databuf_lo_before_commit(struct gfs2_sbd *sdp) 757static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
764{ 758{
765 unsigned int limit = buf_limit(sdp) / 2; 759 unsigned int limit = databuf_limit(sdp);
766 760 unsigned int nbuf;
767 gfs2_before_commit(sdp, limit, sdp->sd_log_num_databuf, 761 if (tr == NULL)
768 &sdp->sd_log_le_databuf, 1); 762 return;
763 nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
764 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1);
769} 765}
770 766
771static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 767static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -789,9 +785,9 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
789 blkno = be64_to_cpu(*ptr++); 785 blkno = be64_to_cpu(*ptr++);
790 esc = be64_to_cpu(*ptr++); 786 esc = be64_to_cpu(*ptr++);
791 787
792 sdp->sd_found_blocks++; 788 jd->jd_found_blocks++;
793 789
794 if (gfs2_revoke_check(sdp, blkno, start)) 790 if (gfs2_revoke_check(jd, blkno, start))
795 continue; 791 continue;
796 792
797 error = gfs2_replay_read_block(jd, start, &bh_log); 793 error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -811,7 +807,7 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
811 brelse(bh_log); 807 brelse(bh_log);
812 brelse(bh_ip); 808 brelse(bh_ip);
813 809
814 sdp->sd_replayed_blocks++; 810 jd->jd_replayed_blocks++;
815 } 811 }
816 812
817 return error; 813 return error;
@@ -835,26 +831,23 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
835 gfs2_meta_sync(ip->i_gl); 831 gfs2_meta_sync(ip->i_gl);
836 832
837 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", 833 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
838 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); 834 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
839} 835}
840 836
841static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 837static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
842{ 838{
843 struct list_head *head = &sdp->sd_log_le_databuf; 839 struct list_head *head;
844 struct gfs2_bufdata *bd; 840 struct gfs2_bufdata *bd;
845 841
846 if (tr == NULL) { 842 if (tr == NULL)
847 gfs2_assert(sdp, list_empty(head));
848 return; 843 return;
849 }
850 844
845 head = &tr->tr_databuf;
851 while (!list_empty(head)) { 846 while (!list_empty(head)) {
852 bd = list_entry(head->next, struct gfs2_bufdata, bd_list); 847 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
853 list_del_init(&bd->bd_list); 848 list_del_init(&bd->bd_list);
854 sdp->sd_log_num_databuf--;
855 gfs2_unpin(sdp, bd->bd_bh, tr); 849 gfs2_unpin(sdp, bd->bd_bh, tr);
856 } 850 }
857 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
858} 851}
859 852
860 853
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 9ca2e6438419..a65a7ba32ffd 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -46,12 +46,13 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
46 return limit; 46 return limit;
47} 47}
48 48
49static inline void lops_before_commit(struct gfs2_sbd *sdp) 49static inline void lops_before_commit(struct gfs2_sbd *sdp,
50 struct gfs2_trans *tr)
50{ 51{
51 int x; 52 int x;
52 for (x = 0; gfs2_log_ops[x]; x++) 53 for (x = 0; gfs2_log_ops[x]; x++)
53 if (gfs2_log_ops[x]->lo_before_commit) 54 if (gfs2_log_ops[x]->lo_before_commit)
54 gfs2_log_ops[x]->lo_before_commit(sdp); 55 gfs2_log_ops[x]->lo_before_commit(sdp, tr);
55} 56}
56 57
57static inline void lops_after_commit(struct gfs2_sbd *sdp, 58static inline void lops_after_commit(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c272e73063de..82b6ac829656 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/slab.h> 12#include <linux/slab.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -165,7 +167,7 @@ static int __init init_gfs2_fs(void)
165 167
166 gfs2_register_debugfs(); 168 gfs2_register_debugfs();
167 169
168 printk("GFS2 installed\n"); 170 pr_info("GFS2 installed\n");
169 171
170 return 0; 172 return 0;
171 173
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index c7f24690ed05..2cf09b63a6b4 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -97,6 +97,11 @@ const struct address_space_operations gfs2_meta_aops = {
97 .releasepage = gfs2_releasepage, 97 .releasepage = gfs2_releasepage,
98}; 98};
99 99
100const struct address_space_operations gfs2_rgrp_aops = {
101 .writepage = gfs2_aspace_writepage,
102 .releasepage = gfs2_releasepage,
103};
104
100/** 105/**
101 * gfs2_getbuf - Get a buffer with a given address space 106 * gfs2_getbuf - Get a buffer with a given address space
102 * @gl: the glock 107 * @gl: the glock
@@ -267,15 +272,10 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
267 trace_gfs2_pin(bd, 0); 272 trace_gfs2_pin(bd, 0);
268 atomic_dec(&sdp->sd_log_pinned); 273 atomic_dec(&sdp->sd_log_pinned);
269 list_del_init(&bd->bd_list); 274 list_del_init(&bd->bd_list);
270 if (meta) { 275 if (meta)
271 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
272 sdp->sd_log_num_buf--;
273 tr->tr_num_buf_rm++; 276 tr->tr_num_buf_rm++;
274 } else { 277 else
275 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf);
276 sdp->sd_log_num_databuf--;
277 tr->tr_num_databuf_rm++; 278 tr->tr_num_databuf_rm++;
278 }
279 tr->tr_touched = 1; 279 tr->tr_touched = 1;
280 was_pinned = 1; 280 was_pinned = 1;
281 brelse(bh); 281 brelse(bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 4823b934208a..ac5d8027d335 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -38,12 +38,15 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
38} 38}
39 39
40extern const struct address_space_operations gfs2_meta_aops; 40extern const struct address_space_operations gfs2_meta_aops;
41extern const struct address_space_operations gfs2_rgrp_aops;
41 42
42static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) 43static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
43{ 44{
44 struct inode *inode = mapping->host; 45 struct inode *inode = mapping->host;
45 if (mapping->a_ops == &gfs2_meta_aops) 46 if (mapping->a_ops == &gfs2_meta_aops)
46 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; 47 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
48 else if (mapping->a_ops == &gfs2_rgrp_aops)
49 return container_of(mapping, struct gfs2_sbd, sd_aspace);
47 else 50 else
48 return inode->i_sb->s_fs_info; 51 return inode->i_sb->s_fs_info;
49} 52}
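The new branch works because sd_aspace is embedded directly in struct gfs2_sbd, so container_of() can step back from the address_space to its superblock; the gfs2_meta_aops case instead relies on the mapping sitting immediately after its glock. container_of() is plain pointer arithmetic over offsetof(), as this runnable model shows:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct address_space { void *host; };

struct sbd {
	int sd_fsid;
	struct address_space sd_aspace;	/* embedded, as in gfs2_sbd */
};

int main(void)
{
	struct sbd s = { .sd_fsid = 7 };
	struct address_space *mapping = &s.sd_aspace;

	/* Recover the containing sbd from the embedded mapping. */
	struct sbd *back = container_of(mapping, struct sbd, sd_aspace);
	printf("fsid = %d\n", back->sd_fsid);	/* 7 */
	return 0;
}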
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c6872d09561a..22f954051bb8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -104,7 +106,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
104 mapping = &sdp->sd_aspace; 106 mapping = &sdp->sd_aspace;
105 107
106 address_space_init_once(mapping); 108 address_space_init_once(mapping);
107 mapping->a_ops = &gfs2_meta_aops; 109 mapping->a_ops = &gfs2_rgrp_aops;
108 mapping->host = sb->s_bdev->bd_inode; 110 mapping->host = sb->s_bdev->bd_inode;
109 mapping->flags = 0; 111 mapping->flags = 0;
110 mapping_set_gfp_mask(mapping, GFP_NOFS); 112 mapping_set_gfp_mask(mapping, GFP_NOFS);
@@ -114,9 +116,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
114 116
115 spin_lock_init(&sdp->sd_log_lock); 117 spin_lock_init(&sdp->sd_log_lock);
116 atomic_set(&sdp->sd_log_pinned, 0); 118 atomic_set(&sdp->sd_log_pinned, 0);
117 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
118 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 119 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
119 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
120 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 120 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
121 spin_lock_init(&sdp->sd_ordered_lock); 121 spin_lock_init(&sdp->sd_ordered_lock);
122 122
@@ -130,8 +130,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
130 atomic_set(&sdp->sd_log_in_flight, 0); 130 atomic_set(&sdp->sd_log_in_flight, 0);
131 init_waitqueue_head(&sdp->sd_log_flush_wait); 131 init_waitqueue_head(&sdp->sd_log_flush_wait);
132 132
133 INIT_LIST_HEAD(&sdp->sd_revoke_list);
134
135 return sdp; 133 return sdp;
136} 134}
137 135
@@ -154,7 +152,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
154 if (sb->sb_magic != GFS2_MAGIC || 152 if (sb->sb_magic != GFS2_MAGIC ||
155 sb->sb_type != GFS2_METATYPE_SB) { 153 sb->sb_type != GFS2_METATYPE_SB) {
156 if (!silent) 154 if (!silent)
157 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); 155 pr_warn("not a GFS2 filesystem\n");
158 return -EINVAL; 156 return -EINVAL;
159 } 157 }
160 158
@@ -176,7 +174,7 @@ static void end_bio_io_page(struct bio *bio, int error)
176 if (!error) 174 if (!error)
177 SetPageUptodate(page); 175 SetPageUptodate(page);
178 else 176 else
179 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); 177 pr_warn("error %d reading superblock\n", error);
180 unlock_page(page); 178 unlock_page(page);
181} 179}
182 180
@@ -519,67 +517,6 @@ out:
519 return ret; 517 return ret;
520} 518}
521 519
522/**
523 * map_journal_extents - create a reusable "extent" mapping from all logical
524 * blocks to all physical blocks for the given journal. This will save
525 * us time when writing journal blocks. Most journals will have only one
526 * extent that maps all their logical blocks. That's because gfs2.mkfs
527 * arranges the journal blocks sequentially to maximize performance.
528 * So the extent would map the first block for the entire file length.
529 * However, gfs2_jadd can happen while file activity is happening, so
530 * those journals may not be sequential. Less likely is the case where
531 * the users created their own journals by mounting the metafs and
532 * laying it out. But it's still possible. These journals might have
533 * several extents.
534 *
535 * TODO: This should be done in bigger chunks rather than one block at a time,
536 * but since it's only done at mount time, I'm not worried about the
537 * time it takes.
538 */
539static int map_journal_extents(struct gfs2_sbd *sdp)
540{
541 struct gfs2_jdesc *jd = sdp->sd_jdesc;
542 unsigned int lb;
543 u64 db, prev_db; /* logical block, disk block, prev disk block */
544 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
545 struct gfs2_journal_extent *jext = NULL;
546 struct buffer_head bh;
547 int rc = 0;
548
549 prev_db = 0;
550
551 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
552 bh.b_state = 0;
553 bh.b_blocknr = 0;
554 bh.b_size = 1 << ip->i_inode.i_blkbits;
555 rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0);
556 db = bh.b_blocknr;
557 if (rc || !db) {
558 printk(KERN_INFO "GFS2 journal mapping error %d: lb="
559 "%u db=%llu\n", rc, lb, (unsigned long long)db);
560 break;
561 }
562 if (!prev_db || db != prev_db + 1) {
563 jext = kzalloc(sizeof(struct gfs2_journal_extent),
564 GFP_KERNEL);
565 if (!jext) {
566 printk(KERN_INFO "GFS2 error: out of memory "
567 "mapping journal extents.\n");
568 rc = -ENOMEM;
569 break;
570 }
571 jext->dblock = db;
572 jext->lblock = lb;
573 jext->blocks = 1;
574 list_add_tail(&jext->extent_list, &jd->extent_list);
575 } else {
576 jext->blocks++;
577 }
578 prev_db = db;
579 }
580 return rc;
581}
582
583static void gfs2_others_may_mount(struct gfs2_sbd *sdp) 520static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
584{ 521{
585 char *message = "FIRSTMOUNT=Done"; 522 char *message = "FIRSTMOUNT=Done";
@@ -638,6 +575,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
638 break; 575 break;
639 576
640 INIT_LIST_HEAD(&jd->extent_list); 577 INIT_LIST_HEAD(&jd->extent_list);
578 INIT_LIST_HEAD(&jd->jd_revoke_list);
579
641 INIT_WORK(&jd->jd_work, gfs2_recover_func); 580 INIT_WORK(&jd->jd_work, gfs2_recover_func);
642 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); 581 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
643 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 582 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
@@ -781,7 +720,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
781 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); 720 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
782 721
783 /* Map the extents for this journal's blocks */ 722 /* Map the extents for this journal's blocks */
784 map_journal_extents(sdp); 723 gfs2_map_journal_extents(sdp, sdp->sd_jdesc);
785 } 724 }
786 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); 725 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
787 726
@@ -1008,7 +947,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1008 lm = &gfs2_dlm_ops; 947 lm = &gfs2_dlm_ops;
1009#endif 948#endif
1010 } else { 949 } else {
1011 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); 950 pr_info("can't find protocol %s\n", proto);
1012 return -ENOENT; 951 return -ENOENT;
1013 } 952 }
1014 953
@@ -1115,7 +1054,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1115 1054
1116 sdp = init_sbd(sb); 1055 sdp = init_sbd(sb);
1117 if (!sdp) { 1056 if (!sdp) {
1118 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); 1057 pr_warn("can't alloc struct gfs2_sbd\n");
1119 return -ENOMEM; 1058 return -ENOMEM;
1120 } 1059 }
1121 sdp->sd_args = *args; 1060 sdp->sd_args = *args;
@@ -1363,7 +1302,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1363 1302
1364 error = gfs2_mount_args(&args, data); 1303 error = gfs2_mount_args(&args, data);
1365 if (error) { 1304 if (error) {
1366 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1305 pr_warn("can't parse mount arguments\n");
1367 goto error_super; 1306 goto error_super;
1368 } 1307 }
1369 1308
@@ -1413,15 +1352,15 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
1413 1352
1414 error = kern_path(dev_name, LOOKUP_FOLLOW, &path); 1353 error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
1415 if (error) { 1354 if (error) {
1416 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1355 pr_warn("path_lookup on %s returned error %d\n",
1417 dev_name, error); 1356 dev_name, error);
1418 return ERR_PTR(error); 1357 return ERR_PTR(error);
1419 } 1358 }
1420 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, 1359 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
1421 path.dentry->d_inode->i_sb->s_bdev); 1360 path.dentry->d_inode->i_sb->s_bdev);
1422 path_put(&path); 1361 path_put(&path);
1423 if (IS_ERR(s)) { 1362 if (IS_ERR(s)) {
1424 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1363 pr_warn("gfs2 mount does not exist\n");
1425 return ERR_CAST(s); 1364 return ERR_CAST(s);
1426 } 1365 }
1427 if ((flags ^ s->s_flags) & MS_RDONLY) { 1366 if ((flags ^ s->s_flags) & MS_RDONLY) {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8bec0e3192dd..c4effff7cf55 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -36,6 +36,8 @@
36 * the quota file, so it is not being constantly read. 36 * the quota file, so it is not being constantly read.
37 */ 37 */
38 38
39#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
40
39#include <linux/sched.h> 41#include <linux/sched.h>
40#include <linux/slab.h> 42#include <linux/slab.h>
41#include <linux/mm.h> 43#include <linux/mm.h>
@@ -330,6 +332,7 @@ static int slot_get(struct gfs2_quota_data *qd)
330 if (bit < sdp->sd_quota_slots) { 332 if (bit < sdp->sd_quota_slots) {
331 set_bit(bit, sdp->sd_quota_bitmap); 333 set_bit(bit, sdp->sd_quota_bitmap);
332 qd->qd_slot = bit; 334 qd->qd_slot = bit;
335 error = 0;
333out: 336out:
334 qd->qd_slot_count++; 337 qd->qd_slot_count++;
335 } 338 }
@@ -1081,10 +1084,10 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
1081{ 1084{
1082 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 1085 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
1083 1086
1084 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n", 1087 fs_info(sdp, "quota %s for %s %u\n",
1085 sdp->sd_fsname, type, 1088 type,
1086 (qd->qd_id.type == USRQUOTA) ? "user" : "group", 1089 (qd->qd_id.type == USRQUOTA) ? "user" : "group",
1087 from_kqid(&init_user_ns, qd->qd_id)); 1090 from_kqid(&init_user_ns, qd->qd_id));
1088 1091
1089 return 0; 1092 return 0;
1090} 1093}
@@ -1242,14 +1245,13 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1242 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); 1245 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
1243 bm_size *= sizeof(unsigned long); 1246 bm_size *= sizeof(unsigned long);
1244 error = -ENOMEM; 1247 error = -ENOMEM;
1245 sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN); 1248 sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);
1246 if (sdp->sd_quota_bitmap == NULL) 1249 if (sdp->sd_quota_bitmap == NULL)
1247 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); 1250 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS |
1251 __GFP_ZERO, PAGE_KERNEL);
1248 if (!sdp->sd_quota_bitmap) 1252 if (!sdp->sd_quota_bitmap)
1249 return error; 1253 return error;
1250 1254
1251 memset(sdp->sd_quota_bitmap, 0, bm_size);
1252
1253 for (x = 0; x < blocks; x++) { 1255 for (x = 0; x < blocks; x++) {
1254 struct buffer_head *bh; 1256 struct buffer_head *bh;
1255 const struct gfs2_quota_change *qc; 1257 const struct gfs2_quota_change *qc;
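The kmalloc/memset pair becomes a zeroed allocation at the source: kzalloc() in the common case, falling back to __vmalloc() with __GFP_ZERO (the three-argument form used in the tree at this point) when the bitmap is too big for a contiguous allocation. The fallback pattern as a kernel-context sketch, with the error handling trimmed:

#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *alloc_quota_bitmap(size_t bm_size)
{
	void *bitmap;

	/* Fast path: physically contiguous and pre-zeroed. */
	bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);
	if (bitmap == NULL)
		/* Large bitmaps: vmalloc space, still zeroed. */
		bitmap = __vmalloc(bm_size, GFP_NOFS | __GFP_ZERO,
				   PAGE_KERNEL);
	return bitmap;
}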
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 963b2d75200c..7ad4094d68c0 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -52,9 +52,9 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
52 return error; 52 return error;
53} 53}
54 54
55int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) 55int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
56{ 56{
57 struct list_head *head = &sdp->sd_revoke_list; 57 struct list_head *head = &jd->jd_revoke_list;
58 struct gfs2_revoke_replay *rr; 58 struct gfs2_revoke_replay *rr;
59 int found = 0; 59 int found = 0;
60 60
@@ -81,13 +81,13 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
81 return 1; 81 return 1;
82} 82}
83 83
84int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) 84int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
85{ 85{
86 struct gfs2_revoke_replay *rr; 86 struct gfs2_revoke_replay *rr;
87 int wrap, a, b, revoke; 87 int wrap, a, b, revoke;
88 int found = 0; 88 int found = 0;
89 89
90 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { 90 list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) {
91 if (rr->rr_blkno == blkno) { 91 if (rr->rr_blkno == blkno) {
92 found = 1; 92 found = 1;
93 break; 93 break;
@@ -97,17 +97,17 @@ int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
97 if (!found) 97 if (!found)
98 return 0; 98 return 0;
99 99
100 wrap = (rr->rr_where < sdp->sd_replay_tail); 100 wrap = (rr->rr_where < jd->jd_replay_tail);
101 a = (sdp->sd_replay_tail < where); 101 a = (jd->jd_replay_tail < where);
102 b = (where < rr->rr_where); 102 b = (where < rr->rr_where);
103 revoke = (wrap) ? (a || b) : (a && b); 103 revoke = (wrap) ? (a || b) : (a && b);
104 104
105 return revoke; 105 return revoke;
106} 106}
107 107
108void gfs2_revoke_clean(struct gfs2_sbd *sdp) 108void gfs2_revoke_clean(struct gfs2_jdesc *jd)
109{ 109{
110 struct list_head *head = &sdp->sd_revoke_list; 110 struct list_head *head = &jd->jd_revoke_list;
111 struct gfs2_revoke_replay *rr; 111 struct gfs2_revoke_replay *rr;
112 112
113 while (!list_empty(head)) { 113 while (!list_empty(head)) {
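gfs2_revoke_check() has to honour the circular journal: a revoke recorded at position R cancels a block seen at position W only when W falls in the circular interval between the log tail T and R, and that interval wraps whenever R < T. That is exactly the wrap ? (a || b) : (a && b) expression above; a runnable check with sample positions:

#include <stdio.h>

/* Is 'where' inside the circular interval (tail, rr_where)? */
static int revoked(unsigned tail, unsigned rr_where, unsigned where)
{
	int wrap = rr_where < tail;
	int a = tail < where;
	int b = where < rr_where;

	return wrap ? (a || b) : (a && b);
}

int main(void)
{
	/* Journal tail at 50, revoke logged at position 70. */
	printf("%d\n", revoked(50, 70, 60));	/* 1: between tail and revoke */
	printf("%d\n", revoked(50, 70, 80));	/* 0: after the revoke */
	/* Revoke at 10 means the interval wrapped past the log end. */
	printf("%d\n", revoked(50, 10, 60));	/* 1 */
	printf("%d\n", revoked(50, 10, 30));	/* 0 */
	return 0;
}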
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 2226136c7647..6142836cce96 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -23,9 +23,9 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
23extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 23extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
24 struct buffer_head **bh); 24 struct buffer_head **bh);
25 25
26extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 26extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
27extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 27extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
28extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); 28extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
29 29
30extern int gfs2_find_jhead(struct gfs2_jdesc *jd, 30extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
31 struct gfs2_log_header_host *head); 31 struct gfs2_log_header_host *head);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index a1da21349235..281a7716e3f3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/slab.h> 12#include <linux/slab.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -99,12 +101,12 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
99 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; 101 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
100 102
101 if (unlikely(!valid_change[new_state * 4 + cur_state])) { 103 if (unlikely(!valid_change[new_state * 4 + cur_state])) {
102 printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, " 104 pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
103 "new_state=%d\n", rbm->offset, cur_state, new_state); 105 rbm->offset, cur_state, new_state);
104 printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n", 106 pr_warn("rgrp=0x%llx bi_start=0x%x\n",
105 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); 107 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
106 printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n", 108 pr_warn("bi_offset=0x%x bi_len=0x%x\n",
107 bi->bi_offset, bi->bi_len); 109 bi->bi_offset, bi->bi_len);
108 dump_stack(); 110 dump_stack();
109 gfs2_consist_rgrpd(rbm->rgd); 111 gfs2_consist_rgrpd(rbm->rgd);
110 return; 112 return;
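Resource-group bitmaps pack two bits of allocation state per block, which is why gfs2_setbit() reads the current state as (*byte1 >> bit) & GFS2_BIT_MASK and indexes the transition table as new_state * 4 + cur_state. The extraction step in runnable form, with a made-up byte value:

#include <stdio.h>

#define BIT_MASK 3	/* two bits per block: four allocation states */

int main(void)
{
	unsigned char byte = 0xB4;	/* four two-bit states, LSB first: 0, 1, 3, 2 */

	for (unsigned bit = 0; bit < 8; bit += 2)
		printf("block %u: state %u\n", bit / 2,
		       (byte >> bit) & BIT_MASK);
	return 0;
}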
@@ -736,11 +738,11 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
736 738
737static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) 739static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
738{ 740{
739 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); 741 pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
740 printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); 742 pr_info("ri_length = %u\n", rgd->rd_length);
741 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); 743 pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
742 printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); 744 pr_info("ri_data = %u\n", rgd->rd_data);
743 printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); 745 pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
744} 746}
745 747
746/** 748/**
@@ -1102,7 +1104,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1102 * Returns: errno 1104 * Returns: errno
1103 */ 1105 */
1104 1106
1105int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) 1107static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1106{ 1108{
1107 struct gfs2_sbd *sdp = rgd->rd_sbd; 1109 struct gfs2_sbd *sdp = rgd->rd_sbd;
1108 struct gfs2_glock *gl = rgd->rd_gl; 1110 struct gfs2_glock *gl = rgd->rd_gl;
@@ -1169,7 +1171,7 @@ fail:
1169 return error; 1171 return error;
1170} 1172}
1171 1173
1172int update_rgrp_lvb(struct gfs2_rgrpd *rgd) 1174static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1173{ 1175{
1174 u32 rl_flags; 1176 u32 rl_flags;
1175 1177
@@ -2278,7 +2280,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2278 } 2280 }
2279 } 2281 }
2280 if (rbm.rgd->rd_free < *nblocks) { 2282 if (rbm.rgd->rd_free < *nblocks) {
2281 printk(KERN_WARNING "nblocks=%u\n", *nblocks); 2283 pr_warn("nblocks=%u\n", *nblocks);
2282 goto rgrp_error; 2284 goto rgrp_error;
2283 } 2285 }
2284 2286
@@ -2296,7 +2298,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2296 2298
2297 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); 2299 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2298 if (dinode) 2300 if (dinode)
2299 gfs2_trans_add_unrevoke(sdp, block, 1); 2301 gfs2_trans_add_unrevoke(sdp, block, *nblocks);
2300 2302
2301 gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); 2303 gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2302 2304
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 60f60f6181f3..de8afad89e51 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/bio.h> 12#include <linux/bio.h>
11#include <linux/sched.h> 13#include <linux/sched.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
@@ -175,8 +177,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
175 break; 177 break;
176 case Opt_debug: 178 case Opt_debug:
177 if (args->ar_errors == GFS2_ERRORS_PANIC) { 179 if (args->ar_errors == GFS2_ERRORS_PANIC) {
178 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " 180 pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
179 "are mutually exclusive.\n");
180 return -EINVAL; 181 return -EINVAL;
181 } 182 }
182 args->ar_debug = 1; 183 args->ar_debug = 1;
@@ -228,21 +229,21 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
228 case Opt_commit: 229 case Opt_commit:
229 rv = match_int(&tmp[0], &args->ar_commit); 230 rv = match_int(&tmp[0], &args->ar_commit);
230 if (rv || args->ar_commit <= 0) { 231 if (rv || args->ar_commit <= 0) {
231 printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n"); 232 pr_warn("commit mount option requires a positive numeric argument\n");
232 return rv ? rv : -EINVAL; 233 return rv ? rv : -EINVAL;
233 } 234 }
234 break; 235 break;
235 case Opt_statfs_quantum: 236 case Opt_statfs_quantum:
236 rv = match_int(&tmp[0], &args->ar_statfs_quantum); 237 rv = match_int(&tmp[0], &args->ar_statfs_quantum);
237 if (rv || args->ar_statfs_quantum < 0) { 238 if (rv || args->ar_statfs_quantum < 0) {
238 printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n"); 239 pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n");
239 return rv ? rv : -EINVAL; 240 return rv ? rv : -EINVAL;
240 } 241 }
241 break; 242 break;
242 case Opt_quota_quantum: 243 case Opt_quota_quantum:
243 rv = match_int(&tmp[0], &args->ar_quota_quantum); 244 rv = match_int(&tmp[0], &args->ar_quota_quantum);
244 if (rv || args->ar_quota_quantum <= 0) { 245 if (rv || args->ar_quota_quantum <= 0) {
245 printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n"); 246 pr_warn("quota_quantum mount option requires a positive numeric argument\n");
246 return rv ? rv : -EINVAL; 247 return rv ? rv : -EINVAL;
247 } 248 }
248 break; 249 break;
@@ -250,7 +251,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
250 rv = match_int(&tmp[0], &args->ar_statfs_percent); 251 rv = match_int(&tmp[0], &args->ar_statfs_percent);
251 if (rv || args->ar_statfs_percent < 0 || 252 if (rv || args->ar_statfs_percent < 0 ||
252 args->ar_statfs_percent > 100) { 253 args->ar_statfs_percent > 100) {
253 printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n"); 254 pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n");
254 return rv ? rv : -EINVAL; 255 return rv ? rv : -EINVAL;
255 } 256 }
256 break; 257 break;
@@ -259,8 +260,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
259 break; 260 break;
260 case Opt_err_panic: 261 case Opt_err_panic:
261 if (args->ar_debug) { 262 if (args->ar_debug) {
262 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " 263 pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
263 "are mutually exclusive.\n");
264 return -EINVAL; 264 return -EINVAL;
265 } 265 }
266 args->ar_errors = GFS2_ERRORS_PANIC; 266 args->ar_errors = GFS2_ERRORS_PANIC;
@@ -279,7 +279,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
279 break; 279 break;
280 case Opt_error: 280 case Opt_error:
281 default: 281 default:
282 printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); 282 pr_warn("invalid mount option: %s\n", o);
283 return -EINVAL; 283 return -EINVAL;
284 } 284 }
285 } 285 }
@@ -295,9 +295,8 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
295 295
296void gfs2_jindex_free(struct gfs2_sbd *sdp) 296void gfs2_jindex_free(struct gfs2_sbd *sdp)
297{ 297{
298 struct list_head list, *head; 298 struct list_head list;
299 struct gfs2_jdesc *jd; 299 struct gfs2_jdesc *jd;
300 struct gfs2_journal_extent *jext;
301 300
302 spin_lock(&sdp->sd_jindex_spin); 301 spin_lock(&sdp->sd_jindex_spin);
303 list_add(&list, &sdp->sd_jindex_list); 302 list_add(&list, &sdp->sd_jindex_list);
@@ -307,14 +306,7 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
307 306
308 while (!list_empty(&list)) { 307 while (!list_empty(&list)) {
309 jd = list_entry(list.next, struct gfs2_jdesc, jd_list); 308 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
310 head = &jd->extent_list; 309 gfs2_free_journal_extents(jd);
311 while (!list_empty(head)) {
312 jext = list_entry(head->next,
313 struct gfs2_journal_extent,
314 extent_list);
315 list_del(&jext->extent_list);
316 kfree(jext);
317 }
318 list_del(&jd->jd_list); 310 list_del(&jd->jd_list);
319 iput(jd->jd_inode); 311 iput(jd->jd_inode);
320 kfree(jd); 312 kfree(jd);
@@ -1175,6 +1167,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1175 struct gfs2_tune *gt = &sdp->sd_tune; 1167 struct gfs2_tune *gt = &sdp->sd_tune;
1176 int error; 1168 int error;
1177 1169
1170 sync_filesystem(sb);
1171
1178 spin_lock(&gt->gt_spin); 1172 spin_lock(&gt->gt_spin);
1179 args.ar_commit = gt->gt_logd_secs; 1173 args.ar_commit = gt->gt_logd_secs;
1180 args.ar_quota_quantum = gt->gt_quota_quantum; 1174 args.ar_quota_quantum = gt->gt_quota_quantum;
@@ -1256,7 +1250,7 @@ static int gfs2_drop_inode(struct inode *inode)
1256{ 1250{
1257 struct gfs2_inode *ip = GFS2_I(inode); 1251 struct gfs2_inode *ip = GFS2_I(inode);
1258 1252
1259 if (inode->i_nlink) { 1253 if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) {
1260 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; 1254 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1261 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1255 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1262 clear_nlink(inode); 1256 clear_nlink(inode);
@@ -1471,6 +1465,11 @@ static void gfs2_evict_inode(struct inode *inode)
1471 struct gfs2_holder gh; 1465 struct gfs2_holder gh;
1472 int error; 1466 int error;
1473 1467
1468 if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
1469 clear_inode(inode);
1470 return;
1471 }
1472
1474 if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) 1473 if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
1475 goto out; 1474 goto out;
1476 1475
@@ -1558,7 +1557,7 @@ out_unlock:
1558 fs_warn(sdp, "gfs2_evict_inode: %d\n", error); 1557 fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
1559out: 1558out:
1560 /* Case 3 starts here */ 1559 /* Case 3 starts here */
1561 truncate_inode_pages(&inode->i_data, 0); 1560 truncate_inode_pages_final(&inode->i_data);
1562 gfs2_rs_delete(ip, NULL); 1561 gfs2_rs_delete(ip, NULL);
1563 gfs2_ordered_del_inode(ip); 1562 gfs2_ordered_del_inode(ip);
1564 clear_inode(inode); 1563 clear_inode(inode);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d09f6edda0ff..de25d5577e5d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -138,9 +140,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
138 if (simple_strtol(buf, NULL, 0) != 1) 140 if (simple_strtol(buf, NULL, 0) != 1)
139 return -EINVAL; 141 return -EINVAL;
140 142
141 gfs2_lm_withdraw(sdp, 143 gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
142 "GFS2: fsid=%s: withdrawing from cluster at user's request\n", 144
143 sdp->sd_fsname);
144 return len; 145 return len;
145} 146}
146 147
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 2b20d7046bf3..bead90d27bad 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -51,6 +53,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
51 if (revokes) 53 if (revokes)
52 tr->tr_reserved += gfs2_struct2blk(sdp, revokes, 54 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
53 sizeof(u64)); 55 sizeof(u64));
56 INIT_LIST_HEAD(&tr->tr_databuf);
57 INIT_LIST_HEAD(&tr->tr_buf);
58
54 sb_start_intwrite(sdp->sd_vfs); 59 sb_start_intwrite(sdp->sd_vfs);
55 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); 60 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
56 61
@@ -96,14 +101,13 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
96 101
97static void gfs2_print_trans(const struct gfs2_trans *tr) 102static void gfs2_print_trans(const struct gfs2_trans *tr)
98{ 103{
99 printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n", 104 pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
100 (void *)tr->tr_ip); 105 pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n",
101 printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", 106 tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
102 tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); 107 pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
103 printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", 108 tr->tr_num_buf_new, tr->tr_num_buf_rm,
104 tr->tr_num_buf_new, tr->tr_num_buf_rm, 109 tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
105 tr->tr_num_databuf_new, tr->tr_num_databuf_rm, 110 tr->tr_num_revoke, tr->tr_num_revoke_rm);
106 tr->tr_num_revoke, tr->tr_num_revoke_rm);
107} 111}
108 112
109void gfs2_trans_end(struct gfs2_sbd *sdp) 113void gfs2_trans_end(struct gfs2_sbd *sdp)
@@ -210,8 +214,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
210 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 214 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
211 gfs2_pin(sdp, bd->bd_bh); 215 gfs2_pin(sdp, bd->bd_bh);
212 tr->tr_num_databuf_new++; 216 tr->tr_num_databuf_new++;
213 sdp->sd_log_num_databuf++; 217 list_add_tail(&bd->bd_list, &tr->tr_databuf);
214 list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
215 } 218 }
216 gfs2_log_unlock(sdp); 219 gfs2_log_unlock(sdp);
217 unlock_buffer(bh); 220 unlock_buffer(bh);
@@ -230,16 +233,14 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
230 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 233 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
231 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; 234 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
232 if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { 235 if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
233 printk(KERN_ERR 236 pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
234 "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
235 (unsigned long long)bd->bd_bh->b_blocknr); 237 (unsigned long long)bd->bd_bh->b_blocknr);
236 BUG(); 238 BUG();
237 } 239 }
238 gfs2_pin(sdp, bd->bd_bh); 240 gfs2_pin(sdp, bd->bd_bh);
239 mh->__pad0 = cpu_to_be64(0); 241 mh->__pad0 = cpu_to_be64(0);
240 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); 242 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
241 sdp->sd_log_num_buf++; 243 list_add(&bd->bd_list, &tr->tr_buf);
242 list_add(&bd->bd_list, &sdp->sd_log_le_buf);
243 tr->tr_num_buf_new++; 244 tr->tr_num_buf_new++;
244} 245}
245 246
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f7109f689e61..86d2035ac669 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/spinlock.h> 12#include <linux/spinlock.h>
11#include <linux/completion.h> 13#include <linux/completion.h>
12#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -30,22 +32,27 @@ mempool_t *gfs2_page_pool __read_mostly;
30 32
31void gfs2_assert_i(struct gfs2_sbd *sdp) 33void gfs2_assert_i(struct gfs2_sbd *sdp)
32{ 34{
33 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", 35 fs_emerg(sdp, "fatal assertion failed\n");
34 sdp->sd_fsname);
35} 36}
36 37
37int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) 38int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
38{ 39{
39 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 40 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
40 const struct lm_lockops *lm = ls->ls_ops; 41 const struct lm_lockops *lm = ls->ls_ops;
41 va_list args; 42 va_list args;
43 struct va_format vaf;
42 44
43 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && 45 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
44 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 46 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
45 return 0; 47 return 0;
46 48
47 va_start(args, fmt); 49 va_start(args, fmt);
48 vprintk(fmt, args); 50
51 vaf.fmt = fmt;
52 vaf.va = &args;
53
54 fs_err(sdp, "%pV", &vaf);
55
49 va_end(args); 56 va_end(args);
50 57
51 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { 58 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
@@ -66,7 +73,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
66 } 73 }
67 74
68 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) 75 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
69 panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); 76 panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname);
70 77
71 return -1; 78 return -1;
72} 79}
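
gfs2_lm_withdraw() now wraps its va_list in a struct va_format and prints it through fs_err()'s "%pV", so the "fsid=%s:" prefix is applied exactly once by the fs_err() macro and callers stop repeating it in every line of their format strings. Userspace printf has no %pV, but the same shape, format the caller's message once and prefix it centrally, can be sketched with vsnprintf (fs_err_demo and struct sbd_demo are illustrative names):

    #include <stdarg.h>
    #include <stdio.h>

    struct sbd_demo { const char *sd_fsname; };

    /* Prefix with the fsid, then emit the caller's formatted message.
     * The kernel avoids the intermediate buffer by handing vsnprintf a
     * struct va_format through the %pV extension instead. */
    static void fs_err_demo(const struct sbd_demo *sdp, const char *fmt, ...)
    {
        char msg[256];
        va_list args;

        va_start(args, fmt);
        vsnprintf(msg, sizeof(msg), fmt, args);
        va_end(args);

        fprintf(stderr, "gfs2: fsid=%s: %s", sdp->sd_fsname, msg);
    }

    int main(void)
    {
        struct sbd_demo sdp = { .sd_fsname = "cluster:vol0" };

        fs_err_demo(&sdp, "fatal: I/O error\n function = %s, line = %u\n",
                    "gfs2_io_error_i", 241);
        return 0;
    }
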
@@ -82,10 +89,9 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
82{ 89{
83 int me; 90 int me;
84 me = gfs2_lm_withdraw(sdp, 91 me = gfs2_lm_withdraw(sdp,
85 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" 92 "fatal: assertion \"%s\" failed\n"
86 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 93 " function = %s, file = %s, line = %u\n",
87 sdp->sd_fsname, assertion, 94 assertion, function, file, line);
88 sdp->sd_fsname, function, file, line);
89 dump_stack(); 95 dump_stack();
90 return (me) ? -1 : -2; 96 return (me) ? -1 : -2;
91} 97}
@@ -105,11 +111,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
105 return -2; 111 return -2;
106 112
107 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) 113 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
108 printk(KERN_WARNING 114 fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
109 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" 115 assertion, function, file, line);
110 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
111 sdp->sd_fsname, assertion,
112 sdp->sd_fsname, function, file, line);
113 116
114 if (sdp->sd_args.ar_debug) 117 if (sdp->sd_args.ar_debug)
115 BUG(); 118 BUG();
@@ -138,10 +141,8 @@ int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
138{ 141{
139 int rv; 142 int rv;
140 rv = gfs2_lm_withdraw(sdp, 143 rv = gfs2_lm_withdraw(sdp,
141 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 144 "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
142 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 145 function, file, line);
143 sdp->sd_fsname,
144 sdp->sd_fsname, function, file, line);
145 return rv; 146 return rv;
146} 147}
147 148
@@ -157,13 +158,12 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
157 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 158 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
158 int rv; 159 int rv;
159 rv = gfs2_lm_withdraw(sdp, 160 rv = gfs2_lm_withdraw(sdp,
160 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 161 "fatal: filesystem consistency error\n"
161 "GFS2: fsid=%s: inode = %llu %llu\n" 162 " inode = %llu %llu\n"
162 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 163 " function = %s, file = %s, line = %u\n",
163 sdp->sd_fsname, 164 (unsigned long long)ip->i_no_formal_ino,
164 sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, 165 (unsigned long long)ip->i_no_addr,
165 (unsigned long long)ip->i_no_addr, 166 function, file, line);
166 sdp->sd_fsname, function, file, line);
167 return rv; 167 return rv;
168} 168}
169 169
@@ -179,12 +179,11 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
179 struct gfs2_sbd *sdp = rgd->rd_sbd; 179 struct gfs2_sbd *sdp = rgd->rd_sbd;
180 int rv; 180 int rv;
181 rv = gfs2_lm_withdraw(sdp, 181 rv = gfs2_lm_withdraw(sdp,
182 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 182 "fatal: filesystem consistency error\n"
183 "GFS2: fsid=%s: RG = %llu\n" 183 " RG = %llu\n"
184 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 184 " function = %s, file = %s, line = %u\n",
185 sdp->sd_fsname, 185 (unsigned long long)rgd->rd_addr,
186 sdp->sd_fsname, (unsigned long long)rgd->rd_addr, 186 function, file, line);
187 sdp->sd_fsname, function, file, line);
188 return rv; 187 return rv;
189} 188}
190 189
@@ -200,12 +199,11 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
200{ 199{
201 int me; 200 int me;
202 me = gfs2_lm_withdraw(sdp, 201 me = gfs2_lm_withdraw(sdp,
203 "GFS2: fsid=%s: fatal: invalid metadata block\n" 202 "fatal: invalid metadata block\n"
204 "GFS2: fsid=%s: bh = %llu (%s)\n" 203 " bh = %llu (%s)\n"
205 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 204 " function = %s, file = %s, line = %u\n",
206 sdp->sd_fsname, 205 (unsigned long long)bh->b_blocknr, type,
207 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, 206 function, file, line);
208 sdp->sd_fsname, function, file, line);
209 return (me) ? -1 : -2; 207 return (me) ? -1 : -2;
210} 208}
211 209
@@ -221,12 +219,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
221{ 219{
222 int me; 220 int me;
223 me = gfs2_lm_withdraw(sdp, 221 me = gfs2_lm_withdraw(sdp,
224 "GFS2: fsid=%s: fatal: invalid metadata block\n" 222 "fatal: invalid metadata block\n"
225 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" 223 " bh = %llu (type: exp=%u, found=%u)\n"
226 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 224 " function = %s, file = %s, line = %u\n",
227 sdp->sd_fsname, 225 (unsigned long long)bh->b_blocknr, type, t,
228 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, 226 function, file, line);
229 sdp->sd_fsname, function, file, line);
230 return (me) ? -1 : -2; 227 return (me) ? -1 : -2;
231} 228}
232 229
@@ -241,10 +238,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
241{ 238{
242 int rv; 239 int rv;
243 rv = gfs2_lm_withdraw(sdp, 240 rv = gfs2_lm_withdraw(sdp,
244 "GFS2: fsid=%s: fatal: I/O error\n" 241 "fatal: I/O error\n"
245 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 242 " function = %s, file = %s, line = %u\n",
246 sdp->sd_fsname, 243 function, file, line);
247 sdp->sd_fsname, function, file, line);
248 return rv; 244 return rv;
249} 245}
250 246
@@ -259,12 +255,11 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
259{ 255{
260 int rv; 256 int rv;
261 rv = gfs2_lm_withdraw(sdp, 257 rv = gfs2_lm_withdraw(sdp,
262 "GFS2: fsid=%s: fatal: I/O error\n" 258 "fatal: I/O error\n"
263 "GFS2: fsid=%s: block = %llu\n" 259 " block = %llu\n"
264 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 260 " function = %s, file = %s, line = %u\n",
265 sdp->sd_fsname, 261 (unsigned long long)bh->b_blocknr,
266 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, 262 function, file, line);
267 sdp->sd_fsname, function, file, line);
268 return rv; 263 return rv;
269} 264}
270 265
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index b7ffb09b99ea..cbdcbdf39614 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -10,22 +10,23 @@
10#ifndef __UTIL_DOT_H__ 10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__ 11#define __UTIL_DOT_H__
12 12
13#ifdef pr_fmt
14#undef pr_fmt
15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16#endif
17
13#include <linux/mempool.h> 18#include <linux/mempool.h>
14 19
15#include "incore.h" 20#include "incore.h"
16 21
17#define fs_printk(level, fs, fmt, arg...) \ 22#define fs_emerg(fs, fmt, ...) \
18 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) 23 pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
19 24#define fs_warn(fs, fmt, ...) \
20#define fs_info(fs, fmt, arg...) \ 25 pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
21 fs_printk(KERN_INFO , fs , fmt , ## arg) 26#define fs_err(fs, fmt, ...) \
22 27 pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
23#define fs_warn(fs, fmt, arg...) \ 28#define fs_info(fs, fmt, ...) \
24 fs_printk(KERN_WARNING , fs , fmt , ## arg) 29 pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
25
26#define fs_err(fs, fmt, arg...) \
27 fs_printk(KERN_ERR, fs , fmt , ## arg)
28
29 30
30void gfs2_assert_i(struct gfs2_sbd *sdp); 31void gfs2_assert_i(struct gfs2_sbd *sdp);
31 32
@@ -85,7 +86,7 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
85 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; 86 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
86 u32 magic = be32_to_cpu(mh->mh_magic); 87 u32 magic = be32_to_cpu(mh->mh_magic);
87 if (unlikely(magic != GFS2_MAGIC)) { 88 if (unlikely(magic != GFS2_MAGIC)) {
88 printk(KERN_ERR "GFS2: Magic number missing at %llu\n", 89 pr_err("Magic number missing at %llu\n",
89 (unsigned long long)bh->b_blocknr); 90 (unsigned long long)bh->b_blocknr);
90 return -EIO; 91 return -EIO;
91 } 92 }
@@ -164,7 +165,7 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
164#define gfs2_tune_get(sdp, field) \ 165#define gfs2_tune_get(sdp, field) \
165gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) 166gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
166 167
167int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); 168__printf(2, 3)
169int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...);
168 170
169#endif /* __UTIL_DOT_H__ */ 171#endif /* __UTIL_DOT_H__ */
170
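
The prototype change to gfs2_lm_withdraw() adds __printf(2, 3), the kernel's wrapper around GCC's format attribute: parameter 2 is the format string and checking starts at parameter 3, so every consistency-check helper above now has its arguments type-checked against its format at compile time (which is also why fmt became const char *). The underlying attribute in a standalone C sketch (log_fs is an illustrative name):

    #include <stdarg.h>
    #include <stdio.h>

    /* Equivalent of the kernel's __printf(1, 2): callers' format strings
     * are now checked against their argument types at compile time. */
    __attribute__((format(printf, 1, 2)))
    static void log_fs(const char *fmt, ...)
    {
        va_list args;

        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
    }

    int main(void)
    {
        log_fs("block = %llu\n", 12345ULL);  /* OK */
        /* log_fs("block = %llu\n", 42);        would warn: int vs %llu */
        return 0;
    }
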
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 380ab31b5e0f..9e2fecd62f62 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -547,7 +547,7 @@ out:
547 547
548void hfs_evict_inode(struct inode *inode) 548void hfs_evict_inode(struct inode *inode)
549{ 549{
550 truncate_inode_pages(&inode->i_data, 0); 550 truncate_inode_pages_final(&inode->i_data);
551 clear_inode(inode); 551 clear_inode(inode);
552 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { 552 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
553 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 553 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 2d2039e754cd..eee7206c38d1 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
112 112
113static int hfs_remount(struct super_block *sb, int *flags, char *data) 113static int hfs_remount(struct super_block *sb, int *flags, char *data)
114{ 114{
115 sync_filesystem(sb);
115 *flags |= MS_NODIRATIME; 116 *flags |= MS_NODIRATIME;
116 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 117 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
117 return 0; 118 return 0;
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 0f47890299c4..caf89a7be0a1 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -11,7 +11,7 @@
11 11
12static struct kmem_cache *hfsplus_attr_tree_cachep; 12static struct kmem_cache *hfsplus_attr_tree_cachep;
13 13
14int hfsplus_create_attr_tree_cache(void) 14int __init hfsplus_create_attr_tree_cache(void)
15{ 15{
16 if (hfsplus_attr_tree_cachep) 16 if (hfsplus_attr_tree_cachep)
17 return -EEXIST; 17 return -EEXIST;
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index fbb212fbb1ef..a7aafb35b624 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -227,10 +227,8 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
227 u32 ablock, dblock, mask; 227 u32 ablock, dblock, mask;
228 sector_t sector; 228 sector_t sector;
229 int was_dirty = 0; 229 int was_dirty = 0;
230 int shift;
231 230
232 /* Convert inode block to disk allocation block */ 231 /* Convert inode block to disk allocation block */
233 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
234 ablock = iblock >> sbi->fs_shift; 232 ablock = iblock >> sbi->fs_shift;
235 233
236 if (iblock >= hip->fs_blocks) { 234 if (iblock >= hip->fs_blocks) {
@@ -498,11 +496,13 @@ int hfsplus_file_extend(struct inode *inode)
498 goto insert_extent; 496 goto insert_extent;
499 } 497 }
500out: 498out:
501 mutex_unlock(&hip->extents_lock);
502 if (!res) { 499 if (!res) {
503 hip->alloc_blocks += len; 500 hip->alloc_blocks += len;
501 mutex_unlock(&hip->extents_lock);
504 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); 502 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
503 return 0;
505 } 504 }
505 mutex_unlock(&hip->extents_lock);
506 return res; 506 return res;
507 507
508insert_extent: 508insert_extent:
@@ -556,11 +556,13 @@ void hfsplus_file_truncate(struct inode *inode)
556 556
557 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> 557 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
558 HFSPLUS_SB(sb)->alloc_blksz_shift; 558 HFSPLUS_SB(sb)->alloc_blksz_shift;
559
560 mutex_lock(&hip->extents_lock);
561
559 alloc_cnt = hip->alloc_blocks; 562 alloc_cnt = hip->alloc_blocks;
560 if (blk_cnt == alloc_cnt) 563 if (blk_cnt == alloc_cnt)
561 goto out; 564 goto out_unlock;
562 565
563 mutex_lock(&hip->extents_lock);
564 res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); 566 res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
565 if (res) { 567 if (res) {
566 mutex_unlock(&hip->extents_lock); 568 mutex_unlock(&hip->extents_lock);
@@ -592,10 +594,10 @@ void hfsplus_file_truncate(struct inode *inode)
592 hfs_brec_remove(&fd); 594 hfs_brec_remove(&fd);
593 } 595 }
594 hfs_find_exit(&fd); 596 hfs_find_exit(&fd);
595 mutex_unlock(&hip->extents_lock);
596 597
597 hip->alloc_blocks = blk_cnt; 598 hip->alloc_blocks = blk_cnt;
598out: 599out_unlock:
600 mutex_unlock(&hip->extents_lock);
599 hip->phys_size = inode->i_size; 601 hip->phys_size = inode->i_size;
600 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> 602 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
601 sb->s_blocksize_bits; 603 sb->s_blocksize_bits;
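
Both extents.c fixes are about lock scope: hfsplus_file_extend() used to drop extents_lock and only then bump hip->alloc_blocks, and hfsplus_file_truncate() read alloc_blocks before taking the lock, so a concurrent extend and truncate could act on a stale block count. The patch widens the critical sections so the check and the update sit under one lock hold. The general rule, reduced to a pthreads sketch (all names are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t extents_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned int alloc_blocks;

    static void extend(unsigned int len)
    {
        pthread_mutex_lock(&extents_lock);
        alloc_blocks += len;          /* update while still holding the lock */
        pthread_mutex_unlock(&extents_lock);
    }

    static void truncate_to(unsigned int blk_cnt)
    {
        pthread_mutex_lock(&extents_lock);
        if (alloc_blocks != blk_cnt)  /* check and update in one hold, so   */
            alloc_blocks = blk_cnt;   /* no extend can slip in between them */
        pthread_mutex_unlock(&extents_lock);
    }

    int main(void)
    {
        extend(8);
        truncate_to(4);
        printf("alloc_blocks = %u\n", alloc_blocks);
        return 0;
    }
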
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 62d571eb69ba..83dc29286b10 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -367,7 +367,7 @@ typedef int (*search_strategy_t)(struct hfs_bnode *,
367 */ 367 */
368 368
369/* attributes.c */ 369/* attributes.c */
370int hfsplus_create_attr_tree_cache(void); 370int __init hfsplus_create_attr_tree_cache(void);
371void hfsplus_destroy_attr_tree_cache(void); 371void hfsplus_destroy_attr_tree_cache(void);
372hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); 372hfsplus_attr_entry *hfsplus_alloc_attr_entry(void);
373void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); 373void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 80875aa640ef..a513d2d36be9 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -161,7 +161,7 @@ static int hfsplus_write_inode(struct inode *inode,
161static void hfsplus_evict_inode(struct inode *inode) 161static void hfsplus_evict_inode(struct inode *inode)
162{ 162{
163 hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); 163 hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
164 truncate_inode_pages(&inode->i_data, 0); 164 truncate_inode_pages_final(&inode->i_data);
165 clear_inode(inode); 165 clear_inode(inode);
166 if (HFSPLUS_IS_RSRC(inode)) { 166 if (HFSPLUS_IS_RSRC(inode)) {
167 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 167 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
@@ -323,6 +323,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
323 323
324static int hfsplus_remount(struct super_block *sb, int *flags, char *data) 324static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
325{ 325{
326 sync_filesystem(sb);
326 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 327 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
327 return 0; 328 return 0;
328 if (!(*flags & MS_RDONLY)) { 329 if (!(*flags & MS_RDONLY)) {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fe649d325b1f..9c470fde9878 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -230,7 +230,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
230 230
231static void hostfs_evict_inode(struct inode *inode) 231static void hostfs_evict_inode(struct inode *inode)
232{ 232{
233 truncate_inode_pages(&inode->i_data, 0); 233 truncate_inode_pages_final(&inode->i_data);
234 clear_inode(inode); 234 clear_inode(inode);
235 if (HOSTFS_I(inode)->fd != -1) { 235 if (HOSTFS_I(inode)->fd != -1) {
236 close_file(&HOSTFS_I(inode)->fd); 236 close_file(&HOSTFS_I(inode)->fd);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 9edeeb0ea97e..50a427313835 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -304,7 +304,7 @@ void hpfs_write_if_changed(struct inode *inode)
304 304
305void hpfs_evict_inode(struct inode *inode) 305void hpfs_evict_inode(struct inode *inode)
306{ 306{
307 truncate_inode_pages(&inode->i_data, 0); 307 truncate_inode_pages_final(&inode->i_data);
308 clear_inode(inode); 308 clear_inode(inode);
309 if (!inode->i_nlink) { 309 if (!inode->i_nlink) {
310 hpfs_lock(inode->i_sb); 310 hpfs_lock(inode->i_sb);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 4534ff688b76..fe3463a43236 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -421,6 +421,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
421 struct hpfs_sb_info *sbi = hpfs_sb(s); 421 struct hpfs_sb_info *sbi = hpfs_sb(s);
422 char *new_opts = kstrdup(data, GFP_KERNEL); 422 char *new_opts = kstrdup(data, GFP_KERNEL);
423 423
424 sync_filesystem(s);
425
424 *flags |= MS_NOATIME; 426 *flags |= MS_NOATIME;
425 427
426 hpfs_lock(s); 428 hpfs_lock(s);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d19b30ababf1..e19d4c0cacae 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -366,7 +366,13 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
366 366
367static void hugetlbfs_evict_inode(struct inode *inode) 367static void hugetlbfs_evict_inode(struct inode *inode)
368{ 368{
369 struct resv_map *resv_map;
370
369 truncate_hugepages(inode, 0); 371 truncate_hugepages(inode, 0);
372 resv_map = (struct resv_map *)inode->i_mapping->private_data;
373 /* root inode doesn't have the resv_map, so we should check it */
374 if (resv_map)
375 resv_map_release(&resv_map->refs);
370 clear_inode(inode); 376 clear_inode(inode);
371} 377}
372 378
@@ -476,6 +482,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
476 umode_t mode, dev_t dev) 482 umode_t mode, dev_t dev)
477{ 483{
478 struct inode *inode; 484 struct inode *inode;
485 struct resv_map *resv_map;
486
487 resv_map = resv_map_alloc();
488 if (!resv_map)
489 return NULL;
479 490
480 inode = new_inode(sb); 491 inode = new_inode(sb);
481 if (inode) { 492 if (inode) {
@@ -487,7 +498,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
487 inode->i_mapping->a_ops = &hugetlbfs_aops; 498 inode->i_mapping->a_ops = &hugetlbfs_aops;
488 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 499 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
489 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 500 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
490 INIT_LIST_HEAD(&inode->i_mapping->private_list); 501 inode->i_mapping->private_data = resv_map;
491 info = HUGETLBFS_I(inode); 502 info = HUGETLBFS_I(inode);
492 /* 503 /*
493 * The policy is initialized here even if we are creating a 504 * The policy is initialized here even if we are creating a
@@ -517,7 +528,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
517 break; 528 break;
518 } 529 }
519 lockdep_annotate_inode_mutex_key(inode); 530 lockdep_annotate_inode_mutex_key(inode);
520 } 531 } else
532 kref_put(&resv_map->refs, resv_map_release);
533
521 return inode; 534 return inode;
522} 535}
523 536
@@ -1017,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void)
1017 int error; 1030 int error;
1018 int i; 1031 int i;
1019 1032
1033 if (!hugepages_supported()) {
1034 pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
1035 return -ENOTSUPP;
1036 }
1037
1020 error = bdi_init(&hugetlbfs_backing_dev_info); 1038 error = bdi_init(&hugetlbfs_backing_dev_info);
1021 if (error) 1039 if (error)
1022 return error; 1040 return error;
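
The hugetlbfs changes attach a reference-counted reservation map to each inode: resv_map_alloc() runs before new_inode(), the creator's reference is dropped with kref_put() on the failure branch, and the eviction path releases the map (only the root inode lacks one, hence the NULL check). A minimal userspace model of that create/put-on-failure/release-on-evict ownership (struct resv_map_demo and friends are illustrative, not the kernel types):

    #include <stdlib.h>

    struct resv_map_demo {
        int refs;                  /* stands in for struct kref */
    };

    static struct resv_map_demo *resv_map_alloc_demo(void)
    {
        struct resv_map_demo *map = malloc(sizeof(*map));

        if (map)
            map->refs = 1;         /* creator holds the first reference */
        return map;
    }

    static void resv_map_put_demo(struct resv_map_demo *map)
    {
        if (map && --map->refs == 0)
            free(map);             /* release on last put, like kref_put() */
    }

    int main(void)
    {
        struct resv_map_demo *map = resv_map_alloc_demo();

        if (!map)
            return 1;
        /* If the inode allocation fails (the "} else" branch in the
         * patch), the creator must drop its reference or the map leaks. */
        resv_map_put_demo(map);
        return 0;
    }
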
diff --git a/fs/inode.c b/fs/inode.c
index 4bcdad3c9361..f96d2a6f88cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -503,6 +503,7 @@ void clear_inode(struct inode *inode)
503 */ 503 */
504 spin_lock_irq(&inode->i_data.tree_lock); 504 spin_lock_irq(&inode->i_data.tree_lock);
505 BUG_ON(inode->i_data.nrpages); 505 BUG_ON(inode->i_data.nrpages);
506 BUG_ON(inode->i_data.nrshadows);
506 spin_unlock_irq(&inode->i_data.tree_lock); 507 spin_unlock_irq(&inode->i_data.tree_lock);
507 BUG_ON(!list_empty(&inode->i_data.private_list)); 508 BUG_ON(!list_empty(&inode->i_data.private_list));
508 BUG_ON(!(inode->i_state & I_FREEING)); 509 BUG_ON(!(inode->i_state & I_FREEING));
@@ -548,8 +549,7 @@ static void evict(struct inode *inode)
548 if (op->evict_inode) { 549 if (op->evict_inode) {
549 op->evict_inode(inode); 550 op->evict_inode(inode);
550 } else { 551 } else {
551 if (inode->i_data.nrpages) 552 truncate_inode_pages_final(&inode->i_data);
552 truncate_inode_pages(&inode->i_data, 0);
553 clear_inode(inode); 553 clear_inode(inode);
554 } 554 }
555 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 555 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -944,24 +944,22 @@ EXPORT_SYMBOL(unlock_new_inode);
944 944
945/** 945/**
946 * lock_two_nondirectories - take two i_mutexes on non-directory objects 946 * lock_two_nondirectories - take two i_mutexes on non-directory objects
947 *
948 * Lock any non-NULL argument that is not a directory.
949 * Zero, one or two objects may be locked by this function.
950 *
947 * @inode1: first inode to lock 951 * @inode1: first inode to lock
948 * @inode2: second inode to lock 952 * @inode2: second inode to lock
949 */ 953 */
950void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) 954void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
951{ 955{
952 WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); 956 if (inode1 > inode2)
953 if (inode1 == inode2 || !inode2) { 957 swap(inode1, inode2);
954 mutex_lock(&inode1->i_mutex); 958
955 return; 959 if (inode1 && !S_ISDIR(inode1->i_mode))
956 }
957 WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
958 if (inode1 < inode2) {
959 mutex_lock(&inode1->i_mutex); 960 mutex_lock(&inode1->i_mutex);
961 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
960 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); 962 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
961 } else {
962 mutex_lock(&inode2->i_mutex);
963 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_NONDIR2);
964 }
965} 963}
966EXPORT_SYMBOL(lock_two_nondirectories); 964EXPORT_SYMBOL(lock_two_nondirectories);
967 965
@@ -972,8 +970,9 @@ EXPORT_SYMBOL(lock_two_nondirectories);
972 */ 970 */
973void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) 971void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
974{ 972{
975 mutex_unlock(&inode1->i_mutex); 973 if (inode1 && !S_ISDIR(inode1->i_mode))
976 if (inode2 && inode2 != inode1) 974 mutex_unlock(&inode1->i_mutex);
975 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
977 mutex_unlock(&inode2->i_mutex); 976 mutex_unlock(&inode2->i_mutex);
978} 977}
979EXPORT_SYMBOL(unlock_two_nondirectories); 978EXPORT_SYMBOL(unlock_two_nondirectories);
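
lock_two_nondirectories() is rewritten around a single canonical ordering: swap() sorts the two inodes by pointer value, the lower address is locked first, and directories and duplicates are simply filtered out, which removes the old four-way branch while keeping the pairwise locking deadlock-free by construction. The address-ordering idiom in plain pthreads (illustrative names):

    #include <pthread.h>
    #include <stdio.h>

    struct obj {
        pthread_mutex_t lock;
    };

    /* Any two threads locking the same pair agree on the order (lower
     * address first), so they cannot deadlock against each other. */
    static void lock_two(struct obj *a, struct obj *b)
    {
        if (a > b) {            /* canonicalise, like swap(inode1, inode2) */
            struct obj *tmp = a;
            a = b;
            b = tmp;
        }
        if (a)
            pthread_mutex_lock(&a->lock);
        if (b && b != a)
            pthread_mutex_lock(&b->lock);
    }

    static void unlock_two(struct obj *a, struct obj *b)
    {
        if (a)
            pthread_mutex_unlock(&a->lock);
        if (b && b != a)
            pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
        struct obj x = { PTHREAD_MUTEX_INITIALIZER };
        struct obj y = { PTHREAD_MUTEX_INITIALIZER };

        lock_two(&x, &y);
        puts("both locked in canonical order");
        unlock_two(&x, &y);
        return 0;
    }
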
@@ -1899,3 +1898,34 @@ void inode_dio_done(struct inode *inode)
1899 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); 1898 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
1900} 1899}
1901EXPORT_SYMBOL(inode_dio_done); 1900EXPORT_SYMBOL(inode_dio_done);
1901
1902/*
1903 * inode_set_flags - atomically set some inode flags
1904 *
1905 * Note: the caller should be holding i_mutex, or else be sure that
1906 * they have exclusive access to the inode structure (i.e., while the
1907 * inode is being instantiated). The reason for the cmpxchg() loop
1908 * --- which wouldn't be necessary if all code paths which modify
 1909 * i_flags actually followed this rule --- is that there is at least one
 1910 * code path which doesn't today: for example,
1911 * __generic_file_aio_write() calls file_remove_suid() without holding
1912 * i_mutex --- so we use cmpxchg() out of an abundance of caution.
1913 *
1914 * In the long run, i_mutex is overkill, and we should probably look
1915 * at using the i_lock spinlock to protect i_flags, and then make sure
1916 * it is so documented in include/linux/fs.h and that all code follows
1917 * the locking convention!!
1918 */
1919void inode_set_flags(struct inode *inode, unsigned int flags,
1920 unsigned int mask)
1921{
1922 unsigned int old_flags, new_flags;
1923
1924 WARN_ON_ONCE(flags & ~mask);
1925 do {
1926 old_flags = ACCESS_ONCE(inode->i_flags);
1927 new_flags = (old_flags & ~mask) | flags;
1928 } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
1929 new_flags) != old_flags));
1930}
1931EXPORT_SYMBOL(inode_set_flags);
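
The new inode_set_flags() helper replaces the bits selected by @mask using the read/modify/cmpxchg loop described in its comment, so a racing writer that updates i_flags without i_mutex cannot cause a lost update: the CAS fails and the loop retries from a fresh read. The same shape with GCC/Clang builtin atomics (set_flags_demo is an illustrative name):

    #include <stdio.h>

    /* Atomically replace the bits selected by mask with flags, retrying
     * whenever another thread changed the word between read and CAS. */
    static void set_flags_demo(unsigned int *word, unsigned int flags,
                               unsigned int mask)
    {
        unsigned int old_flags, new_flags;

        do {
            old_flags = __atomic_load_n(word, __ATOMIC_RELAXED);
            new_flags = (old_flags & ~mask) | flags;
        } while (!__atomic_compare_exchange_n(word, &old_flags, new_flags,
                                              0, __ATOMIC_RELAXED,
                                              __ATOMIC_RELAXED));
    }

    int main(void)
    {
        unsigned int i_flags_demo = 0xff;

        set_flags_demo(&i_flags_demo, 0x05, 0x0f); /* low nibble -> 0x5 */
        printf("flags = 0x%x\n", i_flags_demo);    /* prints flags = 0xf5 */
        return 0;
    }
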
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4a9e10ea13f2..4556ce1af5b0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -93,7 +93,7 @@ static void init_once(void *foo)
93 inode_init_once(&ei->vfs_inode); 93 inode_init_once(&ei->vfs_inode);
94} 94}
95 95
96static int init_inodecache(void) 96static int __init init_inodecache(void)
97{ 97{
98 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", 98 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
99 sizeof(struct iso_inode_info), 99 sizeof(struct iso_inode_info),
@@ -117,6 +117,7 @@ static void destroy_inodecache(void)
117 117
118static int isofs_remount(struct super_block *sb, int *flags, char *data) 118static int isofs_remount(struct super_block *sb, int *flags, char *data)
119{ 119{
120 sync_filesystem(sb);
120 if (!(*flags & MS_RDONLY)) 121 if (!(*flags & MS_RDONLY))
121 return -EROFS; 122 return -EROFS;
122 return 0; 123 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index cf2fc0594063..5f26139a165a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
555 blk_start_plug(&plug); 555 blk_start_plug(&plug);
556 jbd2_journal_write_revoke_records(journal, commit_transaction, 556 jbd2_journal_write_revoke_records(journal, commit_transaction,
557 &log_bufs, WRITE_SYNC); 557 &log_bufs, WRITE_SYNC);
558 blk_finish_plug(&plug);
559 558
560 jbd_debug(3, "JBD2: commit phase 2b\n"); 559 jbd_debug(3, "JBD2: commit phase 2b\n");
561 560
@@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
582 err = 0; 581 err = 0;
583 bufs = 0; 582 bufs = 0;
584 descriptor = NULL; 583 descriptor = NULL;
585 blk_start_plug(&plug);
586 while (commit_transaction->t_buffers) { 584 while (commit_transaction->t_buffers) {
587 585
588 /* Find the next buffer to be journaled... */ 586 /* Find the next buffer to be journaled... */
@@ -1067,6 +1065,25 @@ restart_loop:
1067 goto restart_loop; 1065 goto restart_loop;
1068 } 1066 }
1069 1067
1068 /* Add the transaction to the checkpoint list
 1069 * __journal_remove_checkpoint() cannot destroy the transaction
1070 * under us because it is not marked as T_FINISHED yet */
1071 if (journal->j_checkpoint_transactions == NULL) {
1072 journal->j_checkpoint_transactions = commit_transaction;
1073 commit_transaction->t_cpnext = commit_transaction;
1074 commit_transaction->t_cpprev = commit_transaction;
1075 } else {
1076 commit_transaction->t_cpnext =
1077 journal->j_checkpoint_transactions;
1078 commit_transaction->t_cpprev =
1079 commit_transaction->t_cpnext->t_cpprev;
1080 commit_transaction->t_cpnext->t_cpprev =
1081 commit_transaction;
1082 commit_transaction->t_cpprev->t_cpnext =
1083 commit_transaction;
1084 }
1085 spin_unlock(&journal->j_list_lock);
1086
1070 /* Done with this transaction! */ 1087 /* Done with this transaction! */
1071 1088
1072 jbd_debug(3, "JBD2: commit phase 7\n"); 1089 jbd_debug(3, "JBD2: commit phase 7\n");
@@ -1085,24 +1102,7 @@ restart_loop:
1085 atomic_read(&commit_transaction->t_handle_count); 1102 atomic_read(&commit_transaction->t_handle_count);
1086 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1103 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1087 commit_transaction->t_tid, &stats.run); 1104 commit_transaction->t_tid, &stats.run);
1088 1105 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1089 /*
1090 * Calculate overall stats
1091 */
1092 spin_lock(&journal->j_history_lock);
1093 journal->j_stats.ts_tid++;
1094 if (commit_transaction->t_requested)
1095 journal->j_stats.ts_requested++;
1096 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1097 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1098 journal->j_stats.run.rs_running += stats.run.rs_running;
1099 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1100 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1101 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1102 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1103 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1104 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1105 spin_unlock(&journal->j_history_lock);
1106 1106
1107 commit_transaction->t_state = T_COMMIT_CALLBACK; 1107 commit_transaction->t_state = T_COMMIT_CALLBACK;
1108 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1108 J_ASSERT(commit_transaction == journal->j_committing_transaction);
@@ -1122,24 +1122,6 @@ restart_loop:
1122 1122
1123 write_unlock(&journal->j_state_lock); 1123 write_unlock(&journal->j_state_lock);
1124 1124
1125 if (journal->j_checkpoint_transactions == NULL) {
1126 journal->j_checkpoint_transactions = commit_transaction;
1127 commit_transaction->t_cpnext = commit_transaction;
1128 commit_transaction->t_cpprev = commit_transaction;
1129 } else {
1130 commit_transaction->t_cpnext =
1131 journal->j_checkpoint_transactions;
1132 commit_transaction->t_cpprev =
1133 commit_transaction->t_cpnext->t_cpprev;
1134 commit_transaction->t_cpnext->t_cpprev =
1135 commit_transaction;
1136 commit_transaction->t_cpprev->t_cpnext =
1137 commit_transaction;
1138 }
1139 spin_unlock(&journal->j_list_lock);
1140 /* Drop all spin_locks because commit_callback may be block.
1141 * __journal_remove_checkpoint() can not destroy transaction
1142 * under us because it is not marked as T_FINISHED yet */
1143 if (journal->j_commit_callback) 1125 if (journal->j_commit_callback)
1144 journal->j_commit_callback(journal, commit_transaction); 1126 journal->j_commit_callback(journal, commit_transaction);
1145 1127
@@ -1150,7 +1132,7 @@ restart_loop:
1150 write_lock(&journal->j_state_lock); 1132 write_lock(&journal->j_state_lock);
1151 spin_lock(&journal->j_list_lock); 1133 spin_lock(&journal->j_list_lock);
1152 commit_transaction->t_state = T_FINISHED; 1134 commit_transaction->t_state = T_FINISHED;
1153 /* Recheck checkpoint lists after j_list_lock was dropped */ 1135 /* Check if the transaction can be dropped now that we are finished */
1154 if (commit_transaction->t_checkpoint_list == NULL && 1136 if (commit_transaction->t_checkpoint_list == NULL &&
1155 commit_transaction->t_checkpoint_io_list == NULL) { 1137 commit_transaction->t_checkpoint_io_list == NULL) {
1156 __jbd2_journal_drop_transaction(journal, commit_transaction); 1138 __jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -1159,4 +1141,21 @@ restart_loop:
1159 spin_unlock(&journal->j_list_lock); 1141 spin_unlock(&journal->j_list_lock);
1160 write_unlock(&journal->j_state_lock); 1142 write_unlock(&journal->j_state_lock);
1161 wake_up(&journal->j_wait_done_commit); 1143 wake_up(&journal->j_wait_done_commit);
1144
1145 /*
1146 * Calculate overall stats
1147 */
1148 spin_lock(&journal->j_history_lock);
1149 journal->j_stats.ts_tid++;
1150 journal->j_stats.ts_requested += stats.ts_requested;
1151 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1152 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1153 journal->j_stats.run.rs_running += stats.run.rs_running;
1154 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1155 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1156 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1157 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1158 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1159 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1160 spin_unlock(&journal->j_history_lock);
1162} 1161}
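
Two things move in this commit path: the transaction is linked onto the checkpoint list earlier (under j_list_lock, safe because it is not yet T_FINISHED), and the folding of per-commit statistics into journal->j_stats moves to the very end, after every other lock is dropped, with ts_requested captured into the on-stack stats first. The resulting pattern, accumulate privately and publish once under a short lock hold, sketched with pthreads (illustrative names):

    #include <pthread.h>
    #include <stdio.h>

    struct stats_demo {
        unsigned long tids;
        unsigned long blocks;
    };

    static pthread_mutex_t history_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct stats_demo global_stats;

    static void commit_one(unsigned long blocks_logged)
    {
        struct stats_demo local = { 0, 0 };

        /* Hot path: only the private, on-stack copy is updated. */
        local.tids = 1;
        local.blocks = blocks_logged;

        /* Cold path: one short critical section to publish the totals. */
        pthread_mutex_lock(&history_lock);
        global_stats.tids += local.tids;
        global_stats.blocks += local.blocks;
        pthread_mutex_unlock(&history_lock);
    }

    int main(void)
    {
        commit_one(128);
        commit_one(64);
        printf("tids=%lu blocks=%lu\n", global_stats.tids,
               global_stats.blocks);
        return 0;
    }
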
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5fa344afb49a..67b8e303946c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug);
122#endif 122#endif
123 123
124/* Checksumming functions */ 124/* Checksumming functions */
125int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
126{ 126{
127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
128 return 1; 128 return 1;
@@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
143 return cpu_to_be32(csum); 143 return cpu_to_be32(csum);
144} 144}
145 145
146int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
147{ 147{
148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
149 return 1; 149 return 1;
@@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
151 return sb->s_checksum == jbd2_superblock_csum(j, sb); 151 return sb->s_checksum == jbd2_superblock_csum(j, sb);
152} 152}
153 153
154void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
155{ 155{
156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
157 return; 157 return;
@@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal)
302 journal->j_flags |= JBD2_UNMOUNT; 302 journal->j_flags |= JBD2_UNMOUNT;
303 303
304 while (journal->j_task) { 304 while (journal->j_task) {
305 wake_up(&journal->j_wait_commit);
306 write_unlock(&journal->j_state_lock); 305 write_unlock(&journal->j_state_lock);
306 wake_up(&journal->j_wait_commit);
307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
308 write_lock(&journal->j_state_lock); 308 write_lock(&journal->j_state_lock);
309 } 309 }
@@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
710 while (tid_gt(tid, journal->j_commit_sequence)) { 710 while (tid_gt(tid, journal->j_commit_sequence)) {
711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", 711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
712 tid, journal->j_commit_sequence); 712 tid, journal->j_commit_sequence);
713 wake_up(&journal->j_wait_commit);
714 read_unlock(&journal->j_state_lock); 713 read_unlock(&journal->j_state_lock);
714 wake_up(&journal->j_wait_commit);
715 wait_event(journal->j_wait_done_commit, 715 wait_event(journal->j_wait_done_commit,
716 !tid_gt(tid, journal->j_commit_sequence)); 716 !tid_gt(tid, journal->j_commit_sequence));
717 read_lock(&journal->j_state_lock); 717 read_lock(&journal->j_state_lock);
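
Both journal.c hunks reorder wake_up() to run after j_state_lock is released, so the woken commit thread is not immediately forced to block on a lock the waker still holds. The same consideration applies to condition variables in userspace: set the predicate under the mutex, drop the mutex, then signal (a hedged pthreads analogue; the kernel wait queues here involve no condvar):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
    static int committed;

    static void *committer(void *arg)
    {
        pthread_mutex_lock(&state_lock);
        committed = 1;             /* predicate set under the lock */
        pthread_mutex_unlock(&state_lock);
        /* Signal after dropping the lock: the waiter can run at once
         * instead of waking up only to block on state_lock. */
        pthread_cond_signal(&done);
        return arg;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, committer, NULL);
        pthread_mutex_lock(&state_lock);
        while (!committed)
            pthread_cond_wait(&done, &state_lock);
        pthread_mutex_unlock(&state_lock);
        pthread_join(t, NULL);
        puts("commit observed");
        return 0;
    }
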
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 60bb365f54a5..38cfcf5f6fce 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1073 * reused here. 1073 * reused here.
1074 */ 1074 */
1075 jbd_lock_bh_state(bh); 1075 jbd_lock_bh_state(bh);
1076 spin_lock(&journal->j_list_lock);
1077 J_ASSERT_JH(jh, (jh->b_transaction == transaction || 1076 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
1078 jh->b_transaction == NULL || 1077 jh->b_transaction == NULL ||
1079 (jh->b_transaction == journal->j_committing_transaction && 1078 (jh->b_transaction == journal->j_committing_transaction &&
@@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1096 jh->b_modified = 0; 1095 jh->b_modified = 0;
1097 1096
1098 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 1097 JBUFFER_TRACE(jh, "file as BJ_Reserved");
1098 spin_lock(&journal->j_list_lock);
1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); 1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1100 } else if (jh->b_transaction == journal->j_committing_transaction) { 1100 } else if (jh->b_transaction == journal->j_committing_transaction) {
1101 /* first access by this transaction */ 1101 /* first access by this transaction */
1102 jh->b_modified = 0; 1102 jh->b_modified = 0;
1103 1103
1104 JBUFFER_TRACE(jh, "set next transaction"); 1104 JBUFFER_TRACE(jh, "set next transaction");
1105 spin_lock(&journal->j_list_lock);
1105 jh->b_next_transaction = transaction; 1106 jh->b_next_transaction = transaction;
1106 } 1107 }
1107 spin_unlock(&journal->j_list_lock); 1108 spin_unlock(&journal->j_list_lock);
@@ -1312,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1312 journal->j_running_transaction)) { 1313 journal->j_running_transaction)) {
1313 printk(KERN_ERR "JBD2: %s: " 1314 printk(KERN_ERR "JBD2: %s: "
1314 "jh->b_transaction (%llu, %p, %u) != " 1315 "jh->b_transaction (%llu, %p, %u) != "
1315 "journal->j_running_transaction (%p, %u)", 1316 "journal->j_running_transaction (%p, %u)\n",
1316 journal->j_devname, 1317 journal->j_devname,
1317 (unsigned long long) bh->b_blocknr, 1318 (unsigned long long) bh->b_blocknr,
1318 jh->b_transaction, 1319 jh->b_transaction,
@@ -1335,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1335 */ 1336 */
1336 if (jh->b_transaction != transaction) { 1337 if (jh->b_transaction != transaction) {
1337 JBUFFER_TRACE(jh, "already on other transaction"); 1338 JBUFFER_TRACE(jh, "already on other transaction");
1338 if (unlikely(jh->b_transaction != 1339 if (unlikely(((jh->b_transaction !=
1339 journal->j_committing_transaction)) { 1340 journal->j_committing_transaction)) ||
1340 printk(KERN_ERR "JBD2: %s: " 1341 (jh->b_next_transaction != transaction))) {
1341 "jh->b_transaction (%llu, %p, %u) != " 1342 printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
1342 "journal->j_committing_transaction (%p, %u)", 1343 "bad jh for block %llu: "
1344 "transaction (%p, %u), "
1345 "jh->b_transaction (%p, %u), "
1346 "jh->b_next_transaction (%p, %u), jlist %u\n",
1343 journal->j_devname, 1347 journal->j_devname,
1344 (unsigned long long) bh->b_blocknr, 1348 (unsigned long long) bh->b_blocknr,
1349 transaction, transaction->t_tid,
1345 jh->b_transaction, 1350 jh->b_transaction,
1346 jh->b_transaction ? jh->b_transaction->t_tid : 0, 1351 jh->b_transaction ?
1347 journal->j_committing_transaction, 1352 jh->b_transaction->t_tid : 0,
1348 journal->j_committing_transaction ?
1349 journal->j_committing_transaction->t_tid : 0);
1350 ret = -EINVAL;
1351 }
1352 if (unlikely(jh->b_next_transaction != transaction)) {
1353 printk(KERN_ERR "JBD2: %s: "
1354 "jh->b_next_transaction (%llu, %p, %u) != "
1355 "transaction (%p, %u)",
1356 journal->j_devname,
1357 (unsigned long long) bh->b_blocknr,
1358 jh->b_next_transaction, 1353 jh->b_next_transaction,
1359 jh->b_next_transaction ? 1354 jh->b_next_transaction ?
1360 jh->b_next_transaction->t_tid : 0, 1355 jh->b_next_transaction->t_tid : 0,
1361 transaction, transaction->t_tid); 1356 jh->b_jlist);
1357 WARN_ON(1);
1362 ret = -EINVAL; 1358 ret = -EINVAL;
1363 } 1359 }
1364 /* And this case is illegal: we can't reuse another 1360 /* And this case is illegal: we can't reuse another
@@ -1415,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1415 BUFFER_TRACE(bh, "entry"); 1411 BUFFER_TRACE(bh, "entry");
1416 1412
1417 jbd_lock_bh_state(bh); 1413 jbd_lock_bh_state(bh);
1418 spin_lock(&journal->j_list_lock);
1419 1414
1420 if (!buffer_jbd(bh)) 1415 if (!buffer_jbd(bh))
1421 goto not_jbd; 1416 goto not_jbd;
@@ -1468,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1468 * we know to remove the checkpoint after we commit. 1463 * we know to remove the checkpoint after we commit.
1469 */ 1464 */
1470 1465
1466 spin_lock(&journal->j_list_lock);
1471 if (jh->b_cp_transaction) { 1467 if (jh->b_cp_transaction) {
1472 __jbd2_journal_temp_unlink_buffer(jh); 1468 __jbd2_journal_temp_unlink_buffer(jh);
1473 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1469 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
@@ -1480,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1480 goto drop; 1476 goto drop;
1481 } 1477 }
1482 } 1478 }
1479 spin_unlock(&journal->j_list_lock);
1483 } else if (jh->b_transaction) { 1480 } else if (jh->b_transaction) {
1484 J_ASSERT_JH(jh, (jh->b_transaction == 1481 J_ASSERT_JH(jh, (jh->b_transaction ==
1485 journal->j_committing_transaction)); 1482 journal->j_committing_transaction));
@@ -1491,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1491 1488
1492 if (jh->b_next_transaction) { 1489 if (jh->b_next_transaction) {
1493 J_ASSERT(jh->b_next_transaction == transaction); 1490 J_ASSERT(jh->b_next_transaction == transaction);
1491 spin_lock(&journal->j_list_lock);
1494 jh->b_next_transaction = NULL; 1492 jh->b_next_transaction = NULL;
1493 spin_unlock(&journal->j_list_lock);
1495 1494
1496 /* 1495 /*
1497 * only drop a reference if this transaction modified 1496 * only drop a reference if this transaction modified
@@ -1503,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1503 } 1502 }
1504 1503
1505not_jbd: 1504not_jbd:
1506 spin_unlock(&journal->j_list_lock);
1507 jbd_unlock_bh_state(bh); 1505 jbd_unlock_bh_state(bh);
1508 __brelse(bh); 1506 __brelse(bh);
1509drop: 1507drop:
@@ -1821,11 +1819,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1821 if (buffer_locked(bh) || buffer_dirty(bh)) 1819 if (buffer_locked(bh) || buffer_dirty(bh))
1822 goto out; 1820 goto out;
1823 1821
1824 if (jh->b_next_transaction != NULL) 1822 if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
1825 goto out; 1823 goto out;
1826 1824
1827 spin_lock(&journal->j_list_lock); 1825 spin_lock(&journal->j_list_lock);
1828 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1826 if (jh->b_cp_transaction != NULL) {
1829 /* written-back checkpointed metadata buffer */ 1827 /* written-back checkpointed metadata buffer */
1830 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1828 JBUFFER_TRACE(jh, "remove from checkpoint list");
1831 __jbd2_journal_remove_checkpoint(jh); 1829 __jbd2_journal_remove_checkpoint(jh);
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 16a5047903a6..406d9cc84ba8 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -33,7 +33,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
33 unsigned char *cpage_out, 33 unsigned char *cpage_out,
34 uint32_t *sourcelen, uint32_t *dstlen) 34 uint32_t *sourcelen, uint32_t *dstlen)
35{ 35{
36 short positions[256]; 36 unsigned short positions[256];
37 int outpos = 0; 37 int outpos = 0;
38 int pos=0; 38 int pos=0;
39 39
@@ -74,7 +74,7 @@ static int jffs2_rtime_decompress(unsigned char *data_in,
74 unsigned char *cpage_out, 74 unsigned char *cpage_out,
75 uint32_t srclen, uint32_t destlen) 75 uint32_t srclen, uint32_t destlen)
76{ 76{
77 short positions[256]; 77 unsigned short positions[256];
78 int outpos = 0; 78 int outpos = 0;
79 int pos=0; 79 int pos=0;
80 80
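
The rtime codec's positions[] array caches, for each byte value, the offset at which that value last occurred, and back-references are computed from it. Stored in a signed short, any offset above SHRT_MAX wraps negative and the next reference indexes in front of the buffer; unsigned short keeps the full 0..65535 range. The truncation in two lines (values illustrative; converting an out-of-range value to short is implementation-defined, though it wraps on common ABIs):

    #include <stdio.h>

    int main(void)
    {
        int pos = 40000;                     /* an offset past SHRT_MAX */
        short s = (short)pos;                /* old type: wraps negative */
        unsigned short u = (unsigned short)pos;

        printf("signed=%d unsigned=%u\n", s, u);
        /* prints: signed=-25536 unsigned=40000 */
        return 0;
    }
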
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index a69e426435dd..601afd1afddf 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -242,7 +242,7 @@ void jffs2_evict_inode (struct inode *inode)
242 242
243 jffs2_dbg(1, "%s(): ino #%lu mode %o\n", 243 jffs2_dbg(1, "%s(): ino #%lu mode %o\n",
244 __func__, inode->i_ino, inode->i_mode); 244 __func__, inode->i_ino, inode->i_mode);
245 truncate_inode_pages(&inode->i_data, 0); 245 truncate_inode_pages_final(&inode->i_data);
246 clear_inode(inode); 246 clear_inode(inode);
247 jffs2_do_clear_inode(c, f); 247 jffs2_do_clear_inode(c, f);
248} 248}
@@ -457,12 +457,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
457 The umask is only applied if there's no default ACL */ 457 The umask is only applied if there's no default ACL */
458 ret = jffs2_init_acl_pre(dir_i, inode, &mode); 458 ret = jffs2_init_acl_pre(dir_i, inode, &mode);
459 if (ret) { 459 if (ret) {
460 make_bad_inode(inode); 460 mutex_unlock(&f->sem);
461 iput(inode); 461 make_bad_inode(inode);
462 return ERR_PTR(ret); 462 iput(inode);
463 return ERR_PTR(ret);
463 } 464 }
464 ret = jffs2_do_new_inode (c, f, mode, ri); 465 ret = jffs2_do_new_inode (c, f, mode, ri);
465 if (ret) { 466 if (ret) {
467 mutex_unlock(&f->sem);
466 make_bad_inode(inode); 468 make_bad_inode(inode);
467 iput(inode); 469 iput(inode);
468 return ERR_PTR(ret); 470 return ERR_PTR(ret);
@@ -479,6 +481,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
479 inode->i_size = 0; 481 inode->i_size = 0;
480 482
481 if (insert_inode_locked(inode) < 0) { 483 if (insert_inode_locked(inode) < 0) {
484 mutex_unlock(&f->sem);
482 make_bad_inode(inode); 485 make_bad_inode(inode);
483 iput(inode); 486 iput(inode);
484 return ERR_PTR(-EINVAL); 487 return ERR_PTR(-EINVAL);
@@ -687,7 +690,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
687 struct inode *inode = OFNI_EDONI_2SFFJ(f); 690 struct inode *inode = OFNI_EDONI_2SFFJ(f);
688 struct page *pg; 691 struct page *pg;
689 692
690 pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, 693 pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
691 (void *)jffs2_do_readpage_unlock, inode); 694 (void *)jffs2_do_readpage_unlock, inode);
692 if (IS_ERR(pg)) 695 if (IS_ERR(pg))
693 return (void *)pg; 696 return (void *)pg;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index e4619b00f7c5..fa35ff79ab35 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -231,7 +231,7 @@ struct jffs2_tmp_dnode_info
231 uint32_t version; 231 uint32_t version;
232 uint32_t data_crc; 232 uint32_t data_crc;
233 uint32_t partial_crc; 233 uint32_t partial_crc;
234 uint16_t csize; 234 uint32_t csize;
235 uint16_t overlapped; 235 uint16_t overlapped;
236}; 236};
237 237
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 03310721712f..b6bd4affd9ad 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -179,6 +179,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
179 spin_unlock(&c->erase_completion_lock); 179 spin_unlock(&c->erase_completion_lock);
180 180
181 schedule(); 181 schedule();
182 remove_wait_queue(&c->erase_wait, &wait);
182 } else 183 } else
183 spin_unlock(&c->erase_completion_lock); 184 spin_unlock(&c->erase_completion_lock);
184 } else if (ret) 185 } else if (ret)
@@ -211,20 +212,25 @@ out:
211int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, 212int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
212 uint32_t *len, uint32_t sumsize) 213 uint32_t *len, uint32_t sumsize)
213{ 214{
214 int ret = -EAGAIN; 215 int ret;
215 minsize = PAD(minsize); 216 minsize = PAD(minsize);
216 217
217 jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); 218 jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize);
218 219
219 spin_lock(&c->erase_completion_lock); 220 while (true) {
220 while(ret == -EAGAIN) { 221 spin_lock(&c->erase_completion_lock);
221 ret = jffs2_do_reserve_space(c, minsize, len, sumsize); 222 ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
222 if (ret) { 223 if (ret) {
223 jffs2_dbg(1, "%s(): looping, ret is %d\n", 224 jffs2_dbg(1, "%s(): looping, ret is %d\n",
224 __func__, ret); 225 __func__, ret);
225 } 226 }
227 spin_unlock(&c->erase_completion_lock);
228
229 if (ret == -EAGAIN)
230 cond_resched();
231 else
232 break;
226 } 233 }
227 spin_unlock(&c->erase_completion_lock);
228 if (!ret) 234 if (!ret)
229 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); 235 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
230 236
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0defb1cc2a35..0918f0e2e266 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
244 int err; 244 int err;
245 245
246 sync_filesystem(sb);
246 err = jffs2_parse_options(c, data); 247 err = jffs2_parse_options(c, data);
247 if (err) 248 if (err)
248 return -EINVAL; 249 return -EINVAL;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f4aab719add5..6f8fe72c2a7a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -154,7 +154,7 @@ void jfs_evict_inode(struct inode *inode)
154 dquot_initialize(inode); 154 dquot_initialize(inode);
155 155
156 if (JFS_IP(inode)->fileset == FILESYSTEM_I) { 156 if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
157 truncate_inode_pages(&inode->i_data, 0); 157 truncate_inode_pages_final(&inode->i_data);
158 158
159 if (test_cflag(COMMIT_Freewmap, inode)) 159 if (test_cflag(COMMIT_Freewmap, inode))
160 jfs_free_zero_link(inode); 160 jfs_free_zero_link(inode);
@@ -168,7 +168,7 @@ void jfs_evict_inode(struct inode *inode)
168 dquot_free_inode(inode); 168 dquot_free_inode(inode);
169 } 169 }
170 } else { 170 } else {
171 truncate_inode_pages(&inode->i_data, 0); 171 truncate_inode_pages_final(&inode->i_data);
172 } 172 }
173 clear_inode(inode); 173 clear_inode(inode);
174 dquot_drop(inode); 174 dquot_drop(inode);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index e2b7483444fd..97f7fda51890 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -418,6 +418,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
418 int flag = JFS_SBI(sb)->flag; 418 int flag = JFS_SBI(sb)->flag;
419 int ret; 419 int ret;
420 420
421 sync_filesystem(sb);
421 if (!parse_options(data, sb, &newLVSize, &flag)) { 422 if (!parse_options(data, sb, &newLVSize, &flag)) {
422 return -EINVAL; 423 return -EINVAL;
423 } 424 }
diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig
new file mode 100644
index 000000000000..397b5f7a7a16
--- /dev/null
+++ b/fs/kernfs/Kconfig
@@ -0,0 +1,7 @@
1#
2# KERNFS should be selected by its users
3#
4
5config KERNFS
6 bool
7 default n
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index bd6e18be6e1a..ac127cd008bf 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -8,6 +8,7 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/sched.h>
11#include <linux/fs.h> 12#include <linux/fs.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/idr.h> 14#include <linux/idr.h>
@@ -18,9 +19,162 @@
18#include "kernfs-internal.h" 19#include "kernfs-internal.h"
19 20
20DEFINE_MUTEX(kernfs_mutex); 21DEFINE_MUTEX(kernfs_mutex);
22static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
23static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
21 24
22#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) 25#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
23 26
27static bool kernfs_active(struct kernfs_node *kn)
28{
29 lockdep_assert_held(&kernfs_mutex);
30 return atomic_read(&kn->active) >= 0;
31}
32
33static bool kernfs_lockdep(struct kernfs_node *kn)
34{
35#ifdef CONFIG_DEBUG_LOCK_ALLOC
36 return kn->flags & KERNFS_LOCKDEP;
37#else
38 return false;
39#endif
40}
41
42static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
43{
44 return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
45}
46
47static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
48 size_t buflen)
49{
50 char *p = buf + buflen;
51 int len;
52
53 *--p = '\0';
54
55 do {
56 len = strlen(kn->name);
57 if (p - buf < len + 1) {
58 buf[0] = '\0';
59 p = NULL;
60 break;
61 }
62 p -= len;
63 memcpy(p, kn->name, len);
64 *--p = '/';
65 kn = kn->parent;
66 } while (kn && kn->parent);
67
68 return p;
69}
70
71/**
72 * kernfs_name - obtain the name of a given node
73 * @kn: kernfs_node of interest
74 * @buf: buffer to copy @kn's name into
75 * @buflen: size of @buf
76 *
77 * Copies the name of @kn into @buf of @buflen bytes. The behavior is
78 * similar to strlcpy(). It returns the length of @kn's name and if @buf
 79 * isn't long enough, it's filled up to @buflen-1 and nul terminated.
80 *
81 * This function can be called from any context.
82 */
83int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
84{
85 unsigned long flags;
86 int ret;
87
88 spin_lock_irqsave(&kernfs_rename_lock, flags);
89 ret = kernfs_name_locked(kn, buf, buflen);
90 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
91 return ret;
92}
93
94/**
95 * kernfs_path - build full path of a given node
96 * @kn: kernfs_node of interest
97 * @buf: buffer to copy @kn's name into
98 * @buflen: size of @buf
99 *
100 * Builds and returns the full path of @kn in @buf of @buflen bytes. The
101 * path is built from the end of @buf so the returned pointer usually
102 * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
103 * and %NULL is returned.
104 */
105char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
106{
107 unsigned long flags;
108 char *p;
109
110 spin_lock_irqsave(&kernfs_rename_lock, flags);
111 p = kernfs_path_locked(kn, buf, buflen);
112 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
113 return p;
114}
115EXPORT_SYMBOL_GPL(kernfs_path);
116
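An aside on the technique: kernfs_path_locked() above assembles the path from
the end of the buffer, which avoids a first pass just to measure the depth.
A minimal userspace sketch of the same right-to-left construction follows;
struct node, build_path() and the sample names are illustrative, not kernel API.

#include <stdio.h>
#include <string.h>

struct node {
	const char *name;
	struct node *parent;
};

/* Mirror kernfs_path_locked(): start at the end of the buffer, prepend
 * "/<name>" per ancestor, and return NULL (with buf nul-terminated) if
 * the buffer is too small. */
static char *build_path(const struct node *n, char *buf, size_t buflen)
{
	char *p = buf + buflen;
	size_t len;

	*--p = '\0';
	do {
		len = strlen(n->name);
		if ((size_t)(p - buf) < len + 1) {
			buf[0] = '\0';
			return NULL;
		}
		p -= len;
		memcpy(p, n->name, len);
		*--p = '/';
		n = n->parent;
	} while (n && n->parent);	/* the root contributes only "/" */

	return p;
}

int main(void)
{
	struct node root = { "", NULL };
	struct node dir = { "devices", &root };
	struct node sub = { "virtual", &dir };
	char buf[64];
	char *p = build_path(&sub, buf, sizeof(buf));

	printf("%s\n", p ? p : "<buffer too small>");	/* /devices/virtual */
	return 0;
}
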
117/**
118 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
119 * @kn: kernfs_node of interest
120 *
121 * This function can be called from any context.
122 */
123void pr_cont_kernfs_name(struct kernfs_node *kn)
124{
125 unsigned long flags;
126
127 spin_lock_irqsave(&kernfs_rename_lock, flags);
128
129 kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
130 pr_cont("%s", kernfs_pr_cont_buf);
131
132 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
133}
134
135/**
136 * pr_cont_kernfs_path - pr_cont path of a kernfs_node
137 * @kn: kernfs_node of interest
138 *
139 * This function can be called from any context.
140 */
141void pr_cont_kernfs_path(struct kernfs_node *kn)
142{
143 unsigned long flags;
144 char *p;
145
146 spin_lock_irqsave(&kernfs_rename_lock, flags);
147
148 p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
149 sizeof(kernfs_pr_cont_buf));
150 if (p)
151 pr_cont("%s", p);
152 else
153 pr_cont("<name too long>");
154
155 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
156}
157
158/**
159 * kernfs_get_parent - determine the parent node and pin it
160 * @kn: kernfs_node of interest
161 *
162 * Determines @kn's parent, pins and returns it. This function can be
163 * called from any context.
164 */
165struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
166{
167 struct kernfs_node *parent;
168 unsigned long flags;
169
170 spin_lock_irqsave(&kernfs_rename_lock, flags);
171 parent = kn->parent;
172 kernfs_get(parent);
173 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
174
175 return parent;
176}
177
24/** 178/**
25 * kernfs_name_hash 179 * kernfs_name_hash
26 * @name: Null terminated string to hash 180 * @name: Null terminated string to hash
@@ -37,7 +191,7 @@ static unsigned int kernfs_name_hash(const char *name, const void *ns)
37 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); 191 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
38 hash &= 0x7fffffffU; 192 hash &= 0x7fffffffU;
39 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ 193 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
40 if (hash < 1) 194 if (hash < 2)
41 hash += 2; 195 hash += 2;
42 if (hash >= INT_MAX) 196 if (hash >= INT_MAX)
43 hash = INT_MAX - 1; 197 hash = INT_MAX - 1;
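The one-character change above (hash < 1 becomes hash < 2) matters because a
computed hash of 1 would otherwise collide with one of the reserved directory
positions named in the comment. A standalone sketch of the clamping, assuming
nothing beyond what that comment states:

#include <limits.h>
#include <stdio.h>

/* Clamp a 31-bit name hash into [2, INT_MAX - 1]; 0, 1 and INT_MAX stay
 * reserved for magic directory entries.  The old `hash < 1` test let a
 * computed value of 1 slip through. */
static unsigned int clamp_hash(unsigned int hash)
{
	hash &= 0x7fffffffU;
	if (hash < 2)
		hash += 2;
	if (hash >= INT_MAX)
		hash = INT_MAX - 1;
	return hash;
}

int main(void)
{
	printf("%u %u %u\n", clamp_hash(0), clamp_hash(1),
	       clamp_hash(INT_MAX));	/* 2 3 2147483646 */
	return 0;
}
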
@@ -78,9 +232,6 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
78 struct rb_node **node = &kn->parent->dir.children.rb_node; 232 struct rb_node **node = &kn->parent->dir.children.rb_node;
79 struct rb_node *parent = NULL; 233 struct rb_node *parent = NULL;
80 234
81 if (kernfs_type(kn) == KERNFS_DIR)
82 kn->parent->dir.subdirs++;
83
84 while (*node) { 235 while (*node) {
85 struct kernfs_node *pos; 236 struct kernfs_node *pos;
86 int result; 237 int result;
@@ -95,9 +246,15 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
95 else 246 else
96 return -EEXIST; 247 return -EEXIST;
97 } 248 }
249
98 /* add new node and rebalance the tree */ 250 /* add new node and rebalance the tree */
99 rb_link_node(&kn->rb, parent, node); 251 rb_link_node(&kn->rb, parent, node);
100 rb_insert_color(&kn->rb, &kn->parent->dir.children); 252 rb_insert_color(&kn->rb, &kn->parent->dir.children);
253
254 /* successfully added, account subdir number */
255 if (kernfs_type(kn) == KERNFS_DIR)
256 kn->parent->dir.subdirs++;
257
101 return 0; 258 return 0;
102} 259}
103 260
@@ -105,18 +262,24 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
105 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree 262 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
106 * @kn: kernfs_node of interest 263 * @kn: kernfs_node of interest
107 * 264 *
108 * Unlink @kn from its sibling rbtree which starts from 265 * Try to unlink @kn from its sibling rbtree which starts from
109 * kn->parent->dir.children. 266 * kn->parent->dir.children. Returns %true if @kn was actually
267 * removed, %false if @kn wasn't on the rbtree.
110 * 268 *
111 * Locking: 269 * Locking:
112 * mutex_lock(kernfs_mutex) 270 * mutex_lock(kernfs_mutex)
113 */ 271 */
114static void kernfs_unlink_sibling(struct kernfs_node *kn) 272static bool kernfs_unlink_sibling(struct kernfs_node *kn)
115{ 273{
274 if (RB_EMPTY_NODE(&kn->rb))
275 return false;
276
116 if (kernfs_type(kn) == KERNFS_DIR) 277 if (kernfs_type(kn) == KERNFS_DIR)
117 kn->parent->dir.subdirs--; 278 kn->parent->dir.subdirs--;
118 279
119 rb_erase(&kn->rb, &kn->parent->dir.children); 280 rb_erase(&kn->rb, &kn->parent->dir.children);
281 RB_CLEAR_NODE(&kn->rb);
282 return true;
120} 283}
121 284
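kernfs_unlink_sibling() now reports whether it actually unlinked the node, and
__kernfs_remove() later uses that to pick exactly one cleanup owner among
racing removers. A userspace model of the once-only semantic; struct node and
unlink_once() are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct node {
	bool linked;	/* stands in for !RB_EMPTY_NODE(&kn->rb) */
};

/* Only the first caller sees linked == true and wins cleanup duty. */
static bool unlink_once(struct node *n)
{
	if (!n->linked)
		return false;
	n->linked = false;	/* rb_erase() + RB_CLEAR_NODE() */
	return true;
}

int main(void)
{
	struct node n = { .linked = true };
	int first = unlink_once(&n);
	int second = unlink_once(&n);

	printf("%d %d\n", first, second);	/* 1 0 */
	return 0;
}
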
122/** 285/**
@@ -137,7 +300,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
137 if (!atomic_inc_unless_negative(&kn->active)) 300 if (!atomic_inc_unless_negative(&kn->active))
138 return NULL; 301 return NULL;
139 302
140 if (kn->flags & KERNFS_LOCKDEP) 303 if (kernfs_lockdep(kn))
141 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); 304 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
142 return kn; 305 return kn;
143} 306}
@@ -151,59 +314,57 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
151 */ 314 */
152void kernfs_put_active(struct kernfs_node *kn) 315void kernfs_put_active(struct kernfs_node *kn)
153{ 316{
317 struct kernfs_root *root = kernfs_root(kn);
154 int v; 318 int v;
155 319
156 if (unlikely(!kn)) 320 if (unlikely(!kn))
157 return; 321 return;
158 322
159 if (kn->flags & KERNFS_LOCKDEP) 323 if (kernfs_lockdep(kn))
160 rwsem_release(&kn->dep_map, 1, _RET_IP_); 324 rwsem_release(&kn->dep_map, 1, _RET_IP_);
161 v = atomic_dec_return(&kn->active); 325 v = atomic_dec_return(&kn->active);
162 if (likely(v != KN_DEACTIVATED_BIAS)) 326 if (likely(v != KN_DEACTIVATED_BIAS))
163 return; 327 return;
164 328
165 /* 329 wake_up_all(&root->deactivate_waitq);
166 * atomic_dec_return() is a mb(), we'll always see the updated
167 * kn->u.completion.
168 */
169 complete(kn->u.completion);
170} 330}
171 331
172/** 332/**
173 * kernfs_deactivate - deactivate kernfs_node 333 * kernfs_drain - drain kernfs_node
174 * @kn: kernfs_node to deactivate 334 * @kn: kernfs_node to drain
175 * 335 *
 176 * Deny new active references and drain existing ones. 336 * Drain existing usages and nuke all existing mmaps of @kn. Multiple
337 * removers may invoke this function concurrently on @kn and all will
338 * return after draining is complete.
177 */ 339 */
178static void kernfs_deactivate(struct kernfs_node *kn) 340static void kernfs_drain(struct kernfs_node *kn)
341 __releases(&kernfs_mutex) __acquires(&kernfs_mutex)
179{ 342{
180 DECLARE_COMPLETION_ONSTACK(wait); 343 struct kernfs_root *root = kernfs_root(kn);
181 int v;
182
183 BUG_ON(!(kn->flags & KERNFS_REMOVED));
184 344
185 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) 345 lockdep_assert_held(&kernfs_mutex);
186 return; 346 WARN_ON_ONCE(kernfs_active(kn));
187 347
188 kn->u.completion = (void *)&wait; 348 mutex_unlock(&kernfs_mutex);
189 349
190 if (kn->flags & KERNFS_LOCKDEP) 350 if (kernfs_lockdep(kn)) {
191 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); 351 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
192 /* atomic_add_return() is a mb(), put_active() will always see 352 if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
193 * the updated kn->u.completion.
194 */
195 v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
196
197 if (v != KN_DEACTIVATED_BIAS) {
198 if (kn->flags & KERNFS_LOCKDEP)
199 lock_contended(&kn->dep_map, _RET_IP_); 353 lock_contended(&kn->dep_map, _RET_IP_);
200 wait_for_completion(&wait);
201 } 354 }
202 355
203 if (kn->flags & KERNFS_LOCKDEP) { 356 /* but everyone should wait for draining */
357 wait_event(root->deactivate_waitq,
358 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
359
360 if (kernfs_lockdep(kn)) {
204 lock_acquired(&kn->dep_map, _RET_IP_); 361 lock_acquired(&kn->dep_map, _RET_IP_);
205 rwsem_release(&kn->dep_map, 1, _RET_IP_); 362 rwsem_release(&kn->dep_map, 1, _RET_IP_);
206 } 363 }
364
365 kernfs_unmap_bin_file(kn);
366
367 mutex_lock(&kernfs_mutex);
207} 368}
208 369
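The drain above rests on a single counting invariant: adding
KN_DEACTIVATED_BIAS makes the count negative so new getters fail, while
in-flight users are still recorded on top of the bias, and draining is done
exactly when the count falls back to the bias. A compilable C11 model of just
that invariant (the kernel sleeps on root->deactivate_waitq where this sketch
prints):

#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

#define DEACTIVATED_BIAS (INT_MIN + 1)

static atomic_int active;	/* starts at 0: active, no users */

/* atomic_inc_unless_negative() equivalent */
static int get_active(void)
{
	int v = atomic_load(&active);

	while (v >= 0)
		if (atomic_compare_exchange_weak(&active, &v, v + 1))
			return 1;	/* got a reference */
	return 0;			/* node is deactivated */
}

static void put_active(void)
{
	if (atomic_fetch_sub(&active, 1) - 1 == DEACTIVATED_BIAS)
		printf("last user gone, wake the remover\n");
}

int main(void)
{
	(void)get_active();				/* one user in flight */
	atomic_fetch_add(&active, DEACTIVATED_BIAS);	/* deactivate */
	printf("get after deactivate: %d\n", get_active());	/* 0 */
	put_active();		/* count returns to the bias: drained */
	return 0;
}
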
209/** 370/**
@@ -234,13 +395,15 @@ void kernfs_put(struct kernfs_node *kn)
234 return; 395 return;
235 root = kernfs_root(kn); 396 root = kernfs_root(kn);
236 repeat: 397 repeat:
237 /* Moving/renaming is always done while holding reference. 398 /*
399 * Moving/renaming is always done while holding reference.
238 * kn->parent won't change beneath us. 400 * kn->parent won't change beneath us.
239 */ 401 */
240 parent = kn->parent; 402 parent = kn->parent;
241 403
242 WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", 404 WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
243 parent ? parent->name : "", kn->name); 405 "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
406 parent ? parent->name : "", kn->name, atomic_read(&kn->active));
244 407
245 if (kernfs_type(kn) == KERNFS_LINK) 408 if (kernfs_type(kn) == KERNFS_LINK)
246 kernfs_put(kn->symlink.target_kn); 409 kernfs_put(kn->symlink.target_kn);
@@ -282,8 +445,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
282 kn = dentry->d_fsdata; 445 kn = dentry->d_fsdata;
283 mutex_lock(&kernfs_mutex); 446 mutex_lock(&kernfs_mutex);
284 447
285 /* The kernfs node has been deleted */ 448 /* The kernfs node has been deactivated */
286 if (kn->flags & KERNFS_REMOVED) 449 if (!kernfs_active(kn))
287 goto out_bad; 450 goto out_bad;
288 451
289 /* The kernfs node has been moved? */ 452 /* The kernfs node has been moved? */
@@ -328,6 +491,24 @@ const struct dentry_operations kernfs_dops = {
328 .d_release = kernfs_dop_release, 491 .d_release = kernfs_dop_release,
329}; 492};
330 493
494/**
495 * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
496 * @dentry: the dentry in question
497 *
498 * Return the kernfs_node associated with @dentry. If @dentry is not a
499 * kernfs one, %NULL is returned.
500 *
501 * While the returned kernfs_node will stay accessible as long as @dentry
502 * is accessible, the returned node can be in any state and the caller is
503 * fully responsible for determining what's accessible.
504 */
505struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
506{
507 if (dentry->d_sb->s_op == &kernfs_sops)
508 return dentry->d_fsdata;
509 return NULL;
510}
511
331static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, 512static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
332 const char *name, umode_t mode, 513 const char *name, umode_t mode,
333 unsigned flags) 514 unsigned flags)
@@ -352,11 +533,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
352 kn->ino = ret; 533 kn->ino = ret;
353 534
354 atomic_set(&kn->count, 1); 535 atomic_set(&kn->count, 1);
355 atomic_set(&kn->active, 0); 536 atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
537 RB_CLEAR_NODE(&kn->rb);
356 538
357 kn->name = name; 539 kn->name = name;
358 kn->mode = mode; 540 kn->mode = mode;
359 kn->flags = flags | KERNFS_REMOVED; 541 kn->flags = flags;
360 542
361 return kn; 543 return kn;
362 544
@@ -382,69 +564,44 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
382} 564}
383 565
384/** 566/**
385 * kernfs_addrm_start - prepare for kernfs_node add/remove
386 * @acxt: pointer to kernfs_addrm_cxt to be used
387 *
388 * This function is called when the caller is about to add or remove
389 * kernfs_node. This function acquires kernfs_mutex. @acxt is used
390 * to keep and pass context to other addrm functions.
391 *
392 * LOCKING:
393 * Kernel thread context (may sleep). kernfs_mutex is locked on
394 * return.
395 */
396void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
397 __acquires(kernfs_mutex)
398{
399 memset(acxt, 0, sizeof(*acxt));
400
401 mutex_lock(&kernfs_mutex);
402}
403
404/**
405 * kernfs_add_one - add kernfs_node to parent without warning 567 * kernfs_add_one - add kernfs_node to parent without warning
406 * @acxt: addrm context to use
407 * @kn: kernfs_node to be added 568 * @kn: kernfs_node to be added
408 * 569 *
409 * The caller must already have initialized @kn->parent. This 570 * The caller must already have initialized @kn->parent. This
410 * function increments nlink of the parent's inode if @kn is a 571 * function increments nlink of the parent's inode if @kn is a
411 * directory and link into the children list of the parent. 572 * directory and link into the children list of the parent.
412 * 573 *
413 * This function should be called between calls to
414 * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
415 * the same @acxt as passed to kernfs_addrm_start().
416 *
417 * LOCKING:
418 * Determined by kernfs_addrm_start().
419 *
420 * RETURNS: 574 * RETURNS:
421 * 0 on success, -EEXIST if entry with the given name already 575 * 0 on success, -EEXIST if entry with the given name already
422 * exists. 576 * exists.
423 */ 577 */
424int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) 578int kernfs_add_one(struct kernfs_node *kn)
425{ 579{
426 struct kernfs_node *parent = kn->parent; 580 struct kernfs_node *parent = kn->parent;
427 bool has_ns = kernfs_ns_enabled(parent);
428 struct kernfs_iattrs *ps_iattr; 581 struct kernfs_iattrs *ps_iattr;
582 bool has_ns;
429 int ret; 583 int ret;
430 584
431 if (has_ns != (bool)kn->ns) { 585 mutex_lock(&kernfs_mutex);
432 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", 586
433 has_ns ? "required" : "invalid", parent->name, kn->name); 587 ret = -EINVAL;
434 return -EINVAL; 588 has_ns = kernfs_ns_enabled(parent);
435 } 589 if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
590 has_ns ? "required" : "invalid", parent->name, kn->name))
591 goto out_unlock;
436 592
437 if (kernfs_type(parent) != KERNFS_DIR) 593 if (kernfs_type(parent) != KERNFS_DIR)
438 return -EINVAL; 594 goto out_unlock;
439 595
440 if (parent->flags & KERNFS_REMOVED) 596 ret = -ENOENT;
441 return -ENOENT; 597 if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
598 goto out_unlock;
442 599
443 kn->hash = kernfs_name_hash(kn->name, kn->ns); 600 kn->hash = kernfs_name_hash(kn->name, kn->ns);
444 601
445 ret = kernfs_link_sibling(kn); 602 ret = kernfs_link_sibling(kn);
446 if (ret) 603 if (ret)
447 return ret; 604 goto out_unlock;
448 605
449 /* Update timestamps on the parent */ 606 /* Update timestamps on the parent */
450 ps_iattr = parent->iattr; 607 ps_iattr = parent->iattr;
@@ -453,82 +610,22 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
453 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; 610 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
454 } 611 }
455 612
456 /* Mark the entry added into directory tree */ 613 mutex_unlock(&kernfs_mutex);
457 kn->flags &= ~KERNFS_REMOVED;
458
459 return 0;
460}
461
462/**
463 * kernfs_remove_one - remove kernfs_node from parent
464 * @acxt: addrm context to use
465 * @kn: kernfs_node to be removed
466 *
467 * Mark @kn removed and drop nlink of parent inode if @kn is a
468 * directory. @kn is unlinked from the children list.
469 *
470 * This function should be called between calls to
471 * kernfs_addrm_start() and kernfs_addrm_finish() and should be
472 * passed the same @acxt as passed to kernfs_addrm_start().
473 *
474 * LOCKING:
475 * Determined by kernfs_addrm_start().
476 */
477static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
478 struct kernfs_node *kn)
479{
480 struct kernfs_iattrs *ps_iattr;
481 614
482 /* 615 /*
483 * Removal can be called multiple times on the same node. Only the 616 * Activate the new node unless CREATE_DEACTIVATED is requested.
484 * first invocation is effective and puts the base ref. 617 * If not activated here, the kernfs user is responsible for
618 * activating the node with kernfs_activate(). A node which hasn't
619 * been activated is not visible to userland and its removal won't
620 * trigger deactivation.
485 */ 621 */
486 if (kn->flags & KERNFS_REMOVED) 622 if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
487 return; 623 kernfs_activate(kn);
488 624 return 0;
489 if (kn->parent) {
490 kernfs_unlink_sibling(kn);
491
492 /* Update timestamps on the parent */
493 ps_iattr = kn->parent->iattr;
494 if (ps_iattr) {
495 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
496 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
497 }
498 }
499
500 kn->flags |= KERNFS_REMOVED;
501 kn->u.removed_list = acxt->removed;
502 acxt->removed = kn;
503}
504 625
505/** 626out_unlock:
506 * kernfs_addrm_finish - finish up kernfs_node add/remove
507 * @acxt: addrm context to finish up
508 *
509 * Finish up kernfs_node add/remove. Resources acquired by
510 * kernfs_addrm_start() are released and removed kernfs_nodes are
511 * cleaned up.
512 *
513 * LOCKING:
514 * kernfs_mutex is released.
515 */
516void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
517 __releases(kernfs_mutex)
518{
519 /* release resources acquired by kernfs_addrm_start() */
520 mutex_unlock(&kernfs_mutex); 627 mutex_unlock(&kernfs_mutex);
521 628 return ret;
522 /* kill removed kernfs_nodes */
523 while (acxt->removed) {
524 struct kernfs_node *kn = acxt->removed;
525
526 acxt->removed = kn->u.removed_list;
527
528 kernfs_deactivate(kn);
529 kernfs_unmap_bin_file(kn);
530 kernfs_put(kn);
531 }
532} 629}
533 630
534/** 631/**
@@ -599,13 +696,15 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
599 696
600/** 697/**
601 * kernfs_create_root - create a new kernfs hierarchy 698 * kernfs_create_root - create a new kernfs hierarchy
602 * @kdops: optional directory syscall operations for the hierarchy 699 * @scops: optional syscall operations for the hierarchy
700 * @flags: KERNFS_ROOT_* flags
603 * @priv: opaque data associated with the new directory 701 * @priv: opaque data associated with the new directory
604 * 702 *
605 * Returns the root of the new hierarchy on success, ERR_PTR() value on 703 * Returns the root of the new hierarchy on success, ERR_PTR() value on
606 * failure. 704 * failure.
607 */ 705 */
608struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) 706struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
707 unsigned int flags, void *priv)
609{ 708{
610 struct kernfs_root *root; 709 struct kernfs_root *root;
611 struct kernfs_node *kn; 710 struct kernfs_node *kn;
@@ -624,12 +723,16 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv)
624 return ERR_PTR(-ENOMEM); 723 return ERR_PTR(-ENOMEM);
625 } 724 }
626 725
627 kn->flags &= ~KERNFS_REMOVED;
628 kn->priv = priv; 726 kn->priv = priv;
629 kn->dir.root = root; 727 kn->dir.root = root;
630 728
631 root->dir_ops = kdops; 729 root->syscall_ops = scops;
730 root->flags = flags;
632 root->kn = kn; 731 root->kn = kn;
732 init_waitqueue_head(&root->deactivate_waitq);
733
734 if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
735 kernfs_activate(kn);
633 736
634 return root; 737 return root;
635} 738}
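A hedged, kernel-context usage sketch of the reworked creation API: build the
hierarchy with every node deactivated, then publish it atomically. This is not
code from the patch; the my_-prefixed names are hypothetical, while the kernfs
calls and the flag are the ones introduced above.

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernfs.h>

static struct kernfs_syscall_ops my_scops;	/* empty ops, illustrative */
static struct kernfs_root *my_root;

static int __init my_init(void)
{
	my_root = kernfs_create_root(&my_scops,
				     KERNFS_ROOT_CREATE_DEACTIVATED, NULL);
	if (IS_ERR(my_root))
		return PTR_ERR(my_root);

	/* ... create directories and files under my_root->kn; nothing is
	 * visible to userland yet, so an error here can bail out without
	 * worrying about half-published state ... */

	kernfs_activate(my_root->kn);	/* publish the whole tree at once */
	return 0;
}
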
@@ -660,7 +763,6 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
660 const char *name, umode_t mode, 763 const char *name, umode_t mode,
661 void *priv, const void *ns) 764 void *priv, const void *ns)
662{ 765{
663 struct kernfs_addrm_cxt acxt;
664 struct kernfs_node *kn; 766 struct kernfs_node *kn;
665 int rc; 767 int rc;
666 768
@@ -674,10 +776,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
674 kn->priv = priv; 776 kn->priv = priv;
675 777
676 /* link in */ 778 /* link in */
677 kernfs_addrm_start(&acxt); 779 rc = kernfs_add_one(kn);
678 rc = kernfs_add_one(&acxt, kn);
679 kernfs_addrm_finish(&acxt);
680
681 if (!rc) 780 if (!rc)
682 return kn; 781 return kn;
683 782
@@ -703,7 +802,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
703 kn = kernfs_find_ns(parent, dentry->d_name.name, ns); 802 kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
704 803
705 /* no such entry */ 804 /* no such entry */
706 if (!kn) { 805 if (!kn || !kernfs_active(kn)) {
707 ret = NULL; 806 ret = NULL;
708 goto out_unlock; 807 goto out_unlock;
709 } 808 }
@@ -728,23 +827,37 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
728 umode_t mode) 827 umode_t mode)
729{ 828{
730 struct kernfs_node *parent = dir->i_private; 829 struct kernfs_node *parent = dir->i_private;
731 struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; 830 struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
831 int ret;
732 832
733 if (!kdops || !kdops->mkdir) 833 if (!scops || !scops->mkdir)
734 return -EPERM; 834 return -EPERM;
735 835
736 return kdops->mkdir(parent, dentry->d_name.name, mode); 836 if (!kernfs_get_active(parent))
837 return -ENODEV;
838
839 ret = scops->mkdir(parent, dentry->d_name.name, mode);
840
841 kernfs_put_active(parent);
842 return ret;
737} 843}
738 844
739static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) 845static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
740{ 846{
741 struct kernfs_node *kn = dentry->d_fsdata; 847 struct kernfs_node *kn = dentry->d_fsdata;
742 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; 848 struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
849 int ret;
743 850
744 if (!kdops || !kdops->rmdir) 851 if (!scops || !scops->rmdir)
745 return -EPERM; 852 return -EPERM;
746 853
747 return kdops->rmdir(kn); 854 if (!kernfs_get_active(kn))
855 return -ENODEV;
856
857 ret = scops->rmdir(kn);
858
859 kernfs_put_active(kn);
860 return ret;
748} 861}
749 862
750static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, 863static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -752,12 +865,25 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
752{ 865{
753 struct kernfs_node *kn = old_dentry->d_fsdata; 866 struct kernfs_node *kn = old_dentry->d_fsdata;
754 struct kernfs_node *new_parent = new_dir->i_private; 867 struct kernfs_node *new_parent = new_dir->i_private;
755 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; 868 struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
869 int ret;
756 870
757 if (!kdops || !kdops->rename) 871 if (!scops || !scops->rename)
758 return -EPERM; 872 return -EPERM;
759 873
760 return kdops->rename(kn, new_parent, new_dentry->d_name.name); 874 if (!kernfs_get_active(kn))
875 return -ENODEV;
876
877 if (!kernfs_get_active(new_parent)) {
878 kernfs_put_active(kn);
879 return -ENODEV;
880 }
881
882 ret = scops->rename(kn, new_parent, new_dentry->d_name.name);
883
884 kernfs_put_active(new_parent);
885 kernfs_put_active(kn);
886 return ret;
761} 887}
762 888
763const struct inode_operations kernfs_dir_iops = { 889const struct inode_operations kernfs_dir_iops = {
@@ -830,23 +956,104 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
830 return pos->parent; 956 return pos->parent;
831} 957}
832 958
833static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, 959/**
834 struct kernfs_node *kn) 960 * kernfs_activate - activate a node which started deactivated
961 * @kn: kernfs_node whose subtree is to be activated
962 *
963 * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
964 * needs to be explicitly activated. A node which hasn't been activated
965 * isn't visible to userland and deactivation is skipped during its
966 * removal. This is useful to construct atomic init sequences where
967 * creation of multiple nodes should either succeed or fail atomically.
968 *
969 * The caller is responsible for ensuring that this function is not called
970 * after kernfs_remove*() is invoked on @kn.
971 */
972void kernfs_activate(struct kernfs_node *kn)
835{ 973{
836 struct kernfs_node *pos, *next; 974 struct kernfs_node *pos;
837 975
838 if (!kn) 976 mutex_lock(&kernfs_mutex);
977
978 pos = NULL;
979 while ((pos = kernfs_next_descendant_post(pos, kn))) {
980 if (!pos || (pos->flags & KERNFS_ACTIVATED))
981 continue;
982
983 WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
984 WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
985
986 atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
987 pos->flags |= KERNFS_ACTIVATED;
988 }
989
990 mutex_unlock(&kernfs_mutex);
991}
992
993static void __kernfs_remove(struct kernfs_node *kn)
994{
995 struct kernfs_node *pos;
996
997 lockdep_assert_held(&kernfs_mutex);
998
999 /*
1000 * Short-circuit if non-root @kn has already finished removal.
1001 * This is for kernfs_remove_self() which plays with active ref
1002 * after removal.
1003 */
1004 if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
839 return; 1005 return;
840 1006
841 pr_debug("kernfs %s: removing\n", kn->name); 1007 pr_debug("kernfs %s: removing\n", kn->name);
842 1008
843 next = NULL; 1009 /* prevent any new usage under @kn by deactivating all nodes */
1010 pos = NULL;
1011 while ((pos = kernfs_next_descendant_post(pos, kn)))
1012 if (kernfs_active(pos))
1013 atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
1014
1015 /* deactivate and unlink the subtree node-by-node */
844 do { 1016 do {
845 pos = next; 1017 pos = kernfs_leftmost_descendant(kn);
846 next = kernfs_next_descendant_post(pos, kn); 1018
847 if (pos) 1019 /*
848 kernfs_remove_one(acxt, pos); 1020 * kernfs_drain() drops kernfs_mutex temporarily and @pos's
849 } while (next); 1021 * base ref could have been put by someone else by the time
1022 * the function returns. Make sure it doesn't go away
1023 * underneath us.
1024 */
1025 kernfs_get(pos);
1026
1027 /*
1028 * Drain iff @kn was activated. This avoids draining and
1029 * its lockdep annotations for nodes which have never been
1030 * activated and allows embedding kernfs_remove() in create
1031 * error paths without worrying about draining.
1032 */
1033 if (kn->flags & KERNFS_ACTIVATED)
1034 kernfs_drain(pos);
1035 else
1036 WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
1037
1038 /*
1039 * kernfs_unlink_sibling() succeeds once per node. Use it
1040 * to decide who's responsible for cleanups.
1041 */
1042 if (!pos->parent || kernfs_unlink_sibling(pos)) {
1043 struct kernfs_iattrs *ps_iattr =
1044 pos->parent ? pos->parent->iattr : NULL;
1045
1046 /* update timestamps on the parent */
1047 if (ps_iattr) {
1048 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
1049 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
1050 }
1051
1052 kernfs_put(pos);
1053 }
1054
1055 kernfs_put(pos);
1056 } while (pos != kn);
850} 1057}
851 1058
852/** 1059/**
@@ -857,11 +1064,140 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
857 */ 1064 */
858void kernfs_remove(struct kernfs_node *kn) 1065void kernfs_remove(struct kernfs_node *kn)
859{ 1066{
860 struct kernfs_addrm_cxt acxt; 1067 mutex_lock(&kernfs_mutex);
1068 __kernfs_remove(kn);
1069 mutex_unlock(&kernfs_mutex);
1070}
1071
1072/**
1073 * kernfs_break_active_protection - break out of active protection
1074 * @kn: the self kernfs_node
1075 *
1076 * The caller must be running off of a kernfs operation which is invoked
1077 * with an active reference - e.g. one of kernfs_ops. Each invocation of
1078 * this function must also be matched with an invocation of
1079 * kernfs_unbreak_active_protection().
1080 *
1081 * This function releases the active reference of @kn the caller is
1082 * holding. Once this function is called, @kn may be removed at any point
1083 * and the caller is solely responsible for ensuring that the objects it
1084 * dereferences are accessible.
1085 */
1086void kernfs_break_active_protection(struct kernfs_node *kn)
1087{
1088 /*
 1089 * Take ourself out of the active ref dependency chain. If
1090 * we're called without an active ref, lockdep will complain.
1091 */
1092 kernfs_put_active(kn);
1093}
1094
1095/**
1096 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
1097 * @kn: the self kernfs_node
1098 *
1099 * If kernfs_break_active_protection() was called, this function must be
1100 * invoked before finishing the kernfs operation. Note that while this
1101 * function restores the active reference, it doesn't and can't actually
 1102 * restore the active protection - @kn may already be removed or in the process of
1103 * being removed. Once kernfs_break_active_protection() is invoked, that
1104 * protection is irreversibly gone for the kernfs operation instance.
1105 *
1106 * While this function may be called at any point after
1107 * kernfs_break_active_protection() is invoked, its most useful location
1108 * would be right before the enclosing kernfs operation returns.
1109 */
1110void kernfs_unbreak_active_protection(struct kernfs_node *kn)
1111{
1112 /*
1113 * @kn->active could be in any state; however, the increment we do
1114 * here will be undone as soon as the enclosing kernfs operation
1115 * finishes and this temporary bump can't break anything. If @kn
1116 * is alive, nothing changes. If @kn is being deactivated, the
1117 * soon-to-follow put will either finish deactivation or restore
1118 * deactivated state. If @kn is already removed, the temporary
1119 * bump is guaranteed to be gone before @kn is released.
1120 */
1121 atomic_inc(&kn->active);
1122 if (kernfs_lockdep(kn))
1123 rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
1124}
1125
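As a hedged illustration of how the pair above is meant to be used: a
hypothetical kernfs_ops write handler that must take a lock which is also held
around removal elsewhere, so holding the active ref across it would deadlock.
my_write() and my_big_lock are invented names; the two kernfs calls are the
ones just added.

#include <linux/kernfs.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_big_lock);	/* also taken on removal paths */

static ssize_t my_write(struct kernfs_open_file *of, char *buf,
			size_t bytes, loff_t off)
{
	struct kernfs_node *kn = of->kn;

	kernfs_break_active_protection(kn);	/* drop the dependency edge */
	mutex_lock(&my_big_lock);
	/* ... kn may now be removed at any point; only touch objects whose
	 * lifetime my_big_lock itself guarantees ... */
	mutex_unlock(&my_big_lock);
	kernfs_unbreak_active_protection(kn);	/* restore before returning */
	return bytes;
}
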
1126/**
1127 * kernfs_remove_self - remove a kernfs_node from its own method
1128 * @kn: the self kernfs_node to remove
1129 *
1130 * The caller must be running off of a kernfs operation which is invoked
1131 * with an active reference - e.g. one of kernfs_ops. This can be used to
1132 * implement a file operation which deletes itself.
1133 *
1134 * For example, the "delete" file for a sysfs device directory can be
1135 * implemented by invoking kernfs_remove_self() on the "delete" file
1136 * itself. This function breaks the circular dependency of trying to
1137 * deactivate self while holding an active ref itself. It isn't necessary
1138 * to modify the usual removal path to use kernfs_remove_self(). The
1139 * "delete" implementation can simply invoke kernfs_remove_self() on self
1140 * before proceeding with the usual removal path. kernfs will ignore later
1141 * kernfs_remove() on self.
1142 *
1143 * kernfs_remove_self() can be called multiple times concurrently on the
1144 * same kernfs_node. Only the first one actually performs removal and
1145 * returns %true. All others will wait until the kernfs operation which
1146 * won self-removal finishes and return %false. Note that the losers wait
1147 * for the completion of not only the winning kernfs_remove_self() but also
1148 * the whole kernfs_ops which won the arbitration. This can be used to
 1149 * guarantee, for example, that all concurrent writes to a "delete" file
1150 * finish only after the whole operation is complete.
1151 */
1152bool kernfs_remove_self(struct kernfs_node *kn)
1153{
1154 bool ret;
1155
1156 mutex_lock(&kernfs_mutex);
1157 kernfs_break_active_protection(kn);
1158
1159 /*
1160 * SUICIDAL is used to arbitrate among competing invocations. Only
1161 * the first one will actually perform removal. When the removal
1162 * is complete, SUICIDED is set and the active ref is restored
1163 * while holding kernfs_mutex. The ones which lost arbitration
 1164 * wait for SUICIDED && drained, which can happen only after the
1165 * enclosing kernfs operation which executed the winning instance
1166 * of kernfs_remove_self() finished.
1167 */
1168 if (!(kn->flags & KERNFS_SUICIDAL)) {
1169 kn->flags |= KERNFS_SUICIDAL;
1170 __kernfs_remove(kn);
1171 kn->flags |= KERNFS_SUICIDED;
1172 ret = true;
1173 } else {
1174 wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
1175 DEFINE_WAIT(wait);
861 1176
862 kernfs_addrm_start(&acxt); 1177 while (true) {
863 __kernfs_remove(&acxt, kn); 1178 prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
864 kernfs_addrm_finish(&acxt); 1179
1180 if ((kn->flags & KERNFS_SUICIDED) &&
1181 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
1182 break;
1183
1184 mutex_unlock(&kernfs_mutex);
1185 schedule();
1186 mutex_lock(&kernfs_mutex);
1187 }
1188 finish_wait(waitq, &wait);
1189 WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
1190 ret = false;
1191 }
1192
1193 /*
1194 * This must be done while holding kernfs_mutex; otherwise, waiting
1195 * for SUICIDED && deactivated could finish prematurely.
1196 */
1197 kernfs_unbreak_active_protection(kn);
1198
1199 mutex_unlock(&kernfs_mutex);
1200 return ret;
865} 1201}
866 1202
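Stripped of the locking and the waitqueue, the SUICIDAL/SUICIDED arbitration
is a claim-then-mark protocol: the first caller claims removal, and everyone
after it loses and (in the kernel) sleeps until the winner's whole operation
has drained. A sequential userspace model, with the flag names kept and
everything else illustrative:

#include <stdbool.h>
#include <stdio.h>

enum { SUICIDAL = 1, SUICIDED = 2 };

static bool remove_self(int *flags)
{
	if (!(*flags & SUICIDAL)) {
		*flags |= SUICIDAL;	/* claim: we do the removal */
		/* __kernfs_remove() would run here */
		*flags |= SUICIDED;	/* mark: removal is done */
		return true;
	}
	return false;	/* lost the race; the kernel waits for SUICIDED here */
}

int main(void)
{
	int flags = 0;
	int winner = remove_self(&flags);
	int loser = remove_self(&flags);

	printf("%d %d\n", winner, loser);	/* 1 0 */
	return 0;
}
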
867/** 1203/**
@@ -876,7 +1212,6 @@ void kernfs_remove(struct kernfs_node *kn)
876int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, 1212int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
877 const void *ns) 1213 const void *ns)
878{ 1214{
879 struct kernfs_addrm_cxt acxt;
880 struct kernfs_node *kn; 1215 struct kernfs_node *kn;
881 1216
882 if (!parent) { 1217 if (!parent) {
@@ -885,13 +1220,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
885 return -ENOENT; 1220 return -ENOENT;
886 } 1221 }
887 1222
888 kernfs_addrm_start(&acxt); 1223 mutex_lock(&kernfs_mutex);
889 1224
890 kn = kernfs_find_ns(parent, name, ns); 1225 kn = kernfs_find_ns(parent, name, ns);
891 if (kn) 1226 if (kn)
892 __kernfs_remove(&acxt, kn); 1227 __kernfs_remove(kn);
893 1228
894 kernfs_addrm_finish(&acxt); 1229 mutex_unlock(&kernfs_mutex);
895 1230
896 if (kn) 1231 if (kn)
897 return 0; 1232 return 0;
@@ -909,12 +1244,18 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
909int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, 1244int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
910 const char *new_name, const void *new_ns) 1245 const char *new_name, const void *new_ns)
911{ 1246{
1247 struct kernfs_node *old_parent;
1248 const char *old_name = NULL;
912 int error; 1249 int error;
913 1250
1251 /* can't move or rename root */
1252 if (!kn->parent)
1253 return -EINVAL;
1254
914 mutex_lock(&kernfs_mutex); 1255 mutex_lock(&kernfs_mutex);
915 1256
916 error = -ENOENT; 1257 error = -ENOENT;
917 if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) 1258 if (!kernfs_active(kn) || !kernfs_active(new_parent))
918 goto out; 1259 goto out;
919 1260
920 error = 0; 1261 error = 0;
@@ -932,13 +1273,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
932 new_name = kstrdup(new_name, GFP_KERNEL); 1273 new_name = kstrdup(new_name, GFP_KERNEL);
933 if (!new_name) 1274 if (!new_name)
934 goto out; 1275 goto out;
935 1276 } else {
936 if (kn->flags & KERNFS_STATIC_NAME) 1277 new_name = NULL;
937 kn->flags &= ~KERNFS_STATIC_NAME;
938 else
939 kfree(kn->name);
940
941 kn->name = new_name;
942 } 1278 }
943 1279
944 /* 1280 /*
@@ -946,12 +1282,29 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
946 */ 1282 */
947 kernfs_unlink_sibling(kn); 1283 kernfs_unlink_sibling(kn);
948 kernfs_get(new_parent); 1284 kernfs_get(new_parent);
949 kernfs_put(kn->parent); 1285
1286 /* rename_lock protects ->parent and ->name accessors */
1287 spin_lock_irq(&kernfs_rename_lock);
1288
1289 old_parent = kn->parent;
1290 kn->parent = new_parent;
1291
950 kn->ns = new_ns; 1292 kn->ns = new_ns;
1293 if (new_name) {
1294 if (!(kn->flags & KERNFS_STATIC_NAME))
1295 old_name = kn->name;
1296 kn->flags &= ~KERNFS_STATIC_NAME;
1297 kn->name = new_name;
1298 }
1299
1300 spin_unlock_irq(&kernfs_rename_lock);
1301
951 kn->hash = kernfs_name_hash(kn->name, kn->ns); 1302 kn->hash = kernfs_name_hash(kn->name, kn->ns);
952 kn->parent = new_parent;
953 kernfs_link_sibling(kn); 1303 kernfs_link_sibling(kn);
954 1304
1305 kernfs_put(old_parent);
1306 kfree(old_name);
1307
955 error = 0; 1308 error = 0;
956 out: 1309 out:
957 mutex_unlock(&kernfs_mutex); 1310 mutex_unlock(&kernfs_mutex);
@@ -974,7 +1327,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
974 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) 1327 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
975{ 1328{
976 if (pos) { 1329 if (pos) {
977 int valid = !(pos->flags & KERNFS_REMOVED) && 1330 int valid = kernfs_active(pos) &&
978 pos->parent == parent && hash == pos->hash; 1331 pos->parent == parent && hash == pos->hash;
979 kernfs_put(pos); 1332 kernfs_put(pos);
980 if (!valid) 1333 if (!valid)
@@ -993,8 +1346,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
993 break; 1346 break;
994 } 1347 }
995 } 1348 }
996 /* Skip over entries in the wrong namespace */ 1349 /* Skip over entries which are dying/dead or in the wrong namespace */
997 while (pos && pos->ns != ns) { 1350 while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
998 struct rb_node *node = rb_next(&pos->rb); 1351 struct rb_node *node = rb_next(&pos->rb);
999 if (!node) 1352 if (!node)
1000 pos = NULL; 1353 pos = NULL;
@@ -1008,14 +1361,15 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
1008 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) 1361 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
1009{ 1362{
1010 pos = kernfs_dir_pos(ns, parent, ino, pos); 1363 pos = kernfs_dir_pos(ns, parent, ino, pos);
1011 if (pos) 1364 if (pos) {
1012 do { 1365 do {
1013 struct rb_node *node = rb_next(&pos->rb); 1366 struct rb_node *node = rb_next(&pos->rb);
1014 if (!node) 1367 if (!node)
1015 pos = NULL; 1368 pos = NULL;
1016 else 1369 else
1017 pos = rb_to_kn(node); 1370 pos = rb_to_kn(node);
1018 } while (pos && pos->ns != ns); 1371 } while (pos && (!kernfs_active(pos) || pos->ns != ns));
1372 }
1019 return pos; 1373 return pos;
1020} 1374}
1021 1375
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index dbf397bfdff2..5e9a80cfc3d8 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -252,10 +252,18 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
252 size_t count, loff_t *ppos) 252 size_t count, loff_t *ppos)
253{ 253{
254 struct kernfs_open_file *of = kernfs_of(file); 254 struct kernfs_open_file *of = kernfs_of(file);
255 ssize_t len = min_t(size_t, count, PAGE_SIZE);
256 const struct kernfs_ops *ops; 255 const struct kernfs_ops *ops;
256 size_t len;
257 char *buf; 257 char *buf;
258 258
259 if (of->atomic_write_len) {
260 len = count;
261 if (len > of->atomic_write_len)
262 return -E2BIG;
263 } else {
264 len = min_t(size_t, count, PAGE_SIZE);
265 }
266
259 buf = kmalloc(len + 1, GFP_KERNEL); 267 buf = kmalloc(len + 1, GFP_KERNEL);
260 if (!buf) 268 if (!buf)
261 return -ENOMEM; 269 return -ENOMEM;
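The hunk above separates two policies: without atomic_write_len, oversized
writes are silently capped at one page as before; with it, a write either fits
whole or fails, so the handler never sees a torn, partial update. A standalone
model of the length decision, with PAGE_SIZE fixed at 4096 and -1 standing in
for -E2BIG:

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096

static long write_len(size_t count, size_t atomic_write_len)
{
	if (atomic_write_len)
		return count <= atomic_write_len ? (long)count : -1;
	return count < PAGE_SIZE ? (long)count : PAGE_SIZE;	/* old cap */
}

int main(void)
{
	printf("%ld\n", write_len(100, 0));	/* 100: fits anyway */
	printf("%ld\n", write_len(8192, 0));	/* 4096: silently capped */
	printf("%ld\n", write_len(8192, 4096));	/* -1: rejected (-E2BIG) */
	return 0;
}
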
@@ -476,6 +484,8 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
476 484
477 ops = kernfs_ops(of->kn); 485 ops = kernfs_ops(of->kn);
478 rc = ops->mmap(of, vma); 486 rc = ops->mmap(of, vma);
487 if (rc)
488 goto out_put;
479 489
480 /* 490 /*
481 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() 491 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
@@ -600,6 +610,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
600static int kernfs_fop_open(struct inode *inode, struct file *file) 610static int kernfs_fop_open(struct inode *inode, struct file *file)
601{ 611{
602 struct kernfs_node *kn = file->f_path.dentry->d_fsdata; 612 struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
613 struct kernfs_root *root = kernfs_root(kn);
603 const struct kernfs_ops *ops; 614 const struct kernfs_ops *ops;
604 struct kernfs_open_file *of; 615 struct kernfs_open_file *of;
605 bool has_read, has_write, has_mmap; 616 bool has_read, has_write, has_mmap;
@@ -614,14 +625,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
614 has_write = ops->write || ops->mmap; 625 has_write = ops->write || ops->mmap;
615 has_mmap = ops->mmap; 626 has_mmap = ops->mmap;
616 627
617 /* check perms and supported operations */ 628 /* see the flag definition for details */
618 if ((file->f_mode & FMODE_WRITE) && 629 if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
619 (!(inode->i_mode & S_IWUGO) || !has_write)) 630 if ((file->f_mode & FMODE_WRITE) &&
620 goto err_out; 631 (!(inode->i_mode & S_IWUGO) || !has_write))
632 goto err_out;
621 633
622 if ((file->f_mode & FMODE_READ) && 634 if ((file->f_mode & FMODE_READ) &&
623 (!(inode->i_mode & S_IRUGO) || !has_read)) 635 (!(inode->i_mode & S_IRUGO) || !has_read))
624 goto err_out; 636 goto err_out;
637 }
625 638
626 /* allocate a kernfs_open_file for the file */ 639 /* allocate a kernfs_open_file for the file */
627 error = -ENOMEM; 640 error = -ENOMEM;
@@ -653,6 +666,12 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
653 of->file = file; 666 of->file = file;
654 667
655 /* 668 /*
 669 * Write path needs to access atomic_write_len outside the active reference.
670 * Cache it in open_file. See kernfs_fop_write() for details.
671 */
672 of->atomic_write_len = ops->atomic_write_len;
673
674 /*
656 * Always instantiate seq_file even if read access doesn't use 675 * Always instantiate seq_file even if read access doesn't use
657 * seq_file or is not requested. This unifies private data access 676 * seq_file or is not requested. This unifies private data access
658 * and readable regular files are the vast majority anyway. 677 * and readable regular files are the vast majority anyway.
@@ -820,7 +839,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
820 bool name_is_static, 839 bool name_is_static,
821 struct lock_class_key *key) 840 struct lock_class_key *key)
822{ 841{
823 struct kernfs_addrm_cxt acxt;
824 struct kernfs_node *kn; 842 struct kernfs_node *kn;
825 unsigned flags; 843 unsigned flags;
826 int rc; 844 int rc;
@@ -855,10 +873,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
855 if (ops->mmap) 873 if (ops->mmap)
856 kn->flags |= KERNFS_HAS_MMAP; 874 kn->flags |= KERNFS_HAS_MMAP;
857 875
858 kernfs_addrm_start(&acxt); 876 rc = kernfs_add_one(kn);
859 rc = kernfs_add_one(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861
862 if (rc) { 877 if (rc) {
863 kernfs_put(kn); 878 kernfs_put(kn);
864 return ERR_PTR(rc); 879 return ERR_PTR(rc);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index e55126f85bd2..985217626e66 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -48,14 +48,18 @@ void __init kernfs_inode_init(void)
48 48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) 49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{ 50{
51 static DEFINE_MUTEX(iattr_mutex);
52 struct kernfs_iattrs *ret;
51 struct iattr *iattrs; 53 struct iattr *iattrs;
52 54
55 mutex_lock(&iattr_mutex);
56
53 if (kn->iattr) 57 if (kn->iattr)
54 return kn->iattr; 58 goto out_unlock;
55 59
56 kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); 60 kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
57 if (!kn->iattr) 61 if (!kn->iattr)
58 return NULL; 62 goto out_unlock;
59 iattrs = &kn->iattr->ia_iattr; 63 iattrs = &kn->iattr->ia_iattr;
60 64
61 /* assign default attributes */ 65 /* assign default attributes */
@@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; 69 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66 70
67 simple_xattrs_init(&kn->iattr->xattrs); 71 simple_xattrs_init(&kn->iattr->xattrs);
68 72out_unlock:
69 return kn->iattr; 73 ret = kn->iattr;
74 mutex_unlock(&iattr_mutex);
75 return ret;
70} 76}
71 77
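The hunk above serializes the lazy allocation of kn->iattr: previously two
racing callers could both observe NULL, both allocate, and leak one of the two
buffers. A pthread-based userspace model of the fixed pattern; struct attrs
and get_attrs() are illustrative:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct attrs { int mode; };

static pthread_mutex_t attr_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Every caller takes the mutex before testing the slot, so the
 * check-then-allocate sequence can no longer interleave. */
static struct attrs *get_attrs(struct attrs **slot)
{
	struct attrs *ret;

	pthread_mutex_lock(&attr_mutex);
	if (!*slot)
		*slot = calloc(1, sizeof(**slot)); /* may stay NULL: ENOMEM */
	ret = *slot;
	pthread_mutex_unlock(&attr_mutex);
	return ret;
}

int main(void)
{
	struct attrs *a = NULL;
	struct attrs *first = get_attrs(&a);
	struct attrs *second = get_attrs(&a);

	printf("same allocation: %d\n", first == second);	/* 1 */
	free(a);
	return 0;
}
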
72static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) 78static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
@@ -355,7 +361,7 @@ void kernfs_evict_inode(struct inode *inode)
355{ 361{
356 struct kernfs_node *kn = inode->i_private; 362 struct kernfs_node *kn = inode->i_private;
357 363
358 truncate_inode_pages(&inode->i_data, 0); 364 truncate_inode_pages_final(&inode->i_data);
359 clear_inode(inode); 365 clear_inode(inode);
360 kernfs_put(kn); 366 kernfs_put(kn);
361} 367}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index eb536b76374a..8be13b2a079b 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -26,7 +26,8 @@ struct kernfs_iattrs {
26 struct simple_xattrs xattrs; 26 struct simple_xattrs xattrs;
27}; 27};
28 28
29#define KN_DEACTIVATED_BIAS INT_MIN 29/* +1 to avoid triggering overflow warning when negating it */
30#define KN_DEACTIVATED_BIAS (INT_MIN + 1)
30 31
31/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ 32/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
32 33
@@ -45,13 +46,6 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
45} 46}
46 47
47/* 48/*
48 * Context structure to be used while adding/removing nodes.
49 */
50struct kernfs_addrm_cxt {
51 struct kernfs_node *removed;
52};
53
54/*
55 * mount.c 49 * mount.c
56 */ 50 */
57struct kernfs_super_info { 51struct kernfs_super_info {
@@ -71,6 +65,7 @@ struct kernfs_super_info {
71}; 65};
72#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) 66#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
73 67
68extern const struct super_operations kernfs_sops;
74extern struct kmem_cache *kernfs_node_cache; 69extern struct kmem_cache *kernfs_node_cache;
75 70
76/* 71/*
@@ -100,9 +95,7 @@ extern const struct inode_operations kernfs_dir_iops;
100 95
101struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); 96struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
102void kernfs_put_active(struct kernfs_node *kn); 97void kernfs_put_active(struct kernfs_node *kn);
103void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); 98int kernfs_add_one(struct kernfs_node *kn);
104int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn);
105void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
106struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, 99struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
107 const char *name, umode_t mode, 100 const char *name, umode_t mode,
108 unsigned flags); 101 unsigned flags);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 0f4152defe7b..95dcd1d558bb 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -19,13 +19,50 @@
19 19
20struct kmem_cache *kernfs_node_cache; 20struct kmem_cache *kernfs_node_cache;
21 21
22static const struct super_operations kernfs_sops = { 22static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
23{
24 struct kernfs_root *root = kernfs_info(sb)->root;
25 struct kernfs_syscall_ops *scops = root->syscall_ops;
26
27 if (scops && scops->remount_fs)
28 return scops->remount_fs(root, flags, data);
29 return 0;
30}
31
32static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
33{
34 struct kernfs_root *root = kernfs_root(dentry->d_fsdata);
35 struct kernfs_syscall_ops *scops = root->syscall_ops;
36
37 if (scops && scops->show_options)
38 return scops->show_options(sf, root);
39 return 0;
40}
41
42const struct super_operations kernfs_sops = {
23 .statfs = simple_statfs, 43 .statfs = simple_statfs,
24 .drop_inode = generic_delete_inode, 44 .drop_inode = generic_delete_inode,
25 .evict_inode = kernfs_evict_inode, 45 .evict_inode = kernfs_evict_inode,
46
47 .remount_fs = kernfs_sop_remount_fs,
48 .show_options = kernfs_sop_show_options,
26}; 49};
27 50
28static int kernfs_fill_super(struct super_block *sb) 51/**
52 * kernfs_root_from_sb - determine kernfs_root associated with a super_block
53 * @sb: the super_block in question
54 *
55 * Return the kernfs_root associated with @sb. If @sb is not a kernfs one,
56 * %NULL is returned.
57 */
58struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
59{
60 if (sb->s_op == &kernfs_sops)
61 return kernfs_info(sb)->root;
62 return NULL;
63}
64
65static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
29{ 66{
30 struct kernfs_super_info *info = kernfs_info(sb); 67 struct kernfs_super_info *info = kernfs_info(sb);
31 struct inode *inode; 68 struct inode *inode;
@@ -33,7 +70,7 @@ static int kernfs_fill_super(struct super_block *sb)
33 70
34 sb->s_blocksize = PAGE_CACHE_SIZE; 71 sb->s_blocksize = PAGE_CACHE_SIZE;
35 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 72 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
36 sb->s_magic = SYSFS_MAGIC; 73 sb->s_magic = magic;
37 sb->s_op = &kernfs_sops; 74 sb->s_op = &kernfs_sops;
38 sb->s_time_gran = 1; 75 sb->s_time_gran = 1;
39 76
@@ -94,6 +131,7 @@ const void *kernfs_super_ns(struct super_block *sb)
94 * @fs_type: file_system_type of the fs being mounted 131 * @fs_type: file_system_type of the fs being mounted
95 * @flags: mount flags specified for the mount 132 * @flags: mount flags specified for the mount
96 * @root: kernfs_root of the hierarchy being mounted 133 * @root: kernfs_root of the hierarchy being mounted
134 * @magic: file system specific magic number
97 * @new_sb_created: tell the caller if we allocated a new superblock 135 * @new_sb_created: tell the caller if we allocated a new superblock
98 * @ns: optional namespace tag of the mount 136 * @ns: optional namespace tag of the mount
99 * 137 *
@@ -105,8 +143,8 @@ const void *kernfs_super_ns(struct super_block *sb)
105 * The return value can be passed to the vfs layer verbatim. 143 * The return value can be passed to the vfs layer verbatim.
106 */ 144 */
107struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, 145struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
108 struct kernfs_root *root, bool *new_sb_created, 146 struct kernfs_root *root, unsigned long magic,
109 const void *ns) 147 bool *new_sb_created, const void *ns)
110{ 148{
111 struct super_block *sb; 149 struct super_block *sb;
112 struct kernfs_super_info *info; 150 struct kernfs_super_info *info;
@@ -129,7 +167,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
129 *new_sb_created = !sb->s_root; 167 *new_sb_created = !sb->s_root;
130 168
131 if (!sb->s_root) { 169 if (!sb->s_root) {
132 error = kernfs_fill_super(sb); 170 error = kernfs_fill_super(sb, magic);
133 if (error) { 171 if (error) {
134 deactivate_locked_super(sb); 172 deactivate_locked_super(sb);
135 return ERR_PTR(error); 173 return ERR_PTR(error);
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 4d457055acb9..8a198898e39a 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -27,7 +27,6 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
27 struct kernfs_node *target) 27 struct kernfs_node *target)
28{ 28{
29 struct kernfs_node *kn; 29 struct kernfs_node *kn;
30 struct kernfs_addrm_cxt acxt;
31 int error; 30 int error;
32 31
33 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); 32 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
@@ -39,10 +38,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
39 kn->symlink.target_kn = target; 38 kn->symlink.target_kn = target;
40 kernfs_get(target); /* ref owned by symlink */ 39 kernfs_get(target); /* ref owned by symlink */
41 40
42 kernfs_addrm_start(&acxt); 41 error = kernfs_add_one(kn);
43 error = kernfs_add_one(&acxt, kn);
44 kernfs_addrm_finish(&acxt);
45
46 if (!error) 42 if (!error)
47 return kn; 43 return kn;
48 44
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 10d6c41aecad..6bf06a07f3e0 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -235,6 +235,7 @@ out_err:
235 if (warned++ == 0) 235 if (warned++ == 0)
236 printk(KERN_WARNING 236 printk(KERN_WARNING
237 "lockd_up: makesock failed, error=%d\n", err); 237 "lockd_up: makesock failed, error=%d\n", err);
238 svc_shutdown_net(serv, net);
238 return err; 239 return err;
239} 240}
240 241
diff --git a/fs/locks.c b/fs/locks.c
index 92a0f0a52b06..e390bd9ae068 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -135,6 +135,7 @@
135#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 135#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
136#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 136#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
137#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 137#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG))
138#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
138 139
139static bool lease_breaking(struct file_lock *fl) 140static bool lease_breaking(struct file_lock *fl)
140{ 141{
@@ -344,48 +345,43 @@ static int assign_type(struct file_lock *fl, long type)
344 return 0; 345 return 0;
345} 346}
346 347
347/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX 348static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
348 * style lock. 349 struct flock64 *l)
349 */
350static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
351 struct flock *l)
352{ 350{
353 off_t start, end;
354
355 switch (l->l_whence) { 351 switch (l->l_whence) {
356 case SEEK_SET: 352 case SEEK_SET:
357 start = 0; 353 fl->fl_start = 0;
358 break; 354 break;
359 case SEEK_CUR: 355 case SEEK_CUR:
360 start = filp->f_pos; 356 fl->fl_start = filp->f_pos;
361 break; 357 break;
362 case SEEK_END: 358 case SEEK_END:
363 start = i_size_read(file_inode(filp)); 359 fl->fl_start = i_size_read(file_inode(filp));
364 break; 360 break;
365 default: 361 default:
366 return -EINVAL; 362 return -EINVAL;
367 } 363 }
364 if (l->l_start > OFFSET_MAX - fl->fl_start)
365 return -EOVERFLOW;
366 fl->fl_start += l->l_start;
367 if (fl->fl_start < 0)
368 return -EINVAL;
368 369
369 /* POSIX-1996 leaves the case l->l_len < 0 undefined; 370 /* POSIX-1996 leaves the case l->l_len < 0 undefined;
370 POSIX-2001 defines it. */ 371 POSIX-2001 defines it. */
371 start += l->l_start;
372 if (start < 0)
373 return -EINVAL;
374 fl->fl_end = OFFSET_MAX;
375 if (l->l_len > 0) { 372 if (l->l_len > 0) {
376 end = start + l->l_len - 1; 373 if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
377 fl->fl_end = end; 374 return -EOVERFLOW;
375 fl->fl_end = fl->fl_start + l->l_len - 1;
376
378 } else if (l->l_len < 0) { 377 } else if (l->l_len < 0) {
379 end = start - 1; 378 if (fl->fl_start + l->l_len < 0)
380 fl->fl_end = end;
381 start += l->l_len;
382 if (start < 0)
383 return -EINVAL; 379 return -EINVAL;
384 } 380 fl->fl_end = fl->fl_start - 1;
385 fl->fl_start = start; /* we record the absolute position */ 381 fl->fl_start += l->l_len;
386 if (fl->fl_end < fl->fl_start) 382 } else
387 return -EOVERFLOW; 383 fl->fl_end = OFFSET_MAX;
388 384
389 fl->fl_owner = current->files; 385 fl->fl_owner = current->files;
390 fl->fl_pid = current->tgid; 386 fl->fl_pid = current->tgid;
391 fl->fl_file = filp; 387 fl->fl_file = filp;
@@ -396,52 +392,21 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
396 return assign_type(fl, l->l_type); 392 return assign_type(fl, l->l_type);
397} 393}
398 394
399#if BITS_PER_LONG == 32 395/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
400static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, 396 * style lock.
401 struct flock64 *l) 397 */
398static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
399 struct flock *l)
402{ 400{
403 loff_t start; 401 struct flock64 ll = {
404 402 .l_type = l->l_type,
405 switch (l->l_whence) { 403 .l_whence = l->l_whence,
406 case SEEK_SET: 404 .l_start = l->l_start,
407 start = 0; 405 .l_len = l->l_len,
408 break; 406 };
409 case SEEK_CUR:
410 start = filp->f_pos;
411 break;
412 case SEEK_END:
413 start = i_size_read(file_inode(filp));
414 break;
415 default:
416 return -EINVAL;
417 }
418 407
419 start += l->l_start; 408 return flock64_to_posix_lock(filp, fl, &ll);
420 if (start < 0)
421 return -EINVAL;
422 fl->fl_end = OFFSET_MAX;
423 if (l->l_len > 0) {
424 fl->fl_end = start + l->l_len - 1;
425 } else if (l->l_len < 0) {
426 fl->fl_end = start - 1;
427 start += l->l_len;
428 if (start < 0)
429 return -EINVAL;
430 }
431 fl->fl_start = start; /* we record the absolute position */
432 if (fl->fl_end < fl->fl_start)
433 return -EOVERFLOW;
434
435 fl->fl_owner = current->files;
436 fl->fl_pid = current->tgid;
437 fl->fl_file = filp;
438 fl->fl_flags = FL_POSIX;
439 fl->fl_ops = NULL;
440 fl->fl_lmops = NULL;
441
442 return assign_type(fl, l->l_type);
443} 409}
444#endif
445 410
446/* default lease lock manager operations */ 411/* default lease lock manager operations */
447static void lease_break_callback(struct file_lock *fl) 412static void lease_break_callback(struct file_lock *fl)
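The widening conversion above means every struct flock now passes through the flock64 overflow checks. A minimal userspace sketch of what those checks govern (illustrative only, not part of this patch; the path is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Lock from current EOF to the end of the file: the kernel computes
         * fl_start = i_size + l_start, and now rejects sums that would pass
         * OFFSET_MAX with EOVERFLOW instead of wrapping. */
        struct flock fl = {
                .l_type   = F_WRLCK,
                .l_whence = SEEK_END,
                .l_start  = 0,
                .l_len    = 0,          /* zero length: to end of file */
        };
        int fd = open("/tmp/lock-demo", O_RDWR | O_CREAT, 0600);

        if (fd < 0)
                return 1;
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");
        close(fd);
        return 0;
}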
@@ -511,8 +476,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
511} 476}
512 477
513/* Must be called with the i_lock held! */ 478/* Must be called with the i_lock held! */
514static inline void 479static void locks_insert_global_locks(struct file_lock *fl)
515locks_insert_global_locks(struct file_lock *fl)
516{ 480{
517 lg_local_lock(&file_lock_lglock); 481 lg_local_lock(&file_lock_lglock);
518 fl->fl_link_cpu = smp_processor_id(); 482 fl->fl_link_cpu = smp_processor_id();
@@ -521,8 +485,7 @@ locks_insert_global_locks(struct file_lock *fl)
521} 485}
522 486
523/* Must be called with the i_lock held! */ 487/* Must be called with the i_lock held! */
524static inline void 488static void locks_delete_global_locks(struct file_lock *fl)
525locks_delete_global_locks(struct file_lock *fl)
526{ 489{
527 /* 490 /*
528 * Avoid taking lock if already unhashed. This is safe since this check 491 * Avoid taking lock if already unhashed. This is safe since this check
@@ -544,14 +507,12 @@ posix_owner_key(struct file_lock *fl)
544 return (unsigned long)fl->fl_owner; 507 return (unsigned long)fl->fl_owner;
545} 508}
546 509
547static inline void 510static void locks_insert_global_blocked(struct file_lock *waiter)
548locks_insert_global_blocked(struct file_lock *waiter)
549{ 511{
550 hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); 512 hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
551} 513}
552 514
553static inline void 515static void locks_delete_global_blocked(struct file_lock *waiter)
554locks_delete_global_blocked(struct file_lock *waiter)
555{ 516{
556 hash_del(&waiter->fl_link); 517 hash_del(&waiter->fl_link);
557} 518}
@@ -581,7 +542,7 @@ static void locks_delete_block(struct file_lock *waiter)
581 * it seems like the reasonable thing to do. 542 * it seems like the reasonable thing to do.
582 * 543 *
583 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block 544 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
584 * list itself is protected by the file_lock_list, but by ensuring that the 545 * list itself is protected by the blocked_lock_lock, but by ensuring that the
585 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock 546 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
586 * in some cases when we see that the fl_block list is empty. 547 * in some cases when we see that the fl_block list is empty.
587 */ 548 */
@@ -591,7 +552,7 @@ static void __locks_insert_block(struct file_lock *blocker,
591 BUG_ON(!list_empty(&waiter->fl_block)); 552 BUG_ON(!list_empty(&waiter->fl_block));
592 waiter->fl_next = blocker; 553 waiter->fl_next = blocker;
593 list_add_tail(&waiter->fl_block, &blocker->fl_block); 554 list_add_tail(&waiter->fl_block, &blocker->fl_block);
594 if (IS_POSIX(blocker)) 555 if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
595 locks_insert_global_blocked(waiter); 556 locks_insert_global_blocked(waiter);
596} 557}
597 558
@@ -652,15 +613,18 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
652 locks_insert_global_locks(fl); 613 locks_insert_global_locks(fl);
653} 614}
654 615
655/* 616/**
 656 * Delete a lock and then free it. 617 * locks_unlink_lock - unlink a lock from all lists
657 * Wake up processes that are blocked waiting for this lock, 618 * @thisfl_p: pointer that points to the fl_next field of the previous
658 * notify the FS that the lock has been cleared and 619 * inode->i_flock list entry
659 * finally free the lock. 620 *
621 * Unlink a lock from all lists and free the namespace reference, but don't
622 * free it yet. Wake up processes that are blocked waiting for this lock and
623 * notify the FS that the lock has been cleared.
660 * 624 *
661 * Must be called with the i_lock held! 625 * Must be called with the i_lock held!
662 */ 626 */
663static void locks_delete_lock(struct file_lock **thisfl_p) 627static void locks_unlink_lock(struct file_lock **thisfl_p)
664{ 628{
665 struct file_lock *fl = *thisfl_p; 629 struct file_lock *fl = *thisfl_p;
666 630
@@ -675,6 +639,18 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
675 } 639 }
676 640
677 locks_wake_up_blocks(fl); 641 locks_wake_up_blocks(fl);
642}
643
644/*
645 * Unlink a lock from all lists and free it.
646 *
647 * Must be called with i_lock held!
648 */
649static void locks_delete_lock(struct file_lock **thisfl_p)
650{
651 struct file_lock *fl = *thisfl_p;
652
653 locks_unlink_lock(thisfl_p);
678 locks_free_lock(fl); 654 locks_free_lock(fl);
679} 655}
680 656
@@ -769,8 +745,16 @@ EXPORT_SYMBOL(posix_test_lock);
769 * Note: the above assumption may not be true when handling lock 745 * Note: the above assumption may not be true when handling lock
770 * requests from a broken NFS client. It may also fail in the presence 746 * requests from a broken NFS client. It may also fail in the presence
771 * of tasks (such as posix threads) sharing the same open file table. 747 * of tasks (such as posix threads) sharing the same open file table.
772 *
773 * To handle those cases, we just bail out after a few iterations. 748 * To handle those cases, we just bail out after a few iterations.
749 *
750 * For FL_OFDLCK locks, the owner is the filp, not the files_struct.
751 * Because the owner is not even nominally tied to a thread of
752 * execution, the deadlock detection below can't reasonably work well. Just
753 * skip it for those.
754 *
755 * In principle, we could do a more limited deadlock detection on FL_OFDLCK
756 * locks that just checks for the case where two tasks are attempting to
757 * upgrade from read to write locks on the same inode.
774 */ 758 */
775 759
776#define MAX_DEADLK_ITERATIONS 10 760#define MAX_DEADLK_ITERATIONS 10
@@ -793,6 +777,13 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
793{ 777{
794 int i = 0; 778 int i = 0;
795 779
780 /*
781 * This deadlock detector can't reasonably detect deadlocks with
 782 * FL_OFDLCK locks, since they aren't owned by a process, per se.
783 */
784 if (IS_OFDLCK(caller_fl))
785 return 0;
786
796 while ((block_fl = what_owner_is_waiting_for(block_fl))) { 787 while ((block_fl = what_owner_is_waiting_for(block_fl))) {
797 if (i++ > MAX_DEADLK_ITERATIONS) 788 if (i++ > MAX_DEADLK_ITERATIONS)
798 return 0; 789 return 0;
@@ -1152,13 +1143,14 @@ EXPORT_SYMBOL(posix_lock_file_wait);
1152 1143
1153/** 1144/**
1154 * locks_mandatory_locked - Check for an active lock 1145 * locks_mandatory_locked - Check for an active lock
1155 * @inode: the file to check 1146 * @file: the file to check
1156 * 1147 *
1157 * Searches the inode's list of locks to find any POSIX locks which conflict. 1148 * Searches the inode's list of locks to find any POSIX locks which conflict.
1158 * This function is called from locks_verify_locked() only. 1149 * This function is called from locks_verify_locked() only.
1159 */ 1150 */
1160int locks_mandatory_locked(struct inode *inode) 1151int locks_mandatory_locked(struct file *file)
1161{ 1152{
1153 struct inode *inode = file_inode(file);
1162 fl_owner_t owner = current->files; 1154 fl_owner_t owner = current->files;
1163 struct file_lock *fl; 1155 struct file_lock *fl;
1164 1156
@@ -1169,7 +1161,7 @@ int locks_mandatory_locked(struct inode *inode)
1169 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1161 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1170 if (!IS_POSIX(fl)) 1162 if (!IS_POSIX(fl))
1171 continue; 1163 continue;
1172 if (fl->fl_owner != owner) 1164 if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file)
1173 break; 1165 break;
1174 } 1166 }
1175 spin_unlock(&inode->i_lock); 1167 spin_unlock(&inode->i_lock);
@@ -1195,19 +1187,30 @@ int locks_mandatory_area(int read_write, struct inode *inode,
1195{ 1187{
1196 struct file_lock fl; 1188 struct file_lock fl;
1197 int error; 1189 int error;
1190 bool sleep = false;
1198 1191
1199 locks_init_lock(&fl); 1192 locks_init_lock(&fl);
1200 fl.fl_owner = current->files;
1201 fl.fl_pid = current->tgid; 1193 fl.fl_pid = current->tgid;
1202 fl.fl_file = filp; 1194 fl.fl_file = filp;
1203 fl.fl_flags = FL_POSIX | FL_ACCESS; 1195 fl.fl_flags = FL_POSIX | FL_ACCESS;
1204 if (filp && !(filp->f_flags & O_NONBLOCK)) 1196 if (filp && !(filp->f_flags & O_NONBLOCK))
1205 fl.fl_flags |= FL_SLEEP; 1197 sleep = true;
1206 fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; 1198 fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
1207 fl.fl_start = offset; 1199 fl.fl_start = offset;
1208 fl.fl_end = offset + count - 1; 1200 fl.fl_end = offset + count - 1;
1209 1201
1210 for (;;) { 1202 for (;;) {
1203 if (filp) {
1204 fl.fl_owner = (fl_owner_t)filp;
1205 fl.fl_flags &= ~FL_SLEEP;
1206 error = __posix_lock_file(inode, &fl, NULL);
1207 if (!error)
1208 break;
1209 }
1210
1211 if (sleep)
1212 fl.fl_flags |= FL_SLEEP;
1213 fl.fl_owner = current->files;
1211 error = __posix_lock_file(inode, &fl, NULL); 1214 error = __posix_lock_file(inode, &fl, NULL);
1212 if (error != FILE_LOCK_DEFERRED) 1215 if (error != FILE_LOCK_DEFERRED)
1213 break; 1216 break;
@@ -1376,11 +1379,10 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1376 1379
1377restart: 1380restart:
1378 break_time = flock->fl_break_time; 1381 break_time = flock->fl_break_time;
1379 if (break_time != 0) { 1382 if (break_time != 0)
1380 break_time -= jiffies; 1383 break_time -= jiffies;
1381 if (break_time == 0) 1384 if (break_time == 0)
1382 break_time++; 1385 break_time++;
1383 }
1384 locks_insert_block(flock, new_fl); 1386 locks_insert_block(flock, new_fl);
1385 spin_unlock(&inode->i_lock); 1387 spin_unlock(&inode->i_lock);
1386 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1388 error = wait_event_interruptible_timeout(new_fl->fl_wait,
@@ -1472,6 +1474,32 @@ int fcntl_getlease(struct file *filp)
1472 return type; 1474 return type;
1473} 1475}
1474 1476
1477/**
1478 * check_conflicting_open - see if the given dentry points to a file that has
1479 * an existing open that would conflict with the
1480 * desired lease.
1481 * @dentry: dentry to check
1482 * @arg: type of lease that we're trying to acquire
1483 *
1484 * Check to see if there's an existing open fd on this file that would
1485 * conflict with the lease we're trying to set.
1486 */
1487static int
1488check_conflicting_open(const struct dentry *dentry, const long arg)
1489{
1490 int ret = 0;
1491 struct inode *inode = dentry->d_inode;
1492
1493 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1494 return -EAGAIN;
1495
1496 if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
1497 (atomic_read(&inode->i_count) > 1)))
1498 ret = -EAGAIN;
1499
1500 return ret;
1501}
1502
1475static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) 1503static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
1476{ 1504{
1477 struct file_lock *fl, **before, **my_before = NULL, *lease; 1505 struct file_lock *fl, **before, **my_before = NULL, *lease;
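check_conflicting_open() is the test behind the familiar F_SETLEASE failure mode: a read lease is refused while any writable open exists, a write lease while any other reference to the dentry or inode exists. A hedged userspace illustration (the path is an assumption; F_SETLEASE needs _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd  = open("/tmp/lease-demo", O_RDONLY | O_CREAT, 0600);
        int fd2 = open("/tmp/lease-demo", O_RDWR);      /* conflicting open */

        if (fd < 0 || fd2 < 0)
                return 1;
        /* i_writecount > 0, so check_conflicting_open() yields -EAGAIN. */
        if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1)
                perror("F_SETLEASE");
        close(fd2);
        close(fd);
        return 0;
}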
@@ -1499,12 +1527,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
1499 return -EINVAL; 1527 return -EINVAL;
1500 } 1528 }
1501 1529
1502 error = -EAGAIN; 1530 error = check_conflicting_open(dentry, arg);
1503 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1531 if (error)
1504 goto out;
1505 if ((arg == F_WRLCK)
1506 && ((d_count(dentry) > 1)
1507 || (atomic_read(&inode->i_count) > 1)))
1508 goto out; 1532 goto out;
1509 1533
1510 /* 1534 /*
@@ -1549,7 +1573,19 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
1549 goto out; 1573 goto out;
1550 1574
1551 locks_insert_lock(before, lease); 1575 locks_insert_lock(before, lease);
1552 error = 0; 1576 /*
1577 * The check in break_lease() is lockless. It's possible for another
1578 * open to race in after we did the earlier check for a conflicting
1579 * open but before the lease was inserted. Check again for a
1580 * conflicting open and cancel the lease if there is one.
1581 *
1582 * We also add a barrier here to ensure that the insertion of the lock
1583 * precedes these checks.
1584 */
1585 smp_mb();
1586 error = check_conflicting_open(dentry, arg);
1587 if (error)
1588 locks_unlink_lock(flp);
1553out: 1589out:
1554 if (is_deleg) 1590 if (is_deleg)
1555 mutex_unlock(&inode->i_mutex); 1591 mutex_unlock(&inode->i_mutex);
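The shape of that fix is a generic publish-then-recheck idiom: make the lease visible, issue a full barrier so the insertion is ordered before the second check, and roll back on conflict. A compilable sketch of the idiom (a userspace analogue; the helper names are hypothetical and __sync_synchronize() stands in for smp_mb()):

#include <stdbool.h>

/* Hypothetical stand-ins for the VFS operations. */
extern void insert_lease(void);
extern void unlink_lease(void);
extern bool conflicting_open(void);

static int add_lease_checked(void)
{
        insert_lease();                 /* publish: lockless readers can now see it */
        __sync_synchronize();           /* order the insertion before the recheck */
        if (conflicting_open()) {       /* an open may have raced in between */
                unlink_lease();         /* roll back, as generic_add_lease() does */
                return -1;              /* the kernel returns -EAGAIN here */
        }
        return 0;
}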
@@ -1842,7 +1878,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
1842 1878
1843static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) 1879static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
1844{ 1880{
1845 flock->l_pid = fl->fl_pid; 1881 flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
1846#if BITS_PER_LONG == 32 1882#if BITS_PER_LONG == 32
1847 /* 1883 /*
1848 * Make sure we can represent the posix lock via 1884 * Make sure we can represent the posix lock via
@@ -1864,7 +1900,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
1864#if BITS_PER_LONG == 32 1900#if BITS_PER_LONG == 32
1865static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) 1901static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
1866{ 1902{
1867 flock->l_pid = fl->fl_pid; 1903 flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
1868 flock->l_start = fl->fl_start; 1904 flock->l_start = fl->fl_start;
1869 flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : 1905 flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
1870 fl->fl_end - fl->fl_start + 1; 1906 fl->fl_end - fl->fl_start + 1;
@@ -1876,7 +1912,7 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
1876/* Report the first existing lock that would conflict with l. 1912/* Report the first existing lock that would conflict with l.
1877 * This implements the F_GETLK command of fcntl(). 1913 * This implements the F_GETLK command of fcntl().
1878 */ 1914 */
1879int fcntl_getlk(struct file *filp, struct flock __user *l) 1915int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
1880{ 1916{
1881 struct file_lock file_lock; 1917 struct file_lock file_lock;
1882 struct flock flock; 1918 struct flock flock;
@@ -1893,6 +1929,16 @@ int fcntl_getlk(struct file *filp, struct flock __user *l)
1893 if (error) 1929 if (error)
1894 goto out; 1930 goto out;
1895 1931
1932 if (cmd == F_OFD_GETLK) {
1933 error = -EINVAL;
1934 if (flock.l_pid != 0)
1935 goto out;
1936
1937 cmd = F_GETLK;
1938 file_lock.fl_flags |= FL_OFDLCK;
1939 file_lock.fl_owner = (fl_owner_t)filp;
1940 }
1941
1896 error = vfs_test_lock(filp, &file_lock); 1942 error = vfs_test_lock(filp, &file_lock);
1897 if (error) 1943 if (error)
1898 goto out; 1944 goto out;
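From userspace, the new command is reached through fcntl() with l_pid zeroed on input, as the EINVAL check above enforces; on a conflict with an OFD lock, l_pid comes back as -1 per the posix_lock_to_flock() change. A hedged sketch (assumes a libc that defines F_OFD_GETLK, i.e. glibc 2.20+ with _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int probe_ofd_lock(int fd)
{
        struct flock fl = {
                .l_type   = F_WRLCK,
                .l_whence = SEEK_SET,
                .l_start  = 0,
                .l_len    = 0,
                .l_pid    = 0,          /* anything else draws EINVAL */
        };

        if (fcntl(fd, F_OFD_GETLK, &fl) == -1) {
                perror("F_OFD_GETLK");
                return -1;
        }
        /* F_UNLCK means no conflict; a conflicting OFD lock reports l_pid == -1. */
        printf("l_type=%d l_pid=%d\n", fl.l_type, (int)fl.l_pid);
        return 0;
}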
@@ -1976,6 +2022,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
1976 return error; 2022 return error;
1977} 2023}
1978 2024
 2025/* Ensure that fl->fl_file has a compatible f_mode for F_SETLK calls */
2026static int
2027check_fmode_for_setlk(struct file_lock *fl)
2028{
2029 switch (fl->fl_type) {
2030 case F_RDLCK:
2031 if (!(fl->fl_file->f_mode & FMODE_READ))
2032 return -EBADF;
2033 break;
2034 case F_WRLCK:
2035 if (!(fl->fl_file->f_mode & FMODE_WRITE))
2036 return -EBADF;
2037 }
2038 return 0;
2039}
2040
1979/* Apply the lock described by l to an open file descriptor. 2041/* Apply the lock described by l to an open file descriptor.
1980 * This implements both the F_SETLK and F_SETLKW commands of fcntl(). 2042 * This implements both the F_SETLK and F_SETLKW commands of fcntl().
1981 */ 2043 */
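Factoring the mode check into check_fmode_for_setlk() keeps the classic behavior: requesting a lock type the open mode cannot back fails with EBADF before any owner handling. A small userspace illustration (the path is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/etc/hostname", O_RDONLY);
        struct flock fl = {
                .l_type   = F_WRLCK,    /* needs FMODE_WRITE */
                .l_whence = SEEK_SET,
        };

        if (fd < 0)
                return 1;
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");      /* expected: Bad file descriptor */
        close(fd);
        return 0;
}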
@@ -2012,25 +2074,36 @@ again:
2012 error = flock_to_posix_lock(filp, file_lock, &flock); 2074 error = flock_to_posix_lock(filp, file_lock, &flock);
2013 if (error) 2075 if (error)
2014 goto out; 2076 goto out;
2015 if (cmd == F_SETLKW) { 2077
2016 file_lock->fl_flags |= FL_SLEEP; 2078 error = check_fmode_for_setlk(file_lock);
2017 } 2079 if (error)
2018 2080 goto out;
2019 error = -EBADF; 2081
2020 switch (flock.l_type) { 2082 /*
2021 case F_RDLCK: 2083 * If the cmd is requesting file-private locks, then set the
2022 if (!(filp->f_mode & FMODE_READ)) 2084 * FL_OFDLCK flag and override the owner.
2023 goto out; 2085 */
2024 break; 2086 switch (cmd) {
2025 case F_WRLCK: 2087 case F_OFD_SETLK:
2026 if (!(filp->f_mode & FMODE_WRITE)) 2088 error = -EINVAL;
2089 if (flock.l_pid != 0)
2027 goto out; 2090 goto out;
2091
2092 cmd = F_SETLK;
2093 file_lock->fl_flags |= FL_OFDLCK;
2094 file_lock->fl_owner = (fl_owner_t)filp;
2028 break; 2095 break;
2029 case F_UNLCK: 2096 case F_OFD_SETLKW:
2030 break;
2031 default:
2032 error = -EINVAL; 2097 error = -EINVAL;
2033 goto out; 2098 if (flock.l_pid != 0)
2099 goto out;
2100
2101 cmd = F_SETLKW;
2102 file_lock->fl_flags |= FL_OFDLCK;
2103 file_lock->fl_owner = (fl_owner_t)filp;
2104 /* Fallthrough */
2105 case F_SETLKW:
2106 file_lock->fl_flags |= FL_SLEEP;
2034 } 2107 }
2035 2108
2036 error = do_lock_file_wait(filp, cmd, file_lock); 2109 error = do_lock_file_wait(filp, cmd, file_lock);
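The owner override is what gives these locks their semantics: ownership follows the open file description (the filp), so locks taken through different descriptions of the same file conflict even within one process, and a lock persists until the last descriptor referencing its description is closed. A hedged sketch (glibc 2.20+ with _GNU_SOURCE assumed; the path is illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd1 = open("/tmp/ofd-demo", O_RDWR | O_CREAT, 0600);
        int fd2 = open("/tmp/ofd-demo", O_RDWR);        /* second description */
        struct flock fl = {
                .l_type = F_WRLCK, .l_whence = SEEK_SET, .l_pid = 0,
        };

        if (fd1 < 0 || fd2 < 0)
                return 1;
        fcntl(fd1, F_OFD_SETLK, &fl);   /* owner is fd1's file description */
        if (fcntl(fd2, F_OFD_SETLK, &fl) == -1)
                perror("fd2");          /* different owner: conflicts, same process */
        close(fd2);
        close(fd1);
        return 0;
}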
@@ -2061,7 +2134,7 @@ out:
2061/* Report the first existing lock that would conflict with l. 2134/* Report the first existing lock that would conflict with l.
2062 * This implements the F_GETLK command of fcntl(). 2135 * This implements the F_GETLK command of fcntl().
2063 */ 2136 */
2064int fcntl_getlk64(struct file *filp, struct flock64 __user *l) 2137int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
2065{ 2138{
2066 struct file_lock file_lock; 2139 struct file_lock file_lock;
2067 struct flock64 flock; 2140 struct flock64 flock;
@@ -2078,6 +2151,16 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
2078 if (error) 2151 if (error)
2079 goto out; 2152 goto out;
2080 2153
2154 if (cmd == F_OFD_GETLK) {
2155 error = -EINVAL;
2156 if (flock.l_pid != 0)
2157 goto out;
2158
2159 cmd = F_GETLK64;
2160 file_lock.fl_flags |= FL_OFDLCK;
2161 file_lock.fl_owner = (fl_owner_t)filp;
2162 }
2163
2081 error = vfs_test_lock(filp, &file_lock); 2164 error = vfs_test_lock(filp, &file_lock);
2082 if (error) 2165 if (error)
2083 goto out; 2166 goto out;
@@ -2130,25 +2213,36 @@ again:
2130 error = flock64_to_posix_lock(filp, file_lock, &flock); 2213 error = flock64_to_posix_lock(filp, file_lock, &flock);
2131 if (error) 2214 if (error)
2132 goto out; 2215 goto out;
2133 if (cmd == F_SETLKW64) { 2216
2134 file_lock->fl_flags |= FL_SLEEP; 2217 error = check_fmode_for_setlk(file_lock);
2135 } 2218 if (error)
2136 2219 goto out;
2137 error = -EBADF; 2220
2138 switch (flock.l_type) { 2221 /*
2139 case F_RDLCK: 2222 * If the cmd is requesting file-private locks, then set the
2140 if (!(filp->f_mode & FMODE_READ)) 2223 * FL_OFDLCK flag and override the owner.
2141 goto out; 2224 */
2142 break; 2225 switch (cmd) {
2143 case F_WRLCK: 2226 case F_OFD_SETLK:
2144 if (!(filp->f_mode & FMODE_WRITE)) 2227 error = -EINVAL;
2228 if (flock.l_pid != 0)
2145 goto out; 2229 goto out;
2230
2231 cmd = F_SETLK64;
2232 file_lock->fl_flags |= FL_OFDLCK;
2233 file_lock->fl_owner = (fl_owner_t)filp;
2146 break; 2234 break;
2147 case F_UNLCK: 2235 case F_OFD_SETLKW:
2148 break;
2149 default:
2150 error = -EINVAL; 2236 error = -EINVAL;
2151 goto out; 2237 if (flock.l_pid != 0)
2238 goto out;
2239
2240 cmd = F_SETLKW64;
2241 file_lock->fl_flags |= FL_OFDLCK;
2242 file_lock->fl_owner = (fl_owner_t)filp;
2243 /* Fallthrough */
2244 case F_SETLKW64:
2245 file_lock->fl_flags |= FL_SLEEP;
2152 } 2246 }
2153 2247
2154 error = do_lock_file_wait(filp, cmd, file_lock); 2248 error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2209,7 +2303,7 @@ EXPORT_SYMBOL(locks_remove_posix);
2209/* 2303/*
2210 * This function is called on the last close of an open file. 2304 * This function is called on the last close of an open file.
2211 */ 2305 */
2212void locks_remove_flock(struct file *filp) 2306void locks_remove_file(struct file *filp)
2213{ 2307{
2214 struct inode * inode = file_inode(filp); 2308 struct inode * inode = file_inode(filp);
2215 struct file_lock *fl; 2309 struct file_lock *fl;
@@ -2218,6 +2312,8 @@ void locks_remove_flock(struct file *filp)
2218 if (!inode->i_flock) 2312 if (!inode->i_flock)
2219 return; 2313 return;
2220 2314
2315 locks_remove_posix(filp, (fl_owner_t)filp);
2316
2221 if (filp->f_op->flock) { 2317 if (filp->f_op->flock) {
2222 struct file_lock fl = { 2318 struct file_lock fl = {
2223 .fl_pid = current->tgid, 2319 .fl_pid = current->tgid,
@@ -2236,16 +2332,28 @@ void locks_remove_flock(struct file *filp)
2236 2332
2237 while ((fl = *before) != NULL) { 2333 while ((fl = *before) != NULL) {
2238 if (fl->fl_file == filp) { 2334 if (fl->fl_file == filp) {
2239 if (IS_FLOCK(fl)) {
2240 locks_delete_lock(before);
2241 continue;
2242 }
2243 if (IS_LEASE(fl)) { 2335 if (IS_LEASE(fl)) {
2244 lease_modify(before, F_UNLCK); 2336 lease_modify(before, F_UNLCK);
2245 continue; 2337 continue;
2246 } 2338 }
2247 /* What? */ 2339
2248 BUG(); 2340 /*
2341 * There's a leftover lock on the list of a type that
2342 * we didn't expect to see. Most likely a classic
2343 * POSIX lock that ended up not getting released
2344 * properly, or that raced onto the list somehow. Log
2345 * some info about it and then just remove it from
2346 * the list.
2347 */
2348 WARN(!IS_FLOCK(fl),
2349 "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
2350 MAJOR(inode->i_sb->s_dev),
2351 MINOR(inode->i_sb->s_dev), inode->i_ino,
2352 fl->fl_type, fl->fl_flags,
2353 fl->fl_start, fl->fl_end);
2354
2355 locks_delete_lock(before);
2356 continue;
2249 } 2357 }
2250 before = &fl->fl_next; 2358 before = &fl->fl_next;
2251 } 2359 }
@@ -2314,8 +2422,14 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2314 2422
2315 seq_printf(f, "%lld:%s ", id, pfx); 2423 seq_printf(f, "%lld:%s ", id, pfx);
2316 if (IS_POSIX(fl)) { 2424 if (IS_POSIX(fl)) {
2317 seq_printf(f, "%6s %s ", 2425 if (fl->fl_flags & FL_ACCESS)
2318 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", 2426 seq_printf(f, "ACCESS");
2427 else if (IS_OFDLCK(fl))
2428 seq_printf(f, "OFDLCK");
2429 else
2430 seq_printf(f, "POSIX ");
2431
2432 seq_printf(f, " %s ",
2319 (inode == NULL) ? "*NOINODE*" : 2433 (inode == NULL) ? "*NOINODE*" :
2320 mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); 2434 mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
2321 } else if (IS_FLOCK(fl)) { 2435 } else if (IS_FLOCK(fl)) {
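With this change an OFD lock shows up in /proc/locks under its own tag. An illustrative line (field values are made up; only the OFDLCK tag is what this hunk adds):

1: OFDLCK ADVISORY  WRITE 1193 08:02:1048578 0 EOF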
@@ -2385,6 +2499,7 @@ static int locks_show(struct seq_file *f, void *v)
2385} 2499}
2386 2500
2387static void *locks_start(struct seq_file *f, loff_t *pos) 2501static void *locks_start(struct seq_file *f, loff_t *pos)
2502 __acquires(&blocked_lock_lock)
2388{ 2503{
2389 struct locks_iterator *iter = f->private; 2504 struct locks_iterator *iter = f->private;
2390 2505
@@ -2403,6 +2518,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
2403} 2518}
2404 2519
2405static void locks_stop(struct seq_file *f, void *v) 2520static void locks_stop(struct seq_file *f, void *v)
2521 __releases(&blocked_lock_lock)
2406{ 2522{
2407 spin_unlock(&blocked_lock_lock); 2523 spin_unlock(&blocked_lock_lock);
2408 lg_global_unlock(&file_lock_lglock); 2524 lg_global_unlock(&file_lock_lglock);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 9a59cbade2fb..48140315f627 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2180,7 +2180,7 @@ void logfs_evict_inode(struct inode *inode)
2180 do_delete_inode(inode); 2180 do_delete_inode(inode);
2181 } 2181 }
2182 } 2182 }
2183 truncate_inode_pages(&inode->i_data, 0); 2183 truncate_inode_pages_final(&inode->i_data);
2184 clear_inode(inode); 2184 clear_inode(inode);
2185 2185
2186 /* Cheaper version of write_inode. All changes are concealed in 2186 /* Cheaper version of write_inode. All changes are concealed in
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e519e45bf673..bf166e388f0d 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -26,6 +26,41 @@
26 * back on the lru list. 26 * back on the lru list.
27 */ 27 */
28 28
29/*
30 * Lock descriptions and usage:
31 *
32 * Each hash chain of both the block and index hash tables now contains
33 * a built-in lock used to serialize accesses to the hash chain.
34 *
35 * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36 * are serialized via the global spinlock mb_cache_spinlock.
37 *
38 * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
39 * accesses to its local data, such as e_used and e_queued.
40 *
41 * Lock ordering:
42 *
43 * Each block hash chain's lock has the highest lock order, followed by an
44 * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
 45 * lock), and mb_cache_spinlock, with the lowest order. While holding
46 * either a block or index hash chain lock, a thread can acquire an
 47 * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48 *
49 * Synchronization:
50 *
51 * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
 52 * index hash chain, they need to lock the corresponding hash chain. For each
 53 * mb_cache_entry within the chain, they need to lock the mb_cache_entry to
54 * prevent either any simultaneous release or free on the entry and also
55 * to serialize accesses to either the e_used or e_queued member of the entry.
56 *
57 * To avoid having a dangling reference to an already freed
58 * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
 59 * block hash chain and also no longer being referenced, with both e_used
 60 * and e_queued equal to 0. When an mb_cache_entry is explicitly freed it is
61 * first removed from a block hash chain.
62 */
63
29#include <linux/kernel.h> 64#include <linux/kernel.h>
30#include <linux/module.h> 65#include <linux/module.h>
31 66
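Expressed as code, the permitted nesting reads as follows (a sketch of lock order only, using the helpers this patch introduces; no real call site takes all four at once):

hlist_bl_lock(ce->e_block_hash_p);      /* block hash chain: highest order */
hlist_bl_lock(ce->e_index_hash_p);      /* then the index hash chain */
__spin_lock_mb_cache_entry(ce);         /* then mb_cache_bg_lock, via the entry */
spin_lock(&mb_cache_spinlock);          /* global lists: lowest order */
/* ... */
spin_unlock(&mb_cache_spinlock);
__spin_unlock_mb_cache_entry(ce);
hlist_bl_unlock(ce->e_index_hash_p);
hlist_bl_unlock(ce->e_block_hash_p);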
@@ -34,9 +69,10 @@
34#include <linux/mm.h> 69#include <linux/mm.h>
35#include <linux/slab.h> 70#include <linux/slab.h>
36#include <linux/sched.h> 71#include <linux/sched.h>
37#include <linux/init.h> 72#include <linux/list_bl.h>
38#include <linux/mbcache.h> 73#include <linux/mbcache.h>
39 74#include <linux/init.h>
75#include <linux/blockgroup_lock.h>
40 76
41#ifdef MB_CACHE_DEBUG 77#ifdef MB_CACHE_DEBUG
42# define mb_debug(f...) do { \ 78# define mb_debug(f...) do { \
@@ -57,8 +93,14 @@
57 93
58#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) 94#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
59 95
96#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS)
97#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
98 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
99
60static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); 100static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
61 101static struct blockgroup_lock *mb_cache_bg_lock;
102static struct kmem_cache *mb_cache_kmem_cache;
103
62MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); 104MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
63MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); 105MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
64MODULE_LICENSE("GPL"); 106MODULE_LICENSE("GPL");
@@ -86,58 +128,110 @@ static LIST_HEAD(mb_cache_list);
86static LIST_HEAD(mb_cache_lru_list); 128static LIST_HEAD(mb_cache_lru_list);
87static DEFINE_SPINLOCK(mb_cache_spinlock); 129static DEFINE_SPINLOCK(mb_cache_spinlock);
88 130
131static inline void
132__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
133{
134 spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
135 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
136}
137
138static inline void
139__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
140{
141 spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
142 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
143}
144
89static inline int 145static inline int
90__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) 146__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
91{ 147{
92 return !list_empty(&ce->e_block_list); 148 return !hlist_bl_unhashed(&ce->e_block_list);
93} 149}
94 150
95 151
96static void 152static inline void
97__mb_cache_entry_unhash(struct mb_cache_entry *ce) 153__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
98{ 154{
99 if (__mb_cache_entry_is_hashed(ce)) { 155 if (__mb_cache_entry_is_block_hashed(ce))
100 list_del_init(&ce->e_block_list); 156 hlist_bl_del_init(&ce->e_block_list);
101 list_del(&ce->e_index.o_list);
102 }
103} 157}
104 158
159static inline int
160__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
161{
162 return !hlist_bl_unhashed(&ce->e_index.o_list);
163}
164
165static inline void
166__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
167{
168 if (__mb_cache_entry_is_index_hashed(ce))
169 hlist_bl_del_init(&ce->e_index.o_list);
170}
171
172/*
173 * __mb_cache_entry_unhash_unlock()
174 *
 175 * This function is called to unhash the entry from both the block and
 176 * index hash chains.
 177 * It assumes both hash chains are locked upon entry.
 178 * It also unlocks both hash chains upon exit.
179 */
180static inline void
181__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
182{
183 __mb_cache_entry_unhash_index(ce);
184 hlist_bl_unlock(ce->e_index_hash_p);
185 __mb_cache_entry_unhash_block(ce);
186 hlist_bl_unlock(ce->e_block_hash_p);
187}
105 188
106static void 189static void
107__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) 190__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
108{ 191{
109 struct mb_cache *cache = ce->e_cache; 192 struct mb_cache *cache = ce->e_cache;
110 193
111 mb_assert(!(ce->e_used || ce->e_queued)); 194 mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
112 kmem_cache_free(cache->c_entry_cache, ce); 195 kmem_cache_free(cache->c_entry_cache, ce);
113 atomic_dec(&cache->c_entry_count); 196 atomic_dec(&cache->c_entry_count);
114} 197}
115 198
116
117static void 199static void
118__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) 200__mb_cache_entry_release(struct mb_cache_entry *ce)
119 __releases(mb_cache_spinlock)
120{ 201{
202 /* First lock the entry to serialize access to its local data. */
203 __spin_lock_mb_cache_entry(ce);
121 /* Wake up all processes queuing for this cache entry. */ 204 /* Wake up all processes queuing for this cache entry. */
122 if (ce->e_queued) 205 if (ce->e_queued)
123 wake_up_all(&mb_cache_queue); 206 wake_up_all(&mb_cache_queue);
124 if (ce->e_used >= MB_CACHE_WRITER) 207 if (ce->e_used >= MB_CACHE_WRITER)
125 ce->e_used -= MB_CACHE_WRITER; 208 ce->e_used -= MB_CACHE_WRITER;
209 /*
210 * Make sure that all cache entries on lru_list have
 211 * both e_used and e_queued equal to 0.
212 */
126 ce->e_used--; 213 ce->e_used--;
127 if (!(ce->e_used || ce->e_queued)) { 214 if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
128 if (!__mb_cache_entry_is_hashed(ce)) 215 if (!__mb_cache_entry_is_block_hashed(ce)) {
216 __spin_unlock_mb_cache_entry(ce);
129 goto forget; 217 goto forget;
130 mb_assert(list_empty(&ce->e_lru_list)); 218 }
131 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); 219 /*
 220 * Need access to the lru list; per the ordering above,
 221 * mb_cache_spinlock nests inside the entry's bg lock.
222 */
223 spin_lock(&mb_cache_spinlock);
224 if (list_empty(&ce->e_lru_list))
225 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
226 spin_unlock(&mb_cache_spinlock);
132 } 227 }
133 spin_unlock(&mb_cache_spinlock); 228 __spin_unlock_mb_cache_entry(ce);
134 return; 229 return;
135forget: 230forget:
136 spin_unlock(&mb_cache_spinlock); 231 mb_assert(list_empty(&ce->e_lru_list));
137 __mb_cache_entry_forget(ce, GFP_KERNEL); 232 __mb_cache_entry_forget(ce, GFP_KERNEL);
138} 233}
139 234
140
141/* 235/*
142 * mb_cache_shrink_scan() memory pressure callback 236 * mb_cache_shrink_scan() memory pressure callback
143 * 237 *
@@ -160,17 +254,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
160 254
161 mb_debug("trying to free %d entries", nr_to_scan); 255 mb_debug("trying to free %d entries", nr_to_scan);
162 spin_lock(&mb_cache_spinlock); 256 spin_lock(&mb_cache_spinlock);
163 while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { 257 while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
164 struct mb_cache_entry *ce = 258 struct mb_cache_entry *ce =
165 list_entry(mb_cache_lru_list.next, 259 list_entry(mb_cache_lru_list.next,
166 struct mb_cache_entry, e_lru_list); 260 struct mb_cache_entry, e_lru_list);
167 list_move_tail(&ce->e_lru_list, &free_list); 261 list_del_init(&ce->e_lru_list);
168 __mb_cache_entry_unhash(ce); 262 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
169 freed++; 263 continue;
264 spin_unlock(&mb_cache_spinlock);
265 /* Prevent any find or get operation on the entry */
266 hlist_bl_lock(ce->e_block_hash_p);
267 hlist_bl_lock(ce->e_index_hash_p);
268 /* Ignore if it is touched by a find/get */
269 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
270 !list_empty(&ce->e_lru_list)) {
271 hlist_bl_unlock(ce->e_index_hash_p);
272 hlist_bl_unlock(ce->e_block_hash_p);
273 spin_lock(&mb_cache_spinlock);
274 continue;
275 }
276 __mb_cache_entry_unhash_unlock(ce);
277 list_add_tail(&ce->e_lru_list, &free_list);
278 spin_lock(&mb_cache_spinlock);
170 } 279 }
171 spin_unlock(&mb_cache_spinlock); 280 spin_unlock(&mb_cache_spinlock);
281
172 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { 282 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
173 __mb_cache_entry_forget(entry, gfp_mask); 283 __mb_cache_entry_forget(entry, gfp_mask);
284 freed++;
174 } 285 }
175 return freed; 286 return freed;
176} 287}
@@ -215,29 +326,40 @@ mb_cache_create(const char *name, int bucket_bits)
215 int n, bucket_count = 1 << bucket_bits; 326 int n, bucket_count = 1 << bucket_bits;
216 struct mb_cache *cache = NULL; 327 struct mb_cache *cache = NULL;
217 328
329 if (!mb_cache_bg_lock) {
330 mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
331 GFP_KERNEL);
332 if (!mb_cache_bg_lock)
333 return NULL;
334 bgl_lock_init(mb_cache_bg_lock);
335 }
336
218 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); 337 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
219 if (!cache) 338 if (!cache)
220 return NULL; 339 return NULL;
221 cache->c_name = name; 340 cache->c_name = name;
222 atomic_set(&cache->c_entry_count, 0); 341 atomic_set(&cache->c_entry_count, 0);
223 cache->c_bucket_bits = bucket_bits; 342 cache->c_bucket_bits = bucket_bits;
224 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), 343 cache->c_block_hash = kmalloc(bucket_count *
225 GFP_KERNEL); 344 sizeof(struct hlist_bl_head), GFP_KERNEL);
226 if (!cache->c_block_hash) 345 if (!cache->c_block_hash)
227 goto fail; 346 goto fail;
228 for (n=0; n<bucket_count; n++) 347 for (n=0; n<bucket_count; n++)
229 INIT_LIST_HEAD(&cache->c_block_hash[n]); 348 INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
230 cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), 349 cache->c_index_hash = kmalloc(bucket_count *
231 GFP_KERNEL); 350 sizeof(struct hlist_bl_head), GFP_KERNEL);
232 if (!cache->c_index_hash) 351 if (!cache->c_index_hash)
233 goto fail; 352 goto fail;
234 for (n=0; n<bucket_count; n++) 353 for (n=0; n<bucket_count; n++)
235 INIT_LIST_HEAD(&cache->c_index_hash[n]); 354 INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
236 cache->c_entry_cache = kmem_cache_create(name, 355 if (!mb_cache_kmem_cache) {
237 sizeof(struct mb_cache_entry), 0, 356 mb_cache_kmem_cache = kmem_cache_create(name,
238 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 357 sizeof(struct mb_cache_entry), 0,
239 if (!cache->c_entry_cache) 358 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
240 goto fail2; 359 if (!mb_cache_kmem_cache)
360 goto fail2;
361 }
362 cache->c_entry_cache = mb_cache_kmem_cache;
241 363
242 /* 364 /*
243 * Set an upper limit on the number of cache entries so that the hash 365 * Set an upper limit on the number of cache entries so that the hash
@@ -273,21 +395,47 @@ void
273mb_cache_shrink(struct block_device *bdev) 395mb_cache_shrink(struct block_device *bdev)
274{ 396{
275 LIST_HEAD(free_list); 397 LIST_HEAD(free_list);
276 struct list_head *l, *ltmp; 398 struct list_head *l;
399 struct mb_cache_entry *ce, *tmp;
277 400
401 l = &mb_cache_lru_list;
278 spin_lock(&mb_cache_spinlock); 402 spin_lock(&mb_cache_spinlock);
279 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 403 while (!list_is_last(l, &mb_cache_lru_list)) {
280 struct mb_cache_entry *ce = 404 l = l->next;
281 list_entry(l, struct mb_cache_entry, e_lru_list); 405 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
282 if (ce->e_bdev == bdev) { 406 if (ce->e_bdev == bdev) {
283 list_move_tail(&ce->e_lru_list, &free_list); 407 list_del_init(&ce->e_lru_list);
284 __mb_cache_entry_unhash(ce); 408 if (ce->e_used || ce->e_queued ||
409 atomic_read(&ce->e_refcnt))
410 continue;
411 spin_unlock(&mb_cache_spinlock);
412 /*
413 * Prevent any find or get operation on the entry.
414 */
415 hlist_bl_lock(ce->e_block_hash_p);
416 hlist_bl_lock(ce->e_index_hash_p);
417 /* Ignore if it is touched by a find/get */
418 if (ce->e_used || ce->e_queued ||
419 atomic_read(&ce->e_refcnt) ||
420 !list_empty(&ce->e_lru_list)) {
421 hlist_bl_unlock(ce->e_index_hash_p);
422 hlist_bl_unlock(ce->e_block_hash_p);
423 l = &mb_cache_lru_list;
424 spin_lock(&mb_cache_spinlock);
425 continue;
426 }
427 __mb_cache_entry_unhash_unlock(ce);
428 mb_assert(!(ce->e_used || ce->e_queued ||
429 atomic_read(&ce->e_refcnt)));
430 list_add_tail(&ce->e_lru_list, &free_list);
431 l = &mb_cache_lru_list;
432 spin_lock(&mb_cache_spinlock);
285 } 433 }
286 } 434 }
287 spin_unlock(&mb_cache_spinlock); 435 spin_unlock(&mb_cache_spinlock);
288 list_for_each_safe(l, ltmp, &free_list) { 436
289 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 437 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
290 e_lru_list), GFP_KERNEL); 438 __mb_cache_entry_forget(ce, GFP_KERNEL);
291 } 439 }
292} 440}
293 441
@@ -303,23 +451,27 @@ void
303mb_cache_destroy(struct mb_cache *cache) 451mb_cache_destroy(struct mb_cache *cache)
304{ 452{
305 LIST_HEAD(free_list); 453 LIST_HEAD(free_list);
306 struct list_head *l, *ltmp; 454 struct mb_cache_entry *ce, *tmp;
307 455
308 spin_lock(&mb_cache_spinlock); 456 spin_lock(&mb_cache_spinlock);
309 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 457 list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
310 struct mb_cache_entry *ce = 458 if (ce->e_cache == cache)
311 list_entry(l, struct mb_cache_entry, e_lru_list);
312 if (ce->e_cache == cache) {
313 list_move_tail(&ce->e_lru_list, &free_list); 459 list_move_tail(&ce->e_lru_list, &free_list);
314 __mb_cache_entry_unhash(ce);
315 }
316 } 460 }
317 list_del(&cache->c_cache_list); 461 list_del(&cache->c_cache_list);
318 spin_unlock(&mb_cache_spinlock); 462 spin_unlock(&mb_cache_spinlock);
319 463
320 list_for_each_safe(l, ltmp, &free_list) { 464 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
321 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 465 list_del_init(&ce->e_lru_list);
322 e_lru_list), GFP_KERNEL); 466 /*
467 * Prevent any find or get operation on the entry.
468 */
469 hlist_bl_lock(ce->e_block_hash_p);
470 hlist_bl_lock(ce->e_index_hash_p);
471 mb_assert(!(ce->e_used || ce->e_queued ||
472 atomic_read(&ce->e_refcnt)));
473 __mb_cache_entry_unhash_unlock(ce);
474 __mb_cache_entry_forget(ce, GFP_KERNEL);
323 } 475 }
324 476
325 if (atomic_read(&cache->c_entry_count) > 0) { 477 if (atomic_read(&cache->c_entry_count) > 0) {
@@ -328,8 +480,10 @@ mb_cache_destroy(struct mb_cache *cache)
328 atomic_read(&cache->c_entry_count)); 480 atomic_read(&cache->c_entry_count));
329 } 481 }
330 482
331 kmem_cache_destroy(cache->c_entry_cache); 483 if (list_empty(&mb_cache_list)) {
332 484 kmem_cache_destroy(mb_cache_kmem_cache);
485 mb_cache_kmem_cache = NULL;
486 }
333 kfree(cache->c_index_hash); 487 kfree(cache->c_index_hash);
334 kfree(cache->c_block_hash); 488 kfree(cache->c_block_hash);
335 kfree(cache); 489 kfree(cache);
@@ -346,28 +500,61 @@ mb_cache_destroy(struct mb_cache *cache)
346struct mb_cache_entry * 500struct mb_cache_entry *
347mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 501mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
348{ 502{
349 struct mb_cache_entry *ce = NULL; 503 struct mb_cache_entry *ce;
350 504
351 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { 505 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
506 struct list_head *l;
507
508 l = &mb_cache_lru_list;
352 spin_lock(&mb_cache_spinlock); 509 spin_lock(&mb_cache_spinlock);
353 if (!list_empty(&mb_cache_lru_list)) { 510 while (!list_is_last(l, &mb_cache_lru_list)) {
354 ce = list_entry(mb_cache_lru_list.next, 511 l = l->next;
355 struct mb_cache_entry, e_lru_list); 512 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
356 list_del_init(&ce->e_lru_list); 513 if (ce->e_cache == cache) {
357 __mb_cache_entry_unhash(ce); 514 list_del_init(&ce->e_lru_list);
515 if (ce->e_used || ce->e_queued ||
516 atomic_read(&ce->e_refcnt))
517 continue;
518 spin_unlock(&mb_cache_spinlock);
519 /*
520 * Prevent any find or get operation on the
521 * entry.
522 */
523 hlist_bl_lock(ce->e_block_hash_p);
524 hlist_bl_lock(ce->e_index_hash_p);
525 /* Ignore if it is touched by a find/get */
526 if (ce->e_used || ce->e_queued ||
527 atomic_read(&ce->e_refcnt) ||
528 !list_empty(&ce->e_lru_list)) {
529 hlist_bl_unlock(ce->e_index_hash_p);
530 hlist_bl_unlock(ce->e_block_hash_p);
531 l = &mb_cache_lru_list;
532 spin_lock(&mb_cache_spinlock);
533 continue;
534 }
535 mb_assert(list_empty(&ce->e_lru_list));
536 mb_assert(!(ce->e_used || ce->e_queued ||
537 atomic_read(&ce->e_refcnt)));
538 __mb_cache_entry_unhash_unlock(ce);
539 goto found;
540 }
358 } 541 }
359 spin_unlock(&mb_cache_spinlock); 542 spin_unlock(&mb_cache_spinlock);
360 } 543 }
361 if (!ce) { 544
362 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 545 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
363 if (!ce) 546 if (!ce)
364 return NULL; 547 return NULL;
365 atomic_inc(&cache->c_entry_count); 548 atomic_inc(&cache->c_entry_count);
366 INIT_LIST_HEAD(&ce->e_lru_list); 549 INIT_LIST_HEAD(&ce->e_lru_list);
367 INIT_LIST_HEAD(&ce->e_block_list); 550 INIT_HLIST_BL_NODE(&ce->e_block_list);
368 ce->e_cache = cache; 551 INIT_HLIST_BL_NODE(&ce->e_index.o_list);
369 ce->e_queued = 0; 552 ce->e_cache = cache;
370 } 553 ce->e_queued = 0;
554 atomic_set(&ce->e_refcnt, 0);
555found:
556 ce->e_block_hash_p = &cache->c_block_hash[0];
557 ce->e_index_hash_p = &cache->c_index_hash[0];
371 ce->e_used = 1 + MB_CACHE_WRITER; 558 ce->e_used = 1 + MB_CACHE_WRITER;
372 return ce; 559 return ce;
373} 560}
@@ -393,29 +580,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
393{ 580{
394 struct mb_cache *cache = ce->e_cache; 581 struct mb_cache *cache = ce->e_cache;
395 unsigned int bucket; 582 unsigned int bucket;
396 struct list_head *l; 583 struct hlist_bl_node *l;
397 int error = -EBUSY; 584 struct hlist_bl_head *block_hash_p;
585 struct hlist_bl_head *index_hash_p;
586 struct mb_cache_entry *lce;
398 587
588 mb_assert(ce);
399 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 589 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
400 cache->c_bucket_bits); 590 cache->c_bucket_bits);
401 spin_lock(&mb_cache_spinlock); 591 block_hash_p = &cache->c_block_hash[bucket];
402 list_for_each_prev(l, &cache->c_block_hash[bucket]) { 592 hlist_bl_lock(block_hash_p);
403 struct mb_cache_entry *ce = 593 hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
404 list_entry(l, struct mb_cache_entry, e_block_list); 594 if (lce->e_bdev == bdev && lce->e_block == block) {
405 if (ce->e_bdev == bdev && ce->e_block == block) 595 hlist_bl_unlock(block_hash_p);
406 goto out; 596 return -EBUSY;
597 }
407 } 598 }
408 __mb_cache_entry_unhash(ce); 599 mb_assert(!__mb_cache_entry_is_block_hashed(ce));
600 __mb_cache_entry_unhash_block(ce);
601 __mb_cache_entry_unhash_index(ce);
409 ce->e_bdev = bdev; 602 ce->e_bdev = bdev;
410 ce->e_block = block; 603 ce->e_block = block;
411 list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); 604 ce->e_block_hash_p = block_hash_p;
412 ce->e_index.o_key = key; 605 ce->e_index.o_key = key;
606 hlist_bl_add_head(&ce->e_block_list, block_hash_p);
607 hlist_bl_unlock(block_hash_p);
413 bucket = hash_long(key, cache->c_bucket_bits); 608 bucket = hash_long(key, cache->c_bucket_bits);
414 list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); 609 index_hash_p = &cache->c_index_hash[bucket];
415 error = 0; 610 hlist_bl_lock(index_hash_p);
416out: 611 ce->e_index_hash_p = index_hash_p;
417 spin_unlock(&mb_cache_spinlock); 612 hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
418 return error; 613 hlist_bl_unlock(index_hash_p);
614 return 0;
419} 615}
420 616
421 617
@@ -429,24 +625,26 @@ out:
429void 625void
430mb_cache_entry_release(struct mb_cache_entry *ce) 626mb_cache_entry_release(struct mb_cache_entry *ce)
431{ 627{
432 spin_lock(&mb_cache_spinlock); 628 __mb_cache_entry_release(ce);
433 __mb_cache_entry_release_unlock(ce);
434} 629}
435 630
436 631
437/* 632/*
438 * mb_cache_entry_free() 633 * mb_cache_entry_free()
439 * 634 *
440 * This is equivalent to the sequence mb_cache_entry_takeout() --
441 * mb_cache_entry_release().
442 */ 635 */
443void 636void
444mb_cache_entry_free(struct mb_cache_entry *ce) 637mb_cache_entry_free(struct mb_cache_entry *ce)
445{ 638{
446 spin_lock(&mb_cache_spinlock); 639 mb_assert(ce);
447 mb_assert(list_empty(&ce->e_lru_list)); 640 mb_assert(list_empty(&ce->e_lru_list));
448 __mb_cache_entry_unhash(ce); 641 hlist_bl_lock(ce->e_index_hash_p);
449 __mb_cache_entry_release_unlock(ce); 642 __mb_cache_entry_unhash_index(ce);
643 hlist_bl_unlock(ce->e_index_hash_p);
644 hlist_bl_lock(ce->e_block_hash_p);
645 __mb_cache_entry_unhash_block(ce);
646 hlist_bl_unlock(ce->e_block_hash_p);
647 __mb_cache_entry_release(ce);
450} 648}
451 649
452 650
@@ -463,84 +661,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
463 sector_t block) 661 sector_t block)
464{ 662{
465 unsigned int bucket; 663 unsigned int bucket;
466 struct list_head *l; 664 struct hlist_bl_node *l;
467 struct mb_cache_entry *ce; 665 struct mb_cache_entry *ce;
666 struct hlist_bl_head *block_hash_p;
468 667
469 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 668 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
470 cache->c_bucket_bits); 669 cache->c_bucket_bits);
471 spin_lock(&mb_cache_spinlock); 670 block_hash_p = &cache->c_block_hash[bucket];
472 list_for_each(l, &cache->c_block_hash[bucket]) { 671 /* First serialize access to the block corresponding hash chain. */
473 ce = list_entry(l, struct mb_cache_entry, e_block_list); 672 hlist_bl_lock(block_hash_p);
673 hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
674 mb_assert(ce->e_block_hash_p == block_hash_p);
474 if (ce->e_bdev == bdev && ce->e_block == block) { 675 if (ce->e_bdev == bdev && ce->e_block == block) {
475 DEFINE_WAIT(wait); 676 /*
677 * Prevent a free from removing the entry.
678 */
679 atomic_inc(&ce->e_refcnt);
680 hlist_bl_unlock(block_hash_p);
681 __spin_lock_mb_cache_entry(ce);
682 atomic_dec(&ce->e_refcnt);
683 if (ce->e_used > 0) {
684 DEFINE_WAIT(wait);
685 while (ce->e_used > 0) {
686 ce->e_queued++;
687 prepare_to_wait(&mb_cache_queue, &wait,
688 TASK_UNINTERRUPTIBLE);
689 __spin_unlock_mb_cache_entry(ce);
690 schedule();
691 __spin_lock_mb_cache_entry(ce);
692 ce->e_queued--;
693 }
694 finish_wait(&mb_cache_queue, &wait);
695 }
696 ce->e_used += 1 + MB_CACHE_WRITER;
697 __spin_unlock_mb_cache_entry(ce);
476 698
477 if (!list_empty(&ce->e_lru_list)) 699 if (!list_empty(&ce->e_lru_list)) {
700 spin_lock(&mb_cache_spinlock);
478 list_del_init(&ce->e_lru_list); 701 list_del_init(&ce->e_lru_list);
479
480 while (ce->e_used > 0) {
481 ce->e_queued++;
482 prepare_to_wait(&mb_cache_queue, &wait,
483 TASK_UNINTERRUPTIBLE);
484 spin_unlock(&mb_cache_spinlock); 702 spin_unlock(&mb_cache_spinlock);
485 schedule();
486 spin_lock(&mb_cache_spinlock);
487 ce->e_queued--;
488 } 703 }
489 finish_wait(&mb_cache_queue, &wait); 704 if (!__mb_cache_entry_is_block_hashed(ce)) {
490 ce->e_used += 1 + MB_CACHE_WRITER; 705 __mb_cache_entry_release(ce);
491
492 if (!__mb_cache_entry_is_hashed(ce)) {
493 __mb_cache_entry_release_unlock(ce);
494 return NULL; 706 return NULL;
495 } 707 }
496 goto cleanup; 708 return ce;
497 } 709 }
498 } 710 }
499 ce = NULL; 711 hlist_bl_unlock(block_hash_p);
500 712 return NULL;
501cleanup:
502 spin_unlock(&mb_cache_spinlock);
503 return ce;
504} 713}
505 714
506#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 715#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
507 716
508static struct mb_cache_entry * 717static struct mb_cache_entry *
509__mb_cache_entry_find(struct list_head *l, struct list_head *head, 718__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
510 struct block_device *bdev, unsigned int key) 719 struct block_device *bdev, unsigned int key)
511{ 720{
512 while (l != head) { 721
 722 /* The index hash chain is already locked by the caller. */
723 while (l != NULL) {
513 struct mb_cache_entry *ce = 724 struct mb_cache_entry *ce =
514 list_entry(l, struct mb_cache_entry, e_index.o_list); 725 hlist_bl_entry(l, struct mb_cache_entry,
726 e_index.o_list);
727 mb_assert(ce->e_index_hash_p == head);
515 if (ce->e_bdev == bdev && ce->e_index.o_key == key) { 728 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
516 DEFINE_WAIT(wait); 729 /*
517 730 * Prevent a free from removing the entry.
518 if (!list_empty(&ce->e_lru_list)) 731 */
519 list_del_init(&ce->e_lru_list); 732 atomic_inc(&ce->e_refcnt);
520 733 hlist_bl_unlock(head);
734 __spin_lock_mb_cache_entry(ce);
735 atomic_dec(&ce->e_refcnt);
736 ce->e_used++;
521 /* Incrementing before holding the lock gives readers 737 /* Incrementing before holding the lock gives readers
522 priority over writers. */ 738 priority over writers. */
523 ce->e_used++; 739 if (ce->e_used >= MB_CACHE_WRITER) {
524 while (ce->e_used >= MB_CACHE_WRITER) { 740 DEFINE_WAIT(wait);
525 ce->e_queued++; 741
526 prepare_to_wait(&mb_cache_queue, &wait, 742 while (ce->e_used >= MB_CACHE_WRITER) {
527 TASK_UNINTERRUPTIBLE); 743 ce->e_queued++;
528 spin_unlock(&mb_cache_spinlock); 744 prepare_to_wait(&mb_cache_queue, &wait,
529 schedule(); 745 TASK_UNINTERRUPTIBLE);
530 spin_lock(&mb_cache_spinlock); 746 __spin_unlock_mb_cache_entry(ce);
531 ce->e_queued--; 747 schedule();
748 __spin_lock_mb_cache_entry(ce);
749 ce->e_queued--;
750 }
751 finish_wait(&mb_cache_queue, &wait);
532 } 752 }
533 finish_wait(&mb_cache_queue, &wait); 753 __spin_unlock_mb_cache_entry(ce);
534 754 if (!list_empty(&ce->e_lru_list)) {
535 if (!__mb_cache_entry_is_hashed(ce)) {
536 __mb_cache_entry_release_unlock(ce);
537 spin_lock(&mb_cache_spinlock); 755 spin_lock(&mb_cache_spinlock);
756 list_del_init(&ce->e_lru_list);
757 spin_unlock(&mb_cache_spinlock);
758 }
759 if (!__mb_cache_entry_is_block_hashed(ce)) {
760 __mb_cache_entry_release(ce);
538 return ERR_PTR(-EAGAIN); 761 return ERR_PTR(-EAGAIN);
539 } 762 }
540 return ce; 763 return ce;
541 } 764 }
542 l = l->next; 765 l = l->next;
543 } 766 }
767 hlist_bl_unlock(head);
544 return NULL; 768 return NULL;
545} 769}
546 770
@@ -562,13 +786,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
562 unsigned int key) 786 unsigned int key)
563{ 787{
564 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 788 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
565 struct list_head *l; 789 struct hlist_bl_node *l;
566 struct mb_cache_entry *ce; 790 struct mb_cache_entry *ce = NULL;
567 791 struct hlist_bl_head *index_hash_p;
568 spin_lock(&mb_cache_spinlock); 792
569 l = cache->c_index_hash[bucket].next; 793 index_hash_p = &cache->c_index_hash[bucket];
570 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 794 hlist_bl_lock(index_hash_p);
571 spin_unlock(&mb_cache_spinlock); 795 if (!hlist_bl_empty(index_hash_p)) {
796 l = hlist_bl_first(index_hash_p);
797 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
798 } else
799 hlist_bl_unlock(index_hash_p);
572 return ce; 800 return ce;
573} 801}
574 802
@@ -597,13 +825,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev,
597{ 825{
598 struct mb_cache *cache = prev->e_cache; 826 struct mb_cache *cache = prev->e_cache;
599 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 827 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
600 struct list_head *l; 828 struct hlist_bl_node *l;
601 struct mb_cache_entry *ce; 829 struct mb_cache_entry *ce;
830 struct hlist_bl_head *index_hash_p;
602 831
603 spin_lock(&mb_cache_spinlock); 832 index_hash_p = &cache->c_index_hash[bucket];
833 mb_assert(prev->e_index_hash_p == index_hash_p);
834 hlist_bl_lock(index_hash_p);
835 mb_assert(!hlist_bl_empty(index_hash_p));
604 l = prev->e_index.o_list.next; 836 l = prev->e_index.o_list.next;
605 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 837 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
606 __mb_cache_entry_release_unlock(prev); 838 __mb_cache_entry_release(prev);
607 return ce; 839 return ce;
608} 840}
609 841
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 0332109162a5..f007a3355570 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -26,7 +26,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data);
26 26
27static void minix_evict_inode(struct inode *inode) 27static void minix_evict_inode(struct inode *inode)
28{ 28{
29 truncate_inode_pages(&inode->i_data, 0); 29 truncate_inode_pages_final(&inode->i_data);
30 if (!inode->i_nlink) { 30 if (!inode->i_nlink) {
31 inode->i_size = 0; 31 inode->i_size = 0;
32 minix_truncate(inode); 32 minix_truncate(inode);
@@ -86,7 +86,7 @@ static void init_once(void *foo)
86 inode_init_once(&ei->vfs_inode); 86 inode_init_once(&ei->vfs_inode);
87} 87}
88 88
89static int init_inodecache(void) 89static int __init init_inodecache(void)
90{ 90{
91 minix_inode_cachep = kmem_cache_create("minix_inode_cache", 91 minix_inode_cachep = kmem_cache_create("minix_inode_cache",
92 sizeof(struct minix_inode_info), 92 sizeof(struct minix_inode_info),
@@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
123 struct minix_sb_info * sbi = minix_sb(sb); 123 struct minix_sb_info * sbi = minix_sb(sb);
124 struct minix_super_block * ms; 124 struct minix_super_block * ms;
125 125
126 sync_filesystem(sb);
126 ms = sbi->s_ms; 127 ms = sbi->s_ms;
127 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 128 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
128 return 0; 129 return 0;
diff --git a/fs/mount.h b/fs/mount.h
index b29e42f05f34..d55297f2fa05 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,7 +10,7 @@ struct mnt_namespace {
 	struct user_namespace	*user_ns;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
-	int event;
+	u64 event;
 };
 
 struct mnt_pcp {
@@ -104,6 +104,9 @@ struct proc_mounts {
 	struct mnt_namespace *ns;
 	struct path root;
 	int (*show)(struct seq_file *, struct vfsmount *);
+	void *cached_mount;
+	u64 cached_event;
+	loff_t cached_index;
 };
 
 #define proc_mounts(p) (container_of((p), struct proc_mounts, m))
diff --git a/fs/namei.c b/fs/namei.c
index 4b491b431990..80168273396b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -358,6 +358,7 @@ int generic_permission(struct inode *inode, int mask)
 
 	return -EACCES;
 }
+EXPORT_SYMBOL(generic_permission);
 
 /*
  * We _really_ want to just do "generic_permission()" without
@@ -455,6 +456,7 @@ int inode_permission(struct inode *inode, int mask)
 		return retval;
 	return __inode_permission(inode, mask);
 }
+EXPORT_SYMBOL(inode_permission);
 
 /**
  * path_get - get a reference to a path
@@ -924,6 +926,7 @@ int follow_up(struct path *path)
 	path->mnt = &parent->mnt;
 	return 1;
 }
+EXPORT_SYMBOL(follow_up);
 
 /*
  * Perform an automount
@@ -1085,6 +1088,7 @@ int follow_down_one(struct path *path)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(follow_down_one);
 
 static inline bool managed_dentry_might_block(struct dentry *dentry)
 {
@@ -1223,6 +1227,7 @@ int follow_down(struct path *path)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(follow_down);
 
 /*
  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
@@ -1537,7 +1542,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 		inode = path->dentry->d_inode;
 	}
 	err = -ENOENT;
-	if (!inode)
+	if (!inode || d_is_negative(path->dentry))
 		goto out_path_put;
 
 	if (should_follow_link(path->dentry, follow)) {
@@ -1796,7 +1801,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			if (err)
 				return err;
 		}
-		if (!d_is_directory(nd->path.dentry)) {
+		if (!d_can_lookup(nd->path.dentry)) {
 			err = -ENOTDIR;
 			break;
 		}
@@ -1817,7 +1822,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		struct dentry *root = nd->root.dentry;
 		struct inode *inode = root->d_inode;
 		if (*name) {
-			if (!d_is_directory(root))
+			if (!d_can_lookup(root))
 				return -ENOTDIR;
 			retval = inode_permission(inode, MAY_EXEC);
 			if (retval)
@@ -1873,7 +1878,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		dentry = f.file->f_path.dentry;
 
 		if (*name) {
-			if (!d_is_directory(dentry)) {
+			if (!d_can_lookup(dentry)) {
 				fdput(f);
 				return -ENOTDIR;
 			}
@@ -1955,7 +1960,7 @@ static int path_lookupat(int dfd, const char *name,
 		err = complete_walk(nd);
 
 	if (!err && nd->flags & LOOKUP_DIRECTORY) {
-		if (!d_is_directory(nd->path.dentry)) {
+		if (!d_can_lookup(nd->path.dentry)) {
 			path_put(&nd->path);
 			err = -ENOTDIR;
 		}
@@ -2025,6 +2030,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path)
 		*path = nd.path;
 	return res;
 }
+EXPORT_SYMBOL(kern_path);
 
 /**
  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
@@ -2049,6 +2055,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		*path = nd.path;
 	return err;
 }
+EXPORT_SYMBOL(vfs_path_lookup);
 
 /*
  * Restricted form of lookup. Doesn't follow links, single-component only,
@@ -2111,6 +2118,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 
 	return __lookup_hash(&this, base, 0);
 }
+EXPORT_SYMBOL(lookup_one_len);
 
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
@@ -2135,6 +2143,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
 {
 	return user_path_at_empty(dfd, name, flags, path, NULL);
 }
+EXPORT_SYMBOL(user_path_at);
 
 /*
  * NB: most callers don't do anything directly with the reference to the
@@ -2240,7 +2249,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
 	mutex_unlock(&dir->d_inode->i_mutex);
 
 done:
-	if (!dentry->d_inode) {
+	if (!dentry->d_inode || d_is_negative(dentry)) {
 		error = -ENOENT;
 		dput(dentry);
 		goto out;
@@ -2414,11 +2423,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
 		return -EPERM;
 	if (isdir) {
-		if (!d_is_directory(victim) && !d_is_autodir(victim))
+		if (!d_is_dir(victim))
 			return -ENOTDIR;
 		if (IS_ROOT(victim))
 			return -EBUSY;
-	} else if (d_is_directory(victim) || d_is_autodir(victim))
+	} else if (d_is_dir(victim))
 		return -EISDIR;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
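The recurring `d_is_directory() || d_is_autodir()` pairs throughout this file collapse into a single d_is_dir() predicate. Assuming the v3.15 dcache helpers, the new helper is roughly the following (quoted from memory; see include/linux/dcache.h for the authoritative form):

/* Rough shape of the v3.15 helper, shown for context only. */
static inline bool d_is_dir(const struct dentry *dentry)
{
	return d_can_lookup(dentry) || d_is_autodir(dentry);
}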
@@ -2477,6 +2486,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
 	return NULL;
 }
+EXPORT_SYMBOL(lock_rename);
 
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
@@ -2486,6 +2496,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 	}
 }
+EXPORT_SYMBOL(unlock_rename);
 
 int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		bool want_excl)
@@ -2506,6 +2517,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		fsnotify_create(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_create);
 
 static int may_open(struct path *path, int acc_mode, int flag)
 {
@@ -2569,7 +2581,7 @@ static int handle_truncate(struct file *filp)
 	/*
 	 * Refuse to truncate files with mandatory locks held on them.
 	 */
-	error = locks_verify_locked(inode);
+	error = locks_verify_locked(filp);
 	if (!error)
 		error = security_path_truncate(path);
 	if (!error) {
@@ -2982,7 +2994,7 @@ retry_lookup:
 finish_lookup:
 	/* we _can_ be in RCU mode here */
 	error = -ENOENT;
-	if (d_is_negative(path->dentry)) {
+	if (!inode || d_is_negative(path->dentry)) {
 		path_to_nameidata(path, nd);
 		goto out;
 	}
@@ -3016,11 +3028,10 @@ finish_open:
 	}
 	audit_inode(name, nd->path.dentry, 0);
 	error = -EISDIR;
-	if ((open_flag & O_CREAT) &&
-	    (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry)))
+	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
 		goto out;
 	error = -ENOTDIR;
-	if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry))
+	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
 		goto out;
 	if (!S_ISREG(nd->inode->i_mode))
 		will_truncate = false;
@@ -3376,6 +3387,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 		fsnotify_create(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_mknod);
 
 static int may_mknod(umode_t mode)
 {
@@ -3465,6 +3477,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		fsnotify_mkdir(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_mkdir);
 
 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
 {
@@ -3519,6 +3532,7 @@ void dentry_unhash(struct dentry *dentry)
 		__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 }
+EXPORT_SYMBOL(dentry_unhash);
 
 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
@@ -3556,6 +3570,7 @@ out:
 		d_delete(dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_rmdir);
 
 static long do_rmdir(int dfd, const char __user *pathname)
 {
@@ -3673,6 +3688,7 @@ out:
 
 	return error;
 }
+EXPORT_SYMBOL(vfs_unlink);
 
 /*
  * Make sure that the actual truncation of the file will occur outside its
@@ -3744,7 +3760,7 @@ exit1:
 slashes:
 	if (d_is_negative(dentry))
 		error = -ENOENT;
-	else if (d_is_directory(dentry) || d_is_autodir(dentry))
+	else if (d_is_dir(dentry))
 		error = -EISDIR;
 	else
 		error = -ENOTDIR;
@@ -3786,6 +3802,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 		fsnotify_create(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_symlink);
 
 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 		int, newdfd, const char __user *, newname)
@@ -3894,6 +3911,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		fsnotify_link(dir, inode, new_dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_link);
 
 /*
  * Hardlinks are often used in delicate situations. We avoid
@@ -3974,7 +3992,28 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
 	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 }
 
-/*
+/**
+ * vfs_rename - rename a filesystem object
+ * @old_dir:	parent of source
+ * @old_dentry:	source
+ * @new_dir:	parent of destination
+ * @new_dentry:	destination
+ * @delegated_inode: returns an inode needing a delegation break
+ * @flags:	rename flags
+ *
+ * The caller must hold multiple mutexes--see lock_rename().
+ *
+ * If vfs_rename discovers a delegation in need of breaking at either
+ * the source or destination, it will return -EWOULDBLOCK and return a
+ * reference to the inode in delegated_inode.  The caller should then
+ * break the delegation and retry.  Because breaking a delegation may
+ * take a long time, the caller should drop all locks before doing
+ * so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ *
  * The worst of all namespace operations - renaming directory. "Perverted"
  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
  * Problems:
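The kernel-doc above describes a retry protocol for NFS delegations. A hedged sketch of the caller side, modelled on the retry_deleg loop in do_renameat2() later in this diff (the rename_with_deleg name is hypothetical, and lookup/locking details are trimmed):

static int rename_with_deleg(struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry,
			     unsigned int flags)
{
	struct inode *delegated_inode = NULL;
	int error;

retry_deleg:
	/* real callers take lock_rename() and redo the lookups here */
	error = vfs_rename(old_dir, old_dentry, new_dir, new_dentry,
			   &delegated_inode, flags);
	if (delegated_inode) {
		/* all locks must be dropped before waiting */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	return error;
}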
@@ -4002,163 +4041,140 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *	->i_mutex on parents, which works but leads to some truly excessive
  *	locking].
  */
-static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
-			  struct inode *new_dir, struct dentry *new_dentry)
+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+	       struct inode *new_dir, struct dentry *new_dentry,
+	       struct inode **delegated_inode, unsigned int flags)
 {
-	int error = 0;
+	int error;
+	bool is_dir = d_is_dir(old_dentry);
+	const unsigned char *old_name;
+	struct inode *source = old_dentry->d_inode;
 	struct inode *target = new_dentry->d_inode;
+	bool new_is_dir = false;
 	unsigned max_links = new_dir->i_sb->s_max_links;
 
+	if (source == target)
+		return 0;
+
+	error = may_delete(old_dir, old_dentry, is_dir);
+	if (error)
+		return error;
+
+	if (!target) {
+		error = may_create(new_dir, new_dentry);
+	} else {
+		new_is_dir = d_is_dir(new_dentry);
+
+		if (!(flags & RENAME_EXCHANGE))
+			error = may_delete(new_dir, new_dentry, is_dir);
+		else
+			error = may_delete(new_dir, new_dentry, new_is_dir);
+	}
+	if (error)
+		return error;
+
+	if (!old_dir->i_op->rename)
+		return -EPERM;
+
+	if (flags && !old_dir->i_op->rename2)
+		return -EINVAL;
+
 	/*
 	 * If we are going to change the parent - check write permissions,
 	 * we'll need to flip '..'.
 	 */
 	if (new_dir != old_dir) {
-		error = inode_permission(old_dentry->d_inode, MAY_WRITE);
-		if (error)
-			return error;
+		if (is_dir) {
+			error = inode_permission(source, MAY_WRITE);
+			if (error)
+				return error;
+		}
+		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
+			error = inode_permission(target, MAY_WRITE);
+			if (error)
+				return error;
+		}
 	}
 
-	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
+	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
+				      flags);
 	if (error)
 		return error;
 
+	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
 	dget(new_dentry);
-	if (target)
+	if (!is_dir || (flags & RENAME_EXCHANGE))
+		lock_two_nondirectories(source, target);
+	else if (target)
 		mutex_lock(&target->i_mutex);
 
 	error = -EBUSY;
 	if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
 		goto out;
 
-	error = -EMLINK;
-	if (max_links && !target && new_dir != old_dir &&
-	    new_dir->i_nlink >= max_links)
-		goto out;
-
-	if (target)
+	if (max_links && new_dir != old_dir) {
+		error = -EMLINK;
+		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
+			goto out;
+		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
+		    old_dir->i_nlink >= max_links)
+			goto out;
+	}
+	if (is_dir && !(flags & RENAME_EXCHANGE) && target)
 		shrink_dcache_parent(new_dentry);
-	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-	if (error)
-		goto out;
-
-	if (target) {
-		target->i_flags |= S_DEAD;
-		dont_mount(new_dentry);
-	}
-out:
-	if (target)
-		mutex_unlock(&target->i_mutex);
-	dput(new_dentry);
-	if (!error)
-		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
-			d_move(old_dentry,new_dentry);
-	return error;
-}
-
-static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
-			    struct inode *new_dir, struct dentry *new_dentry,
-			    struct inode **delegated_inode)
-{
-	struct inode *target = new_dentry->d_inode;
-	struct inode *source = old_dentry->d_inode;
-	int error;
-
-	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
-	if (error)
-		return error;
-
-	dget(new_dentry);
-	lock_two_nondirectories(source, target);
-
-	error = -EBUSY;
-	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-		goto out;
-
-	error = try_break_deleg(source, delegated_inode);
-	if (error)
-		goto out;
-	if (target) {
+	if (!is_dir) {
+		error = try_break_deleg(source, delegated_inode);
+		if (error)
+			goto out;
+	}
+	if (target && !new_is_dir) {
 		error = try_break_deleg(target, delegated_inode);
 		if (error)
 			goto out;
 	}
-	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (!flags) {
+		error = old_dir->i_op->rename(old_dir, old_dentry,
+					      new_dir, new_dentry);
+	} else {
+		error = old_dir->i_op->rename2(old_dir, old_dentry,
+					       new_dir, new_dentry, flags);
+	}
 	if (error)
 		goto out;
 
-	if (target)
+	if (!(flags & RENAME_EXCHANGE) && target) {
+		if (is_dir)
+			target->i_flags |= S_DEAD;
 		dont_mount(new_dentry);
-	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
-		d_move(old_dentry, new_dentry);
+	}
+	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
+		if (!(flags & RENAME_EXCHANGE))
+			d_move(old_dentry, new_dentry);
+		else
+			d_exchange(old_dentry, new_dentry);
+	}
 out:
-	unlock_two_nondirectories(source, target);
+	if (!is_dir || (flags & RENAME_EXCHANGE))
+		unlock_two_nondirectories(source, target);
+	else if (target)
+		mutex_unlock(&target->i_mutex);
 	dput(new_dentry);
-	return error;
-}
-
-/**
- * vfs_rename - rename a filesystem object
- * @old_dir:	parent of source
- * @old_dentry:	source
- * @new_dir:	parent of destination
- * @new_dentry:	destination
- * @delegated_inode: returns an inode needing a delegation break
- *
- * The caller must hold multiple mutexes--see lock_rename()).
- *
- * If vfs_rename discovers a delegation in need of breaking at either
- * the source or destination, it will return -EWOULDBLOCK and return a
- * reference to the inode in delegated_inode.  The caller should then
- * break the delegation and retry.  Because breaking a delegation may
- * take a long time, the caller should drop all locks before doing
- * so.
- *
- * Alternatively, a caller may pass NULL for delegated_inode.  This may
- * be appropriate for callers that expect the underlying filesystem not
- * to be NFS exported.
- */
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-	       struct inode *new_dir, struct dentry *new_dentry,
-	       struct inode **delegated_inode)
-{
-	int error;
-	int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry);
-	const unsigned char *old_name;
-
-	if (old_dentry->d_inode == new_dentry->d_inode)
-		return 0;
-
-	error = may_delete(old_dir, old_dentry, is_dir);
-	if (error)
-		return error;
-
-	if (!new_dentry->d_inode)
-		error = may_create(new_dir, new_dentry);
-	else
-		error = may_delete(new_dir, new_dentry, is_dir);
-	if (error)
-		return error;
-
-	if (!old_dir->i_op->rename)
-		return -EPERM;
-
-	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
-
-	if (is_dir)
-		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
-	else
-		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode);
-	if (!error)
+	if (!error) {
 		fsnotify_move(old_dir, new_dir, old_name, is_dir,
-			      new_dentry->d_inode, old_dentry);
+			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
+		if (flags & RENAME_EXCHANGE) {
+			fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
+				      new_is_dir, NULL, new_dentry);
+		}
+	}
 	fsnotify_oldname_free(old_name);
 
 	return error;
 }
+EXPORT_SYMBOL(vfs_rename);
 
-SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
-		int, newdfd, const char __user *, newname)
+SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname, unsigned int, flags)
 {
 	struct dentry *old_dir, *new_dir;
 	struct dentry *old_dentry, *new_dentry;
@@ -4170,6 +4186,13 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 	unsigned int lookup_flags = 0;
 	bool should_retry = false;
 	int error;
+
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE))
+		return -EINVAL;
+
 retry:
 	from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
 	if (IS_ERR(from)) {
@@ -4193,6 +4216,8 @@ retry:
 		goto exit2;
 
 	new_dir = newnd.path.dentry;
+	if (flags & RENAME_NOREPLACE)
+		error = -EEXIST;
 	if (newnd.last_type != LAST_NORM)
 		goto exit2;
 
@@ -4202,7 +4227,8 @@ retry:
 
 	oldnd.flags &= ~LOOKUP_PARENT;
 	newnd.flags &= ~LOOKUP_PARENT;
-	newnd.flags |= LOOKUP_RENAME_TARGET;
+	if (!(flags & RENAME_EXCHANGE))
+		newnd.flags |= LOOKUP_RENAME_TARGET;
 
 retry_deleg:
 	trap = lock_rename(new_dir, old_dir);
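Userspace reaches the new syscall via syscall(2); glibc did not yet wrap renameat2() when this merge landed. A minimal sketch, assuming kernel headers new enough to define __NR_renameat2 (the fallback flag definitions mirror the values in the v3.15 uapi headers):

#define _GNU_SOURCE
#include <fcntl.h>		/* AT_FDCWD */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)	/* fail with EEXIST instead of overwriting */
#endif
#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE  (1 << 1)	/* atomically swap the two paths */
#endif

int main(void)
{
	/* refuse to clobber "new" if it already exists */
	if (syscall(SYS_renameat2, AT_FDCWD, "old", AT_FDCWD, "new",
		    RENAME_NOREPLACE) == -1)
		perror("renameat2");
	return 0;
}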
@@ -4215,34 +4241,49 @@ retry_deleg:
 	error = -ENOENT;
 	if (d_is_negative(old_dentry))
 		goto exit4;
+	new_dentry = lookup_hash(&newnd);
+	error = PTR_ERR(new_dentry);
+	if (IS_ERR(new_dentry))
+		goto exit4;
+	error = -EEXIST;
+	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
+		goto exit5;
+	if (flags & RENAME_EXCHANGE) {
+		error = -ENOENT;
+		if (d_is_negative(new_dentry))
+			goto exit5;
+
+		if (!d_is_dir(new_dentry)) {
+			error = -ENOTDIR;
+			if (newnd.last.name[newnd.last.len])
+				goto exit5;
+		}
+	}
 	/* unless the source is a directory trailing slashes give -ENOTDIR */
-	if (!d_is_directory(old_dentry) && !d_is_autodir(old_dentry)) {
+	if (!d_is_dir(old_dentry)) {
 		error = -ENOTDIR;
 		if (oldnd.last.name[oldnd.last.len])
-			goto exit4;
-		if (newnd.last.name[newnd.last.len])
-			goto exit4;
+			goto exit5;
+		if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
+			goto exit5;
 	}
 	/* source should not be ancestor of target */
 	error = -EINVAL;
 	if (old_dentry == trap)
-		goto exit4;
-	new_dentry = lookup_hash(&newnd);
-	error = PTR_ERR(new_dentry);
-	if (IS_ERR(new_dentry))
-		goto exit4;
+		goto exit5;
 	/* target should not be an ancestor of source */
-	error = -ENOTEMPTY;
+	if (!(flags & RENAME_EXCHANGE))
+		error = -ENOTEMPTY;
 	if (new_dentry == trap)
 		goto exit5;
 
 	error = security_path_rename(&oldnd.path, old_dentry,
-				     &newnd.path, new_dentry);
+				     &newnd.path, new_dentry, flags);
 	if (error)
 		goto exit5;
 	error = vfs_rename(old_dir->d_inode, old_dentry,
 			   new_dir->d_inode, new_dentry,
-			   &delegated_inode);
+			   &delegated_inode, flags);
exit5:
 	dput(new_dentry);
exit4:
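As the vfs_rename() dispatch above shows, a filesystem opts in to rename flags by providing ->rename2; otherwise any nonzero flags fail with -EINVAL. A hedged sketch of a minimal handler, with hypothetical foofs_* names (foofs_rename is assumed to be the existing flag-less op):

static int foofs_rename2(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry,
			 unsigned int flags)
{
	if (flags & ~RENAME_NOREPLACE)
		return -EINVAL;	/* reject flags we cannot honour */
	/*
	 * RENAME_NOREPLACE is already enforced by the VFS lookup above
	 * (-EEXIST on a positive target before ->rename2 is called), so
	 * a simple local filesystem can fall through to its old path.
	 */
	return foofs_rename(old_dir, old_dentry, new_dir, new_dentry);
}

RENAME_EXCHANGE, by contrast, needs real filesystem support, which is why the sketch rejects it.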
@@ -4272,16 +4313,20 @@ exit:
 	return error;
 }
 
-SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
+SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname)
 {
-	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
+	return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
 }
 
-int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
+SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
 {
-	int len;
+	return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+}
 
-	len = PTR_ERR(link);
+int readlink_copy(char __user *buffer, int buflen, const char *link)
+{
+	int len = PTR_ERR(link);
 	if (IS_ERR(link))
 		goto out;
 
@@ -4293,6 +4338,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c
 out:
 	return len;
 }
+EXPORT_SYMBOL(readlink_copy);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
@@ -4310,11 +4356,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 	if (IS_ERR(cookie))
 		return PTR_ERR(cookie);
 
-	res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+	res = readlink_copy(buffer, buflen, nd_get_link(&nd));
 	if (dentry->d_inode->i_op->put_link)
 		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
 	return res;
 }
+EXPORT_SYMBOL(generic_readlink);
 
 /* get the link contents into pagecache */
 static char *page_getlink(struct dentry * dentry, struct page **ppage)
@@ -4334,14 +4381,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
 	struct page *page = NULL;
-	char *s = page_getlink(dentry, &page);
-	int res = vfs_readlink(dentry,buffer,buflen,s);
+	int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
 	if (page) {
 		kunmap(page);
 		page_cache_release(page);
 	}
 	return res;
 }
+EXPORT_SYMBOL(page_readlink);
 
 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
 {
@@ -4349,6 +4396,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
 	nd_set_link(nd, page_getlink(dentry, &page));
 	return page;
 }
+EXPORT_SYMBOL(page_follow_link_light);
 
 void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 {
@@ -4359,6 +4407,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 		page_cache_release(page);
 	}
 }
+EXPORT_SYMBOL(page_put_link);
 
 /*
  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4396,45 +4445,18 @@ retry:
 fail:
 	return err;
 }
+EXPORT_SYMBOL(__page_symlink);
 
 int page_symlink(struct inode *inode, const char *symname, int len)
 {
 	return __page_symlink(inode, symname, len,
 			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
 }
+EXPORT_SYMBOL(page_symlink);
 
 const struct inode_operations page_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
 };
-
-EXPORT_SYMBOL(user_path_at);
-EXPORT_SYMBOL(follow_down_one);
-EXPORT_SYMBOL(follow_down);
-EXPORT_SYMBOL(follow_up);
-EXPORT_SYMBOL(get_write_access); /* nfsd */
-EXPORT_SYMBOL(lock_rename);
-EXPORT_SYMBOL(lookup_one_len);
-EXPORT_SYMBOL(page_follow_link_light);
-EXPORT_SYMBOL(page_put_link);
-EXPORT_SYMBOL(page_readlink);
-EXPORT_SYMBOL(__page_symlink);
-EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(kern_path);
-EXPORT_SYMBOL(vfs_path_lookup);
-EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(unlock_rename);
-EXPORT_SYMBOL(vfs_create);
-EXPORT_SYMBOL(vfs_link);
-EXPORT_SYMBOL(vfs_mkdir);
-EXPORT_SYMBOL(vfs_mknod);
-EXPORT_SYMBOL(generic_permission);
-EXPORT_SYMBOL(vfs_readlink);
-EXPORT_SYMBOL(vfs_rename);
-EXPORT_SYMBOL(vfs_rmdir);
-EXPORT_SYMBOL(vfs_symlink);
-EXPORT_SYMBOL(vfs_unlink);
-EXPORT_SYMBOL(dentry_unhash);
-EXPORT_SYMBOL(generic_readlink);
diff --git a/fs/namespace.c b/fs/namespace.c
index 2ffc5a2905d4..182bc41cd887 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -52,7 +52,7 @@ static int __init set_mphash_entries(char *str)
 }
 __setup("mphash_entries=", set_mphash_entries);
 
-static int event;
+static u64 event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
 static DEFINE_SPINLOCK(mnt_id_lock);
@@ -414,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
  */
 int __mnt_want_write_file(struct file *file)
 {
-	struct inode *inode = file_inode(file);
-
-	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
+	if (!(file->f_mode & FMODE_WRITER))
 		return __mnt_want_write(file->f_path.mnt);
 	else
 		return mnt_clone_write(file->f_path.mnt);
@@ -570,13 +568,17 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 static void free_vfsmnt(struct mount *mnt)
 {
 	kfree(mnt->mnt_devname);
-	mnt_free_id(mnt);
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
 #endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+static void delayed_free_vfsmnt(struct rcu_head *head)
+{
+	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
+}
+
 /* call under rcu_read_lock */
 bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 {
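delayed_free_vfsmnt() is the standard call_rcu() idiom: embed an rcu_head in the object, then recover the enclosing structure with container_of() once a grace period has elapsed. A generic, hedged sketch of the same pattern (my_obj names are illustrative):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
	int data;
	struct rcu_head rcu;	/* storage for the RCU callback */
};

static void my_obj_free_rcu(struct rcu_head *head)
{
	/* runs only after every pre-existing RCU reader has finished */
	kfree(container_of(head, struct my_obj, rcu));
}

static void my_obj_put(struct my_obj *obj)
{
	call_rcu(&obj->rcu, my_obj_free_rcu);	/* never blocks */
}

This is what lets legitimize_mnt() walk mounts under rcu_read_lock() without taking a reference first.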
@@ -848,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 
 	root = mount_fs(type, flags, name, data);
 	if (IS_ERR(root)) {
+		mnt_free_id(mnt);
 		free_vfsmnt(mnt);
 		return ERR_CAST(root);
 	}
@@ -885,7 +888,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 			goto out_free;
 	}
 
-	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
+	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
 	/* Don't allow unprivileged users to change mount flags */
 	if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
 		mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
@@ -928,20 +931,11 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	return mnt;
 
  out_free:
+	mnt_free_id(mnt);
 	free_vfsmnt(mnt);
 	return ERR_PTR(err);
 }
 
-static void delayed_free(struct rcu_head *head)
-{
-	struct mount *mnt = container_of(head, struct mount, mnt_rcu);
-	kfree(mnt->mnt_devname);
-#ifdef CONFIG_SMP
-	free_percpu(mnt->mnt_pcp);
-#endif
-	kmem_cache_free(mnt_cache, mnt);
-}
-
 static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
@@ -991,7 +985,7 @@ put_again:
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
 	mnt_free_id(mnt);
-	call_rcu(&mnt->mnt_rcu, delayed_free);
+	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
 }
 
 void mntput(struct vfsmount *mnt)
@@ -1100,14 +1094,29 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	struct proc_mounts *p = proc_mounts(m);
 
 	down_read(&namespace_sem);
-	return seq_list_start(&p->ns->list, *pos);
+	if (p->cached_event == p->ns->event) {
+		void *v = p->cached_mount;
+		if (*pos == p->cached_index)
+			return v;
+		if (*pos == p->cached_index + 1) {
+			v = seq_list_next(v, &p->ns->list, &p->cached_index);
+			return p->cached_mount = v;
+		}
+	}
+
+	p->cached_event = p->ns->event;
+	p->cached_mount = seq_list_start(&p->ns->list, *pos);
+	p->cached_index = *pos;
+	return p->cached_mount;
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct proc_mounts *p = proc_mounts(m);
 
-	return seq_list_next(v, &p->ns->list, pos);
+	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
+	p->cached_index = *pos;
+	return p->cached_mount;
 }
 
 static void m_stop(struct seq_file *m, void *v)
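The motivation for the cached_mount/cached_index fields added to struct proc_mounts above: ->start() runs once per read() chunk on /proc/*/mounts, and the uncached seq_list pattern walks the list from its head every time, making a streamed read of a long mount table quadratic. For contrast, a hedged sketch of that uncached pattern (demo_list is a placeholder):

#include <linux/list.h>
#include <linux/seq_file.h>

static LIST_HEAD(demo_list);

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	return seq_list_start(&demo_list, *pos);	/* O(*pos) list walk */
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &demo_list, pos);	/* one step */
}

The cache is only trusted while p->cached_event matches the namespace's u64 event counter, which every mount-table change bumps; that is also why the counter was widened from int above, so wraparound cannot revalidate a stale cursor.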
@@ -1661,9 +1670,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		if (err)
 			goto out;
 		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
+		lock_mount_hash();
 		if (err)
 			goto out_cleanup_ids;
-		lock_mount_hash();
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
 			set_mnt_shared(p);
 	} else {
@@ -1690,6 +1699,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	return 0;
 
  out_cleanup_ids:
+	while (!hlist_empty(&tree_list)) {
+		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+		umount_tree(child, 0);
+	}
+	unlock_mount_hash();
 	cleanup_group_ids(source_mnt, NULL);
  out:
 	return err;
@@ -2044,7 +2058,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 	struct mount *parent;
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
+	mnt_flags &= ~MNT_INTERNAL_FLAGS;
 
 	mp = lock_mount(path);
 	if (IS_ERR(mp))
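The open-coded mask becomes MNT_INTERNAL_FLAGS. Assuming the v3.15 include/linux/mount.h, the macro looks roughly like the following, with MNT_MARKED (the flag newly stripped in the clone_mnt hunk above) folded in; quoted from memory, so verify against the header before relying on it:

#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)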
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index c320ac52353e..08b8ea8c353e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -339,7 +339,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 	if (val)
 		goto finished;
 
-	DDPRINTK("ncp_lookup_validate: %pd2 not valid, age=%ld, server lookup\n",
+	ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n",
 		dentry, NCP_GET_AGE(dentry));
 
 	len = sizeof(__name);
@@ -358,7 +358,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 		res = ncp_obtain_info(server, dir, __name, &(finfo.i));
 	}
 	finfo.volume = finfo.i.volNumber;
-	DDPRINTK("ncp_lookup_validate: looked for %pd/%s, res=%d\n",
+	ncp_dbg(2, "looked for %pd/%s, res=%d\n",
 		dentry->d_parent, __name, res);
 	/*
 	 * If we didn't find it, or if it has a different dirEntNum to
@@ -372,14 +372,14 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 			ncp_new_dentry(dentry);
 			val=1;
 		} else
-			DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
+			ncp_dbg(2, "found, but dirEntNum changed\n");
 
 		ncp_update_inode2(inode, &finfo);
 		mutex_unlock(&inode->i_mutex);
 	}
 
 finished:
-	DDPRINTK("ncp_lookup_validate: result=%d\n", val);
+	ncp_dbg(2, "result=%d\n", val);
 	dput(parent);
 	return val;
 }
@@ -453,8 +453,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
 	ctl.page  = NULL;
 	ctl.cache = NULL;
 
-	DDPRINTK("ncp_readdir: reading %pD2, pos=%d\n", file,
-		(int) ctx->pos);
+	ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos);
 
 	result = -EIO;
 	/* Do not generate '.' and '..' when server is dead. */
@@ -697,8 +696,7 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx,
 	struct ncp_entry_info entry;
 	int i;
 
-	DPRINTK("ncp_read_volume_list: pos=%ld\n",
-		(unsigned long) ctx->pos);
+	ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos);
 
 	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
 		int inval_dentry;
@@ -708,12 +706,11 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx,
 		if (!strlen(info.volume_name))
 			continue;
 
-		DPRINTK("ncp_read_volume_list: found vol: %s\n",
-			info.volume_name);
+		ncp_dbg(1, "found vol: %s\n", info.volume_name);
 
 		if (ncp_lookup_volume(server, info.volume_name,
 				      &entry.i)) {
-			DPRINTK("ncpfs: could not lookup vol %s\n",
+			ncp_dbg(1, "could not lookup vol %s\n",
 				info.volume_name);
 			continue;
 		}
@@ -738,14 +735,13 @@ ncp_do_readdir(struct file *file, struct dir_context *ctx,
 	int more;
 	size_t bufsize;
 
-	DPRINTK("ncp_do_readdir: %pD2, fpos=%ld\n", file,
-		(unsigned long) ctx->pos);
-	PPRINTK("ncp_do_readdir: init %pD, volnum=%d, dirent=%u\n",
-		file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
+	ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos);
+	ncp_vdbg("init %pD, volnum=%d, dirent=%u\n",
+		 file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
 
 	err = ncp_initialize_search(server, dir, &seq);
 	if (err) {
-		DPRINTK("ncp_do_readdir: init failed, err=%d\n", err);
+		ncp_dbg(1, "init failed, err=%d\n", err);
 		return;
 	}
 	/* We MUST NOT use server->buffer_size handshaked with server if we are
@@ -808,8 +804,7 @@ int ncp_conn_logged_in(struct super_block *sb)
 			goto out;
 		result = -ENOENT;
 		if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) {
-			PPRINTK("ncp_conn_logged_in: %s not found\n",
-				server->m.mounted_vol);
+			ncp_vdbg("%s not found\n", server->m.mounted_vol);
 			goto out;
 		}
 		dent = sb->s_root;
@@ -822,10 +817,10 @@ int ncp_conn_logged_in(struct super_block *sb)
 				NCP_FINFO(ino)->DosDirNum = DosDirNum;
 				result = 0;
 			} else {
-				DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
+				ncp_dbg(1, "sb->s_root->d_inode == NULL!\n");
 			}
 		} else {
-			DPRINTK("ncpfs: sb->s_root == NULL!\n");
+			ncp_dbg(1, "sb->s_root == NULL!\n");
 		}
 	} else
 		result = 0;
@@ -846,7 +841,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
 	if (!ncp_conn_valid(server))
 		goto finished;
 
-	PPRINTK("ncp_lookup: server lookup for %pd2\n", dentry);
+	ncp_vdbg("server lookup for %pd2\n", dentry);
 
 	len = sizeof(__name);
 	if (ncp_is_server_root(dir)) {
@@ -854,15 +849,15 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
 			dentry->d_name.len, 1);
 		if (!res)
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
 		if (!res)
 			ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
 		if (!res)
 			res = ncp_obtain_info(server, dir, __name, &(finfo.i));
 	}
-	PPRINTK("ncp_lookup: looked for %pd2, res=%d\n", dentry, res);
+	ncp_vdbg("looked for %pd2, res=%d\n", dentry, res);
 	/*
 	 * If we didn't find an entry, make a negative dentry.
 	 */
@@ -886,7 +881,7 @@ add_entry:
 	}
 
 finished:
-	PPRINTK("ncp_lookup: result=%d\n", error);
+	ncp_vdbg("result=%d\n", error);
 	return ERR_PTR(error);
 }
 
@@ -909,7 +904,7 @@ out:
 	return error;
 
 out_close:
-	PPRINTK("ncp_instantiate: %pd2 failed, closing file\n", dentry);
+	ncp_vdbg("%pd2 failed, closing file\n", dentry);
 	ncp_close_file(NCP_SERVER(dir), finfo->file_handle);
 	goto out;
 }
@@ -923,7 +918,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
 	int opmode;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	PPRINTK("ncp_create_new: creating %pd2, mode=%hx\n", dentry, mode);
+	ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode);
 
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
@@ -952,7 +947,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
 			error = -ENAMETOOLONG;
 		else if (result < 0)
 			error = result;
-		DPRINTK("ncp_create: %pd2 failed\n", dentry);
+		ncp_dbg(1, "%pd2 failed\n", dentry);
 		goto out;
 	}
 	opmode = O_WRONLY;
@@ -985,7 +980,7 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	int error, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	DPRINTK("ncp_mkdir: making %pd2\n", dentry);
+	ncp_dbg(1, "making %pd2\n", dentry);
 
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
@@ -1022,7 +1017,7 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	int error, result, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	DPRINTK("ncp_rmdir: removing %pd2\n", dentry);
+	ncp_dbg(1, "removing %pd2\n", dentry);
 
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -1067,13 +1062,13 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 	int error;
 
 	server = NCP_SERVER(dir);
-	DPRINTK("ncp_unlink: unlinking %pd2\n", dentry);
+	ncp_dbg(1, "unlinking %pd2\n", dentry);
 
 	/*
 	 * Check whether to close the file ...
 	 */
 	if (inode) {
-		PPRINTK("ncp_unlink: closing file\n");
+		ncp_vdbg("closing file\n");
 		ncp_make_closed(inode);
 	}
 
@@ -1087,7 +1082,7 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 #endif
 	switch (error) {
 		case 0x00:
-			DPRINTK("ncp: removed %pd2\n", dentry);
+			ncp_dbg(1, "removed %pd2\n", dentry);
 			break;
 		case 0x85:
 		case 0x8A:
@@ -1120,7 +1115,7 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int old_len, new_len;
 	__u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1];
 
-	DPRINTK("ncp_rename: %pd2 to %pd2\n", old_dentry, new_dentry);
+	ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry);
 
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);
@@ -1150,8 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 #endif
 	switch (error) {
 		case 0x00:
-			DPRINTK("ncp renamed %pd -> %pd.\n",
-				old_dentry, new_dentry);
+			ncp_dbg(1, "renamed %pd -> %pd\n",
+				old_dentry, new_dentry);
 			break;
 		case 0x9E:
 			error = -ENAMETOOLONG;
@@ -1173,7 +1168,7 @@ static int ncp_mknod(struct inode * dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 	if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
-		DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode);
+		ncp_dbg(1, "mode = 0%ho\n", mode);
 		return ncp_create_new(dir, dentry, mode, rdev, 0);
 	}
 	return -EPERM; /* Strange, but true */
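All of the ncpfs hunks replace the ad-hoc DPRINTK/DDPRINTK/PPRINTK macros with ncp_dbg()/ncp_vdbg(), which also drop the hand-written function-name prefixes (pr_debug with dynamic debug can add those itself). The exact definitions live in fs/ncpfs/ncp_fs.h; a hedged sketch of their likely shape, where the verbosity-threshold name is illustrative:

/* Rough sketch only; consult fs/ncpfs/ncp_fs.h for the real macros. */
#define ncp_dbg(level, fmt, ...)				\
do {								\
	if ((level) <= NCPFS_DEBUG_VERBOSITY)			\
		pr_debug(fmt, ##__VA_ARGS__);			\
} while (0)

#define ncp_vdbg(fmt, ...)	ncp_dbg(2, fmt, ##__VA_ARGS__)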
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 8f5074e1ecb9..77640a8bfb87 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -6,6 +6,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <asm/uaccess.h>
 
 #include <linux/time.h>
@@ -34,11 +36,11 @@ int ncp_make_open(struct inode *inode, int right)
 
 	error = -EINVAL;
 	if (!inode) {
-		printk(KERN_ERR "ncp_make_open: got NULL inode\n");
+		pr_err("%s: got NULL inode\n", __func__);
 		goto out;
 	}
 
-	DPRINTK("ncp_make_open: opened=%d, volume # %u, dir entry # %u\n",
+	ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n",
 		atomic_read(&NCP_FINFO(inode)->opened),
 		NCP_FINFO(inode)->volNumber,
 		NCP_FINFO(inode)->dirEntNum);
@@ -71,7 +73,7 @@ int ncp_make_open(struct inode *inode, int right)
 			break;
 	}
 	if (result) {
-		PPRINTK("ncp_make_open: failed, result=%d\n", result);
+		ncp_vdbg("failed, result=%d\n", result);
 		goto out_unlock;
 	}
 	/*
@@ -83,7 +85,7 @@ int ncp_make_open(struct inode *inode, int right)
 	}
 
 	access = NCP_FINFO(inode)->access;
-	PPRINTK("ncp_make_open: file open, access=%x\n", access);
+	ncp_vdbg("file open, access=%x\n", access);
 	if (access == right || access == O_RDWR) {
 		atomic_inc(&NCP_FINFO(inode)->opened);
 		error = 0;
@@ -107,7 +109,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	void* freepage;
 	size_t freelen;
 
-	DPRINTK("ncp_file_read: enter %pd2\n", dentry);
+	ncp_dbg(1, "enter %pd2\n", dentry);
 
 	pos = *ppos;
 
@@ -124,7 +126,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 
 	error = ncp_make_open(inode, O_RDONLY);
 	if (error) {
-		DPRINTK(KERN_ERR "ncp_file_read: open failed, error=%d\n", error);
+		ncp_dbg(1, "open failed, error=%d\n", error);
 		return error;
 	}
 
@@ -165,7 +167,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 
 	file_accessed(file);
 
-	DPRINTK("ncp_file_read: exit %pd2\n", dentry);
+	ncp_dbg(1, "exit %pd2\n", dentry);
 outrel:
 	ncp_inode_close(inode);
 	return already_read ? already_read : error;
@@ -182,7 +184,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 	int errno;
 	void* bouncebuffer;
 
-	DPRINTK("ncp_file_write: enter %pd2\n", dentry);
+	ncp_dbg(1, "enter %pd2\n", dentry);
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 	pos = *ppos;
@@ -211,7 +213,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 		return 0;
 	errno = ncp_make_open(inode, O_WRONLY);
 	if (errno) {
-		DPRINTK(KERN_ERR "ncp_file_write: open failed, error=%d\n", errno);
+		ncp_dbg(1, "open failed, error=%d\n", errno);
 		return errno;
 	}
 	bufsize = NCP_SERVER(inode)->buffer_size;
@@ -261,7 +263,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 		i_size_write(inode, pos);
 		mutex_unlock(&inode->i_mutex);
 	}
-	DPRINTK("ncp_file_write: exit %pd2\n", dentry);
+	ncp_dbg(1, "exit %pd2\n", dentry);
 outrel:
 	ncp_inode_close(inode);
 	return already_written ? already_written : errno;
@@ -269,7 +271,7 @@ outrel:
 
 static int ncp_release(struct inode *inode, struct file *file) {
 	if (ncp_make_closed(inode)) {
-		DPRINTK("ncp_release: failed to close\n");
+		ncp_dbg(1, "failed to close\n");
 	}
 	return 0;
 }
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 0af3349de851..03ffde1f44d6 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -2,6 +2,8 @@
  * getopt.c
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/string.h>
 
@@ -46,8 +48,8 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts
 			if (opts->has_arg & OPT_NOPARAM) {
 				return opts->val;
 			}
-			printk(KERN_INFO "%s: the %s option requires an argument\n",
-			       caller, token);
+			pr_info("%s: the %s option requires an argument\n",
+				caller, token);
 			return -EINVAL;
 		}
 		if (opts->has_arg & OPT_INT) {
@@ -57,18 +59,18 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts
 			if (!*v) {
 				return opts->val;
 			}
-			printk(KERN_INFO "%s: invalid numeric value in %s=%s\n",
-			       caller, token, val);
+			pr_info("%s: invalid numeric value in %s=%s\n",
+				caller, token, val);
 			return -EDOM;
 		}
 		if (opts->has_arg & OPT_STRING) {
 			return opts->val;
 		}
-		printk(KERN_INFO "%s: unexpected argument %s to the %s option\n",
-		       caller, val, token);
+		pr_info("%s: unexpected argument %s to the %s option\n",
+			caller, val, token);
 		return -EINVAL;
 		}
 	}
-	printk(KERN_INFO "%s: Unrecognized mount option %s\n", caller, token);
+	pr_info("%s: Unrecognized mount option %s\n", caller, token);
 	return -EOPNOTSUPP;
 }
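
The pr_fmt() definition added above getopt.c's includes makes every pr_info()/pr_err() in the file carry the module name automatically; the prefix is pasted into the format string at compile time. A minimal sketch of the mechanism (standalone illustration, not part of this patch):

	/* pr_fmt() must be defined before printk.h is pulled in. */
	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/kernel.h>

	static void demo(void)
	{
		/* In the ncpfs module this prints: "ncpfs: bad option 7" */
		pr_info("bad option %d\n", 7);
	}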
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 2cf2ebecb55f..e31e589369a4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -9,6 +9,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 
 #include <asm/uaccess.h>
@@ -99,6 +101,7 @@ static void destroy_inodecache(void)
 
 static int ncp_remount(struct super_block *sb, int *flags, char* data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return 0;
 }
@@ -132,7 +135,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo)
 	NCP_FINFO(inode)->access = nwinfo->access;
 	memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle,
 		sizeof(nwinfo->file_handle));
-	DPRINTK("ncp_update_inode: updated %s, volnum=%d, dirent=%u\n",
+	ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n",
 		nwinfo->i.entryName, NCP_FINFO(inode)->volNumber,
 		NCP_FINFO(inode)->dirEntNum);
 }
@@ -140,8 +143,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo)
 static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
 {
 	/* NFS namespace mode overrides others if it's set. */
-	DPRINTK(KERN_DEBUG "ncp_update_dates_and_mode: (%s) nfs.mode=0%o\n",
-		nwi->entryName, nwi->nfs.mode);
+	ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode);
 	if (nwi->nfs.mode) {
 		/* XXX Security? */
 		inode->i_mode = nwi->nfs.mode;
@@ -229,7 +231,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 
 	ncp_update_attrs(inode, nwinfo);
 
-	DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode);
+	ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode);
 
 	set_nlink(inode, 1);
 	inode->i_uid = server->m.uid;
@@ -257,7 +259,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 	struct inode *inode;
 
 	if (info == NULL) {
-		printk(KERN_ERR "ncp_iget: info is NULL\n");
+		pr_err("%s: info is NULL\n", __func__);
 		return NULL;
 	}
 
@@ -289,23 +291,23 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 		}
 		insert_inode_hash(inode);
 	} else
-		printk(KERN_ERR "ncp_iget: iget failed!\n");
+		pr_err("%s: iget failed!\n", __func__);
 	return inode;
 }
 
 static void
 ncp_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 
 	if (S_ISDIR(inode->i_mode)) {
-		DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino);
+		ncp_dbg(2, "put directory %ld\n", inode->i_ino);
 	}
 
 	if (ncp_make_closed(inode) != 0) {
 		/* We can't do anything but complain. */
-		printk(KERN_ERR "ncp_evict_inode: could not close\n");
+		pr_err("%s: could not close\n", __func__);
 	}
 }
 
@@ -468,9 +470,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 {
 	struct ncp_mount_data_kernel data;
 	struct ncp_server *server;
-	struct file *ncp_filp;
 	struct inode *root_inode;
-	struct inode *sock_inode;
 	struct socket *sock;
 	int error;
 	int default_bufsize;
@@ -539,18 +539,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) ||
 	    !gid_valid(data.gid))
 		goto out;
-	error = -EBADF;
-	ncp_filp = fget(data.ncp_fd);
-	if (!ncp_filp)
-		goto out;
-	error = -ENOTSOCK;
-	sock_inode = file_inode(ncp_filp);
-	if (!S_ISSOCK(sock_inode->i_mode))
-		goto out_fput;
-	sock = SOCKET_I(sock_inode);
+	sock = sockfd_lookup(data.ncp_fd, &error);
 	if (!sock)
-		goto out_fput;
+		goto out;
 
 	if (sock->type == SOCK_STREAM)
 		default_bufsize = 0xF000;
 	else
@@ -572,27 +564,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (error)
 		goto out_fput;
 
-	server->ncp_filp = ncp_filp;
 	server->ncp_sock = sock;
 
 	if (data.info_fd != -1) {
-		struct socket *info_sock;
-
-		error = -EBADF;
-		server->info_filp = fget(data.info_fd);
-		if (!server->info_filp)
-			goto out_bdi;
-		error = -ENOTSOCK;
-		sock_inode = file_inode(server->info_filp);
-		if (!S_ISSOCK(sock_inode->i_mode))
-			goto out_fput2;
-		info_sock = SOCKET_I(sock_inode);
+		struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
 		if (!info_sock)
-			goto out_fput2;
+			goto out_bdi;
+		server->info_sock = info_sock;
 		error = -EBADFD;
 		if (info_sock->type != SOCK_STREAM)
 			goto out_fput2;
-		server->info_sock = info_sock;
 	}
 
 /* server->lock = 0;	*/
@@ -620,7 +601,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	   now because of PATH_MAX changes.. */
 	if (server->m.time_out < 1) {
 		server->m.time_out = 10;
-		printk(KERN_INFO "You need to recompile your ncpfs utils..\n");
+		pr_info("You need to recompile your ncpfs utils..\n");
 	}
 	server->m.time_out = server->m.time_out * HZ / 100;
 	server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG;
@@ -681,7 +662,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	ncp_unlock_server(server);
 	if (error < 0)
 		goto out_rxbuf;
-	DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb));
+	ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb));
 
 	error = -EMSGSIZE;	/* -EREMOTESIDEINCOMPATIBLE */
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
@@ -709,7 +690,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (ncp_negotiate_buffersize(server, default_bufsize,
 		&(server->buffer_size)) != 0)
 		goto out_disconnect;
-	DPRINTK("ncpfs: bufsize = %d\n", server->buffer_size);
+	ncp_dbg(1, "bufsize = %d\n", server->buffer_size);
 
 	memset(&finfo, 0, sizeof(finfo));
 	finfo.i.attributes = aDIR;
@@ -738,7 +719,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	root_inode = ncp_iget(sb, &finfo);
 	if (!root_inode)
 		goto out_disconnect;
-	DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
+	ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
 	sb->s_root = d_make_root(root_inode);
 	if (!sb->s_root)
 		goto out_disconnect;
@@ -764,17 +745,12 @@ out_nls:
 	mutex_destroy(&server->root_setup_lock);
 	mutex_destroy(&server->mutex);
 out_fput2:
-	if (server->info_filp)
-		fput(server->info_filp);
+	if (server->info_sock)
+		sockfd_put(server->info_sock);
out_bdi:
 	bdi_destroy(&server->bdi);
out_fput:
-	/* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
-	 *
-	 * The previously used put_filp(ncp_filp); was bogus, since
-	 * it doesn't perform proper unlocking.
-	 */
-	fput(ncp_filp);
+	sockfd_put(sock);
out:
 	put_pid(data.wdog_pid);
 	sb->s_fs_info = NULL;
@@ -807,9 +783,9 @@ static void ncp_put_super(struct super_block *sb)
 	mutex_destroy(&server->root_setup_lock);
 	mutex_destroy(&server->mutex);
 
-	if (server->info_filp)
-		fput(server->info_filp);
-	fput(server->ncp_filp);
+	if (server->info_sock)
+		sockfd_put(server->info_sock);
+	sockfd_put(server->ncp_sock);
 	kill_pid(server->m.wdog_pid, SIGTERM, 1);
 	put_pid(server->m.wdog_pid);
 
@@ -984,8 +960,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 	if ((attr->ia_valid & ATTR_SIZE) != 0) {
 		int written;
 
-		DPRINTK("ncpfs: trying to change size to %ld\n",
-			attr->ia_size);
+		ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size);
 
 		if ((result = ncp_make_open(inode, O_WRONLY)) < 0) {
 			result = -EACCES;
@@ -1071,7 +1046,7 @@ MODULE_ALIAS_FS("ncpfs");
 static int __init init_ncp_fs(void)
 {
 	int err;
-	DPRINTK("ncpfs: init_ncp_fs called\n");
+	ncp_dbg(1, "called\n");
 
 	err = init_inodecache();
 	if (err)
@@ -1088,7 +1063,7 @@ out1:
 
 static void __exit exit_ncp_fs(void)
 {
-	DPRINTK("ncpfs: exit_ncp_fs called\n");
+	ncp_dbg(1, "called\n");
 	unregister_filesystem(&ncp_fs_type);
 	destroy_inodecache();
 }
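
The two ncp_fill_super() conversions above collapse the open-coded fget()/S_ISSOCK()/SOCKET_I() sequence into sockfd_lookup(), which resolves a file descriptor to its struct socket and reports -EBADF or -ENOTSOCK through its second argument; sockfd_put() drops the file reference the lookup took. The pairing in isolation (a sketch; the fd variable is illustrative):

	int err;
	struct socket *sock = sockfd_lookup(fd, &err);

	if (!sock)
		return err;	/* -EBADF or -ENOTSOCK */
	/* ... use the socket ... */
	sockfd_put(sock);	/* release the reference from the lookup */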
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60426ccb3b65..d5659d96ee7f 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -41,7 +41,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
 		return -EFAULT;
 
 	if (info.version != NCP_GET_FS_INFO_VERSION) {
-		DPRINTK("info.version invalid: %d\n", info.version);
+		ncp_dbg(1, "info.version invalid: %d\n", info.version);
 		return -EINVAL;
 	}
 	/* TODO: info.addr = server->m.serv_addr; */
@@ -66,7 +66,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		return -EFAULT;
 
 	if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
-		DPRINTK("info.version invalid: %d\n", info2.version);
+		ncp_dbg(1, "info.version invalid: %d\n", info2.version);
 		return -EINVAL;
 	}
 	info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
@@ -132,7 +132,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		return -EFAULT;
 
 	if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
-		DPRINTK("info.version invalid: %d\n", info2.version);
+		ncp_dbg(1, "info.version invalid: %d\n", info2.version);
 		return -EINVAL;
 	}
 	info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
@@ -308,8 +308,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
 			else
 				result = server->reply_size;
 			ncp_unlock_server(server);
-			DPRINTK("ncp_ioctl: copy %d bytes\n",
-				result);
+			ncp_dbg(1, "copy %d bytes\n", result);
 			if (result >= 0)
 				if (copy_to_user(request.data, bouncebuffer, result))
 					result = -EFAULT;
@@ -385,9 +384,9 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
 				sr.namespace = server->name_space[sr.volNumber];
 				result = 0;
 			} else
-				DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+				ncp_dbg(1, "s_root->d_inode==NULL\n");
 		} else
-			DPRINTK("ncpfs: s_root==NULL\n");
+			ncp_dbg(1, "s_root==NULL\n");
 	} else {
 		sr.volNumber = -1;
 		sr.namespace = 0;
@@ -440,11 +439,11 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
 				NCP_FINFO(s_inode)->DosDirNum = dosde;
 				server->root_setuped = 1;
 			} else {
-				DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+				ncp_dbg(1, "s_root->d_inode==NULL\n");
 				result = -EIO;
 			}
 		} else {
-			DPRINTK("ncpfs: s_root==NULL\n");
+			ncp_dbg(1, "s_root==NULL\n");
 			result = -EIO;
 		}
 	}
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 3c5dd55d284c..b359d12eb359 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -107,7 +107,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file_inode(file);
 
-	DPRINTK("ncp_mmap: called\n");
+	ncp_dbg(1, "called\n");
 
 	if (!ncp_conn_valid(NCP_SERVER(inode)))
 		return -EIO;
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
index 31831afe1c3b..b9f69e1b1f43 100644
--- a/fs/ncpfs/ncp_fs.h
+++ b/fs/ncpfs/ncp_fs.h
@@ -2,30 +2,32 @@
 #include "ncp_fs_i.h"
 #include "ncp_fs_sb.h"
 
-/* define because it is easy to change PRINTK to {*}PRINTK */
-#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
-
 #undef NCPFS_PARANOIA
 #ifdef NCPFS_PARANOIA
-#define PPRINTK(format, args...) PRINTK(format , ## args)
+#define ncp_vdbg(fmt, ...) \
+	pr_debug(fmt, ##__VA_ARGS__)
 #else
-#define PPRINTK(format, args...)
+#define ncp_vdbg(fmt, ...) \
+do { \
+	if (0) \
+		pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
 #endif
 
 #ifndef DEBUG_NCP
 #define DEBUG_NCP 0
 #endif
-#if DEBUG_NCP > 0
-#define DPRINTK(format, args...) PRINTK(format , ## args)
-#else
-#define DPRINTK(format, args...)
-#endif
-#if DEBUG_NCP > 1
-#define DDPRINTK(format, args...) PRINTK(format , ## args)
-#else
-#define DDPRINTK(format, args...)
+
+#if DEBUG_NCP > 0 && !defined(DEBUG)
+#define DEBUG
 #endif
 
+#define ncp_dbg(level, fmt, ...) \
+do { \
+	if (level <= DEBUG_NCP) \
+		pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
+
 #define NCP_MAX_RPC_TIMEOUT (6*HZ)
 
 
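
Both replacement macros above funnel into pr_debug(), so ncpfs messages now participate in dynamic debug, and the `#define DEBUG` shim keeps the old DEBUG_NCP build knob working. The `if (0)` arm of the disabled ncp_vdbg() still type-checks its arguments without generating code. Typical call sites, as converted throughout this diff:

	ncp_dbg(1, "enter %pd2\n", dentry);			/* level-1 tracing */
	ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode);	/* level-2, verbose */
	ncp_vdbg("completion code=%x\n", result);		/* NCPFS_PARANOIA only */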
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index b81e97adc5a9..55e26fd80886 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -45,9 +45,7 @@ struct ncp_server {
 
 	__u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
 
-	struct file *ncp_filp;	/* File pointer to ncp socket */
 	struct socket *ncp_sock;/* ncp socket */
-	struct file *info_filp;
 	struct socket *info_sock;
 
 	u8 sequence;
@@ -111,7 +109,7 @@ struct ncp_server {
 
 	spinlock_t requests_lock;	/* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
 
-	void (*data_ready)(struct sock* sk, int len);
+	void (*data_ready)(struct sock* sk);
 	void (*error_report)(struct sock* sk);
 	void (*write_space)(struct sock* sk);	/* STREAM mode only */
 	struct {
@@ -153,7 +151,7 @@ extern void ncp_tcp_tx_proc(struct work_struct *work);
 extern void ncpdgram_rcv_proc(struct work_struct *work);
 extern void ncpdgram_timeout_proc(struct work_struct *work);
 extern void ncpdgram_timeout_call(unsigned long server);
-extern void ncp_tcp_data_ready(struct sock* sk, int len);
+extern void ncp_tcp_data_ready(struct sock* sk);
 extern void ncp_tcp_write_space(struct sock* sk);
 extern void ncp_tcp_error_report(struct sock* sk);
 
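
The one-argument data_ready prototype tracks the v3.15 networking change that dropped the unused byte count from sk->sk_data_ready(). ncpfs saves the socket's original callback in server->data_ready and chains to it, so the saved pointer must use the same signature; the install side looks roughly like this (a sketch with assumed surrounding context; the save happens at mount time):

	sk->sk_user_data = server;
	server->data_ready = sk->sk_data_ready;	/* save the original callback */
	sk->sk_data_ready = ncp_tcp_data_ready;	/* new one-argument form */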
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 981a95617fc9..482387532f54 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -9,14 +9,14 @@
  *
  */
 
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include "ncp_fs.h"
 
 static inline void assert_server_locked(struct ncp_server *server)
 {
 	if (server->lock == 0) {
-		DPRINTK("ncpfs: server not locked!\n");
+		ncp_dbg(1, "server not locked!\n");
 	}
 }
 
@@ -75,7 +75,7 @@ static void ncp_add_pstring(struct ncp_server *server, const char *s)
 	int len = strlen(s);
 	assert_server_locked(server);
 	if (len > 255) {
-		DPRINTK("ncpfs: string too long: %s\n", s);
+		ncp_dbg(1, "string too long: %s\n", s);
 		len = 255;
 	}
 	ncp_add_byte(server, len);
@@ -225,7 +225,7 @@ int ncp_get_volume_info_with_number(struct ncp_server* server,
 	result = -EIO;
 	len = ncp_reply_byte(server, 29);
 	if (len > NCP_VOLNAME_LEN) {
-		DPRINTK("ncpfs: volume name too long: %d\n", len);
+		ncp_dbg(1, "volume name too long: %d\n", len);
 		goto out;
 	}
 	memcpy(&(target->volume_name), ncp_reply_data(server, 30), len);
@@ -259,7 +259,7 @@ int ncp_get_directory_info(struct ncp_server* server, __u8 n,
 	result = -EIO;
 	len = ncp_reply_byte(server, 21);
 	if (len > NCP_VOLNAME_LEN) {
-		DPRINTK("ncpfs: volume name too long: %d\n", len);
+		ncp_dbg(1, "volume name too long: %d\n", len);
 		goto out;
 	}
 	memcpy(&(target->volume_name), ncp_reply_data(server, 22), len);
@@ -295,9 +295,9 @@ ncp_make_closed(struct inode *inode)
 		err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle);
 
 		if (!err)
-			PPRINTK("ncp_make_closed: volnum=%d, dirent=%u, error=%d\n",
-				NCP_FINFO(inode)->volNumber,
-				NCP_FINFO(inode)->dirEntNum, err);
+			ncp_vdbg("volnum=%d, dirent=%u, error=%d\n",
+				 NCP_FINFO(inode)->volNumber,
+				 NCP_FINFO(inode)->dirEntNum, err);
 	}
 	mutex_unlock(&NCP_FINFO(inode)->open_mutex);
 	return err;
@@ -394,8 +394,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
 
 	if ((result = ncp_request(server, 87)) == 0) {
 		ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs);
-		DPRINTK(KERN_DEBUG
-			"ncp_obtain_nfs_info: (%s) mode=0%o, rdev=0x%x\n",
+		ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n",
 			target->entryName, target->nfs.mode,
 			target->nfs.rdev);
 	} else {
@@ -425,7 +424,7 @@ int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *pa
 	int result;
 
 	if (target == NULL) {
-		printk(KERN_ERR "ncp_obtain_info: invalid call\n");
+		pr_err("%s: invalid call\n", __func__);
 		return -EINVAL;
 	}
 	ncp_init_request(server);
@@ -498,7 +497,7 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
 	namespace = ncp_reply_data(server, 2);
 
 	while (no_namespaces > 0) {
-		DPRINTK("get_namespaces: found %d on %d\n", *namespace, volume);
+		ncp_dbg(1, "found %d on %d\n", *namespace, volume);
 
 #ifdef CONFIG_NCPFS_NFS_NS
 		if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS))
@@ -531,8 +530,7 @@ ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
 	if (ret_ns)
 		*ret_ns = ns;
 
-	DPRINTK("lookup_vol: namespace[%d] = %d\n",
-		volume, server->name_space[volume]);
+	ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]);
 
 	if (server->name_space[volume] == ns)
 		return 0;
@@ -596,7 +594,7 @@ ncp_get_volume_root(struct ncp_server *server,
 {
 	int result;
 
-	DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
+	ncp_dbg(1, "looking up vol %s\n", volname);
 
 	ncp_init_request(server);
 	ncp_add_byte(server, 22);	/* Subfunction: Generate dir handle */
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 3a1587222c8a..471bc3d1139e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -8,6 +8,7 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/time.h>
 #include <linux/errno.h>
@@ -96,11 +97,11 @@ static void ncp_req_put(struct ncp_request_reply *req)
 		kfree(req);
 }
 
-void ncp_tcp_data_ready(struct sock *sk, int len)
+void ncp_tcp_data_ready(struct sock *sk)
 {
 	struct ncp_server *server = sk->sk_user_data;
 
-	server->data_ready(sk, len);
+	server->data_ready(sk);
 	schedule_work(&server->rcv.tq);
 }
 
@@ -231,7 +232,7 @@ static void __ncptcp_try_send(struct ncp_server *server)
 		return;
 
 	if (result < 0) {
-		printk(KERN_ERR "ncpfs: tcp: Send failed: %d\n", result);
+		pr_err("tcp: Send failed: %d\n", result);
 		__ncp_abort_request(server, rq, result);
 		return;
 	}
@@ -332,7 +333,7 @@ static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *
 	mutex_lock(&server->rcv.creq_mutex);
 	if (!ncp_conn_valid(server)) {
 		mutex_unlock(&server->rcv.creq_mutex);
-		printk(KERN_ERR "ncpfs: tcp: Server died\n");
+		pr_err("tcp: Server died\n");
 		return -EIO;
 	}
 	ncp_req_get(req);
@@ -405,15 +406,15 @@ void ncpdgram_rcv_proc(struct work_struct *work)
 			}
 			result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT);
 			if (result < 0) {
-				DPRINTK("recv failed with %d\n", result);
+				ncp_dbg(1, "recv failed with %d\n", result);
 				continue;
 			}
 			if (result < 10) {
-				DPRINTK("too short (%u) watchdog packet\n", result);
+				ncp_dbg(1, "too short (%u) watchdog packet\n", result);
 				continue;
 			}
 			if (buf[9] != '?') {
-				DPRINTK("bad signature (%02X) in watchdog packet\n", buf[9]);
+				ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]);
 				continue;
 			}
 			buf[9] = 'Y';
@@ -448,7 +449,7 @@ void ncpdgram_rcv_proc(struct work_struct *work)
 					result -= 8;
 					hdrl = sock->sk->sk_family == AF_INET ? 8 : 6;
 					if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) {
-						printk(KERN_INFO "ncpfs: Signature violation\n");
+						pr_info("Signature violation\n");
 						result = -EIO;
 					}
 				}
@@ -524,7 +525,7 @@ static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len)
 		return result;
 	}
 	if (result > len) {
-		printk(KERN_ERR "ncpfs: tcp: bug in recvmsg (%u > %Zu)\n", result, len);
+		pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len);
 		return -EIO;
 	}
 	return result;
@@ -552,9 +553,9 @@ static int __ncptcp_rcv_proc(struct ncp_server *server)
 				__ncptcp_abort(server);
 			}
 			if (result < 0) {
-				printk(KERN_ERR "ncpfs: tcp: error in recvmsg: %d\n", result);
+				pr_err("tcp: error in recvmsg: %d\n", result);
 			} else {
-				DPRINTK(KERN_ERR "ncpfs: tcp: EOF\n");
+				ncp_dbg(1, "tcp: EOF\n");
 			}
 			return -EIO;
 		}
@@ -566,20 +567,20 @@ static int __ncptcp_rcv_proc(struct ncp_server *server)
 		switch (server->rcv.state) {
 		case 0:
 			if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) {
-				printk(KERN_ERR "ncpfs: tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic));
+				pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic));
 				__ncptcp_abort(server);
 				return -EIO;
 			}
 			datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF;
 			if (datalen < 10) {
-				printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen);
+				pr_err("tcp: Unexpected reply len %d\n", datalen);
 				__ncptcp_abort(server);
 				return -EIO;
 			}
#ifdef CONFIG_NCPFS_PACKET_SIGNING
 			if (server->sign_active) {
 				if (datalen < 18) {
-					printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen);
+					pr_err("tcp: Unexpected reply len %d\n", datalen);
 					__ncptcp_abort(server);
 					return -EIO;
 				}
@@ -604,7 +605,7 @@ cont:;
 				server->rcv.len = datalen - 10;
 				break;
 			}
-			DPRINTK("ncpfs: tcp: Unexpected NCP type %02X\n", type);
+			ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type);
skipdata2:;
 			server->rcv.state = 2;
skipdata:;
@@ -614,11 +615,11 @@ skipdata:;
 			}
 			req = server->rcv.creq;
 			if (!req) {
-				DPRINTK(KERN_ERR "ncpfs: Reply without appropriate request\n");
+				ncp_dbg(1, "Reply without appropriate request\n");
 				goto skipdata2;
 			}
 			if (datalen > req->datalen + 8) {
-				printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
+				pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
 				server->rcv.state = 3;
 				goto skipdata;
 			}
@@ -638,12 +639,12 @@ skipdata:;
 			req = server->rcv.creq;
 			if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) {
 				if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) {
-					printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n");
+					pr_err("tcp: Bad sequence number\n");
 					__ncp_abort_request(server, req, -EIO);
 					return -EIO;
 				}
 				if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) {
-					printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n");
+					pr_err("tcp: Connection number mismatch\n");
 					__ncp_abort_request(server, req, -EIO);
 					return -EIO;
 				}
@@ -651,7 +652,7 @@ skipdata:;
#ifdef CONFIG_NCPFS_PACKET_SIGNING
 			if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) {
 				if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) {
-					printk(KERN_ERR "ncpfs: tcp: Signature violation\n");
+					pr_err("tcp: Signature violation\n");
 					__ncp_abort_request(server, req, -EIO);
 					return -EIO;
 				}
@@ -742,7 +743,7 @@ static int ncp_do_request(struct ncp_server *server, int size,
 	int result;
 
 	if (server->lock == 0) {
-		printk(KERN_ERR "ncpfs: Server not locked!\n");
+		pr_err("Server not locked!\n");
 		return -EIO;
 	}
 	if (!ncp_conn_valid(server)) {
@@ -781,7 +782,7 @@ static int ncp_do_request(struct ncp_server *server, int size,
 		spin_unlock_irqrestore(&current->sighand->siglock, flags);
 	}
 
-	DDPRINTK("do_ncp_rpc_call returned %d\n", result);
+	ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result);
 
 	return result;
 }
@@ -811,7 +812,7 @@ int ncp_request2(struct ncp_server *server, int function,
 
 	result = ncp_do_request(server, server->current_size, reply, size);
 	if (result < 0) {
-		DPRINTK("ncp_request_error: %d\n", result);
+		ncp_dbg(1, "ncp_request_error: %d\n", result);
 		goto out;
 	}
 	server->completion = reply->completion_code;
@@ -822,7 +823,7 @@ int ncp_request2(struct ncp_server *server, int function,
 	result = reply->completion_code;
 
 	if (result != 0)
-		PPRINTK("ncp_request: completion code=%x\n", result);
+		ncp_vdbg("completion code=%x\n", result);
out:
 	return result;
 }
@@ -865,14 +866,14 @@ void ncp_lock_server(struct ncp_server *server)
 {
 	mutex_lock(&server->mutex);
 	if (server->lock)
-		printk(KERN_WARNING "ncp_lock_server: was locked!\n");
+		pr_warn("%s: was locked!\n", __func__);
 	server->lock = 1;
 }
 
 void ncp_unlock_server(struct ncp_server *server)
 {
 	if (!server->lock) {
-		printk(KERN_WARNING "ncp_unlock_server: was not locked!\n");
+		pr_warn("%s: was not locked!\n", __func__);
 		return;
 	}
 	server->lock = 0;
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index 52439ddc8de0..1a63bfdb4a65 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -112,7 +112,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) {
 	__le32 attr;
 	unsigned int hdr;
 
-	DPRINTK("ncp_symlink(dir=%p,dentry=%p,symname=%s)\n",dir,dentry,symname);
+	ncp_dbg(1, "dir=%p, dentry=%p, symname=%s\n", dir, dentry, symname);
 
 	if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber))
 		kludge = 0;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 56ff823ca82e..65d849bdf77a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1213,7 +1213,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
 	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
 	if (end != NFS_I(inode)->npages) {
 		rcu_read_lock();
-		end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
+		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
 		rcu_read_unlock();
 	}
 
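
page_cache_next_hole() is the v3.15 replacement for calling radix_tree_next_hole() directly on mapping->page_tree; the semantics are unchanged, only the interface moves to the address_space. Signature for reference (variable names illustrative):

	/* first index in [index, index + max_scan) with no page in the cache */
	pgoff_t hole = page_cache_next_hole(mapping, index, max_scan);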
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index ae2e87b95453..41db5258e7a7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -112,7 +112,8 @@ out:
  * TODO: keep track of all layouts (and delegations) in a hash table
  * hashed by filehandle.
  */
-static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh)
+static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
+		struct nfs_fh *fh, nfs4_stateid *stateid)
 {
 	struct nfs_server *server;
 	struct inode *ino;
@@ -120,17 +121,19 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
 
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		list_for_each_entry(lo, &server->layouts, plh_layouts) {
+			if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+				continue;
 			if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
 				continue;
 			ino = igrab(lo->plh_inode);
 			if (!ino)
-				continue;
+				break;
 			spin_lock(&ino->i_lock);
 			/* Is this layout in the process of being freed? */
 			if (NFS_I(ino)->layout != lo) {
 				spin_unlock(&ino->i_lock);
 				iput(ino);
-				continue;
+				break;
 			}
 			pnfs_get_layout_hdr(lo);
 			spin_unlock(&ino->i_lock);
@@ -141,13 +144,14 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
 	return NULL;
 }
 
-static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh)
+static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
+		struct nfs_fh *fh, nfs4_stateid *stateid)
 {
 	struct pnfs_layout_hdr *lo;
 
 	spin_lock(&clp->cl_lock);
 	rcu_read_lock();
-	lo = get_layout_by_fh_locked(clp, fh);
+	lo = get_layout_by_fh_locked(clp, fh, stateid);
 	rcu_read_unlock();
 	spin_unlock(&clp->cl_lock);
 
@@ -162,9 +166,9 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
 	LIST_HEAD(free_me_list);
 
-	lo = get_layout_by_fh(clp, &args->cbl_fh);
+	lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
 	if (!lo)
-		return NFS4ERR_NOMATCHING_LAYOUT;
+		goto out;
 
 	ino = lo->plh_inode;
 	spin_lock(&ino->i_lock);
@@ -179,6 +183,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_put_layout_hdr(lo);
 	iput(ino);
+out:
 	return rv;
 }
 
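
The recall path now matches layouts on the callback's stateid as well as the filehandle. nfs4_stateid_match_other(), added in the nfs4_fs.h hunk further down this diff, deliberately ignores the seqid word and compares only the 12-byte `other` field, i.e. it asks "same state?" rather than "same state at the same sequence?":

	nfs4_stateid_match(&a, &b);		/* seqid and other must both match */
	nfs4_stateid_match_other(&a, &b);	/* only the `other' bytes compared */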
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a48fe4b84b6..d9f3d067cd15 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -69,21 +69,28 @@ const struct address_space_operations nfs_dir_aops = {
 
 static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
 {
+	struct nfs_inode *nfsi = NFS_I(dir);
 	struct nfs_open_dir_context *ctx;
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
 		ctx->duped = 0;
-		ctx->attr_gencount = NFS_I(dir)->attr_gencount;
+		ctx->attr_gencount = nfsi->attr_gencount;
 		ctx->dir_cookie = 0;
 		ctx->dup_cookie = 0;
 		ctx->cred = get_rpccred(cred);
+		spin_lock(&dir->i_lock);
+		list_add(&ctx->list, &nfsi->open_files);
+		spin_unlock(&dir->i_lock);
 		return ctx;
 	}
 	return ERR_PTR(-ENOMEM);
 }
 
-static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
+static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
 {
+	spin_lock(&dir->i_lock);
+	list_del(&ctx->list);
+	spin_unlock(&dir->i_lock);
 	put_rpccred(ctx->cred);
 	kfree(ctx);
 }
@@ -126,7 +133,7 @@ out:
 static int
 nfs_closedir(struct inode *inode, struct file *filp)
 {
-	put_nfs_open_dir_context(filp->private_data);
+	put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data);
 	return 0;
 }
 
@@ -306,10 +313,9 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 			if (printk_ratelimit()) {
 				pr_notice("NFS: directory %pD2 contains a readdir loop."
 						"Please contact your server vendor. "
-						"The file: %s has duplicate cookie %llu\n",
-						desc->file,
-						array->array[i].string.name,
-						*desc->dir_cookie);
+						"The file: %.*s has duplicate cookie %llu\n",
+						desc->file, array->array[i].string.len,
+						array->array[i].string.name, *desc->dir_cookie);
 			}
 			status = -ELOOP;
 			goto out;
@@ -437,6 +443,22 @@ void nfs_advise_use_readdirplus(struct inode *dir)
 	set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
 }
 
+/*
+ * This function is mainly for use by nfs_getattr().
+ *
+ * If this is an 'ls -l', we want to force use of readdirplus.
+ * Do this by checking if there is an active file descriptor
+ * and calling nfs_advise_use_readdirplus, then forcing a
+ * cache flush.
+ */
+void nfs_force_use_readdirplus(struct inode *dir)
+{
+	if (!list_empty(&NFS_I(dir)->open_files)) {
+		nfs_advise_use_readdirplus(dir);
+		nfs_zap_mapping(dir, dir->i_mapping);
+	}
+}
+
 static
 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 {
@@ -815,6 +837,17 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
 	goto out;
 }
 
+static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+
+	if (nfs_attribute_cache_expired(dir))
+		return true;
+	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+		return true;
+	return false;
+}
+
 /* The file offset position represents the dirent entry number.  A
    last cookie cache takes care of the common case of reading the
    whole directory.
@@ -847,7 +880,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
 
 	nfs_block_sillyrename(dentry);
-	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
+	if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
 		res = nfs_revalidate_mapping(inode, file->f_mapping);
 	if (res < 0)
 		goto out;
@@ -1911,6 +1944,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct dentry *dentry = NULL, *rehash = NULL;
+	struct rpc_task *task;
 	int error = -EBUSY;
 
 	dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
@@ -1958,8 +1992,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_inode != NULL)
 		NFS_PROTO(new_inode)->return_delegation(new_inode);
 
-	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
-					   new_dir, &new_dentry->d_name);
+	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+	if (IS_ERR(task)) {
+		error = PTR_ERR(task);
+		goto out;
+	}
+
+	error = rpc_wait_for_completion_task(task);
+	if (error == 0)
+		error = task->tk_status;
+	rpc_put_task(task);
 	nfs_mark_for_revalidate(old_inode);
out:
 	if (rehash)
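
nfs_rename() now drives the operation through the same asynchronous machinery used by sillyrename and simply waits for the RPC task, so both paths share setup and completion handling. The wait pattern, extracted from the hunk above for clarity:

	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
	if (IS_ERR(task))
		return PTR_ERR(task);

	error = rpc_wait_for_completion_task(task);	/* killable wait */
	if (error == 0)
		error = task->tk_status;	/* the RPC's own result */
	rpc_put_task(task);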
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5bb790a69c71..284ca901fe16 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -617,6 +617,7 @@ out:
 
 static const struct vm_operations_struct nfs_file_vm_ops = {
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = nfs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 360114ae8b82..0c438973f3c8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(nfs_clear_inode);
 
 void nfs_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	nfs_clear_inode(inode);
 }
@@ -588,6 +588,25 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 }
 EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
 
+static void nfs_request_parent_use_readdirplus(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	parent = dget_parent(dentry);
+	nfs_force_use_readdirplus(parent->d_inode);
+	dput(parent);
+}
+
+static bool nfs_need_revalidate_inode(struct inode *inode)
+{
+	if (NFS_I(inode)->cache_validity &
+			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+		return true;
+	if (nfs_attribute_cache_expired(inode))
+		return true;
+	return false;
+}
+
 int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
@@ -616,10 +635,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	    ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 		need_atime = 0;
 
-	if (need_atime)
-		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	else
-		err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	if (need_atime || nfs_need_revalidate_inode(inode)) {
+		struct nfs_server *server = NFS_SERVER(inode);
+
+		if (server->caps & NFS_CAP_READDIRPLUS)
+			nfs_request_parent_use_readdirplus(dentry);
+		err = __nfs_revalidate_inode(server, inode);
+	}
 	if (!err) {
 		generic_fillattr(inode, stat);
 		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
@@ -961,9 +983,7 @@ int nfs_attribute_cache_expired(struct inode *inode)
  */
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
-	if (!(NFS_I(inode)->cache_validity &
-			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
-			&& !nfs_attribute_cache_expired(inode))
+	if (!nfs_need_revalidate_inode(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
 	return __nfs_revalidate_inode(server, inode);
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b46cf5a67329..dd8bfc2e2464 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -301,6 +301,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
 			   const char *ip_addr);
 
 /* dir.c */
+extern void nfs_force_use_readdirplus(struct inode *dir);
 extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
 					    struct shrink_control *sc);
 extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
@@ -474,6 +475,13 @@ extern int nfs_migrate_page(struct address_space *,
 #define nfs_migrate_page NULL
 #endif
 
+/* unlink.c */
+extern struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+		 struct dentry *old_dentry, struct dentry *new_dentry,
+		 void (*complete)(struct rpc_task *, struct nfs_renamedata *));
+extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
+
 /* direct.c */
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 			      struct nfs_direct_req *dreq);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a462ef0fb5d6..db60149c4579 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -479,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 }
 
 static int
-nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
-		 struct inode *new_dir, struct qstr *new_name)
-{
-	struct nfs_renameargs arg = {
-		.old_dir = NFS_FH(old_dir),
-		.old_name = old_name,
-		.new_dir = NFS_FH(new_dir),
-		.new_name = new_name,
-	};
-	struct nfs_renameres res;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
-	};
-	int status = -ENOMEM;
-
-	dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
-
-	res.old_fattr = nfs_alloc_fattr();
-	res.new_fattr = nfs_alloc_fattr();
-	if (res.old_fattr == NULL || res.new_fattr == NULL)
-		goto out;
-
-	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
-	nfs_post_op_update_inode(old_dir, res.old_fattr);
-	nfs_post_op_update_inode(new_dir, res.new_fattr);
-out:
-	nfs_free_fattr(res.old_fattr);
-	nfs_free_fattr(res.new_fattr);
-	dprintk("NFS reply rename: %d\n", status);
-	return status;
-}
-
-static int
 nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
 {
 	struct nfs3_linkargs	arg = {
@@ -968,7 +933,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.unlink_setup	= nfs3_proc_unlink_setup,
 	.unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
 	.unlink_done	= nfs3_proc_unlink_done,
-	.rename		= nfs3_proc_rename,
 	.rename_setup	= nfs3_proc_rename_setup,
 	.rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
 	.rename_done	= nfs3_proc_rename_done,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a5b27c2d9689..e1d1badbe53c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -427,6 +427,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
427extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 427extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
428extern void nfs_inode_find_state_and_recover(struct inode *inode, 428extern void nfs_inode_find_state_and_recover(struct inode *inode,
429 const nfs4_stateid *stateid); 429 const nfs4_stateid *stateid);
430extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *);
430extern void nfs4_schedule_lease_recovery(struct nfs_client *); 431extern void nfs4_schedule_lease_recovery(struct nfs_client *);
431extern int nfs4_wait_clnt_recover(struct nfs_client *clp); 432extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
432extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); 433extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
@@ -500,6 +501,16 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei
500 return memcmp(dst, src, sizeof(*dst)) == 0; 501 return memcmp(dst, src, sizeof(*dst)) == 0;
501} 502}
502 503
504static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
505{
506 return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0;
507}
508
509static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2)
510{
511 return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
512}
513
503static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) 514static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
504{ 515{
505 return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; 516 return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
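The two helpers added above are the crux of several later hunks: nfs4_stateid_match_other() compares only the identifying "other" field of a stateid, while nfs4_stateid_is_newer() compares the 32-bit seqid using wraparound-safe serial arithmetic after byte-swapping with be32_to_cpu(). The idiom subtracts in unsigned space and reinterprets the difference as signed, so a seqid that has wrapped past 2^32 still compares as newer. A minimal user-space sketch of the comparison (standalone illustration, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "is s1 newer than s2?" for 32-bit sequence IDs.
 * Unsigned subtraction is well defined modulo 2^32; casting the
 * difference to signed makes anything up to half the space ahead
 * count as newer, even across the 0xffffffff -> 0 wrap. */
static int seqid_is_newer(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
	printf("%d\n", seqid_is_newer(2, 1));          /* 1: plainly newer */
	printf("%d\n", seqid_is_newer(0, 0xffffffff)); /* 1: newer across the wrap */
	printf("%d\n", seqid_is_newer(1, 2));          /* 0: older */
	return 0;
}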
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0e46d3d1b6cc..aa9ef4876046 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -531,6 +531,13 @@ int nfs40_walk_client_list(struct nfs_client *new,
531 *result = pos; 531 *result = pos;
532 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", 532 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
533 __func__, pos, atomic_read(&pos->cl_count)); 533 __func__, pos, atomic_read(&pos->cl_count));
534 goto out;
535 case -ERESTARTSYS:
536 case -ETIMEDOUT:
537 /* The callback path may have been inadvertently
538 * changed. Schedule recovery!
539 */
540 nfs4_schedule_path_down_recovery(pos);
534 default: 541 default:
535 goto out; 542 goto out;
536 } 543 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 450bfedbe2f4..397be39c6dc8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1068,6 +1068,7 @@ static void nfs4_opendata_free(struct kref *kref)
1068 dput(p->dentry); 1068 dput(p->dentry);
1069 nfs_sb_deactive(sb); 1069 nfs_sb_deactive(sb);
1070 nfs_fattr_free_names(&p->f_attr); 1070 nfs_fattr_free_names(&p->f_attr);
1071 kfree(p->f_attr.mdsthreshold);
1071 kfree(p); 1072 kfree(p);
1072} 1073}
1073 1074
@@ -1137,12 +1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
1137 nfs4_state_set_mode_locked(state, state->state | fmode); 1138 nfs4_state_set_mode_locked(state, state->state | fmode);
1138} 1139}
1139 1140
1140static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1141static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
1142{
1143 struct nfs_client *clp = state->owner->so_server->nfs_client;
1144 bool need_recover = false;
1145
1146 if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly)
1147 need_recover = true;
1148 if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly)
1149 need_recover = true;
1150 if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr)
1151 need_recover = true;
1152 if (need_recover)
1153 nfs4_state_mark_reclaim_nograce(clp, state);
1154}
1155
1156static bool nfs_need_update_open_stateid(struct nfs4_state *state,
1157 nfs4_stateid *stateid)
1158{
1159 if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
1160 return true;
1161 if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
1162 nfs_test_and_clear_all_open_stateid(state);
1163 return true;
1164 }
1165 if (nfs4_stateid_is_newer(stateid, &state->open_stateid))
1166 return true;
1167 return false;
1168}
1169
1170static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1171 nfs4_stateid *stateid, fmode_t fmode)
1141{ 1172{
1173 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1174 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
1175 case FMODE_WRITE:
1176 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1177 break;
1178 case FMODE_READ:
1179 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1180 break;
1181 case 0:
1182 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1183 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1184 clear_bit(NFS_OPEN_STATE, &state->flags);
1185 }
1186 if (stateid == NULL)
1187 return;
1188 if (!nfs_need_update_open_stateid(state, stateid))
1189 return;
1142 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 1190 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1143 nfs4_stateid_copy(&state->stateid, stateid); 1191 nfs4_stateid_copy(&state->stateid, stateid);
1144 nfs4_stateid_copy(&state->open_stateid, stateid); 1192 nfs4_stateid_copy(&state->open_stateid, stateid);
1145	set_bit(NFS_OPEN_STATE, &state->flags);
1193}
1194
1195static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1196{
1197 write_seqlock(&state->seqlock);
1198 nfs_clear_open_stateid_locked(state, stateid, fmode);
1199 write_sequnlock(&state->seqlock);
1200 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
1201 nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
1202}
1203
1204static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1205{
1146 switch (fmode) { 1206 switch (fmode) {
1147 case FMODE_READ: 1207 case FMODE_READ:
1148 set_bit(NFS_O_RDONLY_STATE, &state->flags); 1208 set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -1153,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
1153 case FMODE_READ|FMODE_WRITE: 1213 case FMODE_READ|FMODE_WRITE:
1154 set_bit(NFS_O_RDWR_STATE, &state->flags); 1214 set_bit(NFS_O_RDWR_STATE, &state->flags);
1155 } 1215 }
1156}
1157
1158static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1159{
1160	write_seqlock(&state->seqlock);
1161	nfs_set_open_stateid_locked(state, stateid, fmode);
1162	write_sequnlock(&state->seqlock);
1216	if (!nfs_need_update_open_stateid(state, stateid))
1217		return;
1218	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1219		nfs4_stateid_copy(&state->stateid, stateid);
1220	nfs4_stateid_copy(&state->open_stateid, stateid);
1163} 1221}
1164 1222
1165static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) 1223static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
@@ -1217,6 +1275,8 @@ no_delegation:
1217 __update_open_stateid(state, open_stateid, NULL, fmode); 1275 __update_open_stateid(state, open_stateid, NULL, fmode);
1218 ret = 1; 1276 ret = 1;
1219 } 1277 }
1278 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
1279 nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
1220 1280
1221 return ret; 1281 return ret;
1222} 1282}
@@ -1450,12 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1450 struct nfs4_state *newstate; 1510 struct nfs4_state *newstate;
1451 int ret; 1511 int ret;
1452 1512
1513 /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
1514 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1515 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1516 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1453 /* memory barrier prior to reading state->n_* */ 1517 /* memory barrier prior to reading state->n_* */
1454 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1518 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1455 clear_bit(NFS_OPEN_STATE, &state->flags); 1519 clear_bit(NFS_OPEN_STATE, &state->flags);
1456 smp_rmb(); 1520 smp_rmb();
1457 if (state->n_rdwr != 0) { 1521 if (state->n_rdwr != 0) {
1458 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1459 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); 1522 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
1460 if (ret != 0) 1523 if (ret != 0)
1461 return ret; 1524 return ret;
@@ -1463,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1463 return -ESTALE; 1526 return -ESTALE;
1464 } 1527 }
1465 if (state->n_wronly != 0) { 1528 if (state->n_wronly != 0) {
1466 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1467 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); 1529 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
1468 if (ret != 0) 1530 if (ret != 0)
1469 return ret; 1531 return ret;
@@ -1471,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1471 return -ESTALE; 1533 return -ESTALE;
1472 } 1534 }
1473 if (state->n_rdonly != 0) { 1535 if (state->n_rdonly != 0) {
1474 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1475 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); 1536 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
1476 if (ret != 0) 1537 if (ret != 0)
1477 return ret; 1538 return ret;
@@ -2244,10 +2305,12 @@ static int _nfs4_do_open(struct inode *dir,
2244 } 2305 }
2245 } 2306 }
2246 2307
2247	if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
2248		opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
2249		if (!opendata->f_attr.mdsthreshold)
2250			goto err_free_label;
2308	if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
2309		if (!opendata->f_attr.mdsthreshold) {
2310			opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
2311			if (!opendata->f_attr.mdsthreshold)
2312				goto err_free_label;
2313		}
2251 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; 2314 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
2252 } 2315 }
2253 if (dentry->d_inode != NULL) 2316 if (dentry->d_inode != NULL)
@@ -2275,11 +2338,10 @@ static int _nfs4_do_open(struct inode *dir,
2275 if (opendata->file_created) 2338 if (opendata->file_created)
2276 *opened |= FILE_CREATED; 2339 *opened |= FILE_CREATED;
2277 2340
2278	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
2279		*ctx_th = opendata->f_attr.mdsthreshold;
2280	else
2281		kfree(opendata->f_attr.mdsthreshold);
2282	opendata->f_attr.mdsthreshold = NULL;
2341	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
2342		*ctx_th = opendata->f_attr.mdsthreshold;
2343		opendata->f_attr.mdsthreshold = NULL;
2344	}
2283 2345
2284 nfs4_label_free(olabel); 2346 nfs4_label_free(olabel);
2285 2347
@@ -2289,7 +2351,6 @@ static int _nfs4_do_open(struct inode *dir,
2289err_free_label: 2351err_free_label:
2290 nfs4_label_free(olabel); 2352 nfs4_label_free(olabel);
2291err_opendata_put: 2353err_opendata_put:
2292 kfree(opendata->f_attr.mdsthreshold);
2293 nfs4_opendata_put(opendata); 2354 nfs4_opendata_put(opendata);
2294err_put_state_owner: 2355err_put_state_owner:
2295 nfs4_put_state_owner(sp); 2356 nfs4_put_state_owner(sp);
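The three mdsthreshold hunks above rework ownership of the buffer: it is now allocated at most once per nfs4_opendata (so a retried open reuses it), handed to the caller by NULLing opendata->f_attr.mdsthreshold when pnfs_use_threshold() accepts it, and otherwise freed in exactly one place, nfs4_opendata_free(). A hedged user-space sketch of that alloc-once/transfer/centralized-free pattern (the type and function names are invented for illustration):

#include <stdlib.h>

struct blob { int payload; };
struct opendata { struct blob *threshold; };

/* Allocate lazily: a retried open finds the earlier allocation intact. */
static int opendata_prepare(struct opendata *od)
{
	if (!od->threshold) {
		od->threshold = calloc(1, sizeof(*od->threshold));
		if (!od->threshold)
			return -1;
	}
	return 0;
}

/* Transfer ownership: NULL the source pointer so the destructor
 * below cannot free what the caller now owns. */
static struct blob *opendata_take_threshold(struct opendata *od)
{
	struct blob *b = od->threshold;

	od->threshold = NULL;
	return b;
}

/* The one centralized free, mirroring nfs4_opendata_free(). */
static void opendata_free(struct opendata *od)
{
	free(od->threshold);
	free(od);
}

int main(void)
{
	struct opendata *od = calloc(1, sizeof(*od));
	struct blob *mine = NULL;

	if (od && opendata_prepare(od) == 0)
		mine = opendata_take_threshold(od);
	free(mine);
	if (od)
		opendata_free(od);
	return 0;
}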
@@ -2479,26 +2540,6 @@ static void nfs4_free_closedata(void *data)
2479 kfree(calldata); 2540 kfree(calldata);
2480} 2541}
2481 2542
2482static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
2483 fmode_t fmode)
2484{
2485 spin_lock(&state->owner->so_lock);
2486 clear_bit(NFS_O_RDWR_STATE, &state->flags);
2487 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
2488 case FMODE_WRITE:
2489 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
2490 break;
2491 case FMODE_READ:
2492 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
2493 break;
2494 case 0:
2495 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
2496 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
2497 clear_bit(NFS_OPEN_STATE, &state->flags);
2498 }
2499 spin_unlock(&state->owner->so_lock);
2500}
2501
2502static void nfs4_close_done(struct rpc_task *task, void *data) 2543static void nfs4_close_done(struct rpc_task *task, void *data)
2503{ 2544{
2504 struct nfs4_closedata *calldata = data; 2545 struct nfs4_closedata *calldata = data;
@@ -2517,9 +2558,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2517 if (calldata->roc) 2558 if (calldata->roc)
2518 pnfs_roc_set_barrier(state->inode, 2559 pnfs_roc_set_barrier(state->inode,
2519 calldata->roc_barrier); 2560 calldata->roc_barrier);
2520			nfs_set_open_stateid(state, &calldata->res.stateid, 0);
2561			nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
2521			renew_lease(server, calldata->timestamp); 2562			renew_lease(server, calldata->timestamp);
2522			break;
2563			goto out_release;
2523 case -NFS4ERR_ADMIN_REVOKED: 2564 case -NFS4ERR_ADMIN_REVOKED:
2524 case -NFS4ERR_STALE_STATEID: 2565 case -NFS4ERR_STALE_STATEID:
2525 case -NFS4ERR_OLD_STATEID: 2566 case -NFS4ERR_OLD_STATEID:
@@ -2533,7 +2574,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2533 goto out_release; 2574 goto out_release;
2534 } 2575 }
2535 } 2576 }
2536	nfs4_close_clear_stateid_flags(state, calldata->arg.fmode);
2577	nfs_clear_open_stateid(state, NULL, calldata->arg.fmode);
2537out_release: 2578out_release:
2538 nfs_release_seqid(calldata->arg.seqid); 2579 nfs_release_seqid(calldata->arg.seqid);
2539 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 2580 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -3507,49 +3548,6 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
3507 return 1; 3548 return 1;
3508} 3549}
3509 3550
3510static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
3511 struct inode *new_dir, struct qstr *new_name)
3512{
3513 struct nfs_server *server = NFS_SERVER(old_dir);
3514 struct nfs_renameargs arg = {
3515 .old_dir = NFS_FH(old_dir),
3516 .new_dir = NFS_FH(new_dir),
3517 .old_name = old_name,
3518 .new_name = new_name,
3519 };
3520 struct nfs_renameres res = {
3521 .server = server,
3522 };
3523 struct rpc_message msg = {
3524 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
3525 .rpc_argp = &arg,
3526 .rpc_resp = &res,
3527 };
3528 int status = -ENOMEM;
3529
3530 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3531 if (!status) {
3532 update_changeattr(old_dir, &res.old_cinfo);
3533 update_changeattr(new_dir, &res.new_cinfo);
3534 }
3535 return status;
3536}
3537
3538static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
3539 struct inode *new_dir, struct qstr *new_name)
3540{
3541 struct nfs4_exception exception = { };
3542 int err;
3543 do {
3544 err = _nfs4_proc_rename(old_dir, old_name,
3545 new_dir, new_name);
3546 trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err);
3547 err = nfs4_handle_exception(NFS_SERVER(old_dir), err,
3548 &exception);
3549 } while (exception.retry);
3550 return err;
3551}
3552
3553static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 3551static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
3554{ 3552{
3555 struct nfs_server *server = NFS_SERVER(inode); 3553 struct nfs_server *server = NFS_SERVER(inode);
@@ -4884,6 +4882,20 @@ nfs4_init_uniform_client_string(const struct nfs_client *clp,
4884 nodename); 4882 nodename);
4885} 4883}
4886 4884
4885/*
4886 * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback
4887 * services. Advertise one based on the address family of the
4888 * clientaddr.
4889 */
4890static unsigned int
4891nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
4892{
4893 if (strchr(clp->cl_ipaddr, ':') != NULL)
4894 return scnprintf(buf, len, "tcp6");
4895 else
4896 return scnprintf(buf, len, "tcp");
4897}
4898
4887/** 4899/**
4888 * nfs4_proc_setclientid - Negotiate client ID 4900 * nfs4_proc_setclientid - Negotiate client ID
4889 * @clp: state data structure 4901 * @clp: state data structure
@@ -4925,12 +4937,10 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4925 setclientid.sc_name, 4937 setclientid.sc_name,
4926 sizeof(setclientid.sc_name)); 4938 sizeof(setclientid.sc_name));
4927 /* cb_client4 */ 4939 /* cb_client4 */
4928	rcu_read_lock();
4929	setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
4930				sizeof(setclientid.sc_netid), "%s",
4931				rpc_peeraddr2str(clp->cl_rpcclient,
4932							RPC_DISPLAY_NETID));
4933	rcu_read_unlock();
4940	setclientid.sc_netid_len =
4941				nfs4_init_callback_netid(clp,
4942						setclientid.sc_netid,
4943						sizeof(setclientid.sc_netid));
4934 setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, 4944 setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
4935 sizeof(setclientid.sc_uaddr), "%s.%u.%u", 4945 sizeof(setclientid.sc_uaddr), "%s.%u.%u",
4936 clp->cl_ipaddr, port >> 8, port & 255); 4946 clp->cl_ipaddr, port >> 8, port & 255);
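nfs4_init_callback_netid(), added above and wired into nfs4_proc_setclientid() here, stops deriving the callback netid from the forward channel's transport and instead picks "tcp6" or "tcp" from the client's own address string: only a literal IPv6 address can contain a colon. A user-space approximation of the test:

#include <stdio.h>
#include <string.h>

/* "tcp6" for a literal IPv6 client address, "tcp" otherwise - the same
 * strchr(clp->cl_ipaddr, ':') test the hunk above introduces. */
static int init_callback_netid(const char *ipaddr, char *buf, size_t len)
{
	return snprintf(buf, len, "%s", strchr(ipaddr, ':') ? "tcp6" : "tcp");
}

int main(void)
{
	char netid[8];

	init_callback_netid("192.0.2.1", netid, sizeof(netid));
	printf("%s\n", netid);  /* tcp */
	init_callback_netid("2001:db8::1", netid, sizeof(netid));
	printf("%s\n", netid);  /* tcp6 */
	return 0;
}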
@@ -8408,7 +8418,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
8408 .unlink_setup = nfs4_proc_unlink_setup, 8418 .unlink_setup = nfs4_proc_unlink_setup,
8409 .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare, 8419 .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
8410 .unlink_done = nfs4_proc_unlink_done, 8420 .unlink_done = nfs4_proc_unlink_done,
8411 .rename = nfs4_proc_rename,
8412 .rename_setup = nfs4_proc_rename_setup, 8421 .rename_setup = nfs4_proc_rename_setup,
8413 .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare, 8422 .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
8414 .rename_done = nfs4_proc_rename_done, 8423 .rename_done = nfs4_proc_rename_done,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0deb32105ccf..2349518eef2c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1316,7 +1316,7 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st
1316 return 1; 1316 return 1;
1317} 1317}
1318 1318
1319static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1319int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1320{ 1320{
1321 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); 1321 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
1322 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1322 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -2075,8 +2075,10 @@ again:
2075 switch (status) { 2075 switch (status) {
2076 case 0: 2076 case 0:
2077 break; 2077 break;
2078 case -NFS4ERR_DELAY:
2079 case -ETIMEDOUT: 2078 case -ETIMEDOUT:
2079 if (clnt->cl_softrtry)
2080 break;
2081 case -NFS4ERR_DELAY:
2080 case -EAGAIN: 2082 case -EAGAIN:
2081 ssleep(1); 2083 ssleep(1);
2082 case -NFS4ERR_STALE_CLIENTID: 2084 case -NFS4ERR_STALE_CLIENTID:
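The case reordering above is about soft-mounted clients: with -NFS4ERR_DELAY moved below -ETIMEDOUT, a timed-out RPC on a soft mount (clnt->cl_softrtry) breaks out and surfaces the error, while a hard mount falls through into the same ssleep(1)-and-retry path shared with -NFS4ERR_DELAY and -EAGAIN. A compact sketch of that deliberate fall-through shape (the error constants are invented for illustration):

#include <unistd.h>

enum { ERR_TIMEDOUT = 1, ERR_DELAY, ERR_AGAIN };

/* Returns 1 to retry after backing off, 0 to stop. Case order
 * matters: a soft-mount timeout breaks out early, a hard-mount
 * timeout falls through into the shared sleep-and-retry path. */
static int handle_status(int status, int soft)
{
	switch (status) {
	case 0:
		return 0;
	case ERR_TIMEDOUT:
		if (soft)
			break;          /* soft mount: report the timeout */
		/* fall through */
	case ERR_DELAY:
	case ERR_AGAIN:
		sleep(1);
		return 1;               /* hard mount: back off and retry */
	}
	return 0;
}

int main(void)
{
	return handle_status(ERR_TIMEDOUT, 1);  /* soft: gives up, returns 0 */
}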
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 808f29574412..6f340f02f2ba 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -90,7 +90,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
90 */ 90 */
91static void nfs4_evict_inode(struct inode *inode) 91static void nfs4_evict_inode(struct inode *inode)
92{ 92{
93	truncate_inode_pages(&inode->i_data, 0);
93	truncate_inode_pages_final(&inode->i_data);
94 clear_inode(inode); 94 clear_inode(inode);
95 pnfs_return_layout(inode); 95 pnfs_return_layout(inode);
96 pnfs_destroy_layout(NFS_I(inode)); 96 pnfs_destroy_layout(NFS_I(inode));
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 72f3bf1754ef..73ce8d4fe2c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -203,8 +203,7 @@ static int nfs4_stat_to_errno(int);
203 2 + encode_verifier_maxsz + 5 + \ 203 2 + encode_verifier_maxsz + 5 + \
204 nfs4_label_maxsz) 204 nfs4_label_maxsz)
205#define decode_readdir_maxsz (op_decode_hdr_maxsz + \ 205#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
206				decode_verifier_maxsz + \
207				nfs4_label_maxsz + nfs4_fattr_maxsz)
206				decode_verifier_maxsz)
208#define encode_readlink_maxsz (op_encode_hdr_maxsz) 207#define encode_readlink_maxsz (op_encode_hdr_maxsz)
209#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) 208#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
210#define encode_write_maxsz (op_encode_hdr_maxsz + \ 209#define encode_write_maxsz (op_encode_hdr_maxsz + \
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4755858e37a0..cb53d450ae32 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -662,7 +662,18 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
662 */ 662 */
663static bool pnfs_seqid_is_newer(u32 s1, u32 s2) 663static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
664{ 664{
665	return (s32)s1 - (s32)s2 > 0;
665	return (s32)(s1 - s2) > 0;
666}
667
668static void
669pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
670 const nfs4_stateid *new,
671 struct list_head *free_me_list)
672{
673 if (nfs4_stateid_match_other(&lo->plh_stateid, new))
674 return;
675 /* Layout is new! Kill existing layout segments */
676 pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
677}
667 678
668/* update lo->plh_stateid with new if is more recent */ 679/* update lo->plh_stateid with new if is more recent */
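The one-line change to pnfs_seqid_is_newer() moves the signed cast: (s32)s1 - (s32)s2 subtracts two signed values and overflows (undefined behavior in C) when the seqids straddle the wrap point, whereas (s32)(s1 - s2) subtracts in unsigned arithmetic, which is defined modulo 2^32, and only then reinterprets the result. A standalone demonstration of the hazard:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t s1 = 0x80000000u; /* just past the signed midpoint */
	uint32_t s2 = 0x7fffffffu;

	/* Fixed form: unsigned subtraction first, then reinterpret. */
	printf("%d\n", (int32_t)(s1 - s2) > 0);   /* 1, well defined */

	/* The buggy form, (int32_t)s1 - (int32_t)s2, would compute
	 * INT32_MIN - INT32_MAX here: a signed overflow, i.e. undefined
	 * behavior, which -fsanitize=signed-integer-overflow flags.
	 * Deliberately not executed for that reason. */
	return 0;
}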
@@ -1315,6 +1326,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1315 struct nfs4_layoutget_res *res = &lgp->res; 1326 struct nfs4_layoutget_res *res = &lgp->res;
1316 struct pnfs_layout_segment *lseg; 1327 struct pnfs_layout_segment *lseg;
1317 struct inode *ino = lo->plh_inode; 1328 struct inode *ino = lo->plh_inode;
1329 LIST_HEAD(free_me);
1318 int status = 0; 1330 int status = 0;
1319 1331
1320 /* Inject layout blob into I/O device driver */ 1332 /* Inject layout blob into I/O device driver */
@@ -1341,6 +1353,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1341 goto out_forget_reply; 1353 goto out_forget_reply;
1342 } 1354 }
1343 1355
1356 /* Check that the new stateid matches the old stateid */
1357 pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
1344 /* Done processing layoutget. Set the layout stateid */ 1358 /* Done processing layoutget. Set the layout stateid */
1345 pnfs_set_layout_stateid(lo, &res->stateid, false); 1359 pnfs_set_layout_stateid(lo, &res->stateid, false);
1346 1360
@@ -1355,6 +1369,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1355 } 1369 }
1356 1370
1357 spin_unlock(&ino->i_lock); 1371 spin_unlock(&ino->i_lock);
1372 pnfs_free_lseg_list(&free_me);
1358 return lseg; 1373 return lseg;
1359out: 1374out:
1360 return ERR_PTR(status); 1375 return ERR_PTR(status);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fddbba2d9eff..e55ce9e8b034 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -357,30 +357,6 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
357} 357}
358 358
359static int 359static int
360nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
361 struct inode *new_dir, struct qstr *new_name)
362{
363 struct nfs_renameargs arg = {
364 .old_dir = NFS_FH(old_dir),
365 .old_name = old_name,
366 .new_dir = NFS_FH(new_dir),
367 .new_name = new_name,
368 };
369 struct rpc_message msg = {
370 .rpc_proc = &nfs_procedures[NFSPROC_RENAME],
371 .rpc_argp = &arg,
372 };
373 int status;
374
375 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
376 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
377 nfs_mark_for_revalidate(old_dir);
378 nfs_mark_for_revalidate(new_dir);
379 dprintk("NFS reply rename: %d\n", status);
380 return status;
381}
382
383static int
384nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 360nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
385{ 361{
386 struct nfs_linkargs arg = { 362 struct nfs_linkargs arg = {
@@ -745,7 +721,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
745 .unlink_setup = nfs_proc_unlink_setup, 721 .unlink_setup = nfs_proc_unlink_setup,
746 .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare, 722 .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
747 .unlink_done = nfs_proc_unlink_done, 723 .unlink_done = nfs_proc_unlink_done,
748 .rename = nfs_proc_rename,
749 .rename_setup = nfs_proc_rename_setup, 724 .rename_setup = nfs_proc_rename_setup,
750 .rename_rpc_prepare = nfs_proc_rename_rpc_prepare, 725 .rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
751 .rename_done = nfs_proc_rename_done, 726 .rename_done = nfs_proc_rename_done,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 910ed906eb82..2cb56943e232 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2215,6 +2215,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; 2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version; 2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version;
2217 2217
2218 sync_filesystem(sb);
2219
2218 /* 2220 /*
2219 * Userspace mount programs that send binary options generally send 2221 * Userspace mount programs that send binary options generally send
2220 * them populated with default values. We have no way to know which 2222 * them populated with default values. We have no way to know which
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 11d78944de79..de54129336c6 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -14,6 +14,7 @@
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/wait.h> 15#include <linux/wait.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/fsnotify.h>
17 18
18#include "internal.h" 19#include "internal.h"
19#include "nfs4_fs.h" 20#include "nfs4_fs.h"
@@ -353,8 +354,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
353 return; 354 return;
354 } 355 }
355 356
356	if (task->tk_status != 0)
357		nfs_cancel_async_unlink(old_dentry);
357	if (data->complete)
358		data->complete(task, data);
358} 359}
359 360
360/** 361/**
@@ -399,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = {
399 * 400 *
400 * It's expected that valid references to the dentries and inodes are held 401 * It's expected that valid references to the dentries and inodes are held
401 */ 402 */
402static struct rpc_task *
403struct rpc_task *
403nfs_async_rename(struct inode *old_dir, struct inode *new_dir, 404nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
404		 struct dentry *old_dentry, struct dentry *new_dentry)
405		 struct dentry *old_dentry, struct dentry *new_dentry,
406		 void (*complete)(struct rpc_task *, struct nfs_renamedata *))
405{ 407{
406 struct nfs_renamedata *data; 408 struct nfs_renamedata *data;
407 struct rpc_message msg = { }; 409 struct rpc_message msg = { };
@@ -438,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
438 data->new_dentry = dget(new_dentry); 440 data->new_dentry = dget(new_dentry);
439 nfs_fattr_init(&data->old_fattr); 441 nfs_fattr_init(&data->old_fattr);
440 nfs_fattr_init(&data->new_fattr); 442 nfs_fattr_init(&data->new_fattr);
443 data->complete = complete;
441 444
442 /* set up nfs_renameargs */ 445 /* set up nfs_renameargs */
443 data->args.old_dir = NFS_FH(old_dir); 446 data->args.old_dir = NFS_FH(old_dir);
@@ -456,6 +459,27 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
456 return rpc_run_task(&task_setup_data); 459 return rpc_run_task(&task_setup_data);
457} 460}
458 461
462/*
463 * Perform tasks needed when a sillyrename is done such as cancelling the
464 * queued async unlink if it failed.
465 */
466static void
467nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data)
468{
469 struct dentry *dentry = data->old_dentry;
470
471 if (task->tk_status != 0) {
472 nfs_cancel_async_unlink(dentry);
473 return;
474 }
475
476 /*
477 * vfs_unlink and the like do not issue this when a file is
478 * sillyrenamed, so do it here.
479 */
480 fsnotify_nameremove(dentry, 0);
481}
482
459#define SILLYNAME_PREFIX ".nfs" 483#define SILLYNAME_PREFIX ".nfs"
460#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) 484#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
461#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) 485#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1)
@@ -548,7 +572,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
548 } 572 }
549 573
550 /* run the rename task, undo unlink if it fails */ 574 /* run the rename task, undo unlink if it fails */
551	task = nfs_async_rename(dir, dir, dentry, sdentry);
575	task = nfs_async_rename(dir, dir, dentry, sdentry,
576				nfs_complete_sillyrename);
552 if (IS_ERR(task)) { 577 if (IS_ERR(task)) {
553 error = -EBUSY; 578 error = -EBUSY;
554 nfs_cancel_async_unlink(dentry); 579 nfs_cancel_async_unlink(dentry);
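The unlink.c hunks above turn nfs_async_rename() into a policy-free mechanism: instead of hard-coding sillyrename cleanup in nfs_async_rename_done(), the caller supplies a complete callback, and the sillyrename-specific work (cancel the queued unlink on failure, emit the fsnotify delete event on success) moves into nfs_complete_sillyrename(). A simplified user-space sketch of that completion-callback shape (types reduced to the bare minimum):

#include <stdio.h>

struct renamedata;
typedef void (*complete_fn)(int status, struct renamedata *);

struct renamedata {
	const char *old_name;
	complete_fn complete;   /* policy supplied by the caller */
};

/* Generic completion: invoke whatever policy was registered. */
static void async_rename_done(int status, struct renamedata *data)
{
	if (data->complete)
		data->complete(status, data);
}

/* Sillyrename-specific policy, analogous to nfs_complete_sillyrename(). */
static void complete_sillyrename(int status, struct renamedata *data)
{
	if (status != 0) {
		printf("cancel queued unlink of %s\n", data->old_name);
		return;
	}
	printf("notify: %s removed\n", data->old_name);
}

int main(void)
{
	struct renamedata d = { ".nfs0000beef", complete_sillyrename };

	async_rename_done(0, &d);
	return 0;
}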
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index a812fd1b92a4..b481e1f5eecc 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -39,9 +39,13 @@ struct nfs4_acl;
39struct svc_fh; 39struct svc_fh;
40struct svc_rqst; 40struct svc_rqst;
41 41
42/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
43 * fit in a page: */
44#define NFS4_ACL_MAX 170
42/*
43 * Maximum ACL we'll accept from a client; chosen (somewhat
44 * arbitrarily) so that kmalloc'ing the ACL shouldn't require a
45 * high-order allocation. This allows 204 ACEs on x86_64:
46 */
47#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
48			/ sizeof(struct nfs4_ace))
45 49
46struct nfs4_acl *nfs4_acl_new(int); 50struct nfs4_acl *nfs4_acl_new(int);
47int nfs4_acl_get_whotype(char *, u32); 51int nfs4_acl_get_whotype(char *, u32);
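The new NFS4_ACL_MAX derives the cap from the page size rather than a hard-coded 170, so a worst-case ACL plus its header still fits a single kmalloc'd page and never needs a high-order allocation. With a 4096-byte page and struct sizes consistent with the "204 ACEs on x86_64" comment (the 4- and 20-byte figures below are illustrative assumptions, not taken from the headers), the arithmetic looks like this:

#include <stdio.h>

int main(void)
{
	/* Assumed x86_64 sizes consistent with the "204 ACEs" comment:
	 * a 4-byte nfs4_acl header (an ACE count plus a flexible array)
	 * and 20-byte nfs4_ace entries. */
	unsigned long page_size = 4096;
	unsigned long acl_hdr = 4;
	unsigned long ace_size = 20;

	printf("NFS4_ACL_MAX = %lu\n", (page_size - acl_hdr) / ace_size);
	/* prints 204: the whole ACL fits one page */
	return 0;
}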
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 06cddd572264..2645be435e75 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -71,10 +71,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
71 if (gid_eq(new->fsgid, INVALID_GID)) 71 if (gid_eq(new->fsgid, INVALID_GID))
72 new->fsgid = exp->ex_anon_gid; 72 new->fsgid = exp->ex_anon_gid;
73 73
74	ret = set_groups(new, gi);
74	set_groups(new, gi);
75 put_group_info(gi); 75 put_group_info(gi);
76 if (ret < 0)
77 goto error;
78 76
79 if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) 77 if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))
80 new->cap_effective = cap_drop_nfsd_set(new->cap_effective); 78 new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
@@ -89,7 +87,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
89 87
90oom: 88oom:
91 ret = -ENOMEM; 89 ret = -ENOMEM;
92error:
93 abort_creds(new); 90 abort_creds(new);
94 return ret; 91 return ret;
95} 92}
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index d190e33d0ec2..f66c66b9f182 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -402,8 +402,10 @@ sort_pacl(struct posix_acl *pacl)
402 * by uid/gid. */ 402 * by uid/gid. */
403 int i, j; 403 int i, j;
404 404
405	if (pacl->a_count <= 4)
406		return; /* no users or groups */
405	/* no users or groups */
406	if (!pacl || pacl->a_count <= 4)
407		return;
408
407 i = 1; 409 i = 1;
408 while (pacl->a_entries[i].e_tag == ACL_USER) 410 while (pacl->a_entries[i].e_tag == ACL_USER)
409 i++; 411 i++;
@@ -530,19 +532,21 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
530 532
531 /* 533 /*
532	 * ACLs with no ACEs are treated differently in the inheritable 534	 * ACLs with no ACEs are treated differently in the inheritable
533	 * and effective cases: when there are no inheritable ACEs, we
534	 * set a zero-length default posix acl:
535	 * and effective cases: when there are no inheritable ACEs, we
536	 * call ->set_acl with a NULL ACL structure.
535	 */ 537	 */
536	if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) {
537		pacl = posix_acl_alloc(0, GFP_KERNEL);
538		return pacl ? pacl : ERR_PTR(-ENOMEM);
539	}
538	if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT))
539		return NULL;
540
540 /* 541 /*
541 * When there are no effective ACEs, the following will end 542 * When there are no effective ACEs, the following will end
542 * up setting a 3-element effective posix ACL with all 543 * up setting a 3-element effective posix ACL with all
543 * permissions zero. 544 * permissions zero.
544 */ 545 */
545	nace = 4 + state->users->n + state->groups->n;
546	if (!state->users->n && !state->groups->n)
547		nace = 3;
548	else /* Note we also include a MASK ACE in this case: */
549		nace = 4 + state->users->n + state->groups->n;
546 pacl = posix_acl_alloc(nace, GFP_KERNEL); 550 pacl = posix_acl_alloc(nace, GFP_KERNEL);
547 if (!pacl) 551 if (!pacl)
548 return ERR_PTR(-ENOMEM); 552 return ERR_PTR(-ENOMEM);
@@ -586,9 +590,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
586 add_to_mask(state, &state->groups->aces[i].perms); 590 add_to_mask(state, &state->groups->aces[i].perms);
587 } 591 }
588 592
589	pace++;
590	pace->e_tag = ACL_MASK;
591	low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
593	if (state->users->n || state->groups->n) {
594		pace++;
595		pace->e_tag = ACL_MASK;
596		low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
597	}
592 598
593 pace++; 599 pace++;
594 pace->e_tag = ACL_OTHER; 600 pace->e_tag = ACL_OTHER;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7f05cd140de3..2c73cae9899d 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
32 */ 32 */
33 33
34#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
35#include <linux/sunrpc/xprt.h>
35#include <linux/sunrpc/svc_xprt.h> 36#include <linux/sunrpc/svc_xprt.h>
36#include <linux/slab.h> 37#include <linux/slab.h>
37#include "nfsd.h" 38#include "nfsd.h"
@@ -635,11 +636,29 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc
635 } 636 }
636} 637}
637 638
639static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args)
640{
641 struct rpc_xprt *xprt;
642
643 if (args->protocol != XPRT_TRANSPORT_BC_TCP)
644 return rpc_create(args);
645
646 xprt = args->bc_xprt->xpt_bc_xprt;
647 if (xprt) {
648 xprt_get(xprt);
649 return rpc_create_xprt(args, xprt);
650 }
651
652 return rpc_create(args);
653}
654
638static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) 655static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
639{ 656{
657 int maxtime = max_cb_time(clp->net);
640 struct rpc_timeout timeparms = { 658 struct rpc_timeout timeparms = {
641		.to_initval = max_cb_time(clp->net),
659		.to_initval = maxtime,
642 .to_retries = 0, 660 .to_retries = 0,
661 .to_maxval = maxtime,
643 }; 662 };
644 struct rpc_create_args args = { 663 struct rpc_create_args args = {
645 .net = clp->net, 664 .net = clp->net,
@@ -674,7 +693,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
674 args.authflavor = ses->se_cb_sec.flavor; 693 args.authflavor = ses->se_cb_sec.flavor;
675 } 694 }
676 /* Create RPC client */ 695 /* Create RPC client */
677	client = rpc_create(&args);
696	client = create_backchannel_client(&args);
678 if (IS_ERR(client)) { 697 if (IS_ERR(client)) {
679 dprintk("NFSD: couldn't create callback client: %ld\n", 698 dprintk("NFSD: couldn't create callback client: %ld\n",
680 PTR_ERR(client)); 699 PTR_ERR(client));
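create_backchannel_client(), added above, prefers the transport the NFSv4.1 session already owns: for XPRT_TRANSPORT_BC_TCP it takes a reference on the existing rpc_xprt and builds the callback client on top of it via rpc_create_xprt(), falling back to rpc_create() only when no such transport exists. The reuse-or-create decision, sketched with placeholder types (error handling and teardown elided):

#include <stdlib.h>

struct xprt { int refs; };
struct client { struct xprt *x; };

static struct xprt *xprt_get(struct xprt *x) { x->refs++; return x; }

static struct client *client_on(struct xprt *x)
{
	struct client *c = calloc(1, sizeof(*c));

	if (c)
		c->x = x;
	return c;
}

/* Prefer the transport the session already has, taking a reference;
 * only build a brand-new one when none exists. */
static struct client *create_backchannel(struct xprt *existing)
{
	if (existing)
		return client_on(xprt_get(existing));
	return client_on(calloc(1, sizeof(struct xprt)));
}

int main(void)
{
	struct xprt session = { .refs = 1 };
	struct client *c = create_backchannel(&session); /* reuse path */

	free(c);
	return 0;
}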
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 82189b208af3..d543222babf3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1273,6 +1273,8 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1273 struct nfsd4_op *op; 1273 struct nfsd4_op *op;
1274 struct nfsd4_operation *opdesc; 1274 struct nfsd4_operation *opdesc;
1275 struct nfsd4_compound_state *cstate = &resp->cstate; 1275 struct nfsd4_compound_state *cstate = &resp->cstate;
1276 struct svc_fh *current_fh = &cstate->current_fh;
1277 struct svc_fh *save_fh = &cstate->save_fh;
1276 int slack_bytes; 1278 int slack_bytes;
1277 u32 plen = 0; 1279 u32 plen = 0;
1278 __be32 status; 1280 __be32 status;
@@ -1288,11 +1290,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1288 resp->tag = args->tag; 1290 resp->tag = args->tag;
1289 resp->opcnt = 0; 1291 resp->opcnt = 0;
1290 resp->rqstp = rqstp; 1292 resp->rqstp = rqstp;
1291	resp->cstate.minorversion = args->minorversion;
1292	resp->cstate.replay_owner = NULL;
1293	resp->cstate.session = NULL;
1294	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1295	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1293	cstate->minorversion = args->minorversion;
1294	cstate->replay_owner = NULL;
1295	cstate->session = NULL;
1296	fh_init(current_fh, NFS4_FHSIZE);
1297	fh_init(save_fh, NFS4_FHSIZE);
1296 /* 1298 /*
1297 * Don't use the deferral mechanism for NFSv4; compounds make it 1299 * Don't use the deferral mechanism for NFSv4; compounds make it
1298 * too hard to avoid non-idempotency problems. 1300 * too hard to avoid non-idempotency problems.
@@ -1345,20 +1347,28 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1345 1347
1346 opdesc = OPDESC(op); 1348 opdesc = OPDESC(op);
1347 1349
1348	if (!cstate->current_fh.fh_dentry) {
1350	if (!current_fh->fh_dentry) {
1349 if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { 1351 if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
1350 op->status = nfserr_nofilehandle; 1352 op->status = nfserr_nofilehandle;
1351 goto encode_op; 1353 goto encode_op;
1352 } 1354 }
1353	} else if (cstate->current_fh.fh_export->ex_fslocs.migrated &&
1355	} else if (current_fh->fh_export->ex_fslocs.migrated &&
1354 !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { 1356 !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) {
1355 op->status = nfserr_moved; 1357 op->status = nfserr_moved;
1356 goto encode_op; 1358 goto encode_op;
1357 } 1359 }
1358 1360
1361 fh_clear_wcc(current_fh);
1362
1359 /* If op is non-idempotent */ 1363 /* If op is non-idempotent */
1360 if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { 1364 if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
1361 plen = opdesc->op_rsize_bop(rqstp, op); 1365 plen = opdesc->op_rsize_bop(rqstp, op);
1366 /*
1367 * If there's still another operation, make sure
1368 * we'll have space to at least encode an error:
1369 */
1370 if (resp->opcnt < args->opcnt)
1371 plen += COMPOUND_ERR_SLACK_SPACE;
1362 op->status = nfsd4_check_resp_size(resp, plen); 1372 op->status = nfsd4_check_resp_size(resp, plen);
1363 } 1373 }
1364 1374
@@ -1377,12 +1387,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1377 clear_current_stateid(cstate); 1387 clear_current_stateid(cstate);
1378 1388
1379 if (need_wrongsec_check(rqstp)) 1389 if (need_wrongsec_check(rqstp))
1380			op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
1390			op->status = check_nfsd_access(current_fh->fh_export, rqstp);
1381 } 1391 }
1382 1392
1383encode_op: 1393encode_op:
1384 /* Only from SEQUENCE */ 1394 /* Only from SEQUENCE */
1385	if (resp->cstate.status == nfserr_replay_cache) {
1395	if (cstate->status == nfserr_replay_cache) {
1386 dprintk("%s NFS4.1 replay from cache\n", __func__); 1396 dprintk("%s NFS4.1 replay from cache\n", __func__);
1387 status = op->status; 1397 status = op->status;
1388 goto out; 1398 goto out;
@@ -1411,10 +1421,10 @@ encode_op:
1411 nfsd4_increment_op_stats(op->opnum); 1421 nfsd4_increment_op_stats(op->opnum);
1412 } 1422 }
1413 1423
1414	resp->cstate.status = status;
1415	fh_put(&resp->cstate.current_fh);
1416	fh_put(&resp->cstate.save_fh);
1417	BUG_ON(resp->cstate.replay_owner);
1424	cstate->status = status;
1425	fh_put(current_fh);
1426	fh_put(save_fh);
1427	BUG_ON(cstate->replay_owner);
1418out: 1428out:
1419 /* Reset deferral mechanism for RPC deferrals */ 1429 /* Reset deferral mechanism for RPC deferrals */
1420 rqstp->rq_usedeferral = 1; 1430 rqstp->rq_usedeferral = 1;
@@ -1523,7 +1533,8 @@ static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o
1523 1533
1524static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1534static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1525{ 1535{
1526	return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32);
1536	return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
1537								sizeof(__be32);
1527} 1538}
1528 1539
1529static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1540static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d5d070fbeb35..9a77a5a21557 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1078,6 +1078,18 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1078 return NULL; 1078 return NULL;
1079 } 1079 }
1080 clp->cl_name.len = name.len; 1080 clp->cl_name.len = name.len;
1081 INIT_LIST_HEAD(&clp->cl_sessions);
1082 idr_init(&clp->cl_stateids);
1083 atomic_set(&clp->cl_refcount, 0);
1084 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
1085 INIT_LIST_HEAD(&clp->cl_idhash);
1086 INIT_LIST_HEAD(&clp->cl_openowners);
1087 INIT_LIST_HEAD(&clp->cl_delegations);
1088 INIT_LIST_HEAD(&clp->cl_lru);
1089 INIT_LIST_HEAD(&clp->cl_callbacks);
1090 INIT_LIST_HEAD(&clp->cl_revoked);
1091 spin_lock_init(&clp->cl_lock);
1092 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1081 return clp; 1093 return clp;
1082} 1094}
1083 1095
@@ -1095,6 +1107,7 @@ free_client(struct nfs4_client *clp)
1095 WARN_ON_ONCE(atomic_read(&ses->se_ref)); 1107 WARN_ON_ONCE(atomic_read(&ses->se_ref));
1096 free_session(ses); 1108 free_session(ses);
1097 } 1109 }
1110 rpc_destroy_wait_queue(&clp->cl_cb_waitq);
1098 free_svc_cred(&clp->cl_cred); 1111 free_svc_cred(&clp->cl_cred);
1099 kfree(clp->cl_name.data); 1112 kfree(clp->cl_name.data);
1100 idr_destroy(&clp->cl_stateids); 1113 idr_destroy(&clp->cl_stateids);
@@ -1347,7 +1360,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1347 if (clp == NULL) 1360 if (clp == NULL)
1348 return NULL; 1361 return NULL;
1349 1362
1350 INIT_LIST_HEAD(&clp->cl_sessions);
1351 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); 1363 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
1352 if (ret) { 1364 if (ret) {
1353 spin_lock(&nn->client_lock); 1365 spin_lock(&nn->client_lock);
@@ -1355,20 +1367,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1355 spin_unlock(&nn->client_lock); 1367 spin_unlock(&nn->client_lock);
1356 return NULL; 1368 return NULL;
1357 } 1369 }
1358 idr_init(&clp->cl_stateids);
1359 atomic_set(&clp->cl_refcount, 0);
1360 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
1361 INIT_LIST_HEAD(&clp->cl_idhash);
1362 INIT_LIST_HEAD(&clp->cl_openowners);
1363 INIT_LIST_HEAD(&clp->cl_delegations);
1364 INIT_LIST_HEAD(&clp->cl_lru);
1365 INIT_LIST_HEAD(&clp->cl_callbacks);
1366 INIT_LIST_HEAD(&clp->cl_revoked);
1367 spin_lock_init(&clp->cl_lock);
1368 nfsd4_init_callback(&clp->cl_cb_null); 1370 nfsd4_init_callback(&clp->cl_cb_null);
1369 clp->cl_time = get_seconds(); 1371 clp->cl_time = get_seconds();
1370 clear_bit(0, &clp->cl_cb_slot_busy); 1372 clear_bit(0, &clp->cl_cb_slot_busy);
1371 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1372 copy_verf(clp, verf); 1373 copy_verf(clp, verf);
1373 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); 1374 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
1374 gen_confirm(clp); 1375 gen_confirm(clp);
@@ -1538,7 +1539,7 @@ out_err:
1538} 1539}
1539 1540
1540/* 1541/*
1541 * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
1542 * Cache a reply. nfsd4_check_resp_size() has bounded the cache size.
1542 */ 1543 */
1543void 1544void
1544nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) 1545nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
@@ -1596,7 +1597,7 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
1596 * The sequence operation is not cached because we can use the slot and 1597 * The sequence operation is not cached because we can use the slot and
1597 * session values. 1598 * session values.
1598 */ 1599 */
1599__be32
1600static __be32
1600nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, 1601nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1601 struct nfsd4_sequence *seq) 1602 struct nfsd4_sequence *seq)
1602{ 1603{
@@ -1605,9 +1606,8 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1605 1606
1606 dprintk("--> %s slot %p\n", __func__, slot); 1607 dprintk("--> %s slot %p\n", __func__, slot);
1607 1608
1608 /* Either returns 0 or nfserr_retry_uncached */
1609 status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); 1609 status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
1610	if (status == nfserr_retry_uncached_rep)
1610	if (status)
1611 return status; 1611 return status;
1612 1612
1613 /* The sequence operation has been encoded, cstate->datap set. */ 1613 /* The sequence operation has been encoded, cstate->datap set. */
@@ -2287,7 +2287,8 @@ out:
2287 if (!list_empty(&clp->cl_revoked)) 2287 if (!list_empty(&clp->cl_revoked))
2288 seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; 2288 seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
2289out_no_session: 2289out_no_session:
2290	kfree(conn);
2290	if (conn)
2291 free_conn(conn);
2291 spin_unlock(&nn->client_lock); 2292 spin_unlock(&nn->client_lock);
2292 return status; 2293 return status;
2293out_put_session: 2294out_put_session:
@@ -3627,8 +3628,11 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
3627 return nfserr_bad_stateid; 3628 return nfserr_bad_stateid;
3628 status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, 3629 status = lookup_clientid(&stateid->si_opaque.so_clid, sessions,
3629 nn, &cl); 3630 nn, &cl);
3630	if (status == nfserr_stale_clientid)
3631	if (status == nfserr_stale_clientid) {
3632 if (sessions)
3633 return nfserr_bad_stateid;
3631 return nfserr_stale_stateid; 3634 return nfserr_stale_stateid;
3635 }
3632 if (status) 3636 if (status)
3633 return status; 3637 return status;
3634 *s = find_stateid_by_type(cl, stateid, typemask); 3638 *s = find_stateid_by_type(cl, stateid, typemask);
@@ -3713,9 +3717,16 @@ out:
3713static __be32 3717static __be32
3714nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) 3718nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
3715{ 3719{
3716	if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner)))
3720	struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
3721
3722	if (check_for_locks(stp->st_file, lo))
3717		return nfserr_locks_held; 3723		return nfserr_locks_held;
3718	release_lock_stateid(stp);
3724	/*
3725	 * Currently there's a 1-1 lock stateid<->lockowner
3726	 * correspondence, and we have to delete the lockowner when we
3727	 * delete the lock stateid:
3728	 */
3729	unhash_lockowner(lo);
3719 return nfs_ok; 3730 return nfs_ok;
3720} 3731}
3721 3732
@@ -4155,6 +4166,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
4155 4166
4156 if (!same_owner_str(&lo->lo_owner, owner, clid)) 4167 if (!same_owner_str(&lo->lo_owner, owner, clid))
4157 return false; 4168 return false;
4169 if (list_empty(&lo->lo_owner.so_stateids)) {
4170 WARN_ON_ONCE(1);
4171 return false;
4172 }
4158 lst = list_first_entry(&lo->lo_owner.so_stateids, 4173 lst = list_first_entry(&lo->lo_owner.so_stateids,
4159 struct nfs4_ol_stateid, st_perstateowner); 4174 struct nfs4_ol_stateid, st_perstateowner);
4160 return lst->st_file->fi_inode == inode; 4175 return lst->st_file->fi_inode == inode;
@@ -5062,7 +5077,6 @@ nfs4_state_destroy_net(struct net *net)
5062 int i; 5077 int i;
5063 struct nfs4_client *clp = NULL; 5078 struct nfs4_client *clp = NULL;
5064 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 5079 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
5065 struct rb_node *node, *tmp;
5066 5080
5067 for (i = 0; i < CLIENT_HASH_SIZE; i++) { 5081 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
5068 while (!list_empty(&nn->conf_id_hashtbl[i])) { 5082 while (!list_empty(&nn->conf_id_hashtbl[i])) {
@@ -5071,13 +5085,11 @@ nfs4_state_destroy_net(struct net *net)
5071 } 5085 }
5072 } 5086 }
5073 5087
5074	node = rb_first(&nn->unconf_name_tree);
5075	while (node != NULL) {
5076		tmp = node;
5077		node = rb_next(tmp);
5078		clp = rb_entry(tmp, struct nfs4_client, cl_namenode);
5079		rb_erase(tmp, &nn->unconf_name_tree);
5080		destroy_client(clp);
5088	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
5089		while (!list_empty(&nn->unconf_id_hashtbl[i])) {
5090			clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
5091			destroy_client(clp);
5092		}
5081 } 5093 }
5082 5094
5083 kfree(nn->sessionid_hashtbl); 5095 kfree(nn->sessionid_hashtbl);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 63f2395c57ed..18881f34737a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -294,7 +294,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
294 READ32(nace); 294 READ32(nace);
295 295
296 if (nace > NFS4_ACL_MAX) 296 if (nace > NFS4_ACL_MAX)
297		return nfserr_resource;
297		return nfserr_fbig;
298 298
299 *acl = nfs4_acl_new(nace); 299 *acl = nfs4_acl_new(nace);
300 if (*acl == NULL) 300 if (*acl == NULL)
@@ -1222,7 +1222,6 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
1222 } 1222 }
1223 write->wr_head.iov_base = p; 1223 write->wr_head.iov_base = p;
1224 write->wr_head.iov_len = avail; 1224 write->wr_head.iov_len = avail;
1225 WARN_ON(avail != (XDR_QUADLEN(avail) << 2));
1226 write->wr_pagelist = argp->pagelist; 1225 write->wr_pagelist = argp->pagelist;
1227 1226
1228 len = XDR_QUADLEN(write->wr_buflen) << 2; 1227 len = XDR_QUADLEN(write->wr_buflen) << 2;
@@ -2483,6 +2482,8 @@ out_acl:
2483 goto out; 2482 goto out;
2484 } 2483 }
2485 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { 2484 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2485 if ((buflen -= 16) < 0)
2486 goto out_resource;
2486 WRITE32(3); 2487 WRITE32(3);
2487 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); 2488 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2488 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); 2489 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
@@ -2499,8 +2500,10 @@ out:
2499 security_release_secctx(context, contextlen); 2500 security_release_secctx(context, contextlen);
2500#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ 2501#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
2501 kfree(acl); 2502 kfree(acl);
2502	if (tempfh)
2503	if (tempfh) {
2503 fh_put(tempfh); 2504 fh_put(tempfh);
2505 kfree(tempfh);
2506 }
2504 return status; 2507 return status;
2505out_nfserr: 2508out_nfserr:
2506 status = nfserrno(err); 2509 status = nfserrno(err);
@@ -3471,6 +3474,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3471 struct nfsd4_test_stateid_id *stateid, *next; 3474 struct nfsd4_test_stateid_id *stateid, *next;
3472 __be32 *p; 3475 __be32 *p;
3473 3476
3477 if (nfserr)
3478 return nfserr;
3479
3474 RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids)); 3480 RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids));
3475 *p++ = htonl(test_stateid->ts_num_ids); 3481 *p++ = htonl(test_stateid->ts_num_ids);
3476 3482
@@ -3579,8 +3585,6 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
3579 return 0; 3585 return 0;
3580 3586
3581 session = resp->cstate.session; 3587 session = resp->cstate.session;
3582 if (session == NULL)
3583 return 0;
3584 3588
3585 if (xb->page_len == 0) { 3589 if (xb->page_len == 0) {
3586 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; 3590 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
@@ -3620,7 +3624,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3620 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3624 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
3621 !nfsd4_enc_ops[op->opnum]); 3625 !nfsd4_enc_ops[op->opnum]);
3622 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3626 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3623	/* nfsd4_check_drc_limit guarantees enough room for error status */
3627	/* nfsd4_check_resp_size guarantees enough room for error status */
3624 if (!op->status) 3628 if (!op->status)
3625 op->status = nfsd4_check_resp_size(resp, 0); 3629 op->status = nfsd4_check_resp_size(resp, 0);
3626 if (so) { 3630 if (so) {
@@ -3691,6 +3695,12 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp)
3691int 3695int
3692nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) 3696nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
3693{ 3697{
3698 if (rqstp->rq_arg.head[0].iov_len % 4) {
3699 /* client is nuts */
3700 dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)",
3701 __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid));
3702 return 0;
3703 }
3694 args->p = p; 3704 args->p = p;
3695 args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; 3705 args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
3696 args->pagelist = rqstp->rq_arg.pages; 3706 args->pagelist = rqstp->rq_arg.pages;
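The guard added to nfs4svc_decode_compoundargs() rejects a COMPOUND whose request head is not a whole number of 4-byte XDR units, since every XDR item is padded to a 4-byte boundary and an unaligned head means the client's encoding is broken. The check itself is a simple modulus test:

#include <stdio.h>
#include <stddef.h>

/* XDR items are always padded to 4-byte boundaries, so a legal
 * request head has iov_len % 4 == 0 (equivalently, (len & 3) == 0). */
static int xdr_head_is_sane(size_t iov_len)
{
	return (iov_len % 4) == 0;
}

int main(void)
{
	printf("%d\n", xdr_head_is_sane(128)); /* 1: properly padded */
	printf("%d\n", xdr_head_is_sane(130)); /* 0: "client is nuts" */
	return 0;
}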
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7f555179bf81..f34d9de802ab 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -699,6 +699,11 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net)
699 if (err != 0 || fd < 0) 699 if (err != 0 || fd < 0)
700 return -EINVAL; 700 return -EINVAL;
701 701
702 if (svc_alien_sock(net, fd)) {
703 printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__);
704 return -EINVAL;
705 }
706
702 err = nfsd_create_serv(net); 707 err = nfsd_create_serv(net);
703 if (err != 0) 708 if (err != 0)
704 return err; 709 return err;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 30f34ab02137..479eb681c27c 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -282,7 +282,7 @@ void nfsd_lockd_shutdown(void);
282 * reason. 282 * reason.
283 */ 283 */
284#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 284#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
285#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 285#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */
286 286
287#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ 287#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
288 288
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 4775bc4896c8..ad67964d0bb1 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -133,6 +133,17 @@ fh_init(struct svc_fh *fhp, int maxsize)
133 133
134#ifdef CONFIG_NFSD_V3 134#ifdef CONFIG_NFSD_V3
135/* 135/*
136 * The wcc data stored in current_fh should be cleared
137 * between compound ops.
138 */
139static inline void
140fh_clear_wcc(struct svc_fh *fhp)
141{
142 fhp->fh_post_saved = 0;
143 fhp->fh_pre_saved = 0;
144}
145
146/*
136 * Fill in the pre_op attr for the wcc data 147 * Fill in the pre_op attr for the wcc data
137 */ 148 */
138static inline void 149static inline void
@@ -152,7 +163,8 @@ fill_pre_wcc(struct svc_fh *fhp)
152 163
153extern void fill_post_wcc(struct svc_fh *); 164extern void fill_post_wcc(struct svc_fh *);
154#else 165#else
155#define fill_pre_wcc(ignored) 166#define fh_clear_wcc(ignored)
166#define fh_clear_wcc(ignored)
155#define fill_pre_wcc(ignored) 167#define fill_pre_wcc(ignored)
157#endif /* CONFIG_NFSD_V3 */ 169#endif /* CONFIG_NFSD_V3 */
158 170
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b17d93214d01..9c769a47ac5a 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
152 type = (stat->mode & S_IFMT); 152 type = (stat->mode & S_IFMT);
153 153
154 *p++ = htonl(nfs_ftypes[type >> 12]); 154 *p++ = htonl(nfs_ftypes[type >> 12]);
155	*p++ = htonl((u32) (stat->mode & S_IALLUGO));
155	*p++ = htonl((u32) stat->mode);
156 *p++ = htonl((u32) stat->nlink); 156 *p++ = htonl((u32) stat->nlink);
157 *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); 157 *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
158 *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); 158 *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6d7be3f80356..16f0673a423c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -404,6 +404,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
404 umode_t ftype = 0; 404 umode_t ftype = 0;
405 __be32 err; 405 __be32 err;
406 int host_err; 406 int host_err;
407 bool get_write_count;
407 int size_change = 0; 408 int size_change = 0;
408 409
409 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) 410 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
@@ -411,10 +412,18 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
411 if (iap->ia_valid & ATTR_SIZE) 412 if (iap->ia_valid & ATTR_SIZE)
412 ftype = S_IFREG; 413 ftype = S_IFREG;
413 414
415 /* Callers that do fh_verify should do the fh_want_write: */
416 get_write_count = !fhp->fh_dentry;
417
414 /* Get inode */ 418 /* Get inode */
415 err = fh_verify(rqstp, fhp, ftype, accmode); 419 err = fh_verify(rqstp, fhp, ftype, accmode);
416 if (err) 420 if (err)
417 goto out; 421 goto out;
422 if (get_write_count) {
423 host_err = fh_want_write(fhp);
424 if (host_err)
425 return nfserrno(host_err);
426 }
418 427
419 dentry = fhp->fh_dentry; 428 dentry = fhp->fh_dentry;
420 inode = dentry->d_inode; 429 inode = dentry->d_inode;
@@ -1694,7 +1703,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1694 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) 1703 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
1695 goto out_dput_new; 1704 goto out_dput_new;
1696 1705
1697	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
1706	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
1698 if (!host_err) { 1707 if (!host_err) {
1699 host_err = commit_metadata(tfhp); 1708 host_err = commit_metadata(tfhp);
1700 if (!host_err) 1709 if (!host_err)
@@ -1706,10 +1715,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1706 dput(odentry); 1715 dput(odentry);
1707 out_nfserr: 1716 out_nfserr:
1708 err = nfserrno(host_err); 1717 err = nfserrno(host_err);
1709
1710	/* we cannot reply on fh_unlock on the two filehandles,
1711	 * as that would do the wrong thing if the two directories
1712	 * were the same, so again we do it by hand
1718	/*
1719	 * We cannot rely on fh_unlock on the two filehandles,
1720	 * as that would do the wrong thing if the two directories
1721	 * were the same, so again we do it by hand.
1713 */ 1722 */
1714 fill_post_wcc(ffhp); 1723 fill_post_wcc(ffhp);
1715 fill_post_wcc(tfhp); 1724 fill_post_wcc(tfhp);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d278a0d03496..5ea7df305083 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -574,8 +574,6 @@ extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
574 struct nfsd4_compound_state *, 574 struct nfsd4_compound_state *,
575 struct nfsd4_setclientid_confirm *setclientid_confirm); 575 struct nfsd4_setclientid_confirm *setclientid_confirm);
576extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); 576extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
577extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
578 struct nfsd4_sequence *seq);
579extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 577extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
580 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 578 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
581extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); 579extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index deaa3d33a0aa..0d58075f34e2 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -942,6 +942,18 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
942 struct inode *cpfile; 942 struct inode *cpfile;
943 int err; 943 int err;
944 944
945 if (cpsize > sb->s_blocksize) {
946 printk(KERN_ERR
947 "NILFS: too large checkpoint size: %zu bytes.\n",
948 cpsize);
949 return -EINVAL;
950 } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
951 printk(KERN_ERR
952 "NILFS: too small checkpoint size: %zu bytes.\n",
953 cpsize);
954 return -EINVAL;
955 }
956
945 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); 957 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
946 if (unlikely(!cpfile)) 958 if (unlikely(!cpfile))
947 return -ENOMEM; 959 return -ENOMEM;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index fa0f80308c2d..0d5fada91191 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -484,6 +484,18 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
484 struct nilfs_dat_info *di; 484 struct nilfs_dat_info *di;
485 int err; 485 int err;
486 486
487 if (entry_size > sb->s_blocksize) {
488 printk(KERN_ERR
489 "NILFS: too large DAT entry size: %zu bytes.\n",
490 entry_size);
491 return -EINVAL;
492 } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
493 printk(KERN_ERR
494 "NILFS: too small DAT entry size: %zu bytes.\n",
495 entry_size);
496 return -EINVAL;
497 }
498
487 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); 499 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
488 if (unlikely(!dat)) 500 if (unlikely(!dat))
489 return -ENOMEM; 501 return -ENOMEM;
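
This bounds check on DAT entries, the checkpoint-size check above, and the segment-usage and inode-size checks later in the merge all follow one pattern. A condensed stand-alone sketch of what they share; the helper name and the sizes in main() are made up, and the patch open-codes the check at each site:

	#include <stdio.h>

	/* Sketch: validate an on-disk metadata entry size against sane bounds.
	 * Returns 0 if ok, -1 (-EINVAL in the kernel) otherwise. */
	static int check_entry_size(size_t size, size_t min_size,
				    size_t blocksize, const char *name)
	{
		if (size > blocksize) {
			fprintf(stderr, "NILFS: too large %s size: %zu bytes.\n",
				name, size);
			return -1;
		}
		if (size < min_size) {
			fprintf(stderr, "NILFS: too small %s size: %zu bytes.\n",
				name, size);
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		/* e.g. a 192-byte entry against a 4 KiB block */
		return check_entry_size(192, 8, 4096, "DAT entry") ? 1 : 0;
	}
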
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 08fdb77852ac..f3a82fbcae02 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
134 134
135static const struct vm_operations_struct nilfs_file_vm_ops = { 135static const struct vm_operations_struct nilfs_file_vm_ops = {
136 .fault = filemap_fault, 136 .fault = filemap_fault,
137 .map_pages = filemap_map_pages,
137 .page_mkwrite = nilfs_page_mkwrite, 138 .page_mkwrite = nilfs_page_mkwrite,
138 .remap_pages = generic_file_remap_pages, 139 .remap_pages = generic_file_remap_pages,
139}; 140};
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7e350c562e0e..b9c5726120e3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -783,16 +783,14 @@ void nilfs_evict_inode(struct inode *inode)
783 int ret; 783 int ret;
784 784
785 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 785 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
786 if (inode->i_data.nrpages) 786 truncate_inode_pages_final(&inode->i_data);
787 truncate_inode_pages(&inode->i_data, 0);
788 clear_inode(inode); 787 clear_inode(inode);
789 nilfs_clear_inode(inode); 788 nilfs_clear_inode(inode);
790 return; 789 return;
791 } 790 }
792 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 791 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
793 792
794 if (inode->i_data.nrpages) 793 truncate_inode_pages_final(&inode->i_data);
795 truncate_inode_pages(&inode->i_data, 0);
796 794
797 /* TODO: some of the following operations may fail. */ 795 /* TODO: some of the following operations may fail. */
798 nilfs_truncate_bmap(ii, 0); 796 nilfs_truncate_bmap(ii, 0);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 2b34021948e4..422fb54b7377 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1072,6 +1072,48 @@ out:
1072} 1072}
1073 1073
1074/** 1074/**
1075 * nilfs_ioctl_trim_fs() - trim ioctl handle function
1076 * @inode: inode object
1077 * @argp: pointer to argument from userspace

1078 *
1079 * Description: nilfs_ioctl_trim_fs is the FITRIM ioctl handler. It
1080 * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which
1081 * performs the actual trim operation.
1082 *
1083 * Return Value: On success, 0 is returned; on error, a negative error code is returned.
1084 */
1085static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
1086{
1087 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
1088 struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
1089 struct fstrim_range range;
1090 int ret;
1091
1092 if (!capable(CAP_SYS_ADMIN))
1093 return -EPERM;
1094
1095 if (!blk_queue_discard(q))
1096 return -EOPNOTSUPP;
1097
1098 if (copy_from_user(&range, argp, sizeof(range)))
1099 return -EFAULT;
1100
1101 range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity);
1102
1103 down_read(&nilfs->ns_segctor_sem);
1104 ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
1105 up_read(&nilfs->ns_segctor_sem);
1106
1107 if (ret < 0)
1108 return ret;
1109
1110 if (copy_to_user(argp, &range, sizeof(range)))
1111 return -EFAULT;
1112
1113 return 0;
1114}
1115
1116/**
1075 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated 1117 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
1076 * @inode: inode object 1118 * @inode: inode object
1077 * @argp: pointer on argument from userspace 1119 * @argp: pointer on argument from userspace
@@ -1163,6 +1205,95 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
1163 return ret; 1205 return ret;
1164} 1206}
1165 1207
1208/**
1209 * nilfs_ioctl_set_suinfo - set segment usage info
1210 * @inode: inode object
1211 * @filp: file object
1212 * @cmd: ioctl's request code
1213 * @argp: pointer to argument from userspace
1214 *
1215 * Description: Expects an array of nilfs_suinfo_update structures
1216 * encapsulated in nilfs_argv and updates the segment usage info
1217 * according to the flags in nilfs_suinfo_update.
1218 *
1219 * Return Value: On success, 0 is returned. On error, one of the
1220 * following negative error codes is returned.
1221 *
1222 * %-EPERM - Not enough permissions
1223 *
1224 * %-EFAULT - Error copying input data
1225 *
1226 * %-EIO - I/O error.
1227 *
1228 * %-ENOMEM - Insufficient amount of memory available.
1229 *
1230 * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
1231 */
1232static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp,
1233 unsigned int cmd, void __user *argp)
1234{
1235 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
1236 struct nilfs_transaction_info ti;
1237 struct nilfs_argv argv;
1238 size_t len;
1239 void __user *base;
1240 void *kbuf;
1241 int ret;
1242
1243 if (!capable(CAP_SYS_ADMIN))
1244 return -EPERM;
1245
1246 ret = mnt_want_write_file(filp);
1247 if (ret)
1248 return ret;
1249
1250 ret = -EFAULT;
1251 if (copy_from_user(&argv, argp, sizeof(argv)))
1252 goto out;
1253
1254 ret = -EINVAL;
1255 if (argv.v_size < sizeof(struct nilfs_suinfo_update))
1256 goto out;
1257
1258 if (argv.v_nmembs > nilfs->ns_nsegments)
1259 goto out;
1260
1261 if (argv.v_nmembs >= UINT_MAX / argv.v_size)
1262 goto out;
1263
1264 len = argv.v_size * argv.v_nmembs;
1265 if (!len) {
1266 ret = 0;
1267 goto out;
1268 }
1269
1270 base = (void __user *)(unsigned long)argv.v_base;
1271 kbuf = vmalloc(len);
1272 if (!kbuf) {
1273 ret = -ENOMEM;
1274 goto out;
1275 }
1276
1277 if (copy_from_user(kbuf, base, len)) {
1278 ret = -EFAULT;
1279 goto out_free;
1280 }
1281
1282 nilfs_transaction_begin(inode->i_sb, &ti, 0);
1283 ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size,
1284 argv.v_nmembs);
1285 if (unlikely(ret < 0))
1286 nilfs_transaction_abort(inode->i_sb);
1287 else
1288 nilfs_transaction_commit(inode->i_sb); /* never fails */
1289
1290out_free:
1291 vfree(kbuf);
1292out:
1293 mnt_drop_write_file(filp);
1294 return ret;
1295}
1296
1166long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 1297long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1167{ 1298{
1168 struct inode *inode = file_inode(filp); 1299 struct inode *inode = file_inode(filp);
@@ -1189,6 +1320,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1189 return nilfs_ioctl_get_info(inode, filp, cmd, argp, 1320 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
1190 sizeof(struct nilfs_suinfo), 1321 sizeof(struct nilfs_suinfo),
1191 nilfs_ioctl_do_get_suinfo); 1322 nilfs_ioctl_do_get_suinfo);
1323 case NILFS_IOCTL_SET_SUINFO:
1324 return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp);
1192 case NILFS_IOCTL_GET_SUSTAT: 1325 case NILFS_IOCTL_GET_SUSTAT:
1193 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); 1326 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
1194 case NILFS_IOCTL_GET_VINFO: 1327 case NILFS_IOCTL_GET_VINFO:
@@ -1205,6 +1338,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1205 return nilfs_ioctl_resize(inode, filp, argp); 1338 return nilfs_ioctl_resize(inode, filp, argp);
1206 case NILFS_IOCTL_SET_ALLOC_RANGE: 1339 case NILFS_IOCTL_SET_ALLOC_RANGE:
1207 return nilfs_ioctl_set_alloc_range(inode, argp); 1340 return nilfs_ioctl_set_alloc_range(inode, argp);
1341 case FITRIM:
1342 return nilfs_ioctl_trim_fs(inode, argp);
1208 default: 1343 default:
1209 return -ENOTTY; 1344 return -ENOTTY;
1210 } 1345 }
@@ -1228,6 +1363,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1228 case NILFS_IOCTL_GET_CPINFO: 1363 case NILFS_IOCTL_GET_CPINFO:
1229 case NILFS_IOCTL_GET_CPSTAT: 1364 case NILFS_IOCTL_GET_CPSTAT:
1230 case NILFS_IOCTL_GET_SUINFO: 1365 case NILFS_IOCTL_GET_SUINFO:
1366 case NILFS_IOCTL_SET_SUINFO:
1231 case NILFS_IOCTL_GET_SUSTAT: 1367 case NILFS_IOCTL_GET_SUSTAT:
1232 case NILFS_IOCTL_GET_VINFO: 1368 case NILFS_IOCTL_GET_VINFO:
1233 case NILFS_IOCTL_GET_BDESCS: 1369 case NILFS_IOCTL_GET_BDESCS:
@@ -1235,6 +1371,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1235 case NILFS_IOCTL_SYNC: 1371 case NILFS_IOCTL_SYNC:
1236 case NILFS_IOCTL_RESIZE: 1372 case NILFS_IOCTL_RESIZE:
1237 case NILFS_IOCTL_SET_ALLOC_RANGE: 1373 case NILFS_IOCTL_SET_ALLOC_RANGE:
1374 case FITRIM:
1238 break; 1375 break;
1239 default: 1376 default:
1240 return -ENOIOCTLCMD; 1377 return -ENOIOCTLCMD;
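
With the two ioctl.c hunks above, nilfs2 now accepts the generic FITRIM ioctl. A minimal userspace sketch of driving it, using only the standard linux/fs.h definitions; the mount point is illustrative:

	#include <stdio.h>
	#include <string.h>
	#include <limits.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

	int main(void)
	{
		struct fstrim_range range;
		int fd = open("/mnt/nilfs2", O_RDONLY);

		if (fd < 0)
			return 1;

		memset(&range, 0, sizeof(range));
		range.len = ULLONG_MAX;	/* trim the whole filesystem */
		range.minlen = 0;	/* the kernel raises this to the discard granularity */

		if (ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			close(fd);
			return 1;
		}
		/* On return, range.len holds the number of bytes actually trimmed. */
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
		close(fd);
		return 0;
	}
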
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f438a7..2a869c35c362 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,289 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
870} 870}
871 871
872/** 872/**
873 * nilfs_sufile_set_suinfo - sets segment usage info
874 * @sufile: inode of segment usage file
875 * @buf: array of suinfo_update
876 * @supsz: byte size of suinfo_update
877 * @nsup: size of suinfo_update array
878 *
879 * Description: Takes an array of nilfs_suinfo_update structs and updates
880 * segment usage accordingly. Only the fields indicated by the sup_flags
881 * are updated.
882 *
883 * Return Value: On success, 0 is returned. On error, one of the
884 * following negative error codes is returned.
885 *
886 * %-EIO - I/O error.
887 *
888 * %-ENOMEM - Insufficient amount of memory available.
889 *
890 * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
891 */
892ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
893 unsigned supsz, size_t nsup)
894{
895 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
896 struct buffer_head *header_bh, *bh;
897 struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup;
898 struct nilfs_segment_usage *su;
899 void *kaddr;
900 unsigned long blkoff, prev_blkoff;
901 int cleansi, cleansu, dirtysi, dirtysu;
902 long ncleaned = 0, ndirtied = 0;
903 int ret = 0;
904
905 if (unlikely(nsup == 0))
906 return ret;
907
908 for (sup = buf; sup < supend; sup = (void *)sup + supsz) {
909 if (sup->sup_segnum >= nilfs->ns_nsegments
910 || (sup->sup_flags &
911 (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS))
912 || (nilfs_suinfo_update_nblocks(sup) &&
913 sup->sup_sui.sui_nblocks >
914 nilfs->ns_blocks_per_segment))
915 return -EINVAL;
916 }
917
918 down_write(&NILFS_MDT(sufile)->mi_sem);
919
920 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
921 if (ret < 0)
922 goto out_sem;
923
924 sup = buf;
925 blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
926 ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
927 if (ret < 0)
928 goto out_header;
929
930 for (;;) {
931 kaddr = kmap_atomic(bh->b_page);
932 su = nilfs_sufile_block_get_segment_usage(
933 sufile, sup->sup_segnum, bh, kaddr);
934
935 if (nilfs_suinfo_update_lastmod(sup))
936 su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod);
937
938 if (nilfs_suinfo_update_nblocks(sup))
939 su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks);
940
941 if (nilfs_suinfo_update_flags(sup)) {
942 /*
943 * Active flag is a virtual flag projected by running
944 * nilfs kernel code - drop it so that it is
945 * not written to disk.
946 */
947 sup->sup_sui.sui_flags &=
948 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
949
950 cleansi = nilfs_suinfo_clean(&sup->sup_sui);
951 cleansu = nilfs_segment_usage_clean(su);
952 dirtysi = nilfs_suinfo_dirty(&sup->sup_sui);
953 dirtysu = nilfs_segment_usage_dirty(su);
954
955 if (cleansi && !cleansu)
956 ++ncleaned;
957 else if (!cleansi && cleansu)
958 --ncleaned;
959
960 if (dirtysi && !dirtysu)
961 ++ndirtied;
962 else if (!dirtysi && dirtysu)
963 --ndirtied;
964
965 su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
966 }
967
968 kunmap_atomic(kaddr);
969
970 sup = (void *)sup + supsz;
971 if (sup >= supend)
972 break;
973
974 prev_blkoff = blkoff;
975 blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
976 if (blkoff == prev_blkoff)
977 continue;
978
979 /* get different block */
980 mark_buffer_dirty(bh);
981 put_bh(bh);
982 ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
983 if (unlikely(ret < 0))
984 goto out_mark;
985 }
986 mark_buffer_dirty(bh);
987 put_bh(bh);
988
989 out_mark:
990 if (ncleaned || ndirtied) {
991 nilfs_sufile_mod_counter(header_bh, (u64)ncleaned,
992 (u64)ndirtied);
993 NILFS_SUI(sufile)->ncleansegs += ncleaned;
994 }
995 nilfs_mdt_mark_dirty(sufile);
996 out_header:
997 put_bh(header_bh);
998 out_sem:
999 up_write(&NILFS_MDT(sufile)->mi_sem);
1000 return ret;
1001}
1002
1003/**
1004 * nilfs_sufile_trim_fs() - trim ioctl helper function
1005 * @sufile: inode of segment usage file
1006 * @range: fstrim_range structure
1007 *
1008 * start: first byte to trim
1009 * len: number of bytes to trim from start
1010 * minlen: minimum extent length in bytes
1011 *
1012 * Description: nilfs_sufile_trim_fs goes through all segments containing bytes
1013 * from start to start+len. start is rounded up to the next block boundary
1014 * and start+len is rounded down. blkdev_issue_discard is invoked for each
1015 * clean segment.
1016 *
1017 * Return Value: On success, 0 is returned; on error, a negative error code is returned.
1018 */
1019int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
1020{
1021 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
1022 struct buffer_head *su_bh;
1023 struct nilfs_segment_usage *su;
1024 void *kaddr;
1025 size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
1026 sector_t seg_start, seg_end, start_block, end_block;
1027 sector_t start = 0, nblocks = 0;
1028 u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0;
1029 int ret = 0;
1030 unsigned int sects_per_block;
1031
1032 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
1033 bdev_logical_block_size(nilfs->ns_bdev);
1034 len = range->len >> nilfs->ns_blocksize_bits;
1035 minlen = range->minlen >> nilfs->ns_blocksize_bits;
1036 max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment);
1037
1038 if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits)
1039 return -EINVAL;
1040
1041 start_block = (range->start + nilfs->ns_blocksize - 1) >>
1042 nilfs->ns_blocksize_bits;
1043
1044 /*
1045 * range->len can be very large (actually, it is set to
1046 * ULLONG_MAX by default) - truncate upper end of the range
1047 * carefully so as not to overflow.
1048 */
1049 if (max_blocks - start_block < len)
1050 end_block = max_blocks - 1;
1051 else
1052 end_block = start_block + len - 1;
1053
1054 segnum = nilfs_get_segnum_of_block(nilfs, start_block);
1055 segnum_end = nilfs_get_segnum_of_block(nilfs, end_block);
1056
1057 down_read(&NILFS_MDT(sufile)->mi_sem);
1058
1059 while (segnum <= segnum_end) {
1060 n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
1061 segnum_end);
1062
1063 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
1064 &su_bh);
1065 if (ret < 0) {
1066 if (ret != -ENOENT)
1067 goto out_sem;
1068 /* hole */
1069 segnum += n;
1070 continue;
1071 }
1072
1073 kaddr = kmap_atomic(su_bh->b_page);
1074 su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
1075 su_bh, kaddr);
1076 for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
1077 if (!nilfs_segment_usage_clean(su))
1078 continue;
1079
1080 nilfs_get_segment_range(nilfs, segnum, &seg_start,
1081 &seg_end);
1082
1083 if (!nblocks) {
1084 /* start new extent */
1085 start = seg_start;
1086 nblocks = seg_end - seg_start + 1;
1087 continue;
1088 }
1089
1090 if (start + nblocks == seg_start) {
1091 /* add to previous extent */
1092 nblocks += seg_end - seg_start + 1;
1093 continue;
1094 }
1095
1096 /* discard previous extent */
1097 if (start < start_block) {
1098 nblocks -= start_block - start;
1099 start = start_block;
1100 }
1101
1102 if (nblocks >= minlen) {
1103 kunmap_atomic(kaddr);
1104
1105 ret = blkdev_issue_discard(nilfs->ns_bdev,
1106 start * sects_per_block,
1107 nblocks * sects_per_block,
1108 GFP_NOFS, 0);
1109 if (ret < 0) {
1110 put_bh(su_bh);
1111 goto out_sem;
1112 }
1113
1114 ndiscarded += nblocks;
1115 kaddr = kmap_atomic(su_bh->b_page);
1116 su = nilfs_sufile_block_get_segment_usage(
1117 sufile, segnum, su_bh, kaddr);
1118 }
1119
1120 /* start new extent */
1121 start = seg_start;
1122 nblocks = seg_end - seg_start + 1;
1123 }
1124 kunmap_atomic(kaddr);
1125 put_bh(su_bh);
1126 }
1127
1128
1129 if (nblocks) {
1130 /* discard last extent */
1131 if (start < start_block) {
1132 nblocks -= start_block - start;
1133 start = start_block;
1134 }
1135 if (start + nblocks > end_block + 1)
1136 nblocks = end_block - start + 1;
1137
1138 if (nblocks >= minlen) {
1139 ret = blkdev_issue_discard(nilfs->ns_bdev,
1140 start * sects_per_block,
1141 nblocks * sects_per_block,
1142 GFP_NOFS, 0);
1143 if (!ret)
1144 ndiscarded += nblocks;
1145 }
1146 }
1147
1148out_sem:
1149 up_read(&NILFS_MDT(sufile)->mi_sem);
1150
1151 range->len = ndiscarded << nilfs->ns_blocksize_bits;
1152 return ret;
1153}
1154
1155/**
873 * nilfs_sufile_read - read or get sufile inode 1156 * nilfs_sufile_read - read or get sufile inode
874 * @sb: super block instance 1157 * @sb: super block instance
875 * @susize: size of a segment usage entry 1158 * @susize: size of a segment usage entry
@@ -886,6 +1169,18 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
886 void *kaddr; 1169 void *kaddr;
887 int err; 1170 int err;
888 1171
1172 if (susize > sb->s_blocksize) {
1173 printk(KERN_ERR
1174 "NILFS: too large segment usage size: %zu bytes.\n",
1175 susize);
1176 return -EINVAL;
1177 } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
1178 printk(KERN_ERR
1179 "NILFS: too small segment usage size: %zu bytes.\n",
1180 susize);
1181 return -EINVAL;
1182 }
1183
889 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); 1184 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
890 if (unlikely(!sufile)) 1185 if (unlikely(!sufile))
891 return -ENOMEM; 1186 return -ENOMEM;
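
Stripped of the sufile locking and buffer handling, the discard loop in nilfs_sufile_trim_fs above reduces to the following stand-alone sketch. issue_discard() is a stand-in for blkdev_issue_discard(), and all the numbers in main() are made up:

	#include <stdio.h>

	/* Stand-in for blkdev_issue_discard(); purely illustrative. */
	static void issue_discard(unsigned long start, unsigned long nblocks)
	{
		printf("discard blocks %lu..%lu\n", start, start + nblocks - 1);
	}

	/* Consecutive clean segments are coalesced into one extent, which is
	 * discarded once it can no longer grow and meets minlen blocks. */
	static void trim_extents(const int *clean, unsigned long nsegs,
				 unsigned long seg_blocks, unsigned long minlen)
	{
		unsigned long i, start = 0, nblocks = 0;

		for (i = 0; i < nsegs; i++) {
			if (!clean[i])
				continue;	/* dirty segments are skipped */
			if (nblocks && start + nblocks == i * seg_blocks) {
				nblocks += seg_blocks;	/* add to previous extent */
				continue;
			}
			if (nblocks >= minlen)
				issue_discard(start, nblocks);
			start = i * seg_blocks;		/* start new extent */
			nblocks = seg_blocks;
		}
		if (nblocks >= minlen)
			issue_discard(start, nblocks);	/* flush the last extent */
	}

	int main(void)
	{
		int clean[] = { 1, 1, 0, 1, 1, 1, 0, 1 };

		trim_extents(clean, 8, 128, 256);
		return 0;
	}
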
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b51fc1..b8afd72f2379 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -44,6 +44,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); 44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, 45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
46 size_t); 46 size_t);
47ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned, size_t);
47 48
48int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, 49int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
49 void (*dofunc)(struct inode *, __u64, 50 void (*dofunc)(struct inode *, __u64,
@@ -65,6 +66,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
65int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); 66int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
66int nilfs_sufile_read(struct super_block *sb, size_t susize, 67int nilfs_sufile_read(struct super_block *sb, size_t susize,
67 struct nilfs_inode *raw_inode, struct inode **inodep); 68 struct nilfs_inode *raw_inode, struct inode **inodep);
69int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
68 70
69/** 71/**
70 * nilfs_sufile_scrap - make a segment garbage 72 * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7ac2a122ca1d..8c532b2ca3ab 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1129 unsigned long old_mount_opt; 1129 unsigned long old_mount_opt;
1130 int err; 1130 int err;
1131 1131
1132 sync_filesystem(sb);
1132 old_sb_flags = sb->s_flags; 1133 old_sb_flags = sb->s_flags;
1133 old_mount_opt = nilfs->ns_mount_opt; 1134 old_mount_opt = nilfs->ns_mount_opt;
1134 1135
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 94c451ce6d24..8ba8229ba076 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -399,6 +399,16 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
399 return -EINVAL; 399 return -EINVAL;
400 400
401 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); 401 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
402 if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
403 printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n",
404 nilfs->ns_inode_size);
405 return -EINVAL;
406 } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
407 printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n",
408 nilfs->ns_inode_size);
409 return -EINVAL;
410 }
411
402 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); 412 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
403 413
404 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 414 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index dc638f786d5c..ee9cb3795c2b 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -60,8 +60,8 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
60} 60}
61 61
62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
63static int fanotify_get_response_from_access(struct fsnotify_group *group, 63static int fanotify_get_response(struct fsnotify_group *group,
64 struct fanotify_event_info *event) 64 struct fanotify_perm_event_info *event)
65{ 65{
66 int ret; 66 int ret;
67 67
@@ -142,6 +142,40 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
142 return false; 142 return false;
143} 143}
144 144
145struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
146 struct path *path)
147{
148 struct fanotify_event_info *event;
149
150#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
151 if (mask & FAN_ALL_PERM_EVENTS) {
152 struct fanotify_perm_event_info *pevent;
153
154 pevent = kmem_cache_alloc(fanotify_perm_event_cachep,
155 GFP_KERNEL);
156 if (!pevent)
157 return NULL;
158 event = &pevent->fae;
159 pevent->response = 0;
160 goto init;
161 }
162#endif
163 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
164 if (!event)
165 return NULL;
166init: __maybe_unused
167 fsnotify_init_event(&event->fse, inode, mask);
168 event->tgid = get_pid(task_tgid(current));
169 if (path) {
170 event->path = *path;
171 path_get(&event->path);
172 } else {
173 event->path.mnt = NULL;
174 event->path.dentry = NULL;
175 }
176 return event;
177}
178
145static int fanotify_handle_event(struct fsnotify_group *group, 179static int fanotify_handle_event(struct fsnotify_group *group,
146 struct inode *inode, 180 struct inode *inode,
147 struct fsnotify_mark *inode_mark, 181 struct fsnotify_mark *inode_mark,
@@ -171,25 +205,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
171 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, 205 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
172 mask); 206 mask);
173 207
174 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); 208 event = fanotify_alloc_event(inode, mask, data);
175 if (unlikely(!event)) 209 if (unlikely(!event))
176 return -ENOMEM; 210 return -ENOMEM;
177 211
178 fsn_event = &event->fse; 212 fsn_event = &event->fse;
179 fsnotify_init_event(fsn_event, inode, mask);
180 event->tgid = get_pid(task_tgid(current));
181 if (data_type == FSNOTIFY_EVENT_PATH) {
182 struct path *path = data;
183 event->path = *path;
184 path_get(&event->path);
185 } else {
186 event->path.mnt = NULL;
187 event->path.dentry = NULL;
188 }
189#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
190 event->response = 0;
191#endif
192
193 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); 213 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge);
194 if (ret) { 214 if (ret) {
195 /* Permission events shouldn't be merged */ 215 /* Permission events shouldn't be merged */
@@ -202,7 +222,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
202 222
203#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 223#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
204 if (mask & FAN_ALL_PERM_EVENTS) { 224 if (mask & FAN_ALL_PERM_EVENTS) {
205 ret = fanotify_get_response_from_access(group, event); 225 ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event));
206 fsnotify_destroy_event(group, fsn_event); 226 fsnotify_destroy_event(group, fsn_event);
207 } 227 }
208#endif 228#endif
@@ -225,6 +245,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
225 event = FANOTIFY_E(fsn_event); 245 event = FANOTIFY_E(fsn_event);
226 path_put(&event->path); 246 path_put(&event->path);
227 put_pid(event->tgid); 247 put_pid(event->tgid);
248#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
249 if (fsn_event->mask & FAN_ALL_PERM_EVENTS) {
250 kmem_cache_free(fanotify_perm_event_cachep,
251 FANOTIFY_PE(fsn_event));
252 return;
253 }
254#endif
228 kmem_cache_free(fanotify_event_cachep, event); 255 kmem_cache_free(fanotify_event_cachep, event);
229} 256}
230 257
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 32a2f034fb94..2a5fb14115df 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -3,13 +3,12 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4 4
5extern struct kmem_cache *fanotify_event_cachep; 5extern struct kmem_cache *fanotify_event_cachep;
6extern struct kmem_cache *fanotify_perm_event_cachep;
6 7
7/* 8/*
8 * Lifetime of the structure differs for normal and permission events. In both 9 * Structure for normal fanotify events. It gets allocated in
9 * cases the structure is allocated in fanotify_handle_event(). For normal 10 * fanotify_handle_event() and freed when the information is retrieved by
10 * events the structure is freed immediately after reporting it to userspace. 11 * userspace.
11 * For permission events we free it only after we receive response from
12 * userspace.
13 */ 12 */
14struct fanotify_event_info { 13struct fanotify_event_info {
15 struct fsnotify_event fse; 14 struct fsnotify_event fse;
@@ -19,12 +18,33 @@ struct fanotify_event_info {
19 */ 18 */
20 struct path path; 19 struct path path;
21 struct pid *tgid; 20 struct pid *tgid;
21};
22
22#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 23#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
23 u32 response; /* userspace answer to question */ 24/*
24#endif 25 * Structure for permission fanotify events. It gets allocated and freed in
26 * fanotify_handle_event() since we wait there for user response. When the
27 * information is retrieved by userspace the structure is moved from
28 * group->notification_list to group->fanotify_data.access_list to wait for
29 * user response.
30 */
31struct fanotify_perm_event_info {
32 struct fanotify_event_info fae;
33 int response; /* userspace answer to question */
34 int fd; /* fd we passed to userspace for this event */
25}; 35};
26 36
37static inline struct fanotify_perm_event_info *
38FANOTIFY_PE(struct fsnotify_event *fse)
39{
40 return container_of(fse, struct fanotify_perm_event_info, fae.fse);
41}
42#endif
43
27static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) 44static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
28{ 45{
29 return container_of(fse, struct fanotify_event_info, fse); 46 return container_of(fse, struct fanotify_event_info, fse);
30} 47}
48
49struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
50 struct path *path);
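
FANOTIFY_PE() above works because the permission event embeds the generic event (fae.fse) and recovers the wrapper via container_of. The idiom in isolation, with stand-in structure names:

	#include <stddef.h>
	#include <stdio.h>

	/* Same definition the kernel uses, modulo type checking. */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct base_event { int mask; };	/* stands in for fsnotify_event */

	struct perm_event {			/* stands in for fanotify_perm_event_info */
		struct base_event base;		/* embedded, like fae.fse */
		int response;
		int fd;
	};

	int main(void)
	{
		struct perm_event pe = { .base = { .mask = 1 }, .fd = 42 };
		struct base_event *e = &pe.base;

		/* Valid only because e really lives inside a struct perm_event. */
		printf("fd = %d\n", container_of(e, struct perm_event, base)->fd);
		return 0;
	}
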
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 287a22c04149..732648b270dc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -28,14 +28,8 @@
28extern const struct fsnotify_ops fanotify_fsnotify_ops; 28extern const struct fsnotify_ops fanotify_fsnotify_ops;
29 29
30static struct kmem_cache *fanotify_mark_cache __read_mostly; 30static struct kmem_cache *fanotify_mark_cache __read_mostly;
31static struct kmem_cache *fanotify_response_event_cache __read_mostly;
32struct kmem_cache *fanotify_event_cachep __read_mostly; 31struct kmem_cache *fanotify_event_cachep __read_mostly;
33 32struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
34struct fanotify_response_event {
35 struct list_head list;
36 __s32 fd;
37 struct fanotify_event_info *event;
38};
39 33
40/* 34/*
41 * Get an fsnotify notification event if one exists and is small 35 * Get an fsnotify notification event if one exists and is small
@@ -135,33 +129,34 @@ static int fill_event_metadata(struct fsnotify_group *group,
135} 129}
136 130
137#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 131#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
138static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group, 132static struct fanotify_perm_event_info *dequeue_event(
139 __s32 fd) 133 struct fsnotify_group *group, int fd)
140{ 134{
141 struct fanotify_response_event *re, *return_re = NULL; 135 struct fanotify_perm_event_info *event, *return_e = NULL;
142 136
143 mutex_lock(&group->fanotify_data.access_mutex); 137 spin_lock(&group->fanotify_data.access_lock);
144 list_for_each_entry(re, &group->fanotify_data.access_list, list) { 138 list_for_each_entry(event, &group->fanotify_data.access_list,
145 if (re->fd != fd) 139 fae.fse.list) {
140 if (event->fd != fd)
146 continue; 141 continue;
147 142
148 list_del_init(&re->list); 143 list_del_init(&event->fae.fse.list);
149 return_re = re; 144 return_e = event;
150 break; 145 break;
151 } 146 }
152 mutex_unlock(&group->fanotify_data.access_mutex); 147 spin_unlock(&group->fanotify_data.access_lock);
153 148
154 pr_debug("%s: found return_re=%p\n", __func__, return_re); 149 pr_debug("%s: found return_e=%p\n", __func__, return_e);
155 150
156 return return_re; 151 return return_e;
157} 152}
158 153
159static int process_access_response(struct fsnotify_group *group, 154static int process_access_response(struct fsnotify_group *group,
160 struct fanotify_response *response_struct) 155 struct fanotify_response *response_struct)
161{ 156{
162 struct fanotify_response_event *re; 157 struct fanotify_perm_event_info *event;
163 __s32 fd = response_struct->fd; 158 int fd = response_struct->fd;
164 __u32 response = response_struct->response; 159 int response = response_struct->response;
165 160
166 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, 161 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
167 fd, response); 162 fd, response);
@@ -181,58 +176,15 @@ static int process_access_response(struct fsnotify_group *group,
181 if (fd < 0) 176 if (fd < 0)
182 return -EINVAL; 177 return -EINVAL;
183 178
184 re = dequeue_re(group, fd); 179 event = dequeue_event(group, fd);
185 if (!re) 180 if (!event)
186 return -ENOENT; 181 return -ENOENT;
187 182
188 re->event->response = response; 183 event->response = response;
189
190 wake_up(&group->fanotify_data.access_waitq); 184 wake_up(&group->fanotify_data.access_waitq);
191 185
192 kmem_cache_free(fanotify_response_event_cache, re);
193
194 return 0;
195}
196
197static int prepare_for_access_response(struct fsnotify_group *group,
198 struct fsnotify_event *event,
199 __s32 fd)
200{
201 struct fanotify_response_event *re;
202
203 if (!(event->mask & FAN_ALL_PERM_EVENTS))
204 return 0;
205
206 re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
207 if (!re)
208 return -ENOMEM;
209
210 re->event = FANOTIFY_E(event);
211 re->fd = fd;
212
213 mutex_lock(&group->fanotify_data.access_mutex);
214
215 if (atomic_read(&group->fanotify_data.bypass_perm)) {
216 mutex_unlock(&group->fanotify_data.access_mutex);
217 kmem_cache_free(fanotify_response_event_cache, re);
218 FANOTIFY_E(event)->response = FAN_ALLOW;
219 return 0;
220 }
221
222 list_add_tail(&re->list, &group->fanotify_data.access_list);
223 mutex_unlock(&group->fanotify_data.access_mutex);
224
225 return 0;
226}
227
228#else
229static int prepare_for_access_response(struct fsnotify_group *group,
230 struct fsnotify_event *event,
231 __s32 fd)
232{
233 return 0; 186 return 0;
234} 187}
235
236#endif 188#endif
237 189
238static ssize_t copy_event_to_user(struct fsnotify_group *group, 190static ssize_t copy_event_to_user(struct fsnotify_group *group,
@@ -247,7 +199,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
247 199
248 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); 200 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
249 if (ret < 0) 201 if (ret < 0)
250 goto out; 202 return ret;
251 203
252 fd = fanotify_event_metadata.fd; 204 fd = fanotify_event_metadata.fd;
253 ret = -EFAULT; 205 ret = -EFAULT;
@@ -255,9 +207,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
255 fanotify_event_metadata.event_len)) 207 fanotify_event_metadata.event_len))
256 goto out_close_fd; 208 goto out_close_fd;
257 209
258 ret = prepare_for_access_response(group, event, fd); 210#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
259 if (ret) 211 if (event->mask & FAN_ALL_PERM_EVENTS)
260 goto out_close_fd; 212 FANOTIFY_PE(event)->fd = fd;
213#endif
261 214
262 if (fd != FAN_NOFD) 215 if (fd != FAN_NOFD)
263 fd_install(fd, f); 216 fd_install(fd, f);
@@ -268,13 +221,6 @@ out_close_fd:
268 put_unused_fd(fd); 221 put_unused_fd(fd);
269 fput(f); 222 fput(f);
270 } 223 }
271out:
272#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
273 if (event->mask & FAN_ALL_PERM_EVENTS) {
274 FANOTIFY_E(event)->response = FAN_DENY;
275 wake_up(&group->fanotify_data.access_waitq);
276 }
277#endif
278 return ret; 224 return ret;
279} 225}
280 226
@@ -314,35 +260,50 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
314 kevent = get_one_event(group, count); 260 kevent = get_one_event(group, count);
315 mutex_unlock(&group->notification_mutex); 261 mutex_unlock(&group->notification_mutex);
316 262
317 if (kevent) { 263 if (IS_ERR(kevent)) {
318 ret = PTR_ERR(kevent); 264 ret = PTR_ERR(kevent);
319 if (IS_ERR(kevent)) 265 break;
266 }
267
268 if (!kevent) {
269 ret = -EAGAIN;
270 if (file->f_flags & O_NONBLOCK)
320 break; 271 break;
321 ret = copy_event_to_user(group, kevent, buf); 272
322 /* 273 ret = -ERESTARTSYS;
323 * Permission events get destroyed after we 274 if (signal_pending(current))
324 * receive response 275 break;
325 */ 276
326 if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) 277 if (start != buf)
327 fsnotify_destroy_event(group, kevent);
328 if (ret < 0)
329 break; 278 break;
330 buf += ret; 279 schedule();
331 count -= ret;
332 continue; 280 continue;
333 } 281 }
334 282
335 ret = -EAGAIN; 283 ret = copy_event_to_user(group, kevent, buf);
336 if (file->f_flags & O_NONBLOCK) 284 /*
337 break; 285 * Permission events get queued to wait for response. Other
338 ret = -ERESTARTSYS; 286 * events can be destroyed now.
339 if (signal_pending(current)) 287 */
340 break; 288 if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) {
341 289 fsnotify_destroy_event(group, kevent);
342 if (start != buf) 290 if (ret < 0)
343 break; 291 break;
344 292 } else {
345 schedule(); 293#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
294 if (ret < 0) {
295 FANOTIFY_PE(kevent)->response = FAN_DENY;
296 wake_up(&group->fanotify_data.access_waitq);
297 break;
298 }
299 spin_lock(&group->fanotify_data.access_lock);
300 list_add_tail(&kevent->list,
301 &group->fanotify_data.access_list);
302 spin_unlock(&group->fanotify_data.access_lock);
303#endif
304 }
305 buf += ret;
306 count -= ret;
346 } 307 }
347 308
348 finish_wait(&group->notification_waitq, &wait); 309 finish_wait(&group->notification_waitq, &wait);
@@ -383,22 +344,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
383 struct fsnotify_group *group = file->private_data; 344 struct fsnotify_group *group = file->private_data;
384 345
385#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 346#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
386 struct fanotify_response_event *re, *lre; 347 struct fanotify_perm_event_info *event, *next;
387 348
388 mutex_lock(&group->fanotify_data.access_mutex); 349 spin_lock(&group->fanotify_data.access_lock);
389 350
390 atomic_inc(&group->fanotify_data.bypass_perm); 351 atomic_inc(&group->fanotify_data.bypass_perm);
391 352
392 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { 353 list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
393 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, 354 fae.fse.list) {
394 re, re->event); 355 pr_debug("%s: found group=%p event=%p\n", __func__, group,
356 event);
395 357
396 list_del_init(&re->list); 358 list_del_init(&event->fae.fse.list);
397 re->event->response = FAN_ALLOW; 359 event->response = FAN_ALLOW;
398
399 kmem_cache_free(fanotify_response_event_cache, re);
400 } 360 }
401 mutex_unlock(&group->fanotify_data.access_mutex); 361 spin_unlock(&group->fanotify_data.access_lock);
402 362
403 wake_up(&group->fanotify_data.access_waitq); 363 wake_up(&group->fanotify_data.access_waitq);
404#endif 364#endif
@@ -731,21 +691,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
731 group->fanotify_data.user = user; 691 group->fanotify_data.user = user;
732 atomic_inc(&user->fanotify_listeners); 692 atomic_inc(&user->fanotify_listeners);
733 693
734 oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); 694 oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL);
735 if (unlikely(!oevent)) { 695 if (unlikely(!oevent)) {
736 fd = -ENOMEM; 696 fd = -ENOMEM;
737 goto out_destroy_group; 697 goto out_destroy_group;
738 } 698 }
739 group->overflow_event = &oevent->fse; 699 group->overflow_event = &oevent->fse;
740 fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
741 oevent->tgid = get_pid(task_tgid(current));
742 oevent->path.mnt = NULL;
743 oevent->path.dentry = NULL;
744 700
701 if (force_o_largefile())
702 event_f_flags |= O_LARGEFILE;
745 group->fanotify_data.f_flags = event_f_flags; 703 group->fanotify_data.f_flags = event_f_flags;
746#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 704#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
747 oevent->response = 0; 705 spin_lock_init(&group->fanotify_data.access_lock);
748 mutex_init(&group->fanotify_data.access_mutex);
749 init_waitqueue_head(&group->fanotify_data.access_waitq); 706 init_waitqueue_head(&group->fanotify_data.access_waitq);
750 INIT_LIST_HEAD(&group->fanotify_data.access_list); 707 INIT_LIST_HEAD(&group->fanotify_data.access_list);
751 atomic_set(&group->fanotify_data.bypass_perm, 0); 708 atomic_set(&group->fanotify_data.bypass_perm, 0);
@@ -920,9 +877,11 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
920static int __init fanotify_user_setup(void) 877static int __init fanotify_user_setup(void)
921{ 878{
922 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); 879 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
923 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
924 SLAB_PANIC);
925 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); 880 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
881#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
882 fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info,
883 SLAB_PANIC);
884#endif
926 885
927 return 0; 886 return 0;
928} 887}
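
On the userspace side, the reworked permission-event path is exercised roughly as follows; a minimal sketch against the standard sys/fanotify.h API (needs CAP_SYS_ADMIN, and the watched path is illustrative). The write of struct fanotify_response is what process_access_response() above consumes to wake the blocked opener:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/fanotify.h>

	int main(void)
	{
		char buf[4096];
		int fan = fanotify_init(FAN_CLASS_CONTENT, O_RDONLY);

		if (fan < 0)
			return 1;
		if (fanotify_mark(fan, FAN_MARK_ADD, FAN_OPEN_PERM,
				  AT_FDCWD, "/tmp") < 0)
			return 1;

		for (;;) {
			ssize_t len = read(fan, buf, sizeof(buf));
			struct fanotify_event_metadata *md = (void *)buf;

			for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
				if (md->mask & FAN_OPEN_PERM) {
					struct fanotify_response resp = {
						.fd = md->fd,
						.response = FAN_ALLOW,
					};
					/* Answer the permission event. */
					write(fan, &resp, sizeof(resp));
				}
				if (md->fd >= 0)
					close(md->fd);
			}
		}
	}
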
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index 807150e2c2b9..dd6103cc93c1 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -18,16 +18,9 @@
18 * distribution in the file COPYING); if not, write to the Free Software 18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22#include "debug.h" 22#include "debug.h"
23 23
24/*
25 * A static buffer to hold the error string being displayed and a spinlock
26 * to protect concurrent accesses to it.
27 */
28static char err_buf[1024];
29static DEFINE_SPINLOCK(err_buf_lock);
30
31/** 24/**
32 * __ntfs_warning - output a warning to the syslog 25 * __ntfs_warning - output a warning to the syslog
33 * @function: name of function outputting the warning 26 * @function: name of function outputting the warning
@@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock);
50void __ntfs_warning(const char *function, const struct super_block *sb, 43void __ntfs_warning(const char *function, const struct super_block *sb,
51 const char *fmt, ...) 44 const char *fmt, ...)
52{ 45{
46 struct va_format vaf;
53 va_list args; 47 va_list args;
54 int flen = 0; 48 int flen = 0;
55 49
@@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
59#endif 53#endif
60 if (function) 54 if (function)
61 flen = strlen(function); 55 flen = strlen(function);
62 spin_lock(&err_buf_lock);
63 va_start(args, fmt); 56 va_start(args, fmt);
64 vsnprintf(err_buf, sizeof(err_buf), fmt, args); 57 vaf.fmt = fmt;
65 va_end(args); 58 vaf.va = &args;
66 if (sb) 59 if (sb)
67 printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n", 60 pr_warn("(device %s): %s(): %pV\n",
68 sb->s_id, flen ? function : "", err_buf); 61 sb->s_id, flen ? function : "", &vaf);
69 else 62 else
70 printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", 63 pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
71 flen ? function : "", err_buf); 64 va_end(args);
72 spin_unlock(&err_buf_lock);
73} 65}
74 66
75/** 67/**
@@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
94void __ntfs_error(const char *function, const struct super_block *sb, 86void __ntfs_error(const char *function, const struct super_block *sb,
95 const char *fmt, ...) 87 const char *fmt, ...)
96{ 88{
89 struct va_format vaf;
97 va_list args; 90 va_list args;
98 int flen = 0; 91 int flen = 0;
99 92
@@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb,
103#endif 96#endif
104 if (function) 97 if (function)
105 flen = strlen(function); 98 flen = strlen(function);
106 spin_lock(&err_buf_lock);
107 va_start(args, fmt); 99 va_start(args, fmt);
108 vsnprintf(err_buf, sizeof(err_buf), fmt, args); 100 vaf.fmt = fmt;
109 va_end(args); 101 vaf.va = &args;
110 if (sb) 102 if (sb)
111 printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n", 103 pr_err("(device %s): %s(): %pV\n",
112 sb->s_id, flen ? function : "", err_buf); 104 sb->s_id, flen ? function : "", &vaf);
113 else 105 else
114 printk(KERN_ERR "NTFS-fs error: %s(): %s\n", 106 pr_err("%s(): %pV\n", flen ? function : "", &vaf);
115 flen ? function : "", err_buf); 107 va_end(args);
116 spin_unlock(&err_buf_lock);
117} 108}
118 109
119#ifdef DEBUG 110#ifdef DEBUG
@@ -124,6 +115,7 @@ int debug_msgs = 0;
124void __ntfs_debug (const char *file, int line, const char *function, 115void __ntfs_debug (const char *file, int line, const char *function,
125 const char *fmt, ...) 116 const char *fmt, ...)
126{ 117{
118 struct va_format vaf;
127 va_list args; 119 va_list args;
128 int flen = 0; 120 int flen = 0;
129 121
@@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function,
131 return; 123 return;
132 if (function) 124 if (function)
133 flen = strlen(function); 125 flen = strlen(function);
134 spin_lock(&err_buf_lock);
135 va_start(args, fmt); 126 va_start(args, fmt);
136 vsnprintf(err_buf, sizeof(err_buf), fmt, args); 127 vaf.fmt = fmt;
128 vaf.va = &args;
129 pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
137 va_end(args); 130 va_end(args);
138 printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line,
139 flen ? function : "", err_buf);
140 spin_unlock(&err_buf_lock);
141} 131}
142 132
143/* Dump a runlist. Caller has to provide synchronisation for @rl. */ 133/* Dump a runlist. Caller has to provide synchronisation for @rl. */
@@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
149 139
150 if (!debug_msgs) 140 if (!debug_msgs)
151 return; 141 return;
152 printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n"); 142 pr_debug("Dumping runlist (values in hex):\n");
153 if (!rl) { 143 if (!rl) {
154 printk(KERN_DEBUG "Run list not present.\n"); 144 pr_debug("Run list not present.\n");
155 return; 145 return;
156 } 146 }
157 printk(KERN_DEBUG "VCN LCN Run length\n"); 147 pr_debug("VCN LCN Run length\n");
158 for (i = 0; ; i++) { 148 for (i = 0; ; i++) {
159 LCN lcn = (rl + i)->lcn; 149 LCN lcn = (rl + i)->lcn;
160 150
@@ -163,13 +153,13 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
163 153
164 if (index > -LCN_ENOENT - 1) 154 if (index > -LCN_ENOENT - 1)
165 index = 3; 155 index = 3;
166 printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", 156 pr_debug("%-16Lx %s %-16Lx%s\n",
167 (long long)(rl + i)->vcn, lcn_str[index], 157 (long long)(rl + i)->vcn, lcn_str[index],
168 (long long)(rl + i)->length, 158 (long long)(rl + i)->length,
169 (rl + i)->length ? "" : 159 (rl + i)->length ? "" :
170 " (runlist end)"); 160 " (runlist end)");
171 } else 161 } else
172 printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", 162 pr_debug("%-16Lx %-16Lx %-16Lx%s\n",
173 (long long)(rl + i)->vcn, 163 (long long)(rl + i)->vcn,
174 (long long)(rl + i)->lcn, 164 (long long)(rl + i)->lcn,
175 (long long)(rl + i)->length, 165 (long long)(rl + i)->length,
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 53c27eaf2307..61bf091e32a8 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -48,7 +48,12 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl);
48 48
49#else /* !DEBUG */ 49#else /* !DEBUG */
50 50
51#define ntfs_debug(f, a...) do {} while (0) 51#define ntfs_debug(fmt, ...) \
52do { \
53 if (0) \
54 no_printk(fmt, ##__VA_ARGS__); \
55} while (0)
56
52#define ntfs_debug_dump_runlist(rl) do {} while (0) 57#define ntfs_debug_dump_runlist(rl) do {} while (0)
53 58
54#endif /* !DEBUG */ 59#endif /* !DEBUG */
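
Two idioms carry the debug.c conversion above: struct va_format plus the %pV specifier let a single printk forward a caller's varargs format, and the if (0) no_printk() stub keeps format-string type checking alive in non-DEBUG builds. A kernel-style sketch of the first; the function name is made up, not patch code:

	/* Forward a caller's varargs format through one printk via %pV. */
	static void my_fs_warn(const struct super_block *sb, const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* printk expands %pV by formatting vaf.fmt against vaf.va. */
		pr_warn("(device %s): %pV\n", sb ? sb->s_id : "?", &vaf);
		va_end(args);
	}
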
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index ffb9b3675736..f47af5e6e230 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
1704 iput(bvi); 1704 iput(bvi);
1705skip_large_index_stuff: 1705skip_large_index_stuff:
1706 /* Setup the operations for this index inode. */ 1706 /* Setup the operations for this index inode. */
1707 vi->i_op = NULL;
1708 vi->i_fop = NULL;
1709 vi->i_mapping->a_ops = &ntfs_mst_aops; 1707 vi->i_mapping->a_ops = &ntfs_mst_aops;
1710 vi->i_blocks = ni->allocated_size >> 9; 1708 vi->i_blocks = ni->allocated_size >> 9;
1711 /* 1709 /*
@@ -2259,7 +2257,7 @@ void ntfs_evict_big_inode(struct inode *vi)
2259{ 2257{
2260 ntfs_inode *ni = NTFS_I(vi); 2258 ntfs_inode *ni = NTFS_I(vi);
2261 2259
2262 truncate_inode_pages(&vi->i_data, 0); 2260 truncate_inode_pages_final(&vi->i_data);
2263 clear_inode(vi); 2261 clear_inode(vi);
2264 2262
2265#ifdef NTFS_RW 2263#ifdef NTFS_RW
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 82650d52d916..9de2491f2926 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -19,6 +19,7 @@
19 * distribution in the file COPYING); if not, write to the Free Software 19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */ 21 */
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22 23
23#include <linux/stddef.h> 24#include <linux/stddef.h>
24#include <linux/init.h> 25#include <linux/init.h>
@@ -468,6 +469,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
468 469
469 ntfs_debug("Entering with remount options string: %s", opt); 470 ntfs_debug("Entering with remount options string: %s", opt);
470 471
472 sync_filesystem(sb);
473
471#ifndef NTFS_RW 474#ifndef NTFS_RW
472 /* For read-only compiled driver, enforce read-only flag. */ 475 /* For read-only compiled driver, enforce read-only flag. */
473 *flags |= MS_RDONLY; 476 *flags |= MS_RDONLY;
@@ -1894,7 +1897,7 @@ get_ctx_vol_failed:
1894 vol->minor_ver = vi->minor_ver; 1897 vol->minor_ver = vi->minor_ver;
1895 ntfs_attr_put_search_ctx(ctx); 1898 ntfs_attr_put_search_ctx(ctx);
1896 unmap_mft_record(NTFS_I(vol->vol_ino)); 1899 unmap_mft_record(NTFS_I(vol->vol_ino));
1897 printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, 1900 pr_info("volume version %i.%i.\n", vol->major_ver,
1898 vol->minor_ver); 1901 vol->minor_ver);
1899 if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { 1902 if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
1900 ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " 1903 ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
@@ -3093,7 +3096,7 @@ static int __init init_ntfs_fs(void)
3093 int err = 0; 3096 int err = 0;
3094 3097
3095 /* This may be ugly but it results in pretty output so who cares. (-8 */ 3098 /* This may be ugly but it results in pretty output so who cares. (-8 */
3096 printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/" 3099 pr_info("driver " NTFS_VERSION " [Flags: R/"
3097#ifdef NTFS_RW 3100#ifdef NTFS_RW
3098 "W" 3101 "W"
3099#else 3102#else
@@ -3113,16 +3116,15 @@ static int __init init_ntfs_fs(void)
3113 sizeof(ntfs_index_context), 0 /* offset */, 3116 sizeof(ntfs_index_context), 0 /* offset */,
3114 SLAB_HWCACHE_ALIGN, NULL /* ctor */); 3117 SLAB_HWCACHE_ALIGN, NULL /* ctor */);
3115 if (!ntfs_index_ctx_cache) { 3118 if (!ntfs_index_ctx_cache) {
3116 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3119 pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
3117 ntfs_index_ctx_cache_name);
3118 goto ictx_err_out; 3120 goto ictx_err_out;
3119 } 3121 }
3120 ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, 3122 ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
3121 sizeof(ntfs_attr_search_ctx), 0 /* offset */, 3123 sizeof(ntfs_attr_search_ctx), 0 /* offset */,
3122 SLAB_HWCACHE_ALIGN, NULL /* ctor */); 3124 SLAB_HWCACHE_ALIGN, NULL /* ctor */);
3123 if (!ntfs_attr_ctx_cache) { 3125 if (!ntfs_attr_ctx_cache) {
3124 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3126 pr_crit("Failed to create %s!\n",
3125 ntfs_attr_ctx_cache_name); 3127 ntfs_attr_ctx_cache_name);
3126 goto actx_err_out; 3128 goto actx_err_out;
3127 } 3129 }
3128 3130
@@ -3130,8 +3132,7 @@ static int __init init_ntfs_fs(void)
3130 (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, 3132 (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
3131 SLAB_HWCACHE_ALIGN, NULL); 3133 SLAB_HWCACHE_ALIGN, NULL);
3132 if (!ntfs_name_cache) { 3134 if (!ntfs_name_cache) {
3133 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3135 pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
3134 ntfs_name_cache_name);
3135 goto name_err_out; 3136 goto name_err_out;
3136 } 3137 }
3137 3138
@@ -3139,8 +3140,7 @@ static int __init init_ntfs_fs(void)
3139 sizeof(ntfs_inode), 0, 3140 sizeof(ntfs_inode), 0,
3140 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 3141 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
3141 if (!ntfs_inode_cache) { 3142 if (!ntfs_inode_cache) {
3142 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3143 pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
3143 ntfs_inode_cache_name);
3144 goto inode_err_out; 3144 goto inode_err_out;
3145 } 3145 }
3146 3146
@@ -3149,15 +3149,14 @@ static int __init init_ntfs_fs(void)
3149 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, 3149 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
3150 ntfs_big_inode_init_once); 3150 ntfs_big_inode_init_once);
3151 if (!ntfs_big_inode_cache) { 3151 if (!ntfs_big_inode_cache) {
3152 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3152 pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
3153 ntfs_big_inode_cache_name);
3154 goto big_inode_err_out; 3153 goto big_inode_err_out;
3155 } 3154 }
3156 3155
3157 /* Register the ntfs sysctls. */ 3156 /* Register the ntfs sysctls. */
3158 err = ntfs_sysctl(1); 3157 err = ntfs_sysctl(1);
3159 if (err) { 3158 if (err) {
3160 printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n"); 3159 pr_crit("Failed to register NTFS sysctls!\n");
3161 goto sysctl_err_out; 3160 goto sysctl_err_out;
3162 } 3161 }
3163 3162
@@ -3166,7 +3165,7 @@ static int __init init_ntfs_fs(void)
3166 ntfs_debug("NTFS driver registered successfully."); 3165 ntfs_debug("NTFS driver registered successfully.");
3167 return 0; /* Success! */ 3166 return 0; /* Success! */
3168 } 3167 }
3169 printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); 3168 pr_crit("Failed to register NTFS filesystem driver!\n");
3170 3169
3171 /* Unregister the ntfs sysctls. */ 3170 /* Unregister the ntfs sysctls. */
3172 ntfs_sysctl(0); 3171 ntfs_sysctl(0);
@@ -3182,8 +3181,7 @@ actx_err_out:
3182 kmem_cache_destroy(ntfs_index_ctx_cache); 3181 kmem_cache_destroy(ntfs_index_ctx_cache);
3183ictx_err_out: 3182ictx_err_out:
3184 if (!err) { 3183 if (!err) {
3185 printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " 3184 pr_crit("Aborting NTFS filesystem driver registration...\n");
3186 "registration...\n");
3187 err = -ENOMEM; 3185 err = -ENOMEM;
3188 } 3186 }
3189 return err; 3187 return err;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 555f4cddefe3..7e8282dcea2a 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -205,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
 	di->i_mode = cpu_to_le16(inode->i_mode);
 	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e2edff38be52..b4deb5f750d9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5728,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 	}
 
 	ocfs2_et_update_clusters(et, -len);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 
 	ocfs2_journal_dirty(handle, et->et_root_bh);
 
@@ -6932,6 +6933,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
 
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_dinode_new_extent_list(inode, di);
 
 	ocfs2_journal_dirty(handle, di_bh);
@@ -7208,6 +7210,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_journal_dirty(handle, di_bh);
 
 out_commit:
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index aeb44e879c51..d310d12a9adc 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -571,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	int level;
-	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
 
 	/* this io's submitter should not have unlocked this before we could */
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -582,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	if (ocfs2_iocb_is_unaligned_aio(iocb)) {
 		ocfs2_iocb_clear_unaligned_aio(iocb);
 
-		if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
-		    waitqueue_active(wq)) {
-			wake_up_all(wq);
-		}
+		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
 	}
 
 	ocfs2_iocb_clear_rw_locked(iocb);
@@ -2043,6 +2039,7 @@ out_write_size:
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_journal_dirty(handle, wc->w_di_bh);
 
 	ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f671e49beb34..6cae155d54df 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits {
 #define ocfs2_iocb_is_unaligned_aio(iocb) \
 	test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
 
-#define OCFS2_IOEND_WQ_HASH_SZ 37
-#define ocfs2_ioend_wq(v)   (&ocfs2__ioend_wq[((unsigned long)(v)) %\
-					    OCFS2_IOEND_WQ_HASH_SZ])
-extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
-
 #endif /* OCFS2_FILE_H */
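
Note on the aops.c and aops.h hunks above: the per-inode serialization of unaligned AIO moves from an atomic counter plus a hashed wait queue to a plain mutex (ip_unaligned_aio becomes a struct mutex in the fs/ocfs2/inode.h hunk later in this diff). For reference, the deleted macro implemented the hashing idiom sketched below, which avoids embedding a wait queue in every inode at the cost of unrelated inodes occasionally sharing a queue; the names here mirror the removed definitions.

	#include <linux/wait.h>

	/* Sketch of the removed pattern: a fixed array of wait queues,
	 * indexed by hashing the inode pointer. */
	#define IOEND_WQ_HASH_SZ 37
	static wait_queue_head_t ioend_wq[IOEND_WQ_HASH_SZ];

	static inline wait_queue_head_t *ioend_wq_for(const void *inode)
	{
		return &ioend_wq[(unsigned long)inode % IOEND_WQ_HASH_SZ];
	}
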
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 5b704c63a103..1edcb141f639 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		 * information for this bh as it's not marked locally
 		 * uptodate. */
 		ret = -EIO;
-		put_bh(bh);
 		mlog_errno(ret);
 	}
 
@@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 
 	if (!buffer_uptodate(bh)) {
 		ret = -EIO;
-		put_bh(bh);
 		mlog_errno(ret);
 	}
 
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index a4b07730b2e1..b7f57271d49c 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
 	return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
 }
 static struct kobj_attribute attr_version =
-	__ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
+	__ATTR(interface_revision, S_IRUGO, version_show, NULL);
 
 static struct attribute *o2cb_attrs[] = {
 	&attr_version.attr,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2cd2406b4140..c6b90e670389 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] =
 static void o2net_sc_connect_completed(struct work_struct *work);
 static void o2net_rx_until_empty(struct work_struct *work);
 static void o2net_shutdown_sc(struct work_struct *work);
-static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_listen_data_ready(struct sock *sk);
 static void o2net_sc_send_keep_req(struct work_struct *work);
 static void o2net_idle_timer(unsigned long data);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
@@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc)
 
 #endif /* CONFIG_OCFS2_FS_STATS */
 
-static inline int o2net_reconnect_delay(void)
+static inline unsigned int o2net_reconnect_delay(void)
 {
 	return o2nm_single_cluster->cl_reconnect_delay_ms;
 }
 
-static inline int o2net_keepalive_delay(void)
+static inline unsigned int o2net_keepalive_delay(void)
 {
 	return o2nm_single_cluster->cl_keepalive_delay_ms;
 }
 
-static inline int o2net_idle_timeout(void)
+static inline unsigned int o2net_idle_timeout(void)
 {
 	return o2nm_single_cluster->cl_idle_timeout_ms;
 }
@@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 }
 
 /* see o2net_register_callbacks() */
-static void o2net_data_ready(struct sock *sk, int bytes)
+static void o2net_data_ready(struct sock *sk)
 {
-	void (*ready)(struct sock *sk, int bytes);
+	void (*ready)(struct sock *sk);
 
 	read_lock(&sk->sk_callback_lock);
 	if (sk->sk_user_data) {
@@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
 	}
 	read_unlock(&sk->sk_callback_lock);
 
-	ready(sk, bytes);
+	ready(sk);
 }
 
 /* see o2net_register_callbacks() */
@@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
 
 static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
 {
-	int ret;
-	mm_segment_t oldfs;
-	struct kvec vec = {
-		.iov_len = len,
-		.iov_base = data,
-	};
-	struct msghdr msg = {
-		.msg_iovlen = 1,
-		.msg_iov = (struct iovec *)&vec,
-		.msg_flags = MSG_DONTWAIT,
-	};
-
-	oldfs = get_fs();
-	set_fs(get_ds());
-	ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
-	set_fs(oldfs);
-
-	return ret;
+	struct kvec vec = { .iov_len = len, .iov_base = data, };
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+	return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
 }
 
 static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
 			      size_t veclen, size_t total)
 {
 	int ret;
-	mm_segment_t oldfs;
-	struct msghdr msg = {
-		.msg_iov = (struct iovec *)vec,
-		.msg_iovlen = veclen,
-	};
+	struct msghdr msg;
 
 	if (sock == NULL) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	oldfs = get_fs();
-	set_fs(get_ds());
-	ret = sock_sendmsg(sock, &msg, total);
-	set_fs(oldfs);
-	if (ret != total) {
-		mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
-		     total);
-		if (ret >= 0)
-			ret = -EPIPE; /* should be smarter, I bet */
-		goto out;
-	}
-
-	ret = 0;
+	ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+	if (likely(ret == total))
+		return 0;
+	mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+	if (ret >= 0)
+		ret = -EPIPE; /* should be smarter, I bet */
 out:
-	if (ret < 0)
-		mlog(0, "returning error: %d\n", ret);
+	mlog(0, "returning error: %d\n", ret);
 	return ret;
 }
 
@@ -1953,9 +1926,9 @@ static void o2net_accept_many(struct work_struct *work)
 	cond_resched();
 }
 
-static void o2net_listen_data_ready(struct sock *sk, int bytes)
+static void o2net_listen_data_ready(struct sock *sk)
 {
-	void (*ready)(struct sock *sk, int bytes);
+	void (*ready)(struct sock *sk);
 
 	read_lock(&sk->sk_callback_lock);
 	ready = sk->sk_user_data;
@@ -1964,18 +1937,29 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
 		goto out;
 	}
 
-	/* ->sk_data_ready is also called for a newly established child socket
-	 * before it has been accepted and the acceptor has set up their
-	 * data_ready.. we only want to queue listen work for our listening
-	 * socket */
+	/* This callback may called twice when a new connection
+	 * is being established as a child socket inherits everything
+	 * from a parent LISTEN socket, including the data_ready cb of
+	 * the parent. This leads to a hazard. In o2net_accept_one()
+	 * we are still initializing the child socket but have not
+	 * changed the inherited data_ready callback yet when
+	 * data starts arriving.
+	 * We avoid this hazard by checking the state.
+	 * For the listening socket,  the state will be TCP_LISTEN; for the new
+	 * socket, will be  TCP_ESTABLISHED. Also, in this case,
+	 * sk->sk_user_data is not a valid function pointer.
+	 */
+
 	if (sk->sk_state == TCP_LISTEN) {
-		mlog(ML_TCP, "bytes: %d\n", bytes);
 		queue_work(o2net_wq, &o2net_listen_work);
+	} else {
+		ready = NULL;
 	}
 
 out:
 	read_unlock(&sk->sk_callback_lock);
-	ready(sk, bytes);
+	if (ready != NULL)
+		ready(sk);
 }
 
 static int o2net_open_listening_sock(__be32 addr, __be16 port)
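
Note on the o2net_recv_tcp_msg()/o2net_send_tcp_msg() conversion above: kernel_recvmsg() and kernel_sendmsg() take kvec arrays pointing at kernel memory, so the open-coded get_fs()/set_fs(get_ds()) address-limit switching becomes unnecessary. A condensed sketch of the resulting pattern (hypothetical helper names; the bodies mirror the hunks above):

	#include <linux/net.h>
	#include <linux/socket.h>

	static int example_recv(struct socket *sock, void *data, size_t len)
	{
		struct kvec vec = { .iov_base = data, .iov_len = len };
		struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

		/* Non-blocking receive into a kernel buffer. */
		return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
	}

	static int example_send(struct socket *sock, struct kvec *vec,
				size_t veclen, size_t total)
	{
		struct msghdr msg = { };

		/* Returns bytes sent, or a negative errno. */
		return kernel_sendmsg(sock, &msg, vec, veclen, total);
	}
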
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4cbcb65784a3..dc024367110a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -165,7 +165,7 @@ struct o2net_sock_container {
 
 	/* original handlers for the sockets */
 	void			(*sc_state_change)(struct sock *sk);
-	void			(*sc_data_ready)(struct sock *sk, int bytes);
+	void			(*sc_data_ready)(struct sock *sk);
 
 	u32			sc_msg_key;
 	u16			sc_msg_type;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 0d3a97d2d5f6..e2e05a106beb 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -37,7 +37,6 @@
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
-#include "super.h"
 #include "ocfs2_trace.h"
 
 void ocfs2_dentry_attach_gen(struct dentry *dentry)
@@ -346,52 +345,6 @@ out_attach:
 	return ret;
 }
 
-DEFINE_SPINLOCK(dentry_list_lock);
-
-/* We limit the number of dentry locks to drop in one go. We have
- * this limit so that we don't starve other users of ocfs2_wq. */
-#define DL_INODE_DROP_COUNT 64
-
-/* Drop inode references from dentry locks */
-static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
-{
-	struct ocfs2_dentry_lock *dl;
-
-	spin_lock(&dentry_list_lock);
-	while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
-		dl = osb->dentry_lock_list;
-		osb->dentry_lock_list = dl->dl_next;
-		spin_unlock(&dentry_list_lock);
-		iput(dl->dl_inode);
-		kfree(dl);
-		spin_lock(&dentry_list_lock);
-	}
-	spin_unlock(&dentry_list_lock);
-}
-
-void ocfs2_drop_dl_inodes(struct work_struct *work)
-{
-	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
-					       dentry_lock_work);
-
-	__ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
-	/*
-	 * Don't queue dropping if umount is in progress. We flush the
-	 * list in ocfs2_dismount_volume
-	 */
-	spin_lock(&dentry_list_lock);
-	if (osb->dentry_lock_list &&
-	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
-		queue_work(ocfs2_wq, &osb->dentry_lock_work);
-	spin_unlock(&dentry_list_lock);
-}
-
-/* Flush the whole work queue */
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
-{
-	__ocfs2_drop_dl_inodes(osb, -1);
-}
-
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 				   struct ocfs2_dentry_lock *dl)
 {
+	iput(dl->dl_inode);
 	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
 	ocfs2_lock_res_free(&dl->dl_lockres);
-
-	/* We leave dropping of inode reference to ocfs2_wq as that can
-	 * possibly lead to inode deletion which gets tricky */
-	spin_lock(&dentry_list_lock);
-	if (!osb->dentry_lock_list &&
-	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
-		queue_work(ocfs2_wq, &osb->dentry_lock_work);
-	dl->dl_next = osb->dentry_lock_list;
-	osb->dentry_lock_list = dl;
-	spin_unlock(&dentry_list_lock);
+	kfree(dl);
 }
 
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl)
 {
-	int unlock;
+	int unlock = 0;
 
 	BUG_ON(dl->dl_count == 0);
 
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index b79eff709958..55f58892b153 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,13 +29,8 @@
 extern const struct dentry_operations ocfs2_dentry_ops;
 
 struct ocfs2_dentry_lock {
-	/* Use count of dentry lock */
 	unsigned int		dl_count;
-	union {
-		/* Linked list of dentry locks to release */
-		struct ocfs2_dentry_lock *dl_next;
-		u64			dl_parent_blkno;
-	};
+	u64			dl_parent_blkno;
 
 	/*
 	 * The ocfs2_dentry_lock keeps an inode reference until
@@ -49,14 +44,9 @@ struct ocfs2_dentry_lock {
 int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 			     u64 parent_blkno);
 
-extern spinlock_t dentry_list_lock;
-
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl);
 
-void ocfs2_drop_dl_inodes(struct work_struct *work);
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
-
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
 				      int skip_unhashed);
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 91a7e85ac8fd..0717662b4aef 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
 	}
 
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 	ocfs2_journal_dirty(handle, dirdata_bh);
 
 	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3005,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	di->i_size = cpu_to_le64(sb->s_blocksize);
 	di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 
 	/*
 	 * This should never fail as our extent list is empty and all
@@ -3338,6 +3340,7 @@ do_extend:
 	} else {
 		de->rec_len = cpu_to_le16(sb->s_blocksize);
 	}
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 	ocfs2_journal_dirty(handle, new_bh);
 
 	dir_i_size += dir->i_sb->s_blocksize;
@@ -3896,6 +3899,7 @@ out_commit:
 		dquot_free_space_nodirty(dir,
 				ocfs2_clusters_to_bytes(dir->i_sb, 1));
 
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 	ocfs2_commit_trans(osb, handle);
 
 out:
@@ -4134,6 +4138,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
 		mlog_errno(ret);
 	did_quota = 0;
 
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 	ocfs2_journal_dirty(handle, dx_root_bh);
 
 out_commit:
@@ -4401,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
 	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
 	spin_unlock(&OCFS2_I(dir)->ip_lock);
 	di->i_dx_root = cpu_to_le64(0ULL);
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 33660a4a52fa..c973690dc0bc 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1123,7 +1123,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
 	struct dlm_ctxt *dlm = NULL;
 	char *local = NULL;
 	int status = 0;
-	int locked = 0;
 
 	qr = (struct dlm_query_region *) msg->buf;
 
@@ -1132,10 +1131,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
 
 	/* buffer used in dlm_mast_regions() */
 	local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
-	if (!local) {
-		status = -ENOMEM;
-		goto bail;
-	}
+	if (!local)
+		return -ENOMEM;
 
 	status = -EINVAL;
 
@@ -1144,16 +1141,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
 	if (!dlm) {
 		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
 		     "before join domain\n", qr->qr_node, qr->qr_domain);
-		goto bail;
+		goto out_domain_lock;
 	}
 
 	spin_lock(&dlm->spinlock);
-	locked = 1;
 	if (dlm->joining_node != qr->qr_node) {
 		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
 		     "but joining node is %d\n", qr->qr_node, qr->qr_domain,
 		     dlm->joining_node);
-		goto bail;
+		goto out_dlm_lock;
 	}
 
 	/* Support for global heartbeat was added in 1.1 */
@@ -1163,14 +1159,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
 		     "but active dlm protocol is %d.%d\n", qr->qr_node,
 		     qr->qr_domain, dlm->dlm_locking_proto.pv_major,
 		     dlm->dlm_locking_proto.pv_minor);
-		goto bail;
+		goto out_dlm_lock;
 	}
 
 	status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
 
-bail:
-	if (locked)
-		spin_unlock(&dlm->spinlock);
+out_dlm_lock:
+	spin_unlock(&dlm->spinlock);
+
+out_domain_lock:
 	spin_unlock(&dlm_domain_lock);
 
 	kfree(local);
@@ -1877,19 +1874,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
-	status = dlm_debug_init(dlm);
+	status = dlm_launch_thread(dlm);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = dlm_launch_thread(dlm);
+	status = dlm_launch_recovery_thread(dlm);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = dlm_launch_recovery_thread(dlm);
+	status = dlm_debug_init(dlm);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
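
Note on the dlm_query_region_handler() cleanup above: replacing the single "bail" label plus a "locked" flag with one unwind label per lock level is the standard kernel error-handling idiom - each goto target releases exactly the locks held at that point, in reverse acquisition order. A self-contained sketch of the idiom (userspace pthreads stand in for the spinlocks; names are illustrative only):

	#include <pthread.h>

	static pthread_mutex_t domain_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t ctxt_lock = PTHREAD_MUTEX_INITIALIZER;

	static int example(int fail_before_inner, int fail_after_inner)
	{
		int status = 0;

		pthread_mutex_lock(&domain_lock);
		if (fail_before_inner) {
			status = -1;
			goto out_domain_lock;	/* only the outer lock held */
		}

		pthread_mutex_lock(&ctxt_lock);
		if (fail_after_inner) {
			status = -1;
			goto out_ctxt_lock;	/* both locks held */
		}

		/* ... work under both locks; success falls through ... */

	out_ctxt_lock:
		pthread_mutex_unlock(&ctxt_lock);
	out_domain_lock:
		pthread_mutex_unlock(&domain_lock);
		return status;
	}
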
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index af3f7aa73e13..ee1f88419cb0 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -472,11 +472,15 @@ bail:
 
 void dlm_destroy_master_caches(void)
 {
-	if (dlm_lockname_cache)
+	if (dlm_lockname_cache) {
 		kmem_cache_destroy(dlm_lockname_cache);
+		dlm_lockname_cache = NULL;
+	}
 
-	if (dlm_lockres_cache)
+	if (dlm_lockres_cache) {
 		kmem_cache_destroy(dlm_lockres_cache);
+		dlm_lockres_cache = NULL;
+	}
 }
 
 static void dlm_lockres_release(struct kref *kref)
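
Note on the dlm_destroy_master_caches() change above: NULLing the cache pointer after kmem_cache_destroy() makes the teardown helper idempotent, which matters when it can be reached more than once (for instance from an init-failure path and again from module exit); the NULL check then turns the second call into a no-op instead of a double destroy. In sketch form:

	#include <linux/slab.h>

	static struct kmem_cache *example_cache;

	/* Safe to call any number of times. */
	static void example_destroy_caches(void)
	{
		if (example_cache) {
			kmem_cache_destroy(example_cache);
			example_cache = NULL;
		}
	}
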
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7035af09cc03..fe29f7978f81 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -537,7 +537,10 @@ master_here:
 		/* success!  see if any other nodes need recovery */
 		mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
 		     dlm->name, dlm->reco.dead_node, dlm->node_num);
-		dlm_reset_recovery(dlm);
+		spin_lock(&dlm->spinlock);
+		__dlm_reset_recovery(dlm);
+		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+		spin_unlock(&dlm->spinlock);
 	}
 	dlm_end_recovery(dlm);
 
@@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 		if (all_nodes_done) {
 			int ret;
 
+			/* Set this flag on recovery master to avoid
+			 * a new recovery for another dead node start
+			 * before the recovery is not done. That may
+			 * cause recovery hung.*/
+			spin_lock(&dlm->spinlock);
+			dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+			spin_unlock(&dlm->spinlock);
+
 			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
 			 * just send a finalize message to everyone and
 			 * clean up */
@@ -1750,13 +1761,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 				     struct dlm_migratable_lockres *mres)
 {
 	struct dlm_migratable_lock *ml;
-	struct list_head *queue;
+	struct list_head *queue, *iter;
 	struct list_head *tmpq = NULL;
 	struct dlm_lock *newlock = NULL;
 	struct dlm_lockstatus *lksb = NULL;
 	int ret = 0;
 	int i, j, bad;
-	struct dlm_lock *lock = NULL;
+	struct dlm_lock *lock;
 	u8 from = O2NM_MAX_NODES;
 	unsigned int added = 0;
 	__be64 c;
@@ -1791,14 +1802,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			/* MIGRATION ONLY! */
 			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
 
+			lock = NULL;
 			spin_lock(&res->spinlock);
 			for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
 				tmpq = dlm_list_idx_to_ptr(res, j);
-				list_for_each_entry(lock, tmpq, list) {
-					if (lock->ml.cookie != ml->cookie)
-						lock = NULL;
-					else
+				list_for_each(iter, tmpq) {
+					lock = list_entry(iter,
+						  struct dlm_lock, list);
+					if (lock->ml.cookie == ml->cookie)
 						break;
+					lock = NULL;
 				}
 				if (lock)
 					break;
@@ -2882,8 +2895,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
 			BUG();
 		}
 		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+		__dlm_reset_recovery(dlm);
 		spin_unlock(&dlm->spinlock);
-		dlm_reset_recovery(dlm);
 		dlm_kick_recovery_thread(dlm);
 		break;
 	default:
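
Note on the dlm_process_recovery_data() loop rewrite above: when list_for_each_entry() runs to completion without a break, its cursor is not NULL - it points at a fake entry computed from the list head itself - so assigning NULL to the cursor inside the loop and testing it afterwards is unreliable. Iterating over the raw list_head and keeping the element pointer separate makes "found or not found" explicit. A self-contained sketch of the fixed shape, with minimal local definitions of the list macros so it compiles on its own:

	#include <stddef.h>

	struct list_head { struct list_head *next, *prev; };

	#define list_entry(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))
	#define list_for_each(pos, head) \
		for ((pos) = (head)->next; (pos) != (head); (pos) = (pos)->next)

	struct item {
		unsigned long cookie;
		struct list_head list;
	};

	static struct item *find_cookie(struct list_head *head,
					unsigned long cookie)
	{
		struct list_head *iter;
		struct item *it = NULL;

		list_for_each(iter, head) {
			it = list_entry(iter, struct item, list);
			if (it->cookie == cookie)
				break;	/* "it" is a real element here */
			it = NULL;	/* stays NULL if the loop finishes */
		}
		return it;		/* NULL means not found */
	}
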
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 19986959d149..6bd690b5a061 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3144,22 +3144,60 @@ out:
 	return 0;
 }
 
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				       struct ocfs2_lock_res *lockres);
+
 /* Mark the lockres as being dropped. It will no longer be
  * queued if blocking, but we still may have to wait on it
  * being dequeued from the downconvert thread before we can consider
  * it safe to drop.
  *
  * You can *not* attempt to call cluster_lock on this lockres anymore. */
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
 {
 	int status;
 	struct ocfs2_mask_waiter mw;
-	unsigned long flags;
+	unsigned long flags, flags2;
 
 	ocfs2_init_mask_waiter(&mw);
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres->l_flags |= OCFS2_LOCK_FREEING;
+	if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+		/*
+		 * We know the downconvert is queued but not in progress
+		 * because we are the downconvert thread and processing
+		 * different lock. So we can just remove the lock from the
+		 * queue. This is not only an optimization but also a way
+		 * to avoid the following deadlock:
+		 *   ocfs2_dentry_post_unlock()
+		 *     ocfs2_dentry_lock_put()
+		 *       ocfs2_drop_dentry_lock()
+		 *         iput()
+		 *           ocfs2_evict_inode()
+		 *             ocfs2_clear_inode()
+		 *               ocfs2_mark_lockres_freeing()
+		 *                 ... blocks waiting for OCFS2_LOCK_QUEUED
+		 *                 since we are the downconvert thread which
+		 *                 should clear the flag.
+		 */
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		spin_lock_irqsave(&osb->dc_task_lock, flags2);
+		list_del_init(&lockres->l_blocked_list);
+		osb->blocked_lock_count--;
+		spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
+		/*
+		 * Warn if we recurse into another post_unlock call.  Strictly
+		 * speaking it isn't a problem but we need to be careful if
+		 * that happens (stack overflow, deadlocks, ...) so warn if
+		 * ocfs2 grows a path for which this can happen.
+		 */
+		WARN_ON_ONCE(lockres->l_ops->post_unlock);
+		/* Since the lock is freeing we don't do much in the fn below */
+		ocfs2_process_blocked_lock(osb, lockres);
+		return;
+	}
 	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
 		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3180,7 +3218,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
 {
 	int ret;
 
-	ocfs2_mark_lockres_freeing(lockres);
+	ocfs2_mark_lockres_freeing(osb, lockres);
 	ret = ocfs2_drop_lock(osb, lockres);
 	if (ret)
 		mlog_errno(ret);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 1d596d8c4a4a..d293a22c32c5 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
 void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
 
 
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
 			       struct ocfs2_lock_res *lockres);
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 51632c40e896..8970dcf74de5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 			   int datasync)
 {
 	int err = 0;
-	journal_t *journal;
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	journal_t *journal = osb->journal->j_journal;
+	int ret;
+	tid_t commit_tid;
+	bool needs_barrier = false;
 
 	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
 			      OCFS2_I(inode)->ip_blkno,
@@ -192,29 +196,19 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 	if (err)
 		return err;
 
-	/*
-	 * Probably don't need the i_mutex at all in here, just putting it here
-	 * to be consistent with how fsync used to be called, someone more
-	 * familiar with the fs could possibly remove it.
-	 */
-	mutex_lock(&inode->i_mutex);
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
-		/*
-		 * We still have to flush drive's caches to get data to the
-		 * platter
-		 */
-		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
-			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-		goto bail;
+	commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
+	if (journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = true;
+	err = jbd2_complete_transaction(journal, commit_tid);
+	if (needs_barrier) {
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+		if (!err)
+			err = ret;
 	}
 
-	journal = osb->journal->j_journal;
-	err = jbd2_journal_force_commit(journal);
-
-bail:
 	if (err)
 		mlog_errno(err);
-	mutex_unlock(&inode->i_mutex);
 
 	return (err < 0) ? -EIO : 0;
 }
@@ -292,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	inode->i_atime = CURRENT_TIME;
 	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 	ocfs2_journal_dirty(handle, bh);
 
 out_commit:
@@ -341,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode,
 	if (ret < 0)
 		mlog_errno(ret);
 
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 	ocfs2_commit_trans(osb, handle);
 out:
 	return ret;
@@ -435,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	di->i_size = cpu_to_le64(new_i_size);
 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, fe_bh);
 
@@ -650,7 +647,7 @@ restarted_transaction:
 		mlog_errno(status);
 		goto leave;
 	}
-
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_journal_dirty(handle, bh);
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
@@ -743,6 +740,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret)
 		mlog_errno(ret);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 
 out:
 	if (ret) {
@@ -840,6 +838,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 		di->i_mtime_nsec = di->i_ctime_nsec;
 		ocfs2_journal_dirty(handle, di_bh);
+		ocfs2_update_inode_fsync_trans(handle, inode, 1);
 		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 	}
 
@@ -1344,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 
 	di = (struct ocfs2_dinode *) bh->b_data;
 	di->i_mode = cpu_to_le16(inode->i_mode);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, bh);
 
@@ -1576,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 		if (ret)
 			mlog_errno(ret);
 	}
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 
 	ocfs2_commit_trans(osb, handle);
 out:
@@ -2061,13 +2062,6 @@ out:
 	return ret;
 }
 
-static void ocfs2_aiodio_wait(struct inode *inode)
-{
-	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
-
-	wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
-}
-
 static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
 {
 	int blockmask = inode->i_sb->s_blocksize - 1;
@@ -2345,10 +2339,8 @@ relock:
 		 * Wait on previous unaligned aio to complete before
 		 * proceeding.
		 */
-		ocfs2_aiodio_wait(inode);
-
-		/* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
-		atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+		mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
+		/* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
 		ocfs2_iocb_set_unaligned_aio(iocb);
 	}
 
@@ -2375,15 +2367,18 @@ relock:
 
 	if (direct_io) {
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
-						    ppos, count, ocount);
+						    count, ocount);
 		if (written < 0) {
 			ret = written;
 			goto out_dio;
 		}
 	} else {
+		struct iov_iter from;
+		iov_iter_init(&from, iov, nr_segs, count, 0);
 		current->backing_dev_info = file->f_mapping->backing_dev_info;
-		written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
-						      ppos, count, 0);
+		written = generic_perform_write(file, &from, *ppos);
+		if (likely(written >= 0))
+			iocb->ki_pos = *ppos + written;
 		current->backing_dev_info = NULL;
 	}
 
@@ -2428,7 +2423,7 @@ out_dio:
 
 	if (unaligned_dio) {
 		ocfs2_iocb_clear_unaligned_aio(iocb);
-		atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
 	}
 
 out:
@@ -2645,7 +2640,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
 	case SEEK_SET:
 		break;
 	case SEEK_END:
-		offset += inode->i_size;
+		/* SEEK_END requires the OCFS2 inode lock for the file
+		 * because it references the file's size.
+		 */
+		ret = ocfs2_inode_lock(inode, NULL, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		offset += i_size_read(inode);
+		ocfs2_inode_unlock(inode, 0);
 		break;
 	case SEEK_CUR:
 		if (offset == 0) {
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f29a90fde619..437de7f768c6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 	struct inode *inode = NULL;
 	struct super_block *sb = osb->sb;
 	struct ocfs2_find_inode_args args;
+	journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
 
 	trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
 			       sysfile_type);
@@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 		goto bail;
 	}
 
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		transaction_t *transaction;
+		tid_t tid;
+		struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+		read_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		read_unlock(&journal->j_state_lock);
+		oi->i_sync_tid = tid;
+		oi->i_datasync_tid = tid;
+	}
+
 bail:
 	if (!IS_ERR(inode)) {
 		trace_ocfs2_iget_end(inode,
@@ -804,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 		goto bail;
 	}
 
-	/* If we're coming from downconvert_thread we can't go into our own
-	 * voting [hello, deadlock city!], so unforuntately we just
-	 * have to skip deleting this guy. That's OK though because
-	 * the node who's doing the actual deleting should handle it
-	 * anyway. */
+	/*
+	 * If we're coming from downconvert_thread we can't go into our own
+	 * voting [hello, deadlock city!] so we cannot delete the inode. But
+	 * since we dropped last inode ref when downconverting dentry lock,
+	 * we cannot have the file open and thus the node doing unlink will
+	 * take care of deleting the inode.
+	 */
 	if (current == osb->dc_task)
 		goto bail;
 
@@ -822,12 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 		goto bail_unlock;
 	}
 
-	/* If we have allowd wipe of this inode for another node, it
-	 * will be marked here so we can safely skip it. Recovery will
-	 * cleanup any inodes we might inadvertently skip here. */
-	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
-		goto bail_unlock;
-
 	ret = 1;
 bail_unlock:
 	spin_unlock(&oi->ip_lock);
@@ -941,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
 		(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
 	if (sync_data)
 		filemap_write_and_wait(inode->i_mapping);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 }
 
 static void ocfs2_delete_inode(struct inode *inode)
@@ -960,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode)
 	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
 		goto bail;
 
-	dquot_initialize(inode);
-
 	if (!ocfs2_inode_is_valid_to_delete(inode)) {
 		/* It's probably not necessary to truncate_inode_pages
 		 * here but we do it for safety anyway (it will most
@@ -970,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode)
 		goto bail;
 	}
 
+	dquot_initialize(inode);
+
 	/* We want to block signals in delete_inode as the lock and
 	 * messaging paths may return us -ERESTARTSYS. Which would
 	 * cause us to exit early, resulting in inodes being orphaned
@@ -1057,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode)
 {
 	int status;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	clear_inode(inode);
 	trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
@@ -1073,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode)
 
 	/* Do these before all the other work so that we don't bounce
 	 * the downconvert thread while waiting to destroy the locks. */
-	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
-	ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
-	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
 
 	ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
 			   &oi->ip_la_data_resv);
@@ -1157,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode)
 	    (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
 		ocfs2_delete_inode(inode);
 	} else {
-		truncate_inode_pages(&inode->i_data, 0);
+		truncate_inode_pages_final(&inode->i_data);
 	}
 	ocfs2_clear_inode(inode);
 }
@@ -1260,6 +1284,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 
 	ocfs2_journal_dirty(handle, bh);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 leave:
 	return status;
 }
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 621fc73bf23d..a6c991c0fc98 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -44,7 +44,7 @@ struct ocfs2_inode_info
 	struct rw_semaphore		ip_xattr_sem;
 
 	/* Number of outstanding AIO's which are not page aligned */
-	atomic_t			ip_unaligned_aio;
+	struct mutex			ip_unaligned_aio;
 
 	/* These fields are protected by ip_lock */
 	spinlock_t			ip_lock;
@@ -73,6 +73,13 @@ struct ocfs2_inode_info
 	u32				ip_dir_lock_gen;
 
 	struct ocfs2_alloc_reservation	ip_la_data_resv;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	tid_t i_sync_tid;
+	tid_t i_datasync_tid;
 };
 
 /*
@@ -84,8 +91,6 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_BITMAP		0x00000004
 /* This inode has been wiped from disk */
 #define OCFS2_INODE_DELETED		0x00000008
-/* Another node is deleting, so our delete is a nop */
-#define OCFS2_INODE_SKIP_DELETE		0x00000010
 /* Has the inode been orphaned on another node?
 *
 * This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -100,11 +105,11 @@ struct ocfs2_inode_info
 * rely on ocfs2_delete_inode to sort things out under the proper
 * cluster locks.
 */
-#define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
+#define OCFS2_INODE_MAYBE_ORPHANED	0x00000010
 /* Does someone have the file open O_DIRECT */
-#define OCFS2_INODE_OPEN_DIRECT		0x00000040
+#define OCFS2_INODE_OPEN_DIRECT		0x00000020
 /* Tell the inode wipe code it's not in orphan dir */
-#define OCFS2_INODE_SKIP_ORPHAN_DIR	0x00000080
+#define OCFS2_INODE_SKIP_ORPHAN_DIR	0x00000040
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8ca3c29accbf..490229f43731 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -413,11 +413,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
 		}
 
 		status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
-		if (status < 0)
-			goto bail;
 
 		iput(inode_alloc);
 		inode_alloc = NULL;
+
+		if (status < 0)
+			goto bail;
 	}
 
 	o2info_set_request_filled(&oifi->ifi_req);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 44fc3e530c3d..03ea9314fecd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		iter = oi->ip_next_orphan;
 
 		spin_lock(&oi->ip_lock);
-		/* The remote delete code may have set these on the
-		 * assumption that the other node would wipe them
-		 * successfully.  If they are still in the node's
-		 * orphan dir, we need to reset that state. */
-		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
-
 		/* Set the proper information to get us going into
 		 * ocfs2_delete_inode. */
 		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 9ff4e8cf9d97..7f8cde94abfe 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -626,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
 						    new_size);
 }
 
+static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
+						  struct inode *inode,
+						  int datasync)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	oi->i_sync_tid = handle->h_transaction->t_tid;
+	if (datasync)
+		oi->i_datasync_tid = handle->h_transaction->t_tid;
+}
+
 #endif /* OCFS2_JOURNAL_H */
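
Note on the helper added above: together with the ocfs2_sync_file() rewrite in the fs/ocfs2/file.c hunks earlier in this diff, this implements a common jbd2 fsync optimization - every transaction that touches an inode stamps the inode with its tid, and fsync then waits only for that specific transaction (jbd2_complete_transaction() is cheap if it has already committed) instead of forcing a brand-new journal commit. A condensed sketch of the consumer side, using the fields this diff introduces and omitting the barrier handling shown in the file.c hunk:

	/* Kernel-context sketch; mirrors the new ocfs2_sync_file() logic. */
	static int wait_for_inode_commit(journal_t *journal,
					 struct ocfs2_inode_info *oi,
					 int datasync)
	{
		tid_t tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;

		/* Starts/wakes the commit if needed, then waits for it. */
		return jbd2_complete_transaction(journal, tid);
	}
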
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index e57c804069ea..6b6d092b0998 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
 	}
 
 	ret = flock_lock_file_wait(file, fl);
+	if (ret)
+		ocfs2_file_unlock(file);
 
 out:
 	mutex_unlock(&fp->fp_mutex);
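
Note on the ocfs2_do_flock() fix above: it applies the usual undo-on-failure rule - if the local flock_lock_file_wait() bookkeeping fails after the cluster-level lock was already taken, that cluster lock must be released, otherwise the file stays locked cluster-wide with no local owner left to ever unlock it. The shape of the fix, with hypothetical helper names standing in for the surrounding ocfs2 calls:

	static int example_flock(struct file *file, struct file_lock *fl)
	{
		int ret;

		ret = take_cluster_lock(file);		/* hypothetical */
		if (ret)
			return ret;

		ret = flock_lock_file_wait(file, fl);
		if (ret)
			drop_cluster_lock(file);	/* undo on failure */

		return ret;
	}
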
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 64c304d668f0..599eb4c4c8be 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle,
 					old_blkno, len);
 	}
 
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 out:
 	ocfs2_free_path(path);
 	return ret;
@@ -690,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 
 	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
 					 goal_bit, len);
-	if (ret)
+	if (ret) {
+		ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
+					       le16_to_cpu(gd->bg_chain));
 		mlog_errno(ret);
+	}
 
 	/*
 	 * Here we should write the new page out first if we are
@@ -957,6 +961,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
 	inode->i_ctime = CURRENT_TIME;
 	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3683643f3f0e..2060fc398445 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -450,7 +450,6 @@ leave:
 
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
-	kfree(si.name);
 	kfree(si.value);
 
 	ocfs2_free_dir_lookup_result(&lookup);
@@ -495,6 +494,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_extent_list *fel;
 	u16 feat;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 	*new_fe_bh = NULL;
 
@@ -576,8 +576,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 			mlog_errno(status);
 	}
 
-	status = 0; /* error in ocfs2_create_new_inode_locks is not
-		     * critical */
+	oi->i_sync_tid = handle->h_transaction->t_tid;
+	oi->i_datasync_tid = handle->h_transaction->t_tid;
 
 leave:
 	if (status < 0) {
@@ -1855,7 +1855,6 @@ bail:
 
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
-	kfree(si.name);
 	kfree(si.value);
 	ocfs2_free_dir_lookup_result(&lookup);
 	if (inode_ac)
@@ -2481,6 +2480,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	di->i_orphaned_slot = 0;
 	set_nlink(inode, 1);
 	ocfs2_set_links_count(di, inode->i_nlink);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_journal_dirty(handle, di_bh);
 
 	status = ocfs2_add_entry(handle, dentry, inode,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 553f53cc73ae..8d64a97a9d5e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -30,6 +30,7 @@
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/wait.h> 31#include <linux/wait.h>
32#include <linux/list.h> 32#include <linux/list.h>
33#include <linux/llist.h>
33#include <linux/rbtree.h> 34#include <linux/rbtree.h>
34#include <linux/workqueue.h> 35#include <linux/workqueue.h>
35#include <linux/kref.h> 36#include <linux/kref.h>
@@ -274,19 +275,16 @@ enum ocfs2_mount_options
274 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 275 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
275}; 276};
276 277
277#define OCFS2_OSB_SOFT_RO 0x0001 278#define OCFS2_OSB_SOFT_RO 0x0001
278#define OCFS2_OSB_HARD_RO 0x0002 279#define OCFS2_OSB_HARD_RO 0x0002
279#define OCFS2_OSB_ERROR_FS 0x0004 280#define OCFS2_OSB_ERROR_FS 0x0004
280#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 281#define OCFS2_DEFAULT_ATIME_QUANTUM 60
281
282#define OCFS2_DEFAULT_ATIME_QUANTUM 60
283 282
284struct ocfs2_journal; 283struct ocfs2_journal;
285struct ocfs2_slot_info; 284struct ocfs2_slot_info;
286struct ocfs2_recovery_map; 285struct ocfs2_recovery_map;
287struct ocfs2_replay_map; 286struct ocfs2_replay_map;
288struct ocfs2_quota_recovery; 287struct ocfs2_quota_recovery;
289struct ocfs2_dentry_lock;
290struct ocfs2_super 288struct ocfs2_super
291{ 289{
292 struct task_struct *commit_task; 290 struct task_struct *commit_task;
@@ -414,10 +412,9 @@ struct ocfs2_super
414 struct list_head blocked_lock_list; 412 struct list_head blocked_lock_list;
415 unsigned long blocked_lock_count; 413 unsigned long blocked_lock_count;
416 414
417 /* List of dentry locks to release. Anyone can add locks to 415 /* List of dquot structures to drop last reference to */
418 * the list, ocfs2_wq processes the list */ 416 struct llist_head dquot_drop_list;
419 struct ocfs2_dentry_lock *dentry_lock_list; 417 struct work_struct dquot_drop_work;
420 struct work_struct dentry_lock_work;
421 418
422 wait_queue_head_t osb_mount_event; 419 wait_queue_head_t osb_mount_event;
423 420
@@ -449,6 +446,8 @@ struct ocfs2_super
449 /* rb tree root for refcount lock. */ 446 /* rb tree root for refcount lock. */
450 struct rb_root osb_rf_lock_tree; 447 struct rb_root osb_rf_lock_tree;
451 struct ocfs2_refcount_tree *osb_ref_tree_lru; 448 struct ocfs2_refcount_tree *osb_ref_tree_lru;
449
450 struct mutex system_file_mutex;
452}; 451};
453 452
454#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 453#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -579,18 +578,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
579 spin_unlock(&osb->osb_lock); 578 spin_unlock(&osb->osb_lock);
580} 579}
581 580
582
583static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
584 unsigned long flag)
585{
586 unsigned long ret;
587
588 spin_lock(&osb->osb_lock);
589 ret = osb->osb_flags & flag;
590 spin_unlock(&osb->osb_lock);
591 return ret;
592}
593
594static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, 581static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
595 int hard) 582 int hard)
596{ 583{
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index d5ab56cbe5c5..f266d67df3c6 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -28,6 +28,7 @@ struct ocfs2_dquot {
28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
29 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
30 s64 dq_originodes; /* Last globally synced inode usage */ 30 s64 dq_originodes; /* Last globally synced inode usage */
31 struct llist_node list; /* Member of list of dquots to drop */
31}; 32};
32 33
33/* Description of one chunk to recover in memory */ 34/* Description of one chunk to recover in memory */
@@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
110int ocfs2_create_local_dquot(struct dquot *dquot); 111int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); 112int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot); 113int ocfs2_local_write_dquot(struct dquot *dquot);
114void ocfs2_drop_dquot_refs(struct work_struct *work);
113 115
114extern const struct dquot_operations ocfs2_quota_operations; 116extern const struct dquot_operations ocfs2_quota_operations;
115extern struct quota_format_type ocfs2_quota_format; 117extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index d7b5108789e2..b990a62cff50 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -10,6 +10,7 @@
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/workqueue.h> 12#include <linux/workqueue.h>
13#include <linux/llist.h>
13 14
14#include <cluster/masklog.h> 15#include <cluster/masklog.h>
15 16
@@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
679 OCFS2_INODE_UPDATE_CREDITS; 680 OCFS2_INODE_UPDATE_CREDITS;
680} 681}
681 682
683void ocfs2_drop_dquot_refs(struct work_struct *work)
684{
685 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
686 dquot_drop_work);
687 struct llist_node *list;
688 struct ocfs2_dquot *odquot, *next_odquot;
689
690 list = llist_del_all(&osb->dquot_drop_list);
691 llist_for_each_entry_safe(odquot, next_odquot, list, list) {
692 /* Drop the reference we acquired in ocfs2_dquot_release() */
693 dqput(&odquot->dq_dquot);
694 }
695}
696
697/*
698 * Called when the last reference to dquot is dropped. If we are called from
699 * downconvert thread, we cannot do all the handling here because grabbing
700 * quota lock could deadlock (the node holding the quota lock could need some
701 * other cluster lock to proceed but with blocked downconvert thread we cannot
702 * release any lock).
703 */
682static int ocfs2_release_dquot(struct dquot *dquot) 704static int ocfs2_release_dquot(struct dquot *dquot)
683{ 705{
684 handle_t *handle; 706 handle_t *handle;
@@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot)
694 /* Check whether we are not racing with some other dqget() */ 716 /* Check whether we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1) 717 if (atomic_read(&dquot->dq_count) > 1)
696 goto out; 718 goto out;
719 /* Running from downconvert thread? Postpone quota processing to wq */
720 if (current == osb->dc_task) {
721 /*
722 * Grab our own reference to dquot and queue it for delayed
723 * dropping. Quota code rechecks after calling
724 * ->release_dquot() and won't free dquot structure.
725 */
726 dqgrab(dquot);
727 /* First entry on list -> queue work */
728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
729 queue_work(ocfs2_wq, &osb->dquot_drop_work);
730 goto out;
731 }
697 status = ocfs2_lock_global_qf(oinfo, 1); 732 status = ocfs2_lock_global_qf(oinfo, 1);
698 if (status < 0) 733 if (status < 0)
699 goto out; 734 goto out;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index ca5ce14cbddc..83f1a665ae97 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -496,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
496} 496}
497 497
498static struct kobj_attribute ocfs2_attr_max_locking_protocol = 498static struct kobj_attribute ocfs2_attr_max_locking_protocol =
499 __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, 499 __ATTR(max_locking_protocol, S_IRUGO,
500 ocfs2_max_locking_protocol_show, NULL); 500 ocfs2_max_locking_protocol_show, NULL);
501 501
502static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, 502static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
@@ -528,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
528} 528}
529 529
530static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = 530static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
531 __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, 531 __ATTR(loaded_cluster_plugins, S_IRUGO,
532 ocfs2_loaded_cluster_plugins_show, NULL); 532 ocfs2_loaded_cluster_plugins_show, NULL);
533 533
534static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, 534static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
@@ -550,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
550} 550}
551 551
552static struct kobj_attribute ocfs2_attr_active_cluster_plugin = 552static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
553 __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, 553 __ATTR(active_cluster_plugin, S_IRUGO,
554 ocfs2_active_cluster_plugin_show, NULL); 554 ocfs2_active_cluster_plugin_show, NULL);
555 555
556static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, 556static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
@@ -599,15 +599,29 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
599 599
600 600
601static struct kobj_attribute ocfs2_attr_cluster_stack = 601static struct kobj_attribute ocfs2_attr_cluster_stack =
602 __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, 602 __ATTR(cluster_stack, S_IRUGO | S_IWUSR,
603 ocfs2_cluster_stack_show, 603 ocfs2_cluster_stack_show,
604 ocfs2_cluster_stack_store); 604 ocfs2_cluster_stack_store);
605 605
606
607
608static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
609 struct kobj_attribute *attr,
610 char *buf)
611{
612 return snprintf(buf, PAGE_SIZE, "1\n");
613}
614
615static struct kobj_attribute ocfs2_attr_dlm_recover_support =
616 __ATTR(dlm_recover_callback_support, S_IRUGO,
617 ocfs2_dlm_recover_show, NULL);
618
606static struct attribute *ocfs2_attrs[] = { 619static struct attribute *ocfs2_attrs[] = {
607 &ocfs2_attr_max_locking_protocol.attr, 620 &ocfs2_attr_max_locking_protocol.attr,
608 &ocfs2_attr_loaded_cluster_plugins.attr, 621 &ocfs2_attr_loaded_cluster_plugins.attr,
609 &ocfs2_attr_active_cluster_plugin.attr, 622 &ocfs2_attr_active_cluster_plugin.attr,
610 &ocfs2_attr_cluster_stack.attr, 623 &ocfs2_attr_cluster_stack.attr,
624 &ocfs2_attr_dlm_recover_support.attr,
611 NULL, 625 NULL,
612}; 626};
613 627
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 47ae2663a6f5..0cb889a17ae1 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -771,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
771 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 771 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
772 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 772 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
773 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 773 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
774 ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
774 775
775 status = 0; 776 status = 0;
776 777
@@ -1607,6 +1608,21 @@ out:
1607 return ret; 1608 return ret;
1608} 1609}
1609 1610
1611void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1612 struct buffer_head *di_bh,
1613 u32 num_bits,
1614 u16 chain)
1615{
1616 u32 tmp_used;
1617 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1618 struct ocfs2_chain_list *cl;
1619
1620 cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1621 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1622 di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1623 le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1624}
1625
1610static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, 1626static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1611 struct ocfs2_extent_rec *rec, 1627 struct ocfs2_extent_rec *rec,
1612 struct ocfs2_chain_list *cl) 1628 struct ocfs2_chain_list *cl)
@@ -1707,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1707 1723
1708 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1724 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1709 res->sr_bit_offset, res->sr_bits); 1725 res->sr_bit_offset, res->sr_bits);
1710 if (ret < 0) 1726 if (ret < 0) {
1727 ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1728 res->sr_bits,
1729 le16_to_cpu(gd->bg_chain));
1711 mlog_errno(ret); 1730 mlog_errno(ret);
1731 }
1712 1732
1713out_loc_only: 1733out_loc_only:
1714 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1734 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
@@ -1838,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1838 res->sr_bit_offset, 1858 res->sr_bit_offset,
1839 res->sr_bits); 1859 res->sr_bits);
1840 if (status < 0) { 1860 if (status < 0) {
1861 ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1862 ac->ac_bh, res->sr_bits, chain);
1841 mlog_errno(status); 1863 mlog_errno(status);
1842 goto bail; 1864 goto bail;
1843 } 1865 }
@@ -2091,7 +2113,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir,
2091 2113
2092 ac->ac_find_loc_priv = res; 2114 ac->ac_find_loc_priv = res;
2093 *fe_blkno = res->sr_blkno; 2115 *fe_blkno = res->sr_blkno;
2094 2116 ocfs2_update_inode_fsync_trans(handle, dir, 0);
2095out: 2117out:
2096 if (handle) 2118 if (handle)
2097 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); 2119 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
@@ -2149,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2149 res->sr_bit_offset, 2171 res->sr_bit_offset,
2150 res->sr_bits); 2172 res->sr_bits);
2151 if (ret < 0) { 2173 if (ret < 0) {
2174 ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2175 ac->ac_bh, res->sr_bits, chain);
2152 mlog_errno(ret); 2176 mlog_errno(ret);
2153 goto out; 2177 goto out;
2154 } 2178 }
@@ -2870,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2870 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); 2894 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2871 if (status < 0) { 2895 if (status < 0) {
2872 mutex_unlock(&inode_alloc_inode->i_mutex); 2896 mutex_unlock(&inode_alloc_inode->i_mutex);
2897 iput(inode_alloc_inode);
2873 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", 2898 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2874 (u32)suballoc_slot, status); 2899 (u32)suballoc_slot, status);
2875 goto bail; 2900 goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 218d8036b3e7..2d2501767c0c 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -91,6 +91,10 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode,
91 struct buffer_head *di_bh, 91 struct buffer_head *di_bh,
92 u32 num_bits, 92 u32 num_bits,
93 u16 chain); 93 u16 chain);
94void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
95 struct buffer_head *di_bh,
96 u32 num_bits,
97 u16 chain);
94int ocfs2_block_group_set_bits(handle_t *handle, 98int ocfs2_block_group_set_bits(handle_t *handle,
95 struct inode *alloc_inode, 99 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg, 100 struct ocfs2_group_desc *bg,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 49d84f80f36c..a7cdd56f4c79 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
561 if (!oi) 561 if (!oi)
562 return NULL; 562 return NULL;
563 563
564 oi->i_sync_tid = 0;
565 oi->i_datasync_tid = 0;
566
564 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); 567 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
565 return &oi->vfs_inode; 568 return &oi->vfs_inode;
566} 569}
@@ -631,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
631 struct ocfs2_super *osb = OCFS2_SB(sb); 634 struct ocfs2_super *osb = OCFS2_SB(sb);
632 u32 tmp; 635 u32 tmp;
633 636
637 sync_filesystem(sb);
638
634 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
635 !ocfs2_check_set_options(sb, &parsed_options)) { 640 !ocfs2_check_set_options(sb, &parsed_options)) {
636 ret = -EINVAL; 641 ret = -EINVAL;
@@ -1238,30 +1243,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1238 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); 1243 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1239} 1244}
1240 1245
1241static void ocfs2_kill_sb(struct super_block *sb)
1242{
1243 struct ocfs2_super *osb = OCFS2_SB(sb);
1244
1245 /* Failed mount? */
1246 if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
1247 goto out;
1248
1249 /* Prevent further queueing of inode drop events */
1250 spin_lock(&dentry_list_lock);
1251 ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
1252 spin_unlock(&dentry_list_lock);
1253 /* Wait for work to finish and/or remove it */
1254 cancel_work_sync(&osb->dentry_lock_work);
1255out:
1256 kill_block_super(sb);
1257}
1258
1259static struct file_system_type ocfs2_fs_type = { 1246static struct file_system_type ocfs2_fs_type = {
1260 .owner = THIS_MODULE, 1247 .owner = THIS_MODULE,
1261 .name = "ocfs2", 1248 .name = "ocfs2",
1262 .mount = ocfs2_mount, 1249 .mount = ocfs2_mount,
1263 .kill_sb = ocfs2_kill_sb, 1250 .kill_sb = kill_block_super,
1264
1265 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1251 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
1266 .next = NULL 1252 .next = NULL
1267}; 1253};
@@ -1612,14 +1598,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1612 return 0; 1598 return 0;
1613} 1599}
1614 1600
1615wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
1616
1617static int __init ocfs2_init(void) 1601static int __init ocfs2_init(void)
1618{ 1602{
1619 int status, i; 1603 int status;
1620
1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1622 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1623 1604
1624 status = init_ocfs2_uptodate_cache(); 1605 status = init_ocfs2_uptodate_cache();
1625 if (status < 0) 1606 if (status < 0)
@@ -1761,7 +1742,7 @@ static void ocfs2_inode_init_once(void *data)
1761 ocfs2_extent_map_init(&oi->vfs_inode); 1742 ocfs2_extent_map_init(&oi->vfs_inode);
1762 INIT_LIST_HEAD(&oi->ip_io_markers); 1743 INIT_LIST_HEAD(&oi->ip_io_markers);
1763 oi->ip_dir_start_lookup = 0; 1744 oi->ip_dir_start_lookup = 0;
1764 atomic_set(&oi->ip_unaligned_aio, 0); 1745 mutex_init(&oi->ip_unaligned_aio);
1765 init_rwsem(&oi->ip_alloc_sem); 1746 init_rwsem(&oi->ip_alloc_sem);
1766 init_rwsem(&oi->ip_xattr_sem); 1747 init_rwsem(&oi->ip_xattr_sem);
1767 mutex_init(&oi->ip_io_mutex); 1748 mutex_init(&oi->ip_io_mutex);
@@ -1932,17 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1932 1913
1933 debugfs_remove(osb->osb_ctxt); 1914 debugfs_remove(osb->osb_ctxt);
1934 1915
1935 /*
1936 * Flush inode dropping work queue so that deletes are
1937 * performed while the filesystem is still working
1938 */
1939 ocfs2_drop_all_dl_inodes(osb);
1940
1941 /* Orphan scan should be stopped as early as possible */ 1916 /* Orphan scan should be stopped as early as possible */
1942 ocfs2_orphan_scan_stop(osb); 1917 ocfs2_orphan_scan_stop(osb);
1943 1918
1944 ocfs2_disable_quotas(osb); 1919 ocfs2_disable_quotas(osb);
1945 1920
1921 /* All dquots should be freed by now */
1922 WARN_ON(!llist_empty(&osb->dquot_drop_list));
1923 /* Wait for worker to be done with the work structure in osb */
1924 cancel_work_sync(&osb->dquot_drop_work);
1925
1946 ocfs2_shutdown_local_alloc(osb); 1926 ocfs2_shutdown_local_alloc(osb);
1947 1927
1948 /* This will disable recovery and flush any recovery work. */ 1928 /* This will disable recovery and flush any recovery work. */
@@ -2077,7 +2057,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2077 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 2057 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2078 struct inode *inode = NULL; 2058 struct inode *inode = NULL;
2079 struct ocfs2_journal *journal; 2059 struct ocfs2_journal *journal;
2080 __le32 uuid_net_key;
2081 struct ocfs2_super *osb; 2060 struct ocfs2_super *osb;
2082 u64 total_blocks; 2061 u64 total_blocks;
2083 2062
@@ -2123,6 +2102,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2123 spin_lock_init(&osb->osb_xattr_lock); 2102 spin_lock_init(&osb->osb_xattr_lock);
2124 ocfs2_init_steal_slots(osb); 2103 ocfs2_init_steal_slots(osb);
2125 2104
2105 mutex_init(&osb->system_file_mutex);
2106
2126 atomic_set(&osb->alloc_stats.moves, 0); 2107 atomic_set(&osb->alloc_stats.moves, 0);
2127 atomic_set(&osb->alloc_stats.local_data, 0); 2108 atomic_set(&osb->alloc_stats.local_data, 0);
2128 atomic_set(&osb->alloc_stats.bitmap_data, 0); 2109 atomic_set(&osb->alloc_stats.bitmap_data, 0);
@@ -2276,8 +2257,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2276 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 2257 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
2277 journal->j_state = OCFS2_JOURNAL_FREE; 2258 journal->j_state = OCFS2_JOURNAL_FREE;
2278 2259
2279 INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); 2260 INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
2280 osb->dentry_lock_list = NULL; 2261 init_llist_head(&osb->dquot_drop_list);
2281 2262
2282 /* get some pseudo constants for clustersize bits */ 2263 /* get some pseudo constants for clustersize bits */
2283 osb->s_clustersize_bits = 2264 osb->s_clustersize_bits =
@@ -2311,8 +2292,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2311 goto bail; 2292 goto bail;
2312 } 2293 }
2313 2294
2314 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
2315
2316 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 2295 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
2317 osb->vol_label[63] = '\0'; 2296 osb->vol_label[63] = '\0';
2318 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); 2297 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index f053688d22a3..af155c183123 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
113 } else 113 } else
114 arr = get_local_system_inode(osb, type, slot); 114 arr = get_local_system_inode(osb, type, slot);
115 115
116 mutex_lock(&osb->system_file_mutex);
116 if (arr && ((inode = *arr) != NULL)) { 117 if (arr && ((inode = *arr) != NULL)) {
117 /* get a ref in addition to the array ref */ 118 /* get a ref in addition to the array ref */
118 inode = igrab(inode); 119 inode = igrab(inode);
120 mutex_unlock(&osb->system_file_mutex);
119 BUG_ON(!inode); 121 BUG_ON(!inode);
120 122
121 return inode; 123 return inode;
@@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
129 *arr = igrab(inode); 131 *arr = igrab(inode);
130 BUG_ON(!*arr); 132 BUG_ON(!*arr);
131 } 133 }
134 mutex_unlock(&osb->system_file_mutex);
132 return inode; 135 return inode;
133} 136}
134 137
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 185fa3b7f962..016f01df3825 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
369 * them fully. 369 * them fully.
370 */ 370 */
371static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, 371static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
372 u64 xb_blkno) 372 u64 xb_blkno, int new)
373{ 373{
374 int i, rc = 0; 374 int i, rc = 0;
375 375
@@ -383,9 +383,16 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
383 } 383 }
384 384
385 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), 385 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
386 bucket->bu_bhs[i])) 386 bucket->bu_bhs[i])) {
387 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), 387 if (new)
388 bucket->bu_bhs[i]); 388 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
389 bucket->bu_bhs[i]);
390 else {
391 set_buffer_uptodate(bucket->bu_bhs[i]);
392 ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
393 bucket->bu_bhs[i]);
394 }
395 }
389 } 396 }
390 397
391 if (rc) 398 if (rc)
@@ -2602,6 +2609,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2602 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); 2609 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
2603 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2610 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2604 spin_unlock(&oi->ip_lock); 2611 spin_unlock(&oi->ip_lock);
2612 ocfs2_update_inode_fsync_trans(handle, inode, 0);
2605 2613
2606 ocfs2_journal_dirty(handle, di_bh); 2614 ocfs2_journal_dirty(handle, di_bh);
2607out_commit: 2615out_commit:
@@ -3200,8 +3208,15 @@ meta_guess:
3200 clusters_add += 1; 3208 clusters_add += 1;
3201 } 3209 }
3202 } else { 3210 } else {
3203 meta_add += 1;
3204 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 3211 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
3212 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
3213 struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
3214 meta_add += ocfs2_extend_meta_needed(el);
3215 credits += ocfs2_calc_extend_credits(inode->i_sb,
3216 el);
3217 } else {
3218 meta_add += 1;
3219 }
3205 } 3220 }
3206out: 3221out:
3207 if (clusters_need) 3222 if (clusters_need)
@@ -3614,6 +3629,7 @@ int ocfs2_xattr_set(struct inode *inode,
3614 } 3629 }
3615 3630
3616 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); 3631 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
3632 ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
3617 3633
3618 ocfs2_commit_trans(osb, ctxt.handle); 3634 ocfs2_commit_trans(osb, ctxt.handle);
3619 3635
@@ -4294,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4294 4310
4295 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); 4311 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
4296 4312
4297 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); 4313 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
4298 if (ret) { 4314 if (ret) {
4299 mlog_errno(ret); 4315 mlog_errno(ret);
4300 goto out; 4316 goto out;
@@ -4638,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4638 * Even if !new_bucket_head, we're overwriting t_bucket. Thus, 4654 * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
4639 * there's no need to read it. 4655 * there's no need to read it.
4640 */ 4656 */
4641 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); 4657 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
4642 if (ret) { 4658 if (ret) {
4643 mlog_errno(ret); 4659 mlog_errno(ret);
4644 goto out; 4660 goto out;
@@ -4804,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
4804 * Even if !t_is_new, we're overwriting t_bucket. Thus, 4820 * Even if !t_is_new, we're overwriting t_bucket. Thus,
4805 * there's no need to read it. 4821 * there's no need to read it.
4806 */ 4822 */
4807 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); 4823 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
4808 if (ret) 4824 if (ret)
4809 goto out; 4825 goto out;
4810 4826
@@ -5476,6 +5492,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5476 ret = ocfs2_truncate_log_append(osb, handle, blkno, len); 5492 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
5477 if (ret) 5493 if (ret)
5478 mlog_errno(ret); 5494 mlog_errno(ret);
5495 ocfs2_update_inode_fsync_trans(handle, inode, 0);
5479 5496
5480out_commit: 5497out_commit:
5481 ocfs2_commit_trans(osb, handle); 5498 ocfs2_commit_trans(osb, handle);
@@ -6830,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle,
6830 break; 6847 break;
6831 } 6848 }
6832 6849
6833 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); 6850 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
6834 if (ret) { 6851 if (ret) {
6835 mlog_errno(ret); 6852 mlog_errno(ret);
6836 break; 6853 break;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index d8b0afde2179..ec58c7659183 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -183,7 +183,7 @@ int omfs_sync_inode(struct inode *inode)
183 */ 183 */
184static void omfs_evict_inode(struct inode *inode) 184static void omfs_evict_inode(struct inode *inode)
185{ 185{
186 truncate_inode_pages(&inode->i_data, 0); 186 truncate_inode_pages_final(&inode->i_data);
187 clear_inode(inode); 187 clear_inode(inode);
188 188
189 if (inode->i_nlink) 189 if (inode->i_nlink)
diff --git a/fs/open.c b/fs/open.c
index b9ed8b25c108..9d64679cec73 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -231,7 +231,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
231 return -EINVAL; 231 return -EINVAL;
232 232
233 /* Return error if mode is not supported */ 233 /* Return error if mode is not supported */
234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
235 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
236 return -EOPNOTSUPP;
237
238 /* Punch hole and zero range are mutually exclusive */
239 if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
240 (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
235 return -EOPNOTSUPP; 241 return -EOPNOTSUPP;
236 242
237 /* Punch hole must have keep size set */ 243 /* Punch hole must have keep size set */
@@ -239,17 +245,31 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
239 !(mode & FALLOC_FL_KEEP_SIZE)) 245 !(mode & FALLOC_FL_KEEP_SIZE))
240 return -EOPNOTSUPP; 246 return -EOPNOTSUPP;
241 247
248 /* Collapse range should only be used exclusively. */
249 if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
250 (mode & ~FALLOC_FL_COLLAPSE_RANGE))
251 return -EINVAL;
252
242 if (!(file->f_mode & FMODE_WRITE)) 253 if (!(file->f_mode & FMODE_WRITE))
243 return -EBADF; 254 return -EBADF;
244 255
245 /* It's not possible punch hole on append only file */ 256 /*
246 if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) 257 * We can only allow pure fallocate on append only files
258 */
259 if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
247 return -EPERM; 260 return -EPERM;
248 261
249 if (IS_IMMUTABLE(inode)) 262 if (IS_IMMUTABLE(inode))
250 return -EPERM; 263 return -EPERM;
251 264
252 /* 265 /*
266 * We can not allow to do any fallocate operation on an active
267 * swapfile
268 */
269 if (IS_SWAPFILE(inode))
270 ret = -ETXTBSY;
271
272 /*
253 * Revalidate the write permissions, in case security policy has 273 * Revalidate the write permissions, in case security policy has
254 * changed since the files were opened. 274 * changed since the files were opened.
255 */ 275 */
@@ -632,35 +652,6 @@ out:
632 return error; 652 return error;
633} 653}
634 654
635/*
636 * You have to be very careful that these write
637 * counts get cleaned up in error cases and
638 * upon __fput(). This should probably never
639 * be called outside of __dentry_open().
640 */
641static inline int __get_file_write_access(struct inode *inode,
642 struct vfsmount *mnt)
643{
644 int error;
645 error = get_write_access(inode);
646 if (error)
647 return error;
648 /*
649 * Do not take mount writer counts on
650 * special files since no writes to
651 * the mount itself will occur.
652 */
653 if (!special_file(inode->i_mode)) {
654 /*
655 * Balanced in __fput()
656 */
657 error = __mnt_want_write(mnt);
658 if (error)
659 put_write_access(inode);
660 }
661 return error;
662}
663
664int open_check_o_direct(struct file *f) 655int open_check_o_direct(struct file *f)
665{ 656{
666 /* NB: we're sure to have correct a_ops only after f_op->open */ 657 /* NB: we're sure to have correct a_ops only after f_op->open */
@@ -685,26 +676,28 @@ static int do_dentry_open(struct file *f,
685 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | 676 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
686 FMODE_PREAD | FMODE_PWRITE; 677 FMODE_PREAD | FMODE_PWRITE;
687 678
688 if (unlikely(f->f_flags & O_PATH))
689 f->f_mode = FMODE_PATH;
690
691 path_get(&f->f_path); 679 path_get(&f->f_path);
692 inode = f->f_inode = f->f_path.dentry->d_inode; 680 inode = f->f_inode = f->f_path.dentry->d_inode;
693 if (f->f_mode & FMODE_WRITE) {
694 error = __get_file_write_access(inode, f->f_path.mnt);
695 if (error)
696 goto cleanup_file;
697 if (!special_file(inode->i_mode))
698 file_take_write(f);
699 }
700
701 f->f_mapping = inode->i_mapping; 681 f->f_mapping = inode->i_mapping;
702 682
703 if (unlikely(f->f_mode & FMODE_PATH)) { 683 if (unlikely(f->f_flags & O_PATH)) {
684 f->f_mode = FMODE_PATH;
704 f->f_op = &empty_fops; 685 f->f_op = &empty_fops;
705 return 0; 686 return 0;
706 } 687 }
707 688
689 if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
690 error = get_write_access(inode);
691 if (unlikely(error))
692 goto cleanup_file;
693 error = __mnt_want_write(f->f_path.mnt);
694 if (unlikely(error)) {
695 put_write_access(inode);
696 goto cleanup_file;
697 }
698 f->f_mode |= FMODE_WRITER;
699 }
700
708 /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ 701 /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
709 if (S_ISREG(inode->i_mode)) 702 if (S_ISREG(inode->i_mode))
710 f->f_mode |= FMODE_ATOMIC_POS; 703 f->f_mode |= FMODE_ATOMIC_POS;
@@ -741,18 +734,9 @@ static int do_dentry_open(struct file *f,
741 734
742cleanup_all: 735cleanup_all:
743 fops_put(f->f_op); 736 fops_put(f->f_op);
744 if (f->f_mode & FMODE_WRITE) { 737 if (f->f_mode & FMODE_WRITER) {
745 put_write_access(inode); 738 put_write_access(inode);
746 if (!special_file(inode->i_mode)) { 739 __mnt_drop_write(f->f_path.mnt);
747 /*
748 * We don't consider this a real
749 * mnt_want/drop_write() pair
750 * because it all happenend right
751 * here, so just reset the state.
752 */
753 file_reset_write(f);
754 __mnt_drop_write(f->f_path.mnt);
755 }
756 } 740 }
757cleanup_file: 741cleanup_file:
758 path_put(&f->f_path); 742 path_put(&f->f_path);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 8c0ceb8dd1f7..15e4500cda3e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
368 368
369static int openprom_remount(struct super_block *sb, int *flags, char *data) 369static int openprom_remount(struct super_block *sb, int *flags, char *data)
370{ 370{
371 sync_filesystem(sb);
371 *flags |= MS_NOATIME; 372 *flags |= MS_NOATIME;
372 return 0; 373 return 0;
373} 374}
diff --git a/fs/pipe.c b/fs/pipe.c
index 78fd0d0788db..034bffac3f97 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -142,55 +142,6 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
142 return 0; 142 return 0;
143} 143}
144 144
145static int
146pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
147 int atomic)
148{
149 unsigned long copy;
150
151 while (len > 0) {
152 while (!iov->iov_len)
153 iov++;
154 copy = min_t(unsigned long, len, iov->iov_len);
155
156 if (atomic) {
157 if (__copy_to_user_inatomic(iov->iov_base, from, copy))
158 return -EFAULT;
159 } else {
160 if (copy_to_user(iov->iov_base, from, copy))
161 return -EFAULT;
162 }
163 from += copy;
164 len -= copy;
165 iov->iov_base += copy;
166 iov->iov_len -= copy;
167 }
168 return 0;
169}
170
171/*
172 * Attempt to pre-fault in the user memory, so we can use atomic copies.
173 * Returns the number of bytes not faulted in.
174 */
175static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
176{
177 while (!iov->iov_len)
178 iov++;
179
180 while (len > 0) {
181 unsigned long this_len;
182
183 this_len = min_t(unsigned long, len, iov->iov_len);
184 if (fault_in_pages_writeable(iov->iov_base, this_len))
185 break;
186
187 len -= this_len;
188 iov++;
189 }
190
191 return len;
192}
193
194/* 145/*
195 * Pre-fault in the user memory, so we can use atomic copies. 146 * Pre-fault in the user memory, so we can use atomic copies.
196 */ 147 */
@@ -226,52 +177,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
226} 177}
227 178
228/** 179/**
229 * generic_pipe_buf_map - virtually map a pipe buffer
230 * @pipe: the pipe that the buffer belongs to
231 * @buf: the buffer that should be mapped
232 * @atomic: whether to use an atomic map
233 *
234 * Description:
235 * This function returns a kernel virtual address mapping for the
236 * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
237 * and the caller has to be careful not to fault before calling
238 * the unmap function.
239 *
240 * Note that this function calls kmap_atomic() if @atomic != 0.
241 */
242void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
243 struct pipe_buffer *buf, int atomic)
244{
245 if (atomic) {
246 buf->flags |= PIPE_BUF_FLAG_ATOMIC;
247 return kmap_atomic(buf->page);
248 }
249
250 return kmap(buf->page);
251}
252EXPORT_SYMBOL(generic_pipe_buf_map);
253
254/**
255 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
256 * @pipe: the pipe that the buffer belongs to
257 * @buf: the buffer that should be unmapped
258 * @map_data: the data that the mapping function returned
259 *
260 * Description:
261 * This function undoes the mapping that ->map() provided.
262 */
263void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
264 struct pipe_buffer *buf, void *map_data)
265{
266 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
267 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
268 kunmap_atomic(map_data);
269 } else
270 kunmap(buf->page);
271}
272EXPORT_SYMBOL(generic_pipe_buf_unmap);
273
274/**
275 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 180 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
276 * @pipe: the pipe that the buffer belongs to 181 * @pipe: the pipe that the buffer belongs to
277 * @buf: the buffer to attempt to steal 182 * @buf: the buffer to attempt to steal
@@ -351,8 +256,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release);
351 256
352static const struct pipe_buf_operations anon_pipe_buf_ops = { 257static const struct pipe_buf_operations anon_pipe_buf_ops = {
353 .can_merge = 1, 258 .can_merge = 1,
354 .map = generic_pipe_buf_map,
355 .unmap = generic_pipe_buf_unmap,
356 .confirm = generic_pipe_buf_confirm, 259 .confirm = generic_pipe_buf_confirm,
357 .release = anon_pipe_buf_release, 260 .release = anon_pipe_buf_release,
358 .steal = generic_pipe_buf_steal, 261 .steal = generic_pipe_buf_steal,
@@ -361,8 +264,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
361 264
362static const struct pipe_buf_operations packet_pipe_buf_ops = { 265static const struct pipe_buf_operations packet_pipe_buf_ops = {
363 .can_merge = 0, 266 .can_merge = 0,
364 .map = generic_pipe_buf_map,
365 .unmap = generic_pipe_buf_unmap,
366 .confirm = generic_pipe_buf_confirm, 267 .confirm = generic_pipe_buf_confirm,
367 .release = anon_pipe_buf_release, 268 .release = anon_pipe_buf_release,
368 .steal = generic_pipe_buf_steal, 269 .steal = generic_pipe_buf_steal,
@@ -379,12 +280,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
379 ssize_t ret; 280 ssize_t ret;
380 struct iovec *iov = (struct iovec *)_iov; 281 struct iovec *iov = (struct iovec *)_iov;
381 size_t total_len; 282 size_t total_len;
283 struct iov_iter iter;
382 284
383 total_len = iov_length(iov, nr_segs); 285 total_len = iov_length(iov, nr_segs);
384 /* Null read succeeds. */ 286 /* Null read succeeds. */
385 if (unlikely(total_len == 0)) 287 if (unlikely(total_len == 0))
386 return 0; 288 return 0;
387 289
290 iov_iter_init(&iter, iov, nr_segs, total_len, 0);
291
388 do_wakeup = 0; 292 do_wakeup = 0;
389 ret = 0; 293 ret = 0;
390 __pipe_lock(pipe); 294 __pipe_lock(pipe);
@@ -394,9 +298,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
394 int curbuf = pipe->curbuf; 298 int curbuf = pipe->curbuf;
395 struct pipe_buffer *buf = pipe->bufs + curbuf; 299 struct pipe_buffer *buf = pipe->bufs + curbuf;
396 const struct pipe_buf_operations *ops = buf->ops; 300 const struct pipe_buf_operations *ops = buf->ops;
397 void *addr;
398 size_t chars = buf->len; 301 size_t chars = buf->len;
399 int error, atomic; 302 size_t written;
303 int error;
400 304
401 if (chars > total_len) 305 if (chars > total_len)
402 chars = total_len; 306 chars = total_len;
@@ -408,21 +312,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
408 break; 312 break;
409 } 313 }
410 314
411 atomic = !iov_fault_in_pages_write(iov, chars); 315 written = copy_page_to_iter(buf->page, buf->offset, chars, &iter);
412redo: 316 if (unlikely(written < chars)) {
413 addr = ops->map(pipe, buf, atomic);
414 error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
415 ops->unmap(pipe, buf, addr);
416 if (unlikely(error)) {
417 /*
418 * Just retry with the slow path if we failed.
419 */
420 if (atomic) {
421 atomic = 0;
422 goto redo;
423 }
424 if (!ret) 317 if (!ret)
425 ret = error; 318 ret = -EFAULT;
426 break; 319 break;
427 } 320 }
428 ret += chars; 321 ret += chars;
@@ -538,10 +431,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
538 431
539 iov_fault_in_pages_read(iov, chars); 432 iov_fault_in_pages_read(iov, chars);
540redo1: 433redo1:
541 addr = ops->map(pipe, buf, atomic); 434 if (atomic)
435 addr = kmap_atomic(buf->page);
436 else
437 addr = kmap(buf->page);
542 error = pipe_iov_copy_from_user(offset + addr, iov, 438 error = pipe_iov_copy_from_user(offset + addr, iov,
543 chars, atomic); 439 chars, atomic);
544 ops->unmap(pipe, buf, addr); 440 if (atomic)
441 kunmap_atomic(addr);
442 else
443 kunmap(buf->page);
545 ret = error; 444 ret = error;
546 do_wakeup = 1; 445 do_wakeup = 1;
547 if (error) { 446 if (error) {
diff --git a/fs/pnode.c b/fs/pnode.c
index 88396df725b4..302bf22c4a30 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m,
164 } 164 }
165} 165}
166 166
167/* 167static struct mount *next_group(struct mount *m, struct mount *origin)
168 * return the source mount to be used for cloning
169 *
170 * @dest the current destination mount
171 * @last_dest the last seen destination mount
172 * @last_src the last seen source mount
173 * @type return CL_SLAVE if the new mount has to be
174 * cloned as a slave.
175 */
176static struct mount *get_source(struct mount *dest,
177 struct mount *last_dest,
178 struct mount *last_src,
179 int *type)
180{ 168{
181 struct mount *p_last_src = NULL; 169 while (1) {
182 struct mount *p_last_dest = NULL; 170 while (1) {
183 171 struct mount *next;
184 while (last_dest != dest->mnt_master) { 172 if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
185 p_last_dest = last_dest; 173 return first_slave(m);
186 p_last_src = last_src; 174 next = next_peer(m);
187 last_dest = last_dest->mnt_master; 175 if (m->mnt_group_id == origin->mnt_group_id) {
188 last_src = last_src->mnt_master; 176 if (next == origin)
177 return NULL;
178 } else if (m->mnt_slave.next != &next->mnt_slave)
179 break;
180 m = next;
181 }
182 /* m is the last peer */
183 while (1) {
184 struct mount *master = m->mnt_master;
185 if (m->mnt_slave.next != &master->mnt_slave_list)
186 return next_slave(m);
187 m = next_peer(master);
188 if (master->mnt_group_id == origin->mnt_group_id)
189 break;
190 if (master->mnt_slave.next == &m->mnt_slave)
191 break;
192 m = master;
193 }
194 if (m == origin)
195 return NULL;
189 } 196 }
197}
190 198
191 if (p_last_dest) { 199/* all accesses are serialized by namespace_sem */
192 do { 200static struct user_namespace *user_ns;
193 p_last_dest = next_peer(p_last_dest); 201static struct mount *last_dest, *last_source, *dest_master;
194 } while (IS_MNT_NEW(p_last_dest)); 202static struct mountpoint *mp;
195 /* is that a peer of the earlier? */ 203static struct hlist_head *list;
196 if (dest == p_last_dest) { 204
197 *type = CL_MAKE_SHARED; 205static int propagate_one(struct mount *m)
198 return p_last_src; 206{
207 struct mount *child;
208 int type;
209 /* skip ones added by this propagate_mnt() */
210 if (IS_MNT_NEW(m))
211 return 0;
212 /* skip if mountpoint isn't covered by it */
213 if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
214 return 0;
215 if (m->mnt_group_id == last_dest->mnt_group_id) {
216 type = CL_MAKE_SHARED;
217 } else {
218 struct mount *n, *p;
219 for (n = m; ; n = p) {
220 p = n->mnt_master;
221 if (p == dest_master || IS_MNT_MARKED(p)) {
222 while (last_dest->mnt_master != p) {
223 last_source = last_source->mnt_master;
224 last_dest = last_source->mnt_parent;
225 }
226 if (n->mnt_group_id != last_dest->mnt_group_id) {
227 last_source = last_source->mnt_master;
228 last_dest = last_source->mnt_parent;
229 }
230 break;
231 }
199 } 232 }
233 type = CL_SLAVE;
234 /* beginning of peer group among the slaves? */
235 if (IS_MNT_SHARED(m))
236 type |= CL_MAKE_SHARED;
200 } 237 }
201 /* slave of the earlier, then */ 238
202 *type = CL_SLAVE; 239 /* Notice when we are propagating across user namespaces */
203 /* beginning of peer group among the slaves? */ 240 if (m->mnt_ns->user_ns != user_ns)
204 if (IS_MNT_SHARED(dest)) 241 type |= CL_UNPRIVILEGED;
205 *type |= CL_MAKE_SHARED; 242 child = copy_tree(last_source, last_source->mnt.mnt_root, type);
206 return last_src; 243 if (IS_ERR(child))
244 return PTR_ERR(child);
245 mnt_set_mountpoint(m, mp, child);
246 last_dest = m;
247 last_source = child;
248 if (m->mnt_master != dest_master) {
249 read_seqlock_excl(&mount_lock);
250 SET_MNT_MARK(m->mnt_master);
251 read_sequnlock_excl(&mount_lock);
252 }
253 hlist_add_head(&child->mnt_hash, list);
254 return 0;
207} 255}
208 256
209/* 257/*
@@ -222,56 +270,48 @@ static struct mount *get_source(struct mount *dest,
222int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, 270int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
223 struct mount *source_mnt, struct hlist_head *tree_list) 271 struct mount *source_mnt, struct hlist_head *tree_list)
224{ 272{
225 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; 273 struct mount *m, *n;
226 struct mount *m, *child;
227 int ret = 0; 274 int ret = 0;
228 struct mount *prev_dest_mnt = dest_mnt; 275
229 struct mount *prev_src_mnt = source_mnt; 276 /*
230 HLIST_HEAD(tmp_list); 277 * we don't want to bother passing tons of arguments to
231 278 * propagate_one(); everything is serialized by namespace_sem,
232 for (m = propagation_next(dest_mnt, dest_mnt); m; 279 * so globals will do just fine.
233 m = propagation_next(m, dest_mnt)) { 280 */
234 int type; 281 user_ns = current->nsproxy->mnt_ns->user_ns;
235 struct mount *source; 282 last_dest = dest_mnt;
236 283 last_source = source_mnt;
237 if (IS_MNT_NEW(m)) 284 mp = dest_mp;
238 continue; 285 list = tree_list;
239 286 dest_master = dest_mnt->mnt_master;
240 source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); 287
241 288 /* all peers of dest_mnt, except dest_mnt itself */
242 /* Notice when we are propagating across user namespaces */ 289 for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
243 if (m->mnt_ns->user_ns != user_ns) 290 ret = propagate_one(n);
244 type |= CL_UNPRIVILEGED; 291 if (ret)
245
246 child = copy_tree(source, source->mnt.mnt_root, type);
247 if (IS_ERR(child)) {
248 ret = PTR_ERR(child);
249 tmp_list = *tree_list;
250 tmp_list.first->pprev = &tmp_list.first;
251 INIT_HLIST_HEAD(tree_list);
252 goto out; 292 goto out;
253 } 293 }
254 294
255 if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { 295 /* all slave groups */
256 mnt_set_mountpoint(m, dest_mp, child); 296 for (m = next_group(dest_mnt, dest_mnt); m;
257 hlist_add_head(&child->mnt_hash, tree_list); 297 m = next_group(m, dest_mnt)) {
258 } else { 298 /* everything in that slave group */
259 /* 299 n = m;
260 * This can happen if the parent mount was bind mounted 300 do {
261 * on some subdirectory of a shared/slave mount. 301 ret = propagate_one(n);
262 */ 302 if (ret)
263 hlist_add_head(&child->mnt_hash, &tmp_list); 303 goto out;
264 } 304 n = next_peer(n);
265 prev_dest_mnt = m; 305 } while (n != m);
266 prev_src_mnt = child;
267 } 306 }
268out: 307out:
269 lock_mount_hash(); 308 read_seqlock_excl(&mount_lock);
270 while (!hlist_empty(&tmp_list)) { 309 hlist_for_each_entry(n, tree_list, mnt_hash) {
271 child = hlist_entry(tmp_list.first, struct mount, mnt_hash); 310 m = n->mnt_parent;
272 umount_tree(child, 0); 311 if (m->mnt_master != dest_mnt->mnt_master)
312 CLEAR_MNT_MARK(m->mnt_master);
273 } 313 }
274 unlock_mount_hash(); 314 read_sequnlock_excl(&mount_lock);
275 return ret; 315 return ret;
276} 316}
277 317
diff --git a/fs/pnode.h b/fs/pnode.h
index fc28a27fa892..4a246358b031 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -16,6 +16,9 @@
16#define IS_MNT_NEW(m) (!(m)->mnt_ns) 16#define IS_MNT_NEW(m) (!(m)->mnt_ns)
17#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) 17#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
18#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) 18#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
19#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
20#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
21#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
19 22
20#define CL_EXPIRE 0x01 23#define CL_EXPIRE 0x01
21#define CL_SLAVE 0x02 24#define CL_SLAVE 0x02
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 11c54fd51e16..0855f772cd41 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
246 umode_t mode = 0; 246 umode_t mode = 0;
247 int not_equiv = 0; 247 int not_equiv = 0;
248 248
249 /*
250 * A null ACL can always be presented as mode bits.
251 */
252 if (!acl)
253 return 0;
254
249 FOREACH_ACL_ENTRY(pa, acl, pe) { 255 FOREACH_ACL_ENTRY(pa, acl, pe) {
250 switch (pa->e_tag) { 256 switch (pa->e_tag) {
251 case ACL_USER_OBJ: 257 case ACL_USER_OBJ:
@@ -723,7 +729,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
723 void *buffer, size_t size) 729 void *buffer, size_t size)
724{ 730{
725 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; 731 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
726 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; 732 posix_acl_xattr_entry *ext_entry;
727 int real_size, n; 733 int real_size, n;
728 734
729 real_size = posix_acl_xattr_size(acl->a_count); 735 real_size = posix_acl_xattr_size(acl->a_count);
@@ -731,7 +737,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
731 return real_size; 737 return real_size;
732 if (real_size > size) 738 if (real_size > size)
733 return -ERANGE; 739 return -ERANGE;
734 740
741 ext_entry = ext_acl->a_entries;
735 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); 742 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
736 743
737 for (n=0; n < acl->a_count; n++, ext_entry++) { 744 for (n=0; n < acl->a_count; n++, ext_entry++) {
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ab30716584f5..239493ec718e 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -27,6 +27,5 @@ proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
27proc-$(CONFIG_NET) += proc_net.o 27proc-$(CONFIG_NET) += proc_net.o
28proc-$(CONFIG_PROC_KCORE) += kcore.o 28proc-$(CONFIG_PROC_KCORE) += kcore.o
29proc-$(CONFIG_PROC_VMCORE) += vmcore.o 29proc-$(CONFIG_PROC_VMCORE) += vmcore.o
30proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
31proc-$(CONFIG_PRINTK) += kmsg.o 30proc-$(CONFIG_PRINTK) += kmsg.o
32proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o 31proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 656e401794de..64db2bceac59 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -138,8 +138,8 @@ static const char * const task_state_array[] = {
138 "D (disk sleep)", /* 2 */ 138 "D (disk sleep)", /* 2 */
139 "T (stopped)", /* 4 */ 139 "T (stopped)", /* 4 */
140 "t (tracing stop)", /* 8 */ 140 "t (tracing stop)", /* 8 */
141 "Z (zombie)", /* 16 */ 141 "X (dead)", /* 16 */
142 "X (dead)", /* 32 */ 142 "Z (zombie)", /* 32 */
143}; 143};
144 144
145static inline const char *get_task_state(struct task_struct *tsk) 145static inline const char *get_task_state(struct task_struct *tsk)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b9760628e1fd..2d696b0c93bf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -200,41 +200,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
200 return result; 200 return result;
201} 201}
202 202
203static int proc_pid_cmdline(struct task_struct *task, char * buffer) 203static int proc_pid_cmdline(struct task_struct *task, char *buffer)
204{ 204{
205 int res = 0; 205 return get_cmdline(task, buffer, PAGE_SIZE);
206 unsigned int len;
207 struct mm_struct *mm = get_task_mm(task);
208 if (!mm)
209 goto out;
210 if (!mm->arg_end)
211 goto out_mm; /* Shh! No looking before we're done */
212
213 len = mm->arg_end - mm->arg_start;
214
215 if (len > PAGE_SIZE)
216 len = PAGE_SIZE;
217
218 res = access_process_vm(task, mm->arg_start, buffer, len, 0);
219
220 // If the nul at the end of args has been overwritten, then
221 // assume application is using setproctitle(3).
222 if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
223 len = strnlen(buffer, res);
224 if (len < res) {
225 res = len;
226 } else {
227 len = mm->env_end - mm->env_start;
228 if (len > PAGE_SIZE - res)
229 len = PAGE_SIZE - res;
230 res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
231 res = strnlen(buffer, res);
232 }
233 }
234out_mm:
235 mmput(mm);
236out:
237 return res;
238} 206}
239 207
240static int proc_pid_auxv(struct task_struct *task, char *buffer) 208static int proc_pid_auxv(struct task_struct *task, char *buffer)
@@ -1236,6 +1204,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
1236 make_it_fail = simple_strtol(strstrip(buffer), &end, 0); 1204 make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
1237 if (*end) 1205 if (*end)
1238 return -EINVAL; 1206 return -EINVAL;
1207 if (make_it_fail < 0 || make_it_fail > 1)
1208 return -EINVAL;
1209
1239 task = get_proc_task(file_inode(file)); 1210 task = get_proc_task(file_inode(file));
1240 if (!task) 1211 if (!task)
1241 return -ESRCH; 1212 return -ESRCH;
@@ -2588,7 +2559,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2588 REG("environ", S_IRUSR, proc_environ_operations), 2559 REG("environ", S_IRUSR, proc_environ_operations),
2589 INF("auxv", S_IRUSR, proc_pid_auxv), 2560 INF("auxv", S_IRUSR, proc_pid_auxv),
2590 ONE("status", S_IRUGO, proc_pid_status), 2561 ONE("status", S_IRUGO, proc_pid_status),
2591 ONE("personality", S_IRUGO, proc_pid_personality), 2562 ONE("personality", S_IRUSR, proc_pid_personality),
2592 INF("limits", S_IRUGO, proc_pid_limits), 2563 INF("limits", S_IRUGO, proc_pid_limits),
2593#ifdef CONFIG_SCHED_DEBUG 2564#ifdef CONFIG_SCHED_DEBUG
2594 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2565 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2598,7 +2569,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2598#endif 2569#endif
2599 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2570 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2600#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2571#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2601 INF("syscall", S_IRUGO, proc_pid_syscall), 2572 INF("syscall", S_IRUSR, proc_pid_syscall),
2602#endif 2573#endif
2603 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2574 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2604 ONE("stat", S_IRUGO, proc_tgid_stat), 2575 ONE("stat", S_IRUGO, proc_tgid_stat),
@@ -2617,7 +2588,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2617#ifdef CONFIG_PROC_PAGE_MONITOR 2588#ifdef CONFIG_PROC_PAGE_MONITOR
2618 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2589 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2619 REG("smaps", S_IRUGO, proc_pid_smaps_operations), 2590 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
2620 REG("pagemap", S_IRUGO, proc_pagemap_operations), 2591 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2621#endif 2592#endif
2622#ifdef CONFIG_SECURITY 2593#ifdef CONFIG_SECURITY
2623 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2594 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2626,7 +2597,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2626 INF("wchan", S_IRUGO, proc_pid_wchan), 2597 INF("wchan", S_IRUGO, proc_pid_wchan),
2627#endif 2598#endif
2628#ifdef CONFIG_STACKTRACE 2599#ifdef CONFIG_STACKTRACE
2629 ONE("stack", S_IRUGO, proc_pid_stack), 2600 ONE("stack", S_IRUSR, proc_pid_stack),
2630#endif 2601#endif
2631#ifdef CONFIG_SCHEDSTATS 2602#ifdef CONFIG_SCHEDSTATS
2632 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2603 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -2927,14 +2898,14 @@ static const struct pid_entry tid_base_stuff[] = {
2927 REG("environ", S_IRUSR, proc_environ_operations), 2898 REG("environ", S_IRUSR, proc_environ_operations),
2928 INF("auxv", S_IRUSR, proc_pid_auxv), 2899 INF("auxv", S_IRUSR, proc_pid_auxv),
2929 ONE("status", S_IRUGO, proc_pid_status), 2900 ONE("status", S_IRUGO, proc_pid_status),
2930 ONE("personality", S_IRUGO, proc_pid_personality), 2901 ONE("personality", S_IRUSR, proc_pid_personality),
2931 INF("limits", S_IRUGO, proc_pid_limits), 2902 INF("limits", S_IRUGO, proc_pid_limits),
2932#ifdef CONFIG_SCHED_DEBUG 2903#ifdef CONFIG_SCHED_DEBUG
2933 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2904 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2934#endif 2905#endif
2935 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2906 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2936#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2907#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2937 INF("syscall", S_IRUGO, proc_pid_syscall), 2908 INF("syscall", S_IRUSR, proc_pid_syscall),
2938#endif 2909#endif
2939 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2910 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2940 ONE("stat", S_IRUGO, proc_tid_stat), 2911 ONE("stat", S_IRUGO, proc_tid_stat),
@@ -2955,7 +2926,7 @@ static const struct pid_entry tid_base_stuff[] = {
2955#ifdef CONFIG_PROC_PAGE_MONITOR 2926#ifdef CONFIG_PROC_PAGE_MONITOR
2956 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2927 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2957 REG("smaps", S_IRUGO, proc_tid_smaps_operations), 2928 REG("smaps", S_IRUGO, proc_tid_smaps_operations),
2958 REG("pagemap", S_IRUGO, proc_pagemap_operations), 2929 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2959#endif 2930#endif
2960#ifdef CONFIG_SECURITY 2931#ifdef CONFIG_SECURITY
2961 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2932 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2964,7 +2935,7 @@ static const struct pid_entry tid_base_stuff[] = {
2964 INF("wchan", S_IRUGO, proc_pid_wchan), 2935 INF("wchan", S_IRUGO, proc_pid_wchan),
2965#endif 2936#endif
2966#ifdef CONFIG_STACKTRACE 2937#ifdef CONFIG_STACKTRACE
2967 ONE("stack", S_IRUGO, proc_pid_stack), 2938 ONE("stack", S_IRUSR, proc_pid_stack),
2968#endif 2939#endif
2969#ifdef CONFIG_SCHEDSTATS 2940#ifdef CONFIG_SCHEDSTATS
2970 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2941 INF("schedstat", S_IRUGO, proc_pid_schedstat),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 985ea881b5bc..0788d093f5d8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/proc_fs.h> 12#include <linux/proc_fs.h>
13 13
14#include "../mount.h"
14#include "internal.h" 15#include "internal.h"
15#include "fd.h" 16#include "fd.h"
16 17
@@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v)
48 } 49 }
49 50
50 if (!ret) { 51 if (!ret) {
51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", 52 seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
52 (long long)file->f_pos, f_flags); 53 (long long)file->f_pos, f_flags,
54 real_mount(file->f_path.mnt)->mnt_id);
53 if (file->f_op->show_fdinfo) 55 if (file->f_op->show_fdinfo)
54 ret = file->f_op->show_fdinfo(m, file); 56 ret = file->f_op->show_fdinfo(m, file);
55 fput(file); 57 fput(file);
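With the extra field, user space can map an open descriptor to the mount it came from by matching mnt_id against the first column of /proc/self/mountinfo. A small illustrative reader (the file name is arbitrary; assumes a kernel with this patch):

    /* Illustrative only: dump pos/flags/mnt_id for an open fd. */
    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
        char path[64], line[128];
        int fd = open("/etc/hostname", O_RDONLY);
        FILE *f;

        if (fd < 0)
            return 1;
        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
        f = fopen(path, "r");
        while (f && fgets(line, sizeof(line), f))
            fputs(line, stdout);    /* pos:, flags:, mnt_id: */
        if (f)
            fclose(f);
        return 0;
    }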
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 124fc43c7090..0adbc02d60e3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,7 +35,7 @@ static void proc_evict_inode(struct inode *inode)
35 const struct proc_ns_operations *ns_ops; 35 const struct proc_ns_operations *ns_ops;
36 void *ns; 36 void *ns;
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages_final(&inode->i_data);
39 clear_inode(inode); 39 clear_inode(inode);
40 40
41 /* Stop tracking associated processes */ 41 /* Stop tracking associated processes */
@@ -47,7 +47,7 @@ static void proc_evict_inode(struct inode *inode)
47 pde_put(de); 47 pde_put(de);
48 head = PROC_I(inode)->sysctl; 48 head = PROC_I(inode)->sysctl;
49 if (head) { 49 if (head) {
50 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); 50 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
51 sysctl_head_put(head); 51 sysctl_head_put(head);
52 } 52 }
53 /* Release any associated namespace */ 53 /* Release any associated namespace */
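Swapping rcu_assign_pointer() for RCU_INIT_POINTER() here is safe because the stored value is NULL: the assign variant exists to order the pointee's initialization before publication, and a NULL store publishes nothing, so the barrier can be dropped.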
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 651d09a11dde..3ab6d14e71c5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -211,13 +211,6 @@ extern int proc_fill_super(struct super_block *);
211extern void proc_entry_rundown(struct proc_dir_entry *); 211extern void proc_entry_rundown(struct proc_dir_entry *);
212 212
213/* 213/*
214 * proc_devtree.c
215 */
216#ifdef CONFIG_PROC_DEVICETREE
217extern void proc_device_tree_init(void);
218#endif
219
220/*
221 * proc_namespaces.c 214 * proc_namespaces.c
222 */ 215 */
223extern const struct inode_operations proc_ns_dir_inode_operations; 216extern const struct inode_operations proc_ns_dir_inode_operations;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 136e548d9567..7445af0b1aa3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -73,7 +73,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
73 available += pagecache; 73 available += pagecache;
74 74
75 /* 75 /*
76 * Part of the reclaimable swap consists of items that are in use, 76 * Part of the reclaimable slab consists of items that are in use,
77 * and cannot be freed. Cap this estimate at the low watermark. 77 * and cannot be freed. Cap this estimate at the low watermark.
78 */ 78 */
79 available += global_page_state(NR_SLAB_RECLAIMABLE) - 79 available += global_page_state(NR_SLAB_RECLAIMABLE) -
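For context, this comment sits in the MemAvailable estimate, which (roughly) starts from free memory minus the watermark reserve and then credits page cache and reclaimable slab, discounting each by min(half of it, low watermark) as likely unfreeable. The one-word fix matters because the line being described really is about NR_SLAB_RECLAIMABLE, not swap.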
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 9ae46b87470d..89026095f2b5 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
146 struct task_struct *task; 146 struct task_struct *task;
147 void *ns; 147 void *ns;
148 char name[50]; 148 char name[50];
149 int len = -EACCES; 149 int res = -EACCES;
150 150
151 task = get_proc_task(inode); 151 task = get_proc_task(inode);
152 if (!task) 152 if (!task)
@@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
155 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 155 if (!ptrace_may_access(task, PTRACE_MODE_READ))
156 goto out_put_task; 156 goto out_put_task;
157 157
158 len = -ENOENT; 158 res = -ENOENT;
159 ns = ns_ops->get(task); 159 ns = ns_ops->get(task);
160 if (!ns) 160 if (!ns)
161 goto out_put_task; 161 goto out_put_task;
162 162
163 snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); 163 snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
164 len = strlen(name); 164 res = readlink_copy(buffer, buflen, name);
165
166 if (len > buflen)
167 len = buflen;
168 if (copy_to_user(buffer, name, len))
169 len = -EFAULT;
170
171 ns_ops->put(ns); 165 ns_ops->put(ns);
172out_put_task: 166out_put_task:
173 put_task_struct(task); 167 put_task_struct(task);
174out: 168out:
175 return len; 169 return res;
176} 170}
177 171
178static const struct inode_operations proc_ns_link_inode_operations = { 172static const struct inode_operations proc_ns_link_inode_operations = {
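readlink_copy() is the old vfs_readlink() copy-out logic under a name that says what it does. A rough reconstruction for context (not guaranteed verbatim):

    int readlink_copy(char __user *buffer, int buflen, const char *link)
    {
        int len = PTR_ERR(link);

        if (IS_ERR(link))
            goto out;
        len = strlen(link);
        if (len > (unsigned int) buflen)
            len = buflen;
        if (copy_to_user(buffer, link, len))
            len = -EFAULT;
    out:
        return len;
    }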
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
deleted file mode 100644
index c82dd5147845..000000000000
--- a/fs/proc/proc_devtree.c
+++ /dev/null
@@ -1,241 +0,0 @@
1/*
2 * proc_devtree.c - handles /proc/device-tree
3 *
4 * Copyright 1997 Paul Mackerras
5 */
6#include <linux/errno.h>
7#include <linux/init.h>
8#include <linux/time.h>
9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
11#include <linux/printk.h>
12#include <linux/stat.h>
13#include <linux/string.h>
14#include <linux/of.h>
15#include <linux/export.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h>
18#include "internal.h"
19
20static inline void set_node_proc_entry(struct device_node *np,
21 struct proc_dir_entry *de)
22{
23 np->pde = de;
24}
25
26static struct proc_dir_entry *proc_device_tree;
27
28/*
29 * Supply data on a read from /proc/device-tree/node/property.
30 */
31static int property_proc_show(struct seq_file *m, void *v)
32{
33 struct property *pp = m->private;
34
35 seq_write(m, pp->value, pp->length);
36 return 0;
37}
38
39static int property_proc_open(struct inode *inode, struct file *file)
40{
41 return single_open(file, property_proc_show, __PDE_DATA(inode));
42}
43
44static const struct file_operations property_proc_fops = {
45 .owner = THIS_MODULE,
46 .open = property_proc_open,
47 .read = seq_read,
48 .llseek = seq_lseek,
49 .release = single_release,
50};
51
52/*
53 * For a node with a name like "gc@10", we make symlinks called "gc"
54 * and "@10" to it.
55 */
56
57/*
58 * Add a property to a node
59 */
60static struct proc_dir_entry *
61__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
62 const char *name)
63{
64 struct proc_dir_entry *ent;
65
66 /*
67 * Unfortunately proc_register puts each new entry
68 * at the beginning of the list. So we rearrange them.
69 */
70 ent = proc_create_data(name,
71 strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
72 de, &property_proc_fops, pp);
73 if (ent == NULL)
74 return NULL;
75
76 if (!strncmp(name, "security-", 9))
77 proc_set_size(ent, 0); /* don't leak number of password chars */
78 else
79 proc_set_size(ent, pp->length);
80
81 return ent;
82}
83
84
85void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop)
86{
87 __proc_device_tree_add_prop(pde, prop, prop->name);
88}
89
90void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
91 struct property *prop)
92{
93 remove_proc_entry(prop->name, pde);
94}
95
96void proc_device_tree_update_prop(struct proc_dir_entry *pde,
97 struct property *newprop,
98 struct property *oldprop)
99{
100 struct proc_dir_entry *ent;
101
102 if (!oldprop) {
103 proc_device_tree_add_prop(pde, newprop);
104 return;
105 }
106
107 for (ent = pde->subdir; ent != NULL; ent = ent->next)
108 if (ent->data == oldprop)
109 break;
110 if (ent == NULL) {
111 pr_warn("device-tree: property \"%s\" does not exist\n",
112 oldprop->name);
113 } else {
114 ent->data = newprop;
115 ent->size = newprop->length;
116 }
117}
118
119/*
120 * Various dodgy firmware might give us nodes and/or properties with
121 * conflicting names. That's generally ok, except for exporting via /proc,
122 * so munge names here to ensure they're unique.
123 */
124
125static int duplicate_name(struct proc_dir_entry *de, const char *name)
126{
127 struct proc_dir_entry *ent;
128 int found = 0;
129
130 spin_lock(&proc_subdir_lock);
131
132 for (ent = de->subdir; ent != NULL; ent = ent->next) {
133 if (strcmp(ent->name, name) == 0) {
134 found = 1;
135 break;
136 }
137 }
138
139 spin_unlock(&proc_subdir_lock);
140
141 return found;
142}
143
144static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,
145 const char *name)
146{
147 char *fixed_name;
148 int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */
149 int i = 1, size;
150
151realloc:
152 fixed_name = kmalloc(fixup_len, GFP_KERNEL);
153 if (fixed_name == NULL) {
154 pr_err("device-tree: Out of memory trying to fixup "
155 "name \"%s\"\n", name);
156 return name;
157 }
158
159retry:
160 size = snprintf(fixed_name, fixup_len, "%s#%d", name, i);
161 size++; /* account for NULL */
162
163 if (size > fixup_len) {
164 /* We ran out of space, free and reallocate. */
165 kfree(fixed_name);
166 fixup_len = size;
167 goto realloc;
168 }
169
170 if (duplicate_name(de, fixed_name)) {
171 /* Multiple duplicates. Retry with a different offset. */
172 i++;
173 goto retry;
174 }
175
176 pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n",
177 np->full_name, fixed_name);
178
179 return fixed_name;
180}
181
182/*
183 * Process a node, adding entries for its children and its properties.
184 */
185void proc_device_tree_add_node(struct device_node *np,
186 struct proc_dir_entry *de)
187{
188 struct property *pp;
189 struct proc_dir_entry *ent;
190 struct device_node *child;
191 const char *p;
192
193 set_node_proc_entry(np, de);
194 for (child = NULL; (child = of_get_next_child(np, child));) {
195 /* Use everything after the last slash, or the full name */
196 p = kbasename(child->full_name);
197
198 if (duplicate_name(de, p))
199 p = fixup_name(np, de, p);
200
201 ent = proc_mkdir(p, de);
202 if (ent == NULL)
203 break;
204 proc_device_tree_add_node(child, ent);
205 }
206 of_node_put(child);
207
208 for (pp = np->properties; pp != NULL; pp = pp->next) {
209 p = pp->name;
210
211 if (strchr(p, '/'))
212 continue;
213
214 if (duplicate_name(de, p))
215 p = fixup_name(np, de, p);
216
217 ent = __proc_device_tree_add_prop(de, pp, p);
218 if (ent == NULL)
219 break;
220 }
221}
222
223/*
224 * Called on initialization to set up the /proc/device-tree subtree
225 */
226void __init proc_device_tree_init(void)
227{
228 struct device_node *root;
229
230 proc_device_tree = proc_mkdir("device-tree", NULL);
231 if (proc_device_tree == NULL)
232 return;
233 root = of_find_node_by_path("/");
234 if (root == NULL) {
235 remove_proc_entry("device-tree", NULL);
236 pr_debug("/proc/device-tree: can't find root\n");
237 return;
238 }
239 proc_device_tree_add_node(root, proc_device_tree);
240 of_node_put(root);
241}
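Deleting this file does not remove the interface: as of this merge, /proc/device-tree is expected to be a compatibility symlink to /sys/firmware/devicetree/base, created by the OF core, so the flattened-tree view survives without any proc-specific plumbing.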
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 87dbcbef7fe4..5dbadecb234d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
92int proc_remount(struct super_block *sb, int *flags, char *data) 92int proc_remount(struct super_block *sb, int *flags, char *data)
93{ 93{
94 struct pid_namespace *pid = sb->s_fs_info; 94 struct pid_namespace *pid = sb->s_fs_info;
95
96 sync_filesystem(sb);
95 return !proc_parse_options(data, pid); 97 return !proc_parse_options(data, pid);
96} 98}
97 99
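This is the first of many hunks in this merge with the same shape: do_remount_sb() no longer calls sync_filesystem() for every filesystem (see the fs/super.c hunk further down), so each ->remount_fs() that needs dirty data written out before options change now does the sync itself.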
@@ -183,9 +185,6 @@ void __init proc_root_init(void)
183 proc_mkdir("openprom", NULL); 185 proc_mkdir("openprom", NULL);
184#endif 186#endif
185 proc_tty_init(); 187 proc_tty_init();
186#ifdef CONFIG_PROC_DEVICETREE
187 proc_device_tree_init();
188#endif
189 proc_mkdir("bus", NULL); 188 proc_mkdir("bus", NULL);
190 proc_sys_init(); 189 proc_sys_init();
191} 190}
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ffeb202ec942..4348bb8907c2 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
16 if (!tgid) 16 if (!tgid)
17 return -ENOENT; 17 return -ENOENT;
18 sprintf(tmp, "%d", tgid); 18 sprintf(tmp, "%d", tgid);
19 return vfs_readlink(dentry,buffer,buflen,tmp); 19 return readlink_copy(buffer, buflen, tmp);
20} 20}
21 21
22static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) 22static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6f599c62f0cc..9d231e9e5f0e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,7 +9,7 @@
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/irqnr.h> 11#include <linux/irqnr.h>
12#include <asm/cputime.h> 12#include <linux/cputime.h>
13#include <linux/tick.h> 13#include <linux/tick.h>
14 14
15#ifndef arch_irq_stat_cpu 15#ifndef arch_irq_stat_cpu
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fb52b548080d..c4b2646b6d7c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/vmacache.h>
2#include <linux/hugetlb.h> 3#include <linux/hugetlb.h>
3#include <linux/huge_mm.h> 4#include <linux/huge_mm.h>
4#include <linux/mount.h> 5#include <linux/mount.h>
@@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
152 153
153 /* 154 /*
154 * We remember last_addr rather than next_addr to hit with 155 * We remember last_addr rather than next_addr to hit with
155 * mmap_cache most of the time. We have zero last_addr at 156 * vmacache most of the time. We have zero last_addr at
156 * the beginning and also after lseek. We will have -1 last_addr 157 * the beginning and also after lseek. We will have -1 last_addr
157 * after the end of the vmas. 158 * after the end of the vmas.
158 */ 159 */
@@ -1350,7 +1351,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1350 struct numa_maps *md; 1351 struct numa_maps *md;
1351 struct page *page; 1352 struct page *page;
1352 1353
1353 if (pte_none(*pte)) 1354 if (!pte_present(*pte))
1354 return 0; 1355 return 0;
1355 1356
1356 page = pte_page(*pte); 1357 page = pte_page(*pte);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 7141b8d0ca9e..33de567c25af 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,7 +5,7 @@
5#include <linux/seq_file.h> 5#include <linux/seq_file.h>
6#include <linux/time.h> 6#include <linux/time.h>
7#include <linux/kernel_stat.h> 7#include <linux/kernel_stat.h>
8#include <asm/cputime.h> 8#include <linux/cputime.h>
9 9
10static int uptime_proc_show(struct seq_file *m, void *v) 10static int uptime_proc_show(struct seq_file *m, void *v)
11{ 11{
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 88d4585b30f1..6a8e785b29da 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -484,7 +484,6 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
484 phdr_ptr->p_memsz = real_sz; 484 phdr_ptr->p_memsz = real_sz;
485 if (real_sz == 0) { 485 if (real_sz == 0) {
486 pr_warn("Warning: Zero PT_NOTE entries found\n"); 486 pr_warn("Warning: Zero PT_NOTE entries found\n");
487 return -EINVAL;
488 } 487 }
489 } 488 }
490 489
@@ -671,7 +670,6 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
671 phdr_ptr->p_memsz = real_sz; 670 phdr_ptr->p_memsz = real_sz;
672 if (real_sz == 0) { 671 if (real_sz == 0) {
673 pr_warn("Warning: Zero PT_NOTE entries found\n"); 672 pr_warn("Warning: Zero PT_NOTE entries found\n");
674 return -EINVAL;
675 } 673 }
676 } 674 }
677 675
@@ -1118,4 +1116,3 @@ void vmcore_cleanup(void)
1118 } 1116 }
1119 free_elfcorebuf(); 1117 free_elfcorebuf();
1120} 1118}
1121EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 7be26f03a3f5..1a81373947f3 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -267,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
267 p->root = root; 267 p->root = root;
268 p->m.poll_event = ns->event; 268 p->m.poll_event = ns->event;
269 p->show = show; 269 p->show = show;
270 p->cached_event = ~0ULL;
270 271
271 return 0; 272 return 0;
272 273
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 12823845d324..192297b0090d 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -249,6 +249,7 @@ static void parse_options(char *options)
249 249
250static int pstore_remount(struct super_block *sb, int *flags, char *data) 250static int pstore_remount(struct super_block *sb, int *flags, char *data)
251{ 251{
252 sync_filesystem(sb);
252 parse_options(data); 253 parse_options(data);
253 254
254 return 0; 255 return 0;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 78c3c2097787..46d269e38706 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -497,6 +497,7 @@ void pstore_get_records(int quiet)
497 big_oops_buf_sz); 497 big_oops_buf_sz);
498 498
499 if (unzipped_len > 0) { 499 if (unzipped_len > 0) {
500 kfree(buf);
500 buf = big_oops_buf; 501 buf = big_oops_buf;
501 size = unzipped_len; 502 size = unzipped_len;
502 compressed = false; 503 compressed = false;
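The added kfree(buf) plugs a leak: pstore_get_records() had just received buf from the backend's ->read(), and when decompression succeeded the pointer was simply replaced with big_oops_buf, dropping the original compressed record on every iteration.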
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index fa8cef2cca3a..3b5744306ed8 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -86,6 +86,7 @@ struct ramoops_context {
86 struct persistent_ram_ecc_info ecc_info; 86 struct persistent_ram_ecc_info ecc_info;
87 unsigned int max_dump_cnt; 87 unsigned int max_dump_cnt;
88 unsigned int dump_write_cnt; 88 unsigned int dump_write_cnt;
 89 /* the *_read_cnt fields are reset in ramoops_pstore_open() */
89 unsigned int dump_read_cnt; 90 unsigned int dump_read_cnt;
90 unsigned int console_read_cnt; 91 unsigned int console_read_cnt;
91 unsigned int ftrace_read_cnt; 92 unsigned int ftrace_read_cnt;
@@ -101,6 +102,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
101 102
102 cxt->dump_read_cnt = 0; 103 cxt->dump_read_cnt = 0;
103 cxt->console_read_cnt = 0; 104 cxt->console_read_cnt = 0;
105 cxt->ftrace_read_cnt = 0;
104 return 0; 106 return 0;
105} 107}
106 108
@@ -117,13 +119,15 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
117 return NULL; 119 return NULL;
118 120
119 prz = przs[i]; 121 prz = przs[i];
122 if (!prz)
123 return NULL;
120 124
121 if (update) { 125 /* Update old/shadowed buffer. */
122 /* Update old/shadowed buffer. */ 126 if (update)
123 persistent_ram_save_old(prz); 127 persistent_ram_save_old(prz);
124 if (!persistent_ram_old_size(prz)) 128
125 return NULL; 129 if (!persistent_ram_old_size(prz))
126 } 130 return NULL;
127 131
128 *typep = type; 132 *typep = type;
129 *id = i; 133 *id = i;
@@ -316,6 +320,7 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
316{ 320{
317 int i; 321 int i;
318 322
323 cxt->max_dump_cnt = 0;
319 if (!cxt->przs) 324 if (!cxt->przs)
320 return; 325 return;
321 326
@@ -346,7 +351,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
346 GFP_KERNEL); 351 GFP_KERNEL);
347 if (!cxt->przs) { 352 if (!cxt->przs) {
348 dev_err(dev, "failed to initialize a prz array for dumps\n"); 353 dev_err(dev, "failed to initialize a prz array for dumps\n");
349 return -ENOMEM; 354 goto fail_prz;
350 } 355 }
351 356
352 for (i = 0; i < cxt->max_dump_cnt; i++) { 357 for (i = 0; i < cxt->max_dump_cnt; i++) {
@@ -428,7 +433,6 @@ static int ramoops_probe(struct platform_device *pdev)
428 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) 433 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
429 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 434 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
430 435
431 cxt->dump_read_cnt = 0;
432 cxt->size = pdata->mem_size; 436 cxt->size = pdata->mem_size;
433 cxt->phys_addr = pdata->mem_address; 437 cxt->phys_addr = pdata->mem_address;
434 cxt->record_size = pdata->record_size; 438 cxt->record_size = pdata->record_size;
@@ -505,7 +509,6 @@ fail_buf:
505 kfree(cxt->pstore.buf); 509 kfree(cxt->pstore.buf);
506fail_clear: 510fail_clear:
507 cxt->pstore.bufsize = 0; 511 cxt->pstore.bufsize = 0;
508 cxt->max_dump_cnt = 0;
509fail_cnt: 512fail_cnt:
510 kfree(cxt->fprz); 513 kfree(cxt->fprz);
511fail_init_fprz: 514fail_init_fprz:
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index de272d426763..ff7e3d4df5a1 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -54,7 +54,7 @@ static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
54 do { 54 do {
55 old = atomic_read(&prz->buffer->start); 55 old = atomic_read(&prz->buffer->start);
56 new = old + a; 56 new = old + a;
57 while (unlikely(new > prz->buffer_size)) 57 while (unlikely(new >= prz->buffer_size))
58 new -= prz->buffer_size; 58 new -= prz->buffer_size;
59 } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); 59 } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old);
60 60
@@ -91,7 +91,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
91 91
92 old = atomic_read(&prz->buffer->start); 92 old = atomic_read(&prz->buffer->start);
93 new = old + a; 93 new = old + a;
94 while (unlikely(new > prz->buffer_size)) 94 while (unlikely(new >= prz->buffer_size))
95 new -= prz->buffer_size; 95 new -= prz->buffer_size;
96 atomic_set(&prz->buffer->start, new); 96 atomic_set(&prz->buffer->start, new);
97 97
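Both hunks fix the same off-by-one in the persistent-ram ring buffer: with the old test, a sum landing exactly on buffer_size escaped the loop, leaving start one past the last valid offset. A standalone sketch of the wrap:

    /* Sketch: wrap an offset into [0, size). */
    static size_t buffer_wrap(size_t old, size_t add, size_t size)
    {
        size_t new = old + add;

        while (new >= size)     /* "new > size" let new == size leak */
            new -= size;
        return new;
    }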
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 89558810381c..c4bcb778886e 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -44,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
44{ 44{
45 struct qnx4_sb_info *qs; 45 struct qnx4_sb_info *qs;
46 46
47 sync_filesystem(sb);
47 qs = qnx4_sb(sb); 48 qs = qnx4_sb(sb);
48 qs->Version = QNX4_VERSION; 49 qs->Version = QNX4_VERSION;
49 *flags |= MS_RDONLY; 50 *flags |= MS_RDONLY;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 8d941edfefa1..65cdaab3ed49 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
55 55
56static int qnx6_remount(struct super_block *sb, int *flags, char *data) 56static int qnx6_remount(struct super_block *sb, int *flags, char *data)
57{ 57{
58 sync_filesystem(sb);
58 *flags |= MS_RDONLY; 59 *flags |= MS_RDONLY;
59 return 0; 60 return 0;
60} 61}
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 880fd9884366..c51df1dd237e 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -8,9 +8,10 @@ config QUOTA
8 help 8 help
9 If you say Y here, you will be able to set per user limits for disk 9 If you say Y here, you will be able to set per user limits for disk
10 usage (also called disk quotas). Currently, it works for the 10 usage (also called disk quotas). Currently, it works for the
11 ext2, ext3, and reiserfs file system. ext3 also supports journalled 11 ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems.
12 quotas for which you don't need to run quotacheck(8) after an unclean 12 Note that gfs2 and xfs use their own quota system.
13 shutdown. 13 Ext3, ext4 and reiserfs also support journaled quotas for which
14 you don't need to run quotacheck(8) after an unclean shutdown.
14 For further details, read the Quota mini-HOWTO, available from 15 For further details, read the Quota mini-HOWTO, available from
15 <http://www.tldp.org/docs.html#howto>, or the documentation provided 16 <http://www.tldp.org/docs.html#howto>, or the documentation provided
16 with the quota tools. Probably the quota support is only useful for 17 with the quota tools. Probably the quota support is only useful for
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index cfc8dcc16043..9cd5f63715c0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -528,7 +528,7 @@ restart:
528 if (atomic_read(&dquot->dq_count)) { 528 if (atomic_read(&dquot->dq_count)) {
529 DEFINE_WAIT(wait); 529 DEFINE_WAIT(wait);
530 530
531 atomic_inc(&dquot->dq_count); 531 dqgrab(dquot);
532 prepare_to_wait(&dquot->dq_wait_unused, &wait, 532 prepare_to_wait(&dquot->dq_wait_unused, &wait,
533 TASK_UNINTERRUPTIBLE); 533 TASK_UNINTERRUPTIBLE);
534 spin_unlock(&dq_list_lock); 534 spin_unlock(&dq_list_lock);
@@ -632,7 +632,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
632 /* Now we have active dquot from which someone is 632 /* Now we have active dquot from which someone is
633 * holding reference so we can safely just increase 633 * holding reference so we can safely just increase
634 * use count */ 634 * use count */
635 atomic_inc(&dquot->dq_count); 635 dqgrab(dquot);
636 spin_unlock(&dq_list_lock); 636 spin_unlock(&dq_list_lock);
637 dqstats_inc(DQST_LOOKUPS); 637 dqstats_inc(DQST_LOOKUPS);
638 err = sb->dq_op->write_dquot(dquot); 638 err = sb->dq_op->write_dquot(dquot);
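dqgrab() is a thin wrapper for taking an extra reference on a dquot that is already referenced, making the locking rule explicit at the call site. Its assumed shape (see include/linux/quotaops.h):

    /* Assumed definition: only valid while the caller already holds a
     * reference keeping dq_count above zero.
     */
    static inline void dqgrab(struct dquot *dquot)
    {
        atomic_inc(&dquot->dq_count);
    }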
diff --git a/fs/read_write.c b/fs/read_write.c
index 28cc9c810744..31c6efa43183 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -994,9 +994,9 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
994 return ret; 994 return ret;
995} 995}
996 996
997COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, 997static long __compat_sys_preadv64(unsigned long fd,
998 const struct compat_iovec __user *,vec, 998 const struct compat_iovec __user *vec,
999 unsigned long, vlen, loff_t, pos) 999 unsigned long vlen, loff_t pos)
1000{ 1000{
1001 struct fd f; 1001 struct fd f;
1002 ssize_t ret; 1002 ssize_t ret;
@@ -1013,12 +1013,22 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1013 return ret; 1013 return ret;
1014} 1014}
1015 1015
1016#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1017COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1018 const struct compat_iovec __user *,vec,
1019 unsigned long, vlen, loff_t, pos)
1020{
1021 return __compat_sys_preadv64(fd, vec, vlen, pos);
1022}
1023#endif
1024
1016COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, 1025COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1017 const struct compat_iovec __user *,vec, 1026 const struct compat_iovec __user *,vec,
1018 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1027 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1019{ 1028{
1020 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1029 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1021 return compat_sys_preadv64(fd, vec, vlen, pos); 1030
1031 return __compat_sys_preadv64(fd, vec, vlen, pos);
1022} 1032}
1023 1033
1024static size_t compat_writev(struct file *file, 1034static size_t compat_writev(struct file *file,
@@ -1061,9 +1071,9 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1061 return ret; 1071 return ret;
1062} 1072}
1063 1073
1064COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, 1074static long __compat_sys_pwritev64(unsigned long fd,
1065 const struct compat_iovec __user *,vec, 1075 const struct compat_iovec __user *vec,
1066 unsigned long, vlen, loff_t, pos) 1076 unsigned long vlen, loff_t pos)
1067{ 1077{
1068 struct fd f; 1078 struct fd f;
1069 ssize_t ret; 1079 ssize_t ret;
@@ -1080,12 +1090,22 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1080 return ret; 1090 return ret;
1081} 1091}
1082 1092
1093#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1094COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1095 const struct compat_iovec __user *,vec,
1096 unsigned long, vlen, loff_t, pos)
1097{
1098 return __compat_sys_pwritev64(fd, vec, vlen, pos);
1099}
1100#endif
1101
1083COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, 1102COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1084 const struct compat_iovec __user *,vec, 1103 const struct compat_iovec __user *,vec,
1085 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1104 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1086{ 1105{
1087 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1106 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1088 return compat_sys_pwritev64(fd, vec, vlen, pos); 1107
1108 return __compat_sys_pwritev64(fd, vec, vlen, pos);
1089} 1109}
1090#endif 1110#endif
1091 1111
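Both rewrites in this file follow one pattern: the body moves into a static __compat_sys_*64() helper, the COMPAT_SYSCALL_DEFINE4 entry point is emitted only on architectures that opt in via __ARCH_WANT_COMPAT_SYS_PREADV64 / __ARCH_WANT_COMPAT_SYS_PWRITEV64, and the 32-bit preadv/pwritev paths call the helper directly instead of a syscall symbol that may not exist.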
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 1fd2051109a3..af677353a3f5 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -125,6 +125,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
125 int d_reclen; 125 int d_reclen;
126 char *d_name; 126 char *d_name;
127 ino_t d_ino; 127 ino_t d_ino;
128 loff_t cur_pos = deh_offset(deh);
128 129
129 if (!de_visible(deh)) 130 if (!de_visible(deh))
130 /* it is hidden entry */ 131 /* it is hidden entry */
@@ -196,8 +197,9 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
196 if (local_buf != small_buf) { 197 if (local_buf != small_buf) {
197 kfree(local_buf); 198 kfree(local_buf);
198 } 199 }
199 // next entry should be looked for with such offset 200
200 next_pos = deh_offset(deh) + 1; 201 /* deh_offset(deh) may be invalid now. */
202 next_pos = cur_pos + 1;
201 203
202 if (item_moved(&tmp_ih, &path_to_entry)) { 204 if (item_moved(&tmp_ih, &path_to_entry)) {
203 set_cpu_key_k_offset(&pos_key, 205 set_cpu_key_k_offset(&pos_key,
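The cached cur_pos is the point of the fix: the directory-emit callback made between reading the entry and computing next_pos can drop the tree lock and release the path, so deh may no longer point at valid memory by the time the old code re-read deh_offset(deh).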
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ad62bdbb451e..bc8b8009897d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -35,7 +35,7 @@ void reiserfs_evict_inode(struct inode *inode)
35 if (!inode->i_nlink && !is_bad_inode(inode)) 35 if (!inode->i_nlink && !is_bad_inode(inode))
36 dquot_initialize(inode); 36 dquot_initialize(inode);
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages_final(&inode->i_data);
39 if (inode->i_nlink) 39 if (inode->i_nlink)
40 goto no_delete; 40 goto no_delete;
41 41
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 8d06adf89948..83d4eac8059a 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2831,6 +2831,7 @@ void reiserfs_init_alloc_options(struct super_block *s);
2831 */ 2831 */
2832__le32 reiserfs_choose_packing(struct inode *dir); 2832__le32 reiserfs_choose_packing(struct inode *dir);
2833 2833
2834void show_alloc_options(struct seq_file *seq, struct super_block *s);
2834int reiserfs_init_bitmap_cache(struct super_block *sb); 2835int reiserfs_init_bitmap_cache(struct super_block *sb);
2835void reiserfs_free_bitmap_cache(struct super_block *sb); 2836void reiserfs_free_bitmap_cache(struct super_block *sb);
2836void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); 2837void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2c803353f8ac..9fb20426005e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -62,7 +62,6 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
62 62
63static int reiserfs_remount(struct super_block *s, int *flags, char *data); 63static int reiserfs_remount(struct super_block *s, int *flags, char *data);
64static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); 64static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
65void show_alloc_options(struct seq_file *seq, struct super_block *s);
66 65
67static int reiserfs_sync_fs(struct super_block *s, int wait) 66static int reiserfs_sync_fs(struct super_block *s, int wait)
68{ 67{
@@ -597,7 +596,7 @@ static void init_once(void *foo)
597 inode_init_once(&ei->vfs_inode); 596 inode_init_once(&ei->vfs_inode);
598} 597}
599 598
600static int init_inodecache(void) 599static int __init init_inodecache(void)
601{ 600{
602 reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", 601 reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
603 sizeof(struct 602 sizeof(struct
@@ -1319,6 +1318,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1319 int i; 1318 int i;
1320#endif 1319#endif
1321 1320
1321 sync_filesystem(s);
1322 reiserfs_write_lock(s); 1322 reiserfs_write_lock(s);
1323 1323
1324#ifdef CONFIG_QUOTA 1324#ifdef CONFIG_QUOTA
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index d8418782862b..ef90e8bca95a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
432 */ 432 */
433static int romfs_remount(struct super_block *sb, int *flags, char *data) 433static int romfs_remount(struct super_block *sb, int *flags, char *data)
434{ 434{
435 sync_filesystem(sb);
435 *flags |= MS_RDONLY; 436 *flags |= MS_RDONLY;
436 return 0; 437 return 0;
437} 438}
diff --git a/fs/splice.c b/fs/splice.c
index 12028fa41def..e246954ea48c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -136,8 +136,6 @@ error:
136 136
137const struct pipe_buf_operations page_cache_pipe_buf_ops = { 137const struct pipe_buf_operations page_cache_pipe_buf_ops = {
138 .can_merge = 0, 138 .can_merge = 0,
139 .map = generic_pipe_buf_map,
140 .unmap = generic_pipe_buf_unmap,
141 .confirm = page_cache_pipe_buf_confirm, 139 .confirm = page_cache_pipe_buf_confirm,
142 .release = page_cache_pipe_buf_release, 140 .release = page_cache_pipe_buf_release,
143 .steal = page_cache_pipe_buf_steal, 141 .steal = page_cache_pipe_buf_steal,
@@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
156 154
157static const struct pipe_buf_operations user_page_pipe_buf_ops = { 155static const struct pipe_buf_operations user_page_pipe_buf_ops = {
158 .can_merge = 0, 156 .can_merge = 0,
159 .map = generic_pipe_buf_map,
160 .unmap = generic_pipe_buf_unmap,
161 .confirm = generic_pipe_buf_confirm, 157 .confirm = generic_pipe_buf_confirm,
162 .release = page_cache_pipe_buf_release, 158 .release = page_cache_pipe_buf_release,
163 .steal = user_page_pipe_buf_steal, 159 .steal = user_page_pipe_buf_steal,
@@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read);
547 543
548static const struct pipe_buf_operations default_pipe_buf_ops = { 544static const struct pipe_buf_operations default_pipe_buf_ops = {
549 .can_merge = 0, 545 .can_merge = 0,
550 .map = generic_pipe_buf_map,
551 .unmap = generic_pipe_buf_unmap,
552 .confirm = generic_pipe_buf_confirm, 546 .confirm = generic_pipe_buf_confirm,
553 .release = generic_pipe_buf_release, 547 .release = generic_pipe_buf_release,
554 .steal = generic_pipe_buf_steal, 548 .steal = generic_pipe_buf_steal,
@@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
564/* Pipe buffer operations for a socket and similar. */ 558/* Pipe buffer operations for a socket and similar. */
565const struct pipe_buf_operations nosteal_pipe_buf_ops = { 559const struct pipe_buf_operations nosteal_pipe_buf_ops = {
566 .can_merge = 0, 560 .can_merge = 0,
567 .map = generic_pipe_buf_map,
568 .unmap = generic_pipe_buf_unmap,
569 .confirm = generic_pipe_buf_confirm, 561 .confirm = generic_pipe_buf_confirm,
570 .release = generic_pipe_buf_release, 562 .release = generic_pipe_buf_release,
571 .steal = generic_pipe_buf_nosteal, 563 .steal = generic_pipe_buf_nosteal,
@@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
767 goto out; 759 goto out;
768 760
769 if (buf->page != page) { 761 if (buf->page != page) {
770 char *src = buf->ops->map(pipe, buf, 1); 762 char *src = kmap_atomic(buf->page);
771 char *dst = kmap_atomic(page); 763 char *dst = kmap_atomic(page);
772 764
773 memcpy(dst + offset, src + buf->offset, this_len); 765 memcpy(dst + offset, src + buf->offset, this_len);
774 flush_dcache_page(page); 766 flush_dcache_page(page);
775 kunmap_atomic(dst); 767 kunmap_atomic(dst);
776 buf->ops->unmap(pipe, buf, src); 768 kunmap_atomic(src);
777 } 769 }
778 ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, 770 ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
779 page, fsdata); 771 page, fsdata);
@@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1067 void *data; 1059 void *data;
1068 loff_t tmp = sd->pos; 1060 loff_t tmp = sd->pos;
1069 1061
1070 data = buf->ops->map(pipe, buf, 0); 1062 data = kmap(buf->page);
1071 ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); 1063 ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
1072 buf->ops->unmap(pipe, buf, data); 1064 kunmap(buf->page);
1073 1065
1074 return ret; 1066 return ret;
1075} 1067}
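The .map/.unmap removals in the four ops tables above go together with these call-site changes: every pipe_buf_operations instance used the generic kmap helpers, so the indirection was dropped and callers now use kmap_atomic()/kmap() on buf->page directly.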
@@ -1528,116 +1520,50 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1528static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 1520static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1529 struct splice_desc *sd) 1521 struct splice_desc *sd)
1530{ 1522{
1531 char *src; 1523 int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1532 int ret; 1524 return n == sd->len ? n : -EFAULT;
1533
1534 /*
1535 * See if we can use the atomic maps, by prefaulting in the
1536 * pages and doing an atomic copy
1537 */
1538 if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1539 src = buf->ops->map(pipe, buf, 1);
1540 ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1541 sd->len);
1542 buf->ops->unmap(pipe, buf, src);
1543 if (!ret) {
1544 ret = sd->len;
1545 goto out;
1546 }
1547 }
1548
1549 /*
1550 * No dice, use slow non-atomic map and copy
1551 */
1552 src = buf->ops->map(pipe, buf, 0);
1553
1554 ret = sd->len;
1555 if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1556 ret = -EFAULT;
1557
1558 buf->ops->unmap(pipe, buf, src);
1559out:
1560 if (ret > 0)
1561 sd->u.userptr += ret;
1562 return ret;
1563} 1525}
1564 1526
1565/* 1527/*
1566 * For lack of a better implementation, implement vmsplice() to userspace 1528 * For lack of a better implementation, implement vmsplice() to userspace
1567 * as a simple copy of the pipes pages to the user iov. 1529 * as a simple copy of the pipes pages to the user iov.
1568 */ 1530 */
1569static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, 1531static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1570 unsigned long nr_segs, unsigned int flags) 1532 unsigned long nr_segs, unsigned int flags)
1571{ 1533{
1572 struct pipe_inode_info *pipe; 1534 struct pipe_inode_info *pipe;
1573 struct splice_desc sd; 1535 struct splice_desc sd;
1574 ssize_t size;
1575 int error;
1576 long ret; 1536 long ret;
1537 struct iovec iovstack[UIO_FASTIOV];
1538 struct iovec *iov = iovstack;
1539 struct iov_iter iter;
1540 ssize_t count;
1577 1541
1578 pipe = get_pipe_info(file); 1542 pipe = get_pipe_info(file);
1579 if (!pipe) 1543 if (!pipe)
1580 return -EBADF; 1544 return -EBADF;
1581 1545
1582 pipe_lock(pipe); 1546 ret = rw_copy_check_uvector(READ, uiov, nr_segs,
1583 1547 ARRAY_SIZE(iovstack), iovstack, &iov);
1584 error = ret = 0; 1548 if (ret <= 0)
1585 while (nr_segs) { 1549 goto out;
1586 void __user *base;
1587 size_t len;
1588
1589 /*
1590 * Get user address base and length for this iovec.
1591 */
1592 error = get_user(base, &iov->iov_base);
1593 if (unlikely(error))
1594 break;
1595 error = get_user(len, &iov->iov_len);
1596 if (unlikely(error))
1597 break;
1598
1599 /*
1600 * Sanity check this iovec. 0 read succeeds.
1601 */
1602 if (unlikely(!len))
1603 break;
1604 if (unlikely(!base)) {
1605 error = -EFAULT;
1606 break;
1607 }
1608
1609 if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
1610 error = -EFAULT;
1611 break;
1612 }
1613
1614 sd.len = 0;
1615 sd.total_len = len;
1616 sd.flags = flags;
1617 sd.u.userptr = base;
1618 sd.pos = 0;
1619
1620 size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1621 if (size < 0) {
1622 if (!ret)
1623 ret = size;
1624
1625 break;
1626 }
1627
1628 ret += size;
1629 1550
1630 if (size < len) 1551 count = ret;
1631 break; 1552 iov_iter_init(&iter, iov, nr_segs, count, 0);
1632 1553
1633 nr_segs--; 1554 sd.len = 0;
1634 iov++; 1555 sd.total_len = count;
1635 } 1556 sd.flags = flags;
1557 sd.u.data = &iter;
1558 sd.pos = 0;
1636 1559
1560 pipe_lock(pipe);
1561 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1637 pipe_unlock(pipe); 1562 pipe_unlock(pipe);
1638 1563
1639 if (!ret) 1564out:
1640 ret = error; 1565 if (iov != iovstack)
1566 kfree(iov);
1641 1567
1642 return ret; 1568 return ret;
1643} 1569}
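The rewrite funnels all iovec validation through rw_copy_check_uvector() and an iov_iter, and pipe_to_user() collapses to a single copy_page_to_iter() call that advances the iterator itself. An illustrative user-space exercise of this path (nothing below is from the patch):

    /* Illustrative: drain a pipe through vmsplice(2) into two iovecs. */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int main(void)
    {
        int p[2];
        char a[8] = "", b[8] = "";
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

        if (pipe(p) || write(p[1], "hello, world", 12) != 12)
            return 1;
        /* Reading end: goes through vmsplice_to_user()/pipe_to_user(). */
        printf("copied %zd bytes\n", vmsplice(p[0], iov, 2, 0));
        return 0;
    }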
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 202df6312d4e..031c8d67fd51 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
371 371
372static int squashfs_remount(struct super_block *sb, int *flags, char *data) 372static int squashfs_remount(struct super_block *sb, int *flags, char *data)
373{ 373{
374 sync_filesystem(sb);
374 *flags |= MS_RDONLY; 375 *flags |= MS_RDONLY;
375 return 0; 376 return 0;
376} 377}
diff --git a/fs/super.c b/fs/super.c
index 80d5cf2ca765..48377f7463c0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -719,8 +719,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
719 } 719 }
720 } 720 }
721 721
722 sync_filesystem(sb);
723
724 if (sb->s_op->remount_fs) { 722 if (sb->s_op->remount_fs) {
725 retval = sb->s_op->remount_fs(sb, &flags, data); 723 retval = sb->s_op->remount_fs(sb, &flags, data);
726 if (retval) { 724 if (retval) {
@@ -802,7 +800,10 @@ void emergency_remount(void)
802 800
803static DEFINE_IDA(unnamed_dev_ida); 801static DEFINE_IDA(unnamed_dev_ida);
804static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ 802static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
805static int unnamed_dev_start = 0; /* don't bother trying below it */ 803/* Many userspace utilities consider an FSID of 0 invalid.
804 * Always return at least 1 from get_anon_bdev.
805 */
806static int unnamed_dev_start = 1;
806 807
807int get_anon_bdev(dev_t *p) 808int get_anon_bdev(dev_t *p)
808{ 809{
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index 8c41feacbac5..b2756014508c 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,6 +1,7 @@
1config SYSFS 1config SYSFS
2 bool "sysfs file system support" if EXPERT 2 bool "sysfs file system support" if EXPERT
3 default y 3 default y
4 select KERNFS
4 help 5 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to 6 The sysfs filesystem is a virtual filesystem that the kernel uses to
6 export internal kernel objects, their attributes, and their 7 export internal kernel objects, their attributes, and their
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ee0d761c3179..0b45ff42f374 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -19,39 +19,18 @@
19 19
20DEFINE_SPINLOCK(sysfs_symlink_target_lock); 20DEFINE_SPINLOCK(sysfs_symlink_target_lock);
21 21
22/**
23 * sysfs_pathname - return full path to sysfs dirent
24 * @kn: kernfs_node whose path we want
25 * @path: caller allocated buffer of size PATH_MAX
26 *
27 * Gives the name "/" to the sysfs_root entry; any path returned
28 * is relative to wherever sysfs is mounted.
29 */
30static char *sysfs_pathname(struct kernfs_node *kn, char *path)
31{
32 if (kn->parent) {
33 sysfs_pathname(kn->parent, path);
34 strlcat(path, "/", PATH_MAX);
35 }
36 strlcat(path, kn->name, PATH_MAX);
37 return path;
38}
39
40void sysfs_warn_dup(struct kernfs_node *parent, const char *name) 22void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
41{ 23{
42 char *path; 24 char *buf, *path = NULL;
43 25
44 path = kzalloc(PATH_MAX, GFP_KERNEL); 26 buf = kzalloc(PATH_MAX, GFP_KERNEL);
45 if (path) { 27 if (buf)
46 sysfs_pathname(parent, path); 28 path = kernfs_path(parent, buf, PATH_MAX);
47 strlcat(path, "/", PATH_MAX);
48 strlcat(path, name, PATH_MAX);
49 }
50 29
51 WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n", 30 WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n",
52 path ? path : name); 31 path, name);
53 32
54 kfree(path); 33 kfree(buf);
55} 34}
56 35
57/** 36/**
@@ -122,9 +101,13 @@ void sysfs_remove_dir(struct kobject *kobj)
122int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, 101int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
123 const void *new_ns) 102 const void *new_ns)
124{ 103{
125 struct kernfs_node *parent = kobj->sd->parent; 104 struct kernfs_node *parent;
105 int ret;
126 106
127 return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); 107 parent = kernfs_get_parent(kobj->sd);
108 ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
109 kernfs_put(parent);
110 return ret;
128} 111}
129 112
130int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, 113int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
@@ -133,7 +116,6 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
133 struct kernfs_node *kn = kobj->sd; 116 struct kernfs_node *kn = kobj->sd;
134 struct kernfs_node *new_parent; 117 struct kernfs_node *new_parent;
135 118
136 BUG_ON(!kn->parent);
137 new_parent = new_parent_kobj && new_parent_kobj->sd ? 119 new_parent = new_parent_kobj && new_parent_kobj->sd ?
138 new_parent_kobj->sd : sysfs_root_kn; 120 new_parent_kobj->sd : sysfs_root_kn;
139 121
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 810cf6e613e5..e9ef59b3abb1 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -47,12 +47,13 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
47 ssize_t count; 47 ssize_t count;
48 char *buf; 48 char *buf;
49 49
50 /* acquire buffer and ensure that it's >= PAGE_SIZE */ 50 /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */
51 count = seq_get_buf(sf, &buf); 51 count = seq_get_buf(sf, &buf);
52 if (count < PAGE_SIZE) { 52 if (count < PAGE_SIZE) {
53 seq_commit(sf, -1); 53 seq_commit(sf, -1);
54 return 0; 54 return 0;
55 } 55 }
56 memset(buf, 0, PAGE_SIZE);
56 57
57 /* 58 /*
58 * Invoke show(). Control may reach here via seq file lseek even 59 * Invoke show(). Control may reach here via seq file lseek even
@@ -372,6 +373,29 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
372} 373}
373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); 374EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
374 375
376/**
377 * sysfs_remove_file_self - remove an object attribute from its own method
378 * @kobj: object we're acting for
379 * @attr: attribute descriptor
380 *
381 * See kernfs_remove_self() for details.
382 */
383bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
384{
385 struct kernfs_node *parent = kobj->sd;
386 struct kernfs_node *kn;
387 bool ret;
388
389 kn = kernfs_find_and_get(parent, attr->name);
390 if (WARN_ON_ONCE(!kn))
391 return false;
392
393 ret = kernfs_remove_self(kn);
394
395 kernfs_put(kn);
396 return ret;
397}
398
375void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) 399void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
376{ 400{
377 int i; 401 int i;
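The new helper serves the classic "delete me" attribute, where a store() method must remove its own file without deadlocking against active-reference draining; it replaces the sysfs_schedule_callback() workqueue machinery removed further down. A hypothetical store() method using it (the surrounding kobject handling is illustrative, not from this patch):

    /* Hypothetical usage sketch for sysfs_remove_file_self(). */
    static ssize_t remove_store(struct kobject *kobj,
                                struct kobj_attribute *attr,
                                const char *buf, size_t count)
    {
        if (sysfs_remove_file_self(kobj, &attr->attr))
            kobject_put(kobj);      /* illustrative cleanup only */
        return count;
    }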
@@ -430,95 +454,3 @@ void sysfs_remove_bin_file(struct kobject *kobj,
430 kernfs_remove_by_name(kobj->sd, attr->attr.name); 454 kernfs_remove_by_name(kobj->sd, attr->attr.name);
431} 455}
432EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); 456EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
433
434struct sysfs_schedule_callback_struct {
435 struct list_head workq_list;
436 struct kobject *kobj;
437 void (*func)(void *);
438 void *data;
439 struct module *owner;
440 struct work_struct work;
441};
442
443static struct workqueue_struct *sysfs_workqueue;
444static DEFINE_MUTEX(sysfs_workq_mutex);
445static LIST_HEAD(sysfs_workq);
446static void sysfs_schedule_callback_work(struct work_struct *work)
447{
448 struct sysfs_schedule_callback_struct *ss = container_of(work,
449 struct sysfs_schedule_callback_struct, work);
450
451 (ss->func)(ss->data);
452 kobject_put(ss->kobj);
453 module_put(ss->owner);
454 mutex_lock(&sysfs_workq_mutex);
455 list_del(&ss->workq_list);
456 mutex_unlock(&sysfs_workq_mutex);
457 kfree(ss);
458}
459
460/**
461 * sysfs_schedule_callback - helper to schedule a callback for a kobject
462 * @kobj: object we're acting for.
463 * @func: callback function to invoke later.
464 * @data: argument to pass to @func.
465 * @owner: module owning the callback code
466 *
467 * sysfs attribute methods must not unregister themselves or their parent
468 * kobject (which would amount to the same thing). Attempts to do so will
469 * deadlock, since unregistration is mutually exclusive with driver
470 * callbacks.
471 *
472 * Instead methods can call this routine, which will attempt to allocate
473 * and schedule a workqueue request to call back @func with @data as its
474 * argument in the workqueue's process context. @kobj will be pinned
475 * until @func returns.
476 *
477 * Returns 0 if the request was submitted, -ENOMEM if storage could not
478 * be allocated, -ENODEV if a reference to @owner isn't available,
479 * -EAGAIN if a callback has already been scheduled for @kobj.
480 */
481int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
482 void *data, struct module *owner)
483{
484 struct sysfs_schedule_callback_struct *ss, *tmp;
485
486 if (!try_module_get(owner))
487 return -ENODEV;
488
489 mutex_lock(&sysfs_workq_mutex);
490 list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
491 if (ss->kobj == kobj) {
492 module_put(owner);
493 mutex_unlock(&sysfs_workq_mutex);
494 return -EAGAIN;
495 }
496 mutex_unlock(&sysfs_workq_mutex);
497
498 if (sysfs_workqueue == NULL) {
499 sysfs_workqueue = create_singlethread_workqueue("sysfsd");
500 if (sysfs_workqueue == NULL) {
501 module_put(owner);
502 return -ENOMEM;
503 }
504 }
505
506 ss = kmalloc(sizeof(*ss), GFP_KERNEL);
507 if (!ss) {
508 module_put(owner);
509 return -ENOMEM;
510 }
511 kobject_get(kobj);
512 ss->kobj = kobj;
513 ss->func = func;
514 ss->data = data;
515 ss->owner = owner;
516 INIT_WORK(&ss->work, sysfs_schedule_callback_work);
517 INIT_LIST_HEAD(&ss->workq_list);
518 mutex_lock(&sysfs_workq_mutex);
519 list_add_tail(&ss->workq_list, &sysfs_workq);
520 mutex_unlock(&sysfs_workq_mutex);
521 queue_work(sysfs_workqueue, &ss->work);
522 return 0;
523}
524EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 6b579387c67a..aa0406895b53 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -70,8 +70,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
70 if (grp->bin_attrs) { 70 if (grp->bin_attrs) {
71 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { 71 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
72 if (update) 72 if (update)
73 sysfs_remove_bin_file(kobj, *bin_attr); 73 kernfs_remove_by_name(parent,
74 error = sysfs_create_bin_file(kobj, *bin_attr); 74 (*bin_attr)->attr.name);
75 error = sysfs_add_file_mode_ns(parent,
76 &(*bin_attr)->attr, true,
77 (*bin_attr)->attr.mode, NULL);
75 if (error) 78 if (error)
76 break; 79 break;
77 } 80 }
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 3eaf5c6622eb..8a49486bf30c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,6 +13,7 @@
13#define DEBUG 13#define DEBUG
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
16#include <linux/mount.h> 17#include <linux/mount.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/user_namespace.h> 19#include <linux/user_namespace.h>
@@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
38 } 39 }
39 40
40 ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); 41 ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
41 root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); 42 root = kernfs_mount_ns(fs_type, flags, sysfs_root,
43 SYSFS_MAGIC, &new_sb, ns);
42 if (IS_ERR(root) || !new_sb) 44 if (IS_ERR(root) || !new_sb)
43 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); 45 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
44 return root; 46 return root;
@@ -63,7 +65,8 @@ int __init sysfs_init(void)
63{ 65{
64 int err; 66 int err;
65 67
66 sysfs_root = kernfs_create_root(NULL, NULL); 68 sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
69 NULL);
67 if (IS_ERR(sysfs_root)) 70 if (IS_ERR(sysfs_root))
68 return PTR_ERR(sysfs_root); 71 return PTR_ERR(sysfs_root);
69 72
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c327d4ee1235..88956309cc86 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
+	sync_filesystem(sb);
 	if (sbi->s_forced_ro)
 		*flags |= MS_RDONLY;
 	return 0;
@@ -295,7 +296,7 @@ int sysv_sync_inode(struct inode *inode)
 
 static void sysv_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
 		sysv_truncate(inode);
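Both sysv hunks follow tree-wide conversions in this merge: sync_filesystem() is now called from each filesystem's ->remount_fs() rather than by the VFS, and ->evict_inode() uses truncate_inode_pages_final(), which also marks the mapping so no new pages can slip in behind the final truncate. A minimal sketch of the resulting ->evict_inode() shape, with hypothetical myfs_ names:

/* Sketch of the converged ->evict_inode() pattern; "myfs_" is hypothetical. */
static void myfs_evict_inode(struct inode *inode)
{
	/* final truncate; also forbids adding new pages to the mapping */
	truncate_inode_pages_final(&inode->i_data);
	if (!inode->i_nlink) {
		/* release the on-disk blocks of an unlinked inode here */
	}
	invalidate_inode_buffers(inode);
	clear_inode(inode);
}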
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 929312180dd0..0013142c0475 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -317,6 +317,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	    (clockid != CLOCK_MONOTONIC &&
 	     clockid != CLOCK_REALTIME &&
 	     clockid != CLOCK_REALTIME_ALARM &&
+	     clockid != CLOCK_BOOTTIME &&
 	     clockid != CLOCK_BOOTTIME_ALARM))
 		return -EINVAL;
 
322 323
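With this hunk timerfd_create() accepts CLOCK_BOOTTIME, giving a timer that keeps counting across suspend without the CAP_WAKE_ALARM requirement that CLOCK_BOOTTIME_ALARM carries. A small self-contained userspace sketch (timeout value and output are illustrative):

/* Build with: cc -o boottime_timer boottime_timer.c */
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/timerfd.h>

int main(void)
{
	struct itimerspec its = { .it_value = { .tv_sec = 5 } };
	uint64_t expirations;
	int fd = timerfd_create(CLOCK_BOOTTIME, 0);

	if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0) {
		perror("timerfd");
		return 1;
	}
	/* time spent suspended counts toward the 5s, unlike CLOCK_MONOTONIC */
	read(fd, &expirations, sizeof(expirations));
	printf("expired %llu time(s)\n", (unsigned long long)expirations);
	return 0;
}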
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 123c79b7261e..4f34dbae823d 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1538,6 +1538,7 @@ out_unlock:
 
 static const struct vm_operations_struct ubifs_file_vm_ops = {
 	.fault        = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = ubifs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
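Setting .map_pages = filemap_map_pages opts ubifs into faultaround: on a page fault the VM can map a batch of pages that are already in the page cache around the faulting address, without one ->fault call per page. The shape filesystems converge on, sketched with hypothetical myfs_ names:

/* Sketch only; "myfs_page_mkwrite" is a hypothetical fs-specific hook. */
static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,		/* read pages on demand */
	.map_pages	= filemap_map_pages,		/* batch-map cached pages */
	.page_mkwrite	= myfs_page_mkwrite,		/* fs hook before first write */
	.remap_pages	= generic_file_remap_pages,
};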
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 5ded8490c0c6..a81c7b556896 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -351,7 +351,7 @@ static void ubifs_evict_inode(struct inode *inode)
 	dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
 	ubifs_assert(!atomic_read(&inode->i_count));
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (inode->i_nlink)
 		goto done;
@@ -1556,7 +1556,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	if (c->space_fixup) {
 		err = ubifs_fixup_free_space(c);
 		if (err)
-			return err;
+			goto out;
 	}
 
 	err = check_free_space(c);
@@ -1827,6 +1827,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	int err;
 	struct ubifs_info *c = sb->s_fs_info;
 
+	sync_filesystem(sb);
 	dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
 
 	err = ubifs_parse_options(c, data, 1);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1037637957c7..d2c170f8b035 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -171,7 +171,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	} else
 		up_write(&iinfo->i_data_sem);
 
-	retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	retval = __generic_file_aio_write(iocb, iov, nr_segs);
 	mutex_unlock(&inode->i_mutex);
 
 	if (retval > 0) {
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 982ce05c87ed..5d643706212f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -146,8 +146,8 @@ void udf_evict_inode(struct inode *inode)
 		want_delete = 1;
 		udf_setsize(inode, 0);
 		udf_update_inode(inode, IS_SYNC(inode));
-	} else
-		truncate_inode_pages(&inode->i_data, 0);
+	}
+	truncate_inode_pages_final(&inode->i_data);
 	invalidate_inode_buffers(inode);
 	clear_inode(inode);
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3306b9f69bed..3286db047a40 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -175,7 +175,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	udf_inode_cachep = kmem_cache_create("udf_inode_cache",
 					     sizeof(struct udf_inode_info),
@@ -505,6 +505,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 	while ((p = strsep(&options, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
 		int token;
+		unsigned n;
 		if (!*p)
 			continue;
 
@@ -516,7 +517,10 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 		case Opt_bs:
 			if (match_int(&args[0], &option))
 				return 0;
-			uopt->blocksize = option;
+			n = option;
+			if (n != 512 && n != 1024 && n != 2048 && n != 4096)
+				return 0;
+			uopt->blocksize = n;
 			uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
 			break;
 		case Opt_unhide:
@@ -646,6 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	int error = 0;
 	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
 
+	sync_filesystem(sb);
 	if (lvidiu) {
 		int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
 		if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
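The Opt_bs hunk rejects impossible block sizes at parse time instead of storing whatever integer userspace passed, so a bad bs= value fails the mount cleanly. The check restated as a sketch helper (name hypothetical):

/* Sketch: only the sector sizes UDF media can actually use are accepted. */
static int udf_valid_blocksize(unsigned int n)
{
	return n == 512 || n == 1024 || n == 2048 || n == 4096;
}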
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index a7ea492ae660..0ab1de4b39a5 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -38,7 +38,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned cgno, bit, end_bit, bbase, blkmap, i;
@@ -46,7 +45,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 
 	UFSD("ENTER, fragment %llu, count %u\n",
 	     (unsigned long long)fragment, count);
@@ -135,7 +133,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned overflow, cgno, bit, end_bit, i;
@@ -143,7 +140,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 
 	UFSD("ENTER, fragment %llu, count %u\n",
 	     (unsigned long long)fragment, count);
@@ -499,7 +495,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned cgno, fragno, fragoff, count, fragsize, i;
@@ -509,7 +504,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first (uspi);
 	count = newcount - oldcount;
 
 	cgno = ufs_dtog(uspi, fragment);
@@ -577,7 +571,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned oldcg, i, j, k, allocsize;
@@ -588,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 	oldcg = cgno;
 
 	/*
@@ -690,7 +682,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cylinder_group * ucg;
 	u64 result, blkno;
 
@@ -698,7 +689,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal == 0) {
@@ -794,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
 		0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
 	};
 	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
-	struct ufs_super_block_first *usb1;
 	struct ufs_cylinder_group *ucg;
 	unsigned start, length, loc;
 	unsigned pos, want, blockmap, mask, end;
@@ -803,7 +792,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
 	UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx,
 	     (unsigned long long)goal, count);
 
-	usb1 = ubh_get_usb_first (uspi);
 	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal)
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index d0426d74817b..98f7211599ff 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -57,7 +57,6 @@ void ufs_free_inode (struct inode * inode)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	int is_directory;
@@ -67,7 +66,6 @@ void ufs_free_inode (struct inode * inode)
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 
 	ino = inode->i_ino;
 
@@ -175,7 +173,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
 	struct super_block * sb;
 	struct ufs_sb_info * sbi;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	struct inode * inode;
@@ -195,7 +192,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
 	ufsi = UFS_I(inode);
 	sbi = UFS_SB(sb);
 	uspi = sbi->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 
 	mutex_lock(&sbi->s_lock);
 
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c8ca96086784..61e8a9b021dd 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -885,7 +885,7 @@ void ufs_evict_inode(struct inode * inode)
 	if (!inode->i_nlink && !is_bad_inode(inode))
 		want_delete = 1;
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	if (want_delete) {
 		loff_t old_i_size;
 		/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 329f2f53b7ed..c1183f9f69dc 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -524,11 +524,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned size, blks, i;
-	struct ufs_super_block_third *usb3;
 
 	UFSD("ENTER\n");
 
-	usb3 = ubh_get_usb_third(uspi);
 	/*
 	 * Read cs structures from (usually) first data block
 	 * on the device.
@@ -1280,6 +1278,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	unsigned new_mount_opt, ufstype;
 	unsigned flags;
 
+	sync_filesystem(sb);
 	lock_ufs(sb);
 	mutex_lock(&UFS_SB(sb)->s_lock);
 	uspi = UFS_SB(sb)->s_uspi;
@@ -1389,15 +1388,11 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
 	unsigned  flags = UFS_SB(sb)->s_flags;
-	struct ufs_super_block_first *usb1;
-	struct ufs_super_block_second *usb2;
 	struct ufs_super_block_third *usb3;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	lock_ufs(sb);
 
-	usb1 = ubh_get_usb_first(uspi);
-	usb2 = ubh_get_usb_second(uspi);
 	usb3 = ubh_get_usb_third(uspi);
 
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
@@ -1453,7 +1448,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
 					     sizeof(struct ufs_inode_info),
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 66a36befc5c0..844e288b9576 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -65,12 +65,31 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 void *
 kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 {
+	unsigned noio_flag = 0;
 	void	*ptr;
+	gfp_t	lflags;
 
 	ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
 	if (ptr)
 		return ptr;
-	return vzalloc(size);
+
+	/*
+	 * __vmalloc() will allocate data pages and auxillary structures (e.g.
+	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
+	 * here. Hence we need to tell memory reclaim that we are in such a
+	 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+	 * the filesystem here and potentially deadlocking.
+	 */
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		noio_flag = memalloc_noio_save();
+
+	lflags = kmem_flags_convert(flags);
+	ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		memalloc_noio_restore(noio_flag);
+
+	return ptr;
 }
 
 void
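The kmem_zalloc_large() rewrite is an instance of the PF_MEMALLOC_NOIO pattern: a caller that may already hold filesystem locks or sit inside a transaction brackets its allocation so that any reclaim the allocation triggers performs no I/O, which prevents reclaim from recursing back into the filesystem. A generic sketch of the bracket (caller name hypothetical; the 3-argument __vmalloc() form matches this kernel's API):

/* Sketch only, assuming the caller already holds fs locks/transactions. */
static void *my_alloc_in_transaction(unsigned long size)
{
	unsigned noio_flag;
	void *p;

	noio_flag = memalloc_noio_save();	/* sets PF_MEMALLOC_NOIO */
	p = __vmalloc(size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
	memalloc_noio_restore(noio_flag);	/* restores the previous state */
	return p;
}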
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 0ecec1896f25..6888ad886ff6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -281,7 +281,7 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	if (!acl)
 		goto set_acl;
 
-	error = -EINVAL;
+	error = -E2BIG;
 	if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
 		return error;
 
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 3fc109819c34..0fdd4109c624 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -89,6 +89,8 @@ typedef struct xfs_agf {
 	/* structure must be padded to 64 bit alignment */
 } xfs_agf_t;
 
+#define XFS_AGF_CRC_OFF		offsetof(struct xfs_agf, agf_crc)
+
 #define	XFS_AGF_MAGICNUM	0x00000001
 #define	XFS_AGF_VERSIONNUM	0x00000002
 #define	XFS_AGF_SEQNO		0x00000004
@@ -167,6 +169,8 @@ typedef struct xfs_agi {
 	/* structure must be padded to 64 bit alignment */
 } xfs_agi_t;
 
+#define XFS_AGI_CRC_OFF		offsetof(struct xfs_agi, agi_crc)
+
 #define	XFS_AGI_MAGICNUM	0x00000001
 #define	XFS_AGI_VERSIONNUM	0x00000002
 #define	XFS_AGI_SEQNO		0x00000004
@@ -222,6 +226,8 @@ typedef struct xfs_agfl {
 	__be32		agfl_bno[];	/* actually XFS_AGFL_SIZE(mp) */
 } xfs_agfl_t;
 
+#define XFS_AGFL_CRC_OFF	offsetof(struct xfs_agfl, agfl_crc)
+
 /*
  * tags for inode radix tree
  */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 9eab2dfdcbb5..c1cf6a336a72 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -474,7 +474,6 @@ xfs_agfl_read_verify(
 	struct xfs_buf	*bp)
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
-	int		agfl_ok = 1;
 
 	/*
 	 * There is no verification of non-crc AGFLs because mkfs does not
@@ -485,15 +484,13 @@ xfs_agfl_read_verify(
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
 		return;
 
-	agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-				   offsetof(struct xfs_agfl, agfl_crc));
-
-	agfl_ok = agfl_ok && xfs_agfl_verify(bp);
-
-	if (!agfl_ok) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_agfl_verify(bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 static void
@@ -508,16 +505,15 @@ xfs_agfl_write_verify(
 		return;
 
 	if (!xfs_agfl_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 
 	if (bip)
 		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-			 offsetof(struct xfs_agfl, agfl_crc));
+	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_agfl_buf_ops = {
@@ -2238,19 +2234,17 @@ xfs_agf_read_verify(
 	struct xfs_buf	*bp)
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
-	int		agf_ok = 1;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-					  offsetof(struct xfs_agf, agf_crc));
 
-	agf_ok = agf_ok && xfs_agf_verify(mp, bp);
-
-	if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
-			XFS_RANDOM_ALLOC_READ_AGF))) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+				XFS_ERRTAG_ALLOC_READ_AGF,
+				XFS_RANDOM_ALLOC_READ_AGF))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 static void
@@ -2261,8 +2255,8 @@ xfs_agf_write_verify(
 	struct xfs_buf_log_item	*bip = bp->b_fspriv;
 
 	if (!xfs_agf_verify(mp, bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 
@@ -2272,8 +2266,7 @@ xfs_agf_write_verify(
 	if (bip)
 		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-			 offsetof(struct xfs_agf, agf_crc));
+	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_agf_buf_ops = {
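These xfs_alloc.c hunks establish the buffer-verifier shape used across the rest of the XFS changes in this merge: a CRC mismatch is reported as EFSBADCRC, a structural check failure as EFSCORRUPTED, and a single xfs_verifier_error() call replaces the per-site XFS_CORRUPTION_ERROR() logging. As a template (sketch only; "foo" stands in for agf/agfl/allocbt/attr3 and so on):

/* Template of the converged read verifier; xfs_foo_verify() and
 * XFS_FOO_CRC_OFF are placeholders for the per-structure pieces.
 */
STATIC void
xfs_foo_read_verify(
	struct xfs_buf	*bp)
{
	if (!xfs_buf_verify_cksum(bp, XFS_FOO_CRC_OFF))
		xfs_buf_ioerror(bp, EFSBADCRC);		/* media/checksum problem */
	else if (!xfs_foo_verify(bp))
		xfs_buf_ioerror(bp, EFSCORRUPTED);	/* structure problem */

	if (bp->b_error)
		xfs_verifier_error(bp);	/* one place logs and dumps the buffer */
}

Distinguishing the two error codes tells the administrator whether to suspect the storage (bad CRC) or a filesystem bug/corruption (bad structure).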
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 13085429e523..cc1eadcbb049 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -355,12 +355,14 @@ static void
 xfs_allocbt_read_verify(
 	struct xfs_buf	*bp)
 {
-	if (!(xfs_btree_sblock_verify_crc(bp) &&
-	      xfs_allocbt_verify(bp))) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-				     bp->b_target->bt_mount, bp->b_addr);
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_allocbt_verify(bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
 	}
 }
 
@@ -370,9 +372,9 @@ xfs_allocbt_write_verify(
 {
 	if (!xfs_allocbt_verify(bp)) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-				     bp->b_target->bt_mount, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
 	}
 	xfs_btree_sblock_calc_crc(bp);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index db2cfb067d0b..0479c32c5eb1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -632,38 +632,46 @@ xfs_map_at_offset(
 }
 
 /*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
+ * Test if a given page contains at least one buffer of a given @type.
+ * If @check_all_buffers is true, then we walk all the buffers in the page to
+ * try to find one of the type passed in. If it is not set, then the caller only
+ * needs to check the first buffer on the page for a match.
  */
-STATIC int
+STATIC bool
 xfs_check_page_type(
 	struct page		*page,
-	unsigned int		type)
+	unsigned int		type,
+	bool			check_all_buffers)
 {
-	if (PageWriteback(page))
-		return 0;
+	struct buffer_head	*bh;
+	struct buffer_head	*head;
 
-	if (page->mapping && page_has_buffers(page)) {
-		struct buffer_head	*bh, *head;
-		int			acceptable = 0;
+	if (PageWriteback(page))
+		return false;
+	if (!page->mapping)
+		return false;
+	if (!page_has_buffers(page))
+		return false;
 
-		bh = head = page_buffers(page);
-		do {
-			if (buffer_unwritten(bh))
-				acceptable += (type == XFS_IO_UNWRITTEN);
-			else if (buffer_delay(bh))
-				acceptable += (type == XFS_IO_DELALLOC);
-			else if (buffer_dirty(bh) && buffer_mapped(bh))
-				acceptable += (type == XFS_IO_OVERWRITE);
-			else
-				break;
-		} while ((bh = bh->b_this_page) != head);
+	bh = head = page_buffers(page);
+	do {
+		if (buffer_unwritten(bh)) {
+			if (type == XFS_IO_UNWRITTEN)
+				return true;
+		} else if (buffer_delay(bh)) {
+			if (type == XFS_IO_DELALLOC)
+				return true;
+		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
+			if (type == XFS_IO_OVERWRITE)
+				return true;
+		}
 
-		if (acceptable)
-			return 1;
-	}
+		/* If we are only checking the first buffer, we are done now. */
+		if (!check_all_buffers)
+			break;
+	} while ((bh = bh->b_this_page) != head);
 
-	return 0;
+	return false;
 }
 
 /*
@@ -697,7 +705,7 @@ xfs_convert_page(
 		goto fail_unlock_page;
 	if (page->mapping != inode->i_mapping)
 		goto fail_unlock_page;
-	if (!xfs_check_page_type(page, (*ioendp)->io_type))
+	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
 		goto fail_unlock_page;
 
 	/*
@@ -742,6 +750,15 @@ xfs_convert_page(
 	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
 	page_dirty = p_offset / len;
 
+	/*
+	 * The moment we find a buffer that doesn't match our current type
+	 * specification or can't be written, abort the loop and start
+	 * writeback. As per the above xfs_imap_valid() check, only
+	 * xfs_vm_writepage() can handle partial page writeback fully - we are
+	 * limited here to the buffers that are contiguous with the current
+	 * ioend, and hence a buffer we can't write breaks that contiguity and
+	 * we have to defer the rest of the IO to xfs_vm_writepage().
+	 */
 	bh = head = page_buffers(page);
 	do {
 		if (offset >= end_offset)
@@ -750,7 +767,7 @@ xfs_convert_page(
 			uptodate = 0;
 		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
 			done = 1;
-			continue;
+			break;
 		}
 
 		if (buffer_unwritten(bh) || buffer_delay(bh) ||
@@ -762,10 +779,11 @@ xfs_convert_page(
 			else
 				type = XFS_IO_OVERWRITE;
 
-			if (!xfs_imap_valid(inode, imap, offset)) {
-				done = 1;
-				continue;
-			}
+			/*
+			 * imap should always be valid because of the above
+			 * partial page end_offset check on the imap.
+			 */
+			ASSERT(xfs_imap_valid(inode, imap, offset));
 
 			lock_buffer(bh);
 			if (type != XFS_IO_OVERWRITE)
@@ -777,6 +795,7 @@ xfs_convert_page(
 			count++;
 		} else {
 			done = 1;
+			break;
 		}
 	} while (offset += len, (bh = bh->b_this_page) != head);
 
@@ -868,7 +887,7 @@ xfs_aops_discard_page(
 	struct buffer_head	*bh, *head;
 	loff_t			offset = page_offset(page);
 
-	if (!xfs_check_page_type(page, XFS_IO_DELALLOC))
+	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
 		goto out_invalidate;
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1325,6 +1344,14 @@ __xfs_get_blocks(
 	/*
 	 * If this is O_DIRECT or the mpage code calling tell them how large
 	 * the mapping is, so that we can avoid repeated get_blocks calls.
+	 *
+	 * If the mapping spans EOF, then we have to break the mapping up as the
+	 * mapping for blocks beyond EOF must be marked new so that sub block
+	 * regions can be correctly zeroed. We can't do this for mappings within
+	 * EOF unless the mapping was just allocated or is unwritten, otherwise
+	 * the callers would overwrite existing data with zeros. Hence we have
+	 * to split the mapping into a range up to and including EOF, and a
+	 * second mapping for beyond EOF.
 	 */
 	if (direct || size > (1 << inode->i_blkbits)) {
 		xfs_off_t		mapping_size;
@@ -1335,6 +1362,12 @@ __xfs_get_blocks(
 		ASSERT(mapping_size > 0);
 		if (mapping_size > size)
 			mapping_size = size;
+		if (offset < i_size_read(inode) &&
+		    offset + mapping_size >= i_size_read(inode)) {
+			/* limit mapping to block that spans EOF */
+			mapping_size = roundup_64(i_size_read(inode) - offset,
+						  1 << inode->i_blkbits);
+		}
 		if (mapping_size > LONG_MAX)
 			mapping_size = LONG_MAX;
 
@@ -1441,7 +1474,8 @@ xfs_vm_direct_IO(
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					    offset, nr_segs,
 					    xfs_get_blocks_direct,
-					    xfs_end_io_direct_write, NULL, 0);
+					    xfs_end_io_direct_write, NULL,
+					    DIO_ASYNC_EXTEND);
 		if (ret != -EIOCBQUEUED && iocb->private)
 			goto out_destroy_ioend;
 	} else {
@@ -1546,6 +1580,16 @@ xfs_vm_write_failed(
 
 		xfs_vm_kill_delalloc_range(inode, block_offset,
 					   block_offset + bh->b_size);
+
+		/*
+		 * This buffer does not contain data anymore. make sure anyone
+		 * who finds it knows that for certain.
+		 */
+		clear_buffer_delay(bh);
+		clear_buffer_uptodate(bh);
+		clear_buffer_mapped(bh);
+		clear_buffer_new(bh);
+		clear_buffer_dirty(bh);
 	}
 
 }
@@ -1579,12 +1623,21 @@ xfs_vm_write_begin(
 	status = __block_write_begin(page, pos, len, xfs_get_blocks);
 	if (unlikely(status)) {
 		struct inode	*inode = mapping->host;
+		size_t		isize = i_size_read(inode);
 
 		xfs_vm_write_failed(inode, page, pos, len);
 		unlock_page(page);
 
-		if (pos + len > i_size_read(inode))
-			truncate_pagecache(inode, i_size_read(inode));
+		/*
+		 * If the write is beyond EOF, we only want to kill blocks
+		 * allocated in this write, not blocks that were previously
+		 * written successfully.
+		 */
+		if (pos + len > isize) {
+			ssize_t start = max_t(ssize_t, pos, isize);
+
+			truncate_pagecache_range(inode, start, pos + len);
+		}
 
 		page_cache_release(page);
 		page = NULL;
@@ -1595,9 +1648,12 @@ xfs_vm_write_begin(
 }
 
 /*
- * On failure, we only need to kill delalloc blocks beyond EOF because they
- * will never be written. For blocks within EOF, generic_write_end() zeros them
- * so they are safe to leave alone and be written with all the other valid data.
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
  */
 STATIC int
 xfs_vm_write_end(
@@ -1620,8 +1676,11 @@ xfs_vm_write_end(
 		loff_t		to = pos + len;
 
 		if (to > isize) {
-			truncate_pagecache(inode, isize);
+			/* only kill blocks in this write beyond EOF */
+			if (pos > isize)
+				isize = pos;
 			xfs_vm_kill_delalloc_range(inode, isize, to);
+			truncate_pagecache_range(inode, isize, to);
 		}
 	}
 	return ret;
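A worked example of the narrowed trim in the write_begin/write_end failure paths (numbers illustrative): with i_size at 100k, suppose an earlier write has already successfully allocated blocks out past EOF, and a new write at pos = 200k, len = 4k then fails. The old truncate_pagecache(inode, 100k) discarded everything beyond EOF, including the earlier write's good blocks; the new code computes start = max(pos, isize) = 200k and calls truncate_pagecache_range(inode, 200k, 204k), so only the failed write's own range is thrown away.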
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 01b6a0102fbd..abda1124a70f 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -213,7 +213,7 @@ xfs_attr_calc_size(
 		 * Out of line attribute, cannot double split, but
 		 * make room for the attribute value itself.
 		 */
-		uint	dblocks = XFS_B_TO_FSB(mp, valuelen);
+		uint	dblocks = xfs_attr3_rmt_blocks(mp, valuelen);
 		nblks += dblocks;
 		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
 	}
@@ -698,11 +698,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 
 		trace_xfs_attr_leaf_replace(args);
 
+		/* save the attribute state for later removal*/
 		args->op_flags |= XFS_DA_OP_RENAME;	/* an atomic rename */
 		args->blkno2 = args->blkno;		/* set 2nd entry info*/
 		args->index2 = args->index;
 		args->rmtblkno2 = args->rmtblkno;
 		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
 	}
 
 	/*
@@ -794,6 +805,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		args->blkno = args->blkno2;
 		args->rmtblkno = args->rmtblkno2;
 		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
 		if (args->rmtblkno) {
 			error = xfs_attr_rmtval_remove(args);
 			if (error)
@@ -999,13 +1011,22 @@ restart:
 
 		trace_xfs_attr_node_replace(args);
 
+		/* save the attribute state for later removal*/
 		args->op_flags |= XFS_DA_OP_RENAME;	/* atomic rename op */
 		args->blkno2 = args->blkno;		/* set 2nd entry info*/
 		args->index2 = args->index;
 		args->rmtblkno2 = args->rmtblkno;
 		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
 		args->rmtblkno = 0;
 		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
 	}
 
 	retval = xfs_attr3_leaf_add(blk->bp, state->args);
@@ -1133,6 +1154,7 @@ restart:
 		args->blkno = args->blkno2;
 		args->rmtblkno = args->rmtblkno2;
 		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
 		if (args->rmtblkno) {
 			error = xfs_attr_rmtval_remove(args);
 			if (error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 7b126f46a2f9..511c283459b1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -213,8 +213,8 @@ xfs_attr3_leaf_write_verify(
 	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
 
 	if (!xfs_attr3_leaf_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 
@@ -224,7 +224,7 @@ xfs_attr3_leaf_write_verify(
 	if (bip)
 		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
 
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF);
+	xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
 }
 
 /*
@@ -239,13 +239,14 @@ xfs_attr3_leaf_read_verify(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
-	if ((xfs_sb_version_hascrc(&mp->m_sb) &&
-	     !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-					XFS_ATTR3_LEAF_CRC_OFF)) ||
-	    !xfs_attr3_leaf_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_attr3_leaf_verify(bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
@@ -1228,6 +1229,7 @@ xfs_attr3_leaf_add_work(
 		name_rmt->valueblk = 0;
 		args->rmtblkno = 1;
 		args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+		args->rmtvaluelen = args->valuelen;
 	}
 	xfs_trans_log_buf(args->trans, bp,
 	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
@@ -2166,11 +2168,11 @@ xfs_attr3_leaf_lookup_int(
 			if (!xfs_attr_namesp_match(args->flags, entry->flags))
 				continue;
 			args->index = probe;
-			args->valuelen = be32_to_cpu(name_rmt->valuelen);
+			args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
 			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
 			args->rmtblkcnt = xfs_attr3_rmt_blocks(
 							args->dp->i_mount,
-						       args->valuelen);
+						       args->rmtvaluelen);
 			return XFS_ERROR(EEXIST);
 		}
 	}
@@ -2219,19 +2221,19 @@ xfs_attr3_leaf_getvalue(
 		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		ASSERT(name_rmt->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
-		valuelen = be32_to_cpu(name_rmt->valuelen);
+		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
 		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
 		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
-						       valuelen);
+						       args->rmtvaluelen);
 		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = valuelen;
+			args->valuelen = args->rmtvaluelen;
 			return 0;
 		}
-		if (args->valuelen < valuelen) {
-			args->valuelen = valuelen;
+		if (args->valuelen < args->rmtvaluelen) {
+			args->valuelen = args->rmtvaluelen;
 			return XFS_ERROR(ERANGE);
 		}
-		args->valuelen = valuelen;
+		args->valuelen = args->rmtvaluelen;
 	}
 	return 0;
 }
@@ -2518,7 +2520,7 @@ xfs_attr3_leaf_clearflag(
 		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
 		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-		name_rmt->valuelen = cpu_to_be32(args->valuelen);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
 		xfs_trans_log_buf(args->trans, bp,
 			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
 	}
@@ -2676,7 +2678,7 @@ xfs_attr3_leaf_flipflags(
 		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
 		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-		name_rmt->valuelen = cpu_to_be32(args->valuelen);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
 		xfs_trans_log_buf(args->trans, bp1,
 			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
 	}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 01db96f60cf0..833fe5d98d80 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int(
 			args.dp = context->dp;
 			args.whichfork = XFS_ATTR_FORK;
 			args.valuelen = valuelen;
+			args.rmtvaluelen = valuelen;
 			args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
 			args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
 			args.rmtblkcnt = xfs_attr3_rmt_blocks(
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 5549d69ddb45..d2e6e948cec7 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -125,7 +125,6 @@ xfs_attr3_rmt_read_verify(
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 	char		*ptr;
 	int		len;
-	bool		corrupt = false;
 	xfs_daddr_t	bno;
 
 	/* no verification of non-crc buffers */
@@ -140,11 +139,11 @@ xfs_attr3_rmt_read_verify(
 	while (len > 0) {
 		if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
 				      XFS_ATTR3_RMT_CRC_OFF)) {
-			corrupt = true;
+			xfs_buf_ioerror(bp, EFSBADCRC);
 			break;
 		}
 		if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
-			corrupt = true;
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
 			break;
 		}
 		len -= XFS_LBSIZE(mp);
@@ -152,10 +151,9 @@ xfs_attr3_rmt_read_verify(
 		ptr += XFS_LBSIZE(mp);
 		bno += mp->m_bsize;
 	}
-	if (corrupt) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	} else
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+	else
 		ASSERT(len == 0);
 }
 
@@ -180,9 +178,8 @@ xfs_attr3_rmt_write_verify(
 
 	while (len > 0) {
 		if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
-			XFS_CORRUPTION_ERROR(__func__,
-			    XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			xfs_verifier_error(bp);
 			return;
 		}
 		if (bip) {
@@ -340,7 +337,7 @@ xfs_attr_rmtval_get(
 	struct xfs_buf	*bp;
 	xfs_dablk_t	lblkno = args->rmtblkno;
 	__uint8_t	*dst = args->value;
-	int		valuelen = args->valuelen;
+	int		valuelen;
 	int		nmap;
 	int		error;
 	int		blkcnt = args->rmtblkcnt;
@@ -350,7 +347,9 @@ xfs_attr_rmtval_get(
 	trace_xfs_attr_rmtval_get(args);
 
 	ASSERT(!(args->flags & ATTR_KERNOVAL));
+	ASSERT(args->rmtvaluelen == args->valuelen);
 
+	valuelen = args->rmtvaluelen;
 	while (valuelen > 0) {
 		nmap = ATTR_RMTVALUE_MAPSIZE;
 		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
@@ -418,7 +417,7 @@ xfs_attr_rmtval_set(
 	 * attributes have headers, we can't just do a straight byte to FSB
 	 * conversion and have to take the header space into account.
 	 */
-	blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+	blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
 	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
 						   XFS_ATTR_FORK);
 	if (error)
@@ -483,7 +482,7 @@ xfs_attr_rmtval_set(
 	 */
 	lblkno = args->rmtblkno;
 	blkcnt = args->rmtblkcnt;
-	valuelen = args->valuelen;
+	valuelen = args->rmtvaluelen;
 	while (valuelen > 0) {
 		struct xfs_buf	*bp;
 		xfs_daddr_t	dblkno;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 152543c4ca70..f0efc7e970ef 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5378,3 +5378,201 @@ error0:
 	}
 	return error;
 }
+
+/*
+ * Shift extent records to the left to cover a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. If there is no hole to shift the extents
+ * into, this will be considered invalid operation and we abort immediately.
+ */
+int
+xfs_bmap_shift_extents(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			*done,
+	xfs_fileoff_t		start_fsb,
+	xfs_fileoff_t		offset_shift_fsb,
+	xfs_extnum_t		*current_ext,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_bmap_free	*flist,
+	int			num_exts)
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		left;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp;
+	xfs_extnum_t			nexts = 0;
+	xfs_fileoff_t			startoff;
+	int				error = 0;
+	int				i;
+	int				whichfork = XFS_DATA_FORK;
+	int				logflags;
+	xfs_filblks_t			blockcount = 0;
+	int				total_extents;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+				 XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	ASSERT(current_ext != NULL);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		/* Read in all the extents */
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If *current_ext is 0, we would need to lookup the extent
+	 * from where we would start shifting and store it in gotp.
+	 */
+	if (!*current_ext) {
+		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+		/*
+		 * gotp can be null in 2 cases: 1) if there are no extents
+		 * or 2) start_fsb lies in a hole beyond which there are
+		 * no extents. Either way, we are done.
+		 */
+		if (!gotp) {
+			*done = 1;
+			return 0;
+		}
+	}
+
+	/* We are going to change core inode */
+	logflags = XFS_ILOG_CORE;
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	} else {
+		cur = NULL;
+		logflags |= XFS_ILOG_DEXT;
+	}
+
+	/*
+	 * There may be delalloc extents in the data fork before the range we
+	 * are collapsing out, so we cannot
+	 * use the count of real extents here. Instead we have to calculate it
+	 * from the incore fork.
+	 */
+	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	while (nexts++ < num_exts && *current_ext < total_extents) {
+
+		gotp = xfs_iext_get_ext(ifp, *current_ext);
+		xfs_bmbt_get_all(gotp, &got);
+		startoff = got.br_startoff - offset_shift_fsb;
+
+		/*
+		 * Before shifting extent into hole, make sure that the hole
+		 * is large enough to accomodate the shift.
+		 */
+		if (*current_ext) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+						*current_ext - 1), &left);
+
+			if (startoff < left.br_startoff + left.br_blockcount)
+				error = XFS_ERROR(EINVAL);
+		} else if (offset_shift_fsb > got.br_startoff) {
+			/*
+			 * When first extent is shifted, offset_shift_fsb
+			 * should be less than the stating offset of
+			 * the first extent.
+			 */
+			error = XFS_ERROR(EINVAL);
+		}
+
+		if (error)
+			goto del_cursor;
+
+		if (cur) {
+			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+						   got.br_startblock,
+						   got.br_blockcount,
+						   &i);
+			if (error)
+				goto del_cursor;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+		}
+
+		/* Check if we can merge 2 adjacent extents */
+		if (*current_ext &&
+		    left.br_startoff + left.br_blockcount == startoff &&
+		    left.br_startblock + left.br_blockcount ==
+				got.br_startblock &&
+		    left.br_state == got.br_state &&
+		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+			blockcount = left.br_blockcount +
+				got.br_blockcount;
+			xfs_iext_remove(ip, *current_ext, 1, 0);
+			if (cur) {
+				error = xfs_btree_delete(cur, &i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+			gotp = xfs_iext_get_ext(ifp, --*current_ext);
+			xfs_bmbt_get_all(gotp, &got);
+
+			/* Make cursor point to the extent we will update */
+			if (cur) {
+				error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+							   got.br_startblock,
+							   got.br_blockcount,
+							   &i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+
+			xfs_bmbt_set_blockcount(gotp, blockcount);
+			got.br_blockcount = blockcount;
+		} else {
+			/* We have to update the startoff */
+			xfs_bmbt_set_startoff(gotp, startoff);
+			got.br_startoff = startoff;
+		}
+
+		if (cur) {
+			error = xfs_bmbt_update(cur, got.br_startoff,
+						got.br_startblock,
+						got.br_blockcount,
+						got.br_state);
+			if (error)
+				goto del_cursor;
+		}
+
+		(*current_ext)++;
+		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	}
+
+	/* Check if we are done */
+	if (*current_ext == total_extents)
+		*done = 1;
+
+del_cursor:
+	if (cur)
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	xfs_trans_log_inode(tp, ip, logflags);
+	return error;
+}
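The core of the shift loop is its merge test: after an extent is moved left by offset_shift_fsb, it is folded into its left neighbour only when the two become logically and physically contiguous, share the same unwritten/normal state, and the combined length still fits one record. A self-contained sketch of that predicate (types simplified; in the kernel these are xfs_bmbt_irec fields and MAXEXTLEN):

struct irec { unsigned long long startoff, startblock, blockcount; int state; };

static int can_merge(const struct irec *left, const struct irec *got,
		     unsigned long long shift, unsigned long long maxextlen)
{
	unsigned long long startoff = got->startoff - shift;	/* post-shift offset */

	return left->startoff + left->blockcount == startoff &&	/* logically adjacent */
	       left->startblock + left->blockcount == got->startblock && /* physically adjacent */
	       left->state == got->state &&				/* same unwritten state */
	       left->blockcount + got->blockcount <= maxextlen;	/* fits one record */
}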
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 33b41f351225..f84bd7af43be 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -127,6 +127,16 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 	{ BMAP_RIGHT_FILLING,	"RF" }, \
 	{ BMAP_ATTRFORK,	"ATTR" }
 
+
+/*
+ * This macro is used to determine how many extents will be shifted
+ * in one write transaction. We could require two splits,
+ * an extent move on the first and an extent merge on the second,
+ * So it is proper that one extent is shifted inside write transaction
+ * at a time.
+ */
+#define XFS_BMAP_MAX_SHIFT_EXTENTS	1
+
 #ifdef DEBUG
 void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
 		int whichfork, unsigned long caller_ip);
@@ -169,5 +179,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
+int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+		int *done, xfs_fileoff_t start_fsb,
+		xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
+		xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
+		int num_exts);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 706bc3f777cb..818d546664e7 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -780,12 +780,14 @@ static void
780xfs_bmbt_read_verify( 780xfs_bmbt_read_verify(
781 struct xfs_buf *bp) 781 struct xfs_buf *bp)
782{ 782{
783 if (!(xfs_btree_lblock_verify_crc(bp) && 783 if (!xfs_btree_lblock_verify_crc(bp))
784 xfs_bmbt_verify(bp))) { 784 xfs_buf_ioerror(bp, EFSBADCRC);
785 trace_xfs_btree_corrupt(bp, _RET_IP_); 785 else if (!xfs_bmbt_verify(bp))
786 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
787 bp->b_target->bt_mount, bp->b_addr);
788 xfs_buf_ioerror(bp, EFSCORRUPTED); 786 xfs_buf_ioerror(bp, EFSCORRUPTED);
787
788 if (bp->b_error) {
789 trace_xfs_btree_corrupt(bp, _RET_IP_);
790 xfs_verifier_error(bp);
789 } 791 }
790} 792}
791 793
@@ -794,11 +796,9 @@ xfs_bmbt_write_verify(
794 struct xfs_buf *bp) 796 struct xfs_buf *bp)
795{ 797{
796 if (!xfs_bmbt_verify(bp)) { 798 if (!xfs_bmbt_verify(bp)) {
797 xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn);
798 trace_xfs_btree_corrupt(bp, _RET_IP_); 799 trace_xfs_btree_corrupt(bp, _RET_IP_);
799 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
800 bp->b_target->bt_mount, bp->b_addr);
801 xfs_buf_ioerror(bp, EFSCORRUPTED); 800 xfs_buf_ioerror(bp, EFSCORRUPTED);
801 xfs_verifier_error(bp);
802 return; 802 return;
803 } 803 }
804 xfs_btree_lblock_calc_crc(bp); 804 xfs_btree_lblock_calc_crc(bp);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index f264616080ca..296160b8e78c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1349,7 +1349,6 @@ xfs_free_file_space(
1349 * the freeing of the space succeeds at ENOSPC. 1349 * the freeing of the space succeeds at ENOSPC.
1350 */ 1350 */
1351 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 1351 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1352 tp->t_flags |= XFS_TRANS_RESERVE;
1353 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); 1352 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
1354 1353
1355 /* 1354 /*
@@ -1419,6 +1418,8 @@ xfs_zero_file_space(
1419 xfs_off_t end_boundary; 1418 xfs_off_t end_boundary;
1420 int error; 1419 int error;
1421 1420
1421 trace_xfs_zero_file_space(ip);
1422
1422 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1423 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1423 1424
1424 /* 1425 /*
@@ -1433,9 +1434,18 @@ xfs_zero_file_space(
1433 ASSERT(end_boundary <= offset + len); 1434 ASSERT(end_boundary <= offset + len);
1434 1435
1435 if (start_boundary < end_boundary - 1) { 1436 if (start_boundary < end_boundary - 1) {
1436 /* punch out the page cache over the conversion range */ 1437 /*
1438 * punch out delayed allocation blocks and the page cache over
1439 * the conversion range
1440 */
1441 xfs_ilock(ip, XFS_ILOCK_EXCL);
1442 error = xfs_bmap_punch_delalloc_range(ip,
1443 XFS_B_TO_FSBT(mp, start_boundary),
1444 XFS_B_TO_FSB(mp, end_boundary - start_boundary));
1445 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1437 truncate_pagecache_range(VFS_I(ip), start_boundary, 1446 truncate_pagecache_range(VFS_I(ip), start_boundary,
1438 end_boundary - 1); 1447 end_boundary - 1);
1448
1439 /* convert the blocks */ 1449 /* convert the blocks */
1440 error = xfs_alloc_file_space(ip, start_boundary, 1450 error = xfs_alloc_file_space(ip, start_boundary,
1441 end_boundary - start_boundary - 1, 1451 end_boundary - start_boundary - 1,
@@ -1468,6 +1478,102 @@ out:
1468} 1478}
1469 1479
1470/* 1480/*
1481 * xfs_collapse_file_space()
1482 * This routine frees disk space and shifts extents for the given file.
1483 * It first frees the data blocks in the specified range by calling
1484 * xfs_free_file_space(), which also syncs dirty data and invalidates
1485 * the page cache over the region being collapsed. It then shifts the
1486 * extent records to the left to cover the resulting hole.
1487 * RETURNS:
1488 * 0 on success
1489 * errno on error
1490 *
1491 */
1492int
1493xfs_collapse_file_space(
1494 struct xfs_inode *ip,
1495 xfs_off_t offset,
1496 xfs_off_t len)
1497{
1498 int done = 0;
1499 struct xfs_mount *mp = ip->i_mount;
1500 struct xfs_trans *tp;
1501 int error;
1502 xfs_extnum_t current_ext = 0;
1503 struct xfs_bmap_free free_list;
1504 xfs_fsblock_t first_block;
1505 int committed;
1506 xfs_fileoff_t start_fsb;
1507 xfs_fileoff_t shift_fsb;
1508
1509 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1510
1511 trace_xfs_collapse_file_space(ip);
1512
1513 start_fsb = XFS_B_TO_FSB(mp, offset + len);
1514 shift_fsb = XFS_B_TO_FSB(mp, len);
1515
1516 error = xfs_free_file_space(ip, offset, len);
1517 if (error)
1518 return error;
1519
1520 while (!error && !done) {
1521 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1522 tp->t_flags |= XFS_TRANS_RESERVE;
1523 /*
1524		 * We need to reserve a permanent block count for the transaction.
1525		 * This comes into play when, after shifting an extent into the
1526		 * hole, we find that adjacent extents can be merged, which may
1527		 * free a block during the record update.
1528 */
1529 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1530 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
1531 if (error) {
1532 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1533 xfs_trans_cancel(tp, 0);
1534 break;
1535 }
1536
1537 xfs_ilock(ip, XFS_ILOCK_EXCL);
1538 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
1539 ip->i_gdquot, ip->i_pdquot,
1540 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
1541 XFS_QMOPT_RES_REGBLKS);
1542 if (error)
1543 goto out;
1544
1545 xfs_trans_ijoin(tp, ip, 0);
1546
1547 xfs_bmap_init(&free_list, &first_block);
1548
1549 /*
1550		 * We are using the write transaction, which allows a
1551		 * maximum of two bmbt updates.
1552 */
1553 error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
1554 shift_fsb, &current_ext,
1555 &first_block, &free_list,
1556 XFS_BMAP_MAX_SHIFT_EXTENTS);
1557 if (error)
1558 goto out;
1559
1560 error = xfs_bmap_finish(&tp, &free_list, &committed);
1561 if (error)
1562 goto out;
1563
1564 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1565 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1566 }
1567
1568 return error;
1569
1570out:
1571 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1572 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1573 return error;
1574}
1575
1576/*
1471 * We need to check that the format of the data fork in the temporary inode is 1577 * We need to check that the format of the data fork in the temporary inode is
1472 * valid for the target inode before doing the swap. This is not a problem with 1578 * valid for the target inode before doing the swap. This is not a problem with
1473 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized 1579 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
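For reference, this path is reached from userspace through fallocate(2). A hedged sketch, assuming a kernel carrying this patch and a linux/falloc.h that exposes FALLOC_FL_COLLAPSE_RANGE (the file name and sizes are whatever the caller chooses): both offset and len must be multiples of the filesystem block size, and the range must end before EOF, or the call fails with EINVAL.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s file offset len\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* removes [offset, offset+len) and shifts the tail of the file left */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE,
		      atoll(argv[2]), atoll(argv[3])) < 0)
		perror("fallocate(FALLOC_FL_COLLAPSE_RANGE)");

	close(fd);
	return 0;
}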
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 900747b25772..935ed2b24edf 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -99,6 +99,8 @@ int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
99 xfs_off_t len); 99 xfs_off_t len);
100int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, 100int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
101 xfs_off_t len); 101 xfs_off_t len);
102int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
103 xfs_off_t len);
102 104
103/* EOF block manipulation functions */ 105/* EOF block manipulation functions */
104bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); 106bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 9adaae4f3e2f..e80d59fdf89a 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -234,8 +234,7 @@ xfs_btree_lblock_calc_crc(
234 return; 234 return;
235 if (bip) 235 if (bip)
236 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 236 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
237 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 237 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
238 XFS_BTREE_LBLOCK_CRC_OFF);
239} 238}
240 239
241bool 240bool
@@ -243,8 +242,8 @@ xfs_btree_lblock_verify_crc(
243 struct xfs_buf *bp) 242 struct xfs_buf *bp)
244{ 243{
245 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 244 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
246 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 245 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
247 XFS_BTREE_LBLOCK_CRC_OFF); 246
248 return true; 247 return true;
249} 248}
250 249
@@ -267,8 +266,7 @@ xfs_btree_sblock_calc_crc(
267 return; 266 return;
268 if (bip) 267 if (bip)
269 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 268 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
270 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 269 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
271 XFS_BTREE_SBLOCK_CRC_OFF);
272} 270}
273 271
274bool 272bool
@@ -276,8 +274,8 @@ xfs_btree_sblock_verify_crc(
276 struct xfs_buf *bp) 274 struct xfs_buf *bp)
277{ 275{
278 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 276 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
279 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 277 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
280 XFS_BTREE_SBLOCK_CRC_OFF); 278
281 return true; 279 return true;
282} 280}
283 281
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9c061ef2b0d9..cb10a0aaab3a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -396,7 +396,17 @@ _xfs_buf_map_pages(
396 bp->b_addr = NULL; 396 bp->b_addr = NULL;
397 } else { 397 } else {
398 int retried = 0; 398 int retried = 0;
399 unsigned noio_flag;
399 400
401 /*
 402		 * vm_map_ram() will allocate auxiliary structures (e.g.
403 * pagetables) with GFP_KERNEL, yet we are likely to be under
404 * GFP_NOFS context here. Hence we need to tell memory reclaim
405 * that we are in such a context via PF_MEMALLOC_NOIO to prevent
406 * memory reclaim re-entering the filesystem here and
407 * potentially deadlocking.
408 */
409 noio_flag = memalloc_noio_save();
400 do { 410 do {
401 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 411 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
402 -1, PAGE_KERNEL); 412 -1, PAGE_KERNEL);
@@ -404,6 +414,7 @@ _xfs_buf_map_pages(
404 break; 414 break;
405 vm_unmap_aliases(); 415 vm_unmap_aliases();
406 } while (retried++ <= 1); 416 } while (retried++ <= 1);
417 memalloc_noio_restore(noio_flag);
407 418
408 if (!bp->b_addr) 419 if (!bp->b_addr)
409 return -ENOMEM; 420 return -ENOMEM;
@@ -1361,21 +1372,29 @@ xfs_buf_iorequest(
1361 xfs_buf_wait_unpin(bp); 1372 xfs_buf_wait_unpin(bp);
1362 xfs_buf_hold(bp); 1373 xfs_buf_hold(bp);
1363 1374
1364 /* Set the count to 1 initially, this will stop an I/O 1375 /*
1376 * Set the count to 1 initially, this will stop an I/O
1365 * completion callout which happens before we have started 1377 * completion callout which happens before we have started
1366 * all the I/O from calling xfs_buf_ioend too early. 1378 * all the I/O from calling xfs_buf_ioend too early.
1367 */ 1379 */
1368 atomic_set(&bp->b_io_remaining, 1); 1380 atomic_set(&bp->b_io_remaining, 1);
1369 _xfs_buf_ioapply(bp); 1381 _xfs_buf_ioapply(bp);
1370 _xfs_buf_ioend(bp, 1); 1382 /*
1383 * If _xfs_buf_ioapply failed, we'll get back here with
1384 * only the reference we took above. _xfs_buf_ioend will
1385 * drop it to zero, so we'd better not queue it for later,
1386 * or we'll free it before it's done.
1387 */
1388 _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
1371 1389
1372 xfs_buf_rele(bp); 1390 xfs_buf_rele(bp);
1373} 1391}
1374 1392
1375/* 1393/*
1376 * Waits for I/O to complete on the buffer supplied. It returns immediately if 1394 * Waits for I/O to complete on the buffer supplied. It returns immediately if
1377 * no I/O is pending or there is already a pending error on the buffer. It 1395 * no I/O is pending or there is already a pending error on the buffer, in which
1378 * returns the I/O error code, if any, or 0 if there was no error. 1396 * case nothing will ever complete. It returns the I/O error code, if any, or
1397 * 0 if there was no error.
1379 */ 1398 */
1380int 1399int
1381xfs_buf_iowait( 1400xfs_buf_iowait(
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 995339534db6..b8a3abf6cf47 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -369,6 +369,20 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
369 xfs_buf_rele(bp); 369 xfs_buf_rele(bp);
370} 370}
371 371
372static inline int
373xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
374{
375 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
376 cksum_offset);
377}
378
379static inline void
380xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
381{
382 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
383 cksum_offset);
384}
385
372/* 386/*
373 * Handling of buftargs. 387 * Handling of buftargs.
374 */ 388 */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 33149113e333..8752821443be 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -796,20 +796,6 @@ xfs_buf_item_init(
796 bip->bli_formats[i].blf_map_size = map_size; 796 bip->bli_formats[i].blf_map_size = map_size;
797 } 797 }
798 798
799#ifdef XFS_TRANS_DEBUG
800 /*
801 * Allocate the arrays for tracking what needs to be logged
802 * and what our callers request to be logged. bli_orig
803 * holds a copy of the original, clean buffer for comparison
804 * against, and bli_logged keeps a 1 bit flag per byte in
805 * the buffer to indicate which bytes the callers have asked
806 * to have logged.
807 */
808 bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
809 memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
810 bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
811#endif
812
813 /* 799 /*
814 * Put the buf item into the list of items attached to the 800 * Put the buf item into the list of items attached to the
815 * buffer at the front. 801 * buffer at the front.
@@ -957,11 +943,6 @@ STATIC void
957xfs_buf_item_free( 943xfs_buf_item_free(
958 xfs_buf_log_item_t *bip) 944 xfs_buf_log_item_t *bip)
959{ 945{
960#ifdef XFS_TRANS_DEBUG
961 kmem_free(bip->bli_orig);
962 kmem_free(bip->bli_logged);
963#endif /* XFS_TRANS_DEBUG */
964
965 xfs_buf_item_free_format(bip); 946 xfs_buf_item_free_format(bip);
966 kmem_zone_free(xfs_buf_item_zone, bip); 947 kmem_zone_free(xfs_buf_item_zone, bip);
967} 948}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 796272a2e129..6cc5f6785a77 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -185,8 +185,8 @@ xfs_da3_node_write_verify(
185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
186 186
187 if (!xfs_da3_node_verify(bp)) { 187 if (!xfs_da3_node_verify(bp)) {
188 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
189 xfs_buf_ioerror(bp, EFSCORRUPTED); 188 xfs_buf_ioerror(bp, EFSCORRUPTED);
189 xfs_verifier_error(bp);
190 return; 190 return;
191 } 191 }
192 192
@@ -196,7 +196,7 @@ xfs_da3_node_write_verify(
196 if (bip) 196 if (bip)
197 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 197 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
198 198
199 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF); 199 xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
200} 200}
201 201
202/* 202/*
@@ -209,18 +209,20 @@ static void
209xfs_da3_node_read_verify( 209xfs_da3_node_read_verify(
210 struct xfs_buf *bp) 210 struct xfs_buf *bp)
211{ 211{
212 struct xfs_mount *mp = bp->b_target->bt_mount;
213 struct xfs_da_blkinfo *info = bp->b_addr; 212 struct xfs_da_blkinfo *info = bp->b_addr;
214 213
215 switch (be16_to_cpu(info->magic)) { 214 switch (be16_to_cpu(info->magic)) {
216 case XFS_DA3_NODE_MAGIC: 215 case XFS_DA3_NODE_MAGIC:
217 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 216 if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
218 XFS_DA3_NODE_CRC_OFF)) 217 xfs_buf_ioerror(bp, EFSBADCRC);
219 break; 218 break;
219 }
220 /* fall through */ 220 /* fall through */
221 case XFS_DA_NODE_MAGIC: 221 case XFS_DA_NODE_MAGIC:
222 if (!xfs_da3_node_verify(bp)) 222 if (!xfs_da3_node_verify(bp)) {
223 xfs_buf_ioerror(bp, EFSCORRUPTED);
223 break; 224 break;
225 }
224 return; 226 return;
225 case XFS_ATTR_LEAF_MAGIC: 227 case XFS_ATTR_LEAF_MAGIC:
226 case XFS_ATTR3_LEAF_MAGIC: 228 case XFS_ATTR3_LEAF_MAGIC:
@@ -237,8 +239,7 @@ xfs_da3_node_read_verify(
237 } 239 }
238 240
239 /* corrupt block */ 241 /* corrupt block */
240 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 242 xfs_verifier_error(bp);
241 xfs_buf_ioerror(bp, EFSCORRUPTED);
242} 243}
243 244
244const struct xfs_buf_ops xfs_da3_node_buf_ops = { 245const struct xfs_buf_ops xfs_da3_node_buf_ops = {
@@ -1295,7 +1296,7 @@ xfs_da3_fixhashpath(
1295 node = blk->bp->b_addr; 1296 node = blk->bp->b_addr;
1296 dp->d_ops->node_hdr_from_disk(&nodehdr, node); 1297 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1297 btree = dp->d_ops->node_tree_p(node); 1298 btree = dp->d_ops->node_tree_p(node);
1298 if (be32_to_cpu(btree->hashval) == lasthash) 1299 if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
1299 break; 1300 break;
1300 blk->hashval = lasthash; 1301 blk->hashval = lasthash;
1301 btree[blk->index].hashval = cpu_to_be32(lasthash); 1302 btree[blk->index].hashval = cpu_to_be32(lasthash);
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 6e95ea79f5d7..201c6091d26a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -60,10 +60,12 @@ typedef struct xfs_da_args {
60 int index; /* index of attr of interest in blk */ 60 int index; /* index of attr of interest in blk */
61 xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ 61 xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
62 int rmtblkcnt; /* remote attr value block count */ 62 int rmtblkcnt; /* remote attr value block count */
63 int rmtvaluelen; /* remote attr value length in bytes */
63 xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ 64 xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
64 int index2; /* index of 2nd attr in blk */ 65 int index2; /* index of 2nd attr in blk */
65 xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ 66 xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
66 int rmtblkcnt2; /* remote attr value block count */ 67 int rmtblkcnt2; /* remote attr value block count */
68 int rmtvaluelen2; /* remote attr value length in bytes */
67 int op_flags; /* operation flags */ 69 int op_flags; /* operation flags */
68 enum xfs_dacmp cmpresult; /* name compare result for lookups */ 70 enum xfs_dacmp cmpresult; /* name compare result for lookups */
69} xfs_da_args_t; 71} xfs_da_args_t;
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5869b50dc41..623bbe8fd921 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -89,6 +89,8 @@ typedef struct xfs_dinode {
89 /* structure must be padded to 64 bit alignment */ 89 /* structure must be padded to 64 bit alignment */
90} xfs_dinode_t; 90} xfs_dinode_t;
91 91
92#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
93
92#define DI_MAX_FLUSH 0xffff 94#define DI_MAX_FLUSH 0xffff
93 95
94/* 96/*
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index ce16ef02997a..fda46253966a 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -180,16 +180,23 @@ xfs_dir_init(
180 xfs_inode_t *dp, 180 xfs_inode_t *dp,
181 xfs_inode_t *pdp) 181 xfs_inode_t *pdp)
182{ 182{
183 xfs_da_args_t args; 183 struct xfs_da_args *args;
184 int error; 184 int error;
185 185
186 memset((char *)&args, 0, sizeof(args));
187 args.dp = dp;
188 args.trans = tp;
189 ASSERT(S_ISDIR(dp->i_d.di_mode)); 186 ASSERT(S_ISDIR(dp->i_d.di_mode));
190 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) 187 error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
188 if (error)
191 return error; 189 return error;
192 return xfs_dir2_sf_create(&args, pdp->i_ino); 190
191 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
192 if (!args)
193 return ENOMEM;
194
195 args->dp = dp;
196 args->trans = tp;
197 error = xfs_dir2_sf_create(args, pdp->i_ino);
198 kmem_free(args);
199 return error;
193} 200}
194 201
195/* 202/*
@@ -205,41 +212,56 @@ xfs_dir_createname(
205 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 212 xfs_bmap_free_t *flist, /* bmap's freeblock list */
206 xfs_extlen_t total) /* bmap's total block count */ 213 xfs_extlen_t total) /* bmap's total block count */
207{ 214{
208 xfs_da_args_t args; 215 struct xfs_da_args *args;
209 int rval; 216 int rval;
210 int v; /* type-checking value */ 217 int v; /* type-checking value */
211 218
212 ASSERT(S_ISDIR(dp->i_d.di_mode)); 219 ASSERT(S_ISDIR(dp->i_d.di_mode));
213 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 220 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
221 if (rval)
214 return rval; 222 return rval;
215 XFS_STATS_INC(xs_dir_create); 223 XFS_STATS_INC(xs_dir_create);
216 224
217 memset(&args, 0, sizeof(xfs_da_args_t)); 225 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
218 args.name = name->name; 226 if (!args)
219 args.namelen = name->len; 227 return ENOMEM;
220 args.filetype = name->type; 228
221 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 229 args->name = name->name;
222 args.inumber = inum; 230 args->namelen = name->len;
223 args.dp = dp; 231 args->filetype = name->type;
224 args.firstblock = first; 232 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
225 args.flist = flist; 233 args->inumber = inum;
226 args.total = total; 234 args->dp = dp;
227 args.whichfork = XFS_DATA_FORK; 235 args->firstblock = first;
228 args.trans = tp; 236 args->flist = flist;
229 args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 237 args->total = total;
230 238 args->whichfork = XFS_DATA_FORK;
231 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 239 args->trans = tp;
232 rval = xfs_dir2_sf_addname(&args); 240 args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
233 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 241
234 return rval; 242 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
235 else if (v) 243 rval = xfs_dir2_sf_addname(args);
236 rval = xfs_dir2_block_addname(&args); 244 goto out_free;
237 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 245 }
238 return rval; 246
239 else if (v) 247 rval = xfs_dir2_isblock(tp, dp, &v);
240 rval = xfs_dir2_leaf_addname(&args); 248 if (rval)
249 goto out_free;
250 if (v) {
251 rval = xfs_dir2_block_addname(args);
252 goto out_free;
253 }
254
255 rval = xfs_dir2_isleaf(tp, dp, &v);
256 if (rval)
257 goto out_free;
258 if (v)
259 rval = xfs_dir2_leaf_addname(args);
241 else 260 else
242 rval = xfs_dir2_node_addname(&args); 261 rval = xfs_dir2_node_addname(args);
262
263out_free:
264 kmem_free(args);
243 return rval; 265 return rval;
244} 266}
245 267
@@ -282,46 +304,66 @@ xfs_dir_lookup(
282 xfs_ino_t *inum, /* out: inode number */ 304 xfs_ino_t *inum, /* out: inode number */
283 struct xfs_name *ci_name) /* out: actual name if CI match */ 305 struct xfs_name *ci_name) /* out: actual name if CI match */
284{ 306{
285 xfs_da_args_t args; 307 struct xfs_da_args *args;
286 int rval; 308 int rval;
287 int v; /* type-checking value */ 309 int v; /* type-checking value */
288 310
289 ASSERT(S_ISDIR(dp->i_d.di_mode)); 311 ASSERT(S_ISDIR(dp->i_d.di_mode));
290 XFS_STATS_INC(xs_dir_lookup); 312 XFS_STATS_INC(xs_dir_lookup);
291 313
292 memset(&args, 0, sizeof(xfs_da_args_t)); 314 /*
293 args.name = name->name; 315 * We need to use KM_NOFS here so that lockdep will not throw false
294 args.namelen = name->len; 316 * positive deadlock warnings on a non-transactional lookup path. It is
 295	args.filetype = name->type;	 317	 * safe to recurse into inode reclaim in that case, but lockdep can't
 296	args.hashval = dp->i_mount->m_dirnameops->hashname(name);	 318	 * easily be taught about it. Hence KM_NOFS avoids having to
 297	args.dp = dp;	 319	 * add a bunch of lockdep class
298 args.whichfork = XFS_DATA_FORK; 320 * annotations into the reclaim path for the ilock.
299 args.trans = tp; 321 */
300 args.op_flags = XFS_DA_OP_OKNOENT; 322 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
323 args->name = name->name;
324 args->namelen = name->len;
325 args->filetype = name->type;
326 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
327 args->dp = dp;
328 args->whichfork = XFS_DATA_FORK;
329 args->trans = tp;
330 args->op_flags = XFS_DA_OP_OKNOENT;
301 if (ci_name) 331 if (ci_name)
302 args.op_flags |= XFS_DA_OP_CILOOKUP; 332 args->op_flags |= XFS_DA_OP_CILOOKUP;
303 333
304 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 334 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
305 rval = xfs_dir2_sf_lookup(&args); 335 rval = xfs_dir2_sf_lookup(args);
306 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 336 goto out_check_rval;
307 return rval; 337 }
308 else if (v) 338
309 rval = xfs_dir2_block_lookup(&args); 339 rval = xfs_dir2_isblock(tp, dp, &v);
310 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 340 if (rval)
311 return rval; 341 goto out_free;
312 else if (v) 342 if (v) {
313 rval = xfs_dir2_leaf_lookup(&args); 343 rval = xfs_dir2_block_lookup(args);
344 goto out_check_rval;
345 }
346
347 rval = xfs_dir2_isleaf(tp, dp, &v);
348 if (rval)
349 goto out_free;
350 if (v)
351 rval = xfs_dir2_leaf_lookup(args);
314 else 352 else
315 rval = xfs_dir2_node_lookup(&args); 353 rval = xfs_dir2_node_lookup(args);
354
355out_check_rval:
316 if (rval == EEXIST) 356 if (rval == EEXIST)
317 rval = 0; 357 rval = 0;
318 if (!rval) { 358 if (!rval) {
319 *inum = args.inumber; 359 *inum = args->inumber;
320 if (ci_name) { 360 if (ci_name) {
321 ci_name->name = args.value; 361 ci_name->name = args->value;
322 ci_name->len = args.valuelen; 362 ci_name->len = args->valuelen;
323 } 363 }
324 } 364 }
365out_free:
366 kmem_free(args);
325 return rval; 367 return rval;
326} 368}
327 369
@@ -338,38 +380,51 @@ xfs_dir_removename(
338 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 380 xfs_bmap_free_t *flist, /* bmap's freeblock list */
339 xfs_extlen_t total) /* bmap's total block count */ 381 xfs_extlen_t total) /* bmap's total block count */
340{ 382{
341 xfs_da_args_t args; 383 struct xfs_da_args *args;
342 int rval; 384 int rval;
343 int v; /* type-checking value */ 385 int v; /* type-checking value */
344 386
345 ASSERT(S_ISDIR(dp->i_d.di_mode)); 387 ASSERT(S_ISDIR(dp->i_d.di_mode));
346 XFS_STATS_INC(xs_dir_remove); 388 XFS_STATS_INC(xs_dir_remove);
347 389
348 memset(&args, 0, sizeof(xfs_da_args_t)); 390 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
349 args.name = name->name; 391 if (!args)
350 args.namelen = name->len; 392 return ENOMEM;
351 args.filetype = name->type; 393
352 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 394 args->name = name->name;
353 args.inumber = ino; 395 args->namelen = name->len;
354 args.dp = dp; 396 args->filetype = name->type;
355 args.firstblock = first; 397 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
356 args.flist = flist; 398 args->inumber = ino;
357 args.total = total; 399 args->dp = dp;
358 args.whichfork = XFS_DATA_FORK; 400 args->firstblock = first;
359 args.trans = tp; 401 args->flist = flist;
360 402 args->total = total;
361 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 403 args->whichfork = XFS_DATA_FORK;
362 rval = xfs_dir2_sf_removename(&args); 404 args->trans = tp;
363 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 405
364 return rval; 406 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
365 else if (v) 407 rval = xfs_dir2_sf_removename(args);
366 rval = xfs_dir2_block_removename(&args); 408 goto out_free;
367 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 409 }
368 return rval; 410
369 else if (v) 411 rval = xfs_dir2_isblock(tp, dp, &v);
370 rval = xfs_dir2_leaf_removename(&args); 412 if (rval)
413 goto out_free;
414 if (v) {
415 rval = xfs_dir2_block_removename(args);
416 goto out_free;
417 }
418
419 rval = xfs_dir2_isleaf(tp, dp, &v);
420 if (rval)
421 goto out_free;
422 if (v)
423 rval = xfs_dir2_leaf_removename(args);
371 else 424 else
372 rval = xfs_dir2_node_removename(&args); 425 rval = xfs_dir2_node_removename(args);
426out_free:
427 kmem_free(args);
373 return rval; 428 return rval;
374} 429}
375 430
@@ -386,40 +441,54 @@ xfs_dir_replace(
386 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 441 xfs_bmap_free_t *flist, /* bmap's freeblock list */
387 xfs_extlen_t total) /* bmap's total block count */ 442 xfs_extlen_t total) /* bmap's total block count */
388{ 443{
389 xfs_da_args_t args; 444 struct xfs_da_args *args;
390 int rval; 445 int rval;
391 int v; /* type-checking value */ 446 int v; /* type-checking value */
392 447
393 ASSERT(S_ISDIR(dp->i_d.di_mode)); 448 ASSERT(S_ISDIR(dp->i_d.di_mode));
394 449
395 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 450 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
451 if (rval)
396 return rval; 452 return rval;
397 453
398 memset(&args, 0, sizeof(xfs_da_args_t)); 454 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
399 args.name = name->name; 455 if (!args)
400 args.namelen = name->len; 456 return ENOMEM;
401 args.filetype = name->type; 457
402 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 458 args->name = name->name;
403 args.inumber = inum; 459 args->namelen = name->len;
404 args.dp = dp; 460 args->filetype = name->type;
405 args.firstblock = first; 461 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
406 args.flist = flist; 462 args->inumber = inum;
407 args.total = total; 463 args->dp = dp;
408 args.whichfork = XFS_DATA_FORK; 464 args->firstblock = first;
409 args.trans = tp; 465 args->flist = flist;
410 466 args->total = total;
411 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 467 args->whichfork = XFS_DATA_FORK;
412 rval = xfs_dir2_sf_replace(&args); 468 args->trans = tp;
413 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 469
414 return rval; 470 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
415 else if (v) 471 rval = xfs_dir2_sf_replace(args);
416 rval = xfs_dir2_block_replace(&args); 472 goto out_free;
417 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 473 }
418 return rval; 474
419 else if (v) 475 rval = xfs_dir2_isblock(tp, dp, &v);
420 rval = xfs_dir2_leaf_replace(&args); 476 if (rval)
477 goto out_free;
478 if (v) {
479 rval = xfs_dir2_block_replace(args);
480 goto out_free;
481 }
482
483 rval = xfs_dir2_isleaf(tp, dp, &v);
484 if (rval)
485 goto out_free;
486 if (v)
487 rval = xfs_dir2_leaf_replace(args);
421 else 488 else
422 rval = xfs_dir2_node_replace(&args); 489 rval = xfs_dir2_node_replace(args);
490out_free:
491 kmem_free(args);
423 return rval; 492 return rval;
424} 493}
425 494
@@ -434,7 +503,7 @@ xfs_dir_canenter(
434 struct xfs_name *name, /* name of entry to add */ 503 struct xfs_name *name, /* name of entry to add */
435 uint resblks) 504 uint resblks)
436{ 505{
437 xfs_da_args_t args; 506 struct xfs_da_args *args;
438 int rval; 507 int rval;
439 int v; /* type-checking value */ 508 int v; /* type-checking value */
440 509
@@ -443,29 +512,42 @@ xfs_dir_canenter(
443 512
444 ASSERT(S_ISDIR(dp->i_d.di_mode)); 513 ASSERT(S_ISDIR(dp->i_d.di_mode));
445 514
446 memset(&args, 0, sizeof(xfs_da_args_t)); 515 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
447 args.name = name->name; 516 if (!args)
448 args.namelen = name->len; 517 return ENOMEM;
449 args.filetype = name->type; 518
450 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 519 args->name = name->name;
451 args.dp = dp; 520 args->namelen = name->len;
452 args.whichfork = XFS_DATA_FORK; 521 args->filetype = name->type;
453 args.trans = tp; 522 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
454 args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | 523 args->dp = dp;
524 args->whichfork = XFS_DATA_FORK;
525 args->trans = tp;
526 args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
455 XFS_DA_OP_OKNOENT; 527 XFS_DA_OP_OKNOENT;
456 528
457 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 529 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
458 rval = xfs_dir2_sf_addname(&args); 530 rval = xfs_dir2_sf_addname(args);
459 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 531 goto out_free;
460 return rval; 532 }
461 else if (v) 533
462 rval = xfs_dir2_block_addname(&args); 534 rval = xfs_dir2_isblock(tp, dp, &v);
463 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 535 if (rval)
464 return rval; 536 goto out_free;
465 else if (v) 537 if (v) {
466 rval = xfs_dir2_leaf_addname(&args); 538 rval = xfs_dir2_block_addname(args);
539 goto out_free;
540 }
541
542 rval = xfs_dir2_isleaf(tp, dp, &v);
543 if (rval)
544 goto out_free;
545 if (v)
546 rval = xfs_dir2_leaf_addname(args);
467 else 547 else
468 rval = xfs_dir2_node_addname(&args); 548 rval = xfs_dir2_node_addname(args);
549out_free:
550 kmem_free(args);
469 return rval; 551 return rval;
470} 552}
471 553
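All five xfs_dir_* entry points above undergo the same conversion. A distilled sketch of the shared pattern (xfs_dir_foo and xfs_dir2_foo_op are placeholder names, not real functions): struct xfs_da_args is too large to keep on the kernel stack in deep directory call chains, so it is allocated with kmem_zalloc() instead, and every exit path funnels through a single kmem_free().

int
xfs_dir_foo(
	struct xfs_trans	*tp,
	struct xfs_inode	*dp)
{
	struct xfs_da_args	*args;
	int			rval;

	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
	if (!args)
		return ENOMEM;

	args->dp = dp;
	args->trans = tp;
	/* ... fill in the remaining fields, dispatch on directory format ... */
	rval = xfs_dir2_foo_op(args);

	kmem_free(args);
	return rval;
}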
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 90cdbf4b5f19..4f6a38cb83a4 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -89,13 +89,14 @@ xfs_dir3_block_read_verify(
89{ 89{
90 struct xfs_mount *mp = bp->b_target->bt_mount; 90 struct xfs_mount *mp = bp->b_target->bt_mount;
91 91
92 if ((xfs_sb_version_hascrc(&mp->m_sb) && 92 if (xfs_sb_version_hascrc(&mp->m_sb) &&
93 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 93 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
94 XFS_DIR3_DATA_CRC_OFF)) || 94 xfs_buf_ioerror(bp, EFSBADCRC);
95 !xfs_dir3_block_verify(bp)) { 95 else if (!xfs_dir3_block_verify(bp))
96 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
97 xfs_buf_ioerror(bp, EFSCORRUPTED); 96 xfs_buf_ioerror(bp, EFSCORRUPTED);
98 } 97
98 if (bp->b_error)
99 xfs_verifier_error(bp);
99} 100}
100 101
101static void 102static void
@@ -107,8 +108,8 @@ xfs_dir3_block_write_verify(
107 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 108 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
108 109
109 if (!xfs_dir3_block_verify(bp)) { 110 if (!xfs_dir3_block_verify(bp)) {
110 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
111 xfs_buf_ioerror(bp, EFSCORRUPTED); 111 xfs_buf_ioerror(bp, EFSCORRUPTED);
112 xfs_verifier_error(bp);
112 return; 113 return;
113 } 114 }
114 115
@@ -118,7 +119,7 @@ xfs_dir3_block_write_verify(
118 if (bip) 119 if (bip)
119 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 120 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
120 121
121 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); 122 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
122} 123}
123 124
124const struct xfs_buf_ops xfs_dir3_block_buf_ops = { 125const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 70acff4ee173..afa4ad523f3f 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -241,7 +241,6 @@ static void
241xfs_dir3_data_reada_verify( 241xfs_dir3_data_reada_verify(
242 struct xfs_buf *bp) 242 struct xfs_buf *bp)
243{ 243{
244 struct xfs_mount *mp = bp->b_target->bt_mount;
245 struct xfs_dir2_data_hdr *hdr = bp->b_addr; 244 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
246 245
247 switch (hdr->magic) { 246 switch (hdr->magic) {
@@ -255,8 +254,8 @@ xfs_dir3_data_reada_verify(
255 xfs_dir3_data_verify(bp); 254 xfs_dir3_data_verify(bp);
256 return; 255 return;
257 default: 256 default:
258 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
259 xfs_buf_ioerror(bp, EFSCORRUPTED); 257 xfs_buf_ioerror(bp, EFSCORRUPTED);
258 xfs_verifier_error(bp);
260 break; 259 break;
261 } 260 }
262} 261}
@@ -267,13 +266,14 @@ xfs_dir3_data_read_verify(
267{ 266{
268 struct xfs_mount *mp = bp->b_target->bt_mount; 267 struct xfs_mount *mp = bp->b_target->bt_mount;
269 268
270 if ((xfs_sb_version_hascrc(&mp->m_sb) && 269 if (xfs_sb_version_hascrc(&mp->m_sb) &&
271 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 270 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
272 XFS_DIR3_DATA_CRC_OFF)) || 271 xfs_buf_ioerror(bp, EFSBADCRC);
273 !xfs_dir3_data_verify(bp)) { 272 else if (!xfs_dir3_data_verify(bp))
274 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
275 xfs_buf_ioerror(bp, EFSCORRUPTED); 273 xfs_buf_ioerror(bp, EFSCORRUPTED);
276 } 274
275 if (bp->b_error)
276 xfs_verifier_error(bp);
277} 277}
278 278
279static void 279static void
@@ -285,8 +285,8 @@ xfs_dir3_data_write_verify(
285 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 285 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
286 286
287 if (!xfs_dir3_data_verify(bp)) { 287 if (!xfs_dir3_data_verify(bp)) {
288 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
289 xfs_buf_ioerror(bp, EFSCORRUPTED); 288 xfs_buf_ioerror(bp, EFSCORRUPTED);
289 xfs_verifier_error(bp);
290 return; 290 return;
291 } 291 }
292 292
@@ -296,7 +296,7 @@ xfs_dir3_data_write_verify(
296 if (bip) 296 if (bip)
297 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 297 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
298 298
299 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); 299 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
300} 300}
301 301
302const struct xfs_buf_ops xfs_dir3_data_buf_ops = { 302const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ae47ec6e16c4..d36e97df1187 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -179,13 +179,14 @@ __read_verify(
179{ 179{
180 struct xfs_mount *mp = bp->b_target->bt_mount; 180 struct xfs_mount *mp = bp->b_target->bt_mount;
181 181
182 if ((xfs_sb_version_hascrc(&mp->m_sb) && 182 if (xfs_sb_version_hascrc(&mp->m_sb) &&
183 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 183 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
184 XFS_DIR3_LEAF_CRC_OFF)) || 184 xfs_buf_ioerror(bp, EFSBADCRC);
185 !xfs_dir3_leaf_verify(bp, magic)) { 185 else if (!xfs_dir3_leaf_verify(bp, magic))
186 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
187 xfs_buf_ioerror(bp, EFSCORRUPTED); 186 xfs_buf_ioerror(bp, EFSCORRUPTED);
188 } 187
188 if (bp->b_error)
189 xfs_verifier_error(bp);
189} 190}
190 191
191static void 192static void
@@ -198,8 +199,8 @@ __write_verify(
198 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; 199 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
199 200
200 if (!xfs_dir3_leaf_verify(bp, magic)) { 201 if (!xfs_dir3_leaf_verify(bp, magic)) {
201 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
202 xfs_buf_ioerror(bp, EFSCORRUPTED); 202 xfs_buf_ioerror(bp, EFSCORRUPTED);
203 xfs_verifier_error(bp);
203 return; 204 return;
204 } 205 }
205 206
@@ -209,7 +210,7 @@ __write_verify(
209 if (bip) 210 if (bip)
210 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 211 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
211 212
212 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF); 213 xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
213} 214}
214 215
215static void 216static void
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 48c7d18f68c3..cb434d732681 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -115,13 +115,14 @@ xfs_dir3_free_read_verify(
115{ 115{
116 struct xfs_mount *mp = bp->b_target->bt_mount; 116 struct xfs_mount *mp = bp->b_target->bt_mount;
117 117
118 if ((xfs_sb_version_hascrc(&mp->m_sb) && 118 if (xfs_sb_version_hascrc(&mp->m_sb) &&
119 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 119 !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
120 XFS_DIR3_FREE_CRC_OFF)) || 120 xfs_buf_ioerror(bp, EFSBADCRC);
121 !xfs_dir3_free_verify(bp)) { 121 else if (!xfs_dir3_free_verify(bp))
122 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
123 xfs_buf_ioerror(bp, EFSCORRUPTED); 122 xfs_buf_ioerror(bp, EFSCORRUPTED);
124 } 123
124 if (bp->b_error)
125 xfs_verifier_error(bp);
125} 126}
126 127
127static void 128static void
@@ -133,8 +134,8 @@ xfs_dir3_free_write_verify(
133 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 134 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
134 135
135 if (!xfs_dir3_free_verify(bp)) { 136 if (!xfs_dir3_free_verify(bp)) {
136 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
137 xfs_buf_ioerror(bp, EFSCORRUPTED); 137 xfs_buf_ioerror(bp, EFSCORRUPTED);
138 xfs_verifier_error(bp);
138 return; 139 return;
139 } 140 }
140 141
@@ -144,7 +145,7 @@ xfs_dir3_free_write_verify(
144 if (bip) 145 if (bip)
145 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 146 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
146 147
147 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF); 148 xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
148} 149}
149 150
150const struct xfs_buf_ops xfs_dir3_free_buf_ops = { 151const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 7aeb4c895b32..868b19f096bf 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -615,7 +615,7 @@ xfs_qm_dqread(
615 615
616 if (flags & XFS_QMOPT_DQALLOC) { 616 if (flags & XFS_QMOPT_DQALLOC) {
617 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 617 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
618 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm, 618 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
619 XFS_QM_DQALLOC_SPACE_RES(mp), 0); 619 XFS_QM_DQALLOC_SPACE_RES(mp), 0);
620 if (error) 620 if (error)
621 goto error1; 621 goto error1;
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
index d401457d2f25..610da8177737 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -257,10 +257,13 @@ xfs_dquot_buf_read_verify(
257{ 257{
258 struct xfs_mount *mp = bp->b_target->bt_mount; 258 struct xfs_mount *mp = bp->b_target->bt_mount;
259 259
260 if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) { 260 if (!xfs_dquot_buf_verify_crc(mp, bp))
261 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 261 xfs_buf_ioerror(bp, EFSBADCRC);
262 else if (!xfs_dquot_buf_verify(mp, bp))
262 xfs_buf_ioerror(bp, EFSCORRUPTED); 263 xfs_buf_ioerror(bp, EFSCORRUPTED);
263 } 264
265 if (bp->b_error)
266 xfs_verifier_error(bp);
264} 267}
265 268
266/* 269/*
@@ -275,8 +278,8 @@ xfs_dquot_buf_write_verify(
275 struct xfs_mount *mp = bp->b_target->bt_mount; 278 struct xfs_mount *mp = bp->b_target->bt_mount;
276 279
277 if (!xfs_dquot_buf_verify(mp, bp)) { 280 if (!xfs_dquot_buf_verify(mp, bp)) {
278 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
279 xfs_buf_ioerror(bp, EFSCORRUPTED); 281 xfs_buf_ioerror(bp, EFSCORRUPTED);
282 xfs_verifier_error(bp);
280 return; 283 return;
281 } 284 }
282} 285}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9995b807d627..edac5b057d28 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -156,7 +156,7 @@ xfs_error_report(
156{ 156{
157 if (level <= xfs_error_level) { 157 if (level <= xfs_error_level) {
158 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, 158 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
159 "Internal error %s at line %d of file %s. Caller 0x%p", 159 "Internal error %s at line %d of file %s. Caller %pF",
160 tag, linenum, filename, ra); 160 tag, linenum, filename, ra);
161 161
162 xfs_stack_trace(); 162 xfs_stack_trace();
@@ -178,3 +178,28 @@ xfs_corruption_error(
178 xfs_error_report(tag, level, mp, filename, linenum, ra); 178 xfs_error_report(tag, level, mp, filename, linenum, ra);
179 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); 179 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
180} 180}
181
182/*
183 * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
184 * values, and omit the stack trace unless the error level is tuned high.
185 */
186void
187xfs_verifier_error(
188 struct xfs_buf *bp)
189{
190 struct xfs_mount *mp = bp->b_target->bt_mount;
191
192 xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
193 bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
194 __return_address, bp->b_bn);
195
196 xfs_alert(mp, "Unmount and run xfs_repair");
197
198 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
199 xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:");
200 xfs_hex_dump(xfs_buf_offset(bp, 0), 64);
201 }
202
203 if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
204 xfs_stack_trace();
205}
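Every read verifier touched by this patch converges on the same shape. A distilled template (xfs_foo_verify and XFS_FOO_CRC_OFF are placeholders for the per-structure verifier and CRC offset): a CRC mismatch is reported as EFSBADCRC, a structural failure as EFSCORRUPTED, and xfs_verifier_error() then emits one uniform diagnostic for whichever error was set.

static void
xfs_foo_read_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;

	if (xfs_sb_version_hascrc(&mp->m_sb) &&
	    !xfs_buf_verify_cksum(bp, XFS_FOO_CRC_OFF))
		xfs_buf_ioerror(bp, EFSBADCRC);
	else if (!xfs_foo_verify(bp))
		xfs_buf_ioerror(bp, EFSCORRUPTED);

	if (bp->b_error)
		xfs_verifier_error(bp);
}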
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 079a367f44ee..c1c57d4a4b5d 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -34,6 +34,7 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
34extern void xfs_corruption_error(const char *tag, int level, 34extern void xfs_corruption_error(const char *tag, int level,
35 struct xfs_mount *mp, void *p, const char *filename, 35 struct xfs_mount *mp, void *p, const char *filename,
36 int linenum, inst_t *ra); 36 int linenum, inst_t *ra);
37extern void xfs_verifier_error(struct xfs_buf *bp);
37 38
38#define XFS_ERROR_REPORT(e, lvl, mp) \ 39#define XFS_ERROR_REPORT(e, lvl, mp) \
39 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) 40 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 1399e187d425..753e467aa1a5 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
237 237
238 if (!lsn) 238 if (!lsn)
239 return 0; 239 return 0;
240 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); 240 return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
241} 241}
242 242
243const struct export_operations xfs_export_operations = { 243const struct export_operations xfs_export_operations = {
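The added negation reflects the sign convention at this boundary: XFS internals return positive errnos while the VFS expects negative ones, so values are flipped exactly once where the two meet. A minimal illustration (both function names are hypothetical):

#include <errno.h>

static int xfs_internal_op(void)	/* stand-in: returns 0 or a positive errno */
{
	return EIO;
}

static int vfs_facing_op(void)
{
	return -xfs_internal_op();	/* 0 stays 0, EIO becomes -EIO */
}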
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 64b48eade91d..830c1c937b88 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -155,7 +155,7 @@ xfs_dir_fsync(
155 155
156 if (!lsn) 156 if (!lsn)
157 return 0; 157 return 0;
158 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); 158 return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
159} 159}
160 160
161STATIC int 161STATIC int
@@ -295,7 +295,7 @@ xfs_file_aio_read(
295 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 295 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
296 296
297 if (inode->i_mapping->nrpages) { 297 if (inode->i_mapping->nrpages) {
298 ret = -filemap_write_and_wait_range( 298 ret = filemap_write_and_wait_range(
299 VFS_I(ip)->i_mapping, 299 VFS_I(ip)->i_mapping,
300 pos, -1); 300 pos, -1);
301 if (ret) { 301 if (ret) {
@@ -679,7 +679,7 @@ xfs_file_dio_aio_write(
679 goto out; 679 goto out;
680 680
681 if (mapping->nrpages) { 681 if (mapping->nrpages) {
682 ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 682 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
683 pos, -1); 683 pos, -1);
684 if (ret) 684 if (ret)
685 goto out; 685 goto out;
@@ -699,7 +699,7 @@ xfs_file_dio_aio_write(
699 699
700 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 700 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
701 ret = generic_file_direct_write(iocb, iovp, 701 ret = generic_file_direct_write(iocb, iovp,
702 &nr_segs, pos, &iocb->ki_pos, count, ocount); 702 &nr_segs, pos, count, ocount);
703 703
704out: 704out:
705 xfs_rw_iunlock(ip, iolock); 705 xfs_rw_iunlock(ip, iolock);
@@ -715,7 +715,7 @@ xfs_file_buffered_aio_write(
715 const struct iovec *iovp, 715 const struct iovec *iovp,
716 unsigned long nr_segs, 716 unsigned long nr_segs,
717 loff_t pos, 717 loff_t pos,
718 size_t ocount) 718 size_t count)
719{ 719{
720 struct file *file = iocb->ki_filp; 720 struct file *file = iocb->ki_filp;
721 struct address_space *mapping = file->f_mapping; 721 struct address_space *mapping = file->f_mapping;
@@ -724,7 +724,7 @@ xfs_file_buffered_aio_write(
724 ssize_t ret; 724 ssize_t ret;
725 int enospc = 0; 725 int enospc = 0;
726 int iolock = XFS_IOLOCK_EXCL; 726 int iolock = XFS_IOLOCK_EXCL;
727 size_t count = ocount; 727 struct iov_iter from;
728 728
729 xfs_rw_ilock(ip, iolock); 729 xfs_rw_ilock(ip, iolock);
730 730
@@ -732,14 +732,15 @@ xfs_file_buffered_aio_write(
732 if (ret) 732 if (ret)
733 goto out; 733 goto out;
734 734
735 iov_iter_init(&from, iovp, nr_segs, count, 0);
735 /* We can write back this queue in page reclaim */ 736 /* We can write back this queue in page reclaim */
736 current->backing_dev_info = mapping->backing_dev_info; 737 current->backing_dev_info = mapping->backing_dev_info;
737 738
738write_retry: 739write_retry:
739 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); 740 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
740 ret = generic_file_buffered_write(iocb, iovp, nr_segs, 741 ret = generic_perform_write(file, &from, pos);
741 pos, &iocb->ki_pos, count, 0); 742 if (likely(ret >= 0))
742 743 iocb->ki_pos = pos + ret;
743 /* 744 /*
744 * If we just got an ENOSPC, try to write back all dirty inodes to 745 * If we just got an ENOSPC, try to write back all dirty inodes to
745 * convert delalloc space to free up some of the excess reserved 746 * convert delalloc space to free up some of the excess reserved
@@ -823,7 +824,8 @@ xfs_file_fallocate(
823 824
824 if (!S_ISREG(inode->i_mode)) 825 if (!S_ISREG(inode->i_mode))
825 return -EINVAL; 826 return -EINVAL;
826 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 827 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
828 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
827 return -EOPNOTSUPP; 829 return -EOPNOTSUPP;
828 830
829 xfs_ilock(ip, XFS_IOLOCK_EXCL); 831 xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -831,6 +833,28 @@ xfs_file_fallocate(
831 error = xfs_free_file_space(ip, offset, len); 833 error = xfs_free_file_space(ip, offset, len);
832 if (error) 834 if (error)
833 goto out_unlock; 835 goto out_unlock;
836 } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
837 unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
838
839 if (offset & blksize_mask || len & blksize_mask) {
840 error = EINVAL;
841 goto out_unlock;
842 }
843
844 /*
 845		 * The collapse range may not reach or extend beyond EOF;
 846		 * such a range would effectively be a truncate operation
847 */
848 if (offset + len >= i_size_read(inode)) {
849 error = EINVAL;
850 goto out_unlock;
851 }
852
853 new_size = i_size_read(inode) - len;
854
855 error = xfs_collapse_file_space(ip, offset, len);
856 if (error)
857 goto out_unlock;
834 } else { 858 } else {
835 if (!(mode & FALLOC_FL_KEEP_SIZE) && 859 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
836 offset + len > i_size_read(inode)) { 860 offset + len > i_size_read(inode)) {
@@ -840,8 +864,11 @@ xfs_file_fallocate(
840 goto out_unlock; 864 goto out_unlock;
841 } 865 }
842 866
843 error = xfs_alloc_file_space(ip, offset, len, 867 if (mode & FALLOC_FL_ZERO_RANGE)
844 XFS_BMAPI_PREALLOC); 868 error = xfs_zero_file_space(ip, offset, len);
869 else
870 error = xfs_alloc_file_space(ip, offset, len,
871 XFS_BMAPI_PREALLOC);
845 if (error) 872 if (error)
846 goto out_unlock; 873 goto out_unlock;
847 } 874 }
@@ -859,7 +886,7 @@ xfs_file_fallocate(
859 if (ip->i_d.di_mode & S_IXGRP) 886 if (ip->i_d.di_mode & S_IXGRP)
860 ip->i_d.di_mode &= ~S_ISGID; 887 ip->i_d.di_mode &= ~S_ISGID;
861 888
862 if (!(mode & FALLOC_FL_PUNCH_HOLE)) 889 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
863 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; 890 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
864 891
865 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 892 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1465,6 +1492,7 @@ const struct file_operations xfs_dir_file_operations = {
1465 1492
1466static const struct vm_operations_struct xfs_file_vm_ops = { 1493static const struct vm_operations_struct xfs_file_vm_ops = {
1467 .fault = filemap_fault, 1494 .fault = filemap_fault,
1495 .map_pages = filemap_map_pages,
1468 .page_mkwrite = xfs_vm_page_mkwrite, 1496 .page_mkwrite = xfs_vm_page_mkwrite,
1469 .remap_pages = generic_file_remap_pages, 1497 .remap_pages = generic_file_remap_pages,
1470}; 1498};
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
index b6ab5a3cfa12..9898f31d05d8 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/xfs_format.h
@@ -145,6 +145,8 @@ struct xfs_dsymlink_hdr {
145 __be64 sl_lsn; 145 __be64 sl_lsn;
146}; 146};
147 147
148#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
149
148/* 150/*
149 * The maximum pathlen is 1024 bytes. Since the minimum file system 151 * The maximum pathlen is 1024 bytes. Since the minimum file system
150 * blocksize is 512 bytes, we can get a max of 3 extents back from 152 * blocksize is 512 bytes, we can get a max of 3 extents back from
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5d7f105a1c82..8f711db61a0c 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -363,6 +363,18 @@ xfs_ialloc_ag_alloc(
363 args.minleft = args.mp->m_in_maxlevels - 1; 363 args.minleft = args.mp->m_in_maxlevels - 1;
364 if ((error = xfs_alloc_vextent(&args))) 364 if ((error = xfs_alloc_vextent(&args)))
365 return error; 365 return error;
366
367 /*
368 * This request might have dirtied the transaction if the AG can
369 * satisfy the request, but the exact block was not available.
370 * If the allocation did fail, subsequent requests will relax
371 * the exact agbno requirement and increase the alignment
372 * instead. It is critical that the total size of the request
373 * (len + alignment + slop) does not increase from this point
374 * on, so reset minalignslop to ensure it is not included in
375 * subsequent requests.
376 */
377 args.minalignslop = 0;
366 } else 378 } else
367 args.fsbno = NULLFSBLOCK; 379 args.fsbno = NULLFSBLOCK;
368 380
@@ -1568,18 +1580,17 @@ xfs_agi_read_verify(
1568 struct xfs_buf *bp) 1580 struct xfs_buf *bp)
1569{ 1581{
1570 struct xfs_mount *mp = bp->b_target->bt_mount; 1582 struct xfs_mount *mp = bp->b_target->bt_mount;
1571 int agi_ok = 1;
1572
1573 if (xfs_sb_version_hascrc(&mp->m_sb))
1574 agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
1575 offsetof(struct xfs_agi, agi_crc));
1576 agi_ok = agi_ok && xfs_agi_verify(bp);
1577 1583
1578 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, 1584 if (xfs_sb_version_hascrc(&mp->m_sb) &&
1579 XFS_RANDOM_IALLOC_READ_AGI))) { 1585 !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
1580 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 1586 xfs_buf_ioerror(bp, EFSBADCRC);
1587 else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
1588 XFS_ERRTAG_IALLOC_READ_AGI,
1589 XFS_RANDOM_IALLOC_READ_AGI))
1581 xfs_buf_ioerror(bp, EFSCORRUPTED); 1590 xfs_buf_ioerror(bp, EFSCORRUPTED);
1582 } 1591
1592 if (bp->b_error)
1593 xfs_verifier_error(bp);
1583} 1594}
1584 1595
1585static void 1596static void
@@ -1590,8 +1601,8 @@ xfs_agi_write_verify(
1590 struct xfs_buf_log_item *bip = bp->b_fspriv; 1601 struct xfs_buf_log_item *bip = bp->b_fspriv;
1591 1602
1592 if (!xfs_agi_verify(bp)) { 1603 if (!xfs_agi_verify(bp)) {
1593 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
1594 xfs_buf_ioerror(bp, EFSCORRUPTED); 1604 xfs_buf_ioerror(bp, EFSCORRUPTED);
1605 xfs_verifier_error(bp);
1595 return; 1606 return;
1596 } 1607 }
1597 1608
@@ -1600,8 +1611,7 @@ xfs_agi_write_verify(
1600 1611
1601 if (bip) 1612 if (bip)
1602 XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); 1613 XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
1603 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 1614 xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
1604 offsetof(struct xfs_agi, agi_crc));
1605} 1615}
1606 1616
1607const struct xfs_buf_ops xfs_agi_buf_ops = { 1617const struct xfs_buf_ops xfs_agi_buf_ops = {
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c8fa5bbb36de..7e309b11e87d 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -243,12 +243,14 @@ static void
243xfs_inobt_read_verify( 243xfs_inobt_read_verify(
244 struct xfs_buf *bp) 244 struct xfs_buf *bp)
245{ 245{
246 if (!(xfs_btree_sblock_verify_crc(bp) && 246 if (!xfs_btree_sblock_verify_crc(bp))
247 xfs_inobt_verify(bp))) { 247 xfs_buf_ioerror(bp, EFSBADCRC);
248 trace_xfs_btree_corrupt(bp, _RET_IP_); 248 else if (!xfs_inobt_verify(bp))
249 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
250 bp->b_target->bt_mount, bp->b_addr);
251 xfs_buf_ioerror(bp, EFSCORRUPTED); 249 xfs_buf_ioerror(bp, EFSCORRUPTED);
250
251 if (bp->b_error) {
252 trace_xfs_btree_corrupt(bp, _RET_IP_);
253 xfs_verifier_error(bp);
252 } 254 }
253} 255}
254 256
@@ -258,9 +260,9 @@ xfs_inobt_write_verify(
258{ 260{
259 if (!xfs_inobt_verify(bp)) { 261 if (!xfs_inobt_verify(bp)) {
260 trace_xfs_btree_corrupt(bp, _RET_IP_); 262 trace_xfs_btree_corrupt(bp, _RET_IP_);
261 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
262 bp->b_target->bt_mount, bp->b_addr);
263 xfs_buf_ioerror(bp, EFSCORRUPTED); 263 xfs_buf_ioerror(bp, EFSCORRUPTED);
264 xfs_verifier_error(bp);
265 return;
264 } 266 }
265 xfs_btree_sblock_calc_crc(bp); 267 xfs_btree_sblock_calc_crc(bp);
266 268
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3a137e9f9a7d..768087bedbac 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,7 +42,6 @@
42#include "xfs_bmap_util.h" 42#include "xfs_bmap_util.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_quota.h" 44#include "xfs_quota.h"
45#include "xfs_dinode.h"
46#include "xfs_filestream.h" 45#include "xfs_filestream.h"
47#include "xfs_cksum.h" 46#include "xfs_cksum.h"
48#include "xfs_trace.h" 47#include "xfs_trace.h"
@@ -62,6 +61,8 @@ kmem_zone_t *xfs_inode_zone;
62 61
63STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 62STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
64 63
64STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
65
65/* 66/*
66 * helper function to extract extent size hint from inode 67 * helper function to extract extent size hint from inode
67 */ 68 */
@@ -1115,7 +1116,7 @@ xfs_bumplink(
1115{ 1116{
1116 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1117 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1117 1118
1118 ASSERT(ip->i_d.di_nlink > 0); 1119 ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
1119 ip->i_d.di_nlink++; 1120 ip->i_d.di_nlink++;
1120 inc_nlink(VFS_I(ip)); 1121 inc_nlink(VFS_I(ip));
1121 if ((ip->i_d.di_version == 1) && 1122 if ((ip->i_d.di_version == 1) &&
@@ -1165,10 +1166,7 @@ xfs_create(
1165 if (XFS_FORCED_SHUTDOWN(mp)) 1166 if (XFS_FORCED_SHUTDOWN(mp))
1166 return XFS_ERROR(EIO); 1167 return XFS_ERROR(EIO);
1167 1168
1168 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1169 prid = xfs_get_initial_prid(dp);
1169 prid = xfs_get_projid(dp);
1170 else
1171 prid = XFS_PROJID_DEFAULT;
1172 1170
1173 /* 1171 /*
1174 * Make sure that we have allocated dquot(s) on disk. 1172 * Make sure that we have allocated dquot(s) on disk.
@@ -1333,6 +1331,114 @@ xfs_create(
1333} 1331}
1334 1332
1335int 1333int
1334xfs_create_tmpfile(
1335 struct xfs_inode *dp,
1336 struct dentry *dentry,
1337 umode_t mode,
1338 struct xfs_inode **ipp)
1339{
1340 struct xfs_mount *mp = dp->i_mount;
1341 struct xfs_inode *ip = NULL;
1342 struct xfs_trans *tp = NULL;
1343 int error;
1344 uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1345 prid_t prid;
1346 struct xfs_dquot *udqp = NULL;
1347 struct xfs_dquot *gdqp = NULL;
1348 struct xfs_dquot *pdqp = NULL;
1349 struct xfs_trans_res *tres;
1350 uint resblks;
1351
1352 if (XFS_FORCED_SHUTDOWN(mp))
1353 return XFS_ERROR(EIO);
1354
1355 prid = xfs_get_initial_prid(dp);
1356
1357 /*
1358 * Make sure that we have allocated dquot(s) on disk.
1359 */
1360 error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1361 xfs_kgid_to_gid(current_fsgid()), prid,
1362 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1363 &udqp, &gdqp, &pdqp);
1364 if (error)
1365 return error;
1366
1367 resblks = XFS_IALLOC_SPACE_RES(mp);
1368 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
1369
1370 tres = &M_RES(mp)->tr_create_tmpfile;
1371 error = xfs_trans_reserve(tp, tres, resblks, 0);
1372 if (error == ENOSPC) {
1373 /* No space at all so try a "no-allocation" reservation */
1374 resblks = 0;
1375 error = xfs_trans_reserve(tp, tres, 0, 0);
1376 }
1377 if (error) {
1378 cancel_flags = 0;
1379 goto out_trans_cancel;
1380 }
1381
1382 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1383 pdqp, resblks, 1, 0);
1384 if (error)
1385 goto out_trans_cancel;
1386
1387 error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
1388 prid, resblks > 0, &ip, NULL);
1389 if (error) {
1390 if (error == ENOSPC)
1391 goto out_trans_cancel;
1392 goto out_trans_abort;
1393 }
1394
1395 if (mp->m_flags & XFS_MOUNT_WSYNC)
1396 xfs_trans_set_sync(tp);
1397
1398 /*
1399 * Attach the dquot(s) to the inodes and modify them incore.
1400 * These ids of the inode couldn't have changed since the new
1401 * inode has been locked ever since it was created.
1402 */
1403 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1404
1405 ip->i_d.di_nlink--;
1406 error = xfs_iunlink(tp, ip);
1407 if (error)
1408 goto out_trans_abort;
1409
1410 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1411 if (error)
1412 goto out_release_inode;
1413
1414 xfs_qm_dqrele(udqp);
1415 xfs_qm_dqrele(gdqp);
1416 xfs_qm_dqrele(pdqp);
1417
1418 *ipp = ip;
1419 return 0;
1420
1421 out_trans_abort:
1422 cancel_flags |= XFS_TRANS_ABORT;
1423 out_trans_cancel:
1424 xfs_trans_cancel(tp, cancel_flags);
1425 out_release_inode:
1426 /*
1427 * Wait until after the current transaction is aborted to
1428 * release the inode. This prevents recursive transactions
1429 * and deadlocks from xfs_inactive.
1430 */
1431 if (ip)
1432 IRELE(ip);
1433
1434 xfs_qm_dqrele(udqp);
1435 xfs_qm_dqrele(gdqp);
1436 xfs_qm_dqrele(pdqp);
1437
1438 return error;
1439}
1440
1441int
1336xfs_link( 1442xfs_link(
1337 xfs_inode_t *tdp, 1443 xfs_inode_t *tdp,
1338 xfs_inode_t *sip, 1444 xfs_inode_t *sip,
@@ -1397,6 +1503,12 @@ xfs_link(
1397 1503
1398 xfs_bmap_init(&free_list, &first_block); 1504 xfs_bmap_init(&free_list, &first_block);
1399 1505
1506 if (sip->i_d.di_nlink == 0) {
1507 error = xfs_iunlink_remove(tp, sip);
1508 if (error)
1509 goto abort_return;
1510 }
1511
1400 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, 1512 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1401 &first_block, &free_list, resblks); 1513 &first_block, &free_list, resblks);
1402 if (error) 1514 if (error)
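
xfs_create_tmpfile() and the new xfs_iunlink_remove() call in xfs_link() exist to back the VFS O_TMPFILE path: the inode is born with no directory entry and parked on the AGI unlinked list, and a later link must first pull it off that list (the di_nlink == 0 case above). A userspace view of the lifecycle this enables, assuming /tmp sits on a filesystem with .tmpfile support:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char path[64];
    int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
    if (fd < 0) { perror("O_TMPFILE"); return 1; }

    if (write(fd, "data\n", 5) != 5) { perror("write"); close(fd); return 1; }

    /* linking the unnamed inode is what drives xfs_link() down the new
     * xfs_iunlink_remove() path */
    snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
    if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
               AT_SYMLINK_FOLLOW) < 0)
        perror("linkat");

    close(fd);
    return 0;
}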
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65e2350f449c..f2fcde52b66d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -20,6 +20,7 @@
20 20
21#include "xfs_inode_buf.h" 21#include "xfs_inode_buf.h"
22#include "xfs_inode_fork.h" 22#include "xfs_inode_fork.h"
23#include "xfs_dinode.h"
23 24
24/* 25/*
25 * Kernel only inode definitions 26 * Kernel only inode definitions
@@ -192,6 +193,15 @@ xfs_set_projid(struct xfs_inode *ip,
192 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); 193 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
193} 194}
194 195
196static inline prid_t
197xfs_get_initial_prid(struct xfs_inode *dp)
198{
199 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
200 return xfs_get_projid(dp);
201
202 return XFS_PROJID_DEFAULT;
203}
204
195/* 205/*
196 * In-core inode flags. 206 * In-core inode flags.
197 */ 207 */
@@ -323,6 +333,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
323 struct xfs_inode **ipp, struct xfs_name *ci_name); 333 struct xfs_inode **ipp, struct xfs_name *ci_name);
324int xfs_create(struct xfs_inode *dp, struct xfs_name *name, 334int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
325 umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); 335 umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
336int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
337 umode_t mode, struct xfs_inode **ipp);
326int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, 338int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
327 struct xfs_inode *ip); 339 struct xfs_inode *ip);
328int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, 340int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
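
The new xfs_get_initial_prid() helper centralises the project-ID inheritance check that xfs_create(), xfs_create_tmpfile() and xfs_symlink() previously open-coded. A toy model of the behaviour; the flag value and types here are for illustration only:

#include <stdint.h>
#include <stdio.h>

#define XFS_DIFLAG_PROJINHERIT 0x200  /* illustrative value */
#define XFS_PROJID_DEFAULT     0

struct inode { uint16_t flags; uint32_t projid; };

static uint32_t get_initial_prid(const struct inode *dp)
{
    /* children inherit the parent's project ID only when the parent
     * directory carries PROJINHERIT */
    if (dp->flags & XFS_DIFLAG_PROJINHERIT)
        return dp->projid;
    return XFS_PROJID_DEFAULT;
}

int main(void)
{
    struct inode dir = { .flags = XFS_DIFLAG_PROJINHERIT, .projid = 42 };
    printf("child prid = %u\n", get_initial_prid(&dir)); /* prints 42 */
    return 0;
}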
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index 4fc9f39dd89e..24e993996bdc 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -102,8 +102,7 @@ xfs_inode_buf_verify(
102 } 102 }
103 103
104 xfs_buf_ioerror(bp, EFSCORRUPTED); 104 xfs_buf_ioerror(bp, EFSCORRUPTED);
105 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, 105 xfs_verifier_error(bp);
106 mp, dip);
107#ifdef DEBUG 106#ifdef DEBUG
108 xfs_alert(mp, 107 xfs_alert(mp,
109 "bad inode magic/vsn daddr %lld #%d (magic=%x)", 108 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
@@ -306,7 +305,7 @@ xfs_dinode_verify(
306 if (!xfs_sb_version_hascrc(&mp->m_sb)) 305 if (!xfs_sb_version_hascrc(&mp->m_sb))
307 return false; 306 return false;
308 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 307 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
309 offsetof(struct xfs_dinode, di_crc))) 308 XFS_DINODE_CRC_OFF))
310 return false; 309 return false;
311 if (be64_to_cpu(dip->di_ino) != ip->i_ino) 310 if (be64_to_cpu(dip->di_ino) != ip->i_ino)
312 return false; 311 return false;
@@ -327,7 +326,7 @@ xfs_dinode_calc_crc(
327 326
328 ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); 327 ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
329 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, 328 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
330 offsetof(struct xfs_dinode, di_crc)); 329 XFS_DINODE_CRC_OFF);
331 dip->di_crc = xfs_end_cksum(crc); 330 dip->di_crc = xfs_end_cksum(crc);
332} 331}
333 332
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bcfe61202115..0b18776b075e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -271,32 +271,6 @@ xfs_open_by_handle(
271 return error; 271 return error;
272} 272}
273 273
274/*
275 * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
276 * unused first argument.
277 */
278STATIC int
279do_readlink(
280 char __user *buffer,
281 int buflen,
282 const char *link)
283{
284 int len;
285
286 len = PTR_ERR(link);
287 if (IS_ERR(link))
288 goto out;
289
290 len = strlen(link);
291 if (len > (unsigned) buflen)
292 len = buflen;
293 if (copy_to_user(buffer, link, len))
294 len = -EFAULT;
295 out:
296 return len;
297}
298
299
300int 274int
301xfs_readlink_by_handle( 275xfs_readlink_by_handle(
302 struct file *parfilp, 276 struct file *parfilp,
@@ -334,7 +308,7 @@ xfs_readlink_by_handle(
334 error = -xfs_readlink(XFS_I(dentry->d_inode), link); 308 error = -xfs_readlink(XFS_I(dentry->d_inode), link);
335 if (error) 309 if (error)
336 goto out_kfree; 310 goto out_kfree;
337 error = do_readlink(hreq->ohandle, olen, link); 311 error = readlink_copy(hreq->ohandle, olen, link);
338 if (error) 312 if (error)
339 goto out_kfree; 313 goto out_kfree;
340 314
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 22d1cbea283d..3b80ebae05f5 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -128,7 +128,6 @@ xfs_iomap_write_direct(
128 xfs_fsblock_t firstfsb; 128 xfs_fsblock_t firstfsb;
129 xfs_extlen_t extsz, temp; 129 xfs_extlen_t extsz, temp;
130 int nimaps; 130 int nimaps;
131 int bmapi_flag;
132 int quota_flag; 131 int quota_flag;
133 int rt; 132 int rt;
134 xfs_trans_t *tp; 133 xfs_trans_t *tp;
@@ -200,18 +199,15 @@ xfs_iomap_write_direct(
200 199
201 xfs_trans_ijoin(tp, ip, 0); 200 xfs_trans_ijoin(tp, ip, 0);
202 201
203 bmapi_flag = 0;
204 if (offset < XFS_ISIZE(ip) || extsz)
205 bmapi_flag |= XFS_BMAPI_PREALLOC;
206
207 /* 202 /*
208 * From this point onwards we overwrite the imap pointer that the 203 * From this point onwards we overwrite the imap pointer that the
209 * caller gave to us. 204 * caller gave to us.
210 */ 205 */
211 xfs_bmap_init(&free_list, &firstfsb); 206 xfs_bmap_init(&free_list, &firstfsb);
212 nimaps = 1; 207 nimaps = 1;
213 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, 208 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
214 &firstfsb, 0, imap, &nimaps, &free_list); 209 XFS_BMAPI_PREALLOC, &firstfsb, 0,
210 imap, &nimaps, &free_list);
215 if (error) 211 if (error)
216 goto out_bmap_cancel; 212 goto out_bmap_cancel;
217 213
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 9ddfb8190ca1..36d630319a27 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -39,6 +39,7 @@
39#include "xfs_da_btree.h" 39#include "xfs_da_btree.h"
40#include "xfs_dir2_priv.h" 40#include "xfs_dir2_priv.h"
41#include "xfs_dinode.h" 41#include "xfs_dinode.h"
42#include "xfs_trans_space.h"
42 43
43#include <linux/capability.h> 44#include <linux/capability.h>
44#include <linux/xattr.h> 45#include <linux/xattr.h>
@@ -48,6 +49,18 @@
48#include <linux/fiemap.h> 49#include <linux/fiemap.h>
49#include <linux/slab.h> 50#include <linux/slab.h>
50 51
52/*
53 * Directories have different lock order w.r.t. mmap_sem compared to regular
54 * files. This is due to readdir potentially triggering page faults on a user
55 * buffer inside filldir(), and this happens with the ilock on the directory
56 * held. For regular files, the lock order is the other way around - the
57 * mmap_sem is taken during the page fault, and then we lock the ilock to do
58 * block mapping. Hence we need a different class for the directory ilock so
59 * that lockdep can tell them apart.
60 */
61static struct lock_class_key xfs_nondir_ilock_class;
62static struct lock_class_key xfs_dir_ilock_class;
63
51static int 64static int
52xfs_initxattrs( 65xfs_initxattrs(
53 struct inode *inode, 66 struct inode *inode,
@@ -59,8 +72,8 @@ xfs_initxattrs(
59 int error = 0; 72 int error = 0;
60 73
61 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 74 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
62 error = xfs_attr_set(ip, xattr->name, xattr->value, 75 error = -xfs_attr_set(ip, xattr->name, xattr->value,
63 xattr->value_len, ATTR_SECURE); 76 xattr->value_len, ATTR_SECURE);
64 if (error < 0) 77 if (error < 0)
65 break; 78 break;
66 } 79 }
@@ -80,8 +93,8 @@ xfs_init_security(
80 struct inode *dir, 93 struct inode *dir,
81 const struct qstr *qstr) 94 const struct qstr *qstr)
82{ 95{
83 return security_inode_init_security(inode, dir, qstr, 96 return -security_inode_init_security(inode, dir, qstr,
84 &xfs_initxattrs, NULL); 97 &xfs_initxattrs, NULL);
85} 98}
86 99
87static void 100static void
@@ -111,15 +124,15 @@ xfs_cleanup_inode(
111 xfs_dentry_to_name(&teardown, dentry, 0); 124 xfs_dentry_to_name(&teardown, dentry, 0);
112 125
113 xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); 126 xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
114 iput(inode);
115} 127}
116 128
117STATIC int 129STATIC int
118xfs_vn_mknod( 130xfs_generic_create(
119 struct inode *dir, 131 struct inode *dir,
120 struct dentry *dentry, 132 struct dentry *dentry,
121 umode_t mode, 133 umode_t mode,
122 dev_t rdev) 134 dev_t rdev,
135 bool tmpfile) /* unnamed file */
123{ 136{
124 struct inode *inode; 137 struct inode *inode;
125 struct xfs_inode *ip = NULL; 138 struct xfs_inode *ip = NULL;
@@ -143,8 +156,12 @@ xfs_vn_mknod(
143 if (error) 156 if (error)
144 return error; 157 return error;
145 158
146 xfs_dentry_to_name(&name, dentry, mode); 159 if (!tmpfile) {
147 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); 160 xfs_dentry_to_name(&name, dentry, mode);
161 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
162 } else {
163 error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
164 }
148 if (unlikely(error)) 165 if (unlikely(error))
149 goto out_free_acl; 166 goto out_free_acl;
150 167
@@ -156,18 +173,22 @@ xfs_vn_mknod(
156 173
157#ifdef CONFIG_XFS_POSIX_ACL 174#ifdef CONFIG_XFS_POSIX_ACL
158 if (default_acl) { 175 if (default_acl) {
159 error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); 176 error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
160 if (error) 177 if (error)
161 goto out_cleanup_inode; 178 goto out_cleanup_inode;
162 } 179 }
163 if (acl) { 180 if (acl) {
164 error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); 181 error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
165 if (error) 182 if (error)
166 goto out_cleanup_inode; 183 goto out_cleanup_inode;
167 } 184 }
168#endif 185#endif
169 186
170 d_instantiate(dentry, inode); 187 if (tmpfile)
188 d_tmpfile(dentry, inode);
189 else
190 d_instantiate(dentry, inode);
191
171 out_free_acl: 192 out_free_acl:
172 if (default_acl) 193 if (default_acl)
173 posix_acl_release(default_acl); 194 posix_acl_release(default_acl);
@@ -176,11 +197,23 @@ xfs_vn_mknod(
176 return -error; 197 return -error;
177 198
178 out_cleanup_inode: 199 out_cleanup_inode:
179 xfs_cleanup_inode(dir, inode, dentry); 200 if (!tmpfile)
201 xfs_cleanup_inode(dir, inode, dentry);
202 iput(inode);
180 goto out_free_acl; 203 goto out_free_acl;
181} 204}
182 205
183STATIC int 206STATIC int
207xfs_vn_mknod(
208 struct inode *dir,
209 struct dentry *dentry,
210 umode_t mode,
211 dev_t rdev)
212{
213 return xfs_generic_create(dir, dentry, mode, rdev, false);
214}
215
216STATIC int
184xfs_vn_create( 217xfs_vn_create(
185 struct inode *dir, 218 struct inode *dir,
186 struct dentry *dentry, 219 struct dentry *dentry,
@@ -340,6 +373,7 @@ xfs_vn_symlink(
340 373
341 out_cleanup_inode: 374 out_cleanup_inode:
342 xfs_cleanup_inode(dir, inode, dentry); 375 xfs_cleanup_inode(dir, inode, dentry);
376 iput(inode);
343 out: 377 out:
344 return -error; 378 return -error;
345} 379}
@@ -1034,6 +1068,15 @@ xfs_vn_fiemap(
1034 return 0; 1068 return 0;
1035} 1069}
1036 1070
1071STATIC int
1072xfs_vn_tmpfile(
1073 struct inode *dir,
1074 struct dentry *dentry,
1075 umode_t mode)
1076{
1077 return xfs_generic_create(dir, dentry, mode, 0, true);
1078}
1079
1037static const struct inode_operations xfs_inode_operations = { 1080static const struct inode_operations xfs_inode_operations = {
1038 .get_acl = xfs_get_acl, 1081 .get_acl = xfs_get_acl,
1039 .set_acl = xfs_set_acl, 1082 .set_acl = xfs_set_acl,
@@ -1072,6 +1115,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
1072 .removexattr = generic_removexattr, 1115 .removexattr = generic_removexattr,
1073 .listxattr = xfs_vn_listxattr, 1116 .listxattr = xfs_vn_listxattr,
1074 .update_time = xfs_vn_update_time, 1117 .update_time = xfs_vn_update_time,
1118 .tmpfile = xfs_vn_tmpfile,
1075}; 1119};
1076 1120
1077static const struct inode_operations xfs_dir_ci_inode_operations = { 1121static const struct inode_operations xfs_dir_ci_inode_operations = {
@@ -1099,6 +1143,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
1099 .removexattr = generic_removexattr, 1143 .removexattr = generic_removexattr,
1100 .listxattr = xfs_vn_listxattr, 1144 .listxattr = xfs_vn_listxattr,
1101 .update_time = xfs_vn_update_time, 1145 .update_time = xfs_vn_update_time,
1146 .tmpfile = xfs_vn_tmpfile,
1102}; 1147};
1103 1148
1104static const struct inode_operations xfs_symlink_inode_operations = { 1149static const struct inode_operations xfs_symlink_inode_operations = {
@@ -1191,6 +1236,7 @@ xfs_setup_inode(
1191 xfs_diflags_to_iflags(inode, ip); 1236 xfs_diflags_to_iflags(inode, ip);
1192 1237
1193 ip->d_ops = ip->i_mount->m_nondir_inode_ops; 1238 ip->d_ops = ip->i_mount->m_nondir_inode_ops;
1239 lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
1194 switch (inode->i_mode & S_IFMT) { 1240 switch (inode->i_mode & S_IFMT) {
1195 case S_IFREG: 1241 case S_IFREG:
1196 inode->i_op = &xfs_inode_operations; 1242 inode->i_op = &xfs_inode_operations;
@@ -1198,6 +1244,7 @@ xfs_setup_inode(
1198 inode->i_mapping->a_ops = &xfs_address_space_operations; 1244 inode->i_mapping->a_ops = &xfs_address_space_operations;
1199 break; 1245 break;
1200 case S_IFDIR: 1246 case S_IFDIR:
1247 lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
1201 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) 1248 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
1202 inode->i_op = &xfs_dir_ci_inode_operations; 1249 inode->i_op = &xfs_dir_ci_inode_operations;
1203 else 1250 else
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index f9bb590acc0e..825249d2dfc1 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -119,6 +119,7 @@ typedef __uint64_t __psunsigned_t;
119#include "xfs_iops.h" 119#include "xfs_iops.h"
120#include "xfs_aops.h" 120#include "xfs_aops.h"
121#include "xfs_super.h" 121#include "xfs_super.h"
122#include "xfs_cksum.h"
122#include "xfs_buf.h" 123#include "xfs_buf.h"
123#include "xfs_message.h" 124#include "xfs_message.h"
124 125
@@ -178,6 +179,7 @@ typedef __uint64_t __psunsigned_t;
178#define ENOATTR ENODATA /* Attribute not found */ 179#define ENOATTR ENODATA /* Attribute not found */
179#define EWRONGFS EINVAL /* Mount with wrong filesystem type */ 180#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
180#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 181#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
182#define EFSBADCRC EBADMSG /* Bad CRC detected */
181 183
182#define SYNCHRONIZE() barrier() 184#define SYNCHRONIZE() barrier()
183#define __return_address __builtin_return_address(0) 185#define __return_address __builtin_return_address(0)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 8497a00e399d..a5f8bd9899d3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -616,11 +616,13 @@ xfs_log_mount(
616 int error = 0; 616 int error = 0;
617 int min_logfsbs; 617 int min_logfsbs;
618 618
619 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 619 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
620 xfs_notice(mp, "Mounting Filesystem"); 620 xfs_notice(mp, "Mounting V%d Filesystem",
621 else { 621 XFS_SB_VERSION_NUM(&mp->m_sb));
622 } else {
622 xfs_notice(mp, 623 xfs_notice(mp,
623"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); 624"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
625 XFS_SB_VERSION_NUM(&mp->m_sb));
624 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 626 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
625 } 627 }
626 628
@@ -1181,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp)
1181 /* log I/O is always issued ASYNC */ 1183 /* log I/O is always issued ASYNC */
1182 ASSERT(XFS_BUF_ISASYNC(bp)); 1184 ASSERT(XFS_BUF_ISASYNC(bp));
1183 xlog_state_done_syncing(iclog, aborted); 1185 xlog_state_done_syncing(iclog, aborted);
1186
1184 /* 1187 /*
1185 * do not reference the buffer (bp) here as we could race 1188 * drop the buffer lock now that we are done. Nothing references
1186 * with it being freed after writing the unmount record to the 1189 * the buffer after this, so an unmount waiting on this lock can now
1187 * log. 1190 * tear it down safely. As such, it is unsafe to reference the buffer
1191 * (bp) after the unlock as we could race with it being freed.
1188 */ 1192 */
1193 xfs_buf_unlock(bp);
1189} 1194}
1190 1195
1191/* 1196/*
@@ -1368,8 +1373,16 @@ xlog_alloc_log(
1368 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); 1373 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
1369 if (!bp) 1374 if (!bp)
1370 goto out_free_log; 1375 goto out_free_log;
1371 bp->b_iodone = xlog_iodone; 1376
1377 /*
1378 * The iclogbuf buffer locks are held over IO but we are not going to do
1379 * IO yet. Hence unlock the buffer so that the log IO path can grab it
1380 * when appropriate.
1381 */
1372 ASSERT(xfs_buf_islocked(bp)); 1382 ASSERT(xfs_buf_islocked(bp));
1383 xfs_buf_unlock(bp);
1384
1385 bp->b_iodone = xlog_iodone;
1373 log->l_xbuf = bp; 1386 log->l_xbuf = bp;
1374 1387
1375 spin_lock_init(&log->l_icloglock); 1388 spin_lock_init(&log->l_icloglock);
@@ -1398,6 +1411,9 @@ xlog_alloc_log(
1398 if (!bp) 1411 if (!bp)
1399 goto out_free_iclog; 1412 goto out_free_iclog;
1400 1413
1414 ASSERT(xfs_buf_islocked(bp));
1415 xfs_buf_unlock(bp);
1416
1401 bp->b_iodone = xlog_iodone; 1417 bp->b_iodone = xlog_iodone;
1402 iclog->ic_bp = bp; 1418 iclog->ic_bp = bp;
1403 iclog->ic_data = bp->b_addr; 1419 iclog->ic_data = bp->b_addr;
@@ -1422,7 +1438,6 @@ xlog_alloc_log(
1422 iclog->ic_callback_tail = &(iclog->ic_callback); 1438 iclog->ic_callback_tail = &(iclog->ic_callback);
1423 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; 1439 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1424 1440
1425 ASSERT(xfs_buf_islocked(iclog->ic_bp));
1426 init_waitqueue_head(&iclog->ic_force_wait); 1441 init_waitqueue_head(&iclog->ic_force_wait);
1427 init_waitqueue_head(&iclog->ic_write_wait); 1442 init_waitqueue_head(&iclog->ic_write_wait);
1428 1443
@@ -1631,6 +1646,12 @@ xlog_cksum(
1631 * we transition the iclogs to IOERROR state *after* flushing all existing 1646 * we transition the iclogs to IOERROR state *after* flushing all existing
1632 * iclogs to disk. This is because we don't want any more new transactions to be 1647
1633 * started or completed afterwards. 1648 * started or completed afterwards.
1649 *
1650 * We lock the iclogbufs here so that we can serialise against IO completion
1651 * during unmount. We might be processing a shutdown triggered during unmount,
1652 * and that can occur asynchronously to the unmount thread, and hence we need to
1653 * ensure that it completes before tearing down the iclogbufs. Hence we need to
1654 * hold the buffer lock across the log IO to achieve that.
1634 */ 1655 */
1635STATIC int 1656STATIC int
1636xlog_bdstrat( 1657xlog_bdstrat(
@@ -1638,6 +1659,7 @@ xlog_bdstrat(
1638{ 1659{
1639 struct xlog_in_core *iclog = bp->b_fspriv; 1660 struct xlog_in_core *iclog = bp->b_fspriv;
1640 1661
1662 xfs_buf_lock(bp);
1641 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1663 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1642 xfs_buf_ioerror(bp, EIO); 1664 xfs_buf_ioerror(bp, EIO);
1643 xfs_buf_stale(bp); 1665 xfs_buf_stale(bp);
@@ -1645,7 +1667,8 @@ xlog_bdstrat(
1645 /* 1667 /*
1646 * It would seem logical to return EIO here, but we rely on 1668 * It would seem logical to return EIO here, but we rely on
1647 * the log state machine to propagate I/O errors instead of 1669 * the log state machine to propagate I/O errors instead of
1648 * doing it here. 1670 * doing it here. Similarly, IO completion will unlock the
1671 * buffer, so we don't do it here.
1649 */ 1672 */
1650 return 0; 1673 return 0;
1651 } 1674 }
@@ -1847,14 +1870,28 @@ xlog_dealloc_log(
1847 xlog_cil_destroy(log); 1870 xlog_cil_destroy(log);
1848 1871
1849 /* 1872 /*
1850 * always need to ensure that the extra buffer does not point to memory 1873 * Cycle all the iclogbuf locks to make sure all log IO completion
1851 * owned by another log buffer before we free it. 1874 * is done before we tear down these buffers.
1852 */ 1875 */
1876 iclog = log->l_iclog;
1877 for (i = 0; i < log->l_iclog_bufs; i++) {
1878 xfs_buf_lock(iclog->ic_bp);
1879 xfs_buf_unlock(iclog->ic_bp);
1880 iclog = iclog->ic_next;
1881 }
1882
1883 /*
1884 * Always need to ensure that the extra buffer does not point to memory
1885 * owned by another log buffer before we free it. Also, cycle the lock
1886 * first to ensure we've completed IO on it.
1887 */
1888 xfs_buf_lock(log->l_xbuf);
1889 xfs_buf_unlock(log->l_xbuf);
1853 xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); 1890 xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
1854 xfs_buf_free(log->l_xbuf); 1891 xfs_buf_free(log->l_xbuf);
1855 1892
1856 iclog = log->l_iclog; 1893 iclog = log->l_iclog;
1857 for (i=0; i<log->l_iclog_bufs; i++) { 1894 for (i = 0; i < log->l_iclog_bufs; i++) {
1858 xfs_buf_free(iclog->ic_bp); 1895 xfs_buf_free(iclog->ic_bp);
1859 next_iclog = iclog->ic_next; 1896 next_iclog = iclog->ic_next;
1860 kmem_free(iclog); 1897 kmem_free(iclog);
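
The xfs_buf lock is a semaphore in the kernel, which is why it can be taken before IO submission in xlog_bdstrat() and released from the completion side in xlog_iodone(). A userspace analogue of the lock-cycling teardown added to xlog_dealloc_log(), sketched with a POSIX semaphore:

/* "cycle the lock" idiom: taking and dropping the lock cannot succeed
 * until the current holder (here, IO completion) has released it */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

static sem_t buf_lock; /* binary semaphore standing in for the buffer lock */

static void *io_completion(void *arg)
{
    (void)arg;
    sleep(1);            /* simulated IO, done with the buffer "held" */
    sem_post(&buf_lock); /* the xfs_buf_unlock() in xlog_iodone() */
    return NULL;
}

int main(void)
{
    pthread_t t;

    sem_init(&buf_lock, 0, 0);   /* start held, as if locked over IO */
    pthread_create(&t, NULL, io_completion, NULL);

    /* teardown: cycle the lock so the buffer cannot be freed while
     * IO completion still references it */
    sem_wait(&buf_lock);
    sem_post(&buf_lock);
    puts("IO completion done, safe to free");

    pthread_join(t, NULL);
    sem_destroy(&buf_lock);
    return 0;
}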
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index b0f4ef77fa70..2c4004475e71 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -175,7 +175,7 @@ void xlog_iodone(struct xfs_buf *);
175struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); 175struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
176void xfs_log_ticket_put(struct xlog_ticket *ticket); 176void xfs_log_ticket_put(struct xlog_ticket *ticket);
177 177
178int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 178void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
179 xfs_lsn_t *commit_lsn, int flags); 179 xfs_lsn_t *commit_lsn, int flags);
180bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 180bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
181 181
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4ef6fdbced78..7e5455391176 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -499,13 +499,6 @@ xlog_cil_push(
499 cil->xc_ctx = new_ctx; 499 cil->xc_ctx = new_ctx;
500 500
501 /* 501 /*
502 * mirror the new sequence into the cil structure so that we can do
503 * unlocked checks against the current sequence in log forces without
504 * risking dereferencing a freed context pointer.
505 */
506 cil->xc_current_sequence = new_ctx->sequence;
507
508 /*
509 * The switch is now done, so we can drop the context lock and move out 502 * The switch is now done, so we can drop the context lock and move out
510 * of a shared context. We can't just go straight to the commit record, 503 * of a shared context. We can't just go straight to the commit record,
511 * though - we need to synchronise with previous and future commits so 504 * though - we need to synchronise with previous and future commits so
@@ -523,8 +516,15 @@ xlog_cil_push(
523 * Hence we need to add this context to the committing context list so 516 * Hence we need to add this context to the committing context list so
524 * that higher sequences will wait for us to write out a commit record 517 * that higher sequences will wait for us to write out a commit record
525 * before they do. 518 * before they do.
519 *
520 * xfs_log_force_lsn requires us to mirror the new sequence into the cil
521 * structure atomically with the addition of this sequence to the
522 * committing list. This also ensures that we can do unlocked checks
523 * against the current sequence in log forces without risking
524 * dereferencing a freed context pointer.
526 */ 525 */
527 spin_lock(&cil->xc_push_lock); 526 spin_lock(&cil->xc_push_lock);
527 cil->xc_current_sequence = new_ctx->sequence;
528 list_add(&ctx->committing, &cil->xc_committing); 528 list_add(&ctx->committing, &cil->xc_committing);
529 spin_unlock(&cil->xc_push_lock); 529 spin_unlock(&cil->xc_push_lock);
530 up_write(&cil->xc_ctx_lock); 530 up_write(&cil->xc_ctx_lock);
@@ -662,8 +662,14 @@ xlog_cil_push_background(
662 662
663} 663}
664 664
665/*
666 * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
667 * number that is passed. When it returns, the work will be queued for
668 * @push_seq, but it won't be completed. The caller is expected to do any
669 * waiting for push_seq to complete if it is required.
670 */
665static void 671static void
666xlog_cil_push_foreground( 672xlog_cil_push_now(
667 struct xlog *log, 673 struct xlog *log,
668 xfs_lsn_t push_seq) 674 xfs_lsn_t push_seq)
669{ 675{
@@ -688,10 +694,8 @@ xlog_cil_push_foreground(
688 } 694 }
689 695
690 cil->xc_push_seq = push_seq; 696 cil->xc_push_seq = push_seq;
697 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
691 spin_unlock(&cil->xc_push_lock); 698 spin_unlock(&cil->xc_push_lock);
692
693 /* do the push now */
694 xlog_cil_push(log);
695} 699}
696 700
697bool 701bool
@@ -721,7 +725,7 @@ xlog_cil_empty(
721 * background commit, returns without it held once background commits are 725 * background commit, returns without it held once background commits are
722 * allowed again. 726 * allowed again.
723 */ 727 */
724int 728void
725xfs_log_commit_cil( 729xfs_log_commit_cil(
726 struct xfs_mount *mp, 730 struct xfs_mount *mp,
727 struct xfs_trans *tp, 731 struct xfs_trans *tp,
@@ -767,7 +771,6 @@ xfs_log_commit_cil(
767 xlog_cil_push_background(log); 771 xlog_cil_push_background(log);
768 772
769 up_read(&cil->xc_ctx_lock); 773 up_read(&cil->xc_ctx_lock);
770 return 0;
771} 774}
772 775
773/* 776/*
@@ -796,7 +799,8 @@ xlog_cil_force_lsn(
796 * xlog_cil_push() handles racing pushes for the same sequence, 799 * xlog_cil_push() handles racing pushes for the same sequence,
797 * so no need to deal with it here. 800 * so no need to deal with it here.
798 */ 801 */
799 xlog_cil_push_foreground(log, sequence); 802restart:
803 xlog_cil_push_now(log, sequence);
800 804
801 /* 805 /*
802 * See if we can find a previous sequence still committing. 806 * See if we can find a previous sequence still committing.
@@ -804,7 +808,6 @@ xlog_cil_force_lsn(
804 * before allowing the force of push_seq to go ahead. Hence block 808 * before allowing the force of push_seq to go ahead. Hence block
805 * on commits for those as well. 809 * on commits for those as well.
806 */ 810 */
807restart:
808 spin_lock(&cil->xc_push_lock); 811 spin_lock(&cil->xc_push_lock);
809 list_for_each_entry(ctx, &cil->xc_committing, committing) { 812 list_for_each_entry(ctx, &cil->xc_committing, committing) {
810 if (ctx->sequence > sequence) 813 if (ctx->sequence > sequence)
@@ -822,6 +825,28 @@ restart:
822 /* found it! */ 825 /* found it! */
823 commit_lsn = ctx->commit_lsn; 826 commit_lsn = ctx->commit_lsn;
824 } 827 }
828
829 /*
830 * The call to xlog_cil_push_now() executes the push in the background.
831 * Hence by the time we have got here our sequence may not have been
832 * pushed yet. This is true if the current sequence still matches the
833 * push sequence after the above wait loop and the CIL still contains
834 * dirty objects.
835 *
836 * When the push occurs, it will empty the CIL and
837 * atomically increment the current sequence past the push sequence and
838 * move it into the committing list. Of course, if the CIL is clean at
839 * the time of the push, it won't have pushed the CIL at all, so in that
840 * case we should try the push for this sequence again from the start
841 * just in case.
842 */
843
844 if (sequence == cil->xc_current_sequence &&
845 !list_empty(&cil->xc_cil)) {
846 spin_unlock(&cil->xc_push_lock);
847 goto restart;
848 }
849
825 spin_unlock(&cil->xc_push_lock); 850 spin_unlock(&cil->xc_push_lock);
826 return commit_lsn; 851 return commit_lsn;
827} 852}
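
Since xlog_cil_push_foreground() became the asynchronous xlog_cil_push_now(), the force path has to tolerate the push not having run yet, which is what the relocated restart label handles. A hedged pthreads sketch of that kick-then-recheck loop; the names and structure are illustrative, not the kernel API:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t push_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  push_cv   = PTHREAD_COND_INITIALIZER;
static unsigned long current_sequence = 1;
static bool cil_dirty = true;
static bool push_queued;

static void *push_worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&push_lock);
    while (!push_queued)
        pthread_cond_wait(&push_cv, &push_lock);
    /* the push empties the CIL and bumps the sequence atomically
     * under push_lock, as the hunk above requires */
    cil_dirty = false;
    current_sequence++;
    pthread_mutex_unlock(&push_lock);
    return NULL;
}

static void force_sequence(unsigned long seq)
{
restart:
    pthread_mutex_lock(&push_lock);
    push_queued = true;                /* xlog_cil_push_now() */
    pthread_cond_broadcast(&push_cv);

    if (seq == current_sequence && cil_dirty) {
        /* push not done yet: drop the lock and try again */
        pthread_mutex_unlock(&push_lock);
        goto restart;
    }
    pthread_mutex_unlock(&push_lock);
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, push_worker, NULL);
    force_sequence(1);
    pthread_join(t, NULL);
    puts("sequence pushed");
    return 0;
}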
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f96c05669a9e..944f3d9456a8 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -314,6 +314,9 @@ reread:
314 error = bp->b_error; 314 error = bp->b_error;
315 if (loud) 315 if (loud)
316 xfs_warn(mp, "SB validate failed with error %d.", error); 316 xfs_warn(mp, "SB validate failed with error %d.", error);
317 /* bad CRC means corrupted metadata */
318 if (error == EFSBADCRC)
319 error = EFSCORRUPTED;
317 goto release_buf; 320 goto release_buf;
318 } 321 }
319 322
@@ -740,8 +743,6 @@ xfs_mountfs(
740 new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; 743 new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
741 if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) 744 if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
742 mp->m_inode_cluster_size = new_size; 745 mp->m_inode_cluster_size = new_size;
743 xfs_info(mp, "Using inode cluster size of %d bytes",
744 mp->m_inode_cluster_size);
745 } 746 }
746 747
747 /* 748 /*
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 348e4d2ed6e6..dc977b6e6a36 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -843,22 +843,17 @@ xfs_qm_init_quotainfo(
843 843
844 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 844 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
845 845
846 if ((error = list_lru_init(&qinf->qi_lru))) { 846 error = -list_lru_init(&qinf->qi_lru);
847 kmem_free(qinf); 847 if (error)
848 mp->m_quotainfo = NULL; 848 goto out_free_qinf;
849 return error;
850 }
851 849
852 /* 850 /*
853 * See if quotainodes are setup, and if not, allocate them, 851 * See if quotainodes are setup, and if not, allocate them,
854 * and change the superblock accordingly. 852 * and change the superblock accordingly.
855 */ 853 */
856 if ((error = xfs_qm_init_quotainos(mp))) { 854 error = xfs_qm_init_quotainos(mp);
857 list_lru_destroy(&qinf->qi_lru); 855 if (error)
858 kmem_free(qinf); 856 goto out_free_lru;
859 mp->m_quotainfo = NULL;
860 return error;
861 }
862 857
863 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); 858 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
864 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); 859 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
@@ -918,7 +913,7 @@ xfs_qm_init_quotainfo(
918 qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); 913 qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
919 qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); 914 qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
920 qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); 915 qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
921 916
922 xfs_qm_dqdestroy(dqp); 917 xfs_qm_dqdestroy(dqp);
923 } else { 918 } else {
924 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; 919 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -935,6 +930,13 @@ xfs_qm_init_quotainfo(
935 qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; 930 qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
936 register_shrinker(&qinf->qi_shrinker); 931 register_shrinker(&qinf->qi_shrinker);
937 return 0; 932 return 0;
933
934out_free_lru:
935 list_lru_destroy(&qinf->qi_lru);
936out_free_qinf:
937 kmem_free(qinf);
938 mp->m_quotainfo = NULL;
939 return error;
938} 940}
939 941
940 942
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a6a76b2b6a85..ec5ca65c6211 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -842,7 +842,7 @@ xfs_growfs_rt_alloc(
842 /* 842 /*
843 * Reserve space & log for one extent added to the file. 843 * Reserve space & log for one extent added to the file.
844 */ 844 */
845 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, 845 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
846 resblks, 0); 846 resblks, 0);
847 if (error) 847 if (error)
848 goto error_cancel; 848 goto error_cancel;
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 1e116794bb66..8baf61afae1d 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -201,10 +201,6 @@ xfs_mount_validate_sb(
201 * write validation, we don't need to check feature masks. 201 * write validation, we don't need to check feature masks.
202 */ 202 */
203 if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { 203 if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
204 xfs_alert(mp,
205"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
206"Use of these features in this kernel is at your own risk!");
207
208 if (xfs_sb_has_compat_feature(sbp, 204 if (xfs_sb_has_compat_feature(sbp,
209 XFS_SB_FEAT_COMPAT_UNKNOWN)) { 205 XFS_SB_FEAT_COMPAT_UNKNOWN)) {
210 xfs_warn(mp, 206 xfs_warn(mp,
@@ -288,6 +284,7 @@ xfs_mount_validate_sb(
288 sbp->sb_inodelog < XFS_DINODE_MIN_LOG || 284 sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
289 sbp->sb_inodelog > XFS_DINODE_MAX_LOG || 285 sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
290 sbp->sb_inodesize != (1 << sbp->sb_inodelog) || 286 sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
287 sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
291 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || 288 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
292 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 289 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
293 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 290 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
@@ -610,12 +607,11 @@ xfs_sb_read_verify(
610 XFS_SB_VERSION_5) || 607 XFS_SB_VERSION_5) ||
611 dsb->sb_crc != 0)) { 608 dsb->sb_crc != 0)) {
612 609
613 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 610 if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
614 offsetof(struct xfs_sb, sb_crc))) {
615 /* Only fail bad secondaries on a known V5 filesystem */ 611 /* Only fail bad secondaries on a known V5 filesystem */
616 if (bp->b_bn == XFS_SB_DADDR || 612 if (bp->b_bn == XFS_SB_DADDR ||
617 xfs_sb_version_hascrc(&mp->m_sb)) { 613 xfs_sb_version_hascrc(&mp->m_sb)) {
618 error = EFSCORRUPTED; 614 error = EFSBADCRC;
619 goto out_error; 615 goto out_error;
620 } 616 }
621 } 617 }
@@ -624,10 +620,9 @@ xfs_sb_read_verify(
624 620
625out_error: 621out_error:
626 if (error) { 622 if (error) {
627 if (error == EFSCORRUPTED)
628 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
629 mp, bp->b_addr);
630 xfs_buf_ioerror(bp, error); 623 xfs_buf_ioerror(bp, error);
624 if (error == EFSCORRUPTED || error == EFSBADCRC)
625 xfs_verifier_error(bp);
631 } 626 }
632} 627}
633 628
@@ -662,9 +657,8 @@ xfs_sb_write_verify(
662 657
663 error = xfs_sb_verify(bp, false); 658 error = xfs_sb_verify(bp, false);
664 if (error) { 659 if (error) {
665 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
666 mp, bp->b_addr);
667 xfs_buf_ioerror(bp, error); 660 xfs_buf_ioerror(bp, error);
661 xfs_verifier_error(bp);
668 return; 662 return;
669 } 663 }
670 664
@@ -674,8 +668,7 @@ xfs_sb_write_verify(
674 if (bip) 668 if (bip)
675 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 669 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
676 670
677 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 671 xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
678 offsetof(struct xfs_sb, sb_crc));
679} 672}
680 673
681const struct xfs_buf_ops xfs_sb_buf_ops = { 674const struct xfs_buf_ops xfs_sb_buf_ops = {
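
The new sb_inopblock check above cross-validates the superblock geometry fields against each other rather than trusting sb_inopblock in isolation. For example, using the same howmany() rounding the kernel uses:

/* quick check of the sb_inopblock validation: 4096-byte blocks with
 * 512-byte inodes must give 8 inodes per block */
#include <stdio.h>

#define howmany(x, y) (((x) + ((y) - 1)) / (y))

int main(void)
{
    unsigned blocksize = 4096, inodesize = 512; /* illustrative geometry */
    printf("inopblock = %u\n", howmany(blocksize, inodesize)); /* 8 */
    return 0;
}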
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 35061d4b614c..f7b2fe77c5a5 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -182,6 +182,8 @@ typedef struct xfs_sb {
182 /* must be padded to 64 bit alignment */ 182 /* must be padded to 64 bit alignment */
183} xfs_sb_t; 183} xfs_sb_t;
184 184
185#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
186
185/* 187/*
186 * Superblock - on disk version. Must match the in core version above. 188 * Superblock - on disk version. Must match the in core version above.
187 * Must be padded to 64 bit alignment. 189 * Must be padded to 64 bit alignment.
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h
index 8c5035a13df1..4484e5151395 100644
--- a/fs/xfs/xfs_shared.h
+++ b/fs/xfs/xfs_shared.h
@@ -104,7 +104,8 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
104#define XFS_TRANS_SB_COUNT 41 104#define XFS_TRANS_SB_COUNT 41
105#define XFS_TRANS_CHECKPOINT 42 105#define XFS_TRANS_CHECKPOINT 42
106#define XFS_TRANS_ICREATE 43 106#define XFS_TRANS_ICREATE 43
107#define XFS_TRANS_TYPE_MAX 43 107#define XFS_TRANS_CREATE_TMPFILE 44
108#define XFS_TRANS_TYPE_MAX 44
108/* new transaction types need to be reflected in xfs_logprint(8) */ 109/* new transaction types need to be reflected in xfs_logprint(8) */
109 110
110#define XFS_TRANS_TYPES \ 111#define XFS_TRANS_TYPES \
@@ -112,6 +113,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
112 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ 113 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
113 { XFS_TRANS_INACTIVE, "INACTIVE" }, \ 114 { XFS_TRANS_INACTIVE, "INACTIVE" }, \
114 { XFS_TRANS_CREATE, "CREATE" }, \ 115 { XFS_TRANS_CREATE, "CREATE" }, \
116 { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
115 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ 117 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
116 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ 118 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
117 { XFS_TRANS_REMOVE, "REMOVE" }, \ 119 { XFS_TRANS_REMOVE, "REMOVE" }, \
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d971f4932b5d..3494eff8e4eb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -996,7 +996,7 @@ xfs_fs_evict_inode(
996 996
997 trace_xfs_evict_inode(ip); 997 trace_xfs_evict_inode(ip);
998 998
999 truncate_inode_pages(&inode->i_data, 0); 999 truncate_inode_pages_final(&inode->i_data);
1000 clear_inode(inode); 1000 clear_inode(inode);
1001 XFS_STATS_INC(vn_rele); 1001 XFS_STATS_INC(vn_rele);
1002 XFS_STATS_INC(vn_remove); 1002 XFS_STATS_INC(vn_remove);
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
1197 char *p; 1197 char *p;
1198 int error; 1198 int error;
1199 1199
1200 sync_filesystem(sb);
1200 while ((p = strsep(&options, ",")) != NULL) { 1201 while ((p = strsep(&options, ",")) != NULL) {
1201 int token; 1202 int token;
1202 1203
@@ -1432,11 +1433,11 @@ xfs_fs_fill_super(
1432 if (error) 1433 if (error)
1433 goto out_free_fsname; 1434 goto out_free_fsname;
1434 1435
1435 error = xfs_init_mount_workqueues(mp); 1436 error = -xfs_init_mount_workqueues(mp);
1436 if (error) 1437 if (error)
1437 goto out_close_devices; 1438 goto out_close_devices;
1438 1439
1439 error = xfs_icsb_init_counters(mp); 1440 error = -xfs_icsb_init_counters(mp);
1440 if (error) 1441 if (error)
1441 goto out_destroy_workqueues; 1442 goto out_destroy_workqueues;
1442 1443
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 14e58f2c96bd..52979aa90986 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -80,6 +80,10 @@ xfs_readlink_bmap(
80 if (error) { 80 if (error) {
81 xfs_buf_ioerror_alert(bp, __func__); 81 xfs_buf_ioerror_alert(bp, __func__);
82 xfs_buf_relse(bp); 82 xfs_buf_relse(bp);
83
84 /* bad CRC means corrupted metadata */
85 if (error == EFSBADCRC)
86 error = EFSCORRUPTED;
83 goto out; 87 goto out;
84 } 88 }
85 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); 89 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -208,10 +212,7 @@ xfs_symlink(
208 return XFS_ERROR(ENAMETOOLONG); 212 return XFS_ERROR(ENAMETOOLONG);
209 213
210 udqp = gdqp = NULL; 214 udqp = gdqp = NULL;
211 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 215 prid = xfs_get_initial_prid(dp);
212 prid = xfs_get_projid(dp);
213 else
214 prid = XFS_PROJID_DEFAULT;
215 216
216 /* 217 /*
217 * Make sure that we have allocated dquot(s) on disk. 218 * Make sure that we have allocated dquot(s) on disk.
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
index bf59a2b45f8c..9b32052ff65e 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -133,12 +133,13 @@ xfs_symlink_read_verify(
133 if (!xfs_sb_version_hascrc(&mp->m_sb)) 133 if (!xfs_sb_version_hascrc(&mp->m_sb))
134 return; 134 return;
135 135
136 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 136 if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
137 offsetof(struct xfs_dsymlink_hdr, sl_crc)) || 137 xfs_buf_ioerror(bp, EFSBADCRC);
138 !xfs_symlink_verify(bp)) { 138 else if (!xfs_symlink_verify(bp))
139 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
140 xfs_buf_ioerror(bp, EFSCORRUPTED); 139 xfs_buf_ioerror(bp, EFSCORRUPTED);
141 } 140
141 if (bp->b_error)
142 xfs_verifier_error(bp);
142} 143}
143 144
144static void 145static void
@@ -153,8 +154,8 @@ xfs_symlink_write_verify(
153 return; 154 return;
154 155
155 if (!xfs_symlink_verify(bp)) { 156 if (!xfs_symlink_verify(bp)) {
156 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
157 xfs_buf_ioerror(bp, EFSCORRUPTED); 157 xfs_buf_ioerror(bp, EFSCORRUPTED);
158 xfs_verifier_error(bp);
158 return; 159 return;
159 } 160 }
160 161
@@ -162,8 +163,7 @@ xfs_symlink_write_verify(
162 struct xfs_dsymlink_hdr *dsl = bp->b_addr; 163 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
163 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 164 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
164 } 165 }
165 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 166 xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
166 offsetof(struct xfs_dsymlink_hdr, sl_crc));
167} 167}
168 168
169const struct xfs_buf_ops xfs_symlink_buf_ops = { 169const struct xfs_buf_ops xfs_symlink_buf_ops = {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 425dfa45b9a0..65d8c793a25c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,8 @@ DEFINE_INODE_EVENT(xfs_readlink);
603DEFINE_INODE_EVENT(xfs_inactive_symlink); 603DEFINE_INODE_EVENT(xfs_inactive_symlink);
604DEFINE_INODE_EVENT(xfs_alloc_file_space); 604DEFINE_INODE_EVENT(xfs_alloc_file_space);
605DEFINE_INODE_EVENT(xfs_free_file_space); 605DEFINE_INODE_EVENT(xfs_free_file_space);
606DEFINE_INODE_EVENT(xfs_zero_file_space);
607DEFINE_INODE_EVENT(xfs_collapse_file_space);
606DEFINE_INODE_EVENT(xfs_readdir); 608DEFINE_INODE_EVENT(xfs_readdir);
607#ifdef CONFIG_XFS_POSIX_ACL 609#ifdef CONFIG_XFS_POSIX_ACL
608DEFINE_INODE_EVENT(xfs_get_acl); 610DEFINE_INODE_EVENT(xfs_get_acl);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c812c5c060de..54a57326d85b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -887,12 +887,7 @@ xfs_trans_commit(
887 xfs_trans_apply_sb_deltas(tp); 887 xfs_trans_apply_sb_deltas(tp);
888 xfs_trans_apply_dquot_deltas(tp); 888 xfs_trans_apply_dquot_deltas(tp);
889 889
890 error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags); 890 xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
891 if (error == ENOMEM) {
892 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
893 error = XFS_ERROR(EIO);
894 goto out_unreserve;
895 }
896 891
897 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 892 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
898 xfs_trans_free(tp); 893 xfs_trans_free(tp);
@@ -902,10 +897,7 @@ xfs_trans_commit(
902 * log out now and wait for it. 897 * log out now and wait for it.
903 */ 898 */
904 if (sync) { 899 if (sync) {
905 if (!error) { 900 error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
906 error = _xfs_log_force_lsn(mp, commit_lsn,
907 XFS_LOG_SYNC, NULL);
908 }
909 XFS_STATS_INC(xs_trans_sync); 901 XFS_STATS_INC(xs_trans_sync);
910 } else { 902 } else {
911 XFS_STATS_INC(xs_trans_async); 903 XFS_STATS_INC(xs_trans_async);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 647b6f1d8923..b8eef0549f3f 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -275,6 +275,10 @@ xfs_trans_read_buf_map(
275 XFS_BUF_UNDONE(bp); 275 XFS_BUF_UNDONE(bp);
276 xfs_buf_stale(bp); 276 xfs_buf_stale(bp);
277 xfs_buf_relse(bp); 277 xfs_buf_relse(bp);
278
279 /* bad CRC means corrupted metadata */
280 if (error == EFSBADCRC)
281 error = EFSCORRUPTED;
278 return error; 282 return error;
279 } 283 }
280#ifdef DEBUG 284#ifdef DEBUG
@@ -338,6 +342,9 @@ xfs_trans_read_buf_map(
338 if (tp->t_flags & XFS_TRANS_DIRTY) 342 if (tp->t_flags & XFS_TRANS_DIRTY)
339 xfs_force_shutdown(tp->t_mountp, 343 xfs_force_shutdown(tp->t_mountp,
340 SHUTDOWN_META_IO_ERROR); 344 SHUTDOWN_META_IO_ERROR);
345 /* bad CRC means corrupted metadata */
346 if (error == EFSBADCRC)
347 error = EFSCORRUPTED;
341 return error; 348 return error;
342 } 349 }
343 } 350 }
@@ -375,6 +382,10 @@ xfs_trans_read_buf_map(
375 if (tp->t_flags & XFS_TRANS_DIRTY) 382 if (tp->t_flags & XFS_TRANS_DIRTY)
376 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); 383 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
377 xfs_buf_relse(bp); 384 xfs_buf_relse(bp);
385
386 /* bad CRC means corrupted metadata */
387 if (error == EFSBADCRC)
388 error = EFSCORRUPTED;
378 return error; 389 return error;
379 } 390 }
380#ifdef DEBUG 391#ifdef DEBUG
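
All three xfs_trans_read_buf_map() error paths above apply the same translation, as do the xfs_mount.c and xfs_symlink.c hunks earlier: EFSBADCRC is a buffer-layer detail, and callers of a failed metadata read see generic corruption. A hypothetical helper capturing the pattern; the constants mirror the xfs_linux.h hunk:

#include <errno.h>
#include <stdio.h>

#define EFSBADCRC    EBADMSG   /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN   /* Filesystem is corrupted */

/* once a metadata read has failed, a bad CRC is reported onwards as
 * generic corruption */
static int normalize_read_error(int error)
{
    if (error == EFSBADCRC)
        error = EFSCORRUPTED;
    return error;
}

int main(void)
{
    printf("%d -> %d\n", EFSBADCRC, normalize_read_error(EFSBADCRC));
    return 0;
}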
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 2ffd3e331b49..ae368165244d 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -81,20 +81,28 @@ xfs_calc_buf_res(
81 * on disk. Hence we need an inode reservation function that calculates all this 81 * on disk. Hence we need an inode reservation function that calculates all this
82 * correctly. So, we log: 82 * correctly. So, we log:
83 * 83 *
84 * - log op headers for object 84 * - 4 log op headers for object
85 * - for the ilf, the inode core and 2 forks
85 * - inode log format object 86 * - inode log format object
86 * - the entire inode contents (core + 2 forks) 87 * - the inode core
87 * - two bmap btree block headers 88 * - two inode forks containing bmap btree root blocks.
89 * - the btree data contained by both forks will fit into the inode size,
90 * hence when combined with the inode core above, we have a total of the
91 * actual inode size.
92 * - the BMBT headers need to be accounted separately, as they are
93 * additional to the records and pointers that fit inside the inode
94 * forks.
88 */ 95 */
89STATIC uint 96STATIC uint
90xfs_calc_inode_res( 97xfs_calc_inode_res(
91 struct xfs_mount *mp, 98 struct xfs_mount *mp,
92 uint ninodes) 99 uint ninodes)
93{ 100{
94 return ninodes * (sizeof(struct xlog_op_header) + 101 return ninodes *
95 sizeof(struct xfs_inode_log_format) + 102 (4 * sizeof(struct xlog_op_header) +
96 mp->m_sb.sb_inodesize + 103 sizeof(struct xfs_inode_log_format) +
97 2 * XFS_BMBT_BLOCK_LEN(mp)); 104 mp->m_sb.sb_inodesize +
105 2 * XFS_BMBT_BLOCK_LEN(mp));
98} 106}
99 107
100/* 108/*
@@ -204,6 +212,19 @@ xfs_calc_rename_reservation(
204} 212}
205 213
206/* 214/*
215 * For removing an inode from the unlinked list, we can modify:
216 * the agi hash list and counters: sector size
217 * the on disk inode before ours in the agi hash list: inode cluster size
218 */
219STATIC uint
220xfs_calc_iunlink_remove_reservation(
221 struct xfs_mount *mp)
222{
223 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
224 max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
225}
226
227/*
207 * For creating a link to an inode: 228 * For creating a link to an inode:
208 * the parent directory inode: inode size 229 * the parent directory inode: inode size
209 * the linked inode: inode size 230 * the linked inode: inode size
@@ -220,6 +241,7 @@ xfs_calc_link_reservation(
220 struct xfs_mount *mp) 241 struct xfs_mount *mp)
221{ 242{
222 return XFS_DQUOT_LOGRES(mp) + 243 return XFS_DQUOT_LOGRES(mp) +
244 xfs_calc_iunlink_remove_reservation(mp) +
223 MAX((xfs_calc_inode_res(mp, 2) + 245 MAX((xfs_calc_inode_res(mp, 2) +
224 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 246 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
225 XFS_FSB_TO_B(mp, 1))), 247 XFS_FSB_TO_B(mp, 1))),
@@ -229,6 +251,18 @@ xfs_calc_link_reservation(
229} 251}
230 252
231/* 253/*
254 * For adding an inode to the unlinked list we can modify:
255 * the agi hash list: sector size
256 * the unlinked inode: inode size
257 */
258STATIC uint
259xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
260{
261 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
262 xfs_calc_inode_res(mp, 1);
263}
264
265/*
232 * For removing a directory entry we can modify: 266 * For removing a directory entry we can modify:
233 * the parent directory inode: inode size 267 * the parent directory inode: inode size
234 * the removed inode: inode size 268 * the removed inode: inode size
@@ -245,10 +279,11 @@ xfs_calc_remove_reservation(
245 struct xfs_mount *mp) 279 struct xfs_mount *mp)
246{ 280{
247 return XFS_DQUOT_LOGRES(mp) + 281 return XFS_DQUOT_LOGRES(mp) +
248 MAX((xfs_calc_inode_res(mp, 2) + 282 xfs_calc_iunlink_add_reservation(mp) +
283 MAX((xfs_calc_inode_res(mp, 1) +
249 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 284 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
250 XFS_FSB_TO_B(mp, 1))), 285 XFS_FSB_TO_B(mp, 1))),
251 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + 286 (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
252 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), 287 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
253 XFS_FSB_TO_B(mp, 1)))); 288 XFS_FSB_TO_B(mp, 1))));
254} 289}
@@ -343,6 +378,20 @@ xfs_calc_create_reservation(
343 378
344} 379}
345 380
381STATIC uint
382xfs_calc_create_tmpfile_reservation(
383 struct xfs_mount *mp)
384{
385 uint res = XFS_DQUOT_LOGRES(mp);
386
387 if (xfs_sb_version_hascrc(&mp->m_sb))
388 res += xfs_calc_icreate_resv_alloc(mp);
389 else
390 res += xfs_calc_create_resv_alloc(mp);
391
392 return res + xfs_calc_iunlink_add_reservation(mp);
393}
394
346/* 395/*
347 * Making a new directory is the same as creating a new file. 396 * Making a new directory is the same as creating a new file.
348 */ 397 */
@@ -383,9 +432,9 @@ xfs_calc_ifree_reservation(
 {
 	return XFS_DQUOT_LOGRES(mp) +
 		xfs_calc_inode_res(mp, 1) +
-		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
 		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-		max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) +
+		xfs_calc_iunlink_remove_reservation(mp) +
 		xfs_calc_buf_res(1, 0) +
 		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 				 mp->m_in_maxlevels, 0) +
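
This ifree hunk is pure factoring: one of the two sectors plus the open-coded max_t() cluster term is exactly what xfs_calc_iunlink_remove_reservation() returns, so the total reservation is unchanged:

	old: 2 sectors + max(fs block, inode cluster) + ...
	new: 1 sector + (1 sector + max(fs block, inode cluster)) + ...
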
@@ -644,15 +693,14 @@ xfs_calc_qm_setqlim_reservation(
 
 /*
  * Allocating quota on disk if needed.
- *	the write transaction log space: M_RES(mp)->tr_write.tr_logres
+ *	the write transaction log space for quota file extent allocation
  *	the unit of quota allocation: one system block size
  */
 STATIC uint
 xfs_calc_qm_dqalloc_reservation(
 	struct xfs_mount	*mp)
 {
-	ASSERT(M_RES(mp)->tr_write.tr_logres);
-	return M_RES(mp)->tr_write.tr_logres +
+	return xfs_calc_write_reservation(mp) +
 		xfs_calc_buf_res(1,
 			XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
 }
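
Calling xfs_calc_write_reservation() directly, instead of reading the cached M_RES(mp)->tr_write.tr_logres, yields the same number but removes the implicit requirement that the write reservation be computed first, which is presumably what the dropped ASSERT was policing. The remaining term rounds the allocation up to one dquot cluster:

	tr_qm_dqalloc = write reservation
	              + buf_res(1, FSB_TO_B(XFS_DQUOT_CLUSTER_SIZE_FSB) - 1)
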
@@ -729,6 +777,11 @@ xfs_trans_resv_calc(
 	resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
 	resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
+	resp->tr_create_tmpfile.tr_logres =
+			xfs_calc_create_tmpfile_reservation(mp);
+	resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
+	resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
 	resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
 	resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
 	resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
@@ -784,7 +837,6 @@ xfs_trans_resv_calc(
 	/* The following transaction are logged in logical format */
 	resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
 	resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
-	resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
 	resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
 	resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
 	resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
index de7de9aaad8a..1097d14cd583 100644
--- a/fs/xfs/xfs_trans_resv.h
+++ b/fs/xfs/xfs_trans_resv.h
@@ -38,11 +38,11 @@ struct xfs_trans_resv {
 	struct xfs_trans_res	tr_remove;	/* unlink trans */
 	struct xfs_trans_res	tr_symlink;	/* symlink trans */
 	struct xfs_trans_res	tr_create;	/* create trans */
+	struct xfs_trans_res	tr_create_tmpfile; /* create O_TMPFILE trans */
 	struct xfs_trans_res	tr_mkdir;	/* mkdir trans */
 	struct xfs_trans_res	tr_ifree;	/* inode free trans */
 	struct xfs_trans_res	tr_ichange;	/* inode update trans */
 	struct xfs_trans_res	tr_growdata;	/* fs data section grow trans */
-	struct xfs_trans_res	tr_swrite;	/* sync write inode trans */
 	struct xfs_trans_res	tr_addafork;	/* add inode attr fork trans */
 	struct xfs_trans_res	tr_writeid;	/* write setuid/setgid file */
 	struct xfs_trans_res	tr_attrinval;	/* attr fork buffer
@@ -100,6 +100,7 @@ struct xfs_trans_resv {
 #define	XFS_ITRUNCATE_LOG_COUNT		2
 #define	XFS_INACTIVE_LOG_COUNT		2
 #define	XFS_CREATE_LOG_COUNT		2
+#define	XFS_CREATE_TMPFILE_LOG_COUNT	2
 #define	XFS_MKDIR_LOG_COUNT		3
 #define	XFS_SYMLINK_LOG_COUNT		3
 #define	XFS_REMOVE_LOG_COUNT		2